gwaslab 3.5.6__py3-none-any.whl → 3.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (64)
  1. gwaslab/__init__.py +2 -0
  2. gwaslab/bd_common_data.py +1 -0
  3. gwaslab/bd_get_hapmap3.py +0 -1
  4. gwaslab/data/formatbook.json +78 -0
  5. gwaslab/g_Sumstats.py +98 -24
  6. gwaslab/g_SumstatsMulti.py +287 -0
  7. gwaslab/g_SumstatsPair.py +101 -16
  8. gwaslab/g_Sumstats_polars.py +245 -0
  9. gwaslab/g_headers.py +12 -3
  10. gwaslab/g_meta.py +123 -47
  11. gwaslab/g_meta_update.py +48 -0
  12. gwaslab/g_vchange_status_polars.py +44 -0
  13. gwaslab/g_version.py +2 -2
  14. gwaslab/hm_casting.py +169 -110
  15. gwaslab/hm_casting_polars.py +202 -0
  16. gwaslab/hm_harmonize_sumstats.py +19 -8
  17. gwaslab/io_load_ld.py +529 -0
  18. gwaslab/io_preformat_input.py +11 -0
  19. gwaslab/io_preformat_input_polars.py +632 -0
  20. gwaslab/io_process_args.py +25 -1
  21. gwaslab/io_read_ldsc.py +34 -3
  22. gwaslab/io_read_pipcs.py +62 -6
  23. gwaslab/prscs_gigrnd.py +122 -0
  24. gwaslab/prscs_mcmc_gtb.py +136 -0
  25. gwaslab/prscs_parse_genet.py +98 -0
  26. gwaslab/qc_build.py +53 -0
  27. gwaslab/qc_check_datatype.py +10 -8
  28. gwaslab/qc_check_datatype_polars.py +128 -0
  29. gwaslab/qc_fix_sumstats.py +25 -23
  30. gwaslab/qc_fix_sumstats_polars.py +193 -0
  31. gwaslab/util_ex_calculate_ldmatrix.py +49 -19
  32. gwaslab/util_ex_gwascatalog.py +71 -28
  33. gwaslab/util_ex_ldsc.py +67 -21
  34. gwaslab/util_ex_match_ldmatrix.py +396 -0
  35. gwaslab/util_ex_run_2samplemr.py +0 -2
  36. gwaslab/util_ex_run_ccgwas.py +155 -0
  37. gwaslab/util_ex_run_coloc.py +1 -1
  38. gwaslab/util_ex_run_hyprcoloc.py +117 -0
  39. gwaslab/util_ex_run_mesusie.py +155 -0
  40. gwaslab/util_ex_run_mtag.py +92 -0
  41. gwaslab/util_ex_run_prscs.py +85 -0
  42. gwaslab/util_ex_run_susie.py +40 -9
  43. gwaslab/util_in_estimate_ess.py +18 -0
  44. gwaslab/util_in_fill_data.py +20 -1
  45. gwaslab/util_in_filter_value.py +10 -5
  46. gwaslab/util_in_get_sig.py +71 -13
  47. gwaslab/util_in_meta.py +168 -4
  48. gwaslab/util_in_meta_polars.py +174 -0
  49. gwaslab/viz_plot_compare_effect.py +87 -23
  50. gwaslab/viz_plot_credible_sets.py +55 -11
  51. gwaslab/viz_plot_effect.py +22 -12
  52. gwaslab/viz_plot_miamiplot2.py +3 -2
  53. gwaslab/viz_plot_mqqplot.py +165 -141
  54. gwaslab/viz_plot_qqplot.py +6 -6
  55. gwaslab/viz_plot_regional2.py +5 -13
  56. gwaslab/viz_plot_rg_heatmap.py +6 -1
  57. gwaslab/viz_plot_stackedregional.py +21 -6
  58. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/METADATA +9 -7
  59. gwaslab-3.5.8.dist-info/RECORD +117 -0
  60. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/WHEEL +1 -1
  61. gwaslab-3.5.6.dist-info/RECORD +0 -96
  62. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE +0 -0
  63. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
  64. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/top_level.txt +0 -0
gwaslab/io_read_ldsc.py CHANGED
@@ -5,6 +5,7 @@ import numpy as np
 def read_ldsc(filelist=[],mode="h2"):
     #h2 mode
     #####################################################################
+    is_liab = False
     if mode=="h2":
         summary = pd.DataFrame(columns = ['Filename', 'h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se"])
 
@@ -18,6 +19,8 @@ def read_ldsc(filelist=[],mode="h2"):
            line=""
            while not re.compile('^Total Observed scale h2').match(line):
                line = file.readline()
+               if "h2_liab" in line:
+                   is_liab = True
                if not line: break
 
            try:
@@ -65,6 +68,11 @@ def read_ldsc(filelist=[],mode="h2"):
            #summary = summary.append(row,ignore_index=True)
            row = pd.DataFrame([row], columns = summary.columns)
            summary = pd.concat([summary, row], ignore_index=True)
+        if is_liab == True:
+            summary = summary.rename(columns={
+                "h2_obs":"h2_liab",
+                "h2_se":"h2_liab_se"
+            })
     ###############################################################################
     if mode=="rg":
         summary = pd.DataFrame(columns = ['p1',
@@ -76,7 +84,7 @@ def read_ldsc(filelist=[],mode="h2"):
                                           'h2_int','h2_int_se',
                                           'gcov_int','gcov_int_se']
                                          )
-
+
         for index, ldscfile in enumerate(filelist):
             print("Loading file "+str(index+1)+" :" + ldscfile +" ...")
 
@@ -87,6 +95,9 @@ def read_ldsc(filelist=[],mode="h2"):
            if not line: break
            line = file.readline() # header
 
+           if "h2_liab" in line:
+               is_liab = True
+
            line = file.readline() #line1
 
            ## first line h2 se
@@ -97,7 +108,12 @@ def read_ldsc(filelist=[],mode="h2"):
                summary = pd.concat([summary, row_series], ignore_index=True)
                line = file.readline()
         summary = summary.loc[summary["rg"]!="NA",:].copy()
-        summary[['rg','se' ,'z','p','h2_obs','h2_obs_se','h2_int','h2_int_se','gcov_int','gcov_int_se']] = summary[['rg','se' ,'z','p','h2_obs','h2_obs_se','h2_int','h2_int_se','gcov_int','gcov_int_se']].astype("float32")
+        summary[['rg','se' ,'z','p','h2_obs','h2_obs_se','h2_int','h2_int_se','gcov_int','gcov_int_se']] = summary[['rg','se' ,'z','p','h2_obs','h2_obs_se','h2_int','h2_int_se','gcov_int','gcov_int_se']].astype("float32")
+        if is_liab == True:
+            summary = summary.rename(columns={
+                "h2_obs":"h2_liab",
+                "h2_se":"h2_liab_se"
+            })
     return summary
 
 
@@ -198,7 +214,9 @@ def read_greml(filelist=[]):
     return summary
 
 def parse_ldsc_summary(ldsc_summary):
-
+    is_liab = False
+    if "Liability" in ldsc_summary:
+        is_liab = True
     lines = ldsc_summary.split("\n")
 
     columns = ['h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se","Catagories"]
@@ -257,9 +275,17 @@ def parse_ldsc_summary(ldsc_summary):
 
     #summary = summary.append(row,ignore_index=True)
     row = pd.DataFrame([row], columns = summary.columns)
+    if is_liab == True:
+        row = row.rename(columns={
+            "h2_obs":"h2_liab",
+            "h2_se":"h2_liab_se"
+        })
     return row
 
 def parse_partitioned_ldsc_summary(ldsc_summary):
+    is_liab = False
+    if "Liability" in ldsc_summary:
+        is_liab = True
     summary = pd.DataFrame(columns = ['h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se"])
     lines = ldsc_summary.split("\n")
     row={}
@@ -306,4 +332,9 @@ def parse_partitioned_ldsc_summary(ldsc_summary):
 
     #summary = summary.append(row,ignore_index=True)
     row = pd.DataFrame([row], columns = summary.columns)
+    if is_liab == True:
+        row = row.rename(columns={
+            "h2_obs":"h2_liab",
+            "h2_se":"h2_liab_se"
+        })
     return row
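For context, a minimal sketch of how the liability-scale handling surfaces to a caller; the file names are hypothetical, and the logs are assumed to have been produced by LDSC:

from gwaslab.io_read_ldsc import read_ldsc

# If the LDSC logs report liability-scale h2 ("h2_liab" / "Liability"),
# the returned DataFrame now carries h2_liab / h2_liab_se instead of
# h2_obs / h2_se.
summary = read_ldsc(filelist=["trait1.log", "trait2.log"], mode="h2")
print(summary.columns.tolist())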
gwaslab/io_read_pipcs.py CHANGED
@@ -2,17 +2,64 @@ import pandas as pd
 from gwaslab.g_Log import Log
 from gwaslab.qc_check_datatype import check_datatype
 from gwaslab.qc_check_datatype import check_dataframe_memory_usage
+import re
+import os
 
-def _read_pipcs(data, output_prefix, log=Log(),verbose=True):
+def _read_pipcs(data,
+                output_prefix,
+                study=None,
+                group=None,
+                studie_names=None,
+                log=Log(),
+                verbose=True):
+
     log.write("Start to load PIP and CREDIBLE_SET_INDEX from file...",verbose=verbose)
-    log.write(" -File:{}.pipcs".format(output_prefix),verbose=verbose)
+    log.write(" -File:{}".format(output_prefix),verbose=verbose)
+
+    if "@" in output_prefix:
+        log.write(" -Detected @ in path: load all matching pipcs files ...",verbose=verbose)
+        pipcs_path_list = []
+        pipcs_loci_list = []
+
+        dirname = os.path.dirname(output_prefix)
+        files = os.listdir(dirname)
+        target_file_name = os.path.basename(output_prefix).replace('@','([\w:_]+)')
+        for file in files:
+            if re.search(target_file_name, file) is not None:
+                pipcs_path_list.append(dirname+"/"+file)
+                pipcs_loci_list.append(re.search(target_file_name, file)[1])
 
-    pipcs = pd.read_csv("{}.pipcs".format(output_prefix))
+        pipcs_single_list=[]
+        for index,pipcs_path in enumerate(pipcs_path_list):
+            log.write(" -Loading {}:".format(pipcs_loci_list[index]) + pipcs_path)
+            pipcs_single = pd.read_csv(pipcs_path)
+            if "LOCUS" not in pipcs_single.columns:
+                pipcs_single["LOCUS"]=pipcs_loci_list[index]
+            pipcs_single_list.append(pipcs_single)
+
+        pipcs = pd.concat(pipcs_single_list, axis=0, ignore_index=True)
+    else:
+        pipcs = pd.read_csv("{}".format(output_prefix))
 
-    log.write(" -Merging CHR and POS from main dataframe...",verbose=verbose)
-    pipcs = _merge_chrpos(data,pipcs)
+    if "CHR" not in pipcs.columns:
+        log.write(" -Merging CHR and POS from main dataframe...",verbose=verbose)
+        pipcs = _merge_chrpos(data,pipcs)
+
+    pipcs = pipcs.rename(columns={
+        "cs":"CREDIBLE_SET_INDEX",
+        "variable_prob":"PIP",
+        "variable":"N_SNP"
+    })
 
     log.write(" -Current pipcs Dataframe shape :",len(pipcs)," x ", len(pipcs.columns),verbose=verbose)
+
+    if group is not None:
+        pipcs["GROUP"] = group
+    if study is not None:
+        pipcs["STUDY"] = study
+
+    pipcs = _process_pip(pipcs, group, studie_names)
+
     check_datatype(pipcs,log=log,verbose=verbose)
     check_dataframe_memory_usage(pipcs,log=log,verbose=verbose)
     log.write("Finished loading PIP and CREDIBLE_SET_INDEX from file!",verbose=verbose)
@@ -20,4 +67,13 @@ def _read_pipcs(data, output_prefix, log=Log(),verbose=True):
 
 def _merge_chrpos(data,pipcs):
     df = pd.merge(pipcs, data,on="SNPID",how="left")
-    return df
+    return df
+
+def _process_pip(pipcs, group, studie_names):
+    if group is not None and "PIP" not in pipcs.columns:
+        pipcs["PIP"] = pipcs[studie_names]
+
+    for i in pipcs["CS_CATEGORY"].dropna().unique():
+        print(i)
+        pipcs.loc[pipcs["CS_CATEGORY"]==i,"PIP"] = pipcs.loc[pipcs["CS_CATEGORY"]==i,i]
+    return pipcs
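A standalone sketch of the new "@" wildcard expansion, using a hypothetical directory layout; _read_pipcs applies this same regex to collect the per-locus .pipcs files and records each match as the LOCUS label:

import os, re

output_prefix = "finemap/locus_@.pipcs"   # hypothetical path; "@" stands in for the locus label
pattern = os.path.basename(output_prefix).replace('@', r'([\w:_]+)')
for f in os.listdir(os.path.dirname(output_prefix)):
    m = re.search(pattern, f)
    if m is not None:
        # e.g. "locus_1:12345_A_G.pipcs" -> LOCUS "1:12345_A_G"
        print(f, "-> LOCUS =", m[1])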
gwaslab/prscs_gigrnd.py ADDED
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+
+"""
+Random variate generator for the generalized inverse Gaussian distribution.
+Reference: L Devroye. Random variate generation for the generalized inverse Gaussian distribution.
+           Statistics and Computing, 24(2):239-246, 2014.
+
+"""
+
+
+import math
+from numpy import random
+
+
+def psi(x, alpha, lam):
+    f = -alpha*(math.cosh(x)-1.0)-lam*(math.exp(x)-x-1.0)
+    return f
+
+
+def dpsi(x, alpha, lam):
+    f = -alpha*math.sinh(x)-lam*(math.exp(x)-1.0)
+    return f
+
+
+def g(x, sd, td, f1, f2):
+    if (x >= -sd) and (x <= td):
+        f = 1.0
+    elif x > td:
+        f = f1
+    elif x < -sd:
+        f = f2
+
+    return f
+
+
+def gigrnd(p, a, b):
+    # setup -- sample from the two-parameter version gig(lam,omega)
+    p = float(p); a = float(a); b = float(b)
+    lam = p
+    omega = math.sqrt(a*b)
+
+    if lam < 0:
+        lam = -lam
+        swap = True
+    else:
+        swap = False
+
+    alpha = math.sqrt(math.pow(omega,2)+math.pow(lam,2))-lam
+
+    # find t
+    x = -psi(1.0, alpha, lam)
+    if (x >= 0.5) and (x <= 2.0):
+        t = 1.0
+    elif x > 2.0:
+        if (alpha == 0) and (lam == 0):
+            t = 1.0
+        else:
+            t = math.sqrt(2.0/(alpha+lam))
+    elif x < 0.5:
+        if (alpha == 0) and (lam == 0):
+            t = 1.0
+        else:
+            t = math.log(4.0/(alpha+2.0*lam))
+
+    # find s
+    x = -psi(-1.0, alpha, lam)
+    if (x >= 0.5) and (x <= 2.0):
+        s = 1.0
+    elif x > 2.0:
+        if (alpha == 0) and (lam == 0):
+            s = 1.0
+        else:
+            s = math.sqrt(4.0/(alpha*math.cosh(1)+lam))
+    elif x < 0.5:
+        if (alpha == 0) and (lam == 0):
+            s = 1.0
+        elif alpha == 0:
+            s = 1.0/lam
+        elif lam == 0:
+            s = math.log(1.0+1.0/alpha+math.sqrt(1.0/math.pow(alpha,2)+2.0/alpha))
+        else:
+            s = min(1.0/lam, math.log(1.0+1.0/alpha+math.sqrt(1.0/math.pow(alpha,2)+2.0/alpha)))
+
+    # find auxiliary parameters
+    eta = -psi(t, alpha, lam)
+    zeta = -dpsi(t, alpha, lam)
+    theta = -psi(-s, alpha, lam)
+    xi = dpsi(-s, alpha, lam)
+
+    p = 1.0/xi
+    r = 1.0/zeta
+
+    td = t-r*eta
+    sd = s-p*theta
+    q = td+sd
+
+    # random variate generation
+    while True:
+        U = random.random()
+        V = random.random()
+        W = random.random()
+        if U < q/(p+q+r):
+            rnd = -sd+q*V
+        elif U < (q+r)/(p+q+r):
+            rnd = td-r*math.log(V)
+        else:
+            rnd = -sd+p*math.log(V)
+
+        f1 = math.exp(-eta-zeta*(rnd-t))
+        f2 = math.exp(-theta+xi*(rnd+s))
+        if W*g(rnd, sd, td, f1, f2) <= math.exp(psi(rnd, alpha, lam)):
+            break
+
+    # transform back to the three-parameter version gig(p,a,b)
+    rnd = math.exp(rnd)*(lam/omega+math.sqrt(1.0+math.pow(lam,2)/math.pow(omega,2)))
+    if swap:
+        rnd = 1.0/rnd
+
+    rnd = rnd/math.sqrt(a/b)
+    return rnd
+
+
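A quick way to sanity-check the sampler (not part of the package; parameter values are arbitrary):

import numpy as np
from gwaslab.prscs_gigrnd import gigrnd

# Draw from GIG(p=0.5, a=2.0, b=1.0) and inspect the empirical moments;
# all draws should be strictly positive.
draws = np.array([gigrnd(0.5, 2.0, 1.0) for _ in range(5000)])
print(draws.min() > 0, draws.mean(), draws.var())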
gwaslab/prscs_mcmc_gtb.py ADDED
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+
+"""
+Markov Chain Monte Carlo (MCMC) sampler for polygenic prediction with continuous shrinkage (CS) priors.
+
+"""
+import numpy as np
+from scipy import linalg
+from numpy import random
+from gwaslab.prscs_gigrnd import gigrnd
+from gwaslab.g_Log import Log
+import time
+
+def mcmc(a, b, phi, sst_dict, n, ld_blk, blk_size, n_iter, n_burnin, thin, chrom, out_dir, beta_std, write_psi, write_pst, seed, log):
+    log.write('... MCMC ...')
+
+    # seed
+    if seed != None:
+        random.seed(seed)
+
+    # derived stats
+    beta_mrg = np.array(sst_dict['BETA'], ndmin=2).T
+    maf = np.array(sst_dict['MAF'], ndmin=2).T
+    n_pst = int((n_iter-n_burnin)/thin)
+    p = len(sst_dict['SNP'])
+    n_blk = len(ld_blk)
+
+    # initialization
+    beta = np.zeros((p,1))
+    psi = np.ones((p,1))
+    sigma = 1.0
+
+    if phi == None:
+        phi = 1.0; phi_updt = True
+    else:
+        phi_updt = False
+
+    if write_pst == 'TRUE':
+        beta_pst = np.zeros((p,n_pst))
+
+    beta_est = np.zeros((p,1))
+    psi_est = np.zeros((p,1))
+    sigma_est = 0.0
+    phi_est = 0.0
+
+    # MCMC
+    pp = 0
+    start_time = time.time()
+    for itr in range(1,n_iter+1):
+        if itr ==2:
+            loop_time = time.time() - start_time
+            log.write(" -Estimated time: {} mins".format((loop_time*n_iter)/60))
+
+        if itr % 100 == 0:
+            log.write('--- iter-' + str(itr) + ' ---')
+        elif itr % 100 > 2:
+            log.write('-', end="", show_time=False)
+        elif itr % 100 ==2:
+            log.write('-', end="")
+
+        mm = 0; quad = 0.0
+
+        for kk in range(n_blk):
+            if blk_size[kk] == 0:
+                continue
+            else:
+                idx_blk = range(mm,mm+blk_size[kk])
+                dinvt = ld_blk[kk]+np.diag(1.0/psi[idx_blk].to_series())
+                dinvt_chol = linalg.cholesky(dinvt)
+                beta_tmp = linalg.solve_triangular(dinvt_chol, beta_mrg[idx_blk], trans='T') + np.sqrt(sigma/n)*random.randn(len(idx_blk),1)
+                beta[idx_blk] = linalg.solve_triangular(dinvt_chol, beta_tmp, trans='N')
+                quad += np.dot(np.dot(beta[idx_blk].T, dinvt), beta[idx_blk])
+                mm += blk_size[kk]
+
+        err = max(n/2.0*(1.0-2.0*sum(beta*beta_mrg)+quad), n/2.0*sum(beta**2/psi))
+        sigma = 1.0/random.gamma((n+p)/2.0, 1.0/err)
+
+        delta = random.gamma(a+b, 1.0/(psi+phi))
+        for jj in range(p):
+            psi[jj] = gigrnd(a-0.5, 2.0*delta[jj], n*beta[jj]**2/sigma)
+        psi[psi>1] = 1.0
+
+        if phi_updt == True:
+            w = random.gamma(1.0, 1.0/(phi+1.0))
+            phi = random.gamma(p*b+0.5, 1.0/(sum(delta)+w))
+
+        # posterior
+        if (itr>n_burnin) and (itr % thin == 0):
+            beta_est = beta_est + beta/n_pst
+            psi_est = psi_est + psi/n_pst
+            sigma_est = sigma_est + sigma/n_pst
+            phi_est = phi_est + phi/n_pst
+
+            if write_pst == 'TRUE':
+                beta_pst[:,[pp]] = beta
+                pp += 1
+
+    # convert standardized beta to per-allele beta
+    if beta_std == 'FALSE':
+        beta_est /= np.sqrt(2.0*maf*(1.0-maf))
+
+        if write_pst == 'TRUE':
+            beta_pst /= np.sqrt(2.0*maf*(1.0-maf))
+
+    # write posterior effect sizes
+    if phi_updt == True:
+        eff_file = out_dir + '_pst_eff_a%d_b%.1f_phiauto_chr%d.txt' % (a, b, chrom)
+    else:
+        eff_file = out_dir + '_pst_eff_a%d_b%.1f_phi%1.0e_chr%d.txt' % (a, b, phi, chrom)
+
+    with open(eff_file, 'w') as ff:
+        if write_pst == 'TRUE':
+            for snp, bp, a1, a2, beta in zip(sst_dict['SNP'], sst_dict['BP'], sst_dict['A1'], sst_dict['A2'], beta_pst):
+                ff.write(('%d\t%s\t%d\t%s\t%s' + '\t%.6e'*n_pst + '\n') % (chrom, snp, bp, a1, a2, *beta))
+        else:
+            for snp, bp, a1, a2, beta in zip(sst_dict['SNP'], sst_dict['BP'], sst_dict['A1'], sst_dict['A2'], beta_est):
+                ff.write('%d\t%s\t%d\t%s\t%s\t%.6e\n' % (chrom, snp, bp, a1, a2, beta))
+
+    # write posterior estimates of psi
+    if write_psi == 'TRUE':
+        if phi_updt == True:
+            psi_file = out_dir + '_pst_psi_a%d_b%.1f_phiauto_chr%d.txt' % (a, b, chrom)
+        else:
+            psi_file = out_dir + '_pst_psi_a%d_b%.1f_phi%1.0e_chr%d.txt' % (a, b, phi, chrom)
+
+        with open(psi_file, 'w') as ff:
+            for snp, psi in zip(sst_dict['SNP'], psi_est):
+                ff.write('%s\t%.6e\n' % (snp, psi))
+
+    # print estimated phi
+    if phi_updt == True:
+        log.write('... Estimated global shrinkage parameter: %1.2e ...' % phi_est )
+
+    log.write('... Done ...')
+
+
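The heart of the Gibbs sweep above is the per-block draw of beta given psi and sigma, which factors D + diag(1/psi) by Cholesky and solves two triangular systems. A toy standalone illustration with an identity LD block and made-up marginal betas (values are arbitrary, chosen only to exercise the linear algebra):

import numpy as np
from scipy import linalg

ld = np.eye(3)                                  # toy 3-SNP LD block D
psi = np.full((3, 1), 0.5)                      # local shrinkage parameters
beta_mrg = np.array([[0.01], [0.02], [0.00]])   # standardized marginal betas
n, sigma = 10000, 1.0

dinvt = ld + np.diag(1.0 / psi[:, 0])           # D + diag(1/psi)
chol = linalg.cholesky(dinvt)                   # upper-triangular factor
tmp = linalg.solve_triangular(chol, beta_mrg, trans='T') \
      + np.sqrt(sigma / n) * np.random.randn(3, 1)
beta = linalg.solve_triangular(chol, tmp, trans='N')
print(beta.ravel())                             # one posterior draw for the block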
gwaslab/prscs_parse_genet.py ADDED
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+
+"""
+Parse the reference panel, summary statistics, and validation set.
+
+"""
+
+import os
+import numpy as np
+from scipy.stats import norm
+from scipy import linalg
+import h5py
+import pandas as pd
+
+
+def parse_ref(ref_file, chrom, log):
+    # ref_dict = {'CHR':[], 'SNP':[], 'BP':[], 'A1':[], 'A2':[], 'MAF':[]}
+    ref_dict = pd.read_csv(ref_file ,sep="\t")
+    ref_dict = ref_dict.loc[ref_dict["CHR"]==chrom,:]
+    return ref_dict
+
+
+def parse_bim(bim_file, chrom):
+    vld_dict = pd.read_csv(bim_file + '.bim' ,sep="\t", usecols=[1,3,4])
+    vld_dict.columns=["SNP","A1","A2"]
+    return vld_dict
+
+
+def parse_sumstats(ref_dict, vld_dict, sst_file, n_subj, log):
+    n_sqrt = np.sqrt(n_subj)
+
+    sst_file.dropna()
+    sst_file["CHR"] = sst_file["CHR"].astype("int64")
+    sst_file["BP"] = sst_file["BP"].astype("int64")
+    sst_file["EA"] = sst_file["EA"].astype("string")
+    sst_file["NEA"] = sst_file["NEA"].astype("string")
+
+    sst_file = pd.merge(sst_file, ref_dict, on=["SNP","CHR","BP"],how="inner")
+
+    is_flipped = ((sst_file["NEA"] == sst_file["A1"]) &(sst_file["EA"] == sst_file["A2"]))
+    is_valid = ((sst_file["EA"] == sst_file["A1"]) & (sst_file["NEA"] == sst_file["A2"]))| is_flipped
+
+    sst_file = sst_file.loc[is_valid,:]
+
+    sst_file.loc[is_flipped, "MAF"] = 1 - sst_file.loc[is_flipped, "MAF"]
+    sst_file["BETA"] = sst_file["BETA"] / sst_file["SE"] / n_sqrt
+
+    sst_file.loc[~is_flipped, "BETA"] = 1 * sst_file.loc[~is_flipped, "BETA"]
+    sst_file.loc[is_flipped, "BETA"] = -1 * sst_file.loc[is_flipped, "BETA"]
+
+    sst_file["FLP"] = 1
+    sst_file.loc[is_flipped, "FLP"] = -1
+    log.write(" -Number of common SNPs:{}".format(len(sst_file)))
+    sst_dict= sst_file[['CHR', 'SNP', 'BP', 'A1', 'A2', 'MAF', 'BETA', 'FLP']].to_dict("list")
+
+    return sst_dict
+
+
+def parse_ldblk(ldblk_dir, sst_dict, chrom, log):
+    log.write('... parse reference LD on chromosome %d ...' % chrom)
+
+    if '1kg' in os.path.basename(ldblk_dir):
+        chr_name = ldblk_dir + '/ldblk_1kg_chr' + str(chrom) + '.hdf5'
+    elif 'ukbb' in os.path.basename(ldblk_dir):
+        chr_name = ldblk_dir + '/ldblk_ukbb_chr' + str(chrom) + '.hdf5'
+
+    hdf_chr = h5py.File(chr_name, 'r')
+    n_blk = len(hdf_chr)
+    ld_blk = [np.array(hdf_chr['blk_'+str(blk)]['ldblk']) for blk in range(1,n_blk+1)]
+
+    snp_blk = []
+    for blk in range(1,n_blk+1):
+        snp_blk.append([bb.decode("UTF-8") for bb in list(hdf_chr['blk_'+str(blk)]['snplist'])])
+
+    blk_size = []
+    mm = 0
+    for blk in range(n_blk):
+        idx = [ii for (ii, snp) in enumerate(snp_blk[blk]) if snp in sst_dict['SNP']]
+        blk_size.append(len(idx))
+        if idx != []:
+            idx_blk = range(mm,mm+len(idx))
+            flip = [sst_dict['FLP'][jj] for jj in idx_blk]
+            ld_blk[blk] = ld_blk[blk][np.ix_(idx,idx)]*np.outer(flip,flip)
+
+            _, s, v = linalg.svd(ld_blk[blk])
+            h = np.dot(v.T, np.dot(np.diag(s), v))
+            ld_blk[blk] = (ld_blk[blk]+h)/2
+
+            mm += len(idx)
+        else:
+            ld_blk[blk] = np.array([])
+
+    return ld_blk, blk_size
+
+
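The allele matching in parse_sumstats keeps a variant only if its EA/NEA pair matches the reference A1/A2 either directly or flipped, mirroring MAF and negating BETA in the flipped case; a toy illustration with made-up rows:

import pandas as pd

df = pd.DataFrame({"EA":  ["A", "G", "A"], "NEA": ["G", "A", "C"],
                   "A1":  ["A", "A", "T"], "A2":  ["G", "G", "C"]})
is_flipped = (df["NEA"] == df["A1"]) & (df["EA"] == df["A2"])
is_valid = ((df["EA"] == df["A1"]) & (df["NEA"] == df["A2"])) | is_flipped
# row 0 matches directly, row 1 matches flipped, row 2 is dropped
print(df[is_valid], is_flipped[is_valid].tolist())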
gwaslab/qc_build.py ADDED
@@ -0,0 +1,53 @@
+import re
+import gc
+import pandas as pd
+import numpy as np
+from itertools import repeat
+from multiprocessing import Pool
+from liftover import get_lifter
+from liftover import ChainFile
+from functools import partial
+from gwaslab.g_vchange_status import vchange_status
+from gwaslab.g_Log import Log
+
+def _process_build(build, log, verbose):
+    if str(build).lower() in ["hg19","19","37","b37","grch37"]:
+        log.write(" -Genomic coordinates are based on GRCh37/hg19...", verbose=verbose)
+        final_build = "19"
+    elif str(build).lower() in ["hg18","18","36","b36","grch36"]:
+        log.write(" -Genomic coordinates are based on GRCh36/hg18...", verbose=verbose)
+        final_build = "18"
+    elif str(build).lower() in ["hg38","38","b38","grch38"]:
+        log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
+        final_build = "38"
+    elif str(build).lower() in ["t2t","hs1","chm13","13"]:
+        log.write(" -Genomic coordinates are based on T2T-CHM13...", verbose=verbose)
+        final_build = "13"
+    else:
+        log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
+        final_build = "99"
+    return final_build
+
+def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
+    build = _process_build(build,log=log,verbose=verbose)
+    sumstats[status] = vchange_status(sumstats[status], 1, "139",build[0]*3)
+    sumstats[status] = vchange_status(sumstats[status], 2, "89",build[1]*3)
+    return sumstats, build
+
+def _check_build(target_build, build="99", status="STATUS",verbose=True,log=Log()):
+    target_build = _process_build(target_build,log=log,verbose=verbose)
+    build = _process_build(build,log=log,verbose=verbose)
+    if build == "99":
+        raise ValueError("Sumstats build is unknown. Please run infer_build() or set_build()")
+
+    if target_build == "99":
+        raise ValueError("Target build is unknown.")
+
+    if build!=target_build:
+        raise ValueError("Please make sure sumstats build is {}".format(target_build))
+    else:
+        log.write(" -Sumstats build matches target build")
+
+    return True
+
+
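A short sketch of how the build labels normalise (any unrecognised label maps to "99"):

from gwaslab.qc_build import _process_build
from gwaslab.g_Log import Log

print(_process_build("GRCh38", Log(), True))   # -> "38"
print(_process_build("hg19",   Log(), True))   # -> "19"
print(_process_build("banana", Log(), True))   # -> "99" (unknown build)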
gwaslab/qc_check_datatype.py CHANGED
@@ -7,10 +7,10 @@ from gwaslab.g_Log import Log
 
 dtype_dict ={
     "SNPID":["string","object"],
-    "rsID":["string","object"],
-    "CHR":["Int64","int64","int32","Int32","int"],
-    "POS":["int64","Int64"],
-    "EA":["category"],
+    "rsID": ["string","object"],
+    "CHR": ["Int64","int64","int32","Int32","int"],
+    "POS": ["int64","Int64"],
+    "EA": ["category"],
     "NEA":["category"],
     "REF":["category"],
     "ALT":["category"],
@@ -35,7 +35,7 @@ dtype_dict ={
     "TEST":["string","object","category"],
     "CHISQ":["float64"],
     "I2":["float64"],
-    "PHET":["float64"],
+    "P_HET":["float64"],
     "SNPR2":["float64"],
     "EAF":["float64","float","float32"],
     "NEAF":["float64","float","float32"],
@@ -48,7 +48,11 @@ dtype_dict ={
     'CREDIBLE_SET_INDEX':["Int64","int64","int32","Int32","int"],
     'N_SNP' :["Int64","int64","int32","Int32","int"],
     'LOCUS' :["string","object","category"],
-    'STUDY' :["string","object","category"]
+    'STUDY' :["string","object","category"],
+    'BETA_RANDOM' :["float64"],
+    'SE_RANDOM' :["float64"],
+    'Z_RANDOM' :["float64"],
+    'P_RANDOM' :["float64"]
 }
 
 def check_datatype(sumstats, verbose=True, log=Log()):
@@ -108,8 +112,6 @@ def quick_convert_datatype(sumstats, log, verbose):
         pass
     return sumstats
 
-
-
 def check_dataframe_shape(sumstats, log, verbose):
     memory_in_mb = sumstats.memory_usage().sum()/1024/1024
     try:
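A sketch of what the renamed P_HET entry implies for callers (illustrative data only):

import pandas as pd
from gwaslab.qc_check_datatype import check_datatype, dtype_dict
from gwaslab.g_Log import Log

# Heterogeneity p-values are now expected under "P_HET", not "PHET".
df = pd.DataFrame({"SNPID": pd.array(["1:100_A_G"], dtype="string"),
                   "P_HET": pd.array([0.5], dtype="float64")})
print("P_HET" in dtype_dict)       # True in 3.5.8
check_datatype(df, verbose=True, log=Log())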