gwaslab 3.5.6__py3-none-any.whl → 3.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (64)
  1. gwaslab/__init__.py +2 -0
  2. gwaslab/bd_common_data.py +1 -0
  3. gwaslab/bd_get_hapmap3.py +0 -1
  4. gwaslab/data/formatbook.json +78 -0
  5. gwaslab/g_Sumstats.py +98 -24
  6. gwaslab/g_SumstatsMulti.py +287 -0
  7. gwaslab/g_SumstatsPair.py +101 -16
  8. gwaslab/g_Sumstats_polars.py +245 -0
  9. gwaslab/g_headers.py +12 -3
  10. gwaslab/g_meta.py +123 -47
  11. gwaslab/g_meta_update.py +48 -0
  12. gwaslab/g_vchange_status_polars.py +44 -0
  13. gwaslab/g_version.py +2 -2
  14. gwaslab/hm_casting.py +169 -110
  15. gwaslab/hm_casting_polars.py +202 -0
  16. gwaslab/hm_harmonize_sumstats.py +19 -8
  17. gwaslab/io_load_ld.py +529 -0
  18. gwaslab/io_preformat_input.py +11 -0
  19. gwaslab/io_preformat_input_polars.py +632 -0
  20. gwaslab/io_process_args.py +25 -1
  21. gwaslab/io_read_ldsc.py +34 -3
  22. gwaslab/io_read_pipcs.py +62 -6
  23. gwaslab/prscs_gigrnd.py +122 -0
  24. gwaslab/prscs_mcmc_gtb.py +136 -0
  25. gwaslab/prscs_parse_genet.py +98 -0
  26. gwaslab/qc_build.py +53 -0
  27. gwaslab/qc_check_datatype.py +10 -8
  28. gwaslab/qc_check_datatype_polars.py +128 -0
  29. gwaslab/qc_fix_sumstats.py +25 -23
  30. gwaslab/qc_fix_sumstats_polars.py +193 -0
  31. gwaslab/util_ex_calculate_ldmatrix.py +49 -19
  32. gwaslab/util_ex_gwascatalog.py +71 -28
  33. gwaslab/util_ex_ldsc.py +67 -21
  34. gwaslab/util_ex_match_ldmatrix.py +396 -0
  35. gwaslab/util_ex_run_2samplemr.py +0 -2
  36. gwaslab/util_ex_run_ccgwas.py +155 -0
  37. gwaslab/util_ex_run_coloc.py +1 -1
  38. gwaslab/util_ex_run_hyprcoloc.py +117 -0
  39. gwaslab/util_ex_run_mesusie.py +155 -0
  40. gwaslab/util_ex_run_mtag.py +92 -0
  41. gwaslab/util_ex_run_prscs.py +85 -0
  42. gwaslab/util_ex_run_susie.py +40 -9
  43. gwaslab/util_in_estimate_ess.py +18 -0
  44. gwaslab/util_in_fill_data.py +20 -1
  45. gwaslab/util_in_filter_value.py +10 -5
  46. gwaslab/util_in_get_sig.py +71 -13
  47. gwaslab/util_in_meta.py +168 -4
  48. gwaslab/util_in_meta_polars.py +174 -0
  49. gwaslab/viz_plot_compare_effect.py +87 -23
  50. gwaslab/viz_plot_credible_sets.py +55 -11
  51. gwaslab/viz_plot_effect.py +22 -12
  52. gwaslab/viz_plot_miamiplot2.py +3 -2
  53. gwaslab/viz_plot_mqqplot.py +165 -141
  54. gwaslab/viz_plot_qqplot.py +6 -6
  55. gwaslab/viz_plot_regional2.py +5 -13
  56. gwaslab/viz_plot_rg_heatmap.py +6 -1
  57. gwaslab/viz_plot_stackedregional.py +21 -6
  58. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/METADATA +9 -7
  59. gwaslab-3.5.8.dist-info/RECORD +117 -0
  60. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/WHEEL +1 -1
  61. gwaslab-3.5.6.dist-info/RECORD +0 -96
  62. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE +0 -0
  63. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
  64. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/top_level.txt +0 -0
gwaslab/io_read_ldsc.py CHANGED
@@ -5,6 +5,7 @@ import numpy as np
 def read_ldsc(filelist=[],mode="h2"):
     #h2 mode
     #####################################################################
+    is_liab = False
     if mode=="h2":
         summary = pd.DataFrame(columns = ['Filename', 'h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se"])
 
@@ -18,6 +19,8 @@ def read_ldsc(filelist=[],mode="h2"):
            line=""
            while not re.compile('^Total Observed scale h2').match(line):
                line = file.readline()
+               if "h2_liab" in line:
+                   is_liab = True
                if not line: break
 
            try:
@@ -65,6 +68,11 @@ def read_ldsc(filelist=[],mode="h2"):
            #summary = summary.append(row,ignore_index=True)
            row = pd.DataFrame([row], columns = summary.columns)
            summary = pd.concat([summary, row], ignore_index=True)
+        if is_liab == True:
+            summary = summary.rename(columns={
+                "h2_obs":"h2_liab",
+                "h2_se":"h2_liab_se"
+            })
     ###############################################################################
     if mode=="rg":
         summary = pd.DataFrame(columns = ['p1',
@@ -76,7 +84,7 @@ def read_ldsc(filelist=[],mode="h2"):
                                           'h2_int','h2_int_se',
                                           'gcov_int','gcov_int_se']
                                          )
-
+
         for index, ldscfile in enumerate(filelist):
             print("Loading file "+str(index+1)+" :" + ldscfile +" ...")
 
@@ -87,6 +95,9 @@ def read_ldsc(filelist=[],mode="h2"):
            if not line: break
            line = file.readline() # header
 
+           if "h2_liab" in line:
+               is_liab = True
+
            line = file.readline() #line1
 
            ## first line h2 se
@@ -97,7 +108,12 @@ def read_ldsc(filelist=[],mode="h2"):
                summary = pd.concat([summary, row_series], ignore_index=True)
                line = file.readline()
         summary = summary.loc[summary["rg"]!="NA",:].copy()
-        summary[['rg','se' ,'z','p','h2_obs','h2_obs_se','h2_int','h2_int_se','gcov_int','gcov_int_se']] = summary[['rg','se' ,'z','p','h2_obs','h2_obs_se','h2_int','h2_int_se','gcov_int','gcov_int_se']].astype("float32")
+        summary[['rg','se' ,'z','p','h2_obs','h2_obs_se','h2_int','h2_int_se','gcov_int','gcov_int_se']] = summary[['rg','se' ,'z','p','h2_obs','h2_obs_se','h2_int','h2_int_se','gcov_int','gcov_int_se']].astype("float32")
+        if is_liab == True:
+            summary = summary.rename(columns={
+                "h2_obs":"h2_liab",
+                "h2_se":"h2_liab_se"
+            })
     return summary
 
 
@@ -198,7 +214,9 @@ def read_greml(filelist=[]):
     return summary
 
 def parse_ldsc_summary(ldsc_summary):
-
+    is_liab = False
+    if "Liability" in ldsc_summary:
+        is_liab = True
     lines = ldsc_summary.split("\n")
 
     columns = ['h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se","Catagories"]
@@ -257,9 +275,17 @@ def parse_ldsc_summary(ldsc_summary):
 
     #summary = summary.append(row,ignore_index=True)
     row = pd.DataFrame([row], columns = summary.columns)
+    if is_liab == True:
+        row = row.rename(columns={
+            "h2_obs":"h2_liab",
+            "h2_se":"h2_liab_se"
+        })
     return row
 
 def parse_partitioned_ldsc_summary(ldsc_summary):
+    is_liab = False
+    if "Liability" in ldsc_summary:
+        is_liab = True
     summary = pd.DataFrame(columns = ['h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se"])
     lines = ldsc_summary.split("\n")
     row={}
@@ -306,4 +332,9 @@ def parse_partitioned_ldsc_summary(ldsc_summary):
 
     #summary = summary.append(row,ignore_index=True)
     row = pd.DataFrame([row], columns = summary.columns)
+    if is_liab == True:
+        row = row.rename(columns={
+            "h2_obs":"h2_liab",
+            "h2_se":"h2_liab_se"
+        })
     return row
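For context, a minimal sketch of how the liability-scale handling surfaces to a caller; the file names are hypothetical, and the logs are assumed to have been produced by LDSC:

from gwaslab.io_read_ldsc import read_ldsc

# If the LDSC logs report liability-scale h2 ("h2_liab" / "Liability"),
# the returned DataFrame now carries h2_liab / h2_liab_se instead of
# h2_obs / h2_se.
summary = read_ldsc(filelist=["trait1.log", "trait2.log"], mode="h2")
print(summary.columns.tolist())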
gwaslab/io_read_pipcs.py CHANGED
@@ -2,17 +2,64 @@ import pandas as pd
 from gwaslab.g_Log import Log
 from gwaslab.qc_check_datatype import check_datatype
 from gwaslab.qc_check_datatype import check_dataframe_memory_usage
+import re
+import os
 
-def _read_pipcs(data, output_prefix, log=Log(),verbose=True):
+def _read_pipcs(data,
+                output_prefix,
+                study=None,
+                group=None,
+                studie_names=None,
+                log=Log(),
+                verbose=True):
+
     log.write("Start to load PIP and CREDIBLE_SET_INDEX from file...",verbose=verbose)
-    log.write(" -File:{}.pipcs".format(output_prefix),verbose=verbose)
+    log.write(" -File:{}".format(output_prefix),verbose=verbose)
+
+    if "@" in output_prefix:
+        log.write(" -Detected @ in path: load all matching pipcs files ...",verbose=verbose)
+        pipcs_path_list = []
+        pipcs_loci_list = []
+
+        dirname = os.path.dirname(output_prefix)
+        files = os.listdir(dirname)
+        target_file_name = os.path.basename(output_prefix).replace('@','([\w:_]+)')
+        for file in files:
+            if re.search(target_file_name, file) is not None:
+                pipcs_path_list.append(dirname+"/"+file)
+                pipcs_loci_list.append(re.search(target_file_name, file)[1])
 
-    pipcs = pd.read_csv("{}.pipcs".format(output_prefix))
+        pipcs_single_list=[]
+        for index,pipcs_path in enumerate(pipcs_path_list):
+            log.write(" -Loading {}:".format(pipcs_loci_list[index]) + pipcs_path)
+            pipcs_single = pd.read_csv(pipcs_path)
+            if "LOCUS" not in pipcs_single.columns:
+                pipcs_single["LOCUS"]=pipcs_loci_list[index]
+            pipcs_single_list.append(pipcs_single)
+
+        pipcs = pd.concat(pipcs_single_list, axis=0, ignore_index=True)
+    else:
+        pipcs = pd.read_csv("{}".format(output_prefix))
 
-    log.write(" -Merging CHR and POS from main dataframe...",verbose=verbose)
-    pipcs = _merge_chrpos(data,pipcs)
+    if "CHR" not in pipcs.columns:
+        log.write(" -Merging CHR and POS from main dataframe...",verbose=verbose)
+        pipcs = _merge_chrpos(data,pipcs)
+
+    pipcs = pipcs.rename(columns={
+        "cs":"CREDIBLE_SET_INDEX",
+        "variable_prob":"PIP",
+        "variable":"N_SNP"
+    })
 
     log.write(" -Current pipcs Dataframe shape :",len(pipcs)," x ", len(pipcs.columns),verbose=verbose)
+
+    if group is not None:
+        pipcs["GROUP"] = group
+    if study is not None:
+        pipcs["STUDY"] = study
+
+    pipcs = _process_pip(pipcs, group, studie_names)
+
     check_datatype(pipcs,log=log,verbose=verbose)
     check_dataframe_memory_usage(pipcs,log=log,verbose=verbose)
     log.write("Finished loading PIP and CREDIBLE_SET_INDEX from file!",verbose=verbose)
@@ -20,4 +67,13 @@ def _read_pipcs(data, output_prefix, log=Log(),verbose=True):
 
 def _merge_chrpos(data,pipcs):
     df = pd.merge(pipcs, data,on="SNPID",how="left")
-    return df
+    return df
+
+def _process_pip(pipcs, group, studie_names):
+    if group is not None and "PIP" not in pipcs.columns:
+        pipcs["PIP"] = pipcs[studie_names]
+
+    for i in pipcs["CS_CATEGORY"].dropna().unique():
+        print(i)
+        pipcs.loc[pipcs["CS_CATEGORY"]==i,"PIP"] = pipcs.loc[pipcs["CS_CATEGORY"]==i,i]
+    return pipcs
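A standalone sketch of the new "@" wildcard expansion, using a hypothetical directory layout; _read_pipcs applies this same regex to collect the per-locus .pipcs files and records each match as the LOCUS label:

import os, re

output_prefix = "finemap/locus_@.pipcs"   # hypothetical path; "@" stands in for the locus label
pattern = os.path.basename(output_prefix).replace('@', r'([\w:_]+)')
for f in os.listdir(os.path.dirname(output_prefix)):
    m = re.search(pattern, f)
    if m is not None:
        # e.g. "locus_1:12345_A_G.pipcs" -> LOCUS "1:12345_A_G"
        print(f, "-> LOCUS =", m[1])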
gwaslab/prscs_gigrnd.py ADDED
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+
+"""
+Random variate generator for the generalized inverse Gaussian distribution.
+Reference: L Devroye. Random variate generation for the generalized inverse Gaussian distribution.
+           Statistics and Computing, 24(2):239-246, 2014.
+
+"""
+
+
+import math
+from numpy import random
+
+
+def psi(x, alpha, lam):
+    f = -alpha*(math.cosh(x)-1.0)-lam*(math.exp(x)-x-1.0)
+    return f
+
+
+def dpsi(x, alpha, lam):
+    f = -alpha*math.sinh(x)-lam*(math.exp(x)-1.0)
+    return f
+
+
+def g(x, sd, td, f1, f2):
+    if (x >= -sd) and (x <= td):
+        f = 1.0
+    elif x > td:
+        f = f1
+    elif x < -sd:
+        f = f2
+
+    return f
+
+
+def gigrnd(p, a, b):
+    # setup -- sample from the two-parameter version gig(lam,omega)
+    p = float(p); a = float(a); b = float(b)
+    lam = p
+    omega = math.sqrt(a*b)
+
+    if lam < 0:
+        lam = -lam
+        swap = True
+    else:
+        swap = False
+
+    alpha = math.sqrt(math.pow(omega,2)+math.pow(lam,2))-lam
+
+    # find t
+    x = -psi(1.0, alpha, lam)
+    if (x >= 0.5) and (x <= 2.0):
+        t = 1.0
+    elif x > 2.0:
+        if (alpha == 0) and (lam == 0):
+            t = 1.0
+        else:
+            t = math.sqrt(2.0/(alpha+lam))
+    elif x < 0.5:
+        if (alpha == 0) and (lam == 0):
+            t = 1.0
+        else:
+            t = math.log(4.0/(alpha+2.0*lam))
+
+    # find s
+    x = -psi(-1.0, alpha, lam)
+    if (x >= 0.5) and (x <= 2.0):
+        s = 1.0
+    elif x > 2.0:
+        if (alpha == 0) and (lam == 0):
+            s = 1.0
+        else:
+            s = math.sqrt(4.0/(alpha*math.cosh(1)+lam))
+    elif x < 0.5:
+        if (alpha == 0) and (lam == 0):
+            s = 1.0
+        elif alpha == 0:
+            s = 1.0/lam
+        elif lam == 0:
+            s = math.log(1.0+1.0/alpha+math.sqrt(1.0/math.pow(alpha,2)+2.0/alpha))
+        else:
+            s = min(1.0/lam, math.log(1.0+1.0/alpha+math.sqrt(1.0/math.pow(alpha,2)+2.0/alpha)))
+
+    # find auxiliary parameters
+    eta = -psi(t, alpha, lam)
+    zeta = -dpsi(t, alpha, lam)
+    theta = -psi(-s, alpha, lam)
+    xi = dpsi(-s, alpha, lam)
+
+    p = 1.0/xi
+    r = 1.0/zeta
+
+    td = t-r*eta
+    sd = s-p*theta
+    q = td+sd
+
+    # random variate generation
+    while True:
+        U = random.random()
+        V = random.random()
+        W = random.random()
+        if U < q/(p+q+r):
+            rnd = -sd+q*V
+        elif U < (q+r)/(p+q+r):
+            rnd = td-r*math.log(V)
+        else:
+            rnd = -sd+p*math.log(V)
+
+        f1 = math.exp(-eta-zeta*(rnd-t))
+        f2 = math.exp(-theta+xi*(rnd+s))
+        if W*g(rnd, sd, td, f1, f2) <= math.exp(psi(rnd, alpha, lam)):
+            break
+
+    # transform back to the three-parameter version gig(p,a,b)
+    rnd = math.exp(rnd)*(lam/omega+math.sqrt(1.0+math.pow(lam,2)/math.pow(omega,2)))
+    if swap:
+        rnd = 1.0/rnd
+
+    rnd = rnd/math.sqrt(a/b)
+    return rnd
+
+
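A quick way to sanity-check the sampler (not part of the package; parameter values are arbitrary):

import numpy as np
from gwaslab.prscs_gigrnd import gigrnd

# Draw from GIG(p=0.5, a=2.0, b=1.0) and inspect the empirical moments;
# all draws should be strictly positive.
draws = np.array([gigrnd(0.5, 2.0, 1.0) for _ in range(5000)])
print(draws.min() > 0, draws.mean(), draws.var())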
gwaslab/prscs_mcmc_gtb.py ADDED
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+
+"""
+Markov Chain Monte Carlo (MCMC) sampler for polygenic prediction with continuous shrinkage (CS) priors.
+
+"""
+import numpy as np
+from scipy import linalg
+from numpy import random
+from gwaslab.prscs_gigrnd import gigrnd
+from gwaslab.g_Log import Log
+import time
+
+def mcmc(a, b, phi, sst_dict, n, ld_blk, blk_size, n_iter, n_burnin, thin, chrom, out_dir, beta_std, write_psi, write_pst, seed, log):
+    log.write('... MCMC ...')
+
+    # seed
+    if seed != None:
+        random.seed(seed)
+
+    # derived stats
+    beta_mrg = np.array(sst_dict['BETA'], ndmin=2).T
+    maf = np.array(sst_dict['MAF'], ndmin=2).T
+    n_pst = int((n_iter-n_burnin)/thin)
+    p = len(sst_dict['SNP'])
+    n_blk = len(ld_blk)
+
+    # initialization
+    beta = np.zeros((p,1))
+    psi = np.ones((p,1))
+    sigma = 1.0
+
+    if phi == None:
+        phi = 1.0; phi_updt = True
+    else:
+        phi_updt = False
+
+    if write_pst == 'TRUE':
+        beta_pst = np.zeros((p,n_pst))
+
+    beta_est = np.zeros((p,1))
+    psi_est = np.zeros((p,1))
+    sigma_est = 0.0
+    phi_est = 0.0
+
+    # MCMC
+    pp = 0
+    start_time = time.time()
+    for itr in range(1,n_iter+1):
+        if itr ==2:
+            loop_time = time.time() - start_time
+            log.write(" -Estimated time: {} mins".format((loop_time*n_iter)/60))
+
+        if itr % 100 == 0:
+            log.write('--- iter-' + str(itr) + ' ---')
+        elif itr % 100 > 2:
+            log.write('-', end="", show_time=False)
+        elif itr % 100 ==2:
+            log.write('-', end="")
+
+        mm = 0; quad = 0.0
+
+        for kk in range(n_blk):
+            if blk_size[kk] == 0:
+                continue
+            else:
+                idx_blk = range(mm,mm+blk_size[kk])
+                dinvt = ld_blk[kk]+np.diag(1.0/psi[idx_blk].to_series())
+                dinvt_chol = linalg.cholesky(dinvt)
+                beta_tmp = linalg.solve_triangular(dinvt_chol, beta_mrg[idx_blk], trans='T') + np.sqrt(sigma/n)*random.randn(len(idx_blk),1)
+                beta[idx_blk] = linalg.solve_triangular(dinvt_chol, beta_tmp, trans='N')
+                quad += np.dot(np.dot(beta[idx_blk].T, dinvt), beta[idx_blk])
+                mm += blk_size[kk]
+
+        err = max(n/2.0*(1.0-2.0*sum(beta*beta_mrg)+quad), n/2.0*sum(beta**2/psi))
+        sigma = 1.0/random.gamma((n+p)/2.0, 1.0/err)
+
+        delta = random.gamma(a+b, 1.0/(psi+phi))
+        for jj in range(p):
+            psi[jj] = gigrnd(a-0.5, 2.0*delta[jj], n*beta[jj]**2/sigma)
+        psi[psi>1] = 1.0
+
+        if phi_updt == True:
+            w = random.gamma(1.0, 1.0/(phi+1.0))
+            phi = random.gamma(p*b+0.5, 1.0/(sum(delta)+w))
+
+        # posterior
+        if (itr>n_burnin) and (itr % thin == 0):
+            beta_est = beta_est + beta/n_pst
+            psi_est = psi_est + psi/n_pst
+            sigma_est = sigma_est + sigma/n_pst
+            phi_est = phi_est + phi/n_pst
+
+            if write_pst == 'TRUE':
+                beta_pst[:,[pp]] = beta
+                pp += 1
+
+    # convert standardized beta to per-allele beta
+    if beta_std == 'FALSE':
+        beta_est /= np.sqrt(2.0*maf*(1.0-maf))
+
+        if write_pst == 'TRUE':
+            beta_pst /= np.sqrt(2.0*maf*(1.0-maf))
+
+    # write posterior effect sizes
+    if phi_updt == True:
+        eff_file = out_dir + '_pst_eff_a%d_b%.1f_phiauto_chr%d.txt' % (a, b, chrom)
+    else:
+        eff_file = out_dir + '_pst_eff_a%d_b%.1f_phi%1.0e_chr%d.txt' % (a, b, phi, chrom)
+
+    with open(eff_file, 'w') as ff:
+        if write_pst == 'TRUE':
+            for snp, bp, a1, a2, beta in zip(sst_dict['SNP'], sst_dict['BP'], sst_dict['A1'], sst_dict['A2'], beta_pst):
+                ff.write(('%d\t%s\t%d\t%s\t%s' + '\t%.6e'*n_pst + '\n') % (chrom, snp, bp, a1, a2, *beta))
+        else:
+            for snp, bp, a1, a2, beta in zip(sst_dict['SNP'], sst_dict['BP'], sst_dict['A1'], sst_dict['A2'], beta_est):
+                ff.write('%d\t%s\t%d\t%s\t%s\t%.6e\n' % (chrom, snp, bp, a1, a2, beta))
+
+    # write posterior estimates of psi
+    if write_psi == 'TRUE':
+        if phi_updt == True:
+            psi_file = out_dir + '_pst_psi_a%d_b%.1f_phiauto_chr%d.txt' % (a, b, chrom)
+        else:
+            psi_file = out_dir + '_pst_psi_a%d_b%.1f_phi%1.0e_chr%d.txt' % (a, b, phi, chrom)
+
+        with open(psi_file, 'w') as ff:
+            for snp, psi in zip(sst_dict['SNP'], psi_est):
+                ff.write('%s\t%.6e\n' % (snp, psi))
+
+    # print estimated phi
+    if phi_updt == True:
+        log.write('... Estimated global shrinkage parameter: %1.2e ...' % phi_est )
+
+    log.write('... Done ...')
+
+
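The heart of the Gibbs sweep above is the per-block draw of beta given psi and sigma, which factors D + diag(1/psi) by Cholesky and solves two triangular systems. A toy standalone illustration with an identity LD block and made-up marginal betas (values are arbitrary, chosen only to exercise the linear algebra):

import numpy as np
from scipy import linalg

ld = np.eye(3)                                  # toy 3-SNP LD block D
psi = np.full((3, 1), 0.5)                      # local shrinkage parameters
beta_mrg = np.array([[0.01], [0.02], [0.00]])   # standardized marginal betas
n, sigma = 10000, 1.0

dinvt = ld + np.diag(1.0 / psi[:, 0])           # D + diag(1/psi)
chol = linalg.cholesky(dinvt)                   # upper-triangular factor
tmp = linalg.solve_triangular(chol, beta_mrg, trans='T') \
      + np.sqrt(sigma / n) * np.random.randn(3, 1)
beta = linalg.solve_triangular(chol, tmp, trans='N')
print(beta.ravel())                             # one posterior draw for the block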
gwaslab/prscs_parse_genet.py ADDED
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+
+"""
+Parse the reference panel, summary statistics, and validation set.
+
+"""
+
+import os
+import numpy as np
+from scipy.stats import norm
+from scipy import linalg
+import h5py
+import pandas as pd
+
+
+def parse_ref(ref_file, chrom, log):
+    # ref_dict = {'CHR':[], 'SNP':[], 'BP':[], 'A1':[], 'A2':[], 'MAF':[]}
+    ref_dict = pd.read_csv(ref_file ,sep="\t")
+    ref_dict = ref_dict.loc[ref_dict["CHR"]==chrom,:]
+    return ref_dict
+
+
+def parse_bim(bim_file, chrom):
+    vld_dict = pd.read_csv(bim_file + '.bim' ,sep="\t", usecols=[1,3,4])
+    vld_dict.columns=["SNP","A1","A2"]
+    return vld_dict
+
+
+def parse_sumstats(ref_dict, vld_dict, sst_file, n_subj, log):
+    n_sqrt = np.sqrt(n_subj)
+
+    sst_file.dropna()
+    sst_file["CHR"] = sst_file["CHR"].astype("int64")
+    sst_file["BP"] = sst_file["BP"].astype("int64")
+    sst_file["EA"] = sst_file["EA"].astype("string")
+    sst_file["NEA"] = sst_file["NEA"].astype("string")
+
+    sst_file = pd.merge(sst_file, ref_dict, on=["SNP","CHR","BP"],how="inner")
+
+    is_flipped = ((sst_file["NEA"] == sst_file["A1"]) &(sst_file["EA"] == sst_file["A2"]))
+    is_valid = ((sst_file["EA"] == sst_file["A1"]) & (sst_file["NEA"] == sst_file["A2"]))| is_flipped
+
+    sst_file = sst_file.loc[is_valid,:]
+
+    sst_file.loc[is_flipped, "MAF"] = 1 - sst_file.loc[is_flipped, "MAF"]
+    sst_file["BETA"] = sst_file["BETA"] / sst_file["SE"] / n_sqrt
+
+    sst_file.loc[~is_flipped, "BETA"] = 1 * sst_file.loc[~is_flipped, "BETA"]
+    sst_file.loc[is_flipped, "BETA"] = -1 * sst_file.loc[is_flipped, "BETA"]
+
+    sst_file["FLP"] = 1
+    sst_file.loc[is_flipped, "FLP"] = -1
+    log.write(" -Number of common SNPs:{}".format(len(sst_file)))
+    sst_dict= sst_file[['CHR', 'SNP', 'BP', 'A1', 'A2', 'MAF', 'BETA', 'FLP']].to_dict("list")
+
+    return sst_dict
+
+
+def parse_ldblk(ldblk_dir, sst_dict, chrom, log):
+    log.write('... parse reference LD on chromosome %d ...' % chrom)
+
+    if '1kg' in os.path.basename(ldblk_dir):
+        chr_name = ldblk_dir + '/ldblk_1kg_chr' + str(chrom) + '.hdf5'
+    elif 'ukbb' in os.path.basename(ldblk_dir):
+        chr_name = ldblk_dir + '/ldblk_ukbb_chr' + str(chrom) + '.hdf5'
+
+    hdf_chr = h5py.File(chr_name, 'r')
+    n_blk = len(hdf_chr)
+    ld_blk = [np.array(hdf_chr['blk_'+str(blk)]['ldblk']) for blk in range(1,n_blk+1)]
+
+    snp_blk = []
+    for blk in range(1,n_blk+1):
+        snp_blk.append([bb.decode("UTF-8") for bb in list(hdf_chr['blk_'+str(blk)]['snplist'])])
+
+    blk_size = []
+    mm = 0
+    for blk in range(n_blk):
+        idx = [ii for (ii, snp) in enumerate(snp_blk[blk]) if snp in sst_dict['SNP']]
+        blk_size.append(len(idx))
+        if idx != []:
+            idx_blk = range(mm,mm+len(idx))
+            flip = [sst_dict['FLP'][jj] for jj in idx_blk]
+            ld_blk[blk] = ld_blk[blk][np.ix_(idx,idx)]*np.outer(flip,flip)
+
+            _, s, v = linalg.svd(ld_blk[blk])
+            h = np.dot(v.T, np.dot(np.diag(s), v))
+            ld_blk[blk] = (ld_blk[blk]+h)/2
+
+            mm += len(idx)
+        else:
+            ld_blk[blk] = np.array([])
+
+    return ld_blk, blk_size
+
+
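The allele matching in parse_sumstats keeps a variant only if its EA/NEA pair matches the reference A1/A2 either directly or flipped, mirroring MAF and negating BETA in the flipped case; a toy illustration with made-up rows:

import pandas as pd

df = pd.DataFrame({"EA":  ["A", "G", "A"], "NEA": ["G", "A", "C"],
                   "A1":  ["A", "A", "T"], "A2":  ["G", "G", "C"]})
is_flipped = (df["NEA"] == df["A1"]) & (df["EA"] == df["A2"])
is_valid = ((df["EA"] == df["A1"]) & (df["NEA"] == df["A2"])) | is_flipped
# row 0 matches directly, row 1 matches flipped, row 2 is dropped
print(df[is_valid], is_flipped[is_valid].tolist())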
gwaslab/qc_build.py ADDED
@@ -0,0 +1,53 @@
+import re
+import gc
+import pandas as pd
+import numpy as np
+from itertools import repeat
+from multiprocessing import Pool
+from liftover import get_lifter
+from liftover import ChainFile
+from functools import partial
+from gwaslab.g_vchange_status import vchange_status
+from gwaslab.g_Log import Log
+
+def _process_build(build, log, verbose):
+    if str(build).lower() in ["hg19","19","37","b37","grch37"]:
+        log.write(" -Genomic coordinates are based on GRCh37/hg19...", verbose=verbose)
+        final_build = "19"
+    elif str(build).lower() in ["hg18","18","36","b36","grch36"]:
+        log.write(" -Genomic coordinates are based on GRCh36/hg18...", verbose=verbose)
+        final_build = "18"
+    elif str(build).lower() in ["hg38","38","b38","grch38"]:
+        log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
+        final_build = "38"
+    elif str(build).lower() in ["t2t","hs1","chm13","13"]:
+        log.write(" -Genomic coordinates are based on T2T-CHM13...", verbose=verbose)
+        final_build = "13"
+    else:
+        log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
+        final_build = "99"
+    return final_build
+
+def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
+    build = _process_build(build,log=log,verbose=verbose)
+    sumstats[status] = vchange_status(sumstats[status], 1, "139",build[0]*3)
+    sumstats[status] = vchange_status(sumstats[status], 2, "89",build[1]*3)
+    return sumstats, build
+
+def _check_build(target_build, build="99", status="STATUS",verbose=True,log=Log()):
+    target_build = _process_build(target_build,log=log,verbose=verbose)
+    build = _process_build(build,log=log,verbose=verbose)
+    if build == "99":
+        raise ValueError("Sumstats build is unknown. Please run infer_build() or set_build()")
+
+    if target_build == "99":
+        raise ValueError("Target build is unknown.")
+
+    if build!=target_build:
+        raise ValueError("Please make sure sumstats build is {}".format(target_build))
+    else:
+        log.write(" -Sumstats build matches target build")
+
+    return True
+
+
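A short sketch of how the build labels normalise (any unrecognised label maps to "99"):

from gwaslab.qc_build import _process_build
from gwaslab.g_Log import Log

print(_process_build("GRCh38", Log(), True))   # -> "38"
print(_process_build("hg19",   Log(), True))   # -> "19"
print(_process_build("banana", Log(), True))   # -> "99" (unknown build)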
gwaslab/qc_check_datatype.py CHANGED
@@ -7,10 +7,10 @@ from gwaslab.g_Log import Log
 
 dtype_dict ={
     "SNPID":["string","object"],
-    "rsID":["string","object"],
-    "CHR":["Int64","int64","int32","Int32","int"],
-    "POS":["int64","Int64"],
-    "EA":["category"],
+    "rsID": ["string","object"],
+    "CHR": ["Int64","int64","int32","Int32","int"],
+    "POS": ["int64","Int64"],
+    "EA": ["category"],
     "NEA":["category"],
     "REF":["category"],
     "ALT":["category"],
@@ -35,7 +35,7 @@ dtype_dict ={
     "TEST":["string","object","category"],
     "CHISQ":["float64"],
     "I2":["float64"],
-    "PHET":["float64"],
+    "P_HET":["float64"],
     "SNPR2":["float64"],
     "EAF":["float64","float","float32"],
     "NEAF":["float64","float","float32"],
@@ -48,7 +48,11 @@ dtype_dict ={
     'CREDIBLE_SET_INDEX':["Int64","int64","int32","Int32","int"],
     'N_SNP' :["Int64","int64","int32","Int32","int"],
     'LOCUS' :["string","object","category"],
-    'STUDY' :["string","object","category"]
+    'STUDY' :["string","object","category"],
+    'BETA_RANDOM' :["float64"],
+    'SE_RANDOM' :["float64"],
+    'Z_RANDOM' :["float64"],
+    'P_RANDOM' :["float64"]
 }
 
 def check_datatype(sumstats, verbose=True, log=Log()):
@@ -108,8 +112,6 @@ def quick_convert_datatype(sumstats, log, verbose):
         pass
     return sumstats
 
-
-
 def check_dataframe_shape(sumstats, log, verbose):
     memory_in_mb = sumstats.memory_usage().sum()/1024/1024
     try:
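A sketch of what the renamed P_HET entry implies for callers (illustrative data only):

import pandas as pd
from gwaslab.qc_check_datatype import check_datatype, dtype_dict
from gwaslab.g_Log import Log

# Heterogeneity p-values are now expected under "P_HET", not "PHET".
df = pd.DataFrame({"SNPID": pd.array(["1:100_A_G"], dtype="string"),
                   "P_HET": pd.array([0.5], dtype="float64")})
print("P_HET" in dtype_dict)       # True in 3.5.8
check_datatype(df, verbose=True, log=Log())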