gwaslab 3.5.6__py3-none-any.whl → 3.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/__init__.py +2 -0
- gwaslab/bd_common_data.py +1 -0
- gwaslab/bd_get_hapmap3.py +0 -1
- gwaslab/data/formatbook.json +78 -0
- gwaslab/g_Sumstats.py +98 -24
- gwaslab/g_SumstatsMulti.py +287 -0
- gwaslab/g_SumstatsPair.py +101 -16
- gwaslab/g_Sumstats_polars.py +245 -0
- gwaslab/g_headers.py +12 -3
- gwaslab/g_meta.py +123 -47
- gwaslab/g_meta_update.py +48 -0
- gwaslab/g_vchange_status_polars.py +44 -0
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +169 -110
- gwaslab/hm_casting_polars.py +202 -0
- gwaslab/hm_harmonize_sumstats.py +19 -8
- gwaslab/io_load_ld.py +529 -0
- gwaslab/io_preformat_input.py +11 -0
- gwaslab/io_preformat_input_polars.py +632 -0
- gwaslab/io_process_args.py +25 -1
- gwaslab/io_read_ldsc.py +34 -3
- gwaslab/io_read_pipcs.py +62 -6
- gwaslab/prscs_gigrnd.py +122 -0
- gwaslab/prscs_mcmc_gtb.py +136 -0
- gwaslab/prscs_parse_genet.py +98 -0
- gwaslab/qc_build.py +53 -0
- gwaslab/qc_check_datatype.py +10 -8
- gwaslab/qc_check_datatype_polars.py +128 -0
- gwaslab/qc_fix_sumstats.py +25 -23
- gwaslab/qc_fix_sumstats_polars.py +193 -0
- gwaslab/util_ex_calculate_ldmatrix.py +49 -19
- gwaslab/util_ex_gwascatalog.py +71 -28
- gwaslab/util_ex_ldsc.py +67 -21
- gwaslab/util_ex_match_ldmatrix.py +396 -0
- gwaslab/util_ex_run_2samplemr.py +0 -2
- gwaslab/util_ex_run_ccgwas.py +155 -0
- gwaslab/util_ex_run_coloc.py +1 -1
- gwaslab/util_ex_run_hyprcoloc.py +117 -0
- gwaslab/util_ex_run_mesusie.py +155 -0
- gwaslab/util_ex_run_mtag.py +92 -0
- gwaslab/util_ex_run_prscs.py +85 -0
- gwaslab/util_ex_run_susie.py +40 -9
- gwaslab/util_in_estimate_ess.py +18 -0
- gwaslab/util_in_fill_data.py +20 -1
- gwaslab/util_in_filter_value.py +10 -5
- gwaslab/util_in_get_sig.py +71 -13
- gwaslab/util_in_meta.py +168 -4
- gwaslab/util_in_meta_polars.py +174 -0
- gwaslab/viz_plot_compare_effect.py +87 -23
- gwaslab/viz_plot_credible_sets.py +55 -11
- gwaslab/viz_plot_effect.py +22 -12
- gwaslab/viz_plot_miamiplot2.py +3 -2
- gwaslab/viz_plot_mqqplot.py +165 -141
- gwaslab/viz_plot_qqplot.py +6 -6
- gwaslab/viz_plot_regional2.py +5 -13
- gwaslab/viz_plot_rg_heatmap.py +6 -1
- gwaslab/viz_plot_stackedregional.py +21 -6
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/METADATA +9 -7
- gwaslab-3.5.8.dist-info/RECORD +117 -0
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/WHEEL +1 -1
- gwaslab-3.5.6.dist-info/RECORD +0 -96
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE +0 -0
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/top_level.txt +0 -0
gwaslab/io_read_ldsc.py
CHANGED
|
@@ -5,6 +5,7 @@ import numpy as np
|
|
|
5
5
|
def read_ldsc(filelist=[],mode="h2"):
|
|
6
6
|
#h2 mode
|
|
7
7
|
#####################################################################
|
|
8
|
+
is_liab = False
|
|
8
9
|
if mode=="h2":
|
|
9
10
|
summary = pd.DataFrame(columns = ['Filename', 'h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se"])
|
|
10
11
|
|
|
@@ -18,6 +19,8 @@ def read_ldsc(filelist=[],mode="h2"):
|
|
|
18
19
|
line=""
|
|
19
20
|
while not re.compile('^Total Observed scale h2').match(line):
|
|
20
21
|
line = file.readline()
|
|
22
|
+
if "h2_liab" in line:
|
|
23
|
+
is_liab = True
|
|
21
24
|
if not line: break
|
|
22
25
|
|
|
23
26
|
try:
|
|
@@ -65,6 +68,11 @@ def read_ldsc(filelist=[],mode="h2"):
|
|
|
65
68
|
#summary = summary.append(row,ignore_index=True)
|
|
66
69
|
row = pd.DataFrame([row], columns = summary.columns)
|
|
67
70
|
summary = pd.concat([summary, row], ignore_index=True)
|
|
71
|
+
if is_liab == True:
|
|
72
|
+
summary = summary.rename(columns={
|
|
73
|
+
"h2_obs":"h2_liab",
|
|
74
|
+
"h2_se":"h2_liab_se"
|
|
75
|
+
})
|
|
68
76
|
###############################################################################
|
|
69
77
|
if mode=="rg":
|
|
70
78
|
summary = pd.DataFrame(columns = ['p1',
|
|
@@ -76,7 +84,7 @@ def read_ldsc(filelist=[],mode="h2"):
|
|
|
76
84
|
'h2_int','h2_int_se',
|
|
77
85
|
'gcov_int','gcov_int_se']
|
|
78
86
|
)
|
|
79
|
-
|
|
87
|
+
|
|
80
88
|
for index, ldscfile in enumerate(filelist):
|
|
81
89
|
print("Loading file "+str(index+1)+" :" + ldscfile +" ...")
|
|
82
90
|
|
|
@@ -87,6 +95,9 @@ def read_ldsc(filelist=[],mode="h2"):
|
|
|
87
95
|
if not line: break
|
|
88
96
|
line = file.readline() # header
|
|
89
97
|
|
|
98
|
+
if "h2_liab" in line:
|
|
99
|
+
is_liab = True
|
|
100
|
+
|
|
90
101
|
line = file.readline() #line1
|
|
91
102
|
|
|
92
103
|
## first line h2 se
|
|
@@ -97,7 +108,12 @@ def read_ldsc(filelist=[],mode="h2"):
|
|
|
97
108
|
summary = pd.concat([summary, row_series], ignore_index=True)
|
|
98
109
|
line = file.readline()
|
|
99
110
|
summary = summary.loc[summary["rg"]!="NA",:].copy()
|
|
100
|
-
summary[['rg','se' ,'z','p','h2_obs','h2_obs_se','h2_int','h2_int_se','gcov_int','gcov_int_se']] = summary[['rg','se' ,'z','p','h2_obs','h2_obs_se','h2_int','h2_int_se','gcov_int','gcov_int_se']].astype("float32")
|
|
111
|
+
summary[['rg','se' ,'z','p','h2_obs','h2_obs_se','h2_int','h2_int_se','gcov_int','gcov_int_se']] = summary[['rg','se' ,'z','p','h2_obs','h2_obs_se','h2_int','h2_int_se','gcov_int','gcov_int_se']].astype("float32")
|
|
112
|
+
if is_liab == True:
|
|
113
|
+
summary = summary.rename(columns={
|
|
114
|
+
"h2_obs":"h2_liab",
|
|
115
|
+
"h2_se":"h2_liab_se"
|
|
116
|
+
})
|
|
101
117
|
return summary
|
|
102
118
|
|
|
103
119
|
|
|
@@ -198,7 +214,9 @@ def read_greml(filelist=[]):
|
|
|
198
214
|
return summary
|
|
199
215
|
|
|
200
216
|
def parse_ldsc_summary(ldsc_summary):
|
|
201
|
-
|
|
217
|
+
is_liab = False
|
|
218
|
+
if "Liability" in ldsc_summary:
|
|
219
|
+
is_liab = True
|
|
202
220
|
lines = ldsc_summary.split("\n")
|
|
203
221
|
|
|
204
222
|
columns = ['h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se","Catagories"]
|
|
@@ -257,9 +275,17 @@ def parse_ldsc_summary(ldsc_summary):
|
|
|
257
275
|
|
|
258
276
|
#summary = summary.append(row,ignore_index=True)
|
|
259
277
|
row = pd.DataFrame([row], columns = summary.columns)
|
|
278
|
+
if is_liab == True:
|
|
279
|
+
row = row.rename(columns={
|
|
280
|
+
"h2_obs":"h2_liab",
|
|
281
|
+
"h2_se":"h2_liab_se"
|
|
282
|
+
})
|
|
260
283
|
return row
|
|
261
284
|
|
|
262
285
|
def parse_partitioned_ldsc_summary(ldsc_summary):
|
|
286
|
+
is_liab = False
|
|
287
|
+
if "Liability" in ldsc_summary:
|
|
288
|
+
is_liab = True
|
|
263
289
|
summary = pd.DataFrame(columns = ['h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se"])
|
|
264
290
|
lines = ldsc_summary.split("\n")
|
|
265
291
|
row={}
|
|
@@ -306,4 +332,9 @@ def parse_partitioned_ldsc_summary(ldsc_summary):
|
|
|
306
332
|
|
|
307
333
|
#summary = summary.append(row,ignore_index=True)
|
|
308
334
|
row = pd.DataFrame([row], columns = summary.columns)
|
|
335
|
+
if is_liab == True:
|
|
336
|
+
row = row.rename(columns={
|
|
337
|
+
"h2_obs":"h2_liab",
|
|
338
|
+
"h2_se":"h2_liab_se"
|
|
339
|
+
})
|
|
309
340
|
return row
|
gwaslab/io_read_pipcs.py
CHANGED
|
@@ -2,17 +2,64 @@ import pandas as pd
|
|
|
2
2
|
from gwaslab.g_Log import Log
|
|
3
3
|
from gwaslab.qc_check_datatype import check_datatype
|
|
4
4
|
from gwaslab.qc_check_datatype import check_dataframe_memory_usage
|
|
5
|
+
import re
|
|
6
|
+
import os
|
|
5
7
|
|
|
6
|
-
def _read_pipcs(data,
|
|
8
|
+
def _read_pipcs(data,
|
|
9
|
+
output_prefix,
|
|
10
|
+
study=None,
|
|
11
|
+
group=None,
|
|
12
|
+
studie_names=None,
|
|
13
|
+
log=Log(),
|
|
14
|
+
verbose=True):
|
|
15
|
+
|
|
7
16
|
log.write("Start to load PIP and CREDIBLE_SET_INDEX from file...",verbose=verbose)
|
|
8
|
-
log.write(" -File:{}
|
|
17
|
+
log.write(" -File:{}".format(output_prefix),verbose=verbose)
|
|
18
|
+
|
|
19
|
+
if "@" in output_prefix:
|
|
20
|
+
log.write(" -Detected @ in path: load all matching pipcs files ...",verbose=verbose)
|
|
21
|
+
pipcs_path_list = []
|
|
22
|
+
pipcs_loci_list = []
|
|
23
|
+
|
|
24
|
+
dirname = os.path.dirname(output_prefix)
|
|
25
|
+
files = os.listdir(dirname)
|
|
26
|
+
target_file_name = os.path.basename(output_prefix).replace('@','([\w:_]+)')
|
|
27
|
+
for file in files:
|
|
28
|
+
if re.search(target_file_name, file) is not None:
|
|
29
|
+
pipcs_path_list.append(dirname+"/"+file)
|
|
30
|
+
pipcs_loci_list.append(re.search(target_file_name, file)[1])
|
|
9
31
|
|
|
10
|
-
|
|
32
|
+
pipcs_single_list=[]
|
|
33
|
+
for index,pipcs_path in enumerate(pipcs_path_list):
|
|
34
|
+
log.write(" -Loading {}:".format(pipcs_loci_list[index]) + pipcs_path)
|
|
35
|
+
pipcs_single = pd.read_csv(pipcs_path)
|
|
36
|
+
if "LOCUS" not in pipcs_single.columns:
|
|
37
|
+
pipcs_single["LOCUS"]=pipcs_loci_list[index]
|
|
38
|
+
pipcs_single_list.append(pipcs_single)
|
|
39
|
+
|
|
40
|
+
pipcs = pd.concat(pipcs_single_list, axis=0, ignore_index=True)
|
|
41
|
+
else:
|
|
42
|
+
pipcs = pd.read_csv("{}".format(output_prefix))
|
|
11
43
|
|
|
12
|
-
|
|
13
|
-
|
|
44
|
+
if "CHR" not in pipcs.columns:
|
|
45
|
+
log.write(" -Merging CHR and POS from main dataframe...",verbose=verbose)
|
|
46
|
+
pipcs = _merge_chrpos(data,pipcs)
|
|
47
|
+
|
|
48
|
+
pipcs = pipcs.rename(columns={
|
|
49
|
+
"cs":"CREDIBLE_SET_INDEX",
|
|
50
|
+
"variable_prob":"PIP",
|
|
51
|
+
"variable":"N_SNP"
|
|
52
|
+
})
|
|
14
53
|
|
|
15
54
|
log.write(" -Current pipcs Dataframe shape :",len(pipcs)," x ", len(pipcs.columns),verbose=verbose)
|
|
55
|
+
|
|
56
|
+
if group is not None:
|
|
57
|
+
pipcs["GROUP"] = group
|
|
58
|
+
if study is not None:
|
|
59
|
+
pipcs["STUDY"] = study
|
|
60
|
+
|
|
61
|
+
pipcs = _process_pip(pipcs, group, studie_names)
|
|
62
|
+
|
|
16
63
|
check_datatype(pipcs,log=log,verbose=verbose)
|
|
17
64
|
check_dataframe_memory_usage(pipcs,log=log,verbose=verbose)
|
|
18
65
|
log.write("Finished loading PIP and CREDIBLE_SET_INDEX from file!",verbose=verbose)
|
|
@@ -20,4 +67,13 @@ def _read_pipcs(data, output_prefix, log=Log(),verbose=True):
|
|
|
20
67
|
|
|
21
68
|
def _merge_chrpos(data,pipcs):
|
|
22
69
|
df = pd.merge(pipcs, data,on="SNPID",how="left")
|
|
23
|
-
return df
|
|
70
|
+
return df
|
|
71
|
+
|
|
72
|
+
def _process_pip(pipcs, group, studie_names):
|
|
73
|
+
if group is not None and "PIP" not in pipcs.columns:
|
|
74
|
+
pipcs["PIP"] = pipcs[studie_names]
|
|
75
|
+
|
|
76
|
+
for i in pipcs["CS_CATEGORY"].dropna().unique():
|
|
77
|
+
print(i)
|
|
78
|
+
pipcs.loc[pipcs["CS_CATEGORY"]==i,"PIP"] = pipcs.loc[pipcs["CS_CATEGORY"]==i,i]
|
|
79
|
+
return pipcs
|
gwaslab/prscs_gigrnd.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Random variate generator for the generalized inverse Gaussian distribution.
|
|
5
|
+
Reference: L Devroye. Random variate generation for the generalized inverse Gaussian distribution.
|
|
6
|
+
Statistics and Computing, 24(2):239-246, 2014.
|
|
7
|
+
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
import math
|
|
12
|
+
from numpy import random
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def psi(x, alpha, lam):
    """Evaluate ``-alpha*(cosh(x)-1) - lam*(exp(x)-x-1)``.

    Helper of the generalized inverse Gaussian sampler in this module; the
    value is 0 at ``x == 0``.
    """
    cosh_part = -alpha*(math.cosh(x)-1.0)
    exp_part = lam*(math.exp(x)-x-1.0)
    return cosh_part-exp_part
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def dpsi(x, alpha, lam):
    """Evaluate ``-alpha*sinh(x) - lam*(exp(x)-1)``.

    This is the derivative with respect to ``x`` of the expression computed
    by ``psi`` in this module; the value is 0 at ``x == 0``.
    """
    sinh_part = -alpha*math.sinh(x)
    exp_part = lam*(math.exp(x)-1.0)
    return sinh_part-exp_part
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def g(x, sd, td, f1, f2):
    """Pick the piecewise factor for ``x`` relative to the interval ``[-sd, td]``.

    Parameters
    ----------
    x : float
        Point being classified.
    sd, td : float
        Interval bounds; the interval is ``-sd <= x <= td``.
    f1 : float
        Value returned when ``x > td``.
    f2 : float
        Value returned when ``x < -sd``.

    Returns
    -------
    float
        ``1.0`` inside the interval, otherwise ``f1`` or ``f2``.

    Raises
    ------
    ValueError
        If none of the comparisons holds (for example ``x`` is NaN).  The
        previous implementation reached ``return f`` with ``f`` unassigned
        and failed with an opaque ``UnboundLocalError`` in that case.
    """
    if (x >= -sd) and (x <= td):
        return 1.0
    if x > td:
        return f1
    if x < -sd:
        return f2
    raise ValueError(
        "g(): x={!r} is not comparable to the interval [-sd, td] = [{!r}, {!r}]".format(x, -sd, td)
    )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def gigrnd(p, a, b):
    """Draw one random variate from the three-parameter distribution gig(p, a, b).

    The draw is produced by rejection sampling on the two-parameter form
    gig(lam, omega) with ``lam = |p|`` and ``omega = sqrt(a*b)`` and is then
    transformed back (see the Devroye reference in the module docstring).

    Parameters
    ----------
    p, a, b : float-convertible
        Parameters of the distribution; each is passed through ``float()``.

    Returns
    -------
    float
        The sampled value.
    """
    # setup -- work with the two-parameter version gig(lam, omega)
    lam = float(p)
    a = float(a)
    b = float(b)
    omega = math.sqrt(a*b)

    # a negative order is sampled with |lam| and the draw is inverted at the end
    swap = lam < 0
    if swap:
        lam = -lam

    alpha = math.sqrt(math.pow(omega,2)+math.pow(lam,2))-lam
    both_zero = (alpha == 0) and (lam == 0)

    # find t
    x = -psi(1.0, alpha, lam)
    if 0.5 <= x <= 2.0:
        t = 1.0
    elif x > 2.0:
        t = 1.0 if both_zero else math.sqrt(2.0/(alpha+lam))
    elif x < 0.5:
        t = 1.0 if both_zero else math.log(4.0/(alpha+2.0*lam))

    # find s
    x = -psi(-1.0, alpha, lam)
    if 0.5 <= x <= 2.0:
        s = 1.0
    elif x > 2.0:
        s = 1.0 if both_zero else math.sqrt(4.0/(alpha*math.cosh(1)+lam))
    elif x < 0.5:
        if both_zero:
            s = 1.0
        elif alpha == 0:
            s = 1.0/lam
        else:
            log_bound = math.log(1.0+1.0/alpha+math.sqrt(1.0/math.pow(alpha,2)+2.0/alpha))
            s = log_bound if lam == 0 else min(1.0/lam, log_bound)

    # auxiliary parameters of the three-piece envelope
    eta = -psi(t, alpha, lam)
    zeta = -dpsi(t, alpha, lam)
    theta = -psi(-s, alpha, lam)
    xi = dpsi(-s, alpha, lam)

    left_scale = 1.0/xi
    right_scale = 1.0/zeta

    td = t-right_scale*eta
    sd = s-left_scale*theta
    q = td+sd

    # random variate generation: propose from one of the three pieces, then
    # accept or reject; three uniforms are consumed per attempt
    while True:
        u = random.random()
        v = random.random()
        w = random.random()
        if u < q/(left_scale+q+right_scale):
            draw = -sd+q*v
        elif u < (q+right_scale)/(left_scale+q+right_scale):
            draw = td-right_scale*math.log(v)
        else:
            draw = -sd+left_scale*math.log(v)

        f1 = math.exp(-eta-zeta*(draw-t))
        f2 = math.exp(-theta+xi*(draw+s))
        if w*g(draw, sd, td, f1, f2) <= math.exp(psi(draw, alpha, lam)):
            break

    # transform back to the three-parameter version gig(p, a, b)
    draw = math.exp(draw)*(lam/omega+math.sqrt(1.0+math.pow(lam,2)/math.pow(omega,2)))
    if swap:
        draw = 1.0/draw

    return draw/math.sqrt(a/b)
|
|
121
|
+
|
|
122
|
+
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Markov Chain Monte Carlo (MCMC) sampler for polygenic prediction with continuous shrinkage (CS) priors.
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
import numpy as np
|
|
8
|
+
from scipy import linalg
|
|
9
|
+
from numpy import random
|
|
10
|
+
from gwaslab.prscs_gigrnd import gigrnd
|
|
11
|
+
from gwaslab.g_Log import Log
|
|
12
|
+
import time
|
|
13
|
+
def mcmc(a, b, phi, sst_dict, n, ld_blk, blk_size, n_iter, n_burnin, thin, chrom, out_dir, beta_std, write_psi, write_pst, seed, log):
    """Run the continuous-shrinkage MCMC sampler and write posterior estimates.

    Parameters
    ----------
    a, b : float
        Shape parameters used for the ``delta`` / ``psi`` updates.
    phi : float or None
        Global shrinkage parameter.  ``None`` starts from 1.0 and re-samples
        it every iteration; any other value is kept fixed.
    sst_dict : dict of lists
        Needs the keys ``SNP``, ``BP``, ``A1``, ``A2``, ``MAF`` and ``BETA``.
    n : number
        Sample size used to scale the updates.
    ld_blk : list of 2-D arrays
        LD matrix per block.
    blk_size : list of int
        Number of SNPs per block; blocks of size 0 are skipped.
    n_iter, n_burnin, thin : int
        Total iterations, burn-in iterations and thinning interval.
        ``int((n_iter-n_burnin)/thin)`` posterior draws are averaged.
    chrom : int
        Chromosome, used in the output file names and rows.
    out_dir : str
        Prefix of the output file paths.
    beta_std : str
        The string ``'FALSE'`` rescales effects by ``sqrt(2*maf*(1-maf))``.
    write_psi, write_pst : str
        The string ``'TRUE'`` enables writing the psi estimates / writing every
        retained posterior draw instead of the posterior mean.
    seed : int or None
        Seed for ``numpy.random``; ``None`` leaves the generator untouched.
    log : object
        Logger exposing ``write``.

    Returns
    -------
    None
        Results are written to ``out_dir + '_pst_eff_...txt'`` (and optionally
        ``out_dir + '_pst_psi_...txt'``).
    """
    log.write('... MCMC ...')

    # seed
    if seed is not None:
        random.seed(seed)

    # derived stats
    beta_mrg = np.array(sst_dict['BETA'], ndmin=2).T
    maf = np.array(sst_dict['MAF'], ndmin=2).T
    n_pst = int((n_iter-n_burnin)/thin)
    p = len(sst_dict['SNP'])
    n_blk = len(ld_blk)

    # initialization
    beta = np.zeros((p,1))
    psi = np.ones((p,1))
    sigma = 1.0

    if phi is None:
        phi = 1.0
        phi_updt = True
    else:
        phi_updt = False

    if write_pst == 'TRUE':
        beta_pst = np.zeros((p,n_pst))

    beta_est = np.zeros((p,1))
    psi_est = np.zeros((p,1))
    sigma_est = 0.0
    phi_est = 0.0

    # MCMC
    pp = 0
    start_time = time.time()
    for itr in range(1,n_iter+1):
        # after the first full iteration, extrapolate the total run time
        if itr == 2:
            loop_time = time.time() - start_time
            log.write(" -Estimated time: {} mins".format((loop_time*n_iter)/60))

        # progress bar: one dash per iteration, a labelled line every 100
        if itr % 100 == 0:
            log.write('--- iter-' + str(itr) + ' ---')
        elif itr % 100 > 2:
            log.write('-', end="", show_time=False)
        elif itr % 100 == 2:
            log.write('-', end="")

        mm = 0
        quad = 0.0

        for kk in range(n_blk):
            if blk_size[kk] == 0:
                continue

            idx_blk = range(mm,mm+blk_size[kk])
            # psi is a (p, 1) NumPy array: flatten the block's column to 1-D so
            # np.diag builds a diagonal matrix.  (The previous code called
            # .to_series(), which ndarray does not provide.)
            dinvt = ld_blk[kk]+np.diag(1.0/psi[idx_blk].T[0])
            dinvt_chol = linalg.cholesky(dinvt)
            beta_tmp = linalg.solve_triangular(dinvt_chol, beta_mrg[idx_blk], trans='T') + np.sqrt(sigma/n)*random.randn(len(idx_blk),1)
            beta[idx_blk] = linalg.solve_triangular(dinvt_chol, beta_tmp, trans='N')
            quad += np.dot(np.dot(beta[idx_blk].T, dinvt), beta[idx_blk])
            mm += blk_size[kk]

        err = max(n/2.0*(1.0-2.0*sum(beta*beta_mrg)+quad), n/2.0*sum(beta**2/psi))

        sigma = 1.0/random.gamma((n+p)/2.0, 1.0/err)
        delta = random.gamma(a+b, 1.0/(psi+phi))
        for jj in range(p):
            psi[jj] = gigrnd(a-0.5, 2.0*delta[jj], n*beta[jj]**2/sigma)
        psi[psi>1] = 1.0

        if phi_updt:
            w = random.gamma(1.0, 1.0/(phi+1.0))
            phi = random.gamma(p*b+0.5, 1.0/(sum(delta)+w))

        # posterior
        if (itr>n_burnin) and (itr % thin == 0):
            beta_est = beta_est + beta/n_pst
            psi_est = psi_est + psi/n_pst
            sigma_est = sigma_est + sigma/n_pst
            phi_est = phi_est + phi/n_pst

            if write_pst == 'TRUE':
                beta_pst[:,[pp]] = beta
                pp += 1

    # convert standardized beta to per-allele beta
    if beta_std == 'FALSE':
        beta_est /= np.sqrt(2.0*maf*(1.0-maf))

        if write_pst == 'TRUE':
            beta_pst /= np.sqrt(2.0*maf*(1.0-maf))

    # write posterior effect sizes
    if phi_updt:
        eff_file = out_dir + '_pst_eff_a%d_b%.1f_phiauto_chr%d.txt' % (a, b, chrom)
    else:
        eff_file = out_dir + '_pst_eff_a%d_b%.1f_phi%1.0e_chr%d.txt' % (a, b, phi, chrom)

    with open(eff_file, 'w') as ff:
        if write_pst == 'TRUE':
            for snp, bp, a1, a2, beta_row in zip(sst_dict['SNP'], sst_dict['BP'], sst_dict['A1'], sst_dict['A2'], beta_pst):
                ff.write(('%d\t%s\t%d\t%s\t%s' + '\t%.6e'*n_pst + '\n') % (chrom, snp, bp, a1, a2, *beta_row))
        else:
            # each row of beta_est is a length-1 array; take the scalar explicitly
            for snp, bp, a1, a2, beta_row in zip(sst_dict['SNP'], sst_dict['BP'], sst_dict['A1'], sst_dict['A2'], beta_est):
                ff.write('%d\t%s\t%d\t%s\t%s\t%.6e\n' % (chrom, snp, bp, a1, a2, beta_row[0]))

    # write posterior estimates of psi
    if write_psi == 'TRUE':
        if phi_updt:
            psi_file = out_dir + '_pst_psi_a%d_b%.1f_phiauto_chr%d.txt' % (a, b, chrom)
        else:
            psi_file = out_dir + '_pst_psi_a%d_b%.1f_phi%1.0e_chr%d.txt' % (a, b, phi, chrom)

        with open(psi_file, 'w') as ff:
            for snp, psi_row in zip(sst_dict['SNP'], psi_est):
                ff.write('%s\t%.6e\n' % (snp, psi_row[0]))

    # print estimated phi
    if phi_updt:
        log.write('... Estimated global shrinkage parameter: %1.2e ...' % phi_est )

    log.write('... Done ...')
|
|
135
|
+
|
|
136
|
+
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Parse the reference panel, summary statistics, and validation set.
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import numpy as np
|
|
10
|
+
from scipy.stats import norm
|
|
11
|
+
from scipy import linalg
|
|
12
|
+
import h5py
|
|
13
|
+
import pandas as pd
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def parse_ref(ref_file, chrom, log):
    """Read a tab-separated reference table and keep the rows of one chromosome.

    Parameters
    ----------
    ref_file : str or file-like
        Tab-separated table with a header row that includes ``CHR``.
    chrom : int
        Rows whose ``CHR`` equals this value are kept.
    log : object
        Accepted for a uniform parser signature; not used here.

    Returns
    -------
    pandas.DataFrame
        The matching rows with all columns of the input table.
    """
    reference = pd.read_csv(ref_file, sep="\t")
    on_chrom = reference["CHR"] == chrom
    return reference.loc[on_chrom, :]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def parse_bim(bim_file, chrom):
    """Read variant IDs and alleles from a PLINK ``.bim`` file.

    A ``.bim`` file has no header row and six tab-separated fields per line:
    chromosome, variant ID, genetic distance, base-pair position, allele 1
    and allele 2.  The previous implementation let pandas consume the first
    variant as a header and selected fields 1, 3 and 4, so the position was
    returned as ``A1`` and allele 1 as ``A2``.

    Parameters
    ----------
    bim_file : str
        Path prefix; ``'.bim'`` is appended.
    chrom : int
        Unused; kept for interface compatibility.

    Returns
    -------
    pandas.DataFrame
        Columns ``SNP``, ``A1``, ``A2`` (all read as strings).
    """
    vld_dict = pd.read_csv(
        bim_file + '.bim',
        sep="\t",
        header=None,        # .bim files carry no column names
        usecols=[1, 4, 5],  # variant ID, allele 1, allele 2
        dtype=str,          # keep numeric-looking IDs as text
    )
    vld_dict.columns = ["SNP", "A1", "A2"]
    return vld_dict
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def parse_sumstats(ref_dict, vld_dict, sst_file, n_subj, log):
    """Align summary statistics to the reference panel and standardise effects.

    Steps performed:

    1. drop rows with a missing value in any column this function reads;
    2. inner-join with ``ref_dict`` on ``SNP``, ``CHR`` and ``BP``;
    3. keep variants whose (EA, NEA) equals the reference (A1, A2) either
       directly or swapped;
    4. replace ``BETA`` by ``BETA / SE / sqrt(n_subj)``; for swapped variants
       negate it and use ``1 - MAF``;
    5. record the orientation in ``FLP`` (1 direct, -1 swapped).

    Parameters
    ----------
    ref_dict : pandas.DataFrame
        Reference table with ``SNP``, ``CHR``, ``BP``, ``A1``, ``A2``, ``MAF``.
    vld_dict : object
        Unused; kept for interface compatibility.
    sst_file : pandas.DataFrame
        Summary statistics with ``SNP``, ``CHR``, ``BP``, ``EA``, ``NEA``,
        ``BETA`` and ``SE``.  The caller's frame is not modified.
    n_subj : number
        Sample size.
    log : object
        Logger exposing ``write``.

    Returns
    -------
    dict of lists
        Keys ``CHR``, ``SNP``, ``BP``, ``A1``, ``A2``, ``MAF``, ``BETA``, ``FLP``.
    """
    n_sqrt = np.sqrt(n_subj)

    # DataFrame.dropna returns a new frame, so the result must be kept (the
    # previous bare call was a no-op).  Only the columns used below are
    # checked, so unrelated columns containing NaN do not discard variants.
    required = ["SNP", "CHR", "BP", "EA", "NEA", "BETA", "SE"]
    sst_file = sst_file.dropna(subset=required)

    sst_file["CHR"] = sst_file["CHR"].astype("int64")
    sst_file["BP"] = sst_file["BP"].astype("int64")
    sst_file["EA"] = sst_file["EA"].astype("string")
    sst_file["NEA"] = sst_file["NEA"].astype("string")

    sst_file = pd.merge(sst_file, ref_dict, on=["SNP","CHR","BP"],how="inner")

    is_flipped = ((sst_file["NEA"] == sst_file["A1"]) &(sst_file["EA"] == sst_file["A2"]))
    is_valid = ((sst_file["EA"] == sst_file["A1"]) & (sst_file["NEA"] == sst_file["A2"]))| is_flipped

    # copy so the assignments below act on an independent frame, and keep the
    # flip mask aligned with the surviving rows
    sst_file = sst_file.loc[is_valid,:].copy()
    is_flipped = is_flipped.loc[sst_file.index]

    sst_file.loc[is_flipped, "MAF"] = 1 - sst_file.loc[is_flipped, "MAF"]
    sst_file["BETA"] = sst_file["BETA"] / sst_file["SE"] / n_sqrt
    sst_file.loc[is_flipped, "BETA"] = -1 * sst_file.loc[is_flipped, "BETA"]

    sst_file["FLP"] = 1
    sst_file.loc[is_flipped, "FLP"] = -1
    log.write(" -Number of common SNPs:{}".format(len(sst_file)))
    sst_dict= sst_file[['CHR', 'SNP', 'BP', 'A1', 'A2', 'MAF', 'BETA', 'FLP']].to_dict("list")

    return sst_dict
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def parse_ldblk(ldblk_dir, sst_dict, chrom, log):
    """Load per-block reference LD for one chromosome and subset it to the sumstats.

    The HDF5 file ``ldblk_<panel>_chr<chrom>.hdf5`` inside ``ldblk_dir`` is
    read, where ``<panel>`` is ``1kg`` or ``ukbb`` depending on which string
    occurs in the directory name.  Each block is restricted to the SNPs present
    in ``sst_dict['SNP']``, multiplied by the outer product of the matching
    ``sst_dict['FLP']`` signs, and averaged with a matrix rebuilt from its SVD.

    NOTE(review): the flip signs are taken positionally (running offset into
    ``sst_dict``), which presumes ``sst_dict`` is ordered like the LD blocks --
    confirm against the caller.

    Parameters
    ----------
    ldblk_dir : str
        Directory holding the LD block files.
    sst_dict : dict of lists
        Needs the keys ``SNP`` and ``FLP``.
    chrom : int
        Chromosome to load.
    log : object
        Logger exposing ``write``.

    Returns
    -------
    (list of numpy.ndarray, list of int)
        LD matrix per block (an empty array for blocks without shared SNPs)
        and the number of shared SNPs per block.

    Raises
    ------
    ValueError
        If the directory name contains neither ``1kg`` nor ``ukbb``
        (previously an ``UnboundLocalError``).
    """
    log.write('... parse reference LD on chromosome %d ...' % chrom)

    # normpath drops a trailing separator, which would otherwise make
    # basename() return an empty string
    panel_name = os.path.basename(os.path.normpath(ldblk_dir))
    if '1kg' in panel_name:
        chr_name = ldblk_dir + '/ldblk_1kg_chr' + str(chrom) + '.hdf5'
    elif 'ukbb' in panel_name:
        chr_name = ldblk_dir + '/ldblk_ukbb_chr' + str(chrom) + '.hdf5'
    else:
        raise ValueError(
            "Cannot infer the LD reference panel from '{}': the directory name "
            "must contain '1kg' or 'ukbb'.".format(ldblk_dir)
        )

    # read everything needed while the file is open; the context manager
    # closes the handle even if reading fails
    with h5py.File(chr_name, 'r') as hdf_chr:
        n_blk = len(hdf_chr)
        ld_blk = [np.array(hdf_chr['blk_'+str(blk)]['ldblk']) for blk in range(1,n_blk+1)]

        snp_blk = []
        for blk in range(1,n_blk+1):
            snp_blk.append([bb.decode("UTF-8") for bb in list(hdf_chr['blk_'+str(blk)]['snplist'])])

    # set membership keeps the per-SNP lookup constant-time
    sst_snps = set(sst_dict['SNP'])

    blk_size = []
    mm = 0
    for blk in range(n_blk):
        idx = [ii for (ii, snp) in enumerate(snp_blk[blk]) if snp in sst_snps]
        blk_size.append(len(idx))
        if idx:
            idx_blk = range(mm,mm+len(idx))
            flip = [sst_dict['FLP'][jj] for jj in idx_blk]
            ld_blk[blk] = ld_blk[blk][np.ix_(idx,idx)]*np.outer(flip,flip)

            # average the block with V^T diag(s) V built from its SVD
            _, s, v = linalg.svd(ld_blk[blk])
            h = np.dot(v.T, np.dot(np.diag(s), v))
            ld_blk[blk] = (ld_blk[blk]+h)/2

            mm += len(idx)
        else:
            ld_blk[blk] = np.array([])

    return ld_blk, blk_size
|
|
97
|
+
|
|
98
|
+
|
gwaslab/qc_build.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import gc
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
from itertools import repeat
|
|
6
|
+
from multiprocessing import Pool
|
|
7
|
+
from liftover import get_lifter
|
|
8
|
+
from liftover import ChainFile
|
|
9
|
+
from functools import partial
|
|
10
|
+
from gwaslab.g_vchange_status import vchange_status
|
|
11
|
+
from gwaslab.g_Log import Log
|
|
12
|
+
|
|
13
|
+
def _process_build(build, log, verbose):
|
|
14
|
+
if str(build).lower() in ["hg19","19","37","b37","grch37"]:
|
|
15
|
+
log.write(" -Genomic coordinates are based on GRCh37/hg19...", verbose=verbose)
|
|
16
|
+
final_build = "19"
|
|
17
|
+
elif str(build).lower() in ["hg18","18","36","b36","grch36"]:
|
|
18
|
+
log.write(" -Genomic coordinates are based on GRCh36/hg18...", verbose=verbose)
|
|
19
|
+
final_build = "18"
|
|
20
|
+
elif str(build).lower() in ["hg38","38","b38","grch38"]:
|
|
21
|
+
log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
|
|
22
|
+
final_build = "38"
|
|
23
|
+
elif str(build).lower() in ["t2t","hs1","chm13","13"]:
|
|
24
|
+
log.write(" -Genomic coordinates are based on T2T-CHM13...", verbose=verbose)
|
|
25
|
+
final_build = "13"
|
|
26
|
+
else:
|
|
27
|
+
log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
|
|
28
|
+
final_build = "99"
|
|
29
|
+
return final_build
|
|
30
|
+
|
|
31
|
+
def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
    """Write the genome build into the status column of ``sumstats``.

    ``build`` is normalised with ``_process_build`` to a two-character code.
    ``vchange_status`` is then applied twice to ``sumstats[status]``: for
    position 1 with the pattern ``"139"`` and the first build character
    repeated three times, and for position 2 with the pattern ``"89"`` and the
    second build character repeated three times.
    # NOTE(review): presumably this overwrites status digits 1 and 2 with the
    # build code -- confirm against vchange_status.

    Returns
    -------
    (DataFrame, str)
        The updated ``sumstats`` and the normalised build code.
    """
    build = _process_build(build,log=log,verbose=verbose)

    first_replacement = build[0]*3
    second_replacement = build[1]*3

    sumstats[status] = vchange_status(sumstats[status], 1, "139", first_replacement)
    sumstats[status] = vchange_status(sumstats[status], 2, "89", second_replacement)
    return sumstats, build
|
|
36
|
+
|
|
37
|
+
def _check_build(target_build, build="99", status="STATUS",verbose=True,log=Log()):
    """Verify that the sumstats build equals the required target build.

    Both values are normalised with ``_process_build`` before comparison.

    Parameters
    ----------
    target_build : str or int
        Build that is required.
    build : str or int
        Build of the sumstats.
    status : str
        Unused; kept for interface compatibility.
    verbose : bool
        Passed to every log call.
    log : Log
        Logger.

    Returns
    -------
    bool
        ``True`` when the builds match.

    Raises
    ------
    ValueError
        If either build is unknown (code ``"99"``) or the two builds differ.
    """
    target_build = _process_build(target_build,log=log,verbose=verbose)
    build = _process_build(build,log=log,verbose=verbose)

    if build == "99":
        raise ValueError("Sumstats build is unknown. Please run infer_build() or set_build()")

    if target_build == "99":
        raise ValueError("Target build is unknown.")

    if build != target_build:
        raise ValueError("Please make sure sumstats build is {}".format(target_build))

    # honour ``verbose`` like the other log calls (it was ignored here before)
    log.write(" -Sumstats build matches target build", verbose=verbose)
    return True
|
|
52
|
+
|
|
53
|
+
|
gwaslab/qc_check_datatype.py
CHANGED
|
@@ -7,10 +7,10 @@ from gwaslab.g_Log import Log
|
|
|
7
7
|
|
|
8
8
|
dtype_dict ={
|
|
9
9
|
"SNPID":["string","object"],
|
|
10
|
-
"rsID":["string","object"],
|
|
11
|
-
"CHR":["Int64","int64","int32","Int32","int"],
|
|
12
|
-
"POS":["int64","Int64"],
|
|
13
|
-
"EA":["category"],
|
|
10
|
+
"rsID": ["string","object"],
|
|
11
|
+
"CHR": ["Int64","int64","int32","Int32","int"],
|
|
12
|
+
"POS": ["int64","Int64"],
|
|
13
|
+
"EA": ["category"],
|
|
14
14
|
"NEA":["category"],
|
|
15
15
|
"REF":["category"],
|
|
16
16
|
"ALT":["category"],
|
|
@@ -35,7 +35,7 @@ dtype_dict ={
|
|
|
35
35
|
"TEST":["string","object","category"],
|
|
36
36
|
"CHISQ":["float64"],
|
|
37
37
|
"I2":["float64"],
|
|
38
|
-
"
|
|
38
|
+
"P_HET":["float64"],
|
|
39
39
|
"SNPR2":["float64"],
|
|
40
40
|
"EAF":["float64","float","float32"],
|
|
41
41
|
"NEAF":["float64","float","float32"],
|
|
@@ -48,7 +48,11 @@ dtype_dict ={
|
|
|
48
48
|
'CREDIBLE_SET_INDEX':["Int64","int64","int32","Int32","int"],
|
|
49
49
|
'N_SNP' :["Int64","int64","int32","Int32","int"],
|
|
50
50
|
'LOCUS' :["string","object","category"],
|
|
51
|
-
'STUDY' :["string","object","category"]
|
|
51
|
+
'STUDY' :["string","object","category"],
|
|
52
|
+
'BETA_RANDOM' :["float64"],
|
|
53
|
+
'SE_RANDOM' :["float64"],
|
|
54
|
+
'Z_RANDOM' :["float64"],
|
|
55
|
+
'P_RANDOM' :["float64"]
|
|
52
56
|
}
|
|
53
57
|
|
|
54
58
|
def check_datatype(sumstats, verbose=True, log=Log()):
|
|
@@ -108,8 +112,6 @@ def quick_convert_datatype(sumstats, log, verbose):
|
|
|
108
112
|
pass
|
|
109
113
|
return sumstats
|
|
110
114
|
|
|
111
|
-
|
|
112
|
-
|
|
113
115
|
def check_dataframe_shape(sumstats, log, verbose):
|
|
114
116
|
memory_in_mb = sumstats.memory_usage().sum()/1024/1024
|
|
115
117
|
try:
|