gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +22 -5
- gwaslab/g_Sumstats.py +110 -163
- gwaslab/g_SumstatsPair.py +76 -25
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +36 -17
- gwaslab/hm_harmonize_sumstats.py +354 -221
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +49 -43
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/io_to_formats.py +428 -295
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +891 -778
- gwaslab/util_ex_calculate_ldmatrix.py +31 -13
- gwaslab/util_ex_gwascatalog.py +25 -25
- gwaslab/util_ex_ldproxyfinder.py +10 -10
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +30 -30
- gwaslab/util_in_filter_value.py +201 -74
- gwaslab/util_in_get_density.py +10 -10
- gwaslab/util_in_get_sig.py +445 -71
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +42 -37
- gwaslab/viz_aux_reposition_text.py +10 -7
- gwaslab/viz_aux_save_figure.py +18 -8
- gwaslab/viz_plot_compare_af.py +32 -33
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +34 -26
- gwaslab/viz_plot_mqqplot.py +126 -75
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +36 -33
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +65 -61
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.37.dist-info/RECORD +0 -72
- /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
|
@@ -21,24 +21,23 @@ def get_power(
|
|
|
21
21
|
log=Log(),
|
|
22
22
|
verbose=True
|
|
23
23
|
):
|
|
24
|
-
|
|
24
|
+
log.write(" Start to calculate statistical power...", verbose=verbose)
|
|
25
25
|
if mode=="b":
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
log.write(" -Significance level: {:.3e}".format(sig_level))
|
|
26
|
+
log.write(" -Input settings (b mode):", verbose=verbose)
|
|
27
|
+
log.write(" -Number of cases:{}".format(ncase), verbose=verbose)
|
|
28
|
+
log.write(" -Number of controls:{}".format(ncontrol), verbose=verbose)
|
|
29
|
+
if genotype_rr is not None:
|
|
30
|
+
log.write(" -Risk allele RR:{:.3f}".format(genotype_rr), verbose=verbose)
|
|
31
|
+
elif genotype_or is not None:
|
|
32
|
+
log.write(" -Risk allele OR:{:.3f}".format(genotype_or), verbose=verbose)
|
|
33
|
+
elif beta is not None:
|
|
34
|
+
log.write(" -Risk allele beta:{:.3f}".format(beta), verbose=verbose)
|
|
35
|
+
else:
|
|
36
|
+
genotype_rr = 0.1
|
|
37
|
+
log.write(" -Risk allele RR:{:.3f}".format(genotype_rr), verbose=verbose)
|
|
38
|
+
log.write(" -Disease prevalence:{:.3f}".format(prevalence), verbose=verbose)
|
|
39
|
+
log.write(" -Risk allele frequency: {:.3f}".format(daf), verbose=verbose)
|
|
40
|
+
log.write(" -Significance level: {:.3e}".format(sig_level), verbose=verbose)
|
|
42
41
|
# Skol, A. D., Scott, L. J., Abecasis, G. R., & Boehnke, M. (2006). Joint analysis is more efficient than replication-based analysis for two-stage genome-wide association studies. Nature genetics, 38(2), 209-213.
|
|
43
42
|
aaf = daf**2
|
|
44
43
|
abf = 2 * (daf) * (1 - daf)
|
|
@@ -56,11 +55,11 @@ def get_power(
|
|
|
56
55
|
# https://jamanetwork.com/journals/jama/fullarticle/188182
|
|
57
56
|
|
|
58
57
|
if or_to_rr ==False:
|
|
59
|
-
|
|
60
|
-
|
|
58
|
+
log.write(" -Alogorithm: Skol, Andrew D., et al. Nature genetics 38.2 (2006): 209-213....", verbose=verbose)
|
|
59
|
+
log.write(" -GRR is approximated using OR. For prevalence < 10%, GRR is very similar to OR....", verbose=verbose)
|
|
61
60
|
else:
|
|
62
|
-
|
|
63
|
-
|
|
61
|
+
log.write(" -OR is converted to GRR using base prevalence: {}".format(prevalence), verbose=verbose)
|
|
62
|
+
log.write(" -Alogorithm: Zhang, J., & Kai, F. Y. (1998). What's the relative risk?: A method of correcting the odds ratio in cohort studies of common outcomes. Jama, 280(19), 1690-1691.....", verbose=verbose)
|
|
64
63
|
|
|
65
64
|
# additive
|
|
66
65
|
x = [ 2*genotype_rr-1, genotype_rr, 1 ]
|
|
@@ -68,19 +67,19 @@ def get_power(
|
|
|
68
67
|
aap= x[0] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
|
|
69
68
|
abp= x[1] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
|
|
70
69
|
bbp= x[2] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
70
|
+
log.write("Probability of disease :", verbose=verbose)
|
|
71
|
+
log.write(" - Individuals with AA genotype: {:.3f}".format(aap), verbose=verbose)
|
|
72
|
+
log.write(" - Individuals with AB genotype: {:.3f}".format(abp), verbose=verbose)
|
|
73
|
+
log.write(" - Individuals with BB genotype: {:.3f}".format(bbp), verbose=verbose)
|
|
75
74
|
|
|
76
75
|
pcase= (aap * aaf + abp * abf*0.5) / prevalence
|
|
77
76
|
pcontrol=((1-aap )* aaf + (1-abp )* abf*0.5) / (1 - prevalence)
|
|
78
77
|
|
|
79
78
|
vcase = pcase *(1-pcase)
|
|
80
79
|
vcontrol =pcontrol *(1-pcontrol)
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
80
|
+
log.write("Expected risk allele frequency:", verbose=verbose)
|
|
81
|
+
log.write(" - In cases: {:.3f}".format(pcase), verbose=verbose)
|
|
82
|
+
log.write(" - In controls: {:.3f}".format(pcontrol), verbose=verbose)
|
|
84
83
|
|
|
85
84
|
num= (pcase - pcontrol)
|
|
86
85
|
den= np.sqrt( (vcase/ncase + vcontrol/ncontrol)*0.5 )
|
|
@@ -88,22 +87,22 @@ def get_power(
|
|
|
88
87
|
|
|
89
88
|
c = ss.norm.isf(sig_level/2)
|
|
90
89
|
power = 1 - ss.norm.cdf(c-u) + ss.norm.cdf(-c-u)
|
|
91
|
-
|
|
90
|
+
log.write("Expected power: {:.3f}".format(power), verbose=verbose)
|
|
92
91
|
|
|
93
92
|
elif mode=="q":
|
|
94
93
|
if beta is None:
|
|
95
94
|
beta = 0.1
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
95
|
+
|
|
96
|
+
log.write(" -Input settings (q mode):", verbose=verbose)
|
|
97
|
+
log.write(" -Significance level: {}".format(sig_level), verbose=verbose)
|
|
98
|
+
log.write(" -EAF: {}".format(eaf), verbose=verbose)
|
|
99
|
+
log.write(" -BETA: {}".format(beta), verbose=verbose)
|
|
100
|
+
log.write(" -N: {}".format(n), verbose=verbose)
|
|
101
|
+
log.write(" -SNPR2: {}".format(2*eaf*(1-eaf)*(beta**2)), verbose=verbose)
|
|
103
102
|
c = ss.chi2.isf(sig_level,df=1)
|
|
104
103
|
NCP = n * 2*eaf*(1-eaf)*(beta**2)/vary
|
|
105
104
|
power = 1 - ss.ncx2.cdf(c, df=1, nc=NCP)
|
|
106
|
-
|
|
105
|
+
log.write("Finished calculating statistical power.", verbose=verbose)
|
|
107
106
|
return power
|
|
108
107
|
|
|
109
108
|
def get_beta(
|
|
@@ -137,11 +136,11 @@ def get_beta(
|
|
|
137
136
|
eafs = np.linspace(eaf_range[1],eaf_range[0],n_matrix)
|
|
138
137
|
betas = np.linspace(beta_range[0],beta_range[1],n_matrix)
|
|
139
138
|
|
|
140
|
-
|
|
139
|
+
log.write(" -Updating eaf-beta matrix...", verbose=verbose)
|
|
141
140
|
for i in range(n_matrix):
|
|
142
141
|
eaf_beta_matrix[i,] = calculate_power_single(beta=betas,eaf=eafs[i],n=n,sig_level=sig_level,vary=vary)
|
|
143
142
|
|
|
144
|
-
|
|
143
|
+
log.write(" -Extracting eaf-beta combinations with power = {}...".format(t), verbose=verbose)
|
|
145
144
|
i,j=1,1
|
|
146
145
|
eaf_beta = []
|
|
147
146
|
while i<n_matrix-1 and j<n_matrix-1:
|
|
@@ -207,11 +206,11 @@ def get_beta_binary(
|
|
|
207
206
|
eafs = np.linspace(eaf_range[1],eaf_range[0],n_matrix)
|
|
208
207
|
betas = np.linspace(beta_range[0],beta_range[1],n_matrix)
|
|
209
208
|
|
|
210
|
-
|
|
209
|
+
log.write(" -Updating eaf-beta matrix...", verbose=verbose)
|
|
211
210
|
if or_to_rr ==False:
|
|
212
|
-
|
|
211
|
+
log.write(" -GRR is approximated using OR. For prevalence < 10%, GRR is very similar to OR....", verbose=verbose)
|
|
213
212
|
else:
|
|
214
|
-
|
|
213
|
+
log.write(" -OR is converted to GRR using base prevalence: {}".format(prevalence), verbose=verbose)
|
|
215
214
|
|
|
216
215
|
for i in range(n_matrix):
|
|
217
216
|
eaf_beta_matrix[i,] = calculate_power_single(beta=betas,
|
|
@@ -222,7 +221,7 @@ def get_beta_binary(
|
|
|
222
221
|
sig_level=sig_level,
|
|
223
222
|
or_to_rr=or_to_rr)
|
|
224
223
|
|
|
225
|
-
|
|
224
|
+
log.write(" -Extracting eaf-beta combinations with power = {}...".format(t), verbose=verbose)
|
|
226
225
|
i,j=1,1
|
|
227
226
|
eaf_beta = []
|
|
228
227
|
while i<n_matrix-1 and j<n_matrix-1:
|
gwaslab/util_in_convert_h2.py
CHANGED
|
@@ -65,7 +65,7 @@ def _get_per_snp_r2(sumstats,
|
|
|
65
65
|
adjuested=False,
|
|
66
66
|
verbose=True):
|
|
67
67
|
# Pierce, B. L., Ahsan, H., & VanderWeele, T. J. (2011). Power and instrument strength requirements for Mendelian randomization studies using multiple genetic variants. International journal of epidemiology, 40(3), 740-752.
|
|
68
|
-
|
|
68
|
+
log.write("Start to calculate per-SNP heritibility...", verbose=verbose)
|
|
69
69
|
if type(k) is int or type(k) is float:
|
|
70
70
|
pass
|
|
71
71
|
elif k =="all":
|
|
@@ -81,18 +81,18 @@ def _get_per_snp_r2(sumstats,
|
|
|
81
81
|
# Var(e) = betase**2 * 2 * N * MAF * (1-MAF)
|
|
82
82
|
# r2 = Var(beta * X) / Var(y)
|
|
83
83
|
|
|
84
|
-
|
|
84
|
+
log.write(" -Calculating per-SNP rsq by 2 * (BETA**2) * AF * (1-AF) / Var(y)...", verbose=verbose)
|
|
85
85
|
sumstats["_VAR(BETAX)"] = 2*(sumstats[beta]**2)*sumstats[af]*(1-sumstats[af])
|
|
86
86
|
|
|
87
87
|
if type(vary) is int or type(vary) is float:
|
|
88
|
-
|
|
88
|
+
log.write(" -Var(y) is provided: {}...".format(vary), verbose=verbose)
|
|
89
89
|
sumstats["SNPR2"] = sumstats["_VAR(BETAX)"] / vary
|
|
90
90
|
elif vary=="se":
|
|
91
|
-
|
|
91
|
+
log.write(" -Var(y) is estimated from VAR(BETA * X), N, MAF, SE: {}...".format(vary), verbose=verbose)
|
|
92
92
|
sumstats["_SIGMA2"] = sumstats[se]**2 * 2*(sumstats[n])*sumstats[af]*(1-sumstats[af])
|
|
93
93
|
sumstats["SNPR2"] = sumstats["_VAR(BETAX)"] / (sumstats["_SIGMA2"] + sumstats["_VAR(BETAX)"])
|
|
94
94
|
else:
|
|
95
|
-
|
|
95
|
+
log.warning("Not enough information for calculation.")
|
|
96
96
|
|
|
97
97
|
if mode=="b":
|
|
98
98
|
if ncase not in sumstats.columns:
|
|
@@ -117,11 +117,11 @@ def _get_per_snp_r2(sumstats,
|
|
|
117
117
|
else:
|
|
118
118
|
snpr2 = "SNPR2"
|
|
119
119
|
if n in sumstats.columns:
|
|
120
|
-
|
|
121
|
-
|
|
120
|
+
log.write(" -Calculating F-statistic: F = [(N-k-1)/k] * (r2/1-r2)... where k = {}".format(k), verbose=verbose)
|
|
121
|
+
log.write(" -For r2, {} is used.".format(snpr2), verbose=verbose)
|
|
122
122
|
sumstats["F"] = sumstats[snpr2]*(sumstats[n]-1 -k)/((1-sumstats[snpr2]) * k)
|
|
123
123
|
|
|
124
|
-
|
|
124
|
+
log.write("Finished calculating per-SNP heritability!", verbose=verbose)
|
|
125
125
|
return sumstats
|
|
126
126
|
#
|
|
127
127
|
def get_population_allele_frequency(af, prop, odds_ratio, prevalence,eps=1e-15):
|
gwaslab/util_in_fill_data.py
CHANGED
|
@@ -9,7 +9,7 @@ from gwaslab.g_version import _get_version
|
|
|
9
9
|
from gwaslab.qc_check_datatype import check_datatype
|
|
10
10
|
|
|
11
11
|
def filldata(
|
|
12
|
-
|
|
12
|
+
insumstats,
|
|
13
13
|
to_fill=None,
|
|
14
14
|
df=None,
|
|
15
15
|
overwrite=False,
|
|
@@ -23,32 +23,32 @@ def filldata(
|
|
|
23
23
|
# if a string is passed to to_fill, convert it to list
|
|
24
24
|
if type(to_fill) is str:
|
|
25
25
|
to_fill = [to_fill]
|
|
26
|
-
|
|
27
|
-
|
|
26
|
+
sumstats = insumstats.copy()
|
|
27
|
+
log.write("Start filling data using existing columns...{}".format(_get_version()), verbose=verbose)
|
|
28
28
|
|
|
29
29
|
check_datatype(sumstats,verbose=verbose,log=log)
|
|
30
30
|
|
|
31
31
|
# check duplication ##############################################################################################
|
|
32
32
|
skip_cols=[]
|
|
33
|
-
|
|
33
|
+
log.write(" -Overwrite mode: ",overwrite, verbose=verbose)
|
|
34
34
|
if overwrite is False:
|
|
35
35
|
for i in to_fill:
|
|
36
36
|
if i in sumstats.columns:
|
|
37
37
|
skip_cols.append(i)
|
|
38
38
|
for i in skip_cols:
|
|
39
39
|
to_fill.remove(i)
|
|
40
|
-
|
|
40
|
+
log.write(" -Skipping columns: ",skip_cols, verbose=verbose)
|
|
41
41
|
if len(set(to_fill) & set(["OR","OR_95L","OR_95U","BETA","SE","P","Z","CHISQ","MLOG10P","MAF"]))==0:
|
|
42
42
|
log.write(" -No available columns to fill. Skipping.", verbose=verbose)
|
|
43
43
|
log.write("Finished filling data using existing columns.", verbose=verbose)
|
|
44
44
|
return sumstats
|
|
45
|
-
|
|
45
|
+
log.write(" -Filling columns: ",to_fill, verbose=verbose)
|
|
46
46
|
fill_iteratively(sumstats,to_fill,log,only_sig,df,extreme,verbose,sig_level)
|
|
47
47
|
|
|
48
48
|
# ###################################################################################
|
|
49
49
|
#sumstats = sortcolumn(sumstats, verbose=verbose, log=log)
|
|
50
50
|
gc.collect()
|
|
51
|
-
|
|
51
|
+
log.write("Finished filling data using existing columns.", verbose=verbose)
|
|
52
52
|
return sumstats
|
|
53
53
|
|
|
54
54
|
##########################################################################################################################
|
|
@@ -56,20 +56,20 @@ def filldata(
|
|
|
56
56
|
def fill_p(sumstats,log,df=None,only_sig=False,sig_level=5e-8,overwrite=False,verbose=True,filled_count=0):
|
|
57
57
|
# MLOG10P -> P
|
|
58
58
|
if "MLOG10P" in sumstats.columns:
|
|
59
|
-
|
|
59
|
+
log.write(" - Filling P value using MLOG10P column...", verbose=verbose)
|
|
60
60
|
sumstats["P"] = np.power(10,-sumstats["MLOG10P"])
|
|
61
61
|
filled_count +=1
|
|
62
62
|
|
|
63
63
|
# Z -> P
|
|
64
64
|
elif "Z" in sumstats.columns:
|
|
65
|
-
|
|
65
|
+
log.write(" - Filling P value using Z column...", verbose=verbose)
|
|
66
66
|
stats.chisqprob = lambda chisq, degree_of_freedom: stats.chi2.sf(chisq, degree_of_freedom)
|
|
67
67
|
sumstats["P"] = ss.chisqprob(sumstats["Z"]**2,1)
|
|
68
68
|
filled_count +=1
|
|
69
69
|
|
|
70
70
|
elif "CHISQ" in sumstats.columns:
|
|
71
71
|
#CHISQ -> P
|
|
72
|
-
|
|
72
|
+
log.write(" - Filling P value using CHISQ column...", verbose=verbose)
|
|
73
73
|
stats.chisqprob = lambda chisq, degree_of_freedom: stats.chi2.sf(chisq, degree_of_freedom)
|
|
74
74
|
if df is None:
|
|
75
75
|
if only_sig is True and overwrite is True:
|
|
@@ -80,11 +80,11 @@ def fill_p(sumstats,log,df=None,only_sig=False,sig_level=5e-8,overwrite=False,ve
|
|
|
80
80
|
filled_count +=1
|
|
81
81
|
else:
|
|
82
82
|
if only_sig is True and overwrite is True:
|
|
83
|
-
|
|
83
|
+
log.write(" - Filling P value using CHISQ column for variants:" , sum(sumstats["P"]<sig_level), verbose=verbose)
|
|
84
84
|
sumstats.loc[sumstats["P"]<sig_level,"P"] = stats.chisqprob(sumstats.loc[sumstats["P"]<sig_level,"CHISQ"],sumstats.loc[sumstats["P"]<sig_level,df].astype("int"))
|
|
85
85
|
filled_count +=1
|
|
86
86
|
else:
|
|
87
|
-
|
|
87
|
+
log.write(" - Filling P value using CHISQ column for all valid variants:", verbose=verbose)
|
|
88
88
|
sumstats["P"] = stats.chisqprob(sumstats["CHISQ"],sumstats[df].astype("int"))
|
|
89
89
|
filled_count +=1
|
|
90
90
|
else:
|
|
@@ -94,7 +94,7 @@ def fill_p(sumstats,log,df=None,only_sig=False,sig_level=5e-8,overwrite=False,ve
|
|
|
94
94
|
def fill_z(sumstats,log,verbose=True,filled_count=0):
|
|
95
95
|
# BETA/SE -> Z
|
|
96
96
|
if ("BETA" in sumstats.columns) and ("SE" in sumstats.columns):
|
|
97
|
-
|
|
97
|
+
log.write(" - Filling Z using BETA/SE column...", verbose=verbose)
|
|
98
98
|
sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
|
|
99
99
|
filled_count +=1
|
|
100
100
|
else:
|
|
@@ -104,12 +104,12 @@ def fill_z(sumstats,log,verbose=True,filled_count=0):
|
|
|
104
104
|
def fill_chisq(sumstats,log,verbose=True,filled_count=0):
|
|
105
105
|
# Z -> CHISQ
|
|
106
106
|
if "Z" in sumstats.columns:
|
|
107
|
-
|
|
107
|
+
log.write(" - Filling CHISQ using Z column...", verbose=verbose)
|
|
108
108
|
sumstats["CHISQ"] = (sumstats["Z"])**2
|
|
109
109
|
filled_count +=1
|
|
110
110
|
elif "P" in sumstats.columns:
|
|
111
111
|
# P -> CHISQ
|
|
112
|
-
|
|
112
|
+
log.write(" - Filling CHISQ using P column...", verbose=verbose)
|
|
113
113
|
sumstats["CHISQ"] = ss.chi2.isf(sumstats["P"], 1)
|
|
114
114
|
filled_count +=1
|
|
115
115
|
else:
|
|
@@ -119,13 +119,13 @@ def fill_chisq(sumstats,log,verbose=True,filled_count=0):
|
|
|
119
119
|
def fill_or(sumstats,log,verbose=True,filled_count=0):
|
|
120
120
|
# BETA -> OR
|
|
121
121
|
if "BETA" in sumstats.columns:
|
|
122
|
-
|
|
122
|
+
log.write(" - Filling OR using BETA column...", verbose=verbose)
|
|
123
123
|
sumstats["OR"] = np.exp(sumstats["BETA"])
|
|
124
124
|
filled_count +=1
|
|
125
125
|
# BETA/SE -> OR_95L / OR_95U
|
|
126
126
|
# get confidence interval 95
|
|
127
127
|
if ("BETA" in sumstats.columns) and ("SE" in sumstats.columns):
|
|
128
|
-
|
|
128
|
+
log.write(" - Filling OR_95L/OR_95U using BETA/SE columns...", verbose=verbose)
|
|
129
129
|
# beta - 1.96 x se , beta + 1.96 x se
|
|
130
130
|
sumstats["OR_95L"] = np.exp(sumstats["BETA"]-ss.norm.ppf(0.975)*sumstats["SE"])
|
|
131
131
|
sumstats["OR_95U"] = np.exp(sumstats["BETA"]+ss.norm.ppf(0.975)*sumstats["SE"])
|
|
@@ -136,7 +136,7 @@ def fill_or(sumstats,log,verbose=True,filled_count=0):
|
|
|
136
136
|
def fill_or95(sumstats,log,verbose=True,filled_count=0):
|
|
137
137
|
# get confidence interval 95
|
|
138
138
|
if ("BETA" in sumstats.columns) and ("SE" in sumstats.columns):
|
|
139
|
-
|
|
139
|
+
log.write(" - Filling OR_95L/OR_95U using BETA/SE columns...", verbose=verbose)
|
|
140
140
|
# beta - 1.96 x se , beta + 1.96 x se
|
|
141
141
|
sumstats["OR_95L"] = np.exp(sumstats["BETA"]-ss.norm.ppf(0.975)*sumstats["SE"])
|
|
142
142
|
sumstats["OR_95U"] = np.exp(sumstats["BETA"]+ss.norm.ppf(0.975)*sumstats["SE"])
|
|
@@ -148,7 +148,7 @@ def fill_or95(sumstats,log,verbose=True,filled_count=0):
|
|
|
148
148
|
def fill_beta(sumstats,log,verbose=True,filled_count=0):
|
|
149
149
|
# OR -> beta
|
|
150
150
|
if "OR" in sumstats.columns:
|
|
151
|
-
|
|
151
|
+
log.write(" - Filling BETA value using OR column...", verbose=verbose)
|
|
152
152
|
sumstats["BETA"] = np.log(sumstats["OR"])
|
|
153
153
|
filled_count +=1
|
|
154
154
|
else:
|
|
@@ -158,27 +158,27 @@ def fill_beta(sumstats,log,verbose=True,filled_count=0):
|
|
|
158
158
|
def fill_se(sumstats,log,verbose=True,filled_count=0):
|
|
159
159
|
# OR / OR_95L /OR_95U -> SE
|
|
160
160
|
if ("P" in sumstats.columns) and ("BETA" in sumstats.columns):
|
|
161
|
-
|
|
161
|
+
log.write(" - Filling SE value using BETA and P column...", verbose=verbose)
|
|
162
162
|
sumstats["SE"]= np.abs(sumstats["BETA"]/ ss.norm.ppf(1-sumstats["P"]/2))
|
|
163
163
|
filled_count +=1
|
|
164
164
|
elif ("OR" in sumstats.columns) and ("OR_95U" in sumstats.columns):
|
|
165
|
-
|
|
165
|
+
log.write(" - Filling SE value using OR/OR_95U column...", verbose=verbose)
|
|
166
166
|
#
|
|
167
167
|
sumstats["SE"]=(np.log(sumstats["OR_95U"]) - np.log(sumstats["OR"]))/ss.norm.ppf(0.975)
|
|
168
168
|
filled_count +=1
|
|
169
169
|
elif ("OR" in sumstats.columns) and ("OR_95L" in sumstats.columns):
|
|
170
|
-
|
|
170
|
+
log.write(" - Filling SE value using OR/OR_95L column...", verbose=verbose)
|
|
171
171
|
sumstats["SE"]=(np.log(sumstats["OR"]) - np.log(sumstats["OR_95L"]))/ss.norm.ppf(0.975)
|
|
172
172
|
filled_count +=1
|
|
173
173
|
else:
|
|
174
|
-
|
|
174
|
+
log.write(" - Not enough information to fill SE...", verbose=verbose)
|
|
175
175
|
return 0,filled_count
|
|
176
176
|
return 1,filled_count
|
|
177
177
|
|
|
178
178
|
def fill_mlog10p(sumstats,log,verbose=True,filled_count=0):
|
|
179
179
|
if "P" in sumstats.columns:
|
|
180
180
|
# P -> MLOG10P
|
|
181
|
-
|
|
181
|
+
log.write(" - Filling MLOG10P using P column...", verbose=verbose)
|
|
182
182
|
sumstats["MLOG10P"] = -np.log10(sumstats["P"])
|
|
183
183
|
filled_count +=1
|
|
184
184
|
else:
|
|
@@ -188,14 +188,14 @@ def fill_extreme_mlog10p(sumstats,log,verbose=True,filled_count=0):
|
|
|
188
188
|
# ref: https://stackoverflow.com/questions/46416027/how-to-compute-p-values-from-z-scores-in-r-when-the-z-score-is-large-pvalue-muc/46416222#46416222
|
|
189
189
|
if "Z" in sumstats.columns:
|
|
190
190
|
# Z -> MLOG10P
|
|
191
|
-
|
|
191
|
+
log.write(" - Filling MLOG10P using Z column...", verbose=verbose)
|
|
192
192
|
sumstats = fill_extreme_mlog10(sumstats, "Z")
|
|
193
193
|
filled_count +=1
|
|
194
194
|
elif "BETA" in sumstats.columns and "SE" in sumstats.columns:
|
|
195
|
-
|
|
196
|
-
|
|
195
|
+
log.write(" - Z column not available...", verbose=verbose)
|
|
196
|
+
log.write(" - Filling Z using BETA/SE column...", verbose=verbose)
|
|
197
197
|
sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
|
|
198
|
-
|
|
198
|
+
log.write(" - Filling MLOG10P using Z column...", verbose=verbose)
|
|
199
199
|
sumstats = fill_extreme_mlog10(sumstats, "Z")
|
|
200
200
|
filled_count +=1
|
|
201
201
|
else:
|
|
@@ -205,7 +205,7 @@ def fill_extreme_mlog10p(sumstats,log,verbose=True,filled_count=0):
|
|
|
205
205
|
def fill_maf(sumstats,log,verbose=True,filled_count=0):
|
|
206
206
|
if "EAF" in sumstats.columns:
|
|
207
207
|
# EAF -> MAF
|
|
208
|
-
|
|
208
|
+
log.write(" - Filling MAF using EAF column...", verbose=verbose)
|
|
209
209
|
sumstats["MAF"] = sumstats["EAF"].apply(lambda x: min(x,1-x) if pd.notnull(x) else np.nan)
|
|
210
210
|
filled_count +=1
|
|
211
211
|
else:
|
|
@@ -226,7 +226,7 @@ def fill_extreme_mlog10(sumstats, z):
|
|
|
226
226
|
####################################################################################################################
|
|
227
227
|
def fill_iteratively(sumstats,raw_to_fill,log,only_sig,df,extreme,verbose,sig_level):
|
|
228
228
|
to_fill = raw_to_fill.copy()
|
|
229
|
-
|
|
229
|
+
log.write(" - Filling Columns iteratively...", verbose=verbose)
|
|
230
230
|
|
|
231
231
|
filled_count=0
|
|
232
232
|
for i in range(len(to_fill)+1):
|