gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/g_Log.py +14 -5
- gwaslab/g_Sumstats.py +86 -18
- gwaslab/g_SumstatsPair.py +70 -23
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +9 -4
- gwaslab/hm_harmonize_sumstats.py +88 -83
- gwaslab/io_preformat_input.py +14 -14
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +1 -1
- gwaslab/qc_fix_sumstats.py +163 -161
- gwaslab/util_ex_calculate_ldmatrix.py +2 -2
- gwaslab/util_ex_gwascatalog.py +24 -24
- gwaslab/util_ex_ldproxyfinder.py +9 -9
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +28 -28
- gwaslab/util_in_filter_value.py +91 -52
- gwaslab/util_in_get_density.py +8 -8
- gwaslab/util_in_get_sig.py +407 -65
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +18 -18
- gwaslab/viz_aux_reposition_text.py +3 -3
- gwaslab/viz_aux_save_figure.py +14 -5
- gwaslab/viz_plot_compare_af.py +29 -30
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +6 -6
- gwaslab/viz_plot_mqqplot.py +17 -3
- gwaslab/viz_plot_qqplot.py +1 -1
- gwaslab/viz_plot_regionalplot.py +33 -32
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +50 -55
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.38.dist-info/RECORD +0 -72
- /gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/util_in_fill_data.py
CHANGED
|
@@ -24,31 +24,31 @@ def filldata(
|
|
|
24
24
|
if type(to_fill) is str:
|
|
25
25
|
to_fill = [to_fill]
|
|
26
26
|
sumstats = insumstats.copy()
|
|
27
|
-
|
|
27
|
+
log.write("Start filling data using existing columns...{}".format(_get_version()), verbose=verbose)
|
|
28
28
|
|
|
29
29
|
check_datatype(sumstats,verbose=verbose,log=log)
|
|
30
30
|
|
|
31
31
|
# check dupication ##############################################################################################
|
|
32
32
|
skip_cols=[]
|
|
33
|
-
|
|
33
|
+
log.write(" -Overwrite mode: ",overwrite, verbose=verbose)
|
|
34
34
|
if overwrite is False:
|
|
35
35
|
for i in to_fill:
|
|
36
36
|
if i in sumstats.columns:
|
|
37
37
|
skip_cols.append(i)
|
|
38
38
|
for i in skip_cols:
|
|
39
39
|
to_fill.remove(i)
|
|
40
|
-
|
|
40
|
+
log.write(" -Skipping columns: ",skip_cols, verbose=verbose)
|
|
41
41
|
if len(set(to_fill) & set(["OR","OR_95L","OR_95U","BETA","SE","P","Z","CHISQ","MLOG10P","MAF"]))==0:
|
|
42
42
|
log.write(" -No available columns to fill. Skipping.", verbose=verbose)
|
|
43
43
|
log.write("Finished filling data using existing columns.", verbose=verbose)
|
|
44
44
|
return sumstats
|
|
45
|
-
|
|
45
|
+
log.write(" -Filling columns: ",to_fill, verbose=verbose)
|
|
46
46
|
fill_iteratively(sumstats,to_fill,log,only_sig,df,extreme,verbose,sig_level)
|
|
47
47
|
|
|
48
48
|
# ###################################################################################
|
|
49
49
|
#sumstats = sortcolumn(sumstats, verbose=verbose, log=log)
|
|
50
50
|
gc.collect()
|
|
51
|
-
|
|
51
|
+
log.write("Finished filling data using existing columns.", verbose=verbose)
|
|
52
52
|
return sumstats
|
|
53
53
|
|
|
54
54
|
##########################################################################################################################
|
|
@@ -56,20 +56,20 @@ def filldata(
|
|
|
56
56
|
def fill_p(sumstats,log,df=None,only_sig=False,sig_level=5e-8,overwrite=False,verbose=True,filled_count=0):
|
|
57
57
|
# MLOG10P -> P
|
|
58
58
|
if "MLOG10P" in sumstats.columns:
|
|
59
|
-
|
|
59
|
+
log.write(" - Filling P value using MLOG10P column...", verbose=verbose)
|
|
60
60
|
sumstats["P"] = np.power(10,-sumstats["MLOG10P"])
|
|
61
61
|
filled_count +=1
|
|
62
62
|
|
|
63
63
|
# Z -> P
|
|
64
64
|
elif "Z" in sumstats.columns:
|
|
65
|
-
|
|
65
|
+
log.write(" - Filling P value using Z column...", verbose=verbose)
|
|
66
66
|
stats.chisqprob = lambda chisq, degree_of_freedom: stats.chi2.sf(chisq, degree_of_freedom)
|
|
67
67
|
sumstats["P"] = ss.chisqprob(sumstats["Z"]**2,1)
|
|
68
68
|
filled_count +=1
|
|
69
69
|
|
|
70
70
|
elif "CHISQ" in sumstats.columns:
|
|
71
71
|
#CHISQ -> P
|
|
72
|
-
|
|
72
|
+
log.write(" - Filling P value using CHISQ column...", verbose=verbose)
|
|
73
73
|
stats.chisqprob = lambda chisq, degree_of_freedom: stats.chi2.sf(chisq, degree_of_freedom)
|
|
74
74
|
if df is None:
|
|
75
75
|
if only_sig is True and overwrite is True:
|
|
@@ -80,11 +80,11 @@ def fill_p(sumstats,log,df=None,only_sig=False,sig_level=5e-8,overwrite=False,ve
|
|
|
80
80
|
filled_count +=1
|
|
81
81
|
else:
|
|
82
82
|
if only_sig is True and overwrite is True:
|
|
83
|
-
|
|
83
|
+
log.write(" - Filling P value using CHISQ column for variants:" , sum(sumstats["P"]<sig_level), verbose=verbose)
|
|
84
84
|
sumstats.loc[sumstats["P"]<sig_level,"P"] = stats.chisqprob(sumstats.loc[sumstats["P"]<sig_level,"CHISQ"],sumstats.loc[sumstats["P"]<sig_level,df].astype("int"))
|
|
85
85
|
filled_count +=1
|
|
86
86
|
else:
|
|
87
|
-
|
|
87
|
+
log.write(" - Filling P value using CHISQ column for all valid variants:", verbose=verbose)
|
|
88
88
|
sumstats["P"] = stats.chisqprob(sumstats["CHISQ"],sumstats[df].astype("int"))
|
|
89
89
|
filled_count +=1
|
|
90
90
|
else:
|
|
@@ -94,7 +94,7 @@ def fill_p(sumstats,log,df=None,only_sig=False,sig_level=5e-8,overwrite=False,ve
|
|
|
94
94
|
def fill_z(sumstats,log,verbose=True,filled_count=0):
|
|
95
95
|
# BETA/SE -> Z
|
|
96
96
|
if ("BETA" in sumstats.columns) and ("SE" in sumstats.columns):
|
|
97
|
-
|
|
97
|
+
log.write(" - Filling Z using BETA/SE column...", verbose=verbose)
|
|
98
98
|
sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
|
|
99
99
|
filled_count +=1
|
|
100
100
|
else:
|
|
@@ -104,12 +104,12 @@ def fill_z(sumstats,log,verbose=True,filled_count=0):
|
|
|
104
104
|
def fill_chisq(sumstats,log,verbose=True,filled_count=0):
|
|
105
105
|
# Z -> CHISQ
|
|
106
106
|
if "Z" in sumstats.columns:
|
|
107
|
-
|
|
107
|
+
log.write(" - Filling CHISQ using Z column...", verbose=verbose)
|
|
108
108
|
sumstats["CHISQ"] = (sumstats["Z"])**2
|
|
109
109
|
filled_count +=1
|
|
110
110
|
elif "P" in sumstats.columns:
|
|
111
111
|
# P -> CHISQ
|
|
112
|
-
|
|
112
|
+
log.write(" - Filling CHISQ using P column...", verbose=verbose)
|
|
113
113
|
sumstats["CHISQ"] = ss.chi2.isf(sumstats["P"], 1)
|
|
114
114
|
filled_count +=1
|
|
115
115
|
else:
|
|
@@ -119,13 +119,13 @@ def fill_chisq(sumstats,log,verbose=True,filled_count=0):
|
|
|
119
119
|
def fill_or(sumstats,log,verbose=True,filled_count=0):
|
|
120
120
|
# BETA -> OR
|
|
121
121
|
if "BETA" in sumstats.columns:
|
|
122
|
-
|
|
122
|
+
log.write(" - Filling OR using BETA column...", verbose=verbose)
|
|
123
123
|
sumstats["OR"] = np.exp(sumstats["BETA"])
|
|
124
124
|
filled_count +=1
|
|
125
125
|
# BETA/SE -> OR_95L / OR_95U
|
|
126
126
|
# get confidence interval 95
|
|
127
127
|
if ("BETA" in sumstats.columns) and ("SE" in sumstats.columns):
|
|
128
|
-
|
|
128
|
+
log.write(" - Filling OR_95L/OR_95U using BETA/SE columns...", verbose=verbose)
|
|
129
129
|
# beta - 1.96 x se , beta + 1.96 x se
|
|
130
130
|
sumstats["OR_95L"] = np.exp(sumstats["BETA"]-ss.norm.ppf(0.975)*sumstats["SE"])
|
|
131
131
|
sumstats["OR_95U"] = np.exp(sumstats["BETA"]+ss.norm.ppf(0.975)*sumstats["SE"])
|
|
@@ -136,7 +136,7 @@ def fill_or(sumstats,log,verbose=True,filled_count=0):
|
|
|
136
136
|
def fill_or95(sumstats,log,verbose=True,filled_count=0):
|
|
137
137
|
# get confidence interval 95
|
|
138
138
|
if ("BETA" in sumstats.columns) and ("SE" in sumstats.columns):
|
|
139
|
-
|
|
139
|
+
log.write(" - Filling OR_95L/OR_95U using BETA/SE columns...", verbose=verbose)
|
|
140
140
|
# beta - 1.96 x se , beta + 1.96 x se
|
|
141
141
|
sumstats["OR_95L"] = np.exp(sumstats["BETA"]-ss.norm.ppf(0.975)*sumstats["SE"])
|
|
142
142
|
sumstats["OR_95U"] = np.exp(sumstats["BETA"]+ss.norm.ppf(0.975)*sumstats["SE"])
|
|
@@ -148,7 +148,7 @@ def fill_or95(sumstats,log,verbose=True,filled_count=0):
|
|
|
148
148
|
def fill_beta(sumstats,log,verbose=True,filled_count=0):
|
|
149
149
|
# OR -> beta
|
|
150
150
|
if "OR" in sumstats.columns:
|
|
151
|
-
|
|
151
|
+
log.write(" - Filling BETA value using OR column...", verbose=verbose)
|
|
152
152
|
sumstats["BETA"] = np.log(sumstats["OR"])
|
|
153
153
|
filled_count +=1
|
|
154
154
|
else:
|
|
@@ -158,27 +158,27 @@ def fill_beta(sumstats,log,verbose=True,filled_count=0):
|
|
|
158
158
|
def fill_se(sumstats,log,verbose=True,filled_count=0):
|
|
159
159
|
# OR / OR_95L /OR_95U -> SE
|
|
160
160
|
if ("P" in sumstats.columns) and ("BETA" in sumstats.columns):
|
|
161
|
-
|
|
161
|
+
log.write(" - Filling SE value using BETA and P column...", verbose=verbose)
|
|
162
162
|
sumstats["SE"]= np.abs(sumstats["BETA"]/ ss.norm.ppf(1-sumstats["P"]/2))
|
|
163
163
|
filled_count +=1
|
|
164
164
|
elif ("OR" in sumstats.columns) and ("OR_95U" in sumstats.columns):
|
|
165
|
-
|
|
165
|
+
log.write(" - Filling SE value using OR/OR_95U column...", verbose=verbose)
|
|
166
166
|
#
|
|
167
167
|
sumstats["SE"]=(np.log(sumstats["OR_95U"]) - np.log(sumstats["OR"]))/ss.norm.ppf(0.975)
|
|
168
168
|
filled_count +=1
|
|
169
169
|
elif ("OR" in sumstats.columns) and ("OR_95L" in sumstats.columns):
|
|
170
|
-
|
|
170
|
+
log.write(" - Filling SE value using OR/OR_95L column...", verbose=verbose)
|
|
171
171
|
sumstats["SE"]=(np.log(sumstats["OR"]) - np.log(sumstats["OR_95L"]))/ss.norm.ppf(0.975)
|
|
172
172
|
filled_count +=1
|
|
173
173
|
else:
|
|
174
|
-
|
|
174
|
+
log.write(" - Not enough information to fill SE...", verbose=verbose)
|
|
175
175
|
return 0,filled_count
|
|
176
176
|
return 1,filled_count
|
|
177
177
|
|
|
178
178
|
def fill_mlog10p(sumstats,log,verbose=True,filled_count=0):
|
|
179
179
|
if "P" in sumstats.columns:
|
|
180
180
|
# P -> MLOG10P
|
|
181
|
-
|
|
181
|
+
log.write(" - Filling MLOG10P using P column...", verbose=verbose)
|
|
182
182
|
sumstats["MLOG10P"] = -np.log10(sumstats["P"])
|
|
183
183
|
filled_count +=1
|
|
184
184
|
else:
|
|
@@ -188,14 +188,14 @@ def fill_extreme_mlog10p(sumstats,log,verbose=True,filled_count=0):
|
|
|
188
188
|
# ref: https://stackoverflow.com/questions/46416027/how-to-compute-p-values-from-z-scores-in-r-when-the-z-score-is-large-pvalue-muc/46416222#46416222
|
|
189
189
|
if "Z" in sumstats.columns:
|
|
190
190
|
# P -> MLOG10P
|
|
191
|
-
|
|
191
|
+
log.write(" - Filling MLOG10P using Z column...", verbose=verbose)
|
|
192
192
|
sumstats = fill_extreme_mlog10(sumstats, "Z")
|
|
193
193
|
filled_count +=1
|
|
194
194
|
elif "BETA" in sumstats.columns and "SE" in sumstats.columns:
|
|
195
|
-
|
|
196
|
-
|
|
195
|
+
log.write(" - Z column not available...", verbose=verbose)
|
|
196
|
+
log.write(" - Filling Z using BETA/SE column...", verbose=verbose)
|
|
197
197
|
sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
|
|
198
|
-
|
|
198
|
+
log.write(" - Filling MLOG10P using Z column...", verbose=verbose)
|
|
199
199
|
sumstats = fill_extreme_mlog10(sumstats, "Z")
|
|
200
200
|
filled_count +=1
|
|
201
201
|
else:
|
|
@@ -205,7 +205,7 @@ def fill_extreme_mlog10p(sumstats,log,verbose=True,filled_count=0):
|
|
|
205
205
|
def fill_maf(sumstats,log,verbose=True,filled_count=0):
|
|
206
206
|
if "EAF" in sumstats.columns:
|
|
207
207
|
# EAF -> MAF
|
|
208
|
-
|
|
208
|
+
log.write(" - Filling MAF using EAF column...", verbose=verbose)
|
|
209
209
|
sumstats["MAF"] = sumstats["EAF"].apply(lambda x: min(x,1-x) if pd.notnull(x) else np.nan)
|
|
210
210
|
filled_count +=1
|
|
211
211
|
else:
|
|
@@ -226,7 +226,7 @@ def fill_extreme_mlog10(sumstats, z):
|
|
|
226
226
|
####################################################################################################################
|
|
227
227
|
def fill_iteratively(sumstats,raw_to_fill,log,only_sig,df,extreme,verbose,sig_level):
|
|
228
228
|
to_fill = raw_to_fill.copy()
|
|
229
|
-
|
|
229
|
+
log.write(" - Filling Columns iteratively...", verbose=verbose)
|
|
230
230
|
|
|
231
231
|
filled_count=0
|
|
232
232
|
for i in range(len(to_fill)+1):
|
gwaslab/util_in_filter_value.py
CHANGED
|
@@ -10,65 +10,66 @@ from gwaslab.g_vchange_status import vchange_status
|
|
|
10
10
|
from gwaslab.qc_fix_sumstats import sortcoordinate
|
|
11
11
|
from gwaslab.qc_fix_sumstats import start_to
|
|
12
12
|
from gwaslab.qc_fix_sumstats import finished
|
|
13
|
+
from gwaslab.hm_harmonize_sumstats import is_palindromic
|
|
13
14
|
|
|
14
15
|
import gc
|
|
15
16
|
def filtervalues(sumstats,expr,remove=False,verbose=True,log=Log()):
|
|
16
|
-
|
|
17
|
+
log.write("Start filtering values by condition:",expr, verbose=verbose)
|
|
17
18
|
prenum = len(sumstats)
|
|
18
19
|
sumstats = sumstats.query(expr,engine='python').copy()
|
|
19
20
|
afternum = len(sumstats)
|
|
20
|
-
|
|
21
|
-
|
|
21
|
+
log.write(" -Removing "+ str(prenum-afternum) +" variants not meeting the conditions:",expr, verbose=verbose)
|
|
22
|
+
log.write("Finished filtering values.", verbose=verbose)
|
|
22
23
|
gc.collect()
|
|
23
24
|
return sumstats
|
|
24
25
|
|
|
25
26
|
def filterout(sumstats,interval={},lt={},gt={},eq={},remove=False,verbose=True,log=Log()):
|
|
26
|
-
|
|
27
|
+
log.write("Start filtering values:", verbose=verbose)
|
|
27
28
|
for key,threshold in gt.items():
|
|
28
29
|
num = len(sumstats.loc[sumstats[key]>threshold,:])
|
|
29
|
-
|
|
30
|
+
log.write(" -Removing "+ str(num) +" variants with "+key+" > "+ str(threshold)+" ...", verbose=verbose)
|
|
30
31
|
sumstats = sumstats.loc[sumstats[key]<threshold,:]
|
|
31
32
|
for key,threshold in lt.items():
|
|
32
33
|
num = len(sumstats.loc[sumstats[key]<threshold,:])
|
|
33
|
-
|
|
34
|
+
log.write(" -Removing "+ str(num) +" variants with "+key+" < "+ str(threshold)+" ...", verbose=verbose)
|
|
34
35
|
sumstats = sumstats.loc[sumstats[key]>threshold,:]
|
|
35
36
|
for key,threshold in eq.items():
|
|
36
37
|
num = len(sumstats.loc[sumstats[key]==threshold,:])
|
|
37
|
-
|
|
38
|
+
log.write(" -Removing "+ str(num) +" variants with "+key+" = "+ str(threshold)+" ...", verbose=verbose)
|
|
38
39
|
sumstats = sumstats.loc[sumstats[key]!=threshold,:]
|
|
39
|
-
|
|
40
|
+
log.write("Finished filtering values.", verbose=verbose)
|
|
40
41
|
gc.collect()
|
|
41
42
|
return sumstats.copy()
|
|
42
43
|
|
|
43
44
|
def filterin(sumstats,lt={},gt={},eq={},remove=False,verbose=True,log=Log()):
|
|
44
|
-
|
|
45
|
+
log.write("Start filtering values:", verbose=verbose)
|
|
45
46
|
for key,threshold in gt.items():
|
|
46
47
|
num = len(sumstats.loc[sumstats[key]>threshold,:])
|
|
47
|
-
|
|
48
|
+
log.write(" -Keeping "+ str(num) +" variants with "+key+" > "+ str(threshold)+" ...", verbose=verbose)
|
|
48
49
|
sumstats = sumstats.loc[sumstats[key]>threshold,:]
|
|
49
50
|
for key,threshold in lt.items():
|
|
50
51
|
num = len(sumstats.loc[sumstats[key]<threshold,:])
|
|
51
|
-
|
|
52
|
+
log.write(" -Keeping "+ str(num) +" variants with "+key+" < "+ str(threshold)+" ...", verbose=verbose)
|
|
52
53
|
sumstats = sumstats.loc[sumstats[key]<threshold,:]
|
|
53
54
|
for key,threshold in eq.items():
|
|
54
55
|
num = len(sumstats.loc[sumstats[key]==threshold,:])
|
|
55
|
-
|
|
56
|
+
log.write(" -Keeping "+ str(num) +" variants with "+key+" = "+ str(threshold)+" ...", verbose=verbose)
|
|
56
57
|
sumstats = sumstats.loc[sumstats[key]==threshold,:]
|
|
57
|
-
|
|
58
|
+
log.write("Finished filtering values.", verbose=verbose)
|
|
58
59
|
gc.collect()
|
|
59
60
|
return sumstats.copy()
|
|
60
61
|
|
|
61
62
|
def filterregionin(sumstats,path=None, chrom="CHR",pos="POS", high_ld=False, build="19", verbose=True,log=Log()):
|
|
62
63
|
sumstats = sortcoordinate(sumstats,verbose=verbose)
|
|
63
|
-
|
|
64
|
-
|
|
64
|
+
log.write("Start to filter in variants if in intervals defined in bed files:", verbose=verbose)
|
|
65
|
+
log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns), verbose=verbose)
|
|
65
66
|
|
|
66
67
|
if high_ld is True:
|
|
67
68
|
path = get_high_ld(build=build)
|
|
68
|
-
|
|
69
|
+
log.write(" -Loading bed format file for hg"+build, verbose=verbose)
|
|
69
70
|
|
|
70
71
|
else:
|
|
71
|
-
|
|
72
|
+
log.write(" -Loading bed format file: " , path, verbose=verbose)
|
|
72
73
|
bed = pd.read_csv(path,sep="\s+",header=None,dtype={0:"string",1:"Int64",2:"Int64"})
|
|
73
74
|
|
|
74
75
|
bed["tuple"] = bed.apply(lambda x: (x[1],x[2]),axis=1)
|
|
@@ -80,7 +81,7 @@ def filterregionin(sumstats,path=None, chrom="CHR",pos="POS", high_ld=False, bui
|
|
|
80
81
|
sumstats = sumstats.sort_values(["CHR","POS"])
|
|
81
82
|
|
|
82
83
|
if len(bed)<100:
|
|
83
|
-
|
|
84
|
+
log.write(" -Bed file < 100 lines: using pd IntervalIndex... ", verbose=verbose)
|
|
84
85
|
for i in sumstats[chrom].unique():
|
|
85
86
|
if sum(bed[0]==i)>0:
|
|
86
87
|
interval = pd.IntervalIndex.from_tuples(bed.loc[bed[0]==i,"tuple"])
|
|
@@ -88,7 +89,7 @@ def filterregionin(sumstats,path=None, chrom="CHR",pos="POS", high_ld=False, bui
|
|
|
88
89
|
else:
|
|
89
90
|
continue
|
|
90
91
|
else:
|
|
91
|
-
|
|
92
|
+
log.write(" -Bed file > 100 lines: using two pointers, please make files are all sorted... ", verbose=verbose)
|
|
92
93
|
bed_num =0
|
|
93
94
|
bed_chr =bed.iloc[bed_num,0]
|
|
94
95
|
bed_left =bed.iloc[bed_num,1]
|
|
@@ -136,23 +137,23 @@ def filterregionin(sumstats,path=None, chrom="CHR",pos="POS", high_ld=False, bui
|
|
|
136
137
|
## in
|
|
137
138
|
|
|
138
139
|
sumstats = sumstats.loc[sumstats["bed_indicator"],:]
|
|
139
|
-
|
|
140
|
-
|
|
140
|
+
log.write(" -Number of variants in the specified regions to keep:",sum(sumstats["bed_indicator"]), verbose=verbose)
|
|
141
|
+
log.write(" -Number of variants removed:",sum(~sumstats["bed_indicator"]), verbose=verbose)
|
|
141
142
|
sumstats = sumstats.drop(columns="bed_indicator")
|
|
142
|
-
|
|
143
|
+
log.write("Finished filtering in variants.", verbose=verbose)
|
|
143
144
|
gc.collect()
|
|
144
145
|
return sumstats
|
|
145
146
|
|
|
146
147
|
def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, build="19", verbose=True,log=Log()):
|
|
147
148
|
sumstats = sortcoordinate(sumstats,verbose=verbose)
|
|
148
|
-
|
|
149
|
-
|
|
149
|
+
log.write("Start to filter out variants if in intervals defined in bed files:", verbose=verbose)
|
|
150
|
+
log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns), verbose=verbose)
|
|
150
151
|
if high_ld is True:
|
|
151
152
|
path = get_high_ld(build=build)
|
|
152
|
-
|
|
153
|
+
log.write(" -Loading bed format file for hg"+build, verbose=verbose)
|
|
153
154
|
|
|
154
155
|
else:
|
|
155
|
-
|
|
156
|
+
log.write(" -Loading bed format file: " , path, verbose=verbose)
|
|
156
157
|
|
|
157
158
|
bed = pd.read_csv(path,sep="\s+",header=None,dtype={0:"string",1:"Int64",2:"Int64"})
|
|
158
159
|
bed["tuple"] = bed.apply(lambda x: (x[1],x[2]),axis=1)
|
|
@@ -164,7 +165,7 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
|
|
|
164
165
|
bed[0]=bed[0].astype("Int64")
|
|
165
166
|
|
|
166
167
|
if len(bed)<100:
|
|
167
|
-
|
|
168
|
+
log.write(" -Bed file < 100 lines: using pd IntervalIndex... ", verbose=verbose)
|
|
168
169
|
for i in sumstats[chrom].unique():
|
|
169
170
|
if sum(bed[0]==i)>0:
|
|
170
171
|
interval = pd.IntervalIndex.from_tuples(bed.loc[bed[0]==i,"tuple"])
|
|
@@ -172,7 +173,7 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
|
|
|
172
173
|
else:
|
|
173
174
|
continue
|
|
174
175
|
else:
|
|
175
|
-
|
|
176
|
+
log.write(" -Bed file > 100 lines: using two pointers, please make files are all sorted... ", verbose=verbose)
|
|
176
177
|
bed_num =0
|
|
177
178
|
bed_chr =bed.iloc[bed_num,0]
|
|
178
179
|
bed_left =bed.iloc[bed_num,1]
|
|
@@ -208,10 +209,10 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
|
|
|
208
209
|
## out
|
|
209
210
|
|
|
210
211
|
sumstats = sumstats.loc[~sumstats["bed_indicator"],:]
|
|
211
|
-
|
|
212
|
-
|
|
212
|
+
log.write(" -Number of variants in the specified regions to exclude:",sum(sumstats["bed_indicator"]), verbose=verbose)
|
|
213
|
+
log.write(" -Number of variants left:",len(sumstats), verbose=verbose)
|
|
213
214
|
sumstats = sumstats.drop(columns="bed_indicator")
|
|
214
|
-
|
|
215
|
+
log.write("Finished filtering out variants.", verbose=verbose)
|
|
215
216
|
gc.collect()
|
|
216
217
|
return sumstats
|
|
217
218
|
|
|
@@ -235,14 +236,14 @@ def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NE
|
|
|
235
236
|
############################################################################################
|
|
236
237
|
|
|
237
238
|
inferred_build="Unknown"
|
|
238
|
-
|
|
239
|
+
log.write("Start to infer genome build version using hapmap3 SNPs...", verbose=verbose)
|
|
239
240
|
data_path_19 = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz'
|
|
240
241
|
data_path_38 = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz'
|
|
241
|
-
|
|
242
|
+
log.write(" -Loading Hapmap3 variants data...", verbose=verbose)
|
|
242
243
|
hapmap3_ref_19 = pd.read_csv(data_path_19,sep="\s+",usecols=["#CHROM","POS"],dtype={"#CHROM":"string","POS":"string"})
|
|
243
244
|
hapmap3_ref_38 = pd.read_csv(data_path_38,sep="\s+",usecols=["#CHROM","POS"],dtype={"#CHROM":"string","POS":"string"})
|
|
244
245
|
|
|
245
|
-
|
|
246
|
+
log.write(" -CHR:POS will be used for matching...", verbose=verbose)
|
|
246
247
|
raw_chrpos = sumstats[chrom].astype("string")+":"+sumstats[pos].astype("string")
|
|
247
248
|
|
|
248
249
|
hapmap3_ref_19["chr:pos"] = hapmap3_ref_19["#CHROM"]+":"+hapmap3_ref_19["POS"]
|
|
@@ -251,50 +252,50 @@ def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NE
|
|
|
251
252
|
match_count_for_19 = sum(raw_chrpos.isin(hapmap3_ref_19["chr:pos"].values))
|
|
252
253
|
match_count_for_38 = sum(raw_chrpos.isin(hapmap3_ref_38["chr:pos"].values))
|
|
253
254
|
|
|
254
|
-
|
|
255
|
-
|
|
255
|
+
log.write(" -Matching variants for hg19: num_hg19 = ",match_count_for_19, verbose=verbose)
|
|
256
|
+
log.write(" -Matching variants for hg38: num_hg38 = ",match_count_for_38, verbose=verbose)
|
|
256
257
|
|
|
257
258
|
if max(match_count_for_19, match_count_for_38)<10000:
|
|
258
|
-
|
|
259
|
+
log.warning("Please be cautious due to the limited number of variants.", verbose=verbose)
|
|
259
260
|
|
|
260
261
|
if match_count_for_19 > match_count_for_38:
|
|
261
|
-
|
|
262
|
+
log.write(" -Since num_hg19 >> num_hg38, assigning genome build hg19...", verbose=verbose)
|
|
262
263
|
sumstats[status] = vchange_status(sumstats[status],1,"9","1")
|
|
263
264
|
sumstats[status] = vchange_status(sumstats[status],2,"9","9")
|
|
264
265
|
inferred_build="19"
|
|
265
266
|
elif match_count_for_19 < match_count_for_38:
|
|
266
|
-
|
|
267
|
+
log.write(" -Since num_hg19 << num_hg38, assigning genome build hg38...", verbose=verbose)
|
|
267
268
|
sumstats[status] = vchange_status(sumstats[status],1,"9","3")
|
|
268
269
|
sumstats[status] = vchange_status(sumstats[status],2,"9","8")
|
|
269
270
|
inferred_build="38"
|
|
270
271
|
else:
|
|
271
|
-
|
|
272
|
+
log.write(" -Since num_hg19 = num_hg38, unable to infer...", verbose=verbose)
|
|
272
273
|
|
|
273
274
|
finished(log,verbose,_end_line)
|
|
274
275
|
return sumstats, inferred_build
|
|
275
276
|
|
|
276
277
|
def sampling(sumstats,n=1, p=None, verbose=True,log=Log(),**args):
|
|
277
278
|
|
|
278
|
-
|
|
279
|
+
log.write("Start to randomly select variants from the sumstats...", verbose=verbose)
|
|
279
280
|
if p is None:
|
|
280
|
-
|
|
281
|
+
log.write(" -Number of variants selected from the sumstats:",n, verbose=verbose)
|
|
281
282
|
if n > len(sumstats):
|
|
282
283
|
raise ValueError("Please input a number < {}".format(len(sumstats)))
|
|
283
284
|
else:
|
|
284
285
|
if p>-0.00000001 and p<1.00000001:
|
|
285
|
-
|
|
286
|
+
log.write(" -Percentage of variants selected from the sumstats: ",p, verbose=verbose)
|
|
286
287
|
n = int(len(sumstats)*p)
|
|
287
|
-
|
|
288
|
+
log.write(" -Number of variants selected from the sumstats:",n, verbose=verbose)
|
|
288
289
|
else:
|
|
289
290
|
raise ValueError("Please input a number in (0,1)")
|
|
290
291
|
|
|
291
292
|
if "random_state" in args.keys():
|
|
292
|
-
|
|
293
|
+
log.write(" -Random state (seed): {}".format(args["random_state"]), verbose=verbose)
|
|
293
294
|
else:
|
|
294
295
|
args["random_state"] = np.random.randint(0,4294967295)
|
|
295
|
-
|
|
296
|
+
log.write(" -Random state (seed): {}".format(args["random_state"]), verbose=verbose)
|
|
296
297
|
sampled = sumstats.sample(n=n,**args)
|
|
297
|
-
|
|
298
|
+
log.write("Finished sampling...", verbose=verbose)
|
|
298
299
|
gc.collect()
|
|
299
300
|
return sampled
|
|
300
301
|
|
|
@@ -322,8 +323,8 @@ def _get_flanking(sumstats, snpid, windowsizekb=500, verbose=True,log=Log(),**ar
|
|
|
322
323
|
def _get_flanking_by_id(sumstats, snpid, windowsizekb=500, verbose=True,log=Log(),**args):
|
|
323
324
|
|
|
324
325
|
log.write("Start to extract variants in the flanking regions using rsID or SNPID...",verbose=verbose)
|
|
325
|
-
log.write(" - Central variants: {}".format(snpid))
|
|
326
|
-
log.write(" - Flanking windowsize in kb: {}".format(windowsizekb))
|
|
326
|
+
log.write(" - Central variants: {}".format(snpid), verbose=verbose)
|
|
327
|
+
log.write(" - Flanking windowsize in kb: {}".format(windowsizekb), verbose=verbose)
|
|
327
328
|
|
|
328
329
|
if type(snpid) == str:
|
|
329
330
|
snpid = [snpid]
|
|
@@ -361,8 +362,8 @@ def _get_flanking_by_id(sumstats, snpid, windowsizekb=500, verbose=True,log=Log(
|
|
|
361
362
|
def _get_flanking_by_chrpos(sumstats, chrpos, windowsizekb=500, verbose=True,log=Log(),**args):
|
|
362
363
|
|
|
363
364
|
log.write("Start to extract variants in the flanking regions using CHR and POS...",verbose=verbose)
|
|
364
|
-
log.write(" - Central positions: {}".format(chrpos))
|
|
365
|
-
log.write(" - Flanking windowsize in kb: {}".format(windowsizekb))
|
|
365
|
+
log.write(" - Central positions: {}".format(chrpos), verbose=verbose)
|
|
366
|
+
log.write(" - Flanking windowsize in kb: {}".format(windowsizekb), verbose=verbose)
|
|
366
367
|
|
|
367
368
|
if type(chrpos) == tuple:
|
|
368
369
|
chrpos_to_check = [chrpos]
|
|
@@ -389,4 +390,42 @@ def _get_flanking_by_chrpos(sumstats, chrpos, windowsizekb=500, verbose=True,log
|
|
|
389
390
|
log.write(" - Extracted {} variants in the regions.".format(len(flanking)),verbose=verbose)
|
|
390
391
|
log.write("Finished extracting variants in the flanking regions.",verbose=verbose)
|
|
391
392
|
|
|
392
|
-
return flanking
|
|
393
|
+
return flanking
|
|
394
|
+
|
|
395
|
+
def _filter_palindromic(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
|
|
396
|
+
log.write("Start to filter palindromic variants...",verbose=verbose)
|
|
397
|
+
is_palindromic_snp = is_palindromic(sumstats[[nea,ea]],a1=nea,a2=ea)
|
|
398
|
+
|
|
399
|
+
log.write(" -Identified palindromic variants: {}".format(sum(is_palindromic_snp)),verbose=verbose)
|
|
400
|
+
|
|
401
|
+
if mode=="in":
|
|
402
|
+
palindromic = sumstats.loc[is_palindromic_snp,:]
|
|
403
|
+
else:
|
|
404
|
+
palindromic = sumstats.loc[~is_palindromic_snp,:]
|
|
405
|
+
|
|
406
|
+
log.write("Finished filtering palindromic variants.",verbose=verbose)
|
|
407
|
+
return palindromic
|
|
408
|
+
|
|
409
|
+
def _filter_indel(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
|
|
410
|
+
log.write("Start to filter indels...",verbose=verbose)
|
|
411
|
+
is_indel = (sumstats[ea].str.len()!=sumstats[nea].str.len())
|
|
412
|
+
|
|
413
|
+
log.write(" -Identified indels: {}".format(sum(is_indel)),verbose=verbose)
|
|
414
|
+
if mode=="in":
|
|
415
|
+
indel = sumstats.loc[is_indel,:]
|
|
416
|
+
else:
|
|
417
|
+
indel = sumstats.loc[~is_indel,:]
|
|
418
|
+
log.write("Finished filtering indels.",verbose=verbose)
|
|
419
|
+
return indel
|
|
420
|
+
|
|
421
|
+
def _filter_snp(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
|
|
422
|
+
log.write("Start to filter SNPs...",verbose=verbose)
|
|
423
|
+
is_snp = (sumstats[ea].str.len()==1) &(sumstats[nea].str.len()==1)
|
|
424
|
+
|
|
425
|
+
log.write(" -Identified SNPs: {}".format(sum(is_snp)),verbose=verbose)
|
|
426
|
+
if mode=="in":
|
|
427
|
+
snp = sumstats.loc[is_snp,:]
|
|
428
|
+
else:
|
|
429
|
+
snp = sumstats.loc[~is_snp,:]
|
|
430
|
+
log.write("Finished filtering SNPs.",verbose=verbose)
|
|
431
|
+
return snp
|
gwaslab/util_in_get_density.py
CHANGED
|
@@ -5,9 +5,9 @@ from gwaslab.g_Log import Log
|
|
|
5
5
|
import gc
|
|
6
6
|
|
|
7
7
|
def getsignaldensity(insumstats, id="SNPID", chrom="CHR",pos="POS", bwindowsizekb=100,log=Log(),verbose=True):
|
|
8
|
-
|
|
8
|
+
log.write("Start to calculate signal DENSITY..." ,verbose=verbose)
|
|
9
9
|
sumstats = insumstats[[id,chrom,pos]].copy()
|
|
10
|
-
|
|
10
|
+
log.write(" -Calculating DENSITY with windowsize of ",bwindowsizekb ," kb",verbose=verbose)
|
|
11
11
|
#stack=[]
|
|
12
12
|
|
|
13
13
|
large_number = 1000000000
|
|
@@ -58,13 +58,13 @@ def getsignaldensity(insumstats, id="SNPID", chrom="CHR",pos="POS", bwindowsizek
|
|
|
58
58
|
bmax = sumstats["DENSITY"].max()
|
|
59
59
|
bmaxid = sumstats["DENSITY"].idxmax()
|
|
60
60
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
61
|
+
log.write(" -Mean : {} signals per {} kb".format(bmean,bwindowsizekb),verbose=verbose)
|
|
62
|
+
log.write(" -SD : {}".format(bsd),verbose=verbose)
|
|
63
|
+
log.write(" -Median : {} signals per {} kb".format(bmedian,bwindowsizekb),verbose=verbose)
|
|
64
|
+
log.write(" -Max : {} signals per {} kb at variant(s) {}".format(bmax,bwindowsizekb,sumstats.loc[bmaxid,id]),verbose=verbose)
|
|
65
65
|
|
|
66
66
|
sumstats = sumstats.drop("TCHR+POS",axis=1)
|
|
67
|
-
|
|
67
|
+
log.write("Finished calculating signal DENSITY successfully!",verbose=verbose)
|
|
68
68
|
return sumstats["DENSITY"]
|
|
69
69
|
|
|
70
70
|
def assigndensity(insumstats,
|
|
@@ -92,7 +92,7 @@ def assigndensity(insumstats,
|
|
|
92
92
|
to_add =(sumstats["TCHR+POS"]>=(row["TCHR+POS"]- 1000*bwindowsizekb)) & (sumstats["TCHR+POS"]<=(row["TCHR+POS"]+ 1000*bwindowsizekb))
|
|
93
93
|
sumstats.loc[to_add,"DENSITY"] += 1
|
|
94
94
|
if counter%1000==0:
|
|
95
|
-
|
|
95
|
+
log.write(" -Processed {} signals".format(counter//1000),verbose=verbose)
|
|
96
96
|
gc.collect()
|
|
97
97
|
|
|
98
98
|
return sumstats["DENSITY"]
|