gwaslab 3.4.35__py3-none-any.whl → 3.4.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/__init__.py +1 -1
- gwaslab/bd_common_data.py +4 -2
- gwaslab/g_Sumstats.py +56 -33
- gwaslab/g_meta.py +13 -3
- gwaslab/g_version.py +2 -2
- gwaslab/hm_harmonize_sumstats.py +43 -18
- gwaslab/io_preformat_input.py +9 -0
- gwaslab/qc_check_datatype.py +14 -0
- gwaslab/qc_fix_sumstats.py +278 -119
- gwaslab/util_ex_process_h5.py +26 -17
- gwaslab/util_in_fill_data.py +50 -12
- gwaslab/viz_aux_quickfix.py +53 -52
- gwaslab/viz_plot_compare_effect.py +27 -8
- gwaslab/viz_plot_forestplot.py +1 -1
- gwaslab/viz_plot_mqqplot.py +127 -48
- gwaslab/viz_plot_regionalplot.py +20 -9
- gwaslab/viz_plot_rg_heatmap.py +8 -4
- {gwaslab-3.4.35.dist-info → gwaslab-3.4.37.dist-info}/METADATA +5 -6
- {gwaslab-3.4.35.dist-info → gwaslab-3.4.37.dist-info}/RECORD +22 -22
- {gwaslab-3.4.35.dist-info → gwaslab-3.4.37.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.35.dist-info → gwaslab-3.4.37.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.35.dist-info → gwaslab-3.4.37.dist-info}/top_level.txt +0 -0
gwaslab/qc_fix_sumstats.py
CHANGED
|
@@ -14,7 +14,12 @@ from gwaslab.bd_common_data import get_chr_to_number
|
|
|
14
14
|
from gwaslab.bd_common_data import get_number_to_chr
|
|
15
15
|
from gwaslab.bd_common_data import get_chr_list
|
|
16
16
|
from gwaslab.qc_check_datatype import check_datatype
|
|
17
|
+
from gwaslab.qc_check_datatype import check_dataframe_shape
|
|
17
18
|
from gwaslab.g_version import _get_version
|
|
19
|
+
from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
|
|
20
|
+
from gwaslab.util_in_fill_data import _convert_betase_to_p
|
|
21
|
+
from gwaslab.util_in_fill_data import _convert_mlog10p_to_p
|
|
22
|
+
#process build
|
|
18
23
|
#setbuild
|
|
19
24
|
#fixID
|
|
20
25
|
#rsidtochrpos
|
|
@@ -26,6 +31,7 @@ from gwaslab.g_version import _get_version
|
|
|
26
31
|
#normalizevariant
|
|
27
32
|
#checkref
|
|
28
33
|
#sanitycheckstats
|
|
34
|
+
#_check_data_consistency
|
|
29
35
|
#flipallelestats
|
|
30
36
|
#parallelizeassignrsid
|
|
31
37
|
#sortcoordinate
|
|
@@ -41,7 +47,7 @@ def _process_build(build,log,verbose):
|
|
|
41
47
|
log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
|
|
42
48
|
final_build = "38"
|
|
43
49
|
else:
|
|
44
|
-
log.write(" -Version of genomic coordinates
|
|
50
|
+
log.write(" -WARNING! Version of genomic coordinates is unknown...", verbose=verbose)
|
|
45
51
|
final_build = "99"
|
|
46
52
|
return final_build
|
|
47
53
|
|
|
@@ -49,10 +55,10 @@ def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
|
|
|
49
55
|
build = _process_build(build,log=log,verbose=verbose)
|
|
50
56
|
sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status], 1, "139",build[0]*3)
|
|
51
57
|
sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status], 2, "89",build[1]*3)
|
|
52
|
-
return sumstats
|
|
58
|
+
return sumstats, build
|
|
53
59
|
|
|
54
60
|
def fixID(sumstats,
|
|
55
|
-
snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",
|
|
61
|
+
snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",fixprefix=False,
|
|
56
62
|
fixchrpos=False,fixid=False,fixeanea=False,fixeanea_flip=False,fixsep=False,
|
|
57
63
|
overwrite=False,verbose=True,forcefixid=False,log=Log()):
|
|
58
64
|
'''
|
|
@@ -61,37 +67,64 @@ def fixID(sumstats,
|
|
|
61
67
|
3. checking rsid and chr:pos:nea:ea
|
|
62
68
|
'''
|
|
63
69
|
if verbose: log.write("Start to check IDs...{}".format(_get_version()))
|
|
64
|
-
|
|
65
|
-
|
|
70
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
66
71
|
check_col(sumstats,[snpid,rsid],status)
|
|
67
|
-
|
|
72
|
+
|
|
73
|
+
############################ checking datatype ###################################################
|
|
74
|
+
if rsid in sumstats.columns:
|
|
75
|
+
# convert to string datatype
|
|
76
|
+
try:
|
|
77
|
+
log.write(" -Checking rsID data type...",verbose=verbose)
|
|
78
|
+
if sumstats.loc[:,rsid].dtype == "string":
|
|
79
|
+
pass
|
|
80
|
+
else:
|
|
81
|
+
log.write(" -Converting rsID to pd.string data type...",verbose=verbose)
|
|
82
|
+
sumstats.loc[:,rsid] = sumstats.loc[:,rsid].astype("string")
|
|
83
|
+
except:
|
|
84
|
+
log.write(" -Force converting rsID to pd.string data type...",verbose=verbose)
|
|
85
|
+
sumstats.loc[:,rsid] = sumstats.loc[:,rsid].astype("string")
|
|
86
|
+
if snpid in sumstats.columns:
|
|
87
|
+
# convert to string datatype
|
|
88
|
+
try:
|
|
89
|
+
log.write(" -Checking SNPID data type...",verbose=verbose)
|
|
90
|
+
if sumstats.loc[:,snpid].dtype == "string":
|
|
91
|
+
pass
|
|
92
|
+
else:
|
|
93
|
+
log.write(" -Converting SNPID to pd.string data type...",verbose=verbose)
|
|
94
|
+
sumstats.loc[:,snpid] = sumstats.loc[:,snpid].astype("string")
|
|
95
|
+
except:
|
|
96
|
+
log.write(" -Force converting SNPID to pd.string data type...",verbose=verbose)
|
|
97
|
+
sumstats.loc[:,snpid] = sumstats.loc[:,snpid].astype("string")
|
|
98
|
+
|
|
68
99
|
############################ checking ###################################################
|
|
69
100
|
if snpid in sumstats.columns:
|
|
70
|
-
|
|
71
|
-
#
|
|
101
|
+
log.write(" -Checking if SNPID is CHR:POS:NEA:EA...(separator: - ,: , _)",verbose=verbose)
|
|
102
|
+
# check if SNPID is CHR:POS:EA:NEA
|
|
72
103
|
is_chrposrefalt = sumstats[snpid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
|
|
104
|
+
# check if SNPID is NA
|
|
73
105
|
is_snpid_na = sumstats[snpid].isna()
|
|
106
|
+
|
|
107
|
+
# change STATUS code
|
|
74
108
|
sumstats.loc[ is_chrposrefalt,status] = vchange_status(sumstats.loc[ is_chrposrefalt,status],3 ,"975" ,"630")
|
|
75
109
|
sumstats.loc[(~is_chrposrefalt)&(~is_snpid_na),status] = vchange_status(sumstats.loc[(~is_chrposrefalt)&(~is_snpid_na),status],3 ,"975" ,"842")
|
|
76
110
|
|
|
77
111
|
if rsid in sumstats.columns:
|
|
78
|
-
|
|
79
|
-
is_rsid = sumstats[rsid].str.
|
|
112
|
+
log.write(" -Checking if rsID is rsxxxxxx...", verbose=verbose)
|
|
113
|
+
is_rsid = sumstats[rsid].str.match(r'^rs\d+$', case=False, flags=0, na=False)
|
|
80
114
|
|
|
81
115
|
sumstats.loc[ is_rsid,status] = vchange_status(sumstats.loc[ is_rsid,status], 3, "986","520")
|
|
82
116
|
sumstats.loc[~is_rsid,status] = vchange_status(sumstats.loc[~is_rsid,status], 3, "986","743")
|
|
83
117
|
|
|
84
|
-
if verbose: log.write(" -Checking if
|
|
85
|
-
is_rs_chrpos = sumstats[rsid].str.match(r'^\w+[:_-]\
|
|
86
|
-
#is_rs_chrpos = sumstats[rsid].str.match(r'(chr)?([0-9XYMT]+)[:_-]([0-9]+)[:_-]([ATCG]+)[:_-]([ATCG]+)', case=False, flags=0, na=False)
|
|
118
|
+
if verbose: log.write(" -Checking if CHR:POS:NEA:EA is mixed in rsID column ...")
|
|
119
|
+
is_rs_chrpos = sumstats[rsid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
|
|
87
120
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
121
|
+
log.write(" -Number of CHR:POS:NEA:EA mixed in rsID column :",sum(is_rs_chrpos), verbose=verbose)
|
|
122
|
+
log.write(" -Number of Unrecognized rsID :",len(sumstats) - sum(is_rs_chrpos) - sum(is_rsid) , verbose=verbose)
|
|
123
|
+
log.write(" -A look at the unrecognized rsID :",set(sumstats.loc[(~is_rsid)&(~is_rs_chrpos),rsid].head()),"...", verbose=verbose)
|
|
91
124
|
|
|
92
125
|
############################ fixing chr pos###################################################
|
|
93
|
-
if fixchrpos
|
|
94
|
-
# from snpid or rsid, extract
|
|
126
|
+
if fixchrpos == True:
|
|
127
|
+
# from snpid or rsid, extract CHR:POS to fix CHR and POS
|
|
95
128
|
if snpid in sumstats.columns:
|
|
96
129
|
if verbose: log.write(" -Fixing CHR and POS...")
|
|
97
130
|
if overwrite is True:
|
|
@@ -99,8 +132,8 @@ def fixID(sumstats,
|
|
|
99
132
|
# fix all
|
|
100
133
|
to_fix = is_chrposrefalt
|
|
101
134
|
|
|
102
|
-
#fix variants with chr and pos being empty
|
|
103
135
|
elif (chrom in sumstats.columns) and (pos in sumstats.columns) :
|
|
136
|
+
#fix variants with chr and pos being NA
|
|
104
137
|
to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
105
138
|
to_fix_num = sum(to_fix)
|
|
106
139
|
if to_fix_num and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
@@ -121,6 +154,7 @@ def fixID(sumstats,
|
|
|
121
154
|
to_fix_num = sum(to_fix)
|
|
122
155
|
if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
123
156
|
elif verbose: log.write(" -No fixable variants. ...")
|
|
157
|
+
|
|
124
158
|
else:
|
|
125
159
|
if verbose: log.write(" -Initiating CHR and POS columns...")
|
|
126
160
|
sumstats.loc[:,chrom]=pd.Series(dtype="string")
|
|
@@ -134,8 +168,8 @@ def fixID(sumstats,
|
|
|
134
168
|
if verbose: log.write(" -Filling CHR and POS columns using valid SNPID's chr:pos...")
|
|
135
169
|
# format and qc filled chr and pos
|
|
136
170
|
|
|
137
|
-
sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.
|
|
138
|
-
sumstats.loc[to_fix,pos] = sumstats.loc[to_fix,snpid].str.
|
|
171
|
+
sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
172
|
+
sumstats.loc[to_fix,pos] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[2]
|
|
139
173
|
|
|
140
174
|
#sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.split(':|_|-').str[0].str.strip("chrCHR").astype("string")
|
|
141
175
|
#sumstats.loc[to_fix,pos] =np.floor(pd.to_numeric(sumstats.loc[to_fix,snpid].str.split(':|_|-').str[1], errors='coerce')).astype('Int64')
|
|
@@ -179,55 +213,62 @@ def fixID(sumstats,
|
|
|
179
213
|
#sumstats.loc[to_fix,status] = vchange_status(sumstats.loc[to_fix,status], 4, "98765432","00000000").astype("string")
|
|
180
214
|
|
|
181
215
|
############################ fixing chr pos###################################################
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
216
|
+
if fixeanea == True:
|
|
217
|
+
if verbose: log.write(" -WARNING! gwaslab assumes SNPID is in the format of CHR:POS:NEA:EA / CHR:POS:REF:ALT")
|
|
218
|
+
if overwrite is True:
|
|
219
|
+
if verbose: log.write(" -Overwrite mode is applied...")
|
|
220
|
+
to_fix = is_chrposrefalt
|
|
221
|
+
elif (nea in sumstats.columns) and (nea in sumstats.columns):
|
|
222
|
+
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
223
|
+
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
224
|
+
elif (nea in sumstats.columns) and (ea not in sumstats.columns):
|
|
225
|
+
if verbose: log.write(" -Initiating EA columns...")
|
|
226
|
+
sumstats.loc[:,ea]=pd.Series(dtype="string")
|
|
227
|
+
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
228
|
+
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
229
|
+
elif (nea not in sumstats.columns) and (ea in sumstats.columns):
|
|
230
|
+
if verbose: log.write(" -Initiating NEA columns...")
|
|
231
|
+
sumstats.loc[:,nea]=pd.Series(dtype="string")
|
|
232
|
+
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
233
|
+
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
234
|
+
else:
|
|
235
|
+
if verbose: log.write(" -Initiating EA and NEA columns...")
|
|
236
|
+
sumstats[nea]=pd.Series(dtype="string")
|
|
237
|
+
sumstats[ea]=pd.Series(dtype="string")
|
|
238
|
+
to_fix = is_chrposrefalt
|
|
239
|
+
if sum(to_fix)>0:
|
|
240
|
+
if verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
207
241
|
#
|
|
208
|
-
|
|
209
|
-
|
|
242
|
+
if sum(to_fix)>0:
|
|
243
|
+
if verbose: log.write(" -Filling "+str(sum(to_fix))+" EA and NEA columns using SNPID's CHR:POS:NEA:EA...")
|
|
210
244
|
#
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
245
|
+
if fixeanea_flip == True:
|
|
246
|
+
if verbose: log.write(" -Flipped : CHR:POS:NEA:EA -> CHR:POS:EA:NEA ")
|
|
247
|
+
sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
|
|
248
|
+
sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
|
|
249
|
+
else:
|
|
250
|
+
if verbose: log.write(" -Chr:pos:a1:a2...a1->EA , a2->NEA ")
|
|
251
|
+
sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
|
|
252
|
+
sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
|
|
253
|
+
|
|
219
254
|
# #to_change_status = sumstats[status].str.match(r"\w\w\w[45]\w\w\w")
|
|
220
255
|
# #sumstats.loc[to_fix&to_change_status,status] = vchange_status(sumstats.loc[to_fix&to_change_status,status],4,"2")
|
|
221
256
|
# #sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[1]).astype("string")
|
|
222
257
|
# #sumstats.loc[to_fix,rsid].apply(lambda x:re.split(':|_|-',x)[1]).astype("Int64")
|
|
223
258
|
|
|
224
259
|
############################ fixing id ###################################################
|
|
225
|
-
if fixsep
|
|
260
|
+
if fixsep == True:
|
|
226
261
|
if snpid in sumstats.columns:
|
|
227
262
|
if verbose: log.write(' -Replacing [_-] in SNPID with ":" ...')
|
|
228
263
|
sumstats.loc[:,snpid] = sumstats.loc[:,snpid].str.replace(r"[_-]",":",regex=True)
|
|
264
|
+
|
|
265
|
+
if fixprefix == True:
|
|
266
|
+
if snpid in sumstats.columns:
|
|
267
|
+
if verbose: log.write(' -Removing /^chr/ in SNPID ...')
|
|
268
|
+
prefix_removed = sumstats.loc[:,snpid].str.extract(r'^(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
269
|
+
sumstats.loc[~prefix_removed.isna(),snpid] = prefix_removed[~prefix_removed.isna()]
|
|
229
270
|
|
|
230
|
-
if fixid
|
|
271
|
+
if fixid == True:
|
|
231
272
|
if snpid not in sumstats.columns:
|
|
232
273
|
# initiate a SNPID column
|
|
233
274
|
sumstats.loc[:,snpid]=pd.Series(dtype="string")
|
|
@@ -304,19 +345,21 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
304
345
|
remove multiallelic SNPs based on 4. CHR, POS
|
|
305
346
|
'''
|
|
306
347
|
|
|
348
|
+
if verbose: log.write("Start to remove duplicated/multiallelic variants...{}".format(_get_version()))
|
|
349
|
+
if verbose: log.write(" -Removing mode:{}".format(mode))
|
|
307
350
|
# sort the variants using the specified column before removing
|
|
308
351
|
if keep_col is not None :
|
|
309
352
|
if keep_col in sumstats.columns:
|
|
310
|
-
if verbose: log.write("Start to sort the sumstats using "
|
|
353
|
+
if verbose: log.write("Start to sort the sumstats using {}...".format(keep_col))
|
|
311
354
|
sumstats = sumstats.sort_values(by=keep_col,ascending=keep_ascend)
|
|
312
355
|
else:
|
|
313
356
|
if verbose: log.write("Column" + keep_col +" was not detected... skipping... ")
|
|
314
357
|
total_number = len(sumstats)
|
|
315
358
|
|
|
316
359
|
# remove by duplicated SNPID
|
|
317
|
-
if (snpid in sumstats.columns) and "d" in mode:
|
|
360
|
+
if (snpid in sumstats.columns) and ("d" in mode or "s" in mode):
|
|
318
361
|
if verbose: log.write("Start to remove duplicated variants based on snpid...{}".format(_get_version()))
|
|
319
|
-
|
|
362
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
320
363
|
if verbose: log.write(" -Which variant to keep: ", keep )
|
|
321
364
|
pre_number =len(sumstats)
|
|
322
365
|
if snpid in sumstats.columns:
|
|
@@ -326,18 +369,19 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
326
369
|
if verbose: log.write(" -Removed ",pre_number -after_number ," based on SNPID...")
|
|
327
370
|
|
|
328
371
|
# remove by duplicated rsID
|
|
329
|
-
if (rsid in sumstats.columns) and ("d" in mode):
|
|
372
|
+
if (rsid in sumstats.columns) and ("d" in mode or "r" in mode):
|
|
330
373
|
# keep na and remove duplicated
|
|
331
374
|
pre_number =len(sumstats)
|
|
332
375
|
if verbose: log.write("Start to remove duplicated variants based on rsID...")
|
|
376
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
333
377
|
sumstats = sumstats.loc[sumstats[rsid].isna() | (~sumstats.duplicated(subset=rsid, keep=keep)),:]
|
|
334
378
|
after_number=len(sumstats)
|
|
335
379
|
if verbose: log.write(" -Removed ",pre_number -after_number ," based on rsID...")
|
|
336
380
|
|
|
337
381
|
# remove by duplicated variants by CHR:POS:NEA:EA
|
|
338
|
-
if (chrom in sumstats.columns) and (pos in sumstats.columns) and (nea in sumstats.columns) and (ea in sumstats.columns) and "d" in mode:
|
|
382
|
+
if (chrom in sumstats.columns) and (pos in sumstats.columns) and (nea in sumstats.columns) and (ea in sumstats.columns) and ("d" in mode or "c" in mode):
|
|
339
383
|
if verbose: log.write("Start to remove duplicated variants based on CHR,POS,EA and NEA...")
|
|
340
|
-
|
|
384
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
341
385
|
if verbose: log.write(" -Which variant to keep: ", keep )
|
|
342
386
|
pre_number =len(sumstats)
|
|
343
387
|
if snpid in sumstats.columns:
|
|
@@ -351,6 +395,7 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
351
395
|
# keep na and remove duplicated
|
|
352
396
|
pre_number =len(sumstats)
|
|
353
397
|
if verbose: log.write("Start to remove multiallelic variants based on chr:pos...")
|
|
398
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
354
399
|
if verbose: log.write(" -Which variant to keep: ", keep )
|
|
355
400
|
sumstats = sumstats.loc[(~sumstats.loc[:,[chrom,pos]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos], keep=keep)),:]
|
|
356
401
|
after_number=len(sumstats)
|
|
@@ -360,17 +405,37 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
360
405
|
# resort the coordinates
|
|
361
406
|
if verbose: log.write(" -Removed ",total_number -after_number," variants in total.")
|
|
362
407
|
if keep_col is not None :
|
|
363
|
-
if verbose: log.write(" -Sort the coordinates...")
|
|
408
|
+
if verbose: log.write(" -Sort the coordinates based on CHR and POS...")
|
|
364
409
|
sumstats = sortcoordinate(sumstats,verbose=False)
|
|
365
410
|
|
|
366
|
-
if
|
|
411
|
+
if "n" in mode or remove==True:
|
|
367
412
|
# if remove==True, remove NAs
|
|
368
413
|
if verbose: log.write(" -Removing NAs...")
|
|
369
414
|
pre_number =len(sumstats)
|
|
370
|
-
|
|
415
|
+
specified_columns = []
|
|
416
|
+
if "d" in mode:
|
|
417
|
+
specified_columns.append(rsid)
|
|
418
|
+
specified_columns.append(snpid)
|
|
419
|
+
specified_columns.append(chrom)
|
|
420
|
+
specified_columns.append(pos)
|
|
421
|
+
specified_columns.append(ea)
|
|
422
|
+
specified_columns.append(nea)
|
|
423
|
+
if "r" in mode:
|
|
424
|
+
specified_columns.append(rsid)
|
|
425
|
+
if "s" in mode:
|
|
426
|
+
specified_columns.append(snpid)
|
|
427
|
+
if "m" in mode:
|
|
428
|
+
specified_columns.append(chrom)
|
|
429
|
+
specified_columns.append(pos)
|
|
430
|
+
if "c" in mode:
|
|
431
|
+
specified_columns.append(chrom)
|
|
432
|
+
specified_columns.append(pos)
|
|
433
|
+
specified_columns.append(ea)
|
|
434
|
+
specified_columns.append(nea)
|
|
435
|
+
sumstats = sumstats.loc[~sumstats[specified_columns].isna().any(axis=1),:]
|
|
371
436
|
after_number=len(sumstats)
|
|
372
|
-
if verbose: log.write(" -Removed ",pre_number -after_number," variants with NA values.")
|
|
373
|
-
if verbose: log.write("Finished removing successfully!")
|
|
437
|
+
if verbose: log.write(" -Removed ",pre_number -after_number," variants with NA values in {} .".format(set(specified_columns)))
|
|
438
|
+
if verbose: log.write("Finished removing duplicated/multiallelic variants successfully!")
|
|
374
439
|
return sumstats
|
|
375
440
|
|
|
376
441
|
###############################################################################################################
|
|
@@ -383,7 +448,7 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
|
|
|
383
448
|
if verbose: log.write(".fix_chr: Specified not detected..skipping...")
|
|
384
449
|
return sumstats
|
|
385
450
|
if verbose: log.write("Start to fix chromosome notation...{}".format(_get_version()))
|
|
386
|
-
|
|
451
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
387
452
|
|
|
388
453
|
# convert to string datatype
|
|
389
454
|
try:
|
|
@@ -406,7 +471,8 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
|
|
|
406
471
|
if sum(is_chr_fixed)<len(sumstats):
|
|
407
472
|
|
|
408
473
|
#extract the CHR number or X Y M MT
|
|
409
|
-
chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'(chr)?(
|
|
474
|
+
chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'^(chr)?(\d{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
475
|
+
|
|
410
476
|
is_chr_fixable = ~chr_extracted.isna()
|
|
411
477
|
if verbose: log.write(" -Variants with fixable chromosome notations:",sum(is_chr_fixable))
|
|
412
478
|
|
|
@@ -419,7 +485,10 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
|
|
|
419
485
|
is_chr_invalid = (~is_chr_fixable)&(~is_chr_na)
|
|
420
486
|
if sum(is_chr_invalid)>0 and verbose:
|
|
421
487
|
log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid))
|
|
422
|
-
|
|
488
|
+
try:
|
|
489
|
+
log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()))
|
|
490
|
+
except:
|
|
491
|
+
pass
|
|
423
492
|
elif verbose:
|
|
424
493
|
log.write(" -No unrecognized chromosome notations...")
|
|
425
494
|
|
|
@@ -464,7 +533,15 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
|
|
|
464
533
|
unrecognized_num = sum(~sumstats[chrom].isin(chrom_list))
|
|
465
534
|
if (remove is True) and unrecognized_num>0:
|
|
466
535
|
# remove variants with unrecognized CHR
|
|
467
|
-
|
|
536
|
+
try:
|
|
537
|
+
if verbose: log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])))
|
|
538
|
+
except:
|
|
539
|
+
pass
|
|
540
|
+
if verbose: log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.")
|
|
541
|
+
try:
|
|
542
|
+
log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()))
|
|
543
|
+
except:
|
|
544
|
+
pass
|
|
468
545
|
#sumstats = sumstats.loc[sumstats.index[sumstats[chrom].isin(chrom_list)],:]
|
|
469
546
|
good_chr = sumstats[chrom].isin(chrom_list)
|
|
470
547
|
sumstats = sumstats.loc[good_chr, :].copy()
|
|
@@ -480,45 +557,48 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
|
|
|
480
557
|
sumstats.loc[:,chrom] = np.floor(pd.to_numeric(sumstats.loc[:,chrom], errors='coerce')).astype('Int64')
|
|
481
558
|
|
|
482
559
|
# filter out variants with CHR <=0
|
|
483
|
-
if verbose: log.write(" -Sanity check for CHR...")
|
|
484
|
-
|
|
485
560
|
out_of_range_chr = sumstats[chrom] < minchr
|
|
486
561
|
out_of_range_chr = out_of_range_chr.fillna(False)
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
562
|
+
if sum(out_of_range_chr)>0:
|
|
563
|
+
if verbose: log.write(" -Sanity check for CHR...")
|
|
564
|
+
if verbose:log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr))
|
|
565
|
+
sumstats = sumstats.loc[~out_of_range_chr,:]
|
|
491
566
|
|
|
492
567
|
if verbose: log.write("Finished fixing chromosome notation successfully!")
|
|
568
|
+
|
|
493
569
|
return sumstats
|
|
494
570
|
|
|
495
571
|
###############################################################################################################
|
|
496
572
|
# 20230128
|
|
497
|
-
def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True,limit=250000000, log=Log()):
|
|
573
|
+
def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True, lower_limit=0 , upper_limit=None , limit=250000000, log=Log()):
|
|
574
|
+
if upper_limit is None:
|
|
575
|
+
upper_limit = limit
|
|
498
576
|
if check_col(sumstats,pos,status) is not True:
|
|
499
577
|
if verbose: log.write(".fix_pos: Specified not detected..skipping...")
|
|
500
578
|
return sumstats
|
|
501
579
|
if verbose: log.write("Start to fix basepair positions...{}".format(_get_version()))
|
|
502
|
-
|
|
580
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
503
581
|
|
|
504
582
|
all_var_num = len(sumstats)
|
|
505
583
|
#convert to numeric
|
|
506
584
|
is_pos_na = sumstats.loc[:,pos].isna()
|
|
507
585
|
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
586
|
+
try:
|
|
587
|
+
if str(sumstats[pos].dtype) == "string" or str(sumstats[pos].dtype) == "object":
|
|
588
|
+
sumstats.loc[:,pos] = sumstats.loc[:,pos].astype('string')
|
|
589
|
+
# if so, remove thousands separator
|
|
590
|
+
if verbose: log.write(' -Removing thousands separator "," or underbar "_" ...')
|
|
591
|
+
sumstats.loc[~is_pos_na, pos] = sumstats.loc[~is_pos_na, pos].str.replace(r'[,_]', '' ,regex=True)
|
|
592
|
+
except:
|
|
593
|
+
pass
|
|
513
594
|
|
|
514
595
|
# convert POS to integer
|
|
515
596
|
try:
|
|
516
597
|
if verbose: log.write(' -Converting to Int64 data type ...')
|
|
517
|
-
sumstats
|
|
598
|
+
sumstats[pos] = sumstats[pos].astype('Int64')
|
|
518
599
|
except:
|
|
519
600
|
if verbose: log.write(' -Force converting to Int64 data type ...')
|
|
520
|
-
sumstats
|
|
521
|
-
|
|
601
|
+
sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
|
|
522
602
|
is_pos_fixed = ~sumstats.loc[:,pos].isna()
|
|
523
603
|
is_pos_invalid = (~is_pos_na)&(~is_pos_fixed)
|
|
524
604
|
|
|
@@ -526,11 +606,11 @@ def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True,limit=2
|
|
|
526
606
|
sumstats.loc[is_pos_invalid,status] = vchange_status(sumstats.loc[is_pos_invalid,status],4,"975","842")
|
|
527
607
|
|
|
528
608
|
# remove outlier, limit:250,000,000
|
|
529
|
-
if verbose: log.write(" -Position
|
|
530
|
-
|
|
531
|
-
|
|
609
|
+
if verbose: log.write(" -Position bound:({} , {:,})".format(lower_limit, upper_limit))
|
|
610
|
+
is_pos_na = sumstats.loc[:,pos].isna()
|
|
611
|
+
out_lier= ((sumstats[pos]<=lower_limit) | (sumstats[pos]>=upper_limit)) & (~is_pos_na)
|
|
612
|
+
if verbose: log.write(" -Removed outliers:",sum(out_lier))
|
|
532
613
|
sumstats = sumstats.loc[~out_lier,:]
|
|
533
|
-
|
|
534
614
|
#remove na
|
|
535
615
|
if remove is True:
|
|
536
616
|
sumstats = sumstats.loc[~sumstats[pos].isna(),:]
|
|
@@ -539,6 +619,7 @@ def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True,limit=2
|
|
|
539
619
|
|
|
540
620
|
if verbose: log.write(" -Converted all position to datatype Int64.")
|
|
541
621
|
if verbose: log.write("Finished fixing basepair position successfully!")
|
|
622
|
+
|
|
542
623
|
return sumstats
|
|
543
624
|
|
|
544
625
|
###############################################################################################################
|
|
@@ -549,11 +630,26 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
|
|
|
549
630
|
if verbose: log.write("EA and NEA not detected..skipping...")
|
|
550
631
|
return sumstats
|
|
551
632
|
if verbose: log.write("Start to fix alleles...{}".format(_get_version()))
|
|
552
|
-
|
|
633
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
553
634
|
|
|
554
635
|
#if (ea not in sumstats.columns) or (nea not in sumstats.columns):
|
|
555
636
|
if verbose: log.write(" -Converted all bases to string datatype and UPPERCASE.")
|
|
637
|
+
|
|
638
|
+
#try:
|
|
639
|
+
# ea_missing = sum(sumstats[ea].isna())
|
|
640
|
+
# nea_missing = sum(sumstats[nea].isna())
|
|
641
|
+
# if sum(ea_missing)>0:
|
|
642
|
+
# if verbose: log.write(" -Converting {} missing EA to letter N.".format(ea_missing))
|
|
643
|
+
# sumstats.loc[:,ea] = sumstats.loc[:,ea].add_categories("N").fillna("N")
|
|
644
|
+
# if sum(sumstats[nea].isna())>0:
|
|
645
|
+
# if verbose: log.write(" -Converting {} missing NEA to letter N.".format(nea_missing))
|
|
646
|
+
# sumstats.loc[:,nea] = sumstats.loc[:,nea].add_categories("N").fillna("N")
|
|
647
|
+
#except:
|
|
648
|
+
# pass
|
|
649
|
+
|
|
556
650
|
categories = set(sumstats.loc[:,ea].str.upper())|set(sumstats.loc[:,nea].str.upper())|set("N")
|
|
651
|
+
categories = {x for x in categories if pd.notna(x)}
|
|
652
|
+
|
|
557
653
|
sumstats.loc[:,ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
|
|
558
654
|
sumstats.loc[:,nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
|
|
559
655
|
all_var_num = len(sumstats)
|
|
@@ -620,6 +716,7 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
|
|
|
620
716
|
sumstats.loc[is_eanea_fixed&is_normalized,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_normalized, status], 5,"4","3")
|
|
621
717
|
gc.collect()
|
|
622
718
|
if verbose: log.write("Finished fixing allele successfully!")
|
|
719
|
+
|
|
623
720
|
return sumstats
|
|
624
721
|
|
|
625
722
|
###############################################################################################################
|
|
@@ -627,11 +724,11 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
|
|
|
627
724
|
|
|
628
725
|
def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
|
|
629
726
|
if check_col(sumstats,pos,ea,nea,status) is not True:
|
|
630
|
-
if verbose: log.write("WARNING
|
|
727
|
+
if verbose: log.write("WARNING! .normalize(): specified columns not detected..skipping...")
|
|
631
728
|
return sumstats
|
|
632
729
|
|
|
633
730
|
if verbose: log.write("Start to normalize variants...{}".format(_get_version()))
|
|
634
|
-
|
|
731
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
635
732
|
#variants_to_check = status_match(sumstats[status],5,[4,5]) #
|
|
636
733
|
#r'\w\w\w\w[45]\w\w'
|
|
637
734
|
variants_to_check = sumstats[status].str[4].str.match(r'4|5', case=False, flags=0, na=False)
|
|
@@ -689,7 +786,8 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
|
|
|
689
786
|
|
|
690
787
|
def normalizeallele(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
|
|
691
788
|
#single df
|
|
692
|
-
normalized = sumstats.apply(lambda x: normalizevariant(x[0],x[1],x[2],x[3]),axis=1)
|
|
789
|
+
#normalized = sumstats.apply(lambda x: normalizevariant(x[0],x[1],x[2],x[3]),axis=1)
|
|
790
|
+
normalized = sumstats.apply(lambda x: normalizevariant(x[pos],x[nea],x[ea],x[status]),axis=1)
|
|
693
791
|
sumstats = pd.DataFrame(normalized.to_list(), columns=[pos,nea,ea,status],index=sumstats.index)
|
|
694
792
|
return sumstats
|
|
695
793
|
|
|
@@ -811,7 +909,7 @@ def sanitycheckstats(sumstats,
|
|
|
811
909
|
if coltocheck is None:
|
|
812
910
|
coltocheck = ["P","MLOG10P","INFO","Z","BETA","SE","EAF","CHISQ","F","N","N_CASE","N_CONTROL","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","STATUS"]
|
|
813
911
|
if verbose: log.write("Start sanity check for statistics...{}".format(_get_version()))
|
|
814
|
-
|
|
912
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
815
913
|
cols_to_check=[]
|
|
816
914
|
oringinal_number=len(sumstats)
|
|
817
915
|
sumstats = sumstats.copy()
|
|
@@ -822,7 +920,7 @@ def sanitycheckstats(sumstats,
|
|
|
822
920
|
if "N" in coltocheck and "N" in sumstats.columns:
|
|
823
921
|
cols_to_check.append("N")
|
|
824
922
|
if verbose: log.write(" -Checking if ",n[0],"<=N<=",n[1]," ...")
|
|
825
|
-
sumstats.loc[:,"N"] = np.floor(pd.to_numeric(sumstats.loc[:,"N"], errors='coerce')).astype("
|
|
923
|
+
sumstats.loc[:,"N"] = np.floor(pd.to_numeric(sumstats.loc[:,"N"], errors='coerce')).astype("Int64")
|
|
826
924
|
sumstats = sumstats.loc[(sumstats["N"]>=n[0]) & (sumstats["N"]<=n[1]),:]
|
|
827
925
|
after_number=len(sumstats)
|
|
828
926
|
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N.")
|
|
@@ -830,7 +928,7 @@ def sanitycheckstats(sumstats,
|
|
|
830
928
|
if "N_CASE" in coltocheck and "N_CASE" in sumstats.columns:
|
|
831
929
|
cols_to_check.append("N_CASE")
|
|
832
930
|
if verbose: log.write(" -Checking if ",ncase[0],"<=N_CASE<=",ncase[1]," ...")
|
|
833
|
-
sumstats.loc[:,"N_CASE"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CASE"], errors='coerce')).astype("
|
|
931
|
+
sumstats.loc[:,"N_CASE"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CASE"], errors='coerce')).astype("Int64")
|
|
834
932
|
sumstats = sumstats.loc[(sumstats["N_CASE"]>=ncase[0]) & (sumstats["N_CASE"]<=ncase[1]),:]
|
|
835
933
|
after_number=len(sumstats)
|
|
836
934
|
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N_CASE.")
|
|
@@ -838,17 +936,11 @@ def sanitycheckstats(sumstats,
|
|
|
838
936
|
if "N_CONTROL" in coltocheck and "N_CONTROL" in sumstats.columns:
|
|
839
937
|
cols_to_check.append("N_CONTROL")
|
|
840
938
|
if verbose: log.write(" -Checking if ",ncontrol[0],"<=N_CONTROL<=",ncontrol[1]," ...")
|
|
841
|
-
sumstats.loc[:,"N_CONTROL"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CONTROL"], errors='coerce')).astype("
|
|
939
|
+
sumstats.loc[:,"N_CONTROL"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CONTROL"], errors='coerce')).astype("Int64")
|
|
842
940
|
sumstats = sumstats.loc[(sumstats["N_CONTROL"]>=ncontrol[0]) & (sumstats["N_CONTROL"]<=ncontrol[1]),:]
|
|
843
941
|
after_number=len(sumstats)
|
|
844
942
|
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N_CONTROL.")
|
|
845
|
-
|
|
846
|
-
if "N" in coltocheck and "N" in sumstats.columns and "N_CONTROL" in coltocheck and "N_CONTROL" in sumstats.columns and "N_CASE" in coltocheck and "N_CASE" in sumstats.columns:
|
|
847
|
-
if verbose: log.write(" -Checking if N = N_CASE + N_CONTROL ...")
|
|
848
|
-
matched_n = sumstats.loc[:,"N"] == sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"]
|
|
849
|
-
sumstats = sumstats.loc[matched_n,:]
|
|
850
|
-
after_number=len(sumstats)
|
|
851
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with N != N_CASE + N_CONTROL.")
|
|
943
|
+
|
|
852
944
|
|
|
853
945
|
###ALLELE FREQUENCY################################################################################################################################################
|
|
854
946
|
pre_number=len(sumstats)
|
|
@@ -908,6 +1000,11 @@ def sanitycheckstats(sumstats,
|
|
|
908
1000
|
if verbose: log.write(" -Checking if ",p[0],"< P <",p[1]," ...")
|
|
909
1001
|
sumstats.loc[:,"P"] = pd.to_numeric(sumstats.loc[:,"P"], errors='coerce').astype("float64")
|
|
910
1002
|
sumstats = sumstats.loc[(sumstats["P"]>p[0]) & (sumstats["P"]<p[1]),:]
|
|
1003
|
+
|
|
1004
|
+
is_low_p = sumstats["P"] == 0
|
|
1005
|
+
if sum(is_low_p) >0:
|
|
1006
|
+
log.write(" -WARNING! Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(sum(is_low_p)), verbose=verbose)
|
|
1007
|
+
log.write(" -WARNING! Please consider using MLOG10P instead.", verbose=verbose)
|
|
911
1008
|
after_number=len(sumstats)
|
|
912
1009
|
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad P.")
|
|
913
1010
|
|
|
@@ -1008,11 +1105,11 @@ def sanitycheckstats(sumstats,
|
|
|
1008
1105
|
if verbose: log.write(" -Checking STATUS and converting STATUS to categories....")
|
|
1009
1106
|
categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
1010
1107
|
sumstats.loc[:,"STATUS"] = pd.Categorical(sumstats["STATUS"],categories=categories)
|
|
1011
|
-
|
|
1012
|
-
pre_number=len(sumstats)
|
|
1013
|
-
sumstats = sumstats.dropna(subset=cols_to_check)
|
|
1108
|
+
|
|
1109
|
+
#pre_number=len(sumstats)
|
|
1110
|
+
#sumstats = sumstats.dropna(subset=cols_to_check)
|
|
1014
1111
|
after_number=len(sumstats)
|
|
1015
|
-
if verbose:log.write(" -Removed {} variants with NAs in the checked columns...".format(pre_number - after_number))
|
|
1112
|
+
#if verbose:log.write(" -Removed {} variants with NAs in the checked columns...".format(pre_number - after_number))
|
|
1016
1113
|
|
|
1017
1114
|
if verbose: log.write(" -Removed "+str(oringinal_number - after_number)+" variants with bad statistics in total.")
|
|
1018
1115
|
if verbose:
|
|
@@ -1021,6 +1118,67 @@ def sanitycheckstats(sumstats,
|
|
|
1021
1118
|
if verbose: log.write("Finished sanity check successfully!")
|
|
1022
1119
|
return sumstats
|
|
1023
1120
|
|
|
1121
|
+
### check consistency #############################################################################################################################################
|
|
1122
|
+
|
|
1123
|
+
def _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose):
    """Log the outcome of one cross-column consistency check.

    ``is_close`` is the per-variant boolean result of the comparison and
    ``diff`` the per-variant difference (a pandas Series aligned with
    ``sumstats``). Nothing is returned; the result is only written to ``log``.
    """
    # Vectorized count: the builtin sum() would iterate element by element in
    # Python over what is typically a very long per-variant array.
    n_inconsistent = int((~is_close).sum())
    if n_inconsistent > 0:
        # Rank by magnitude so that large negative deviations are reported too.
        abs_diff = diff.abs()
        log.write(" -Not consistent: {} variant(s)".format(n_inconsistent), verbose=verbose)
        log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[abs_diff.idxmax(),id_to_use], abs_diff.max()), verbose=verbose)
    else:
        log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)


def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verbose=True,log=Log()):
    """Check whether related statistics columns agree with each other.

    The following checks are run, each only when all the columns it needs are
    present in ``sumstats``:

    * MLOG10P derived from BETA/SE versus the MLOG10P column
    * P derived from BETA/SE versus the P column
    * P derived from MLOG10P versus the P column
    * N versus N_CASE + N_CONTROL (exact equality)

    The first three comparisons use ``numpy.isclose`` with the given
    tolerances. Results are only written to ``log``; ``sumstats`` is not
    modified and nothing is returned.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Summary statistics to inspect.
    rtol, atol : float
        Relative and absolute tolerance passed to ``numpy.isclose``.
    equal_nan : bool
        Passed to ``numpy.isclose``; whether two NaNs count as equal.
    verbose : bool
        Whether messages are written to the log.
    log : Log
        Logger used for all messages.
    """
    if verbose: log.write("Start to check data consistency across columns...{}".format(_get_version()))
    check_dataframe_shape(sumstats, log, verbose)
    log.write(" -Tolerance: {} (Relative) and {} (Absolute)".format(rtol, atol),verbose=verbose)

    # Column used to name the variant with the largest difference in the log.
    if "SNPID" not in sumstats.columns:
        id_to_use = "rsID"
    else:
        id_to_use = "SNPID"

    if "BETA" in sumstats.columns and "SE" in sumstats.columns:
        if "MLOG10P" in sumstats.columns:
            log.write(" -Checking if BETA/SE-derived-MLOG10P is consistent with MLOG10P...",verbose=verbose)
            betase_derived_mlog10p = _convert_betase_to_mlog10p(sumstats["BETA"], sumstats["SE"])
            is_close = np.isclose(betase_derived_mlog10p, sumstats["MLOG10P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
            diff = betase_derived_mlog10p - sumstats["MLOG10P"]
            _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose)

        if "P" in sumstats.columns:
            log.write(" -Checking if BETA/SE-derived-P is consistent with P...",verbose=verbose)
            betase_derived_p = _convert_betase_to_p(sumstats["BETA"], sumstats["SE"])
            is_close = np.isclose(betase_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
            diff = betase_derived_p - sumstats["P"]
            _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose)

    if "MLOG10P" in sumstats.columns and "P" in sumstats.columns:
        log.write(" -Checking if MLOG10P-derived-P is consistent with P...",verbose=verbose)
        mlog10p_derived_p = _convert_mlog10p_to_p(sumstats["MLOG10P"])
        is_close = np.isclose(mlog10p_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
        diff = mlog10p_derived_p - sumstats["P"]
        _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose)

    if "N" in sumstats.columns and "N_CONTROL" in sumstats.columns and "N_CASE" in sumstats.columns:
        if verbose: log.write(" -Checking if N is consistent with N_CASE + N_CONTROL ...")
        # Sample sizes are compared exactly rather than with a tolerance.
        n_expected = sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"]
        is_close = sumstats.loc[:,"N"] == n_expected
        diff = sumstats.loc[:,"N"] - n_expected
        _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose)

    # NOTE(review): emitted once at the end of all checks; confirm this matches
    # the intended placement relative to the N check.
    log.write(" -Note: if the max difference is greater than expected, please check your original sumstats.",verbose=verbose)

    if verbose: log.write("Finished checking data consistency across columns.")
|
|
1024
1182
|
###############################################################################################################
|
|
1025
1183
|
# 20220426
|
|
1026
1184
|
def get_reverse_complementary_allele(a):
|
|
@@ -1046,7 +1204,7 @@ def flip_direction(string):
|
|
|
1046
1204
|
|
|
1047
1205
|
def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
1048
1206
|
|
|
1049
|
-
|
|
1207
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
1050
1208
|
|
|
1051
1209
|
###################get reverse complementary####################
|
|
1052
1210
|
pattern = r"\w\w\w\w\w[45]\w"
|
|
@@ -1245,10 +1403,10 @@ def liftover_variant(sumstats,
|
|
|
1245
1403
|
|
|
1246
1404
|
def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True, verbose=True,log=Log()):
|
|
1247
1405
|
if check_col(sumstats,chrom,pos,status) is not True:
|
|
1248
|
-
if verbose: log.write("WARNING
|
|
1406
|
+
if verbose: log.write("WARNING! .liftover(): specified columns not detected..skipping...")
|
|
1249
1407
|
return sumstats
|
|
1250
1408
|
if verbose: log.write("Start to perform liftover...{}".format(_get_version()))
|
|
1251
|
-
|
|
1409
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
1252
1410
|
if verbose: log.write(" -CPU Cores to use :",n_cores)
|
|
1253
1411
|
if verbose: log.write(" -Performing liftover ...")
|
|
1254
1412
|
if verbose: log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build)
|
|
@@ -1292,7 +1450,7 @@ def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=
|
|
|
1292
1450
|
return sumstats
|
|
1293
1451
|
|
|
1294
1452
|
if verbose: log.write("Start to sort the genome coordinates...{}".format(_get_version()))
|
|
1295
|
-
|
|
1453
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
1296
1454
|
|
|
1297
1455
|
try:
|
|
1298
1456
|
if sumstats[pos].dtype == "Int64":
|
|
@@ -1311,11 +1469,11 @@ def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=
|
|
|
1311
1469
|
###############################################################################################################
|
|
1312
1470
|
# 20230430 added HR HR_95 BETA_95 N_CASE N_CONTROL
|
|
1313
1471
|
def sortcolumn(sumstats,verbose=True,log=Log(),order = [
|
|
1314
|
-
"SNPID","rsID", "CHR", "POS", "EA", "NEA", "EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z",
|
|
1472
|
+
"SNPID","rsID", "CHR", "POS", "EA", "NEA", "EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z","T","F",
|
|
1315
1473
|
"CHISQ", "P", "MLOG10P", "OR", "OR_95L", "OR_95U","HR", "HR_95L", "HR_95U","INFO", "N","N_CASE","N_CONTROL","DIRECTION","I2","P_HET","DOF","SNPR2","STATUS"
|
|
1316
1474
|
]):
|
|
1317
1475
|
if verbose: log.write("Start to reorder the columns...{}".format(_get_version()))
|
|
1318
|
-
|
|
1476
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
1319
1477
|
|
|
1320
1478
|
output_columns = []
|
|
1321
1479
|
for i in order:
|
|
@@ -1347,4 +1505,5 @@ def check_col(df,*args):
|
|
|
1347
1505
|
if len(not_in_df)>0:
|
|
1348
1506
|
return False
|
|
1349
1507
|
print(" -Specified columns names was not detected. Please check:"+",".join(not_in_df))
|
|
1350
|
-
return True
|
|
1508
|
+
return True
|
|
1509
|
+
|