gwaslab 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/__init__.py +1 -1
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +8 -0
- gwaslab/g_Sumstats.py +80 -178
- gwaslab/g_SumstatsPair.py +6 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_meta.py +13 -3
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +29 -15
- gwaslab/hm_harmonize_sumstats.py +312 -159
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +46 -37
- gwaslab/io_to_formats.py +428 -295
- gwaslab/qc_check_datatype.py +15 -1
- gwaslab/qc_fix_sumstats.py +956 -719
- gwaslab/util_ex_calculate_ldmatrix.py +29 -11
- gwaslab/util_ex_gwascatalog.py +1 -1
- gwaslab/util_ex_ldproxyfinder.py +1 -1
- gwaslab/util_ex_process_h5.py +26 -17
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_convert_h2.py +1 -1
- gwaslab/util_in_fill_data.py +44 -5
- gwaslab/util_in_filter_value.py +122 -34
- gwaslab/util_in_get_density.py +2 -2
- gwaslab/util_in_get_sig.py +41 -9
- gwaslab/viz_aux_quickfix.py +26 -21
- gwaslab/viz_aux_reposition_text.py +7 -4
- gwaslab/viz_aux_save_figure.py +6 -5
- gwaslab/viz_plot_compare_af.py +5 -5
- gwaslab/viz_plot_compare_effect.py +22 -5
- gwaslab/viz_plot_miamiplot2.py +28 -20
- gwaslab/viz_plot_mqqplot.py +214 -98
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +16 -9
- gwaslab/viz_plot_trumpetplot.py +15 -6
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/METADATA +3 -3
- gwaslab-3.4.38.dist-info/RECORD +72 -0
- gwaslab-3.4.36.dist-info/RECORD +0 -72
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
gwaslab/qc_fix_sumstats.py
CHANGED
|
@@ -14,7 +14,12 @@ from gwaslab.bd_common_data import get_chr_to_number
|
|
|
14
14
|
from gwaslab.bd_common_data import get_number_to_chr
|
|
15
15
|
from gwaslab.bd_common_data import get_chr_list
|
|
16
16
|
from gwaslab.qc_check_datatype import check_datatype
|
|
17
|
+
from gwaslab.qc_check_datatype import check_dataframe_shape
|
|
17
18
|
from gwaslab.g_version import _get_version
|
|
19
|
+
from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
|
|
20
|
+
from gwaslab.util_in_fill_data import _convert_betase_to_p
|
|
21
|
+
from gwaslab.util_in_fill_data import _convert_mlog10p_to_p
|
|
22
|
+
#process build
|
|
18
23
|
#setbuild
|
|
19
24
|
#fixID
|
|
20
25
|
#rsidtochrpos
|
|
@@ -26,6 +31,7 @@ from gwaslab.g_version import _get_version
|
|
|
26
31
|
#normalizevariant
|
|
27
32
|
#checkref
|
|
28
33
|
#sanitycheckstats
|
|
34
|
+
#_check_data_consistency
|
|
29
35
|
#flipallelestats
|
|
30
36
|
#parallelizeassignrsid
|
|
31
37
|
#sortcoordinate
|
|
@@ -41,18 +47,18 @@ def _process_build(build,log,verbose):
|
|
|
41
47
|
log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
|
|
42
48
|
final_build = "38"
|
|
43
49
|
else:
|
|
44
|
-
log.
|
|
50
|
+
log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
|
|
45
51
|
final_build = "99"
|
|
46
52
|
return final_build
|
|
47
53
|
|
|
48
54
|
def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
|
|
49
55
|
build = _process_build(build,log=log,verbose=verbose)
|
|
50
|
-
sumstats
|
|
51
|
-
sumstats
|
|
52
|
-
return sumstats
|
|
56
|
+
sumstats[status] = vchange_status(sumstats[status], 1, "139",build[0]*3)
|
|
57
|
+
sumstats[status] = vchange_status(sumstats[status], 2, "89",build[1]*3)
|
|
58
|
+
return sumstats, build
|
|
53
59
|
|
|
54
60
|
def fixID(sumstats,
|
|
55
|
-
snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",
|
|
61
|
+
snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",fixprefix=False,
|
|
56
62
|
fixchrpos=False,fixid=False,fixeanea=False,fixeanea_flip=False,fixsep=False,
|
|
57
63
|
overwrite=False,verbose=True,forcefixid=False,log=Log()):
|
|
58
64
|
'''
|
|
@@ -60,38 +66,79 @@ def fixID(sumstats,
|
|
|
60
66
|
2. fix chr and pos using snpid
|
|
61
67
|
3. checking rsid and chr:pos:nea:ea
|
|
62
68
|
'''
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
69
|
+
##start function with col checking##########################################################
|
|
70
|
+
_start_line = "check SNPID/rsID"
|
|
71
|
+
_end_line = "checking SNPID/rsID"
|
|
72
|
+
_start_cols =[]
|
|
73
|
+
_start_function = ".fix_id()"
|
|
74
|
+
_must_args ={}
|
|
75
|
+
|
|
76
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
77
|
+
log=log,
|
|
78
|
+
verbose=verbose,
|
|
79
|
+
start_line=_start_line,
|
|
80
|
+
end_line=_end_line,
|
|
81
|
+
start_cols=_start_cols,
|
|
82
|
+
start_function=_start_function,
|
|
83
|
+
**_must_args)
|
|
84
|
+
if is_enough_info == False: return sumstats
|
|
85
|
+
############################################################################################
|
|
86
|
+
|
|
87
|
+
############################ checking datatype ###################################################
|
|
88
|
+
if rsid in sumstats.columns:
|
|
89
|
+
# convert to string datatype
|
|
90
|
+
try:
|
|
91
|
+
log.write(" -Checking rsID data type...",verbose=verbose)
|
|
92
|
+
if sumstats[rsid].dtype == "string":
|
|
93
|
+
pass
|
|
94
|
+
else:
|
|
95
|
+
log.write(" -Converting rsID to pd.string data type...",verbose=verbose)
|
|
96
|
+
sumstats[rsid] = sumstats[rsid].astype("string")
|
|
97
|
+
except:
|
|
98
|
+
log.write(" -Force converting rsID to pd.string data type...",verbose=verbose)
|
|
99
|
+
sumstats[rsid] = sumstats[rsid].astype("string")
|
|
100
|
+
if snpid in sumstats.columns:
|
|
101
|
+
# convert to string datatype
|
|
102
|
+
try:
|
|
103
|
+
log.write(" -Checking SNPID data type...",verbose=verbose)
|
|
104
|
+
if sumstats[snpid].dtype == "string":
|
|
105
|
+
pass
|
|
106
|
+
else:
|
|
107
|
+
log.write(" -Converting SNPID to pd.string data type...",verbose=verbose)
|
|
108
|
+
sumstats[snpid] = sumstats[snpid].astype("string")
|
|
109
|
+
except:
|
|
110
|
+
log.write(" -Force converting SNPID to pd.string data type...",verbose=verbose)
|
|
111
|
+
sumstats[snpid] = sumstats[snpid].astype("string")
|
|
112
|
+
|
|
68
113
|
############################ checking ###################################################
|
|
69
114
|
if snpid in sumstats.columns:
|
|
70
|
-
|
|
71
|
-
#
|
|
115
|
+
log.write(" -Checking if SNPID is CHR:POS:NEA:EA...(separator: - ,: , _)",verbose=verbose)
|
|
116
|
+
# check if SNPID is CHR:POS:EA:NEA
|
|
72
117
|
is_chrposrefalt = sumstats[snpid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
|
|
118
|
+
# check if SNPID is NA
|
|
73
119
|
is_snpid_na = sumstats[snpid].isna()
|
|
120
|
+
|
|
121
|
+
# change STATUS code
|
|
74
122
|
sumstats.loc[ is_chrposrefalt,status] = vchange_status(sumstats.loc[ is_chrposrefalt,status],3 ,"975" ,"630")
|
|
75
123
|
sumstats.loc[(~is_chrposrefalt)&(~is_snpid_na),status] = vchange_status(sumstats.loc[(~is_chrposrefalt)&(~is_snpid_na),status],3 ,"975" ,"842")
|
|
76
124
|
|
|
77
125
|
if rsid in sumstats.columns:
|
|
78
|
-
|
|
79
|
-
is_rsid = sumstats[rsid].str.
|
|
126
|
+
log.write(" -Checking if rsID is rsxxxxxx...", verbose=verbose)
|
|
127
|
+
is_rsid = sumstats[rsid].str.match(r'^rs\d+$', case=False, flags=0, na=False)
|
|
80
128
|
|
|
81
129
|
sumstats.loc[ is_rsid,status] = vchange_status(sumstats.loc[ is_rsid,status], 3, "986","520")
|
|
82
130
|
sumstats.loc[~is_rsid,status] = vchange_status(sumstats.loc[~is_rsid,status], 3, "986","743")
|
|
83
131
|
|
|
84
|
-
if verbose: log.write(" -Checking if
|
|
85
|
-
is_rs_chrpos = sumstats[rsid].str.match(r'^\w+[:_-]\
|
|
86
|
-
#is_rs_chrpos = sumstats[rsid].str.match(r'(chr)?([0-9XYMT]+)[:_-]([0-9]+)[:_-]([ATCG]+)[:_-]([ATCG]+)', case=False, flags=0, na=False)
|
|
132
|
+
if verbose: log.write(" -Checking if CHR:POS:NEA:EA is mixed in rsID column ...")
|
|
133
|
+
is_rs_chrpos = sumstats[rsid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
|
|
87
134
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
135
|
+
log.write(" -Number of CHR:POS:NEA:EA mixed in rsID column :",sum(is_rs_chrpos), verbose=verbose)
|
|
136
|
+
log.write(" -Number of Unrecognized rsID :",len(sumstats) - sum(is_rs_chrpos) - sum(is_rsid) , verbose=verbose)
|
|
137
|
+
log.write(" -A look at the unrecognized rsID :",set(sumstats.loc[(~is_rsid)&(~is_rs_chrpos),rsid].head()),"...", verbose=verbose)
|
|
91
138
|
|
|
92
139
|
############################ fixing chr pos###################################################
|
|
93
|
-
if fixchrpos
|
|
94
|
-
# from snpid or rsid, extract
|
|
140
|
+
if fixchrpos == True:
|
|
141
|
+
# from snpid or rsid, extract CHR:POS to fix CHR and POS
|
|
95
142
|
if snpid in sumstats.columns:
|
|
96
143
|
if verbose: log.write(" -Fixing CHR and POS...")
|
|
97
144
|
if overwrite is True:
|
|
@@ -99,8 +146,8 @@ def fixID(sumstats,
|
|
|
99
146
|
# fix all
|
|
100
147
|
to_fix = is_chrposrefalt
|
|
101
148
|
|
|
102
|
-
#fix variants with chr and pos being empty
|
|
103
149
|
elif (chrom in sumstats.columns) and (pos in sumstats.columns) :
|
|
150
|
+
#fix variants with chr and pos being NA
|
|
104
151
|
to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
105
152
|
to_fix_num = sum(to_fix)
|
|
106
153
|
if to_fix_num and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
@@ -108,7 +155,7 @@ def fixID(sumstats,
|
|
|
108
155
|
|
|
109
156
|
elif (chrom not in sumstats.columns) and (pos in sumstats.columns):
|
|
110
157
|
if verbose: log.write(" -Initiating CHR columns...")
|
|
111
|
-
sumstats
|
|
158
|
+
sumstats[chrom]=pd.Series(dtype="string")
|
|
112
159
|
to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
113
160
|
to_fix_num = sum(to_fix)
|
|
114
161
|
if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
@@ -116,15 +163,16 @@ def fixID(sumstats,
|
|
|
116
163
|
|
|
117
164
|
elif (chrom in sumstats.columns) and (pos not in sumstats.columns):
|
|
118
165
|
if verbose: log.write(" -Initiating CHR and POS column...")
|
|
119
|
-
sumstats
|
|
166
|
+
sumstats[pos]=pd.Series(dtype="Int64")
|
|
120
167
|
to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
121
168
|
to_fix_num = sum(to_fix)
|
|
122
169
|
if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
123
170
|
elif verbose: log.write(" -No fixable variants. ...")
|
|
171
|
+
|
|
124
172
|
else:
|
|
125
173
|
if verbose: log.write(" -Initiating CHR and POS columns...")
|
|
126
|
-
sumstats
|
|
127
|
-
sumstats
|
|
174
|
+
sumstats[chrom]=pd.Series(dtype="string")
|
|
175
|
+
sumstats[pos]=pd.Series(dtype="Int64")
|
|
128
176
|
to_fix = is_chrposrefalt
|
|
129
177
|
to_fix_num = sum(to_fix)
|
|
130
178
|
if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
@@ -134,8 +182,8 @@ def fixID(sumstats,
|
|
|
134
182
|
if verbose: log.write(" -Filling CHR and POS columns using valid SNPID's chr:pos...")
|
|
135
183
|
# format and qc filled chr and pos
|
|
136
184
|
|
|
137
|
-
sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.
|
|
138
|
-
sumstats.loc[to_fix,pos] = sumstats.loc[to_fix,snpid].str.
|
|
185
|
+
sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
186
|
+
sumstats.loc[to_fix,pos] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[2]
|
|
139
187
|
|
|
140
188
|
#sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.split(':|_|-').str[0].str.strip("chrCHR").astype("string")
|
|
141
189
|
#sumstats.loc[to_fix,pos] =np.floor(pd.to_numeric(sumstats.loc[to_fix,snpid].str.split(':|_|-').str[1], errors='coerce')).astype('Int64')
|
|
@@ -153,20 +201,20 @@ def fixID(sumstats,
|
|
|
153
201
|
elif verbose: log.write(" -No fixable variants ...")
|
|
154
202
|
elif (chrom not in sumstats.columns) and (pos in sumstats.columns):
|
|
155
203
|
if verbose: log.write(" -Initiating CHR columns...")
|
|
156
|
-
sumstats
|
|
204
|
+
sumstats[chrom]=pd.Series(dtype="string")
|
|
157
205
|
to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
158
206
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
159
207
|
elif verbose: log.write(" -No fixable variants ...")
|
|
160
208
|
elif (chrom in sumstats.columns) and (pos not in sumstats.columns):
|
|
161
209
|
if verbose: log.write(" -Initiating CHR and POS column...")
|
|
162
|
-
sumstats
|
|
210
|
+
sumstats[pos]=pd.Series(dtype="Int64")
|
|
163
211
|
to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
164
212
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
165
213
|
elif verbose: log.write(" -No fixable variants ...")
|
|
166
214
|
else:
|
|
167
215
|
if verbose: log.write(" -Initiating CHR and POS columns...")
|
|
168
|
-
sumstats
|
|
169
|
-
sumstats
|
|
216
|
+
sumstats[chrom]=pd.Series(dtype="string")
|
|
217
|
+
sumstats[pos]=pd.Series(dtype="Int64")
|
|
170
218
|
to_fix = is_rs_chrpos
|
|
171
219
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
172
220
|
elif verbose: log.write(" -No fixable variants ...")
|
|
@@ -179,61 +227,68 @@ def fixID(sumstats,
|
|
|
179
227
|
#sumstats.loc[to_fix,status] = vchange_status(sumstats.loc[to_fix,status], 4, "98765432","00000000").astype("string")
|
|
180
228
|
|
|
181
229
|
############################ fixing chr pos###################################################
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
230
|
+
if fixeanea == True:
|
|
231
|
+
if verbose: log.warning("gwaslab assumes SNPID is in the format of CHR:POS:NEA:EA / CHR:POS:REF:ALT")
|
|
232
|
+
if overwrite is True:
|
|
233
|
+
if verbose: log.write(" -Overwrite mode is applied...")
|
|
234
|
+
to_fix = is_chrposrefalt
|
|
235
|
+
elif (nea in sumstats.columns) and (nea in sumstats.columns):
|
|
236
|
+
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
237
|
+
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
238
|
+
elif (nea in sumstats.columns) and (ea not in sumstats.columns):
|
|
239
|
+
if verbose: log.write(" -Initiating EA columns...")
|
|
240
|
+
sumstats[ea]=pd.Series(dtype="string")
|
|
241
|
+
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
242
|
+
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
243
|
+
elif (nea not in sumstats.columns) and (ea in sumstats.columns):
|
|
244
|
+
if verbose: log.write(" -Initiating NEA columns...")
|
|
245
|
+
sumstats[nea]=pd.Series(dtype="string")
|
|
246
|
+
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
247
|
+
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
248
|
+
else:
|
|
249
|
+
if verbose: log.write(" -Initiating EA and NEA columns...")
|
|
250
|
+
sumstats[nea]=pd.Series(dtype="string")
|
|
251
|
+
sumstats[ea]=pd.Series(dtype="string")
|
|
252
|
+
to_fix = is_chrposrefalt
|
|
253
|
+
if sum(to_fix)>0:
|
|
254
|
+
if verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
207
255
|
#
|
|
208
|
-
|
|
209
|
-
|
|
256
|
+
if sum(to_fix)>0:
|
|
257
|
+
if verbose: log.write(" -Filling "+str(sum(to_fix))+" EA and NEA columns using SNPID's CHR:POS:NEA:EA...")
|
|
210
258
|
#
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
259
|
+
if fixeanea_flip == True:
|
|
260
|
+
if verbose: log.write(" -Flipped : CHR:POS:NEA:EA -> CHR:POS:EA:NEA ")
|
|
261
|
+
sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
|
|
262
|
+
sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
|
|
263
|
+
else:
|
|
264
|
+
if verbose: log.write(" -Chr:pos:a1:a2...a1->EA , a2->NEA ")
|
|
265
|
+
sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
|
|
266
|
+
sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
|
|
267
|
+
|
|
219
268
|
# #to_change_status = sumstats[status].str.match(r"\w\w\w[45]\w\w\w")
|
|
220
269
|
# #sumstats.loc[to_fix&to_change_status,status] = vchange_status(sumstats.loc[to_fix&to_change_status,status],4,"2")
|
|
221
270
|
# #sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[1]).astype("string")
|
|
222
271
|
# #sumstats.loc[to_fix,rsid].apply(lambda x:re.split(':|_|-',x)[1]).astype("Int64")
|
|
223
272
|
|
|
224
273
|
############################ fixing id ###################################################
|
|
225
|
-
if fixsep
|
|
274
|
+
if fixsep == True:
|
|
226
275
|
if snpid in sumstats.columns:
|
|
227
276
|
if verbose: log.write(' -Replacing [_-] in SNPID with ":" ...')
|
|
228
|
-
sumstats
|
|
277
|
+
sumstats[snpid] = sumstats[snpid].str.replace(r"[_-]",":",regex=True)
|
|
278
|
+
|
|
279
|
+
if fixprefix == True:
|
|
280
|
+
if snpid in sumstats.columns:
|
|
281
|
+
if verbose: log.write(' -Removing /^chr/ in SNPID ...')
|
|
282
|
+
prefix_removed = sumstats[snpid].str.extract(r'^(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
283
|
+
sumstats.loc[~prefix_removed.isna(),snpid] = prefix_removed[~prefix_removed.isna()]
|
|
229
284
|
|
|
230
|
-
if fixid
|
|
285
|
+
if fixid == True:
|
|
231
286
|
if snpid not in sumstats.columns:
|
|
232
287
|
# initiate a SNPID column
|
|
233
|
-
sumstats
|
|
288
|
+
sumstats[snpid]=pd.Series(dtype="string")
|
|
234
289
|
|
|
235
290
|
if (rsid in sumstats.columns) and (sum(is_rs_chrpos)>0) :
|
|
236
|
-
sumstats
|
|
291
|
+
sumstats[snpid]= sumstats.loc[is_rs_chrpos,rsid]
|
|
237
292
|
|
|
238
293
|
if (chrom in sumstats.columns) and (pos in sumstats.columns):
|
|
239
294
|
#only fix when CHR and POS is available
|
|
@@ -288,7 +343,8 @@ def fixID(sumstats,
|
|
|
288
343
|
after_number=sum(sumstats[snpid].isna())
|
|
289
344
|
if verbose: log.write(" -Fixed "+ str(pre_number - after_number) +" variants ID...")
|
|
290
345
|
elif verbose: log.write(" -ID unfixable: no CHR and POS columns or no SNPID. ")
|
|
291
|
-
|
|
346
|
+
|
|
347
|
+
finished(log,verbose,_end_line)
|
|
292
348
|
return sumstats
|
|
293
349
|
|
|
294
350
|
""
|
|
@@ -303,20 +359,39 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
303
359
|
remove duplicate SNPs based on 3. rsID
|
|
304
360
|
remove multiallelic SNPs based on 4. CHR, POS
|
|
305
361
|
'''
|
|
306
|
-
|
|
362
|
+
|
|
363
|
+
##start function with col checking##########################################################
|
|
364
|
+
_start_line = "remove duplicated/multiallelic variants"
|
|
365
|
+
_end_line = "removing duplicated/multiallelic variants"
|
|
366
|
+
_start_cols =[]
|
|
367
|
+
_start_function = ".remove_dup()"
|
|
368
|
+
_must_args ={}
|
|
369
|
+
|
|
370
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
371
|
+
log=log,
|
|
372
|
+
verbose=verbose,
|
|
373
|
+
start_line=_start_line,
|
|
374
|
+
end_line=_end_line,
|
|
375
|
+
start_cols=_start_cols,
|
|
376
|
+
start_function=_start_function,
|
|
377
|
+
**_must_args)
|
|
378
|
+
if is_enough_info == False: return sumstats
|
|
379
|
+
############################################################################################
|
|
380
|
+
|
|
381
|
+
if verbose: log.write(" -Removing mode:{}".format(mode))
|
|
307
382
|
# sort the variants using the specified column before removing
|
|
308
383
|
if keep_col is not None :
|
|
309
384
|
if keep_col in sumstats.columns:
|
|
310
|
-
if verbose: log.write("Start to sort the sumstats using "
|
|
385
|
+
if verbose: log.write("Start to sort the sumstats using {}...".format(keep_col))
|
|
311
386
|
sumstats = sumstats.sort_values(by=keep_col,ascending=keep_ascend)
|
|
312
387
|
else:
|
|
313
388
|
if verbose: log.write("Column" + keep_col +" was not detected... skipping... ")
|
|
314
389
|
total_number = len(sumstats)
|
|
315
390
|
|
|
316
391
|
# remove by duplicated SNPID
|
|
317
|
-
if (snpid in sumstats.columns) and "d" in mode:
|
|
392
|
+
if (snpid in sumstats.columns) and ("d" in mode or "s" in mode):
|
|
318
393
|
if verbose: log.write("Start to remove duplicated variants based on snpid...{}".format(_get_version()))
|
|
319
|
-
|
|
394
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
320
395
|
if verbose: log.write(" -Which variant to keep: ", keep )
|
|
321
396
|
pre_number =len(sumstats)
|
|
322
397
|
if snpid in sumstats.columns:
|
|
@@ -326,18 +401,19 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
326
401
|
if verbose: log.write(" -Removed ",pre_number -after_number ," based on SNPID...")
|
|
327
402
|
|
|
328
403
|
# remove by duplicated rsID
|
|
329
|
-
if (rsid in sumstats.columns) and ("d" in mode):
|
|
404
|
+
if (rsid in sumstats.columns) and ("d" in mode or "r" in mode):
|
|
330
405
|
# keep na and remove duplicated
|
|
331
406
|
pre_number =len(sumstats)
|
|
332
407
|
if verbose: log.write("Start to remove duplicated variants based on rsID...")
|
|
408
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
333
409
|
sumstats = sumstats.loc[sumstats[rsid].isna() | (~sumstats.duplicated(subset=rsid, keep=keep)),:]
|
|
334
410
|
after_number=len(sumstats)
|
|
335
411
|
if verbose: log.write(" -Removed ",pre_number -after_number ," based on rsID...")
|
|
336
412
|
|
|
337
413
|
# remove by duplicated variants by CHR:POS:NEA:EA
|
|
338
|
-
if (chrom in sumstats.columns) and (pos in sumstats.columns) and (nea in sumstats.columns) and (ea in sumstats.columns) and "d" in mode:
|
|
414
|
+
if (chrom in sumstats.columns) and (pos in sumstats.columns) and (nea in sumstats.columns) and (ea in sumstats.columns) and ("d" in mode or "c" in mode):
|
|
339
415
|
if verbose: log.write("Start to remove duplicated variants based on CHR,POS,EA and NEA...")
|
|
340
|
-
|
|
416
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
341
417
|
if verbose: log.write(" -Which variant to keep: ", keep )
|
|
342
418
|
pre_number =len(sumstats)
|
|
343
419
|
if snpid in sumstats.columns:
|
|
@@ -351,8 +427,9 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
351
427
|
# keep na and remove duplicated
|
|
352
428
|
pre_number =len(sumstats)
|
|
353
429
|
if verbose: log.write("Start to remove multiallelic variants based on chr:pos...")
|
|
430
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
354
431
|
if verbose: log.write(" -Which variant to keep: ", keep )
|
|
355
|
-
sumstats = sumstats.loc[(~sumstats
|
|
432
|
+
sumstats = sumstats.loc[(~sumstats[[chrom,pos]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos], keep=keep)),:]
|
|
356
433
|
after_number=len(sumstats)
|
|
357
434
|
if verbose: log.write(" -Removed ",pre_number -after_number," multiallelic variants...")
|
|
358
435
|
after_number=len(sumstats)
|
|
@@ -360,310 +437,376 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
360
437
|
# resort the coordinates
|
|
361
438
|
if verbose: log.write(" -Removed ",total_number -after_number," variants in total.")
|
|
362
439
|
if keep_col is not None :
|
|
363
|
-
if verbose: log.write(" -Sort the coordinates...")
|
|
440
|
+
if verbose: log.write(" -Sort the coordinates based on CHR and POS...")
|
|
364
441
|
sumstats = sortcoordinate(sumstats,verbose=False)
|
|
365
442
|
|
|
366
|
-
if
|
|
443
|
+
if "n" in mode or remove==True:
|
|
367
444
|
# if remove==True, remove NAs
|
|
368
445
|
if verbose: log.write(" -Removing NAs...")
|
|
369
446
|
pre_number =len(sumstats)
|
|
370
|
-
|
|
447
|
+
specified_columns = []
|
|
448
|
+
if "d" in mode:
|
|
449
|
+
specified_columns.append(rsid)
|
|
450
|
+
specified_columns.append(snpid)
|
|
451
|
+
specified_columns.append(chrom)
|
|
452
|
+
specified_columns.append(pos)
|
|
453
|
+
specified_columns.append(ea)
|
|
454
|
+
specified_columns.append(nea)
|
|
455
|
+
if "r" in mode:
|
|
456
|
+
specified_columns.append(rsid)
|
|
457
|
+
if "s" in mode:
|
|
458
|
+
specified_columns.append(snpid)
|
|
459
|
+
if "m" in mode:
|
|
460
|
+
specified_columns.append(chrom)
|
|
461
|
+
specified_columns.append(pos)
|
|
462
|
+
if "c" in mode:
|
|
463
|
+
specified_columns.append(chrom)
|
|
464
|
+
specified_columns.append(pos)
|
|
465
|
+
specified_columns.append(ea)
|
|
466
|
+
specified_columns.append(nea)
|
|
467
|
+
sumstats = sumstats.loc[~sumstats[specified_columns].isna().any(axis=1),:]
|
|
371
468
|
after_number=len(sumstats)
|
|
372
|
-
if verbose: log.write(" -Removed ",pre_number -after_number," variants with NA values.")
|
|
373
|
-
|
|
469
|
+
if verbose: log.write(" -Removed ",pre_number -after_number," variants with NA values in {} .".format(set(specified_columns)))
|
|
470
|
+
|
|
471
|
+
finished(log,verbose,_end_line)
|
|
374
472
|
return sumstats
|
|
375
473
|
|
|
376
474
|
###############################################################################################################
|
|
377
475
|
# 20230128
|
|
378
476
|
def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",24),mt=("MT",25), remove=False, verbose=True, chrom_list = None, minchr=1,log=Log()):
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
if verbose: log.write("Start to fix chromosome notation...{}".format(_get_version()))
|
|
386
|
-
if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
|
|
387
|
-
|
|
388
|
-
# convert to string datatype
|
|
389
|
-
try:
|
|
390
|
-
if verbose: log.write(" -Checking CHR data type...")
|
|
391
|
-
if sumstats.loc[:,chrom].dtype == "string":
|
|
392
|
-
pass
|
|
393
|
-
else:
|
|
394
|
-
sumstats.loc[:,chrom] = sumstats.loc[:,chrom].astype("string")
|
|
395
|
-
except:
|
|
396
|
-
if verbose: log.write(" -Force converting to pd string data type...")
|
|
397
|
-
sumstats.loc[:,chrom] = sumstats.loc[:,chrom].astype("string")
|
|
398
|
-
|
|
399
|
-
# check if CHR is numeric
|
|
400
|
-
is_chr_fixed = sumstats[chrom].str.isnumeric()
|
|
401
|
-
# fill NAs with False
|
|
402
|
-
is_chr_fixed = is_chr_fixed.fillna(False)
|
|
403
|
-
if verbose: log.write(" -Variants with standardized chromosome notation:",sum(is_chr_fixed))
|
|
404
|
-
|
|
405
|
-
# if there are variants whose CHR need to be fixed
|
|
406
|
-
if sum(is_chr_fixed)<len(sumstats):
|
|
407
|
-
|
|
408
|
-
#extract the CHR number or X Y M MT
|
|
409
|
-
chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'^(chr)?(\d{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
477
|
+
##start function with col checking##########################################################
|
|
478
|
+
_start_line = "fix chromosome notation (CHR)"
|
|
479
|
+
_end_line = "fixing chromosome notation (CHR)"
|
|
480
|
+
_start_cols =[chrom,status]
|
|
481
|
+
_start_function = ".fix_chr()"
|
|
482
|
+
_must_args ={}
|
|
410
483
|
|
|
411
|
-
|
|
412
|
-
|
|
484
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
485
|
+
log=log,
|
|
486
|
+
verbose=verbose,
|
|
487
|
+
start_line=_start_line,
|
|
488
|
+
end_line=_end_line,
|
|
489
|
+
start_cols=_start_cols,
|
|
490
|
+
start_function=_start_function,
|
|
491
|
+
**_must_args)
|
|
492
|
+
if is_enough_info == False: return sumstats
|
|
493
|
+
############################################################################################
|
|
413
494
|
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
is_chr_invalid = (~is_chr_fixable)&(~is_chr_na)
|
|
421
|
-
if sum(is_chr_invalid)>0 and verbose:
|
|
422
|
-
log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid))
|
|
423
|
-
try:
|
|
424
|
-
log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()))
|
|
425
|
-
except:
|
|
426
|
-
pass
|
|
427
|
-
elif verbose:
|
|
428
|
-
log.write(" -No unrecognized chromosome notations...")
|
|
429
|
-
|
|
430
|
-
# Assign good chr back to sumstats
|
|
431
|
-
sumstats.loc[is_chr_fixable.index,chrom] = chr_extracted[is_chr_fixable.index]
|
|
495
|
+
#chrom_list = get_chr_list() #bottom
|
|
496
|
+
if chrom_list is None:
|
|
497
|
+
chrom_list = get_chr_list()
|
|
498
|
+
#if check_col(sumstats,chrom,status) is not True:
|
|
499
|
+
# if verbose: log.write(".fix_chr: Specified not detected..skipping...")
|
|
500
|
+
# return sumstats
|
|
432
501
|
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
# if sumstats contain sex CHR
|
|
440
|
-
if sum(sex_chr)>0:
|
|
441
|
-
if verbose: log.write(" -Identifying non-autosomal chromosomes : {}, {}, and {} ...".format(x[0],y[0],mt[0]))
|
|
442
|
-
if verbose: log.write(" -Identified ",str(sum(sex_chr))," variants on sex chromosomes...")
|
|
443
|
-
|
|
444
|
-
# convert "X, Y, MT" to numbers
|
|
445
|
-
convert_num_to_xymt={}
|
|
446
|
-
if x[0].lower() in sumstats[chrom].values or x[0].upper() in sumstats[chrom].values:
|
|
447
|
-
convert_num_to_xymt[x[0].lower()] = str(x[1])
|
|
448
|
-
convert_num_to_xymt[x[0].upper()] = str(x[1])
|
|
449
|
-
if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(x[0], x[1]))
|
|
450
|
-
if y[0].lower() in sumstats[chrom].values or y[0].upper() in sumstats[chrom].values:
|
|
451
|
-
convert_num_to_xymt[y[0].lower()] = str(y[1])
|
|
452
|
-
convert_num_to_xymt[y[0].upper()] = str(y[1])
|
|
453
|
-
if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(y[0], y[1]))
|
|
454
|
-
if mt[0].lower() in sumstats[chrom].values or mt[0].upper() in sumstats[chrom].values:
|
|
455
|
-
convert_num_to_xymt[mt[0].lower()] = str(mt[1])
|
|
456
|
-
convert_num_to_xymt[mt[0].upper()] = str(mt[1])
|
|
457
|
-
if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(mt[0], mt[1]))
|
|
458
|
-
sumstats.loc[sex_chr,chrom] =sumstats.loc[sex_chr,chrom].map(convert_num_to_xymt)
|
|
459
|
-
|
|
460
|
-
# change status code
|
|
461
|
-
sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
|
|
462
|
-
if len(is_chr_fixable.index)>0:
|
|
463
|
-
sumstats.loc[is_chr_fixable.index,status] = vchange_status(sumstats.loc[is_chr_fixable.index,status],4,"986","520")
|
|
464
|
-
if len(is_chr_fixable.index)>0:
|
|
465
|
-
sumstats.loc[is_chr_invalid.index,status] = vchange_status(sumstats.loc[is_chr_invalid.index,status],4,"986","743")
|
|
466
|
-
|
|
467
|
-
# check variants with unrecognized CHR
|
|
468
|
-
unrecognized_num = sum(~sumstats[chrom].isin(chrom_list))
|
|
469
|
-
if (remove is True) and unrecognized_num>0:
|
|
470
|
-
# remove variants with unrecognized CHR
|
|
471
|
-
try:
|
|
472
|
-
if verbose: log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])))
|
|
473
|
-
except:
|
|
474
|
-
pass
|
|
475
|
-
if verbose: log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.")
|
|
476
|
-
try:
|
|
477
|
-
log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()))
|
|
478
|
-
except:
|
|
479
|
-
pass
|
|
480
|
-
#sumstats = sumstats.loc[sumstats.index[sumstats[chrom].isin(chrom_list)],:]
|
|
481
|
-
good_chr = sumstats[chrom].isin(chrom_list)
|
|
482
|
-
sumstats = sumstats.loc[good_chr, :].copy()
|
|
502
|
+
|
|
503
|
+
# convert to string datatype
|
|
504
|
+
try:
|
|
505
|
+
if verbose: log.write(" -Checking CHR data type...")
|
|
506
|
+
if sumstats[chrom].dtype == "string":
|
|
507
|
+
pass
|
|
483
508
|
else:
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
if sum(out_of_range_chr)>0:
|
|
498
|
-
if verbose: log.write(" -Sanity check for CHR...")
|
|
499
|
-
if verbose:log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr))
|
|
500
|
-
sumstats = sumstats.loc[~out_of_range_chr,:]
|
|
501
|
-
|
|
502
|
-
if verbose: log.write("Finished fixing chromosome notation successfully!")
|
|
509
|
+
sumstats[chrom] = sumstats[chrom].astype("string")
|
|
510
|
+
except:
|
|
511
|
+
if verbose: log.write(" -Force converting to pd string data type...")
|
|
512
|
+
sumstats[chrom] = sumstats[chrom].astype("string")
|
|
513
|
+
|
|
514
|
+
# check if CHR is numeric
|
|
515
|
+
is_chr_fixed = sumstats[chrom].str.isnumeric()
|
|
516
|
+
# fill NAs with False
|
|
517
|
+
is_chr_fixed = is_chr_fixed.fillna(False)
|
|
518
|
+
if verbose: log.write(" -Variants with standardized chromosome notation:",sum(is_chr_fixed))
|
|
519
|
+
|
|
520
|
+
# if there are variants whose CHR need to be fixed
|
|
521
|
+
if sum(is_chr_fixed)<len(sumstats):
|
|
503
522
|
|
|
504
|
-
|
|
523
|
+
#extract the CHR number or X Y M MT
|
|
524
|
+
chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'^(chr)?(\d{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
505
525
|
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
if
|
|
512
|
-
|
|
513
|
-
return sumstats
|
|
514
|
-
if verbose: log.write("Start to fix basepair positions...{}".format(_get_version()))
|
|
515
|
-
if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
|
|
526
|
+
is_chr_fixable = ~chr_extracted.isna()
|
|
527
|
+
if verbose: log.write(" -Variants with fixable chromosome notations:",sum(is_chr_fixable))
|
|
528
|
+
|
|
529
|
+
# For not fixed variants, check if na
|
|
530
|
+
is_chr_na = sumstats.loc[~is_chr_fixed, chrom].isna()
|
|
531
|
+
if sum(is_chr_na)>0 and verbose:
|
|
532
|
+
log.write(" -Variants with NA chromosome notations:",sum(is_chr_na))
|
|
516
533
|
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
534
|
+
# Check variants with CHR being not NA and not fixable
|
|
535
|
+
is_chr_invalid = (~is_chr_fixable)&(~is_chr_na)
|
|
536
|
+
if sum(is_chr_invalid)>0 and verbose:
|
|
537
|
+
log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid))
|
|
538
|
+
try:
|
|
539
|
+
log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()))
|
|
540
|
+
except:
|
|
541
|
+
pass
|
|
542
|
+
elif verbose:
|
|
543
|
+
log.write(" -No unrecognized chromosome notations...")
|
|
520
544
|
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
sumstats.loc[:,pos] = sumstats.loc[:,pos].astype('string')
|
|
524
|
-
# if so, remove thousands separator
|
|
525
|
-
if verbose: log.write(' -Removing thousands separator "," or underbar "_" ...')
|
|
526
|
-
sumstats.loc[~is_pos_na, pos] = sumstats.loc[~is_pos_na, pos].str.replace(r'[,_]', '' ,regex=True)
|
|
527
|
-
except:
|
|
528
|
-
pass
|
|
545
|
+
# Assign good chr back to sumstats
|
|
546
|
+
sumstats.loc[is_chr_fixable.index,chrom] = chr_extracted[is_chr_fixable.index]
|
|
529
547
|
|
|
530
|
-
#
|
|
531
|
-
|
|
532
|
-
if verbose: log.write(' -Converting to Int64 data type ...')
|
|
533
|
-
sumstats[pos] = sumstats[pos].astype('Int64')
|
|
534
|
-
except:
|
|
535
|
-
if verbose: log.write(' -Force converting to Int64 data type ...')
|
|
536
|
-
sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
|
|
537
|
-
is_pos_fixed = ~sumstats.loc[:,pos].isna()
|
|
538
|
-
is_pos_invalid = (~is_pos_na)&(~is_pos_fixed)
|
|
548
|
+
# X, Y, MT to 23,24,25
|
|
549
|
+
xymt_list = [x[0].lower(),y[0].lower(),mt[0].lower(),x[0].upper(),y[0].upper(),mt[0].upper()]
|
|
539
550
|
|
|
540
|
-
sumstats
|
|
541
|
-
|
|
551
|
+
# check if sumstats contain sex CHR
|
|
552
|
+
sex_chr = sumstats[chrom].isin(xymt_list)
|
|
542
553
|
|
|
543
|
-
#
|
|
544
|
-
if
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
+
# if sumstats contain sex CHR
|
|
555
|
+
if sum(sex_chr)>0:
|
|
556
|
+
if verbose: log.write(" -Identifying non-autosomal chromosomes : {}, {}, and {} ...".format(x[0],y[0],mt[0]))
|
|
557
|
+
if verbose: log.write(" -Identified ",str(sum(sex_chr))," variants on sex chromosomes...")
|
|
558
|
+
|
|
559
|
+
# convert "X, Y, MT" to numbers
|
|
560
|
+
convert_num_to_xymt={}
|
|
561
|
+
if x[0].lower() in sumstats[chrom].values or x[0].upper() in sumstats[chrom].values:
|
|
562
|
+
convert_num_to_xymt[x[0].lower()] = str(x[1])
|
|
563
|
+
convert_num_to_xymt[x[0].upper()] = str(x[1])
|
|
564
|
+
if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(x[0], x[1]))
|
|
565
|
+
if y[0].lower() in sumstats[chrom].values or y[0].upper() in sumstats[chrom].values:
|
|
566
|
+
convert_num_to_xymt[y[0].lower()] = str(y[1])
|
|
567
|
+
convert_num_to_xymt[y[0].upper()] = str(y[1])
|
|
568
|
+
if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(y[0], y[1]))
|
|
569
|
+
if mt[0].lower() in sumstats[chrom].values or mt[0].upper() in sumstats[chrom].values:
|
|
570
|
+
convert_num_to_xymt[mt[0].lower()] = str(mt[1])
|
|
571
|
+
convert_num_to_xymt[mt[0].upper()] = str(mt[1])
|
|
572
|
+
if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(mt[0], mt[1]))
|
|
573
|
+
sumstats.loc[sex_chr,chrom] =sumstats.loc[sex_chr,chrom].map(convert_num_to_xymt)
|
|
554
574
|
|
|
555
|
-
|
|
556
|
-
|
|
575
|
+
# change status code
|
|
576
|
+
sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
|
|
577
|
+
if len(is_chr_fixable.index)>0:
|
|
578
|
+
sumstats.loc[is_chr_fixable.index,status] = vchange_status(sumstats.loc[is_chr_fixable.index,status],4,"986","520")
|
|
579
|
+
if len(is_chr_fixable.index)>0:
|
|
580
|
+
sumstats.loc[is_chr_invalid.index,status] = vchange_status(sumstats.loc[is_chr_invalid.index,status],4,"986","743")
|
|
557
581
|
|
|
558
|
-
|
|
582
|
+
# check variants with unrecognized CHR
|
|
583
|
+
unrecognized_num = sum(~sumstats[chrom].isin(chrom_list))
|
|
584
|
+
if (remove is True) and unrecognized_num>0:
|
|
585
|
+
# remove variants with unrecognized CHR
|
|
586
|
+
try:
|
|
587
|
+
if verbose: log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])))
|
|
588
|
+
except:
|
|
589
|
+
pass
|
|
590
|
+
if verbose: log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.")
|
|
591
|
+
try:
|
|
592
|
+
log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()))
|
|
593
|
+
except:
|
|
594
|
+
pass
|
|
595
|
+
#sumstats = sumstats.loc[sumstats.index[sumstats[chrom].isin(chrom_list)],:]
|
|
596
|
+
good_chr = sumstats[chrom].isin(chrom_list)
|
|
597
|
+
sumstats = sumstats.loc[good_chr, :].copy()
|
|
598
|
+
else:
|
|
599
|
+
if verbose: log.write(" -All CHR are already fixed...")
|
|
600
|
+
sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
|
|
601
|
+
|
|
602
|
+
# Convert string to int
|
|
603
|
+
try:
|
|
604
|
+
sumstats[chrom] = sumstats[chrom].astype('Int64')
|
|
605
|
+
except:
|
|
606
|
+
# # force convert
|
|
607
|
+
sumstats[chrom] = np.floor(pd.to_numeric(sumstats[chrom], errors='coerce')).astype('Int64')
|
|
608
|
+
|
|
609
|
+
# filter out variants with CHR <=0
|
|
610
|
+
out_of_range_chr = sumstats[chrom] < minchr
|
|
611
|
+
out_of_range_chr = out_of_range_chr.fillna(False)
|
|
612
|
+
if sum(out_of_range_chr)>0:
|
|
613
|
+
if verbose: log.write(" -Sanity check for CHR...")
|
|
614
|
+
if verbose:log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr))
|
|
615
|
+
sumstats = sumstats.loc[~out_of_range_chr,:]
|
|
616
|
+
|
|
617
|
+
finished(log,verbose,_end_line)
|
|
618
|
+
return sumstats
|
|
619
|
+
|
|
620
|
+
###############################################################################################################
|
|
621
|
+
# 20230128
|
|
622
|
+
def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True, lower_limit=0 , upper_limit=None , limit=250000000, log=Log()):
    '''
    Convert the basepair position column to nullable Int64 and drop out-of-bound positions.

    Steps performed, in order:
      1. If the column is string/object typed, strip "," and "_" separators from non-NA values.
      2. Cast the column to Int64. If the direct cast fails, values are parsed with
         pd.to_numeric(errors='coerce'), floored, and cast; unparseable values become NA.
      3. Pass the status codes of converted rows and of rows that became NA during
         conversion through vchange_status() (the exact digit mapping is defined by that helper).
      4. Drop rows whose position is <= lower_limit or >= upper_limit. NA positions are
         kept unless `remove` is True.

    Parameters
    ----------
    sumstats : pd.DataFrame holding at least the `pos` and `status` columns.
    pos, status : column names of the position and status-code columns.
    remove : when exactly True, rows with NA positions are dropped as well.
    verbose : whether progress messages are written to `log`.
    lower_limit : exclusive lower bound for valid positions.
    upper_limit : exclusive upper bound; falls back to `limit` when None.
    limit : default upper bound used when `upper_limit` is None.
    log : logger object exposing write().

    Returns
    -------
    pd.DataFrame : the filtered sumstats (returned unchanged when start_to() reports
    that the required columns are not available).
    '''
    ##start function with col checking##########################################################
    _start_line = "fix basepair positions (POS)"
    _end_line = "fixing basepair positions (POS)"
    _start_cols =[pos,status]
    _start_function = ".fix_pos()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return sumstats
    ############################################################################################

    if upper_limit is None:
        upper_limit = limit

    all_var_num = len(sumstats)
    # remember which positions were missing before any conversion so that
    # "missing" and "became invalid during conversion" can be told apart later
    is_pos_na = sumstats[pos].isna()

    try:
        pos_dtype = str(sumstats[pos].dtype)
        if pos_dtype == "string" or pos_dtype == "object":
            sumstats[pos] = sumstats[pos].astype('string')
            # if so, remove thousands separator
            if verbose: log.write(' -Removing thousands separator "," or underbar "_" ...')
            sumstats.loc[~is_pos_na, pos] = sumstats.loc[~is_pos_na, pos].str.replace(r'[,_]', '' ,regex=True)
    except Exception:
        # Best-effort clean-up only: the forced numeric conversion below copes with
        # whatever is left. Exception (not a bare except) keeps Ctrl-C working.
        pass

    # convert POS to integer
    if verbose: log.write(' -Converting to Int64 data type ...')
    try:
        sumstats[pos] = sumstats[pos].astype('Int64')
    except Exception:
        if verbose: log.write(' -Force converting to Int64 data type ...')
        sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
    is_pos_fixed = ~sumstats[pos].isna()
    is_pos_invalid = (~is_pos_na)&(~is_pos_fixed)

    sumstats.loc[is_pos_fixed,status] = vchange_status(sumstats.loc[is_pos_fixed,status] ,4,"975","630")
    sumstats.loc[is_pos_invalid,status] = vchange_status(sumstats.loc[is_pos_invalid,status],4,"975","842")

    # remove outlier, limit:250,000,000
    if verbose: log.write(" -Position bound:({} , {:,})".format(lower_limit, upper_limit))
    is_pos_na = sumstats[pos].isna()
    # NA positions compare as NA; masking with ~is_pos_na turns those into False
    out_lier= ((sumstats[pos]<=lower_limit) | (sumstats[pos]>=upper_limit)) & (~is_pos_na)
    if verbose: log.write(" -Removed outliers:",int(out_lier.sum()))
    sumstats = sumstats.loc[~out_lier,:]
    #remove na
    if remove is True:
        sumstats = sumstats.loc[~sumstats[pos].isna(),:]
        remain_var_num = len(sumstats)
        if verbose: log.write(" -Removed "+str(all_var_num - remain_var_num)+" variants with bad positions.")

    finished(log,verbose,_end_line)
    return sumstats
|
|
559
684
|
|
|
560
685
|
###############################################################################################################
|
|
561
686
|
# 20220514
|
|
562
687
|
def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=True,log=Log()):
    '''
    Upper-case EA/NEA, store them as categoricals, flag or remove bad alleles and update STATUS.

    An allele is "bad" when it is NA or contains any character other than A/C/T/G.
    With `remove == True`, rows with a bad EA or NEA are dropped first, then rows whose
    EA equals NEA. Otherwise nothing is dropped and NA alleles are replaced by "N".
    The status codes are then passed through vchange_status() for, in this order:
    invalid rows, rows with NA alleles, not-normalized variants, SNPs, indels and
    normalized indels (the exact digit mapping is defined by that helper).

    Parameters
    ----------
    sumstats : pd.DataFrame holding at least the `ea`, `nea` and `status` columns.
    ea, nea, status : column names of effect allele, non-effect allele and status code.
    remove : when equal to True, rows with bad or identical alleles are dropped.
    verbose : whether progress messages are written to `log`.
    log : logger object exposing write().

    Returns
    -------
    pd.DataFrame : the processed sumstats (returned unchanged when start_to() reports
    that the required columns are not available).
    '''
    ##start function with col checking##########################################################
    _start_line = "fix alleles (EA and NEA)"
    _end_line = "fixing alleles (EA and NEA)"
    _start_cols =[ea, nea,status]
    _start_function = ".fix_allele()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return sumstats
    ############################################################################################

    if verbose: log.write(" -Converted all bases to string datatype and UPPERCASE.")
    # Categories are sorted so that their order (and therefore the categorical codes)
    # is reproducible; iterating a set of strings varies between interpreter runs.
    upper_ea = sumstats[ea].str.upper()
    upper_nea = sumstats[nea].str.upper()
    categories = sorted(x for x in (set(upper_ea)|set(upper_nea)|set("N")) if pd.notna(x))
    sumstats[ea]=pd.Categorical(upper_ea,categories = categories)
    sumstats[nea]=pd.Categorical(upper_nea,categories = categories)
    all_var_num = len(sumstats)

    ## check ATCG
    bad_ea = sumstats[ea].str.contains("[^actgACTG]",na=True)
    bad_nea = sumstats[nea].str.contains("[^actgACTG]",na=True)
    good_ea = ~bad_ea
    good_nea = ~bad_nea

    log.write(" -Variants with bad EA : {}".format(int(bad_ea.sum())), verbose=verbose)
    log.write(" -Variants with bad NEA : {}".format(int(bad_nea.sum())), verbose=verbose)

    ## check NA
    is_eanea_na = sumstats[ea].isna() | sumstats[nea].isna()
    log.write(" -Variants with NA for EA or NEA: {}".format(int(is_eanea_na.sum())), verbose=verbose)

    ## check same alleles
    not_variant = sumstats[nea] == sumstats[ea]
    log.write(" -Variants with same EA and NEA: {}".format(int(not_variant.sum())), verbose=verbose)

    ## sum up invalid variants
    is_invalid = bad_ea | bad_nea | not_variant

    exclude = bad_nea | bad_ea

    if verbose:
        bad_ea_examples = set(sumstats.loc[bad_ea,ea].head())
        if len(bad_ea_examples) >0:
            log.write(" -A look at the non-ATCG EA:",bad_ea_examples,"...")
        bad_nea_examples = set(sumstats.loc[bad_nea,nea].head())
        if len(bad_nea_examples) >0:
            log.write(" -A look at the non-ATCG NEA:",bad_nea_examples,"...")

    if remove == True:
        sumstats = sumstats.loc[(good_ea & good_nea),:].copy()
        good_eanea_num = len(sumstats)
        if verbose: log.write(" -Removed "+str(all_var_num - good_eanea_num)+" variants with NA alleles or alleles that contain bases other than A/C/T/G.")
        sumstats = sumstats.loc[(good_ea & good_nea & (~not_variant)),:].copy()
        good_eanea_notsame_num = len(sumstats)
        if verbose: log.write(" -Removed "+str(good_eanea_num - good_eanea_notsame_num)+" variants with same allele for EA and NEA.")
    else:
        sumstats[[ea,nea]] = sumstats[[ea,nea]].fillna("N")
        if verbose: log.write(" -Detected "+str(int(exclude.sum()))+" variants with alleles that contain bases other than A/C/T/G .")

    # rebuild the categoricals from the alleles that are actually left
    upper_ea = sumstats[ea].str.upper()
    upper_nea = sumstats[nea].str.upper()
    categories = sorted(x for x in (set(upper_ea)|set(upper_nea)|set("N")) if pd.notna(x))
    sumstats[ea]=pd.Categorical(upper_ea,categories = categories)
    sumstats[nea]=pd.Categorical(upper_nea,categories = categories)

    ea_len = sumstats[ea].str.len()
    nea_len = sumstats[nea].str.len()

    is_eanea_fixed = good_ea | good_nea
    is_snp = (ea_len==1) &(nea_len==1)
    is_indel = (ea_len!=nea_len)
    is_not_normalized = (ea_len>1) &(nea_len>1)
    is_normalized = is_indel &( (ea_len==1) &(nea_len>1) | (ea_len>1) &(nea_len==1) )

    # The order of the updates below matters: later updates act on the status codes
    # produced by earlier ones (the last one only rewrites codes set by the indel step).
    if is_invalid.any():
        sumstats.loc[is_invalid, status] = vchange_status(sumstats.loc[is_invalid,status], 5,"9","6")
    if is_eanea_na.any():
        sumstats.loc[is_eanea_na,status] = vchange_status(sumstats.loc[is_eanea_na, status], 5,"9","7")

    fixed_not_normalized = is_eanea_fixed&is_not_normalized
    if fixed_not_normalized.any():
        sumstats.loc[fixed_not_normalized,status] = vchange_status(sumstats.loc[fixed_not_normalized,status], 5,"9","5")

    fixed_snp = is_eanea_fixed&is_snp
    if fixed_snp.any():
        sumstats.loc[fixed_snp, status] = vchange_status(sumstats.loc[fixed_snp,status], 5,"9","0")

    fixed_indel = is_eanea_fixed&is_indel
    if fixed_indel.any():
        sumstats.loc[fixed_indel,status] = vchange_status(sumstats.loc[fixed_indel, status], 5,"9","4")

    fixed_normalized = is_eanea_fixed&is_normalized
    if fixed_normalized.any():
        sumstats.loc[fixed_normalized,status] = vchange_status(sumstats.loc[fixed_normalized, status], 5,"4","3")

    finished(log,verbose,_end_line)
    return sumstats
|
|
656
787
|
|
|
657
788
|
###############################################################################################################
|
|
658
789
|
# 20220721
|
|
659
790
|
|
|
660
791
|
def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
792
|
+
##start function with col checking##########################################################
|
|
793
|
+
_start_line = "normalize indels"
|
|
794
|
+
_end_line = "normalizing indels"
|
|
795
|
+
_start_cols =[ea, nea,status]
|
|
796
|
+
_start_function = ".normalize()"
|
|
797
|
+
_must_args ={}
|
|
798
|
+
|
|
799
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
800
|
+
log=log,
|
|
801
|
+
verbose=verbose,
|
|
802
|
+
start_line=_start_line,
|
|
803
|
+
end_line=_end_line,
|
|
804
|
+
start_cols=_start_cols,
|
|
805
|
+
start_function=_start_function,
|
|
806
|
+
**_must_args)
|
|
807
|
+
if is_enough_info == False: return sumstats
|
|
808
|
+
############################################################################################
|
|
809
|
+
|
|
667
810
|
#variants_to_check = status_match(sumstats[status],5,[4,5]) #
|
|
668
811
|
#r'\w\w\w\w[45]\w\w'
|
|
669
812
|
variants_to_check = sumstats[status].str[4].str.match(r'4|5', case=False, flags=0, na=False)
|
|
@@ -677,7 +820,8 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
|
|
|
677
820
|
n_cores=1
|
|
678
821
|
pool = Pool(n_cores)
|
|
679
822
|
map_func = partial(normalizeallele,pos=pos,nea=nea,ea=ea,status=status)
|
|
680
|
-
df_split = np.array_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
|
|
823
|
+
#df_split = np.array_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
|
|
824
|
+
df_split = _df_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
|
|
681
825
|
normalized_pd = pd.concat(pool.map(map_func,df_split))
|
|
682
826
|
pool.close()
|
|
683
827
|
pool.join()
|
|
@@ -707,16 +851,16 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
|
|
|
707
851
|
else:
|
|
708
852
|
log.write(" -All variants are already normalized..")
|
|
709
853
|
###################################################################################################################
|
|
710
|
-
categories = set(sumstats
|
|
711
|
-
sumstats
|
|
712
|
-
sumstats
|
|
854
|
+
categories = set(sumstats[ea])|set(sumstats[nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
|
|
855
|
+
sumstats[ea] = pd.Categorical(sumstats[ea],categories = categories)
|
|
856
|
+
sumstats[nea] = pd.Categorical(sumstats[nea],categories = categories )
|
|
713
857
|
sumstats.loc[variants_to_check,[pos,nea,ea,status]] = normalized_pd.values
|
|
714
858
|
try:
|
|
715
|
-
sumstats
|
|
859
|
+
sumstats[pos] = sumstats[pos].astype('Int64')
|
|
716
860
|
except:
|
|
717
|
-
sumstats
|
|
861
|
+
sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
|
|
718
862
|
|
|
719
|
-
|
|
863
|
+
finished(log,verbose,_end_line)
|
|
720
864
|
return sumstats
|
|
721
865
|
|
|
722
866
|
def normalizeallele(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
|
|
@@ -781,6 +925,52 @@ def add_tolerence(stats, float_tolerence, mode):
|
|
|
781
925
|
stats = (stats[0] , stats[1] + float_tolerence if stats[0]!=float("Inf") else float("Inf"))
|
|
782
926
|
return stats
|
|
783
927
|
|
|
928
|
+
|
|
929
|
+
def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, verbose, dtype="Int64"):
    '''
    Coerce one column to a numeric dtype and keep only the rows inside a closed range.

    Nothing is done unless `header` is listed in `coltocheck` and present in `sumstats`.
    For "STATUS" the column is only converted to a categorical and returned.
    For any other column the values are parsed with pd.to_numeric(errors='coerce')
    (and floored for integer dtypes), cast to `dtype`, and rows that are NA or outside
    [var_range[0], var_range[1]] are dropped.

    Parameters
    ----------
    sumstats : pd.DataFrame to check; the checked column is overwritten in place.
    var_range : (lower, upper) pair, both bounds inclusive.
    header : name of the column to check.
    coltocheck : collection of column names the caller asked to check.
    cols_to_check : list that receives `header` when the column is actually checked.
    log : logger object exposing write() and warning().
    verbose : whether progress messages are written.
    dtype : target dtype; integer and float spellings such as "Int64", "int64",
            "float32" or "Float64" are recognised.

    Returns
    -------
    pd.DataFrame : sumstats restricted to the valid rows (or unchanged when the
    column was not checked).
    '''
    pre_number=len(sumstats)
    if header not in coltocheck or header not in sumstats.columns:
        return sumstats

    cols_to_check.append(header)

    if header=="STATUS":
        if verbose: log.write(" -Checking STATUS and converting STATUS to categories....")
        # a list (not a set) keeps the category order deterministic between runs
        categories = [str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)]
        sumstats[header] = pd.Categorical(sumstats[header],categories=categories)
        return sumstats

    # "int64" used to be misspelled as "in64", so that dtype silently skipped the
    # numeric coercion and the range comparison ran on the raw column.
    if dtype in ("Int64","Int32","int","int32","int64"):
        if verbose: log.write(" -Checking if {} <= {} <= {} ...".format( var_range[0] ,header, var_range[1]))
        sumstats[header] = np.floor(pd.to_numeric(sumstats[header], errors='coerce')).astype(dtype)

    elif dtype in ("Float64","Float32","float","float64","float32"):
        log.write(" -Checking if {} < {} < {} ...".format( var_range[0] ,header, var_range[1]),verbose=verbose)
        sumstats[header] = pd.to_numeric(sumstats[header], errors='coerce').astype(dtype)

    is_valid = (sumstats[header]>=var_range[0]) & (sumstats[header]<=var_range[1])
    # comparisons against NA yield NA for nullable dtypes; treat those rows as invalid
    is_valid = is_valid.fillna(False)

    if header=="P":
        low_p_number = int((sumstats["P"] == 0).sum())
        if low_p_number >0:
            log.warning("Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(low_p_number))
            log.warning("Please consider using MLOG10P instead.")

    if int((~is_valid).sum())>0:
        try:
            # use whichever variant ID column exists; skip the ID examples when there is none
            id_to_use = next((col for col in ("SNPID","rsID") if col in sumstats.columns), None)
            if id_to_use is not None:
                invalid_ids = sumstats.loc[~is_valid, id_to_use].head().astype("string").fillna("NA")
                log.write(" -Examples of invalid variants({}): {} ...".format(id_to_use, ",".join(invalid_ids.to_list()) ), verbose=verbose)
            invalid_values = sumstats.loc[~is_valid, header].head().astype("string").fillna("NA")
            log.write(" -Examples of invalid values ({}): {} ...".format(header, ",".join(invalid_values.to_list()) ), verbose=verbose)
        except Exception:
            # the examples are informational only; never let them abort the filtering
            pass

    sumstats = sumstats.loc[is_valid,:]
    after_number=len(sumstats)
    log.write(" -Removed {} variants with bad/na {}.".format(pre_number - after_number, header), verbose=verbose)
    return sumstats
|
|
973
|
+
|
|
784
974
|
def sanitycheckstats(sumstats,
|
|
785
975
|
coltocheck=None,
|
|
786
976
|
n=(0,2**31-1),
|
|
@@ -788,8 +978,10 @@ def sanitycheckstats(sumstats,
|
|
|
788
978
|
ncontrol=(0,2**31-1),
|
|
789
979
|
eaf=(0,1),
|
|
790
980
|
mac=(0,2**31-1),
|
|
981
|
+
maf=(0,0.5),
|
|
791
982
|
chisq=(0,float("Inf")),
|
|
792
983
|
z=(-9999,9999),
|
|
984
|
+
t=(-99999,99999),
|
|
793
985
|
f=(0,float("Inf")),
|
|
794
986
|
p=(0,1),
|
|
795
987
|
mlog10p=(0,9999),
|
|
@@ -820,10 +1012,30 @@ def sanitycheckstats(sumstats,
|
|
|
820
1012
|
HR_95U: float64 , HR_95L >0
|
|
821
1013
|
INFO: float32 , 1>=INFO>0
|
|
822
1014
|
Z float64 , -9999 < Z < 9999
|
|
1015
|
+
T float64 , -99999 < T < 99999
|
|
823
1016
|
F float64 , F > 0
|
|
824
1017
|
'''
|
|
1018
|
+
##start function with col checking##########################################################
|
|
1019
|
+
_start_line = "perform sanity check for statistics"
|
|
1020
|
+
_end_line = "sanity check for statistics"
|
|
1021
|
+
_start_cols =[]
|
|
1022
|
+
_start_function = ".check_sanity()"
|
|
1023
|
+
_must_args ={}
|
|
825
1024
|
|
|
1025
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
1026
|
+
log=log,
|
|
1027
|
+
verbose=verbose,
|
|
1028
|
+
start_line=_start_line,
|
|
1029
|
+
end_line=_end_line,
|
|
1030
|
+
start_cols=_start_cols,
|
|
1031
|
+
start_function=_start_function,
|
|
1032
|
+
**_must_args)
|
|
1033
|
+
if is_enough_info == False: return sumstats
|
|
1034
|
+
############################################################################################
|
|
1035
|
+
|
|
1036
|
+
if verbose: log.write(" -Comparison tolerance for floats: {}".format(float_tolerence))
|
|
826
1037
|
eaf = add_tolerence(eaf, float_tolerence, "lr")
|
|
1038
|
+
maf = add_tolerence(maf, float_tolerence, "lr")
|
|
827
1039
|
beta = add_tolerence(beta, float_tolerence, "lr")
|
|
828
1040
|
se = add_tolerence(se, float_tolerence, "lr")
|
|
829
1041
|
mlog10p = add_tolerence(mlog10p, float_tolerence, "lr")
|
|
@@ -838,221 +1050,138 @@ def sanitycheckstats(sumstats,
|
|
|
838
1050
|
p = add_tolerence(p, float_tolerence, "lr")
|
|
839
1051
|
f = add_tolerence(f, float_tolerence, "lr")
|
|
840
1052
|
chisq = add_tolerence(chisq, float_tolerence, "lr")
|
|
841
|
-
|
|
842
|
-
|
|
1053
|
+
############################################################################################
|
|
843
1054
|
## add direction
|
|
844
1055
|
if coltocheck is None:
|
|
845
1056
|
coltocheck = ["P","MLOG10P","INFO","Z","BETA","SE","EAF","CHISQ","F","N","N_CASE","N_CONTROL","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","STATUS"]
|
|
846
|
-
|
|
847
|
-
if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
|
|
1057
|
+
|
|
848
1058
|
cols_to_check=[]
|
|
849
1059
|
oringinal_number=len(sumstats)
|
|
850
1060
|
sumstats = sumstats.copy()
|
|
851
1061
|
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
pre_number=len(sumstats)
|
|
911
|
-
if "CHISQ" in coltocheck and "CHISQ" in sumstats.columns:
|
|
912
|
-
cols_to_check.append("CHISQ")
|
|
913
|
-
if verbose: log.write(" -Checking if ",chisq[0],"<CHISQ<",chisq[1]," ...")
|
|
914
|
-
sumstats.loc[:,"CHISQ"] = pd.to_numeric(sumstats.loc[:,"CHISQ"], errors='coerce').astype("float64")
|
|
915
|
-
sumstats = sumstats.loc[(sumstats["CHISQ"]>chisq[0]) & (sumstats["CHISQ"]<chisq[1]),:]
|
|
916
|
-
after_number=len(sumstats)
|
|
917
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad CHISQ.")
|
|
918
|
-
|
|
919
|
-
pre_number=len(sumstats)
|
|
920
|
-
if "Z" in coltocheck and "Z" in sumstats.columns:
|
|
921
|
-
cols_to_check.append("Z")
|
|
922
|
-
if verbose: log.write(" -Checking if ",z[0],"<Z<",z[1]," ...")
|
|
923
|
-
sumstats.loc[:,"Z"] = pd.to_numeric(sumstats.loc[:,"Z"], errors='coerce').astype("float64")
|
|
924
|
-
sumstats = sumstats.loc[(sumstats["Z"]>z[0]) & (sumstats["Z"]<z[1]),:]
|
|
925
|
-
after_number=len(sumstats)
|
|
926
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad Z.")
|
|
927
|
-
|
|
928
|
-
pre_number=len(sumstats)
|
|
929
|
-
if "F" in coltocheck and "F" in sumstats.columns:
|
|
930
|
-
cols_to_check.append("F")
|
|
931
|
-
if verbose: log.write(" -Checking if ",f[0],"<F<",f[1]," ...")
|
|
932
|
-
sumstats.loc[:,"F"] = pd.to_numeric(sumstats.loc[:,"F"], errors='coerce').astype("float64")
|
|
933
|
-
sumstats = sumstats.loc[(sumstats["F"]>f[0]) & (sumstats["F"]<f[1]),:]
|
|
934
|
-
after_number=len(sumstats)
|
|
935
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad F.")
|
|
936
|
-
|
|
937
|
-
###P ################################################################################################################################################
|
|
938
|
-
pre_number=len(sumstats)
|
|
939
|
-
if "P" in coltocheck and "P" in sumstats.columns:
|
|
940
|
-
cols_to_check.append("P")
|
|
941
|
-
if verbose: log.write(" -Checking if ",p[0],"< P <",p[1]," ...")
|
|
942
|
-
sumstats.loc[:,"P"] = pd.to_numeric(sumstats.loc[:,"P"], errors='coerce').astype("float64")
|
|
943
|
-
sumstats = sumstats.loc[(sumstats["P"]>p[0]) & (sumstats["P"]<p[1]),:]
|
|
944
|
-
after_number=len(sumstats)
|
|
945
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad P.")
|
|
1062
|
+
###Int64 ################################################################################################################################################
|
|
1063
|
+
sumstats = check_range(sumstats, var_range=n, header="N", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
|
|
1064
|
+
sumstats = check_range(sumstats, var_range=ncase, header="N_CASE", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
|
|
1065
|
+
sumstats = check_range(sumstats, var_range=ncontrol, header="N_CONTROL", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
|
|
1066
|
+
|
|
1067
|
+
###float32 ################################################################################################################################################
|
|
1068
|
+
sumstats = check_range(sumstats, var_range=eaf, header="EAF", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
|
|
1069
|
+
sumstats = check_range(sumstats, var_range=maf, header="MAF", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
|
|
1070
|
+
sumstats = check_range(sumstats, var_range=info, header="INFO", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
|
|
1071
|
+
|
|
1072
|
+
###float64 ################################################################################################################################################
|
|
1073
|
+
sumstats = check_range(sumstats, var_range=chisq, header="CHISQ", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1074
|
+
sumstats = check_range(sumstats, var_range=z, header="Z", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1075
|
+
sumstats = check_range(sumstats, var_range=t, header="T", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1076
|
+
sumstats = check_range(sumstats, var_range=f, header="F", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1077
|
+
sumstats = check_range(sumstats, var_range=p, header="P", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1078
|
+
sumstats = check_range(sumstats, var_range=mlog10p, header="MLOG10P", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1079
|
+
sumstats = check_range(sumstats, var_range=beta, header="BETA", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1080
|
+
sumstats = check_range(sumstats, var_range=se, header="SE", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1081
|
+
sumstats = check_range(sumstats, var_range=OR, header="OR", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1082
|
+
sumstats = check_range(sumstats, var_range=OR_95L, header="OR_95L", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1083
|
+
sumstats = check_range(sumstats, var_range=OR_95U, header="OR_95U", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1084
|
+
sumstats = check_range(sumstats, var_range=HR, header="HR", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1085
|
+
sumstats = check_range(sumstats, var_range=HR_95L, header="HR_95L", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1086
|
+
sumstats = check_range(sumstats, var_range=HR_95U, header="HR_95U", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1087
|
+
###STATUS ###############################################################################################################################################
|
|
1088
|
+
sumstats = check_range(sumstats, var_range=None, header="STATUS", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="category")
|
|
1089
|
+
|
|
1090
|
+
after_number=len(sumstats)
|
|
1091
|
+
log.write(" -Removed "+str(oringinal_number - after_number)+" variants with bad statistics in total.",verbose=verbose)
|
|
1092
|
+
log.write(" -Data types for each column:",verbose=verbose)
|
|
1093
|
+
check_datatype(sumstats,verbose=verbose, log=log)
|
|
1094
|
+
finished(log,verbose,_end_line)
|
|
1095
|
+
return sumstats
|
|
1096
|
+
|
|
1097
|
+
### check consistency #############################################################################################################################################
|
|
1098
|
+
|
|
1099
|
+
def _check_data_consistency(sumstats, beta="BETA", se="SE", p="P",mlog10p="MLOG10P",rtol=1e-3, atol=1e-3, equal_nan=True, verbose=True,log=Log()):
    """Log whether redundant statistics columns of ``sumstats`` agree.

    Up to four checks run, each only when the columns it needs exist:

    * BETA/SE-derived MLOG10P against MLOG10P
    * BETA/SE-derived P against P
    * MLOG10P-derived P against P
    * N against N_CASE + N_CONTROL (exact equality, no tolerance)

    The first three use ``np.isclose`` with ``rtol``/``atol``/``equal_nan``.
    ``sumstats`` is only read; results are written to ``log``.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Summary statistics table. Must contain ``SNPID`` or ``rsID`` so the
        worst-offending variant can be named in the log.
    beta, se, p, mlog10p : str
        Accepted for interface compatibility. The body reads the fixed
        column names "BETA", "SE", "P" and "MLOG10P" and does not use
        these arguments.
    rtol, atol : float
        Relative and absolute tolerance forwarded to ``np.isclose``.
    equal_nan : bool
        Forwarded to ``np.isclose``; for the N check it decides whether a row
        where both N and N_CASE + N_CONTROL are missing counts as consistent.
    verbose : bool
        Forwarded to every ``log.write`` call.
    log : Log
        Logger receiving all messages.

    Returns
    -------
    ``sumstats`` when ``start_to`` reports that the check cannot start,
    ``0`` when neither ``SNPID`` nor ``rsID`` is present, otherwise ``None``.
    These historical return values are kept unchanged for callers.
    """
    ##start function with col checking##########################################################
    _start_line = "check data consistency across columns"
    _end_line = "checking data consistency across columns"
    _start_cols =[]
    _start_function = ".check_data_consistency()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return sumstats
    ############################################################################################

    log.write(" -Tolerance: {} (Relative) and {} (Absolute)".format(rtol, atol),verbose=verbose)
    checked_any = False

    # Pick the identifier column used to name the worst variant in the log.
    if "SNPID" in sumstats.columns:
        id_to_use = "SNPID"
    elif "rsID" in sumstats.columns:
        id_to_use = "rsID"
    else:
        log.write(" -SNPID/rsID not available...Skipping",verbose=verbose)
        log.write("Finished checking data consistency across columns.",verbose=verbose)
        return 0

    if "BETA" in sumstats.columns and "SE" in sumstats.columns:
        if "MLOG10P" in sumstats.columns:
            log.write(" -Checking if BETA/SE-derived-MLOG10P is consistent with MLOG10P...",verbose=verbose)
            betase_derived_mlog10p = _convert_betase_to_mlog10p(sumstats["BETA"], sumstats["SE"])
            is_close = np.isclose(betase_derived_mlog10p, sumstats["MLOG10P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
            diff = betase_derived_mlog10p - sumstats["MLOG10P"]
            _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose)
            checked_any = True

        if "P" in sumstats.columns:
            log.write(" -Checking if BETA/SE-derived-P is consistent with P...",verbose=verbose)
            betase_derived_p = _convert_betase_to_p(sumstats["BETA"], sumstats["SE"])
            is_close = np.isclose(betase_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
            diff = betase_derived_p - sumstats["P"]
            _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose)
            checked_any = True

    if "MLOG10P" in sumstats.columns and "P" in sumstats.columns:
        log.write(" -Checking if MLOG10P-derived-P is consistent with P...",verbose=verbose)
        mlog10p_derived_p = _convert_mlog10p_to_p(sumstats["MLOG10P"])
        is_close = np.isclose(mlog10p_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
        diff = mlog10p_derived_p - sumstats["P"]
        _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose)
        checked_any = True

    if "N" in sumstats.columns and "N_CONTROL" in sumstats.columns and "N_CASE" in sumstats.columns:
        log.write(" -Checking if N is consistent with N_CASE + N_CONTROL ...",verbose=verbose)
        expected_n = sumstats["N_CASE"] + sumstats["N_CONTROL"]
        # Nullable integer columns yield <NA> where a count is missing; an
        # <NA> in the mask would make the later count/branch raise, so
        # resolve missing comparisons to an explicit True/False here.
        is_close = (sumstats["N"] == expected_n).fillna(False).to_numpy(dtype=bool)
        if equal_nan:
            both_missing = (sumstats["N"].isna() & expected_n.isna()).to_numpy(dtype=bool)
            is_close = is_close | both_missing
        diff = sumstats["N"] - expected_n
        _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose)
        checked_any = True

    if checked_any:
        log.write(" -Note: if the max difference is greater than expected, please check your original sumstats.",verbose=verbose)
    else:
        log.write(" -No available columns for data consistency checking...Skipping...",verbose=verbose)
    finished(log,verbose,_end_line)


def _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose):
    """Write the outcome of one consistency check to ``log``.

    ``is_close`` is a boolean mask (True = consistent) and ``diff`` the
    signed per-variant difference as a pandas Series. The variant reported
    is the one with the largest absolute difference, so deviations in
    either direction are caught.
    """
    n_inconsistent = int((~np.asarray(is_close, dtype=bool)).sum())
    if n_inconsistent == 0:
        log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
        return
    log.write(" -Not consistent: {} variant(s)".format(n_inconsistent),verbose=verbose)
    abs_diff = diff.abs()
    # When every difference is missing there is no row to single out.
    if abs_diff.notna().any():
        worst = abs_diff.idxmax()
        log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[worst,id_to_use], abs_diff.max()),verbose=verbose)
|
|
1056
1185
|
|
|
1057
1186
|
###############################################################################################################
|
|
1058
1187
|
# 20220426
|
|
@@ -1076,11 +1205,81 @@ def flip_direction(string):
|
|
|
1076
1205
|
else: #sometime it is 0
|
|
1077
1206
|
flipped_string+=char
|
|
1078
1207
|
return flipped_string
|
|
1079
|
-
|
|
1208
|
+
|
|
1209
|
+
def flip_by_swap(sumstats, matched_index, log, verbose):
    """Exchange the EA and NEA values on the selected rows.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Table to modify in place.
    matched_index
        Row selector handed to ``sumstats.loc`` (callers in this file pass
        a boolean mask).
    log
        Logger; only used when ``verbose`` is truthy.
    verbose : bool
        Whether to log the swap.

    Returns
    -------
    The same ``sumstats`` object. It is untouched when either allele column
    is missing.
    """
    if "NEA" not in sumstats.columns or "EA" not in sumstats.columns:
        return sumstats

    if verbose:
        log.write(" -Swapping column: NEA <=> EA...")

    # Read both columns in (EA, NEA) order as a plain array, then write them
    # back in (NEA, EA) order so the values trade places.
    alleles_ea_first = sumstats.loc[matched_index, ['EA', 'NEA']].values
    sumstats.loc[matched_index, ['NEA', 'EA']] = alleles_ea_first
    return sumstats
|
|
1214
|
+
|
|
1215
|
+
def flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Replace ratio statistics by ``factor / value`` on the selected rows.

    OR and HR are inverted in place. Their confidence bounds trade places
    as well as being inverted: the new upper bound is ``factor / old lower``
    and the new lower bound is ``factor / old upper``.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Table to modify in place.
    matched_index
        Row selector handed to ``sumstats.loc`` (callers in this file pass
        a boolean mask).
    log
        Logger; only used when ``verbose`` is truthy.
    verbose : bool
        Whether to log each flipped column.
    cols
        Not read by this function.
    factor
        Numerator of the inversion (default 1).

    Returns
    -------
    The same ``sumstats`` object.
    """
    for point, lower, upper in (("OR", "OR_95L", "OR_95U"), ("HR", "HR_95L", "HR_95U")):
        if point in sumstats.columns:
            if verbose:
                log.write(" -Flipping column: {0} = 1 / {0}...".format(point))
            sumstats.loc[matched_index, point] = factor / sumstats.loc[matched_index, point].values

        # Capture both bounds BEFORE writing either one. Writing the upper
        # bound first and then deriving the lower bound from it would just
        # reproduce the old lower bound and lose the old upper bound.
        has_lower = lower in sumstats.columns
        has_upper = upper in sumstats.columns
        old_lower = sumstats.loc[matched_index, lower].values.copy() if has_lower else None
        old_upper = sumstats.loc[matched_index, upper].values.copy() if has_upper else None

        if has_lower:
            if verbose:
                log.write(" -Flipping column: {} = 1 / {}...".format(upper, lower))
            sumstats.loc[matched_index, upper] = factor / old_lower
        if has_upper:
            if verbose:
                log.write(" -Flipping column: {} = 1 / {}...".format(lower, upper))
            sumstats.loc[matched_index, lower] = factor / old_upper
    return sumstats
|
|
1235
|
+
|
|
1236
|
+
def flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Replace EAF by ``factor - EAF`` on the selected rows.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Table to modify in place.
    matched_index
        Row selector handed to ``sumstats.loc`` (callers in this file pass
        a boolean mask).
    log
        Logger; only used when ``verbose`` is truthy.
    verbose : bool
        Whether to log the flip.
    cols
        Not read by this function.
    factor
        Value the frequency is subtracted from (default 1).

    Returns
    -------
    The same ``sumstats`` object. It is untouched when there is no EAF column.
    """
    if "EAF" not in sumstats.columns:
        return sumstats

    if verbose:
        log.write(" -Flipping column: EAF = 1 - EAF...")

    current_eaf = sumstats.loc[matched_index, "EAF"].values
    sumstats.loc[matched_index, "EAF"] = factor - current_eaf
    return sumstats
|
|
1241
|
+
|
|
1242
|
+
def flip_by_sign(sumstats, matched_index, log, verbose, cols=None):
    """Negate signed effect statistics on the selected rows.

    BETA, Z and T are negated in place. The BETA confidence bounds trade
    places as well as changing sign: the new upper bound is ``-old lower``
    and the new lower bound is ``-old upper``. DIRECTION strings are
    rewritten with ``flip_direction``.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Table to modify in place.
    matched_index
        Row selector handed to ``sumstats.loc`` (callers in this file pass
        a boolean mask).
    log
        Logger; only used when ``verbose`` is truthy.
    verbose : bool
        Whether to log each flipped column.
    cols
        Not read by this function.

    Returns
    -------
    The same ``sumstats`` object.
    """
    if "BETA" in sumstats.columns:
        if verbose:
            log.write(" -Flipping column: BETA = - BETA...")
        sumstats.loc[matched_index, "BETA"] = - sumstats.loc[matched_index, "BETA"].values

    # Capture both bounds BEFORE writing either one. Writing BETA_95U first
    # and then deriving BETA_95L from it would just reproduce the old lower
    # bound and lose the old upper bound.
    has_lower = "BETA_95L" in sumstats.columns
    has_upper = "BETA_95U" in sumstats.columns
    old_lower = sumstats.loc[matched_index, "BETA_95L"].values.copy() if has_lower else None
    old_upper = sumstats.loc[matched_index, "BETA_95U"].values.copy() if has_upper else None
    if has_lower:
        if verbose:
            log.write(" -Flipping column: BETA_95U = - BETA_95L...")
        sumstats.loc[matched_index, "BETA_95U"] = - old_lower
    if has_upper:
        if verbose:
            log.write(" -Flipping column: BETA_95L = - BETA_95U...")
        sumstats.loc[matched_index, "BETA_95L"] = - old_upper

    if "Z" in sumstats.columns:
        if verbose:
            log.write(" -Flipping column: Z = - Z...")
        sumstats.loc[matched_index, "Z"] = - sumstats.loc[matched_index, "Z"].values
    if "T" in sumstats.columns:
        if verbose:
            log.write(" -Flipping column: T = - T...")
        # The negated T statistic goes back into T; writing it into Z would
        # overwrite Z and leave T unflipped.
        sumstats.loc[matched_index, "T"] = - sumstats.loc[matched_index, "T"].values
    if "DIRECTION" in sumstats.columns:
        if verbose:
            log.write(" -Flipping column: DIRECTION +-?0 <=> -+?0 ...")
        sumstats.loc[matched_index, "DIRECTION"] = sumstats.loc[matched_index, "DIRECTION"].apply(flip_direction)
    return sumstats
|
|
1262
|
+
|
|
1080
1263
|
def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1264
|
+
##start function with col checking##########################################################
|
|
1265
|
+
_start_line = "adjust statistics based on STATUS code"
|
|
1266
|
+
_end_line = "adjusting statistics based on STATUS code"
|
|
1267
|
+
_start_cols =[]
|
|
1268
|
+
_start_function = ".check_data_consistency()"
|
|
1269
|
+
_must_args ={}
|
|
1270
|
+
|
|
1271
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
1272
|
+
log=log,
|
|
1273
|
+
verbose=verbose,
|
|
1274
|
+
start_line=_start_line,
|
|
1275
|
+
end_line=_end_line,
|
|
1276
|
+
start_cols=_start_cols,
|
|
1277
|
+
start_function=_start_function,
|
|
1278
|
+
**_must_args)
|
|
1279
|
+
if is_enough_info == False: return sumstats
|
|
1280
|
+
############################################################################################
|
|
1281
|
+
|
|
1282
|
+
if_stats_flipped = False
|
|
1084
1283
|
###################get reverse complementary####################
|
|
1085
1284
|
pattern = r"\w\w\w\w\w[45]\w"
|
|
1086
1285
|
#matched_index = status_match(sumstats[status],6,[4,5]) #
|
|
@@ -1092,107 +1291,49 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
|
1092
1291
|
if verbose: log.write(" -Converting to reverse complement : EA and NEA...")
|
|
1093
1292
|
reverse_complement_nea = sumstats.loc[matched_index,'NEA'].apply(lambda x :get_reverse_complementary_allele(x))
|
|
1094
1293
|
reverse_complement_ea = sumstats.loc[matched_index,'EA'].apply(lambda x :get_reverse_complementary_allele(x))
|
|
1095
|
-
categories = set(sumstats
|
|
1096
|
-
sumstats
|
|
1097
|
-
sumstats
|
|
1294
|
+
categories = set(sumstats['EA'])|set(sumstats['NEA']) |set(reverse_complement_ea) |set(reverse_complement_nea)
|
|
1295
|
+
sumstats['EA']=pd.Categorical(sumstats['EA'],categories = categories)
|
|
1296
|
+
sumstats['NEA']=pd.Categorical(sumstats['NEA'],categories = categories )
|
|
1098
1297
|
sumstats.loc[matched_index,['NEA']] = reverse_complement_nea
|
|
1099
1298
|
sumstats.loc[matched_index,['EA']] = reverse_complement_ea
|
|
1100
1299
|
sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 6, "4","2")
|
|
1101
1300
|
if verbose: log.write(" -Changed the status for flipped variants : xxxxx4x -> xxxxx2x")
|
|
1102
|
-
|
|
1301
|
+
if_stats_flipped = True
|
|
1103
1302
|
###################flip ref####################
|
|
1104
1303
|
pattern = r"\w\w\w\w\w[35]\w"
|
|
1105
1304
|
#matched_index = status_match(sumstats[status],6,[3,5]) #sumstats[status].str.match(pattern)
|
|
1106
1305
|
matched_index = sumstats[status].str[5].str.match(r"3|5")
|
|
1107
1306
|
if sum(matched_index)>0:
|
|
1108
|
-
if verbose: log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x:
|
|
1307
|
+
if verbose: log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x: ALT->EA , REF->NEA ...{}".format(_get_version()))
|
|
1109
1308
|
if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
if "BETA_95L" in sumstats.columns:
|
|
1117
|
-
if verbose: log.write(" -Flipping column: BETA_95L = - BETA_95L...")
|
|
1118
|
-
sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
|
|
1119
|
-
if "BETA_95U" in sumstats.columns:
|
|
1120
|
-
if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
|
|
1121
|
-
sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
|
|
1122
|
-
if "EAF" in sumstats.columns:
|
|
1123
|
-
if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
|
|
1124
|
-
sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
|
|
1125
|
-
if "OR" in sumstats.columns:
|
|
1126
|
-
if verbose: log.write(" -Flipping column: OR = 1 / OR...")
|
|
1127
|
-
sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
|
|
1128
|
-
if "OR_95L" in sumstats.columns:
|
|
1129
|
-
if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
|
|
1130
|
-
sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
|
|
1131
|
-
if "OR_95U" in sumstats.columns:
|
|
1132
|
-
if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
|
|
1133
|
-
sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
|
|
1134
|
-
if "HR" in sumstats.columns:
|
|
1135
|
-
if verbose: log.write(" -Flipping column: HR = 1 / HR...")
|
|
1136
|
-
sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
|
|
1137
|
-
if "HR_95L" in sumstats.columns:
|
|
1138
|
-
if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
|
|
1139
|
-
sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
|
|
1140
|
-
if "HR_95U" in sumstats.columns:
|
|
1141
|
-
if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
|
|
1142
|
-
sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
|
|
1143
|
-
if "DIRECTION" in sumstats.columns:
|
|
1144
|
-
if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
|
|
1145
|
-
sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
|
|
1309
|
+
|
|
1310
|
+
flip_by_swap(sumstats, matched_index, log, verbose)
|
|
1311
|
+
flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
|
|
1312
|
+
flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1313
|
+
flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1314
|
+
|
|
1146
1315
|
#change status
|
|
1147
1316
|
if verbose: log.write(" -Changed the status for flipped variants : xxxxx[35]x -> xxxxx[12]x")
|
|
1148
1317
|
sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 6, "35","12")
|
|
1318
|
+
if_stats_flipped = True
|
|
1149
1319
|
|
|
1150
1320
|
###################flip ref for undistingushable indels####################
|
|
1151
1321
|
pattern = r"\w\w\w\w[123][67]6"
|
|
1152
1322
|
#matched_index = status_match(sumstats[status],6,[1,2,3])|status_match(sumstats[status],6,[6,7])|status_match(sumstats[status],7,6) #sumstats[status].str.match(pattern)
|
|
1153
1323
|
matched_index = sumstats[status].str[4:].str.match(r"[123][67]6")
|
|
1154
1324
|
if sum(matched_index)>0:
|
|
1155
|
-
if verbose: log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]:
|
|
1325
|
+
if verbose: log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]: ALT->EA , REF->NEA...{}".format(_get_version()))
|
|
1156
1326
|
if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
if "BETA_95L" in sumstats.columns:
|
|
1164
|
-
if verbose: log.write(" -Flipping column: BETA_95L = - BETA_95L...")
|
|
1165
|
-
sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
|
|
1166
|
-
if "BETA_95U" in sumstats.columns:
|
|
1167
|
-
if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
|
|
1168
|
-
sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
|
|
1169
|
-
if "EAF" in sumstats.columns:
|
|
1170
|
-
if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
|
|
1171
|
-
sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
|
|
1172
|
-
if "OR" in sumstats.columns:
|
|
1173
|
-
if verbose: log.write(" -Flipping column: OR = 1 / OR...")
|
|
1174
|
-
sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
|
|
1175
|
-
if "OR_95L" in sumstats.columns:
|
|
1176
|
-
if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
|
|
1177
|
-
sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
|
|
1178
|
-
if "OR_95U" in sumstats.columns:
|
|
1179
|
-
if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
|
|
1180
|
-
sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
|
|
1181
|
-
if "HR" in sumstats.columns:
|
|
1182
|
-
if verbose: log.write(" -Flipping column: HR = 1 / HR...")
|
|
1183
|
-
sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
|
|
1184
|
-
if "HR_95L" in sumstats.columns:
|
|
1185
|
-
if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
|
|
1186
|
-
sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
|
|
1187
|
-
if "HR_95U" in sumstats.columns:
|
|
1188
|
-
if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
|
|
1189
|
-
sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
|
|
1190
|
-
if "DIRECTION" in sumstats.columns:
|
|
1191
|
-
if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
|
|
1192
|
-
sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
|
|
1327
|
+
|
|
1328
|
+
flip_by_swap(sumstats, matched_index, log, verbose)
|
|
1329
|
+
flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
|
|
1330
|
+
flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1331
|
+
flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1332
|
+
|
|
1193
1333
|
#change status
|
|
1194
1334
|
if verbose: log.write(" -Changed the status for flipped variants xxxx[123][67]6 -> xxxx[123][67]4")
|
|
1195
1335
|
sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 7, "6","4")
|
|
1336
|
+
if_stats_flipped = True
|
|
1196
1337
|
# flip ref
|
|
1197
1338
|
###################flip statistics for reverse strand panlindromic variants####################
|
|
1198
1339
|
pattern = r"\w\w\w\w\w[012]5"
|
|
@@ -1201,43 +1342,20 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
|
1201
1342
|
if sum(matched_index)>0:
|
|
1202
1343
|
if verbose: log.write("Start to flip allele-specific stats for palindromic SNPs with status xxxxx[12]5: (-)strand <=> (+)strand...{}".format(_get_version()))
|
|
1203
1344
|
if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
|
|
1210
|
-
if "BETA_95U" in sumstats.columns:
|
|
1211
|
-
if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
|
|
1212
|
-
sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
|
|
1213
|
-
if "EAF" in sumstats.columns:
|
|
1214
|
-
if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
|
|
1215
|
-
sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
|
|
1216
|
-
if "OR" in sumstats.columns:
|
|
1217
|
-
if verbose: log.write(" -Flipping column: OR = 1 / OR...")
|
|
1218
|
-
sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
|
|
1219
|
-
if "OR_95L" in sumstats.columns:
|
|
1220
|
-
if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
|
|
1221
|
-
sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
|
|
1222
|
-
if "OR_95U" in sumstats.columns:
|
|
1223
|
-
if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
|
|
1224
|
-
sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
|
|
1225
|
-
if "HR" in sumstats.columns:
|
|
1226
|
-
if verbose: log.write(" -Flipping column: HR = 1 / HR...")
|
|
1227
|
-
sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
|
|
1228
|
-
if "HR_95L" in sumstats.columns:
|
|
1229
|
-
if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
|
|
1230
|
-
sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
|
|
1231
|
-
if "HR_95U" in sumstats.columns:
|
|
1232
|
-
if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
|
|
1233
|
-
sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
|
|
1234
|
-
if "DIRECTION" in sumstats.columns:
|
|
1235
|
-
if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
|
|
1236
|
-
sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
|
|
1345
|
+
|
|
1346
|
+
flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
|
|
1347
|
+
flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1348
|
+
flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1349
|
+
|
|
1237
1350
|
#change status
|
|
1238
1351
|
if verbose: log.write(" -Changed the status for flipped variants: xxxxx[012]5: -> xxxxx[012]2")
|
|
1239
1352
|
sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 7, "5","2")
|
|
1240
|
-
|
|
1353
|
+
if_stats_flipped = True
|
|
1354
|
+
|
|
1355
|
+
if if_stats_flipped == True:
|
|
1356
|
+
finished(log, verbose, "adjusting")
|
|
1357
|
+
else:
|
|
1358
|
+
finished(log, verbose, "adjusting with no statistics changed.")
|
|
1241
1359
|
return sumstats
|
|
1242
1360
|
""
|
|
1243
1361
|
|
|
@@ -1246,8 +1364,8 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
|
1246
1364
|
# 20220426
|
|
1247
1365
|
def liftover_snv(row,chrom,converter,to_build):
|
|
1248
1366
|
status_pre=""
|
|
1249
|
-
status_end=row[1][2]+"9"+row[1][4]+"99"
|
|
1250
|
-
pos_0_based = int(row[0]) - 1
|
|
1367
|
+
status_end=row.iloc[1][2]+"9"+row.iloc[1][4]+"99"
|
|
1368
|
+
pos_0_based = int(row.iloc[0]) - 1
|
|
1251
1369
|
results = converter[chrom][pos_0_based]
|
|
1252
1370
|
if converter[chrom][pos_0_based]:
|
|
1253
1371
|
# return chrom, pos_1_based
|
|
@@ -1277,13 +1395,25 @@ def liftover_variant(sumstats,
|
|
|
1277
1395
|
return sumstats
|
|
1278
1396
|
|
|
1279
1397
|
def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True, verbose=True,log=Log()):
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1398
|
+
##start function with col checking##########################################################
|
|
1399
|
+
_start_line = "perform liftover"
|
|
1400
|
+
_end_line = "liftover"
|
|
1401
|
+
_start_cols =[chrom,pos,status]
|
|
1402
|
+
_start_function = ".liftover()"
|
|
1403
|
+
_must_args ={}
|
|
1404
|
+
|
|
1405
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
1406
|
+
log=log,
|
|
1407
|
+
verbose=verbose,
|
|
1408
|
+
start_line=_start_line,
|
|
1409
|
+
end_line=_end_line,
|
|
1410
|
+
start_cols=_start_cols,
|
|
1411
|
+
start_function=_start_function,
|
|
1412
|
+
n_cores=n_cores,
|
|
1413
|
+
**_must_args)
|
|
1414
|
+
if is_enough_info == False: return sumstats
|
|
1415
|
+
############################################################################################
|
|
1416
|
+
|
|
1287
1417
|
if verbose: log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build)
|
|
1288
1418
|
# valid chr and pos
|
|
1289
1419
|
pattern = r"\w\w\w0\w\w\w"
|
|
@@ -1295,11 +1425,12 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
|
|
|
1295
1425
|
if sum(to_lift)<10000:
|
|
1296
1426
|
n_cores=1
|
|
1297
1427
|
|
|
1298
|
-
df_split = np.array_split(sumstats
|
|
1428
|
+
#df_split = np.array_split(sumstats[[chrom,pos,status]], n_cores)
|
|
1429
|
+
df_split = _df_split(sumstats[[chrom,pos,status]], n_cores)
|
|
1299
1430
|
pool = Pool(n_cores)
|
|
1300
1431
|
#df = pd.concat(pool.starmap(func, df_split))
|
|
1301
1432
|
func=liftover_variant
|
|
1302
|
-
sumstats
|
|
1433
|
+
sumstats[[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status),df_split))
|
|
1303
1434
|
pool.close()
|
|
1304
1435
|
pool.join()
|
|
1305
1436
|
############################################################################
|
|
@@ -1314,18 +1445,29 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
|
|
|
1314
1445
|
sumstats = fixchr(sumstats,chrom=chrom,add_prefix="",remove=remove, verbose=True)
|
|
1315
1446
|
sumstats = fixpos(sumstats,pos=pos,remove=remove, verbose=True)
|
|
1316
1447
|
|
|
1317
|
-
|
|
1448
|
+
finished(log,verbose,_end_line)
|
|
1318
1449
|
return sumstats
|
|
1319
1450
|
|
|
1320
1451
|
###############################################################################################################
|
|
1321
1452
|
# 20220426
|
|
1322
1453
|
def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=Log()):
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
|
|
1454
|
+
##start function with col checking##########################################################
|
|
1455
|
+
_start_line = "sort the genome coordinates"
|
|
1456
|
+
_end_line = "sorting coordinates"
|
|
1457
|
+
_start_cols =[chrom,pos]
|
|
1458
|
+
_start_function = ".sort_coordinate()"
|
|
1459
|
+
_must_args ={}
|
|
1460
|
+
|
|
1461
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
1462
|
+
log=log,
|
|
1463
|
+
verbose=verbose,
|
|
1464
|
+
start_line=_start_line,
|
|
1465
|
+
end_line=_end_line,
|
|
1466
|
+
start_cols=_start_cols,
|
|
1467
|
+
start_function=_start_function,
|
|
1468
|
+
**_must_args)
|
|
1469
|
+
if is_enough_info == False: return sumstats
|
|
1470
|
+
############################################################################################
|
|
1329
1471
|
|
|
1330
1472
|
try:
|
|
1331
1473
|
if sumstats[pos].dtype == "Int64":
|
|
@@ -1335,49 +1477,144 @@ def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=
|
|
|
1335
1477
|
sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
|
|
1336
1478
|
except:
|
|
1337
1479
|
pass
|
|
1338
|
-
|
|
1339
|
-
if verbose: log.write(" -Sorting genome coordinates...")
|
|
1340
1480
|
sumstats = sumstats.sort_values(by=[chrom,pos],ascending=True,ignore_index=True)
|
|
1341
|
-
|
|
1342
|
-
|
|
1481
|
+
|
|
1482
|
+
finished(log,verbose,_end_line)
|
|
1343
1483
|
return sumstats
|
|
1344
1484
|
###############################################################################################################
|
|
1345
1485
|
# 20230430 added HR HR_95 BETA_95 N_CASE N_CONTROL
|
|
1346
|
-
def sortcolumn(sumstats,verbose=True,log=Log(),order =
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1486
|
+
def sortcolumn(sumstats,verbose=True,log=Log(),order = None):
    """Reorder the columns of ``sumstats`` into a preferred order.

    Columns listed in ``order`` are placed first (only those actually present
    in ``sumstats``, in the order given); every remaining column follows in
    its existing relative order.

    Parameters
    ----------
    sumstats : DataFrame whose columns are reordered.
    verbose : forwarded to the logging helpers; also gates the column listing.
    log : logger object providing ``write``.
    order : list of column names to put first; when None the built-in
        default order below is used.

    Returns
    -------
    The reordered DataFrame, or ``sumstats`` unchanged when ``start_to``
    reports that the step cannot run.
    """
    ##start function with col checking##########################################################
    closing_label = "reordering the columns"
    # no specific columns are required for this step (empty start_cols)
    can_run = start_to(sumstats=sumstats,
                       log=log,
                       verbose=verbose,
                       start_line="reorder the columns",
                       end_line=closing_label,
                       start_cols=[],
                       start_function=".sort_column()")
    if not can_run:
        return sumstats
    ############################################################################################

    if order is None:
        order = [
            "SNPID","rsID", "CHR", "POS", "EA", "NEA", "EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z","T","F",
            "CHISQ", "P", "MLOG10P", "OR", "OR_95L", "OR_95U","HR", "HR_95L", "HR_95U","INFO", "N","N_CASE","N_CONTROL","DIRECTION","I2","P_HET","DOF","SNPR2","STATUS"]

    # preferred columns first, then whatever else the table carries
    preferred = [name for name in order if name in sumstats.columns]
    leftover = [name for name in sumstats.columns if name not in order]
    output_columns = preferred + leftover

    if verbose:
        log.write(" -Reordering columns to :", ",".join(output_columns))
    sumstats = sumstats[output_columns]

    finished(log, verbose, closing_label)
    return sumstats
|
|
1362
1519
|
|
|
1363
|
-
|
|
1520
|
+
|
|
1521
|
+
###############################################################################################################
|
|
1522
|
+
def start_to(sumstats,
             log,
             verbose,
             start_line,
             end_line,
             start_cols,
             start_function,
             ref_vcf=None,
             ref_fasta=None,
             n_cores=None,
             ref_tsv=None,
             **args
             ):
    """Common preamble for a processing step: log the start and validate inputs.

    Writes the "Start to ..." line, calls ``check_dataframe_shape`` on the
    table, checks that the required columns exist via ``check_col`` and that
    every extra keyword argument is not None via ``check_arg``.

    Parameters
    ----------
    sumstats : DataFrame about to be processed (only ``.columns`` is read here).
    log : logger object providing ``write`` (and ``warning`` for the helpers).
    verbose : verbosity flag forwarded to every log call.
    start_line : text inserted into the "Start to {}..." message.
    end_line : text used in the "Skipped {}." message when validation fails.
    start_cols : required columns, in the format accepted by ``check_col``.
    start_function : user-facing function name shown in warnings.
    ref_vcf, ref_fasta, n_cores, ref_tsv : optional values that are only
        echoed to the log when not None.
    **args : mandatory arguments of the calling step; each must be non-None.

    Returns
    -------
    bool
        True when all required columns and arguments are available,
        otherwise False (after logging that the step was skipped).
    """
    log.write("Start to {}...{}".format(start_line,_get_version()), verbose=verbose)

    check_dataframe_shape(sumstats=sumstats,
                          log=log,
                          verbose=verbose)

    is_enough_col = check_col(sumstats.columns,
                              verbose=verbose,
                              log=log,
                              cols=start_cols,
                              function=start_function)

    if is_enough_col:
        # Echo the run settings; honour the caller's verbosity like every
        # other message written by this helper.
        if n_cores is not None:
            log.write(" -Number of threads/cores to use: {}".format(n_cores), verbose=verbose)
        if ref_vcf is not None:
            log.write(" -Reference VCF: {}".format(ref_vcf), verbose=verbose)
        if ref_fasta is not None:
            log.write(" -Reference FASTA: {}".format(ref_fasta), verbose=verbose)
        if ref_tsv is not None:
            log.write(" -Reference TSV: {}".format(ref_tsv), verbose=verbose)

    # Check every mandatory argument (no short-circuit) so that each missing
    # one produces its own warning.
    is_args_valid = True
    for key, value in args.items():
        is_args_valid = check_arg(log, verbose, key, value, start_function) and is_args_valid
    is_enough_col = bool(is_args_valid and is_enough_col)

    if not is_enough_col:
        skipped(log, verbose, end_line)

    return is_enough_col
|
|
1567
|
+
|
|
1568
|
+
def finished(log, verbose, end_line):
    """Log the closing "Finished <end_line>." message, then run a garbage collection."""
    closing_message = "Finished {}.".format(end_line)
    log.write(closing_message, verbose=verbose)
    # explicit collection after the step has completed
    gc.collect()
|
|
1571
|
+
|
|
1572
|
+
def skipped(log, verbose, end_line):
    """Log the "Skipped <end_line>." message for a step that did not run, then run a garbage collection."""
    skip_message = "Skipped {}.".format(end_line)
    log.write(skip_message, verbose=verbose)
    # explicit collection, mirroring the normal completion path
    gc.collect()
|
|
1575
|
+
|
|
1576
|
+
def check_arg(log, verbose, key, value, function):
    """Return whether a mandatory argument was supplied (i.e. is not None).

    When ``value`` is None a warning naming the argument ``key`` and the
    calling ``function`` is sent to ``log.warning``. ``verbose`` is accepted
    for signature symmetry with the other helpers but is not used here.
    """
    is_provided = value is not None
    if not is_provided:
        log.warning("Necessary argument {} for {} is not provided!".format(key, function))
    return is_provided
|
|
1581
|
+
|
|
1582
|
+
def check_col(df_col_names, verbose=True, log=Log(), cols=None, function=None):
    """Check that every required column name is present.

    Parameters
    ----------
    df_col_names : container of the available column names (membership is
        tested with ``in``).
    verbose : forwarded to ``skipped`` when columns are missing.
    log : logger object providing ``warning`` (and ``write`` for ``skipped``).
    cols : iterable of requirements; each entry is either a single column
        name (str) or an iterable of names that must all be present.
        ``None`` is treated as "no columns required".
    function : user-facing function name shown in the warning, or None.

    Returns
    -------
    bool
        True when nothing is missing; otherwise False, after a warning that
        lists the missing names and a "Skipped ..." log line.
    """
    not_in_df = []
    # cols defaults to None; treat that as an empty requirement list instead
    # of failing while iterating over None
    required = cols if cols is not None else []
    for i in required:
        if isinstance(i, str):
            # single check
            if i not in df_col_names:
                not_in_df.append(i)
        else:
            # paired check: every member of the group must be present
            not_in_df.extend(j for j in i if j not in df_col_names)

    if len(not_in_df) > 0:
        if function is None:
            to_show_title = " "
        else:
            to_show_title = " for {} ".format(function)
        log.warning("Necessary columns{}were not detected:{}".format(to_show_title, ",".join(not_in_df)))
        skipped(log, verbose, end_line=function)
        return False

    return True
|
|
1609
|
+
|
|
1610
|
+
###############################################################################################################
|
|
1611
|
+
def _df_split(dataframe, n):
|
|
1612
|
+
chunks = []
|
|
1613
|
+
chunk_size = int(dataframe.shape[0] // n)+1
|
|
1614
|
+
|
|
1615
|
+
for index in range(0, dataframe.shape[0], chunk_size):
|
|
1616
|
+
chunks.append(
|
|
1617
|
+
dataframe.iloc[index:index + chunk_size]
|
|
1618
|
+
)
|
|
1619
|
+
|
|
1620
|
+
return chunks
|