gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +22 -5
- gwaslab/g_Sumstats.py +110 -163
- gwaslab/g_SumstatsPair.py +76 -25
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +36 -17
- gwaslab/hm_harmonize_sumstats.py +354 -221
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +49 -43
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/io_to_formats.py +428 -295
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +891 -778
- gwaslab/util_ex_calculate_ldmatrix.py +31 -13
- gwaslab/util_ex_gwascatalog.py +25 -25
- gwaslab/util_ex_ldproxyfinder.py +10 -10
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +30 -30
- gwaslab/util_in_filter_value.py +201 -74
- gwaslab/util_in_get_density.py +10 -10
- gwaslab/util_in_get_sig.py +445 -71
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +42 -37
- gwaslab/viz_aux_reposition_text.py +10 -7
- gwaslab/viz_aux_save_figure.py +18 -8
- gwaslab/viz_plot_compare_af.py +32 -33
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +34 -26
- gwaslab/viz_plot_mqqplot.py +126 -75
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +36 -33
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +65 -61
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.37.dist-info/RECORD +0 -72
- /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/qc_fix_sumstats.py
CHANGED
|
@@ -47,14 +47,14 @@ def _process_build(build,log,verbose):
|
|
|
47
47
|
log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
|
|
48
48
|
final_build = "38"
|
|
49
49
|
else:
|
|
50
|
-
log.
|
|
50
|
+
log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
|
|
51
51
|
final_build = "99"
|
|
52
52
|
return final_build
|
|
53
53
|
|
|
54
54
|
def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
|
|
55
55
|
build = _process_build(build,log=log,verbose=verbose)
|
|
56
|
-
sumstats
|
|
57
|
-
sumstats
|
|
56
|
+
sumstats[status] = vchange_status(sumstats[status], 1, "139",build[0]*3)
|
|
57
|
+
sumstats[status] = vchange_status(sumstats[status], 2, "89",build[1]*3)
|
|
58
58
|
return sumstats, build
|
|
59
59
|
|
|
60
60
|
def fixID(sumstats,
|
|
@@ -66,35 +66,49 @@ def fixID(sumstats,
|
|
|
66
66
|
2. fix chr and pos using snpid
|
|
67
67
|
3. checking rsid and chr:pos:nea:ea
|
|
68
68
|
'''
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
69
|
+
##start function with col checking##########################################################
|
|
70
|
+
_start_line = "check SNPID/rsID"
|
|
71
|
+
_end_line = "checking SNPID/rsID"
|
|
72
|
+
_start_cols =[]
|
|
73
|
+
_start_function = ".fix_id()"
|
|
74
|
+
_must_args ={}
|
|
75
|
+
|
|
76
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
77
|
+
log=log,
|
|
78
|
+
verbose=verbose,
|
|
79
|
+
start_line=_start_line,
|
|
80
|
+
end_line=_end_line,
|
|
81
|
+
start_cols=_start_cols,
|
|
82
|
+
start_function=_start_function,
|
|
83
|
+
**_must_args)
|
|
84
|
+
if is_enough_info == False: return sumstats
|
|
85
|
+
############################################################################################
|
|
72
86
|
|
|
73
87
|
############################ checking datatype ###################################################
|
|
74
88
|
if rsid in sumstats.columns:
|
|
75
89
|
# convert to string datatype
|
|
76
90
|
try:
|
|
77
91
|
log.write(" -Checking rsID data type...",verbose=verbose)
|
|
78
|
-
if sumstats
|
|
92
|
+
if sumstats[rsid].dtype == "string":
|
|
79
93
|
pass
|
|
80
94
|
else:
|
|
81
95
|
log.write(" -Converting rsID to pd.string data type...",verbose=verbose)
|
|
82
|
-
sumstats
|
|
96
|
+
sumstats[rsid] = sumstats[rsid].astype("string")
|
|
83
97
|
except:
|
|
84
98
|
log.write(" -Force converting rsID to pd.string data type...",verbose=verbose)
|
|
85
|
-
sumstats
|
|
99
|
+
sumstats[rsid] = sumstats[rsid].astype("string")
|
|
86
100
|
if snpid in sumstats.columns:
|
|
87
101
|
# convert to string datatype
|
|
88
102
|
try:
|
|
89
103
|
log.write(" -Checking SNPID data type...",verbose=verbose)
|
|
90
|
-
if sumstats
|
|
104
|
+
if sumstats[snpid].dtype == "string":
|
|
91
105
|
pass
|
|
92
106
|
else:
|
|
93
107
|
log.write(" -Converting SNPID to pd.string data type...",verbose=verbose)
|
|
94
|
-
sumstats
|
|
108
|
+
sumstats[snpid] = sumstats[snpid].astype("string")
|
|
95
109
|
except:
|
|
96
110
|
log.write(" -Force converting SNPID to pd.string data type...",verbose=verbose)
|
|
97
|
-
sumstats
|
|
111
|
+
sumstats[snpid] = sumstats[snpid].astype("string")
|
|
98
112
|
|
|
99
113
|
############################ checking ###################################################
|
|
100
114
|
if snpid in sumstats.columns:
|
|
@@ -115,7 +129,7 @@ def fixID(sumstats,
|
|
|
115
129
|
sumstats.loc[ is_rsid,status] = vchange_status(sumstats.loc[ is_rsid,status], 3, "986","520")
|
|
116
130
|
sumstats.loc[~is_rsid,status] = vchange_status(sumstats.loc[~is_rsid,status], 3, "986","743")
|
|
117
131
|
|
|
118
|
-
|
|
132
|
+
log.write(" -Checking if CHR:POS:NEA:EA is mixed in rsID column ...", verbose=verbose)
|
|
119
133
|
is_rs_chrpos = sumstats[rsid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
|
|
120
134
|
|
|
121
135
|
log.write(" -Number of CHR:POS:NEA:EA mixed in rsID column :",sum(is_rs_chrpos), verbose=verbose)
|
|
@@ -126,9 +140,9 @@ def fixID(sumstats,
|
|
|
126
140
|
if fixchrpos == True:
|
|
127
141
|
# from snpid or rsid, extract CHR:POS to fix CHR and POS
|
|
128
142
|
if snpid in sumstats.columns:
|
|
129
|
-
|
|
143
|
+
log.write(" -Fixing CHR and POS...", verbose=verbose)
|
|
130
144
|
if overwrite is True:
|
|
131
|
-
|
|
145
|
+
log.write(" -Overwrite is applied...", verbose=verbose)
|
|
132
146
|
# fix all
|
|
133
147
|
to_fix = is_chrposrefalt
|
|
134
148
|
|
|
@@ -137,35 +151,39 @@ def fixID(sumstats,
|
|
|
137
151
|
to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
138
152
|
to_fix_num = sum(to_fix)
|
|
139
153
|
if to_fix_num and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
140
|
-
|
|
154
|
+
else:
|
|
155
|
+
log.write(" -No fixable variants. ...", verbose=verbose)
|
|
141
156
|
|
|
142
157
|
elif (chrom not in sumstats.columns) and (pos in sumstats.columns):
|
|
143
|
-
|
|
144
|
-
sumstats
|
|
158
|
+
log.write(" -Initiating CHR columns...", verbose=verbose)
|
|
159
|
+
sumstats[chrom]=pd.Series(dtype="string")
|
|
145
160
|
to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
146
161
|
to_fix_num = sum(to_fix)
|
|
147
162
|
if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
148
|
-
|
|
163
|
+
else:
|
|
164
|
+
log.write(" -No fixable variants. ...", verbose=verbose)
|
|
149
165
|
|
|
150
166
|
elif (chrom in sumstats.columns) and (pos not in sumstats.columns):
|
|
151
|
-
|
|
152
|
-
sumstats
|
|
167
|
+
log.write(" -Initiating CHR and POS column...", verbose=verbose)
|
|
168
|
+
sumstats[pos]=pd.Series(dtype="Int64")
|
|
153
169
|
to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
154
170
|
to_fix_num = sum(to_fix)
|
|
155
171
|
if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
156
|
-
|
|
172
|
+
else:
|
|
173
|
+
log.write(" -No fixable variants. ...", verbose=verbose)
|
|
157
174
|
|
|
158
175
|
else:
|
|
159
|
-
|
|
160
|
-
sumstats
|
|
161
|
-
sumstats
|
|
176
|
+
log.write(" -Initiating CHR and POS columns...", verbose=verbose)
|
|
177
|
+
sumstats[chrom]=pd.Series(dtype="string")
|
|
178
|
+
sumstats[pos]=pd.Series(dtype="Int64")
|
|
162
179
|
to_fix = is_chrposrefalt
|
|
163
180
|
to_fix_num = sum(to_fix)
|
|
164
181
|
if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
165
|
-
|
|
182
|
+
else:
|
|
183
|
+
log.write(" -No fixable variants. ...", verbose=verbose)
|
|
166
184
|
|
|
167
185
|
if sum(to_fix)>0:
|
|
168
|
-
|
|
186
|
+
log.write(" -Filling CHR and POS columns using valid SNPID's chr:pos...", verbose=verbose)
|
|
169
187
|
# format and qc filled chr and pos
|
|
170
188
|
|
|
171
189
|
sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
@@ -177,36 +195,40 @@ def fixID(sumstats,
|
|
|
177
195
|
#sumstats.loc[to_fix,status] = vchange_status(sumstats.loc[to_fix,status], 4, "98765432","00000000")
|
|
178
196
|
|
|
179
197
|
if rsid in sumstats.columns:
|
|
180
|
-
|
|
198
|
+
log.write(" -Fixing CHR and POS using chr:pos:ref:alt format variants in rsID column...", verbose=verbose)
|
|
181
199
|
if overwrite is True:
|
|
182
|
-
|
|
200
|
+
log.write(" -Overwrite is applied...", verbose=verbose)
|
|
183
201
|
to_fix = is_rs_chrpos
|
|
184
202
|
elif (chrom in sumstats.columns) and (pos in sumstats.columns) :
|
|
185
203
|
to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
186
204
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
187
|
-
|
|
205
|
+
else:
|
|
206
|
+
log.write(" -No fixable variants ...", verbose=verbose)
|
|
188
207
|
elif (chrom not in sumstats.columns) and (pos in sumstats.columns):
|
|
189
|
-
|
|
190
|
-
sumstats
|
|
208
|
+
log.write(" -Initiating CHR columns...", verbose=verbose)
|
|
209
|
+
sumstats[chrom]=pd.Series(dtype="string")
|
|
191
210
|
to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
192
211
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
193
|
-
|
|
212
|
+
else:
|
|
213
|
+
log.write(" -No fixable variants ...", verbose=verbose)
|
|
194
214
|
elif (chrom in sumstats.columns) and (pos not in sumstats.columns):
|
|
195
|
-
|
|
196
|
-
sumstats
|
|
215
|
+
log.write(" -Initiating CHR and POS column...", verbose=verbose)
|
|
216
|
+
sumstats[pos]=pd.Series(dtype="Int64")
|
|
197
217
|
to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
198
218
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
199
|
-
|
|
219
|
+
else:
|
|
220
|
+
log.write(" -No fixable variants ...", verbose=verbose)
|
|
200
221
|
else:
|
|
201
|
-
|
|
202
|
-
sumstats
|
|
203
|
-
sumstats
|
|
222
|
+
log.write(" -Initiating CHR and POS columns...", verbose=verbose)
|
|
223
|
+
sumstats[chrom]=pd.Series(dtype="string")
|
|
224
|
+
sumstats[pos]=pd.Series(dtype="Int64")
|
|
204
225
|
to_fix = is_rs_chrpos
|
|
205
226
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
206
|
-
|
|
227
|
+
else:
|
|
228
|
+
log.write(" -No fixable variants ...", verbose=verbose)
|
|
207
229
|
|
|
208
230
|
if sum(to_fix)>0:
|
|
209
|
-
|
|
231
|
+
log.write(" -Filling CHR and POS columns using chr:pos:ref:alt format variants in rsID column...", verbose=verbose)
|
|
210
232
|
sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,rsid].str.split(':|_|-',n=2).str[0]
|
|
211
233
|
sumstats.loc[to_fix,pos] = sumstats.loc[to_fix,rsid].str.split(':|_|-',n=2).str[1]
|
|
212
234
|
#sumstats.loc[to_fix,pos] = np.floor(pd.to_numeric(sumstats.loc[to_fix,rsid].str.split(':|_|-',x).get(1), errors='coerce')).astype('Int64')
|
|
@@ -214,40 +236,40 @@ def fixID(sumstats,
|
|
|
214
236
|
|
|
215
237
|
############################ fixing chr pos###################################################
|
|
216
238
|
if fixeanea == True:
|
|
217
|
-
|
|
239
|
+
log.warning("gwaslab assumes SNPID is in the format of CHR:POS:NEA:EA / CHR:POS:REF:ALT", verbose=verbose)
|
|
218
240
|
if overwrite is True:
|
|
219
|
-
|
|
241
|
+
log.write(" -Overwrite mode is applied...", verbose=verbose)
|
|
220
242
|
to_fix = is_chrposrefalt
|
|
221
243
|
elif (nea in sumstats.columns) and (nea in sumstats.columns):
|
|
222
244
|
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
223
245
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
224
246
|
elif (nea in sumstats.columns) and (ea not in sumstats.columns):
|
|
225
|
-
|
|
226
|
-
sumstats
|
|
247
|
+
log.write(" -Initiating EA columns...", verbose=verbose)
|
|
248
|
+
sumstats[ea]=pd.Series(dtype="string")
|
|
227
249
|
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
228
250
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
229
251
|
elif (nea not in sumstats.columns) and (ea in sumstats.columns):
|
|
230
|
-
|
|
231
|
-
sumstats
|
|
252
|
+
log.write(" -Initiating NEA columns...", verbose=verbose)
|
|
253
|
+
sumstats[nea]=pd.Series(dtype="string")
|
|
232
254
|
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
233
255
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
234
256
|
else:
|
|
235
|
-
|
|
257
|
+
log.write(" -Initiating EA and NEA columns...", verbose=verbose)
|
|
236
258
|
sumstats[nea]=pd.Series(dtype="string")
|
|
237
259
|
sumstats[ea]=pd.Series(dtype="string")
|
|
238
260
|
to_fix = is_chrposrefalt
|
|
239
261
|
if sum(to_fix)>0:
|
|
240
|
-
|
|
262
|
+
log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...", verbose=verbose)
|
|
241
263
|
#
|
|
242
264
|
if sum(to_fix)>0:
|
|
243
|
-
|
|
265
|
+
log.write(" -Filling "+str(sum(to_fix))+" EA and NEA columns using SNPID's CHR:POS:NEA:EA...", verbose=verbose)
|
|
244
266
|
#
|
|
245
267
|
if fixeanea_flip == True:
|
|
246
|
-
|
|
268
|
+
log.write(" -Flipped : CHR:POS:NEA:EA -> CHR:POS:EA:NEA ", verbose=verbose)
|
|
247
269
|
sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
|
|
248
270
|
sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
|
|
249
271
|
else:
|
|
250
|
-
|
|
272
|
+
log.write(" -Chr:pos:a1:a2...a1->EA , a2->NEA ", verbose=verbose)
|
|
251
273
|
sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
|
|
252
274
|
sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
|
|
253
275
|
|
|
@@ -259,22 +281,22 @@ def fixID(sumstats,
|
|
|
259
281
|
############################ fixing id ###################################################
|
|
260
282
|
if fixsep == True:
|
|
261
283
|
if snpid in sumstats.columns:
|
|
262
|
-
|
|
263
|
-
sumstats
|
|
284
|
+
log.write(' -Replacing [_-] in SNPID with ":" ...', verbose=verbose)
|
|
285
|
+
sumstats[snpid] = sumstats[snpid].str.replace(r"[_-]",":",regex=True)
|
|
264
286
|
|
|
265
287
|
if fixprefix == True:
|
|
266
288
|
if snpid in sumstats.columns:
|
|
267
|
-
|
|
268
|
-
prefix_removed = sumstats
|
|
289
|
+
log.write(' -Removing /^chr/ in SNPID ...', verbose=verbose)
|
|
290
|
+
prefix_removed = sumstats[snpid].str.extract(r'^(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
269
291
|
sumstats.loc[~prefix_removed.isna(),snpid] = prefix_removed[~prefix_removed.isna()]
|
|
270
292
|
|
|
271
293
|
if fixid == True:
|
|
272
294
|
if snpid not in sumstats.columns:
|
|
273
295
|
# initiate a SNPID column
|
|
274
|
-
sumstats
|
|
296
|
+
sumstats[snpid]=pd.Series(dtype="string")
|
|
275
297
|
|
|
276
298
|
if (rsid in sumstats.columns) and (sum(is_rs_chrpos)>0) :
|
|
277
|
-
sumstats
|
|
299
|
+
sumstats[snpid]= sumstats.loc[is_rs_chrpos,rsid]
|
|
278
300
|
|
|
279
301
|
if (chrom in sumstats.columns) and (pos in sumstats.columns):
|
|
280
302
|
#only fix when CHR and POS is available
|
|
@@ -313,23 +335,25 @@ def fixID(sumstats,
|
|
|
313
335
|
sumstats.loc[to_part_fix,snpid] = sumstats.loc[to_part_fix,chrom].astype("string") + ":"+sumstats.loc[to_part_fix,pos].astype("string")
|
|
314
336
|
if sum(to_full_fix)>0:
|
|
315
337
|
sumstats.loc[to_full_fix,snpid] = sumstats.loc[to_full_fix,chrom].astype("string") + ":"+sumstats.loc[to_full_fix,pos].astype("string") +":"+ sumstats.loc[to_full_fix,nea].astype("string") +":"+ sumstats.loc[to_full_fix,ea].astype("string")
|
|
316
|
-
|
|
317
|
-
|
|
338
|
+
log.write(" -Filling "+str(sum(to_part_fix)-sum(to_full_fix)) +" SNPID using CHR:POS...", verbose=verbose)
|
|
339
|
+
log.write(" -Filling "+str(sum(to_full_fix)) +" SNPID using CHR:POS:NEA:EA...", verbose=verbose)
|
|
318
340
|
sumstats.loc[(to_full_fix),status] = vchange_status(sumstats.loc[(to_full_fix),status],3,"975","630")
|
|
319
341
|
sumstats.loc[(to_part_fix),status] = vchange_status(sumstats.loc[(to_part_fix),status],3,"975","842")
|
|
320
342
|
|
|
321
343
|
else:
|
|
322
344
|
#when these is no ea or ena, just fix to chr:pos
|
|
323
345
|
to_part_fix = to_fix & sumstats[chrom].notnull() & sumstats[pos].notnull()
|
|
324
|
-
|
|
346
|
+
log.write(" -Filling "+str(sum(to_part_fix)) +" SNPID using CHR POS...", verbose=verbose)
|
|
325
347
|
if sum(to_part_fix)>0:
|
|
326
348
|
sumstats.loc[to_part_fix,snpid] = sumstats.loc[to_part_fix,chrom].astype("string") + ":"+sumstats.loc[to_part_fix,pos].astype("string")
|
|
327
349
|
sumstats.loc[to_part_fix,status] = vchange_status(sumstats.loc[(to_part_fix),status],3,"975","842")
|
|
328
350
|
|
|
329
351
|
after_number=sum(sumstats[snpid].isna())
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
352
|
+
log.write(" -Fixed "+ str(pre_number - after_number) +" variants ID...", verbose=verbose)
|
|
353
|
+
else:
|
|
354
|
+
log.write(" -ID unfixable: no CHR and POS columns or no SNPID. ", verbose=verbose)
|
|
355
|
+
|
|
356
|
+
finished(log,verbose,_end_line)
|
|
333
357
|
return sumstats
|
|
334
358
|
|
|
335
359
|
""
|
|
@@ -344,73 +368,90 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
344
368
|
remove duplicate SNPs based on 3. rsID
|
|
345
369
|
remove multiallelic SNPs based on 4. CHR, POS
|
|
346
370
|
'''
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
371
|
+
|
|
372
|
+
##start function with col checking##########################################################
|
|
373
|
+
_start_line = "remove duplicated/multiallelic variants"
|
|
374
|
+
_end_line = "removing duplicated/multiallelic variants"
|
|
375
|
+
_start_cols =[]
|
|
376
|
+
_start_function = ".remove_dup()"
|
|
377
|
+
_must_args ={}
|
|
378
|
+
|
|
379
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
380
|
+
log=log,
|
|
381
|
+
verbose=verbose,
|
|
382
|
+
start_line=_start_line,
|
|
383
|
+
end_line=_end_line,
|
|
384
|
+
start_cols=_start_cols,
|
|
385
|
+
start_function=_start_function,
|
|
386
|
+
**_must_args)
|
|
387
|
+
if is_enough_info == False: return sumstats
|
|
388
|
+
############################################################################################
|
|
389
|
+
|
|
390
|
+
log.write(" -Removing mode:{}".format(mode), verbose=verbose)
|
|
350
391
|
# sort the variants using the specified column before removing
|
|
351
392
|
if keep_col is not None :
|
|
352
393
|
if keep_col in sumstats.columns:
|
|
353
|
-
|
|
394
|
+
log.write("Start to sort the sumstats using {}...".format(keep_col), verbose=verbose)
|
|
354
395
|
sumstats = sumstats.sort_values(by=keep_col,ascending=keep_ascend)
|
|
355
396
|
else:
|
|
356
|
-
|
|
397
|
+
log.write("Column" + keep_col +" was not detected... skipping... ", verbose=verbose)
|
|
357
398
|
total_number = len(sumstats)
|
|
358
399
|
|
|
359
400
|
# remove by duplicated SNPID
|
|
360
401
|
if (snpid in sumstats.columns) and ("d" in mode or "s" in mode):
|
|
361
|
-
|
|
402
|
+
log.write("Start to remove duplicated variants based on snpid...{}".format(_get_version()), verbose=verbose)
|
|
362
403
|
check_dataframe_shape(sumstats, log, verbose)
|
|
363
|
-
|
|
404
|
+
log.write(" -Which variant to keep: ", keep , verbose=verbose)
|
|
364
405
|
pre_number =len(sumstats)
|
|
365
406
|
if snpid in sumstats.columns:
|
|
366
407
|
# keep na and remove duplicated
|
|
367
408
|
sumstats = sumstats.loc[sumstats[snpid].isna() | (~sumstats.duplicated(subset=[snpid], keep=keep)),:]
|
|
368
409
|
after_number=len(sumstats)
|
|
369
|
-
|
|
410
|
+
log.write(" -Removed ",pre_number -after_number ," based on SNPID...", verbose=verbose)
|
|
370
411
|
|
|
371
412
|
# remove by duplicated rsID
|
|
372
413
|
if (rsid in sumstats.columns) and ("d" in mode or "r" in mode):
|
|
373
414
|
# keep na and remove duplicated
|
|
374
415
|
pre_number =len(sumstats)
|
|
375
|
-
|
|
416
|
+
log.write("Start to remove duplicated variants based on rsID...", verbose=verbose)
|
|
376
417
|
check_dataframe_shape(sumstats, log, verbose)
|
|
377
418
|
sumstats = sumstats.loc[sumstats[rsid].isna() | (~sumstats.duplicated(subset=rsid, keep=keep)),:]
|
|
378
419
|
after_number=len(sumstats)
|
|
379
|
-
|
|
420
|
+
log.write(" -Removed ",pre_number -after_number ," based on rsID...", verbose=verbose)
|
|
380
421
|
|
|
381
422
|
# remove by duplicated variants by CHR:POS:NEA:EA
|
|
382
423
|
if (chrom in sumstats.columns) and (pos in sumstats.columns) and (nea in sumstats.columns) and (ea in sumstats.columns) and ("d" in mode or "c" in mode):
|
|
383
|
-
|
|
424
|
+
log.write("Start to remove duplicated variants based on CHR,POS,EA and NEA...", verbose=verbose)
|
|
384
425
|
check_dataframe_shape(sumstats, log, verbose)
|
|
385
|
-
|
|
426
|
+
log.write(" -Which variant to keep: ", keep , verbose=verbose)
|
|
386
427
|
pre_number =len(sumstats)
|
|
387
428
|
if snpid in sumstats.columns:
|
|
388
429
|
# keep na and remove duplicated
|
|
389
430
|
sumstats = sumstats.loc[(~sumstats[[chrom,pos,ea,nea]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos,ea,nea], keep=keep)),:]
|
|
390
431
|
after_number=len(sumstats)
|
|
391
|
-
|
|
432
|
+
log.write(" -Removed ",pre_number -after_number ," based on CHR,POS,EA and NEA...", verbose=verbose)
|
|
392
433
|
|
|
393
434
|
# remove by multiallelic variants by CHR:POS
|
|
394
435
|
if (chrom in sumstats.columns) and (pos in sumstats.columns) and "m" in mode:
|
|
395
436
|
# keep na and remove duplicated
|
|
396
437
|
pre_number =len(sumstats)
|
|
397
|
-
|
|
438
|
+
log.write("Start to remove multiallelic variants based on chr:pos...", verbose=verbose)
|
|
398
439
|
check_dataframe_shape(sumstats, log, verbose)
|
|
399
|
-
|
|
400
|
-
sumstats = sumstats.loc[(~sumstats
|
|
440
|
+
log.write(" -Which variant to keep: ", keep , verbose=verbose)
|
|
441
|
+
sumstats = sumstats.loc[(~sumstats[[chrom,pos]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos], keep=keep)),:]
|
|
401
442
|
after_number=len(sumstats)
|
|
402
|
-
|
|
443
|
+
log.write(" -Removed ",pre_number -after_number," multiallelic variants...", verbose=verbose)
|
|
403
444
|
after_number=len(sumstats)
|
|
404
445
|
|
|
405
446
|
# resort the coordinates
|
|
406
|
-
|
|
447
|
+
log.write(" -Removed ",total_number -after_number," variants in total.", verbose=verbose)
|
|
407
448
|
if keep_col is not None :
|
|
408
|
-
|
|
449
|
+
log.write(" -Sort the coordinates based on CHR and POS...", verbose=verbose)
|
|
409
450
|
sumstats = sortcoordinate(sumstats,verbose=False)
|
|
410
451
|
|
|
411
452
|
if "n" in mode or remove==True:
|
|
412
453
|
# if remove==True, remove NAs
|
|
413
|
-
|
|
454
|
+
log.write(" -Removing NAs...", verbose=verbose)
|
|
414
455
|
pre_number =len(sumstats)
|
|
415
456
|
specified_columns = []
|
|
416
457
|
if "d" in mode:
|
|
@@ -434,307 +475,348 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
434
475
|
specified_columns.append(nea)
|
|
435
476
|
sumstats = sumstats.loc[~sumstats[specified_columns].isna().any(axis=1),:]
|
|
436
477
|
after_number=len(sumstats)
|
|
437
|
-
|
|
438
|
-
|
|
478
|
+
log.write(" -Removed ",pre_number -after_number," variants with NA values in {} .".format(set(specified_columns)), verbose=verbose)
|
|
479
|
+
|
|
480
|
+
finished(log,verbose,_end_line)
|
|
439
481
|
return sumstats
|
|
440
482
|
|
|
441
483
|
###############################################################################################################
|
|
442
484
|
# 20230128
|
|
443
485
|
def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",24),mt=("MT",25), remove=False, verbose=True, chrom_list = None, minchr=1,log=Log()):
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
if verbose: log.write("Start to fix chromosome notation...{}".format(_get_version()))
|
|
451
|
-
check_dataframe_shape(sumstats, log, verbose)
|
|
452
|
-
|
|
453
|
-
# convert to string datatype
|
|
454
|
-
try:
|
|
455
|
-
if verbose: log.write(" -Checking CHR data type...")
|
|
456
|
-
if sumstats.loc[:,chrom].dtype == "string":
|
|
457
|
-
pass
|
|
458
|
-
else:
|
|
459
|
-
sumstats.loc[:,chrom] = sumstats.loc[:,chrom].astype("string")
|
|
460
|
-
except:
|
|
461
|
-
if verbose: log.write(" -Force converting to pd string data type...")
|
|
462
|
-
sumstats.loc[:,chrom] = sumstats.loc[:,chrom].astype("string")
|
|
463
|
-
|
|
464
|
-
# check if CHR is numeric
|
|
465
|
-
is_chr_fixed = sumstats[chrom].str.isnumeric()
|
|
466
|
-
# fill NAs with False
|
|
467
|
-
is_chr_fixed = is_chr_fixed.fillna(False)
|
|
468
|
-
if verbose: log.write(" -Variants with standardized chromosome notation:",sum(is_chr_fixed))
|
|
469
|
-
|
|
470
|
-
# if there are variants whose CHR need to be fixed
|
|
471
|
-
if sum(is_chr_fixed)<len(sumstats):
|
|
472
|
-
|
|
473
|
-
#extract the CHR number or X Y M MT
|
|
474
|
-
chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'^(chr)?(\d{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
486
|
+
##start function with col checking##########################################################
|
|
487
|
+
_start_line = "fix chromosome notation (CHR)"
|
|
488
|
+
_end_line = "fixing chromosome notation (CHR)"
|
|
489
|
+
_start_cols =[chrom,status]
|
|
490
|
+
_start_function = ".fix_chr()"
|
|
491
|
+
_must_args ={}
|
|
475
492
|
|
|
476
|
-
|
|
477
|
-
|
|
493
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
494
|
+
log=log,
|
|
495
|
+
verbose=verbose,
|
|
496
|
+
start_line=_start_line,
|
|
497
|
+
end_line=_end_line,
|
|
498
|
+
start_cols=_start_cols,
|
|
499
|
+
start_function=_start_function,
|
|
500
|
+
**_must_args)
|
|
501
|
+
if is_enough_info == False: return sumstats
|
|
502
|
+
############################################################################################
|
|
478
503
|
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
504
|
+
#chrom_list = get_chr_list() #bottom
|
|
505
|
+
if chrom_list is None:
|
|
506
|
+
chrom_list = get_chr_list()
|
|
507
|
+
|
|
508
|
+
# convert to string datatype
|
|
509
|
+
try:
|
|
510
|
+
log.write(" -Checking CHR data type...", verbose=verbose)
|
|
511
|
+
if sumstats[chrom].dtype == "string":
|
|
512
|
+
pass
|
|
513
|
+
else:
|
|
514
|
+
sumstats[chrom] = sumstats[chrom].astype("string")
|
|
515
|
+
except:
|
|
516
|
+
log.write(" -Force converting to pd string data type...", verbose=verbose)
|
|
517
|
+
sumstats[chrom] = sumstats[chrom].astype("string")
|
|
518
|
+
|
|
519
|
+
# check if CHR is numeric
|
|
520
|
+
is_chr_fixed = sumstats[chrom].str.isnumeric()
|
|
521
|
+
# fill NAs with False
|
|
522
|
+
is_chr_fixed = is_chr_fixed.fillna(False)
|
|
523
|
+
log.write(" -Variants with standardized chromosome notation:",sum(is_chr_fixed), verbose=verbose)
|
|
524
|
+
|
|
525
|
+
# if there are variants whose CHR need to be fixed
|
|
526
|
+
if sum(is_chr_fixed)<len(sumstats):
|
|
527
|
+
|
|
528
|
+
#extract the CHR number or X Y M MT
|
|
529
|
+
chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'^(chr)?(\d{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
497
530
|
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(x[0], x[1]))
|
|
515
|
-
if y[0].lower() in sumstats[chrom].values or y[0].upper() in sumstats[chrom].values:
|
|
516
|
-
convert_num_to_xymt[y[0].lower()] = str(y[1])
|
|
517
|
-
convert_num_to_xymt[y[0].upper()] = str(y[1])
|
|
518
|
-
if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(y[0], y[1]))
|
|
519
|
-
if mt[0].lower() in sumstats[chrom].values or mt[0].upper() in sumstats[chrom].values:
|
|
520
|
-
convert_num_to_xymt[mt[0].lower()] = str(mt[1])
|
|
521
|
-
convert_num_to_xymt[mt[0].upper()] = str(mt[1])
|
|
522
|
-
if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(mt[0], mt[1]))
|
|
523
|
-
sumstats.loc[sex_chr,chrom] =sumstats.loc[sex_chr,chrom].map(convert_num_to_xymt)
|
|
524
|
-
|
|
525
|
-
# change status code
|
|
526
|
-
sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
|
|
527
|
-
if len(is_chr_fixable.index)>0:
|
|
528
|
-
sumstats.loc[is_chr_fixable.index,status] = vchange_status(sumstats.loc[is_chr_fixable.index,status],4,"986","520")
|
|
529
|
-
if len(is_chr_fixable.index)>0:
|
|
530
|
-
sumstats.loc[is_chr_invalid.index,status] = vchange_status(sumstats.loc[is_chr_invalid.index,status],4,"986","743")
|
|
531
|
-
|
|
532
|
-
# check variants with unrecognized CHR
|
|
533
|
-
unrecognized_num = sum(~sumstats[chrom].isin(chrom_list))
|
|
534
|
-
if (remove is True) and unrecognized_num>0:
|
|
535
|
-
# remove variants with unrecognized CHR
|
|
536
|
-
try:
|
|
537
|
-
if verbose: log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])))
|
|
538
|
-
except:
|
|
539
|
-
pass
|
|
540
|
-
if verbose: log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.")
|
|
541
|
-
try:
|
|
542
|
-
log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()))
|
|
543
|
-
except:
|
|
544
|
-
pass
|
|
545
|
-
#sumstats = sumstats.loc[sumstats.index[sumstats[chrom].isin(chrom_list)],:]
|
|
546
|
-
good_chr = sumstats[chrom].isin(chrom_list)
|
|
547
|
-
sumstats = sumstats.loc[good_chr, :].copy()
|
|
531
|
+
is_chr_fixable = ~chr_extracted.isna()
|
|
532
|
+
log.write(" -Variants with fixable chromosome notations:",sum(is_chr_fixable), verbose=verbose)
|
|
533
|
+
|
|
534
|
+
# For not fixed variants, check if na
|
|
535
|
+
is_chr_na = sumstats.loc[~is_chr_fixed, chrom].isna()
|
|
536
|
+
if sum(is_chr_na)>0 and verbose:
|
|
537
|
+
log.write(" -Variants with NA chromosome notations:",sum(is_chr_na))
|
|
538
|
+
|
|
539
|
+
# Check variants with CHR being not NA and not fixable
|
|
540
|
+
is_chr_invalid = (~is_chr_fixable)&(~is_chr_na)
|
|
541
|
+
if sum(is_chr_invalid)>0 and verbose:
|
|
542
|
+
log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid), verbose=verbose)
|
|
543
|
+
try:
|
|
544
|
+
log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()), verbose=verbose)
|
|
545
|
+
except:
|
|
546
|
+
pass
|
|
548
547
|
else:
|
|
549
|
-
|
|
550
|
-
sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
|
|
548
|
+
log.write(" -No unrecognized chromosome notations...", verbose=verbose)
|
|
551
549
|
|
|
552
|
-
#
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
sumstats.loc[:,chrom] = np.floor(pd.to_numeric(sumstats.loc[:,chrom], errors='coerce')).astype('Int64')
|
|
550
|
+
# Assign good chr back to sumstats
|
|
551
|
+
sumstats.loc[is_chr_fixable.index,chrom] = chr_extracted[is_chr_fixable.index]
|
|
552
|
+
|
|
553
|
+
# X, Y, MT to 23,24,25
|
|
554
|
+
xymt_list = [x[0].lower(),y[0].lower(),mt[0].lower(),x[0].upper(),y[0].upper(),mt[0].upper()]
|
|
558
555
|
|
|
559
|
-
#
|
|
560
|
-
|
|
561
|
-
out_of_range_chr = out_of_range_chr.fillna(False)
|
|
562
|
-
if sum(out_of_range_chr)>0:
|
|
563
|
-
if verbose: log.write(" -Sanity check for CHR...")
|
|
564
|
-
if verbose:log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr))
|
|
565
|
-
sumstats = sumstats.loc[~out_of_range_chr,:]
|
|
566
|
-
|
|
567
|
-
if verbose: log.write("Finished fixing chromosome notation successfully!")
|
|
556
|
+
# check if sumstats contain sex CHR
|
|
557
|
+
sex_chr = sumstats[chrom].isin(xymt_list)
|
|
568
558
|
|
|
569
|
-
|
|
559
|
+
# if sumstats contain sex CHR
|
|
560
|
+
if sum(sex_chr)>0:
|
|
561
|
+
log.write(" -Identifying non-autosomal chromosomes : {}, {}, and {} ...".format(x[0],y[0],mt[0]), verbose=verbose)
|
|
562
|
+
log.write(" -Identified ",str(sum(sex_chr))," variants on sex chromosomes...", verbose=verbose)
|
|
563
|
+
|
|
564
|
+
# convert "X, Y, MT" to numbers
|
|
565
|
+
convert_num_to_xymt={}
|
|
566
|
+
if x[0].lower() in sumstats[chrom].values or x[0].upper() in sumstats[chrom].values:
|
|
567
|
+
convert_num_to_xymt[x[0].lower()] = str(x[1])
|
|
568
|
+
convert_num_to_xymt[x[0].upper()] = str(x[1])
|
|
569
|
+
log.write(" -Standardizing sex chromosome notations: {} to {}...".format(x[0], x[1]), verbose=verbose)
|
|
570
|
+
if y[0].lower() in sumstats[chrom].values or y[0].upper() in sumstats[chrom].values:
|
|
571
|
+
convert_num_to_xymt[y[0].lower()] = str(y[1])
|
|
572
|
+
convert_num_to_xymt[y[0].upper()] = str(y[1])
|
|
573
|
+
log.write(" -Standardizing sex chromosome notations: {} to {}...".format(y[0], y[1]), verbose=verbose)
|
|
574
|
+
if mt[0].lower() in sumstats[chrom].values or mt[0].upper() in sumstats[chrom].values:
|
|
575
|
+
convert_num_to_xymt[mt[0].lower()] = str(mt[1])
|
|
576
|
+
convert_num_to_xymt[mt[0].upper()] = str(mt[1])
|
|
577
|
+
log.write(" -Standardizing sex chromosome notations: {} to {}...".format(mt[0], mt[1]), verbose=verbose)
|
|
578
|
+
sumstats.loc[sex_chr,chrom] =sumstats.loc[sex_chr,chrom].map(convert_num_to_xymt)
|
|
579
|
+
|
|
580
|
+
# change status code
|
|
581
|
+
sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
|
|
582
|
+
if len(is_chr_fixable.index)>0:
|
|
583
|
+
sumstats.loc[is_chr_fixable.index,status] = vchange_status(sumstats.loc[is_chr_fixable.index,status],4,"986","520")
|
|
584
|
+
if len(is_chr_fixable.index)>0:
|
|
585
|
+
sumstats.loc[is_chr_invalid.index,status] = vchange_status(sumstats.loc[is_chr_invalid.index,status],4,"986","743")
|
|
586
|
+
|
|
587
|
+
# check variants with unrecognized CHR
|
|
588
|
+
unrecognized_num = sum(~sumstats[chrom].isin(chrom_list))
|
|
589
|
+
if (remove is True) and unrecognized_num>0:
|
|
590
|
+
# remove variants with unrecognized CHR
|
|
591
|
+
try:
|
|
592
|
+
log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])), verbose=verbose)
|
|
593
|
+
except:
|
|
594
|
+
pass
|
|
595
|
+
log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.", verbose=verbose)
|
|
596
|
+
try:
|
|
597
|
+
log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()), verbose=verbose)
|
|
598
|
+
except:
|
|
599
|
+
pass
|
|
600
|
+
#sumstats = sumstats.loc[sumstats.index[sumstats[chrom].isin(chrom_list)],:]
|
|
601
|
+
good_chr = sumstats[chrom].isin(chrom_list)
|
|
602
|
+
sumstats = sumstats.loc[good_chr, :].copy()
|
|
603
|
+
else:
|
|
604
|
+
log.write(" -All CHR are already fixed...", verbose=verbose)
|
|
605
|
+
sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
|
|
606
|
+
|
|
607
|
+
# Convert string to int
|
|
608
|
+
try:
|
|
609
|
+
sumstats[chrom] = sumstats[chrom].astype('Int64')
|
|
610
|
+
except:
|
|
611
|
+
# # force convert
|
|
612
|
+
sumstats[chrom] = np.floor(pd.to_numeric(sumstats[chrom], errors='coerce')).astype('Int64')
|
|
613
|
+
|
|
614
|
+
# filter out variants with CHR <=0
|
|
615
|
+
out_of_range_chr = sumstats[chrom] < minchr
|
|
616
|
+
out_of_range_chr = out_of_range_chr.fillna(False)
|
|
617
|
+
if sum(out_of_range_chr)>0:
|
|
618
|
+
log.write(" -Sanity check for CHR...", verbose=verbose)
|
|
619
|
+
log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr), verbose=verbose)
|
|
620
|
+
sumstats = sumstats.loc[~out_of_range_chr,:]
|
|
621
|
+
|
|
622
|
+
finished(log,verbose,_end_line)
|
|
623
|
+
return sumstats
|
|
570
624
|
|
|
571
625
|
###############################################################################################################
|
|
572
626
|
# 20230128
|
|
573
627
|
def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True, lower_limit=0 , upper_limit=None , limit=250000000, log=Log()):
    """Coerce the position column to ``Int64`` and drop out-of-bound positions.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Table holding the ``pos`` and ``status`` columns.
    pos, status : str
        Names of the position and status columns.
    remove : bool
        When ``True``, rows whose position is NA after conversion are dropped.
    verbose : bool
        Forwarded to every ``log.write`` call.
    lower_limit, upper_limit : int
        Exclusive bounds; rows with ``pos <= lower_limit`` or
        ``pos >= upper_limit`` are removed.  ``upper_limit=None`` falls back
        to ``limit``.
    limit : int
        Default upper bound used when ``upper_limit`` is ``None``.
    log : Log
        Logger receiving progress messages.

    Returns
    -------
    pandas.DataFrame
        The input unchanged when ``start_to`` reports missing information,
        otherwise the converted and filtered table.
    """
    ##start function with col checking##########################################################
    _start_line = "fix basepair positions (POS)"
    _end_line = "fixing basepair positions (POS)"
    _start_cols =[pos,status]
    _start_function = ".fix_pos()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return sumstats
    ############################################################################################

    if upper_limit is None:
        upper_limit = limit

    all_var_num = len(sumstats)
    # Remember which positions were NA *before* any conversion so that values
    # lost during coercion can be told apart from values missing up front.
    is_pos_na = sumstats[pos].isna()

    # Best-effort clean-up of textual positions such as "1,234,567" / "1_234_567".
    try:
        if str(sumstats[pos].dtype) == "string" or str(sumstats[pos].dtype) == "object":
            sumstats[pos] = sumstats[pos].astype('string')
            # if so, remove thousands separator
            log.write(' -Removing thousands separator "," or underbar "_" ...', verbose=verbose)
            sumstats.loc[~is_pos_na, pos] = sumstats.loc[~is_pos_na, pos].str.replace(r'[,_]', '' ,regex=True)
    except Exception as err:
        # Deliberately non-fatal: the numeric coercion below still runs.
        log.write(' -Skipped removing thousands separators: {}'.format(err), verbose=verbose)

    # convert POS to integer
    try:
        log.write(' -Converting to Int64 data type ...', verbose=verbose)
        sumstats[pos] = sumstats[pos].astype('Int64')
    except Exception:
        # Direct cast failed (e.g. non-numeric text or fractional values):
        # coerce unparsable entries to NA and floor the rest.
        log.write(' -Force converting to Int64 data type ...', verbose=verbose)
        sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
    is_pos_fixed = ~sumstats[pos].isna()
    # Present before conversion but NA afterwards -> could not be parsed.
    is_pos_invalid = (~is_pos_na)&(~is_pos_fixed)

    sumstats.loc[is_pos_fixed,status] = vchange_status(sumstats.loc[is_pos_fixed,status] ,4,"975","630")
    sumstats.loc[is_pos_invalid,status] = vchange_status(sumstats.loc[is_pos_invalid,status],4,"975","842")

    # remove outlier, limit:250,000,000
    log.write(" -Position bound:({} , {:,})".format(lower_limit, upper_limit), verbose=verbose)
    is_pos_na = sumstats[pos].isna()
    out_lier= ((sumstats[pos]<=lower_limit) | (sumstats[pos]>=upper_limit)) & (~is_pos_na)
    log.write(" -Removed outliers:",sum(out_lier), verbose=verbose)
    sumstats = sumstats.loc[~out_lier,:]
    #remove na
    # NOTE(review): nesting of the two lines after the filter was reconstructed
    # from a flattened listing - confirm they belong inside this branch.
    if remove is True:
        sumstats = sumstats.loc[~sumstats[pos].isna(),:]
        remain_var_num = len(sumstats)
        log.write(" -Removed "+str(all_var_num - remain_var_num)+" variants with bad positions.", verbose=verbose)

    finished(log,verbose,_end_line)
    return sumstats
|
|
624
689
|
|
|
625
690
|
###############################################################################################################
|
|
626
691
|
# 20220514
|
|
627
692
|
def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=True,log=Log()):
    """Upper-case EA/NEA, flag non-ACGT / NA / identical alleles and update STATUS.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Table holding the ``ea``, ``nea`` and ``status`` columns.
    ea, nea, status : str
        Names of the effect-allele, non-effect-allele and status columns.
    remove : bool
        When ``True``, rows with NA or non-ACGT alleles and rows whose two
        alleles are identical are dropped.  Otherwise NA alleles are replaced
        by ``"N"`` and the offending rows are only counted.
    verbose : bool
        Forwarded to every ``log.write`` call.
    log : Log
        Logger receiving progress messages.

    Returns
    -------
    pandas.DataFrame
        The input unchanged when ``start_to`` reports missing information,
        otherwise the table with categorical allele columns and updated
        status codes.
    """
    ##start function with col checking##########################################################
    _start_line = "fix alleles (EA and NEA)"
    _end_line = "fixing alleles (EA and NEA)"
    _start_cols =[ea, nea,status]
    _start_function = ".fix_allele()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return sumstats
    ############################################################################################

    log.write(" -Converted all bases to string datatype and UPPERCASE.", verbose=verbose)
    # Both columns share one category set; "N" is always available so that
    # missing alleles can later be filled with it.
    categories = set(sumstats[ea].str.upper())|set(sumstats[nea].str.upper())|set("N")
    categories = {x for x in categories if pd.notna(x)}
    sumstats[ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
    sumstats[nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
    all_var_num = len(sumstats)

    ## check ATCG (NA counts as bad because of na=True)
    bad_ea = sumstats[ea].str.contains("[^actgACTG]",na=True)
    bad_nea = sumstats[nea].str.contains("[^actgACTG]",na=True)
    good_ea = ~bad_ea
    good_nea = ~bad_nea

    log.write(" -Variants with bad EA : {}".format(sum(bad_ea)), verbose=verbose)
    log.write(" -Variants with bad NEA : {}".format(sum(bad_nea)), verbose=verbose)

    ## check NA
    is_eanea_na = sumstats[ea].isna() | sumstats[nea].isna()
    log.write(" -Variants with NA for EA or NEA: {}".format(sum(is_eanea_na)), verbose=verbose)

    ## check same alleles
    not_variant = sumstats[nea] == sumstats[ea]
    log.write(" -Variants with same EA and NEA: {}".format(sum(not_variant)), verbose=verbose)

    ## sum up invalid variants
    is_invalid = bad_ea | bad_nea | not_variant
    exclude = bad_nea | bad_ea
    is_eanea_fixed = good_ea | good_nea

    if len(set(sumstats.loc[bad_ea,ea].head())) >0:
        log.write(" -A look at the non-ATCG EA:",set(sumstats.loc[bad_ea,ea].head()),"...", verbose=verbose)
    if len(set(sumstats.loc[bad_nea,nea].head())) >0:
        log.write(" -A look at the non-ATCG NEA:",set(sumstats.loc[bad_nea,nea].head()),"...", verbose=verbose)

    if remove == True:
        both_good = good_ea & good_nea
        keep = both_good & (~not_variant)
        good_eanea_num = int(both_good.sum())
        good_eanea_notsame_num = int(keep.sum())
        # Drop rows once, by position, and restrict every mask to the kept
        # rows so later `.loc[mask]` calls never depend on label alignment
        # between a full-length mask and the shortened frame.
        kept = keep.to_numpy(dtype=bool)
        sumstats = sumstats.loc[kept,:].copy()
        is_invalid = is_invalid[kept]
        is_eanea_na = is_eanea_na[kept]
        is_eanea_fixed = is_eanea_fixed[kept]
        log.write(" -Removed "+str(all_var_num - good_eanea_num)+" variants with NA alleles or alleles that contain bases other than A/C/T/G.", verbose=verbose)
        log.write(" -Removed "+str(good_eanea_num - good_eanea_notsame_num)+" variants with same allele for EA and NEA.", verbose=verbose)
    else:
        sumstats[[ea,nea]] = sumstats[[ea,nea]].fillna("N")
        log.write(" -Detected "+str(sum(exclude))+" variants with alleles that contain bases other than A/C/T/G .", verbose=verbose)
    # NOTE(review): this rebuild is assumed to run after both branches (it
    # trims categories of removed rows); confirm against the original nesting.
    categories = set(sumstats[ea].str.upper())|set(sumstats[nea].str.upper())|set("N")
    sumstats[ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
    sumstats[nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)

    ea_len = sumstats[ea].str.len()
    nea_len = sumstats[nea].str.len()
    is_snp = (ea_len==1) &(nea_len==1)
    is_indel = (ea_len!=nea_len)
    is_not_normalized = (ea_len>1) &(nea_len>1)
    # indel in which exactly one of the two alleles is a single base
    is_normalized = is_indel &( (ea_len==1) &(nea_len>1) | (ea_len>1) &(nea_len==1) )

    # Applied strictly in this order: each step rewrites STATUS digit 5 only
    # where it still holds the listed "before" character.
    status_updates = (
        (is_invalid, "9", "6"),
        (is_eanea_na, "9", "7"),
        (is_eanea_fixed&is_not_normalized, "9", "5"),
        (is_eanea_fixed&is_snp, "9", "0"),
        (is_eanea_fixed&is_indel, "9", "4"),
        (is_eanea_fixed&is_normalized, "4", "3"),
    )
    for mask, before, after in status_updates:
        if sum(mask)>0:
            sumstats.loc[mask,status] = vchange_status(sumstats.loc[mask,status], 5,before,after)

    finished(log,verbose,_end_line)
    return sumstats
|
|
721
791
|
|
|
722
792
|
###############################################################################################################
|
|
723
793
|
# 20220721
|
|
724
794
|
|
|
725
795
|
def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
796
|
+
##start function with col checking##########################################################
|
|
797
|
+
_start_line = "normalize indels"
|
|
798
|
+
_end_line = "normalizing indels"
|
|
799
|
+
_start_cols =[ea, nea,status]
|
|
800
|
+
_start_function = ".normalize()"
|
|
801
|
+
_must_args ={}
|
|
802
|
+
|
|
803
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
804
|
+
log=log,
|
|
805
|
+
verbose=verbose,
|
|
806
|
+
start_line=_start_line,
|
|
807
|
+
end_line=_end_line,
|
|
808
|
+
start_cols=_start_cols,
|
|
809
|
+
start_function=_start_function,
|
|
810
|
+
**_must_args)
|
|
811
|
+
if is_enough_info == False: return sumstats
|
|
812
|
+
############################################################################################
|
|
813
|
+
|
|
732
814
|
#variants_to_check = status_match(sumstats[status],5,[4,5]) #
|
|
733
815
|
#r'\w\w\w\w[45]\w\w'
|
|
734
816
|
variants_to_check = sumstats[status].str[4].str.match(r'4|5', case=False, flags=0, na=False)
|
|
735
817
|
if sum(variants_to_check)==0:
|
|
736
|
-
|
|
737
|
-
|
|
818
|
+
log.write(" -No available variants to normalize..", verbose=verbose)
|
|
819
|
+
log.write("Finished normalizing variants successfully!", verbose=verbose)
|
|
738
820
|
return sumstats
|
|
739
821
|
###############################################################################################################
|
|
740
822
|
if sum(variants_to_check)>0:
|
|
@@ -742,46 +824,46 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
|
|
|
742
824
|
n_cores=1
|
|
743
825
|
pool = Pool(n_cores)
|
|
744
826
|
map_func = partial(normalizeallele,pos=pos,nea=nea,ea=ea,status=status)
|
|
745
|
-
df_split = np.array_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
|
|
827
|
+
#df_split = np.array_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
|
|
828
|
+
df_split = _df_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
|
|
746
829
|
normalized_pd = pd.concat(pool.map(map_func,df_split))
|
|
747
830
|
pool.close()
|
|
748
831
|
pool.join()
|
|
749
832
|
###############################################################################################################
|
|
750
833
|
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
if
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
before_normalize_id = sumstats.loc[variants_to_check,rsid]
|
|
759
|
-
else:
|
|
760
|
-
before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
|
|
761
|
-
|
|
762
|
-
log.write(" -Not normalized allele IDs:",end="")
|
|
763
|
-
for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
|
|
764
|
-
log.write(i,end=" ",show_time=False)
|
|
765
|
-
log.write("... \n",end="",show_time=False)
|
|
766
|
-
|
|
767
|
-
log.write(" -Not normalized allele:",end="")
|
|
768
|
-
for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
|
|
769
|
-
log.write(i,end="",show_time=False)
|
|
770
|
-
log.write("... \n",end="",show_time=False)
|
|
771
|
-
log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.")
|
|
834
|
+
before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
|
|
835
|
+
changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
|
|
836
|
+
if changed_num>0:
|
|
837
|
+
if snpid in sumstats.columns:
|
|
838
|
+
before_normalize_id = sumstats.loc[variants_to_check,snpid]
|
|
839
|
+
elif rsid in sumstats.columns:
|
|
840
|
+
before_normalize_id = sumstats.loc[variants_to_check,rsid]
|
|
772
841
|
else:
|
|
773
|
-
|
|
842
|
+
before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
|
|
843
|
+
|
|
844
|
+
log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
|
|
845
|
+
for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
|
|
846
|
+
log.write(i,end=" ",show_time=False)
|
|
847
|
+
log.write("... \n",end="",show_time=False, verbose=verbose)
|
|
848
|
+
|
|
849
|
+
log.write(" -Not normalized allele:",end="", verbose=verbose)
|
|
850
|
+
for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
|
|
851
|
+
log.write(i,end="",show_time=False, verbose=verbose)
|
|
852
|
+
log.write("... \n",end="",show_time=False, verbose=verbose)
|
|
853
|
+
log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
|
|
854
|
+
else:
|
|
855
|
+
log.write(" -All variants are already normalized..", verbose=verbose)
|
|
774
856
|
###################################################################################################################
|
|
775
|
-
categories = set(sumstats
|
|
776
|
-
sumstats
|
|
777
|
-
sumstats
|
|
857
|
+
categories = set(sumstats[ea])|set(sumstats[nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
|
|
858
|
+
sumstats[ea] = pd.Categorical(sumstats[ea],categories = categories)
|
|
859
|
+
sumstats[nea] = pd.Categorical(sumstats[nea],categories = categories )
|
|
778
860
|
sumstats.loc[variants_to_check,[pos,nea,ea,status]] = normalized_pd.values
|
|
779
861
|
try:
|
|
780
|
-
sumstats
|
|
862
|
+
sumstats[pos] = sumstats[pos].astype('Int64')
|
|
781
863
|
except:
|
|
782
|
-
sumstats
|
|
864
|
+
sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
|
|
783
865
|
|
|
784
|
-
|
|
866
|
+
finished(log,verbose,_end_line)
|
|
785
867
|
return sumstats
|
|
786
868
|
|
|
787
869
|
def normalizeallele(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
|
|
@@ -846,6 +928,52 @@ def add_tolerence(stats, float_tolerence, mode):
|
|
|
846
928
|
stats = (stats[0] , stats[1] + float_tolerence if stats[0]!=float("Inf") else float("Inf"))
|
|
847
929
|
return stats
|
|
848
930
|
|
|
931
|
+
|
|
932
|
+
def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, verbose, dtype="Int64"):
    """Coerce one statistic column to ``dtype`` and drop rows outside ``var_range``.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Table to check; the ``header`` column is overwritten in place with
        its converted values before filtering.
    var_range : tuple
        ``(low, high)``; rows are kept when ``low <= value <= high``.
    header : str
        Column to check.  Nothing happens unless it is listed in
        ``coltocheck`` and present in ``sumstats``.
    coltocheck : collection of str
        Columns the caller asked to check.
    cols_to_check : list
        Receives ``header`` when the column is actually checked.
    log : object
        Logger exposing ``write(*args, verbose=...)`` and ``warning(msg)``.
    verbose : bool
        Forwarded to ``log.write``.
    dtype : str
        Target dtype.  Integer dtypes floor the values first; float dtypes
        are converted directly; any other value leaves the column as is.

    Returns
    -------
    pandas.DataFrame
        ``sumstats`` itself when the column is skipped or is ``STATUS``,
        otherwise the rows whose value is inside the range (NA is invalid).
    """
    pre_number=len(sumstats)
    if header not in coltocheck or header not in sumstats.columns:
        return sumstats

    cols_to_check.append(header)
    if header=="STATUS":
        log.write(" -Checking STATUS and converting STATUS to categories....", verbose=verbose)
        categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
        sumstats[header] = pd.Categorical(sumstats[header],categories=categories)
        return sumstats

    if dtype in ("Int64","Int32","int","int32","int64"):
        log.write(" -Checking if {} <= {} <= {} ...".format( var_range[0] ,header, var_range[1]), verbose=verbose)
        sumstats[header] = np.floor(pd.to_numeric(sumstats[header], errors='coerce')).astype(dtype)

    elif dtype in ("Float64","Float32","float","float64","float32"):
        log.write(" -Checking if {} < {} < {} ...".format( var_range[0] ,header, var_range[1]),verbose=verbose)
        sumstats[header] = pd.to_numeric(sumstats[header], errors='coerce').astype(dtype)

    is_valid = (sumstats[header]>=var_range[0]) & (sumstats[header]<=var_range[1])
    # comparisons against NA yield NA for nullable dtypes; treat them as invalid
    is_valid = is_valid.fillna(False)

    if header=="P":
        is_low_p = sumstats["P"] == 0
        if sum(is_low_p) >0:
            log.warning("Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(sum(is_low_p)))
            log.warning("Please consider using MLOG10P instead.")

    if sum(~is_valid)>0:
        # Prefer SNPID over rsID; with neither column only the values are shown.
        id_to_use = next((col for col in ("SNPID","rsID") if col in sumstats.columns), None)
        # Example output is purely informational and must never abort the check.
        try:
            if id_to_use is not None:
                invalid_ids = sumstats.loc[~is_valid, id_to_use].head().astype("string")
                log.write(" -Examples of invalid variants({}): {} ...".format(id_to_use, ",".join(invalid_ids.to_list()) ), verbose=verbose)
            invalid_values = sumstats.loc[~is_valid, header].head().astype("string").fillna("NA")
            log.write(" -Examples of invalid values ({}): {} ...".format(header, ",".join(invalid_values.to_list()) ), verbose=verbose)
        except Exception:
            pass

    sumstats = sumstats.loc[is_valid,:]
    after_number=len(sumstats)
    log.write(" -Removed {} variants with bad/na {}.".format(pre_number - after_number, header), verbose=verbose)
    return sumstats
|
|
976
|
+
|
|
849
977
|
def sanitycheckstats(sumstats,
|
|
850
978
|
coltocheck=None,
|
|
851
979
|
n=(0,2**31-1),
|
|
@@ -853,8 +981,10 @@ def sanitycheckstats(sumstats,
|
|
|
853
981
|
ncontrol=(0,2**31-1),
|
|
854
982
|
eaf=(0,1),
|
|
855
983
|
mac=(0,2**31-1),
|
|
984
|
+
maf=(0,0.5),
|
|
856
985
|
chisq=(0,float("Inf")),
|
|
857
986
|
z=(-9999,9999),
|
|
987
|
+
t=(-99999,99999),
|
|
858
988
|
f=(0,float("Inf")),
|
|
859
989
|
p=(0,1),
|
|
860
990
|
mlog10p=(0,9999),
|
|
@@ -885,10 +1015,30 @@ def sanitycheckstats(sumstats,
|
|
|
885
1015
|
HR_95U: float64 , HR_95L >0
|
|
886
1016
|
INFO: float32 , 1>=INFO>0
|
|
887
1017
|
Z float64 , -9999 < Z < 9999
|
|
1018
|
+
T float64 , -99999 < T < 99999
|
|
888
1019
|
F float64 , F > 0
|
|
889
1020
|
'''
|
|
1021
|
+
##start function with col checking##########################################################
|
|
1022
|
+
_start_line = "perform sanity check for statistics"
|
|
1023
|
+
_end_line = "sanity check for statistics"
|
|
1024
|
+
_start_cols =[]
|
|
1025
|
+
_start_function = ".check_sanity()"
|
|
1026
|
+
_must_args ={}
|
|
1027
|
+
|
|
1028
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
1029
|
+
log=log,
|
|
1030
|
+
verbose=verbose,
|
|
1031
|
+
start_line=_start_line,
|
|
1032
|
+
end_line=_end_line,
|
|
1033
|
+
start_cols=_start_cols,
|
|
1034
|
+
start_function=_start_function,
|
|
1035
|
+
**_must_args)
|
|
1036
|
+
if is_enough_info == False: return sumstats
|
|
1037
|
+
############################################################################################
|
|
890
1038
|
|
|
1039
|
+
log.write(" -Comparison tolerance for floats: {}".format(float_tolerence), verbose=verbose)
|
|
891
1040
|
eaf = add_tolerence(eaf, float_tolerence, "lr")
|
|
1041
|
+
maf = add_tolerence(maf, float_tolerence, "lr")
|
|
892
1042
|
beta = add_tolerence(beta, float_tolerence, "lr")
|
|
893
1043
|
se = add_tolerence(se, float_tolerence, "lr")
|
|
894
1044
|
mlog10p = add_tolerence(mlog10p, float_tolerence, "lr")
|
|
@@ -903,233 +1053,83 @@ def sanitycheckstats(sumstats,
|
|
|
903
1053
|
p = add_tolerence(p, float_tolerence, "lr")
|
|
904
1054
|
f = add_tolerence(f, float_tolerence, "lr")
|
|
905
1055
|
chisq = add_tolerence(chisq, float_tolerence, "lr")
|
|
906
|
-
|
|
907
|
-
|
|
1056
|
+
############################################################################################
|
|
908
1057
|
## add direction
|
|
909
1058
|
if coltocheck is None:
|
|
910
1059
|
coltocheck = ["P","MLOG10P","INFO","Z","BETA","SE","EAF","CHISQ","F","N","N_CASE","N_CONTROL","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","STATUS"]
|
|
911
|
-
|
|
912
|
-
check_dataframe_shape(sumstats, log, verbose)
|
|
1060
|
+
|
|
913
1061
|
cols_to_check=[]
|
|
914
1062
|
oringinal_number=len(sumstats)
|
|
915
1063
|
sumstats = sumstats.copy()
|
|
916
1064
|
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
cols_to_check.append("N")
|
|
922
|
-
if verbose: log.write(" -Checking if ",n[0],"<=N<=",n[1]," ...")
|
|
923
|
-
sumstats.loc[:,"N"] = np.floor(pd.to_numeric(sumstats.loc[:,"N"], errors='coerce')).astype("Int64")
|
|
924
|
-
sumstats = sumstats.loc[(sumstats["N"]>=n[0]) & (sumstats["N"]<=n[1]),:]
|
|
925
|
-
after_number=len(sumstats)
|
|
926
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N.")
|
|
927
|
-
pre_number=len(sumstats)
|
|
928
|
-
if "N_CASE" in coltocheck and "N_CASE" in sumstats.columns:
|
|
929
|
-
cols_to_check.append("N_CASE")
|
|
930
|
-
if verbose: log.write(" -Checking if ",ncase[0],"<=N_CASE<=",ncase[1]," ...")
|
|
931
|
-
sumstats.loc[:,"N_CASE"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CASE"], errors='coerce')).astype("Int64")
|
|
932
|
-
sumstats = sumstats.loc[(sumstats["N_CASE"]>=ncase[0]) & (sumstats["N_CASE"]<=ncase[1]),:]
|
|
933
|
-
after_number=len(sumstats)
|
|
934
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N_CASE.")
|
|
935
|
-
pre_number=len(sumstats)
|
|
936
|
-
if "N_CONTROL" in coltocheck and "N_CONTROL" in sumstats.columns:
|
|
937
|
-
cols_to_check.append("N_CONTROL")
|
|
938
|
-
if verbose: log.write(" -Checking if ",ncontrol[0],"<=N_CONTROL<=",ncontrol[1]," ...")
|
|
939
|
-
sumstats.loc[:,"N_CONTROL"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CONTROL"], errors='coerce')).astype("Int64")
|
|
940
|
-
sumstats = sumstats.loc[(sumstats["N_CONTROL"]>=ncontrol[0]) & (sumstats["N_CONTROL"]<=ncontrol[1]),:]
|
|
941
|
-
after_number=len(sumstats)
|
|
942
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N_CONTROL.")
|
|
1065
|
+
###Int64 ################################################################################################################################################
|
|
1066
|
+
sumstats = check_range(sumstats, var_range=n, header="N", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
|
|
1067
|
+
sumstats = check_range(sumstats, var_range=ncase, header="N_CASE", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
|
|
1068
|
+
sumstats = check_range(sumstats, var_range=ncontrol, header="N_CONTROL", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
|
|
943
1069
|
|
|
1070
|
+
###float32 ################################################################################################################################################
|
|
1071
|
+
sumstats = check_range(sumstats, var_range=eaf, header="EAF", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
|
|
1072
|
+
sumstats = check_range(sumstats, var_range=maf, header="MAF", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
|
|
1073
|
+
sumstats = check_range(sumstats, var_range=info, header="INFO", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
|
|
944
1074
|
|
|
945
|
-
###
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
macu = ( sumstats["_MAC"] <= mac[1])
|
|
963
|
-
sumstats = sumstats.loc[macl&macu,:]
|
|
964
|
-
sumstats = sumstats.drop(labels=["_MAF","_MAC"],axis=1)
|
|
965
|
-
after_number=len(sumstats)
|
|
966
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad MAC.")
|
|
967
|
-
|
|
968
|
-
###TEST STATISTICS################################################################################################################################################
|
|
969
|
-
pre_number=len(sumstats)
|
|
970
|
-
if "CHISQ" in coltocheck and "CHISQ" in sumstats.columns:
|
|
971
|
-
cols_to_check.append("CHISQ")
|
|
972
|
-
if verbose: log.write(" -Checking if ",chisq[0],"<CHISQ<",chisq[1]," ...")
|
|
973
|
-
sumstats.loc[:,"CHISQ"] = pd.to_numeric(sumstats.loc[:,"CHISQ"], errors='coerce').astype("float64")
|
|
974
|
-
sumstats = sumstats.loc[(sumstats["CHISQ"]>chisq[0]) & (sumstats["CHISQ"]<chisq[1]),:]
|
|
975
|
-
after_number=len(sumstats)
|
|
976
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad CHISQ.")
|
|
977
|
-
|
|
978
|
-
pre_number=len(sumstats)
|
|
979
|
-
if "Z" in coltocheck and "Z" in sumstats.columns:
|
|
980
|
-
cols_to_check.append("Z")
|
|
981
|
-
if verbose: log.write(" -Checking if ",z[0],"<Z<",z[1]," ...")
|
|
982
|
-
sumstats.loc[:,"Z"] = pd.to_numeric(sumstats.loc[:,"Z"], errors='coerce').astype("float64")
|
|
983
|
-
sumstats = sumstats.loc[(sumstats["Z"]>z[0]) & (sumstats["Z"]<z[1]),:]
|
|
984
|
-
after_number=len(sumstats)
|
|
985
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad Z.")
|
|
986
|
-
|
|
987
|
-
pre_number=len(sumstats)
|
|
988
|
-
if "F" in coltocheck and "F" in sumstats.columns:
|
|
989
|
-
cols_to_check.append("F")
|
|
990
|
-
if verbose: log.write(" -Checking if ",f[0],"<F<",f[1]," ...")
|
|
991
|
-
sumstats.loc[:,"F"] = pd.to_numeric(sumstats.loc[:,"F"], errors='coerce').astype("float64")
|
|
992
|
-
sumstats = sumstats.loc[(sumstats["F"]>f[0]) & (sumstats["F"]<f[1]),:]
|
|
993
|
-
after_number=len(sumstats)
|
|
994
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad F.")
|
|
995
|
-
|
|
996
|
-
###P ################################################################################################################################################
|
|
997
|
-
pre_number=len(sumstats)
|
|
998
|
-
if "P" in coltocheck and "P" in sumstats.columns:
|
|
999
|
-
cols_to_check.append("P")
|
|
1000
|
-
if verbose: log.write(" -Checking if ",p[0],"< P <",p[1]," ...")
|
|
1001
|
-
sumstats.loc[:,"P"] = pd.to_numeric(sumstats.loc[:,"P"], errors='coerce').astype("float64")
|
|
1002
|
-
sumstats = sumstats.loc[(sumstats["P"]>p[0]) & (sumstats["P"]<p[1]),:]
|
|
1003
|
-
|
|
1004
|
-
is_low_p = sumstats["P"] == 0
|
|
1005
|
-
if sum(is_low_p) >0:
|
|
1006
|
-
log.write(" -WARNING! Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(sum(is_low_p)), verbose=verbose)
|
|
1007
|
-
log.write(" -WARNING! Please consider using MLOG10P instead.", verbose=verbose)
|
|
1008
|
-
after_number=len(sumstats)
|
|
1009
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad P.")
|
|
1010
|
-
|
|
1011
|
-
pre_number=len(sumstats)
|
|
1012
|
-
if "MLOG10P" in coltocheck and "MLOG10P" in sumstats.columns:
|
|
1013
|
-
cols_to_check.append("MLOG10P")
|
|
1014
|
-
if verbose: log.write(" -Checking if ",mlog10p[0],"<MLOG10P<",mlog10p[1]," ...")
|
|
1015
|
-
sumstats.loc[:,"MLOG10P"] = pd.to_numeric(sumstats.loc[:,"MLOG10P"], errors='coerce').astype("float64")
|
|
1016
|
-
sumstats = sumstats.loc[(sumstats["MLOG10P"]>mlog10p[0]) & (sumstats["MLOG10P"]<mlog10p[1]),:]
|
|
1017
|
-
after_number=len(sumstats)
|
|
1018
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad MLOG10P.")
|
|
1019
|
-
|
|
1020
|
-
###EFFECT ################################################################################################################################################
|
|
1021
|
-
pre_number=len(sumstats)
|
|
1022
|
-
if "BETA" in coltocheck and "BETA" in sumstats.columns:
|
|
1023
|
-
cols_to_check.append("BETA")
|
|
1024
|
-
if verbose: log.write(" -Checking if ",beta[0],"<BETA<",beta[1]," ...")
|
|
1025
|
-
sumstats.loc[:,"BETA"] = pd.to_numeric(sumstats.loc[:,"BETA"], errors='coerce').astype("float64")
|
|
1026
|
-
sumstats = sumstats.loc[(sumstats["BETA"]>beta[0]) & (sumstats["BETA"]<beta[1]),:]
|
|
1027
|
-
after_number=len(sumstats)
|
|
1028
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad BETA.")
|
|
1029
|
-
|
|
1030
|
-
pre_number=len(sumstats)
|
|
1031
|
-
if "SE" in coltocheck and "SE" in sumstats.columns:
|
|
1032
|
-
cols_to_check.append("SE")
|
|
1033
|
-
if verbose: log.write(" -Checking if ",se[0],"<SE<",se[1]," ...")
|
|
1034
|
-
sumstats.loc[:,"SE"] = pd.to_numeric(sumstats.loc[:,"SE"], errors='coerce').astype("float64")
|
|
1035
|
-
sumstats = sumstats.loc[(sumstats["SE"]>se[0]) & (sumstats["SE"]<se[1]),:]
|
|
1036
|
-
after_number=len(sumstats)
|
|
1037
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad SE.")
|
|
1038
|
-
|
|
1039
|
-
pre_number=len(sumstats)
|
|
1040
|
-
if "OR" in coltocheck and "OR" in sumstats.columns:
|
|
1041
|
-
cols_to_check.append("OR")
|
|
1042
|
-
if verbose: log.write(" -Checking if ",OR[0],"<log(OR)<",OR[1]," ...")
|
|
1043
|
-
sumstats.loc[:,"OR"] = pd.to_numeric(sumstats.loc[:,"OR"], errors='coerce').astype("float64")
|
|
1044
|
-
sumstats = sumstats.loc[(np.log(sumstats["OR"])>OR[0]) & (np.log(sumstats["OR"])<OR[1]),:]
|
|
1045
|
-
after_number=len(sumstats)
|
|
1046
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad OR.")
|
|
1047
|
-
|
|
1048
|
-
pre_number=len(sumstats)
|
|
1049
|
-
if "OR_95L" in coltocheck and "OR_95L" in sumstats.columns:
|
|
1050
|
-
cols_to_check.append("OR_95L")
|
|
1051
|
-
if verbose: log.write(" -Checking if ",OR_95L[0],"<OR_95L<",OR_95L[1]," ...")
|
|
1052
|
-
sumstats.loc[:,"OR_95L"] = pd.to_numeric(sumstats.loc[:,"OR_95L"], errors='coerce').astype("float64")
|
|
1053
|
-
sumstats = sumstats.loc[(sumstats["OR_95L"]>OR_95L[0]) & (sumstats["OR_95L"]<OR_95L[1]),:]
|
|
1054
|
-
after_number=len(sumstats)
|
|
1055
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad OR_95L.")
|
|
1056
|
-
|
|
1057
|
-
pre_number=len(sumstats)
|
|
1058
|
-
if "OR_95U" in coltocheck and "OR_95U" in sumstats.columns:
|
|
1059
|
-
cols_to_check.append("OR_95U")
|
|
1060
|
-
if verbose: log.write(" -Checking if ",OR_95U[0],"<OR_95U<",OR_95U[1]," ...")
|
|
1061
|
-
sumstats.loc[:,"OR_95U"] = pd.to_numeric(sumstats.loc[:,"OR_95U"], errors='coerce').astype("float64")
|
|
1062
|
-
sumstats = sumstats.loc[(sumstats["OR_95U"]>OR_95U[0]) & (sumstats["OR_95U"]<OR_95U[1]),:]
|
|
1063
|
-
after_number=len(sumstats)
|
|
1064
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad OR_95U.")
|
|
1065
|
-
|
|
1066
|
-
pre_number=len(sumstats)
|
|
1067
|
-
if "HR" in coltocheck and "HR" in sumstats.columns:
|
|
1068
|
-
cols_to_check.append("HR")
|
|
1069
|
-
if verbose: log.write(" -Checking if ",HR[0],"<log(HR)<",HR[1]," ...")
|
|
1070
|
-
sumstats.loc[:,"HR"] = pd.to_numeric(sumstats.loc[:,"HR"], errors='coerce').astype("float64")
|
|
1071
|
-
sumstats = sumstats.loc[(np.log(sumstats["HR"])>HR[0]) & (np.log(sumstats["HR"])<HR[1]),:]
|
|
1072
|
-
after_number=len(sumstats)
|
|
1073
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad HR.")
|
|
1074
|
-
|
|
1075
|
-
pre_number=len(sumstats)
|
|
1076
|
-
if "HR_95L" in coltocheck and "HR_95L" in sumstats.columns:
|
|
1077
|
-
cols_to_check.append("HR_95L")
|
|
1078
|
-
if verbose: log.write(" -Checking if ",HR_95L[0],"<HR_95L<",HR_95L[1]," ...")
|
|
1079
|
-
sumstats.loc[:,"HR_95L"] = pd.to_numeric(sumstats.loc[:,"HR_95L"], errors='coerce').astype("float64")
|
|
1080
|
-
sumstats = sumstats.loc[(sumstats["HR_95L"]>HR_95L[0]) & (sumstats["HR_95L"]<HR_95L[1]),:]
|
|
1081
|
-
after_number=len(sumstats)
|
|
1082
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad HR_95L.")
|
|
1083
|
-
|
|
1084
|
-
pre_number=len(sumstats)
|
|
1085
|
-
if "HR_95U" in coltocheck and "HR_95U" in sumstats.columns:
|
|
1086
|
-
cols_to_check.append("HR_95U")
|
|
1087
|
-
if verbose: log.write(" -Checking if ",HR_95U[0],"<HR_95U<",HR_95U[1]," ...")
|
|
1088
|
-
sumstats.loc[:,"HR_95U"] = pd.to_numeric(sumstats.loc[:,"HR_95U"], errors='coerce').astype("float64")
|
|
1089
|
-
sumstats = sumstats.loc[(sumstats["HR_95U"]>HR_95U[0]) & (sumstats["HR_95U"]<HR_95U[1]),:]
|
|
1090
|
-
after_number=len(sumstats)
|
|
1091
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad HR_95U.")
|
|
1092
|
-
#INFO #################################################################################################################
|
|
1093
|
-
pre_number=len(sumstats)
|
|
1094
|
-
if "INFO" in coltocheck and "INFO" in sumstats.columns:
|
|
1095
|
-
cols_to_check.append("INFO")
|
|
1096
|
-
if verbose: log.write(" -Checking if ",info[0],"<INFO<",info[1]," ...")
|
|
1097
|
-
sumstats.loc[:,"INFO"] = pd.to_numeric(sumstats.loc[:,"INFO"], errors='coerce').astype("float32")
|
|
1098
|
-
sumstats = sumstats.loc[(sumstats["INFO"]>info[0]) & (sumstats["INFO"]<info[1]),:]
|
|
1099
|
-
after_number=len(sumstats)
|
|
1100
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad INFO.")
|
|
1101
|
-
###STATUS ################################################################################################################################################
|
|
1102
|
-
pre_number=len(sumstats)
|
|
1103
|
-
if "STATUS" in coltocheck and "STATUS" in sumstats.columns:
|
|
1104
|
-
cols_to_check.append("STATUS")
|
|
1105
|
-
if verbose: log.write(" -Checking STATUS and converting STATUS to categories....")
|
|
1106
|
-
categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
1107
|
-
sumstats.loc[:,"STATUS"] = pd.Categorical(sumstats["STATUS"],categories=categories)
|
|
1108
|
-
|
|
1109
|
-
#pre_number=len(sumstats)
|
|
1110
|
-
#sumstats = sumstats.dropna(subset=cols_to_check)
|
|
1111
|
-
after_number=len(sumstats)
|
|
1112
|
-
#if verbose:log.write(" -Removed {} variants with NAs in the checked columns...".format(pre_number - after_number))
|
|
1075
|
+
###float64 ################################################################################################################################################
|
|
1076
|
+
sumstats = check_range(sumstats, var_range=chisq, header="CHISQ", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1077
|
+
sumstats = check_range(sumstats, var_range=z, header="Z", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1078
|
+
sumstats = check_range(sumstats, var_range=t, header="T", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1079
|
+
sumstats = check_range(sumstats, var_range=f, header="F", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1080
|
+
sumstats = check_range(sumstats, var_range=p, header="P", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1081
|
+
sumstats = check_range(sumstats, var_range=mlog10p, header="MLOG10P", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1082
|
+
sumstats = check_range(sumstats, var_range=beta, header="BETA", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1083
|
+
sumstats = check_range(sumstats, var_range=se, header="SE", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1084
|
+
sumstats = check_range(sumstats, var_range=OR, header="OR", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1085
|
+
sumstats = check_range(sumstats, var_range=OR_95L, header="OR_95L", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1086
|
+
sumstats = check_range(sumstats, var_range=OR_95U, header="OR_95U", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1087
|
+
sumstats = check_range(sumstats, var_range=HR, header="HR", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1088
|
+
sumstats = check_range(sumstats, var_range=HR_95L, header="HR_95L", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1089
|
+
sumstats = check_range(sumstats, var_range=HR_95U, header="HR_95U", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1090
|
+
###STATUS ###############################################################################################################################################
|
|
1091
|
+
sumstats = check_range(sumstats, var_range=None, header="STATUS", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="category")
|
|
1113
1092
|
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1093
|
+
after_number=len(sumstats)
|
|
1094
|
+
log.write(" -Removed "+str(oringinal_number - after_number)+" variants with bad statistics in total.",verbose=verbose)
|
|
1095
|
+
log.write(" -Data types for each column:",verbose=verbose)
|
|
1096
|
+
check_datatype(sumstats,verbose=verbose, log=log)
|
|
1097
|
+
finished(log,verbose,_end_line)
|
|
1119
1098
|
return sumstats
|
|
1120
1099
|
|
|
1121
1100
|
### check consistency #############################################################################################################################################
|
|
1122
1101
|
|
|
1123
|
-
def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verbose=True,log=Log()):
|
|
1124
|
-
|
|
1125
|
-
|
|
1102
|
+
def _check_data_consistency(sumstats, beta="BETA", se="SE", p="P",mlog10p="MLOG10P",rtol=1e-3, atol=1e-3, equal_nan=True, verbose=True,log=Log()):
|
|
1103
|
+
##start function with col checking##########################################################
|
|
1104
|
+
_start_line = "check data consistency across columns"
|
|
1105
|
+
_end_line = "checking data consistency across columns"
|
|
1106
|
+
_start_cols =[]
|
|
1107
|
+
_start_function = ".check_data_consistency()"
|
|
1108
|
+
_must_args ={}
|
|
1109
|
+
|
|
1110
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
1111
|
+
log=log,
|
|
1112
|
+
verbose=verbose,
|
|
1113
|
+
start_line=_start_line,
|
|
1114
|
+
end_line=_end_line,
|
|
1115
|
+
start_cols=_start_cols,
|
|
1116
|
+
start_function=_start_function,
|
|
1117
|
+
**_must_args)
|
|
1118
|
+
if is_enough_info == False: return sumstats
|
|
1119
|
+
############################################################################################
|
|
1120
|
+
|
|
1126
1121
|
log.write(" -Tolerance: {} (Relative) and {} (Absolute)".format(rtol, atol),verbose=verbose)
|
|
1122
|
+
check_status = 0
|
|
1127
1123
|
|
|
1128
|
-
|
|
1129
|
-
|
|
1124
|
+
if "SNPID" in sumstats.columns:
|
|
1125
|
+
id_to_use = "SNPID"
|
|
1126
|
+
elif "rsID" in sumstats.columns:
|
|
1130
1127
|
id_to_use = "rsID"
|
|
1131
1128
|
else:
|
|
1132
|
-
|
|
1129
|
+
log.write(" -SNPID/rsID not available...SKipping",verbose=verbose)
|
|
1130
|
+
log.write("Finished checking data consistency across columns.",verbose=verbose)
|
|
1131
|
+
return 0
|
|
1132
|
+
|
|
1133
1133
|
|
|
1134
1134
|
if "BETA" in sumstats.columns and "SE" in sumstats.columns:
|
|
1135
1135
|
if "MLOG10P" in sumstats.columns:
|
|
@@ -1138,10 +1138,11 @@ def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verb
|
|
|
1138
1138
|
is_close = np.isclose(betase_derived_mlog10p, sumstats["MLOG10P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
|
|
1139
1139
|
diff = betase_derived_mlog10p - sumstats["MLOG10P"]
|
|
1140
1140
|
if sum(~is_close)>0:
|
|
1141
|
-
log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose)
|
|
1142
|
-
log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose)
|
|
1141
|
+
log.write(" -Not consistent: {} variant(s)".format(sum(~is_close)),verbose=verbose)
|
|
1142
|
+
log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max()),verbose=verbose)
|
|
1143
1143
|
else:
|
|
1144
1144
|
log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
|
|
1145
|
+
check_status=1
|
|
1145
1146
|
|
|
1146
1147
|
if "P" in sumstats.columns:
|
|
1147
1148
|
log.write(" -Checking if BETA/SE-derived-P is consistent with P...",verbose=verbose)
|
|
@@ -1149,10 +1150,11 @@ def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verb
|
|
|
1149
1150
|
is_close = np.isclose(betase_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
|
|
1150
1151
|
diff = betase_derived_p - sumstats["P"]
|
|
1151
1152
|
if sum(~is_close)>0:
|
|
1152
|
-
log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose)
|
|
1153
|
-
log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose)
|
|
1153
|
+
log.write(" -Not consistent: {} variant(s)".format(sum(~is_close)),verbose=verbose)
|
|
1154
|
+
log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max()),verbose=verbose)
|
|
1154
1155
|
else:
|
|
1155
1156
|
log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
|
|
1157
|
+
check_status=1
|
|
1156
1158
|
|
|
1157
1159
|
if "MLOG10P" in sumstats.columns and "P" in sumstats.columns:
|
|
1158
1160
|
log.write(" -Checking if MLOG10P-derived-P is consistent with P...",verbose=verbose)
|
|
@@ -1160,25 +1162,30 @@ def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verb
|
|
|
1160
1162
|
is_close = np.isclose(mlog10p_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
|
|
1161
1163
|
diff = mlog10p_derived_p - sumstats["P"]
|
|
1162
1164
|
if sum(~is_close)>0:
|
|
1163
|
-
log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose)
|
|
1164
|
-
log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose)
|
|
1165
|
+
log.write(" -Not consistent: {} variant(s)".format(sum(~is_close)),verbose=verbose)
|
|
1166
|
+
log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max()),verbose=verbose)
|
|
1165
1167
|
else:
|
|
1166
1168
|
log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
|
|
1169
|
+
check_status=1
|
|
1167
1170
|
|
|
1168
1171
|
if "N" in sumstats.columns and "N_CONTROL" in sumstats.columns and "N_CASE" in sumstats.columns:
|
|
1169
|
-
|
|
1170
|
-
is_close = sumstats
|
|
1171
|
-
#is_close = np.isclose(sumstats
|
|
1172
|
-
diff = abs(sumstats
|
|
1172
|
+
log.write(" -Checking if N is consistent with N_CASE + N_CONTROL ...", verbose=verbose)
|
|
1173
|
+
is_close = sumstats["N"] == sumstats["N_CASE"] + sumstats["N_CONTROL"]
|
|
1174
|
+
#is_close = np.isclose(sumstats["N"], sumstats["N_CASE"] + sumstats["N_CONTROL"] , rtol=rtol, atol=atol, equal_nan=equal_nan)
|
|
1175
|
+
diff = abs(sumstats["N"] - (sumstats["N_CASE"] + sumstats["N_CONTROL"] ))
|
|
1173
1176
|
if sum(~is_close)>0:
|
|
1174
|
-
log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose)
|
|
1175
|
-
log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose)
|
|
1177
|
+
log.write(" -Not consistent: {} variant(s)".format(sum(~is_close)),verbose=verbose)
|
|
1178
|
+
log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max()),verbose=verbose)
|
|
1176
1179
|
else:
|
|
1177
1180
|
log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
|
|
1181
|
+
check_status=1
|
|
1182
|
+
|
|
1183
|
+
if check_status==1:
|
|
1184
|
+
log.write(" -Note: if the max difference is greater than expected, please check your original sumstats.",verbose=verbose)
|
|
1185
|
+
else:
|
|
1186
|
+
log.write(" -No availalbe columns for data consistency checking...Skipping...",verbose=verbose)
|
|
1187
|
+
finished(log,verbose,_end_line)
|
|
1178
1188
|
|
|
1179
|
-
log.write(" -Note: if the max difference is greater than expected, please check your original sumstats.",verbose=verbose)
|
|
1180
|
-
|
|
1181
|
-
if verbose: log.write("Finished checking data consistency across columns.")
|
|
1182
1189
|
###############################################################################################################
|
|
1183
1190
|
# 20220426
|
|
1184
1191
|
def get_reverse_complementary_allele(a):
|
|
@@ -1201,178 +1208,166 @@ def flip_direction(string):
|
|
|
1201
1208
|
else: #sometime it is 0
|
|
1202
1209
|
flipped_string+=char
|
|
1203
1210
|
return flipped_string
|
|
1204
|
-
|
|
1211
|
+
|
|
1212
|
+
def flip_by_swap(sumstats, matched_index, log, verbose):
    """Exchange the EA and NEA values for the selected rows.

    Parameters
    ----------
    sumstats : DataFrame-like table of summary statistics; modified in place.
    matched_index : row selector passed to ``.loc`` identifying the rows to swap.
    log : logger object exposing ``write(message, verbose=...)``.
    verbose : forwarded to ``log.write``.

    Returns
    -------
    The same ``sumstats`` object. Nothing is changed unless both the
    "NEA" and "EA" columns are present.
    """
    available = sumstats.columns
    if "NEA" not in available or "EA" not in available:
        # Swapping needs both allele columns; leave the table untouched.
        return sumstats

    log.write(" -Swapping column: NEA <=> EA...", verbose=verbose)
    # Read both columns in the opposite order first, then write them back,
    # so neither column is overwritten before it has been read.
    exchanged = sumstats.loc[matched_index, ["EA", "NEA"]].values
    sumstats.loc[matched_index, ["NEA", "EA"]] = exchanged
    return sumstats
|
|
1217
|
+
|
|
1218
|
+
def flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Flip ratio statistics (OR / HR and their 95% bounds) for the selected rows.

    For each of "OR" and "HR" that is present, the value becomes
    ``factor / value``. The confidence bounds are inverted and exchanged:
    the new upper bound is ``factor / old lower bound`` and the new lower
    bound is ``factor / old upper bound``.

    Parameters
    ----------
    sumstats : DataFrame-like table of summary statistics; modified in place.
    matched_index : row selector passed to ``.loc`` identifying the rows to flip.
    log : logger object exposing ``write(message, verbose=...)``.
    verbose : forwarded to ``log.write``.
    cols : currently unused; kept for interface compatibility.
    factor : numerator of the inversion (default 1).

    Returns
    -------
    The same ``sumstats`` object.
    """
    for ratio in ("OR", "HR"):
        lower = ratio + "_95L"
        upper = ratio + "_95U"

        # Snapshot the pre-flip bounds BEFORE writing anything. Writing the
        # upper bound first and then deriving the lower bound from the
        # already-overwritten upper bound would hand back the original
        # lower bound unchanged.
        original_bounds = {
            name: sumstats.loc[matched_index, name].values.copy()
            for name in (lower, upper)
            if name in sumstats.columns
        }

        if ratio in sumstats.columns:
            log.write(" -Flipping column: {0} = 1 / {0}...".format(ratio), verbose=verbose)
            sumstats.loc[matched_index, ratio] = factor / sumstats.loc[matched_index, ratio].values

        # NOTE: when only one bound column exists, the opposite column is
        # created for the selected rows and the source column is left as is.
        for source, target in ((lower, upper), (upper, lower)):
            if source in original_bounds:
                log.write(" -Flipping column: {} = 1 / {}...".format(target, source), verbose=verbose)
                sumstats.loc[matched_index, target] = factor / original_bounds[source]
    return sumstats
|
|
1238
|
+
|
|
1239
|
+
def flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Flip the effect allele frequency for the selected rows.

    When an "EAF" column is present, each selected value becomes
    ``factor - EAF``.

    Parameters
    ----------
    sumstats : DataFrame-like table of summary statistics; modified in place.
    matched_index : row selector passed to ``.loc`` identifying the rows to flip.
    log : logger object exposing ``write(message, verbose=...)``.
    verbose : forwarded to ``log.write``.
    cols : currently unused; kept for interface compatibility.
    factor : value the frequency is subtracted from (default 1).

    Returns
    -------
    The same ``sumstats`` object.
    """
    if "EAF" not in sumstats.columns:
        # No frequency column, so there is nothing to flip.
        return sumstats

    log.write(" -Flipping column: EAF = 1 - EAF...", verbose=verbose)
    current_eaf = sumstats.loc[matched_index, "EAF"].values
    sumstats.loc[matched_index, "EAF"] = factor - current_eaf
    return sumstats
|
|
1244
|
+
|
|
1245
|
+
def flip_by_sign(sumstats, matched_index, log, verbose, cols=None):
    """Negate signed statistics for the selected rows.

    Columns handled when present:
      * "BETA", "Z", "T": the value is negated in place.
      * "BETA_95L" / "BETA_95U": negated and exchanged (the new upper bound
        is minus the old lower bound, and vice versa).
      * "DIRECTION": each string is passed through ``flip_direction``.

    Parameters
    ----------
    sumstats : DataFrame-like table of summary statistics; modified in place.
    matched_index : row selector passed to ``.loc`` identifying the rows to flip.
    log : logger object exposing ``write(message, verbose=...)``.
    verbose : forwarded to ``log.write``.
    cols : currently unused; kept for interface compatibility.

    Returns
    -------
    The same ``sumstats`` object.
    """
    if "BETA" in sumstats.columns:
        log.write(" -Flipping column: BETA = - BETA...", verbose=verbose)
        sumstats.loc[matched_index, "BETA"] = - sumstats.loc[matched_index, "BETA"].values

    # Snapshot the pre-flip bounds BEFORE writing anything; deriving the
    # lower bound from an already-overwritten upper bound would restore the
    # original lower bound instead of flipping it.
    original_bounds = {
        name: sumstats.loc[matched_index, name].values.copy()
        for name in ("BETA_95L", "BETA_95U")
        if name in sumstats.columns
    }
    # NOTE: when only one bound column exists, the opposite column is
    # created for the selected rows and the source column is left as is.
    for source, target in (("BETA_95L", "BETA_95U"), ("BETA_95U", "BETA_95L")):
        if source in original_bounds:
            log.write(" -Flipping column: {} = - {}...".format(target, source), verbose=verbose)
            sumstats.loc[matched_index, target] = - original_bounds[source]

    # Each test statistic is negated into ITS OWN column (the T flip must
    # not be written into Z).
    for stat in ("Z", "T"):
        if stat in sumstats.columns:
            log.write(" -Flipping column: {0} = - {0}...".format(stat), verbose=verbose)
            sumstats.loc[matched_index, stat] = - sumstats.loc[matched_index, stat].values

    if "DIRECTION" in sumstats.columns:
        log.write(" -Flipping column: DIRECTION +-?0 <=> -+?0 ...", verbose=verbose)
        sumstats.loc[matched_index, "DIRECTION"] = sumstats.loc[matched_index, "DIRECTION"].apply(flip_direction)
    return sumstats
|
|
1265
|
+
|
|
1205
1266
|
def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1267
|
+
##start function with col checking##########################################################
|
|
1268
|
+
_start_line = "adjust statistics based on STATUS code"
|
|
1269
|
+
_end_line = "adjusting statistics based on STATUS code"
|
|
1270
|
+
_start_cols =[]
|
|
1271
|
+
_start_function = ".flip_allele_stats()"
|
|
1272
|
+
_must_args ={}
|
|
1273
|
+
|
|
1274
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
1275
|
+
log=log,
|
|
1276
|
+
verbose=verbose,
|
|
1277
|
+
start_line=_start_line,
|
|
1278
|
+
end_line=_end_line,
|
|
1279
|
+
start_cols=_start_cols,
|
|
1280
|
+
start_function=_start_function,
|
|
1281
|
+
**_must_args)
|
|
1282
|
+
if is_enough_info == False: return sumstats
|
|
1283
|
+
############################################################################################
|
|
1284
|
+
|
|
1285
|
+
if_stats_flipped = False
|
|
1209
1286
|
###################get reverse complementary####################
|
|
1210
1287
|
pattern = r"\w\w\w\w\w[45]\w"
|
|
1211
1288
|
#matched_index = status_match(sumstats[status],6,[4,5]) #
|
|
1212
1289
|
matched_index = sumstats[status].str[5].str.match(r"4|5")
|
|
1213
1290
|
if sum(matched_index)>0:
|
|
1214
|
-
|
|
1215
|
-
|
|
1291
|
+
log.write("Start to convert alleles to reverse complement for SNPs with status xxxxx[45]x...{}".format(_get_version()), verbose=verbose)
|
|
1292
|
+
log.write(" -Flipping "+ str(sum(matched_index)) +" variants...", verbose=verbose)
|
|
1216
1293
|
if ("NEA" in sumstats.columns) and ("EA" in sumstats.columns) :
|
|
1217
|
-
|
|
1294
|
+
log.write(" -Converting to reverse complement : EA and NEA...", verbose=verbose)
|
|
1218
1295
|
reverse_complement_nea = sumstats.loc[matched_index,'NEA'].apply(lambda x :get_reverse_complementary_allele(x))
|
|
1219
1296
|
reverse_complement_ea = sumstats.loc[matched_index,'EA'].apply(lambda x :get_reverse_complementary_allele(x))
|
|
1220
|
-
categories = set(sumstats
|
|
1221
|
-
sumstats
|
|
1222
|
-
sumstats
|
|
1297
|
+
categories = set(sumstats['EA'])|set(sumstats['NEA']) |set(reverse_complement_ea) |set(reverse_complement_nea)
|
|
1298
|
+
sumstats['EA']=pd.Categorical(sumstats['EA'],categories = categories)
|
|
1299
|
+
sumstats['NEA']=pd.Categorical(sumstats['NEA'],categories = categories )
|
|
1223
1300
|
sumstats.loc[matched_index,['NEA']] = reverse_complement_nea
|
|
1224
1301
|
sumstats.loc[matched_index,['EA']] = reverse_complement_ea
|
|
1225
1302
|
sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 6, "4","2")
|
|
1226
|
-
|
|
1227
|
-
|
|
1303
|
+
log.write(" -Changed the status for flipped variants : xxxxx4x -> xxxxx2x", verbose=verbose)
|
|
1304
|
+
if_stats_flipped = True
|
|
1228
1305
|
###################flip ref####################
|
|
1229
1306
|
pattern = r"\w\w\w\w\w[35]\w"
|
|
1230
1307
|
#matched_index = status_match(sumstats[status],6,[3,5]) #sumstats[status].str.match(pattern)
|
|
1231
1308
|
matched_index = sumstats[status].str[5].str.match(r"3|5")
|
|
1232
1309
|
if sum(matched_index)>0:
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
if "BETA_95L" in sumstats.columns:
|
|
1242
|
-
if verbose: log.write(" -Flipping column: BETA_95L = - BETA_95L...")
|
|
1243
|
-
sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
|
|
1244
|
-
if "BETA_95U" in sumstats.columns:
|
|
1245
|
-
if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
|
|
1246
|
-
sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
|
|
1247
|
-
if "EAF" in sumstats.columns:
|
|
1248
|
-
if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
|
|
1249
|
-
sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
|
|
1250
|
-
if "OR" in sumstats.columns:
|
|
1251
|
-
if verbose: log.write(" -Flipping column: OR = 1 / OR...")
|
|
1252
|
-
sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
|
|
1253
|
-
if "OR_95L" in sumstats.columns:
|
|
1254
|
-
if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
|
|
1255
|
-
sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
|
|
1256
|
-
if "OR_95U" in sumstats.columns:
|
|
1257
|
-
if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
|
|
1258
|
-
sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
|
|
1259
|
-
if "HR" in sumstats.columns:
|
|
1260
|
-
if verbose: log.write(" -Flipping column: HR = 1 / HR...")
|
|
1261
|
-
sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
|
|
1262
|
-
if "HR_95L" in sumstats.columns:
|
|
1263
|
-
if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
|
|
1264
|
-
sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
|
|
1265
|
-
if "HR_95U" in sumstats.columns:
|
|
1266
|
-
if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
|
|
1267
|
-
sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
|
|
1268
|
-
if "DIRECTION" in sumstats.columns:
|
|
1269
|
-
if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
|
|
1270
|
-
sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
|
|
1310
|
+
log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x: ALT->EA , REF->NEA ...{}".format(_get_version()), verbose=verbose)
|
|
1311
|
+
log.write(" -Flipping "+ str(sum(matched_index)) +" variants...", verbose=verbose)
|
|
1312
|
+
|
|
1313
|
+
flip_by_swap(sumstats, matched_index, log, verbose)
|
|
1314
|
+
flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
|
|
1315
|
+
flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1316
|
+
flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1317
|
+
|
|
1271
1318
|
#change status
|
|
1272
|
-
|
|
1319
|
+
log.write(" -Changed the status for flipped variants : xxxxx[35]x -> xxxxx[12]x", verbose=verbose)
|
|
1273
1320
|
sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 6, "35","12")
|
|
1321
|
+
if_stats_flipped = True
|
|
1274
1322
|
|
|
1275
1323
|
###################flip ref for undistingushable indels####################
|
|
1276
1324
|
pattern = r"\w\w\w\w[123][67]6"
|
|
1277
1325
|
#matched_index = status_match(sumstats[status],6,[1,2,3])|status_match(sumstats[status],6,[6,7])|status_match(sumstats[status],7,6) #sumstats[status].str.match(pattern)
|
|
1278
1326
|
matched_index = sumstats[status].str[4:].str.match(r"[123][67]6")
|
|
1279
1327
|
if sum(matched_index)>0:
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
if "BETA_95L" in sumstats.columns:
|
|
1289
|
-
if verbose: log.write(" -Flipping column: BETA_95L = - BETA_95L...")
|
|
1290
|
-
sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
|
|
1291
|
-
if "BETA_95U" in sumstats.columns:
|
|
1292
|
-
if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
|
|
1293
|
-
sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
|
|
1294
|
-
if "EAF" in sumstats.columns:
|
|
1295
|
-
if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
|
|
1296
|
-
sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
|
|
1297
|
-
if "OR" in sumstats.columns:
|
|
1298
|
-
if verbose: log.write(" -Flipping column: OR = 1 / OR...")
|
|
1299
|
-
sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
|
|
1300
|
-
if "OR_95L" in sumstats.columns:
|
|
1301
|
-
if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
|
|
1302
|
-
sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
|
|
1303
|
-
if "OR_95U" in sumstats.columns:
|
|
1304
|
-
if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
|
|
1305
|
-
sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
|
|
1306
|
-
if "HR" in sumstats.columns:
|
|
1307
|
-
if verbose: log.write(" -Flipping column: HR = 1 / HR...")
|
|
1308
|
-
sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
|
|
1309
|
-
if "HR_95L" in sumstats.columns:
|
|
1310
|
-
if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
|
|
1311
|
-
sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
|
|
1312
|
-
if "HR_95U" in sumstats.columns:
|
|
1313
|
-
if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
|
|
1314
|
-
sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
|
|
1315
|
-
if "DIRECTION" in sumstats.columns:
|
|
1316
|
-
if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
|
|
1317
|
-
sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
|
|
1328
|
+
log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]: ALT->EA , REF->NEA...{}".format(_get_version()), verbose=verbose)
|
|
1329
|
+
log.write(" -Flipping "+ str(sum(matched_index)) +" variants...", verbose=verbose)
|
|
1330
|
+
|
|
1331
|
+
flip_by_swap(sumstats, matched_index, log, verbose)
|
|
1332
|
+
flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
|
|
1333
|
+
flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1334
|
+
flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1335
|
+
|
|
1318
1336
|
#change status
|
|
1319
|
-
|
|
1337
|
+
log.write(" -Changed the status for flipped variants xxxx[123][67]6 -> xxxx[123][67]4", verbose=verbose)
|
|
1320
1338
|
sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 7, "6","4")
|
|
1339
|
+
if_stats_flipped = True
|
|
1321
1340
|
# flip ref
|
|
1322
1341
|
###################flip statistics for reverse strand panlindromic variants####################
|
|
1323
1342
|
pattern = r"\w\w\w\w\w[012]5"
|
|
1324
1343
|
#matched_index = status_match(sumstats[status],6,[0,1,2]) | status_match(sumstats[status],7,[5])#sumstats[status].str.match(pattern)
|
|
1325
1344
|
matched_index = sumstats[status].str[5:].str.match(r"05|15|25")
|
|
1326
1345
|
if sum(matched_index)>0:
|
|
1327
|
-
|
|
1328
|
-
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
|
|
1335
|
-
if "BETA_95U" in sumstats.columns:
|
|
1336
|
-
if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
|
|
1337
|
-
sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
|
|
1338
|
-
if "EAF" in sumstats.columns:
|
|
1339
|
-
if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
|
|
1340
|
-
sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
|
|
1341
|
-
if "OR" in sumstats.columns:
|
|
1342
|
-
if verbose: log.write(" -Flipping column: OR = 1 / OR...")
|
|
1343
|
-
sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
|
|
1344
|
-
if "OR_95L" in sumstats.columns:
|
|
1345
|
-
if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
|
|
1346
|
-
sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
|
|
1347
|
-
if "OR_95U" in sumstats.columns:
|
|
1348
|
-
if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
|
|
1349
|
-
sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
|
|
1350
|
-
if "HR" in sumstats.columns:
|
|
1351
|
-
if verbose: log.write(" -Flipping column: HR = 1 / HR...")
|
|
1352
|
-
sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
|
|
1353
|
-
if "HR_95L" in sumstats.columns:
|
|
1354
|
-
if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
|
|
1355
|
-
sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
|
|
1356
|
-
if "HR_95U" in sumstats.columns:
|
|
1357
|
-
if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
|
|
1358
|
-
sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
|
|
1359
|
-
if "DIRECTION" in sumstats.columns:
|
|
1360
|
-
if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
|
|
1361
|
-
sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
|
|
1346
|
+
log.write("Start to flip allele-specific stats for palindromic SNPs with status xxxxx[12]5: (-)strand <=> (+)strand...{}".format(_get_version()), verbose=verbose)
|
|
1347
|
+
log.write(" -Flipping "+ str(sum(matched_index)) +" variants...", verbose=verbose)
|
|
1348
|
+
|
|
1349
|
+
flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
|
|
1350
|
+
flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1351
|
+
flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1352
|
+
|
|
1362
1353
|
#change status
|
|
1363
|
-
|
|
1354
|
+
log.write(" -Changed the status for flipped variants: xxxxx[012]5: -> xxxxx[012]2", verbose=verbose)
|
|
1364
1355
|
sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 7, "5","2")
|
|
1365
|
-
|
|
1356
|
+
if_stats_flipped = True
|
|
1357
|
+
|
|
1358
|
+
if if_stats_flipped != True:
|
|
1359
|
+
log.write(" -No statistics have been changed.")
|
|
1360
|
+
|
|
1361
|
+
finished(log, verbose, _end_line)
|
|
1366
1362
|
return sumstats
|
|
1367
|
-
""
|
|
1368
1363
|
|
|
1369
1364
|
|
|
1370
1365
|
###############################################################################################################
|
|
1371
1366
|
# 20220426
|
|
1372
1367
|
def liftover_snv(row,chrom,converter,to_build):
|
|
1373
1368
|
status_pre=""
|
|
1374
|
-
status_end=row[1][2]+"9"+row[1][4]+"99"
|
|
1375
|
-
pos_0_based = int(row[0]) - 1
|
|
1369
|
+
status_end=row.iloc[1][2]+"9"+row.iloc[1][4]+"99"
|
|
1370
|
+
pos_0_based = int(row.iloc[0]) - 1
|
|
1376
1371
|
results = converter[chrom][pos_0_based]
|
|
1377
1372
|
if converter[chrom][pos_0_based]:
|
|
1378
1373
|
# return chrom, pos_1_based
|
|
@@ -1402,29 +1397,42 @@ def liftover_variant(sumstats,
|
|
|
1402
1397
|
return sumstats
|
|
1403
1398
|
|
|
1404
1399
|
def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True, verbose=True,log=Log()):
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1400
|
+
##start function with col checking##########################################################
|
|
1401
|
+
_start_line = "perform liftover"
|
|
1402
|
+
_end_line = "liftover"
|
|
1403
|
+
_start_cols =[chrom,pos,status]
|
|
1404
|
+
_start_function = ".liftover()"
|
|
1405
|
+
_must_args ={}
|
|
1406
|
+
|
|
1407
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
1408
|
+
log=log,
|
|
1409
|
+
verbose=verbose,
|
|
1410
|
+
start_line=_start_line,
|
|
1411
|
+
end_line=_end_line,
|
|
1412
|
+
start_cols=_start_cols,
|
|
1413
|
+
start_function=_start_function,
|
|
1414
|
+
n_cores=n_cores,
|
|
1415
|
+
**_must_args)
|
|
1416
|
+
if is_enough_info == False: return sumstats
|
|
1417
|
+
############################################################################################
|
|
1418
|
+
|
|
1419
|
+
log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build, verbose=verbose)
|
|
1413
1420
|
# valid chr and pos
|
|
1414
1421
|
pattern = r"\w\w\w0\w\w\w"
|
|
1415
1422
|
to_lift = sumstats[status].str.match(pattern)
|
|
1416
1423
|
sumstats = sumstats.loc[to_lift,:].copy()
|
|
1417
|
-
|
|
1424
|
+
log.write(" -Converting variants with status code xxx0xxx :"+str(len(sumstats))+"...", verbose=verbose)
|
|
1418
1425
|
###########################################################################
|
|
1419
1426
|
if sum(to_lift)>0:
|
|
1420
1427
|
if sum(to_lift)<10000:
|
|
1421
1428
|
n_cores=1
|
|
1422
1429
|
|
|
1423
|
-
df_split = np.array_split(sumstats
|
|
1430
|
+
#df_split = np.array_split(sumstats[[chrom,pos,status]], n_cores)
|
|
1431
|
+
df_split = _df_split(sumstats[[chrom,pos,status]], n_cores)
|
|
1424
1432
|
pool = Pool(n_cores)
|
|
1425
1433
|
#df = pd.concat(pool.starmap(func, df_split))
|
|
1426
1434
|
func=liftover_variant
|
|
1427
|
-
sumstats
|
|
1435
|
+
sumstats[[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status),df_split))
|
|
1428
1436
|
pool.close()
|
|
1429
1437
|
pool.join()
|
|
1430
1438
|
############################################################################
|
|
@@ -1432,78 +1440,183 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
|
|
|
1432
1440
|
unmap_num = len(sumstats.loc[sumstats[pos].isna(),:])
|
|
1433
1441
|
|
|
1434
1442
|
if remove is True:
|
|
1435
|
-
|
|
1443
|
+
log.write(" -Removed unmapped variants: "+str(unmap_num), verbose=verbose)
|
|
1436
1444
|
sumstats = sumstats.loc[~sumstats[pos].isna(),:]
|
|
1437
1445
|
|
|
1438
1446
|
# after liftover check chr and pos
|
|
1439
1447
|
sumstats = fixchr(sumstats,chrom=chrom,add_prefix="",remove=remove, verbose=True)
|
|
1440
1448
|
sumstats = fixpos(sumstats,pos=pos,remove=remove, verbose=True)
|
|
1441
1449
|
|
|
1442
|
-
|
|
1450
|
+
finished(log,verbose,_end_line)
|
|
1443
1451
|
return sumstats
|
|
1444
1452
|
|
|
1445
1453
|
###############################################################################################################
|
|
1446
1454
|
# 20220426
|
|
1447
1455
|
def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=Log()):
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1456
|
+
##start function with col checking##########################################################
|
|
1457
|
+
_start_line = "sort the genome coordinates"
|
|
1458
|
+
_end_line = "sorting coordinates"
|
|
1459
|
+
_start_cols =[chrom,pos]
|
|
1460
|
+
_start_function = ".sort_coordinate()"
|
|
1461
|
+
_must_args ={}
|
|
1462
|
+
|
|
1463
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
1464
|
+
log=log,
|
|
1465
|
+
verbose=verbose,
|
|
1466
|
+
start_line=_start_line,
|
|
1467
|
+
end_line=_end_line,
|
|
1468
|
+
start_cols=_start_cols,
|
|
1469
|
+
start_function=_start_function,
|
|
1470
|
+
**_must_args)
|
|
1471
|
+
if is_enough_info == False: return sumstats
|
|
1472
|
+
############################################################################################
|
|
1454
1473
|
|
|
1455
1474
|
try:
|
|
1456
1475
|
if sumstats[pos].dtype == "Int64":
|
|
1457
1476
|
pass
|
|
1458
1477
|
else:
|
|
1459
|
-
|
|
1478
|
+
log.write(" -Force converting POS to Int64...", verbose=verbose)
|
|
1460
1479
|
sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
|
|
1461
1480
|
except:
|
|
1462
1481
|
pass
|
|
1463
|
-
|
|
1464
|
-
if verbose: log.write(" -Sorting genome coordinates...")
|
|
1465
1482
|
sumstats = sumstats.sort_values(by=[chrom,pos],ascending=True,ignore_index=True)
|
|
1466
|
-
|
|
1467
|
-
|
|
1483
|
+
|
|
1484
|
+
finished(log,verbose,_end_line)
|
|
1468
1485
|
return sumstats
|
|
1469
1486
|
###############################################################################################################
|
|
1470
1487
|
# 20230430 added HR HR_95 BETA_95 N_CASE N_CONTROL
|
|
1471
|
-
def sortcolumn(sumstats,verbose=True,log=Log(),order =
|
|
1488
|
+
def sortcolumn(sumstats,verbose=True,log=Log(),order = None):
|
|
1489
|
+
##start function with col checking##########################################################
|
|
1490
|
+
_start_line = "reorder the columns"
|
|
1491
|
+
_end_line = "reordering the columns"
|
|
1492
|
+
_start_cols =[]
|
|
1493
|
+
_start_function = ".sort_column()"
|
|
1494
|
+
_must_args ={}
|
|
1495
|
+
|
|
1496
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
1497
|
+
log=log,
|
|
1498
|
+
verbose=verbose,
|
|
1499
|
+
start_line=_start_line,
|
|
1500
|
+
end_line=_end_line,
|
|
1501
|
+
start_cols=_start_cols,
|
|
1502
|
+
start_function=_start_function,
|
|
1503
|
+
**_must_args)
|
|
1504
|
+
if is_enough_info == False: return sumstats
|
|
1505
|
+
############################################################################################
|
|
1506
|
+
|
|
1507
|
+
if order is None:
|
|
1508
|
+
order = [
|
|
1472
1509
|
"SNPID","rsID", "CHR", "POS", "EA", "NEA", "EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z","T","F",
|
|
1473
|
-
"CHISQ", "P", "MLOG10P", "OR", "OR_95L", "OR_95U","HR", "HR_95L", "HR_95U","INFO", "N","N_CASE","N_CONTROL","DIRECTION","I2","P_HET","DOF","SNPR2","STATUS"
|
|
1474
|
-
]):
|
|
1475
|
-
if verbose: log.write("Start to reorder the columns...{}".format(_get_version()))
|
|
1476
|
-
check_dataframe_shape(sumstats, log, verbose)
|
|
1477
|
-
|
|
1510
|
+
"CHISQ", "P", "MLOG10P", "OR", "OR_95L", "OR_95U","HR", "HR_95L", "HR_95U","INFO", "N","N_CASE","N_CONTROL","DIRECTION","I2","P_HET","DOF","SNPR2","STATUS"]
|
|
1478
1511
|
output_columns = []
|
|
1479
1512
|
for i in order:
|
|
1480
1513
|
if i in sumstats.columns: output_columns.append(i)
|
|
1481
1514
|
for i in sumstats.columns:
|
|
1482
1515
|
if i not in order: output_columns.append(i)
|
|
1483
|
-
|
|
1484
|
-
sumstats = sumstats
|
|
1485
|
-
|
|
1516
|
+
log.write(" -Reordering columns to :", ",".join(output_columns), verbose=verbose)
|
|
1517
|
+
sumstats = sumstats[ output_columns]
|
|
1518
|
+
|
|
1519
|
+
finished(log,verbose,_end_line)
|
|
1486
1520
|
return sumstats
|
|
1487
1521
|
|
|
1488
|
-
|
|
1522
|
+
|
|
1523
|
+
###############################################################################################################
|
|
1524
|
+
def start_to(sumstats,
|
|
1525
|
+
log,
|
|
1526
|
+
verbose,
|
|
1527
|
+
start_line,
|
|
1528
|
+
end_line,
|
|
1529
|
+
start_cols,
|
|
1530
|
+
start_function,
|
|
1531
|
+
ref_vcf=None,
|
|
1532
|
+
ref_fasta=None,
|
|
1533
|
+
n_cores=None,
|
|
1534
|
+
ref_tsv=None,
|
|
1535
|
+
**args
|
|
1536
|
+
):
|
|
1537
|
+
|
|
1538
|
+
log.write("Start to {}...{}".format(start_line,_get_version()), verbose=verbose)
|
|
1539
|
+
|
|
1540
|
+
check_dataframe_shape(sumstats=sumstats,
|
|
1541
|
+
log=log,
|
|
1542
|
+
verbose=verbose)
|
|
1543
|
+
|
|
1544
|
+
is_enough_col = check_col(sumstats.columns,
|
|
1545
|
+
verbose=verbose,
|
|
1546
|
+
log=log,
|
|
1547
|
+
cols=start_cols,
|
|
1548
|
+
function=start_function)
|
|
1549
|
+
|
|
1550
|
+
if is_enough_col==True:
|
|
1551
|
+
if n_cores is not None:
|
|
1552
|
+
log.write(" -Number of threads/cores to use: {}".format(n_cores))
|
|
1553
|
+
if ref_vcf is not None:
|
|
1554
|
+
log.write(" -Reference VCF: {}".format(ref_vcf))
|
|
1555
|
+
if ref_fasta is not None:
|
|
1556
|
+
log.write(" -Reference FASTA: {}".format(ref_fasta))
|
|
1557
|
+
if ref_tsv is not None:
|
|
1558
|
+
log.write(" -Reference TSV: {}".format(ref_tsv))
|
|
1559
|
+
|
|
1560
|
+
is_args_valid = True
|
|
1561
|
+
for key, value in args.items():
|
|
1562
|
+
is_args_valid = is_args_valid & check_arg(log, verbose, key, value, start_function)
|
|
1563
|
+
is_enough_col = is_args_valid & is_enough_col
|
|
1564
|
+
|
|
1565
|
+
if is_enough_col == False:
|
|
1566
|
+
skipped(log, verbose, end_line)
|
|
1567
|
+
|
|
1568
|
+
return is_enough_col
|
|
1569
|
+
|
|
1570
|
+
def finished(log, verbose, end_line):
|
|
1571
|
+
log.write("Finished {}.".format(end_line), verbose=verbose)
|
|
1572
|
+
gc.collect()
|
|
1573
|
+
|
|
1574
|
+
def skipped(log, verbose, end_line):
|
|
1575
|
+
log.write("Skipped {}.".format(end_line), verbose=verbose)
|
|
1576
|
+
gc.collect()
|
|
1577
|
+
|
|
1578
|
+
def check_arg(log, verbose, key, value, function):
|
|
1579
|
+
if value is None:
|
|
1580
|
+
log.warning("Necessary argument {} for {} is not provided!".format(key, function))
|
|
1581
|
+
return False
|
|
1582
|
+
return True
|
|
1583
|
+
|
|
1584
|
+
def check_col(df_col_names, verbose=True, log=Log(), cols=None, function=None):
|
|
1489
1585
|
not_in_df=[]
|
|
1490
|
-
for i in
|
|
1586
|
+
for i in cols:
|
|
1491
1587
|
if type(i) is str:
|
|
1492
|
-
|
|
1588
|
+
# single check
|
|
1589
|
+
if i in df_col_names:
|
|
1493
1590
|
continue
|
|
1494
1591
|
else:
|
|
1495
1592
|
not_in_df.append(i)
|
|
1496
1593
|
else:
|
|
1594
|
+
# paried check
|
|
1497
1595
|
count=0
|
|
1498
1596
|
for j in i:
|
|
1499
|
-
if j in
|
|
1597
|
+
if j not in df_col_names:
|
|
1598
|
+
not_in_df.append(j)
|
|
1500
1599
|
count+=1
|
|
1501
|
-
|
|
1502
|
-
return False
|
|
1503
|
-
print(" -Specified columns names was not detected. Please check:"+",".join(i))
|
|
1504
|
-
|
|
1600
|
+
|
|
1505
1601
|
if len(not_in_df)>0:
|
|
1602
|
+
if function is None:
|
|
1603
|
+
to_show_title=" "
|
|
1604
|
+
else:
|
|
1605
|
+
to_show_title = " for {} ".format(function)
|
|
1606
|
+
log.warning("Necessary columns{}were not detected:{}".format(to_show_title, ",".join(not_in_df)))
|
|
1607
|
+
skipped(log, verbose, end_line=function)
|
|
1506
1608
|
return False
|
|
1507
|
-
|
|
1609
|
+
|
|
1508
1610
|
return True
|
|
1509
1611
|
|
|
1612
|
+
###############################################################################################################
|
|
1613
|
+
def _df_split(dataframe, n):
|
|
1614
|
+
chunks = []
|
|
1615
|
+
chunk_size = int(dataframe.shape[0] // n)+1
|
|
1616
|
+
|
|
1617
|
+
for index in range(0, dataframe.shape[0], chunk_size):
|
|
1618
|
+
chunks.append(
|
|
1619
|
+
dataframe.iloc[index:index + chunk_size]
|
|
1620
|
+
)
|
|
1621
|
+
|
|
1622
|
+
return chunks
|