gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/g_Log.py +14 -5
- gwaslab/g_Sumstats.py +86 -18
- gwaslab/g_SumstatsPair.py +70 -23
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +9 -4
- gwaslab/hm_harmonize_sumstats.py +88 -83
- gwaslab/io_preformat_input.py +14 -14
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +1 -1
- gwaslab/qc_fix_sumstats.py +163 -161
- gwaslab/util_ex_calculate_ldmatrix.py +2 -2
- gwaslab/util_ex_gwascatalog.py +24 -24
- gwaslab/util_ex_ldproxyfinder.py +9 -9
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +28 -28
- gwaslab/util_in_filter_value.py +91 -52
- gwaslab/util_in_get_density.py +8 -8
- gwaslab/util_in_get_sig.py +407 -65
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +18 -18
- gwaslab/viz_aux_reposition_text.py +3 -3
- gwaslab/viz_aux_save_figure.py +14 -5
- gwaslab/viz_plot_compare_af.py +29 -30
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +6 -6
- gwaslab/viz_plot_mqqplot.py +17 -3
- gwaslab/viz_plot_qqplot.py +1 -1
- gwaslab/viz_plot_regionalplot.py +33 -32
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +50 -55
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.38.dist-info/RECORD +0 -72
- /gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/qc_fix_sumstats.py
CHANGED
|
@@ -129,7 +129,7 @@ def fixID(sumstats,
|
|
|
129
129
|
sumstats.loc[ is_rsid,status] = vchange_status(sumstats.loc[ is_rsid,status], 3, "986","520")
|
|
130
130
|
sumstats.loc[~is_rsid,status] = vchange_status(sumstats.loc[~is_rsid,status], 3, "986","743")
|
|
131
131
|
|
|
132
|
-
|
|
132
|
+
log.write(" -Checking if CHR:POS:NEA:EA is mixed in rsID column ...", verbose=verbose)
|
|
133
133
|
is_rs_chrpos = sumstats[rsid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
|
|
134
134
|
|
|
135
135
|
log.write(" -Number of CHR:POS:NEA:EA mixed in rsID column :",sum(is_rs_chrpos), verbose=verbose)
|
|
@@ -140,9 +140,9 @@ def fixID(sumstats,
|
|
|
140
140
|
if fixchrpos == True:
|
|
141
141
|
# from snpid or rsid, extract CHR:POS to fix CHR and POS
|
|
142
142
|
if snpid in sumstats.columns:
|
|
143
|
-
|
|
143
|
+
log.write(" -Fixing CHR and POS...", verbose=verbose)
|
|
144
144
|
if overwrite is True:
|
|
145
|
-
|
|
145
|
+
log.write(" -Overwrite is applied...", verbose=verbose)
|
|
146
146
|
# fix all
|
|
147
147
|
to_fix = is_chrposrefalt
|
|
148
148
|
|
|
@@ -151,35 +151,39 @@ def fixID(sumstats,
|
|
|
151
151
|
to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
152
152
|
to_fix_num = sum(to_fix)
|
|
153
153
|
if to_fix_num and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
154
|
-
|
|
154
|
+
else:
|
|
155
|
+
log.write(" -No fixable variants. ...", verbose=verbose)
|
|
155
156
|
|
|
156
157
|
elif (chrom not in sumstats.columns) and (pos in sumstats.columns):
|
|
157
|
-
|
|
158
|
+
log.write(" -Initiating CHR columns...", verbose=verbose)
|
|
158
159
|
sumstats[chrom]=pd.Series(dtype="string")
|
|
159
160
|
to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
160
161
|
to_fix_num = sum(to_fix)
|
|
161
162
|
if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
162
|
-
|
|
163
|
+
else:
|
|
164
|
+
log.write(" -No fixable variants. ...", verbose=verbose)
|
|
163
165
|
|
|
164
166
|
elif (chrom in sumstats.columns) and (pos not in sumstats.columns):
|
|
165
|
-
|
|
167
|
+
log.write(" -Initiating CHR and POS column...", verbose=verbose)
|
|
166
168
|
sumstats[pos]=pd.Series(dtype="Int64")
|
|
167
169
|
to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
168
170
|
to_fix_num = sum(to_fix)
|
|
169
171
|
if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
170
|
-
|
|
172
|
+
else:
|
|
173
|
+
log.write(" -No fixable variants. ...", verbose=verbose)
|
|
171
174
|
|
|
172
175
|
else:
|
|
173
|
-
|
|
176
|
+
log.write(" -Initiating CHR and POS columns...", verbose=verbose)
|
|
174
177
|
sumstats[chrom]=pd.Series(dtype="string")
|
|
175
178
|
sumstats[pos]=pd.Series(dtype="Int64")
|
|
176
179
|
to_fix = is_chrposrefalt
|
|
177
180
|
to_fix_num = sum(to_fix)
|
|
178
181
|
if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
179
|
-
|
|
182
|
+
else:
|
|
183
|
+
log.write(" -No fixable variants. ...", verbose=verbose)
|
|
180
184
|
|
|
181
185
|
if sum(to_fix)>0:
|
|
182
|
-
|
|
186
|
+
log.write(" -Filling CHR and POS columns using valid SNPID's chr:pos...", verbose=verbose)
|
|
183
187
|
# format and qc filled chr and pos
|
|
184
188
|
|
|
185
189
|
sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
@@ -191,36 +195,40 @@ def fixID(sumstats,
|
|
|
191
195
|
#sumstats.loc[to_fix,status] = vchange_status(sumstats.loc[to_fix,status], 4, "98765432","00000000")
|
|
192
196
|
|
|
193
197
|
if rsid in sumstats.columns:
|
|
194
|
-
|
|
198
|
+
log.write(" -Fixing CHR and POS using chr:pos:ref:alt format variants in rsID column...", verbose=verbose)
|
|
195
199
|
if overwrite is True:
|
|
196
|
-
|
|
200
|
+
log.write(" -Overwrite is applied...", verbose=verbose)
|
|
197
201
|
to_fix = is_rs_chrpos
|
|
198
202
|
elif (chrom in sumstats.columns) and (pos in sumstats.columns) :
|
|
199
203
|
to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
200
204
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
201
|
-
|
|
205
|
+
else:
|
|
206
|
+
log.write(" -No fixable variants ...", verbose=verbose)
|
|
202
207
|
elif (chrom not in sumstats.columns) and (pos in sumstats.columns):
|
|
203
|
-
|
|
208
|
+
log.write(" -Initiating CHR columns...", verbose=verbose)
|
|
204
209
|
sumstats[chrom]=pd.Series(dtype="string")
|
|
205
210
|
to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
206
211
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
207
|
-
|
|
212
|
+
else:
|
|
213
|
+
log.write(" -No fixable variants ...", verbose=verbose)
|
|
208
214
|
elif (chrom in sumstats.columns) and (pos not in sumstats.columns):
|
|
209
|
-
|
|
215
|
+
log.write(" -Initiating CHR and POS column...", verbose=verbose)
|
|
210
216
|
sumstats[pos]=pd.Series(dtype="Int64")
|
|
211
217
|
to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
212
218
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
213
|
-
|
|
219
|
+
else:
|
|
220
|
+
log.write(" -No fixable variants ...", verbose=verbose)
|
|
214
221
|
else:
|
|
215
|
-
|
|
222
|
+
log.write(" -Initiating CHR and POS columns...", verbose=verbose)
|
|
216
223
|
sumstats[chrom]=pd.Series(dtype="string")
|
|
217
224
|
sumstats[pos]=pd.Series(dtype="Int64")
|
|
218
225
|
to_fix = is_rs_chrpos
|
|
219
226
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
220
|
-
|
|
227
|
+
else:
|
|
228
|
+
log.write(" -No fixable variants ...", verbose=verbose)
|
|
221
229
|
|
|
222
230
|
if sum(to_fix)>0:
|
|
223
|
-
|
|
231
|
+
log.write(" -Filling CHR and POS columns using chr:pos:ref:alt format variants in rsID column...", verbose=verbose)
|
|
224
232
|
sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,rsid].str.split(':|_|-',n=2).str[0]
|
|
225
233
|
sumstats.loc[to_fix,pos] = sumstats.loc[to_fix,rsid].str.split(':|_|-',n=2).str[1]
|
|
226
234
|
#sumstats.loc[to_fix,pos] = np.floor(pd.to_numeric(sumstats.loc[to_fix,rsid].str.split(':|_|-',x).get(1), errors='coerce')).astype('Int64')
|
|
@@ -228,40 +236,40 @@ def fixID(sumstats,
|
|
|
228
236
|
|
|
229
237
|
############################ fixing chr pos###################################################
|
|
230
238
|
if fixeanea == True:
|
|
231
|
-
|
|
239
|
+
log.warning("gwaslab assumes SNPID is in the format of CHR:POS:NEA:EA / CHR:POS:REF:ALT", verbose=verbose)
|
|
232
240
|
if overwrite is True:
|
|
233
|
-
|
|
241
|
+
log.write(" -Overwrite mode is applied...", verbose=verbose)
|
|
234
242
|
to_fix = is_chrposrefalt
|
|
235
243
|
elif (nea in sumstats.columns) and (nea in sumstats.columns):
|
|
236
244
|
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
237
245
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
238
246
|
elif (nea in sumstats.columns) and (ea not in sumstats.columns):
|
|
239
|
-
|
|
247
|
+
log.write(" -Initiating EA columns...", verbose=verbose)
|
|
240
248
|
sumstats[ea]=pd.Series(dtype="string")
|
|
241
249
|
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
242
250
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
243
251
|
elif (nea not in sumstats.columns) and (ea in sumstats.columns):
|
|
244
|
-
|
|
252
|
+
log.write(" -Initiating NEA columns...", verbose=verbose)
|
|
245
253
|
sumstats[nea]=pd.Series(dtype="string")
|
|
246
254
|
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
247
255
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
248
256
|
else:
|
|
249
|
-
|
|
257
|
+
log.write(" -Initiating EA and NEA columns...", verbose=verbose)
|
|
250
258
|
sumstats[nea]=pd.Series(dtype="string")
|
|
251
259
|
sumstats[ea]=pd.Series(dtype="string")
|
|
252
260
|
to_fix = is_chrposrefalt
|
|
253
261
|
if sum(to_fix)>0:
|
|
254
|
-
|
|
262
|
+
log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...", verbose=verbose)
|
|
255
263
|
#
|
|
256
264
|
if sum(to_fix)>0:
|
|
257
|
-
|
|
265
|
+
log.write(" -Filling "+str(sum(to_fix))+" EA and NEA columns using SNPID's CHR:POS:NEA:EA...", verbose=verbose)
|
|
258
266
|
#
|
|
259
267
|
if fixeanea_flip == True:
|
|
260
|
-
|
|
268
|
+
log.write(" -Flipped : CHR:POS:NEA:EA -> CHR:POS:EA:NEA ", verbose=verbose)
|
|
261
269
|
sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
|
|
262
270
|
sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
|
|
263
271
|
else:
|
|
264
|
-
|
|
272
|
+
log.write(" -Chr:pos:a1:a2...a1->EA , a2->NEA ", verbose=verbose)
|
|
265
273
|
sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
|
|
266
274
|
sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
|
|
267
275
|
|
|
@@ -273,12 +281,12 @@ def fixID(sumstats,
|
|
|
273
281
|
############################ fixing id ###################################################
|
|
274
282
|
if fixsep == True:
|
|
275
283
|
if snpid in sumstats.columns:
|
|
276
|
-
|
|
284
|
+
log.write(' -Replacing [_-] in SNPID with ":" ...', verbose=verbose)
|
|
277
285
|
sumstats[snpid] = sumstats[snpid].str.replace(r"[_-]",":",regex=True)
|
|
278
286
|
|
|
279
287
|
if fixprefix == True:
|
|
280
288
|
if snpid in sumstats.columns:
|
|
281
|
-
|
|
289
|
+
log.write(' -Removing /^chr/ in SNPID ...', verbose=verbose)
|
|
282
290
|
prefix_removed = sumstats[snpid].str.extract(r'^(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
283
291
|
sumstats.loc[~prefix_removed.isna(),snpid] = prefix_removed[~prefix_removed.isna()]
|
|
284
292
|
|
|
@@ -327,22 +335,23 @@ def fixID(sumstats,
|
|
|
327
335
|
sumstats.loc[to_part_fix,snpid] = sumstats.loc[to_part_fix,chrom].astype("string") + ":"+sumstats.loc[to_part_fix,pos].astype("string")
|
|
328
336
|
if sum(to_full_fix)>0:
|
|
329
337
|
sumstats.loc[to_full_fix,snpid] = sumstats.loc[to_full_fix,chrom].astype("string") + ":"+sumstats.loc[to_full_fix,pos].astype("string") +":"+ sumstats.loc[to_full_fix,nea].astype("string") +":"+ sumstats.loc[to_full_fix,ea].astype("string")
|
|
330
|
-
|
|
331
|
-
|
|
338
|
+
log.write(" -Filling "+str(sum(to_part_fix)-sum(to_full_fix)) +" SNPID using CHR:POS...", verbose=verbose)
|
|
339
|
+
log.write(" -Filling "+str(sum(to_full_fix)) +" SNPID using CHR:POS:NEA:EA...", verbose=verbose)
|
|
332
340
|
sumstats.loc[(to_full_fix),status] = vchange_status(sumstats.loc[(to_full_fix),status],3,"975","630")
|
|
333
341
|
sumstats.loc[(to_part_fix),status] = vchange_status(sumstats.loc[(to_part_fix),status],3,"975","842")
|
|
334
342
|
|
|
335
343
|
else:
|
|
336
344
|
#when these is no ea or ena, just fix to chr:pos
|
|
337
345
|
to_part_fix = to_fix & sumstats[chrom].notnull() & sumstats[pos].notnull()
|
|
338
|
-
|
|
346
|
+
log.write(" -Filling "+str(sum(to_part_fix)) +" SNPID using CHR POS...", verbose=verbose)
|
|
339
347
|
if sum(to_part_fix)>0:
|
|
340
348
|
sumstats.loc[to_part_fix,snpid] = sumstats.loc[to_part_fix,chrom].astype("string") + ":"+sumstats.loc[to_part_fix,pos].astype("string")
|
|
341
349
|
sumstats.loc[to_part_fix,status] = vchange_status(sumstats.loc[(to_part_fix),status],3,"975","842")
|
|
342
350
|
|
|
343
351
|
after_number=sum(sumstats[snpid].isna())
|
|
344
|
-
|
|
345
|
-
|
|
352
|
+
log.write(" -Fixed "+ str(pre_number - after_number) +" variants ID...", verbose=verbose)
|
|
353
|
+
else:
|
|
354
|
+
log.write(" -ID unfixable: no CHR and POS columns or no SNPID. ", verbose=verbose)
|
|
346
355
|
|
|
347
356
|
finished(log,verbose,_end_line)
|
|
348
357
|
return sumstats
|
|
@@ -378,71 +387,71 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
378
387
|
if is_enough_info == False: return sumstats
|
|
379
388
|
############################################################################################
|
|
380
389
|
|
|
381
|
-
|
|
390
|
+
log.write(" -Removing mode:{}".format(mode), verbose=verbose)
|
|
382
391
|
# sort the variants using the specified column before removing
|
|
383
392
|
if keep_col is not None :
|
|
384
393
|
if keep_col in sumstats.columns:
|
|
385
|
-
|
|
394
|
+
log.write("Start to sort the sumstats using {}...".format(keep_col), verbose=verbose)
|
|
386
395
|
sumstats = sumstats.sort_values(by=keep_col,ascending=keep_ascend)
|
|
387
396
|
else:
|
|
388
|
-
|
|
397
|
+
log.write("Column" + keep_col +" was not detected... skipping... ", verbose=verbose)
|
|
389
398
|
total_number = len(sumstats)
|
|
390
399
|
|
|
391
400
|
# remove by duplicated SNPID
|
|
392
401
|
if (snpid in sumstats.columns) and ("d" in mode or "s" in mode):
|
|
393
|
-
|
|
402
|
+
log.write("Start to remove duplicated variants based on snpid...{}".format(_get_version()), verbose=verbose)
|
|
394
403
|
check_dataframe_shape(sumstats, log, verbose)
|
|
395
|
-
|
|
404
|
+
log.write(" -Which variant to keep: ", keep , verbose=verbose)
|
|
396
405
|
pre_number =len(sumstats)
|
|
397
406
|
if snpid in sumstats.columns:
|
|
398
407
|
# keep na and remove duplicated
|
|
399
408
|
sumstats = sumstats.loc[sumstats[snpid].isna() | (~sumstats.duplicated(subset=[snpid], keep=keep)),:]
|
|
400
409
|
after_number=len(sumstats)
|
|
401
|
-
|
|
410
|
+
log.write(" -Removed ",pre_number -after_number ," based on SNPID...", verbose=verbose)
|
|
402
411
|
|
|
403
412
|
# remove by duplicated rsID
|
|
404
413
|
if (rsid in sumstats.columns) and ("d" in mode or "r" in mode):
|
|
405
414
|
# keep na and remove duplicated
|
|
406
415
|
pre_number =len(sumstats)
|
|
407
|
-
|
|
416
|
+
log.write("Start to remove duplicated variants based on rsID...", verbose=verbose)
|
|
408
417
|
check_dataframe_shape(sumstats, log, verbose)
|
|
409
418
|
sumstats = sumstats.loc[sumstats[rsid].isna() | (~sumstats.duplicated(subset=rsid, keep=keep)),:]
|
|
410
419
|
after_number=len(sumstats)
|
|
411
|
-
|
|
420
|
+
log.write(" -Removed ",pre_number -after_number ," based on rsID...", verbose=verbose)
|
|
412
421
|
|
|
413
422
|
# remove by duplicated variants by CHR:POS:NEA:EA
|
|
414
423
|
if (chrom in sumstats.columns) and (pos in sumstats.columns) and (nea in sumstats.columns) and (ea in sumstats.columns) and ("d" in mode or "c" in mode):
|
|
415
|
-
|
|
424
|
+
log.write("Start to remove duplicated variants based on CHR,POS,EA and NEA...", verbose=verbose)
|
|
416
425
|
check_dataframe_shape(sumstats, log, verbose)
|
|
417
|
-
|
|
426
|
+
log.write(" -Which variant to keep: ", keep , verbose=verbose)
|
|
418
427
|
pre_number =len(sumstats)
|
|
419
428
|
if snpid in sumstats.columns:
|
|
420
429
|
# keep na and remove duplicated
|
|
421
430
|
sumstats = sumstats.loc[(~sumstats[[chrom,pos,ea,nea]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos,ea,nea], keep=keep)),:]
|
|
422
431
|
after_number=len(sumstats)
|
|
423
|
-
|
|
432
|
+
log.write(" -Removed ",pre_number -after_number ," based on CHR,POS,EA and NEA...", verbose=verbose)
|
|
424
433
|
|
|
425
434
|
# remove by multiallelic variants by CHR:POS
|
|
426
435
|
if (chrom in sumstats.columns) and (pos in sumstats.columns) and "m" in mode:
|
|
427
436
|
# keep na and remove duplicated
|
|
428
437
|
pre_number =len(sumstats)
|
|
429
|
-
|
|
438
|
+
log.write("Start to remove multiallelic variants based on chr:pos...", verbose=verbose)
|
|
430
439
|
check_dataframe_shape(sumstats, log, verbose)
|
|
431
|
-
|
|
440
|
+
log.write(" -Which variant to keep: ", keep , verbose=verbose)
|
|
432
441
|
sumstats = sumstats.loc[(~sumstats[[chrom,pos]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos], keep=keep)),:]
|
|
433
442
|
after_number=len(sumstats)
|
|
434
|
-
|
|
443
|
+
log.write(" -Removed ",pre_number -after_number," multiallelic variants...", verbose=verbose)
|
|
435
444
|
after_number=len(sumstats)
|
|
436
445
|
|
|
437
446
|
# resort the coordinates
|
|
438
|
-
|
|
447
|
+
log.write(" -Removed ",total_number -after_number," variants in total.", verbose=verbose)
|
|
439
448
|
if keep_col is not None :
|
|
440
|
-
|
|
449
|
+
log.write(" -Sort the coordinates based on CHR and POS...", verbose=verbose)
|
|
441
450
|
sumstats = sortcoordinate(sumstats,verbose=False)
|
|
442
451
|
|
|
443
452
|
if "n" in mode or remove==True:
|
|
444
453
|
# if remove==True, remove NAs
|
|
445
|
-
|
|
454
|
+
log.write(" -Removing NAs...", verbose=verbose)
|
|
446
455
|
pre_number =len(sumstats)
|
|
447
456
|
specified_columns = []
|
|
448
457
|
if "d" in mode:
|
|
@@ -466,7 +475,7 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
466
475
|
specified_columns.append(nea)
|
|
467
476
|
sumstats = sumstats.loc[~sumstats[specified_columns].isna().any(axis=1),:]
|
|
468
477
|
after_number=len(sumstats)
|
|
469
|
-
|
|
478
|
+
log.write(" -Removed ",pre_number -after_number," variants with NA values in {} .".format(set(specified_columns)), verbose=verbose)
|
|
470
479
|
|
|
471
480
|
finished(log,verbose,_end_line)
|
|
472
481
|
return sumstats
|
|
@@ -495,27 +504,23 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
|
|
|
495
504
|
#chrom_list = get_chr_list() #bottom
|
|
496
505
|
if chrom_list is None:
|
|
497
506
|
chrom_list = get_chr_list()
|
|
498
|
-
#if check_col(sumstats,chrom,status) is not True:
|
|
499
|
-
# if verbose: log.write(".fix_chr: Specified not detected..skipping...")
|
|
500
|
-
# return sumstats
|
|
501
|
-
|
|
502
507
|
|
|
503
508
|
# convert to string datatype
|
|
504
509
|
try:
|
|
505
|
-
|
|
510
|
+
log.write(" -Checking CHR data type...", verbose=verbose)
|
|
506
511
|
if sumstats[chrom].dtype == "string":
|
|
507
512
|
pass
|
|
508
513
|
else:
|
|
509
514
|
sumstats[chrom] = sumstats[chrom].astype("string")
|
|
510
515
|
except:
|
|
511
|
-
|
|
516
|
+
log.write(" -Force converting to pd string data type...", verbose=verbose)
|
|
512
517
|
sumstats[chrom] = sumstats[chrom].astype("string")
|
|
513
518
|
|
|
514
519
|
# check if CHR is numeric
|
|
515
520
|
is_chr_fixed = sumstats[chrom].str.isnumeric()
|
|
516
521
|
# fill NAs with False
|
|
517
522
|
is_chr_fixed = is_chr_fixed.fillna(False)
|
|
518
|
-
|
|
523
|
+
log.write(" -Variants with standardized chromosome notation:",sum(is_chr_fixed), verbose=verbose)
|
|
519
524
|
|
|
520
525
|
# if there are variants whose CHR need to be fixed
|
|
521
526
|
if sum(is_chr_fixed)<len(sumstats):
|
|
@@ -524,7 +529,7 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
|
|
|
524
529
|
chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'^(chr)?(\d{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
525
530
|
|
|
526
531
|
is_chr_fixable = ~chr_extracted.isna()
|
|
527
|
-
|
|
532
|
+
log.write(" -Variants with fixable chromosome notations:",sum(is_chr_fixable), verbose=verbose)
|
|
528
533
|
|
|
529
534
|
# For not fixed variants, check if na
|
|
530
535
|
is_chr_na = sumstats.loc[~is_chr_fixed, chrom].isna()
|
|
@@ -534,13 +539,13 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
|
|
|
534
539
|
# Check variants with CHR being not NA and not fixable
|
|
535
540
|
is_chr_invalid = (~is_chr_fixable)&(~is_chr_na)
|
|
536
541
|
if sum(is_chr_invalid)>0 and verbose:
|
|
537
|
-
log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid))
|
|
542
|
+
log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid), verbose=verbose)
|
|
538
543
|
try:
|
|
539
|
-
log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()))
|
|
544
|
+
log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()), verbose=verbose)
|
|
540
545
|
except:
|
|
541
546
|
pass
|
|
542
|
-
|
|
543
|
-
log.write(" -No unrecognized chromosome notations...")
|
|
547
|
+
else:
|
|
548
|
+
log.write(" -No unrecognized chromosome notations...", verbose=verbose)
|
|
544
549
|
|
|
545
550
|
# Assign good chr back to sumstats
|
|
546
551
|
sumstats.loc[is_chr_fixable.index,chrom] = chr_extracted[is_chr_fixable.index]
|
|
@@ -553,23 +558,23 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
|
|
|
553
558
|
|
|
554
559
|
# if sumstats contain sex CHR
|
|
555
560
|
if sum(sex_chr)>0:
|
|
556
|
-
|
|
557
|
-
|
|
561
|
+
log.write(" -Identifying non-autosomal chromosomes : {}, {}, and {} ...".format(x[0],y[0],mt[0]), verbose=verbose)
|
|
562
|
+
log.write(" -Identified ",str(sum(sex_chr))," variants on sex chromosomes...", verbose=verbose)
|
|
558
563
|
|
|
559
564
|
# convert "X, Y, MT" to numbers
|
|
560
565
|
convert_num_to_xymt={}
|
|
561
566
|
if x[0].lower() in sumstats[chrom].values or x[0].upper() in sumstats[chrom].values:
|
|
562
567
|
convert_num_to_xymt[x[0].lower()] = str(x[1])
|
|
563
568
|
convert_num_to_xymt[x[0].upper()] = str(x[1])
|
|
564
|
-
|
|
569
|
+
log.write(" -Standardizing sex chromosome notations: {} to {}...".format(x[0], x[1]), verbose=verbose)
|
|
565
570
|
if y[0].lower() in sumstats[chrom].values or y[0].upper() in sumstats[chrom].values:
|
|
566
571
|
convert_num_to_xymt[y[0].lower()] = str(y[1])
|
|
567
572
|
convert_num_to_xymt[y[0].upper()] = str(y[1])
|
|
568
|
-
|
|
573
|
+
log.write(" -Standardizing sex chromosome notations: {} to {}...".format(y[0], y[1]), verbose=verbose)
|
|
569
574
|
if mt[0].lower() in sumstats[chrom].values or mt[0].upper() in sumstats[chrom].values:
|
|
570
575
|
convert_num_to_xymt[mt[0].lower()] = str(mt[1])
|
|
571
576
|
convert_num_to_xymt[mt[0].upper()] = str(mt[1])
|
|
572
|
-
|
|
577
|
+
log.write(" -Standardizing sex chromosome notations: {} to {}...".format(mt[0], mt[1]), verbose=verbose)
|
|
573
578
|
sumstats.loc[sex_chr,chrom] =sumstats.loc[sex_chr,chrom].map(convert_num_to_xymt)
|
|
574
579
|
|
|
575
580
|
# change status code
|
|
@@ -584,19 +589,19 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
|
|
|
584
589
|
if (remove is True) and unrecognized_num>0:
|
|
585
590
|
# remove variants with unrecognized CHR
|
|
586
591
|
try:
|
|
587
|
-
|
|
592
|
+
log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])), verbose=verbose)
|
|
588
593
|
except:
|
|
589
594
|
pass
|
|
590
|
-
|
|
595
|
+
log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.", verbose=verbose)
|
|
591
596
|
try:
|
|
592
|
-
log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()))
|
|
597
|
+
log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()), verbose=verbose)
|
|
593
598
|
except:
|
|
594
599
|
pass
|
|
595
600
|
#sumstats = sumstats.loc[sumstats.index[sumstats[chrom].isin(chrom_list)],:]
|
|
596
601
|
good_chr = sumstats[chrom].isin(chrom_list)
|
|
597
602
|
sumstats = sumstats.loc[good_chr, :].copy()
|
|
598
603
|
else:
|
|
599
|
-
|
|
604
|
+
log.write(" -All CHR are already fixed...", verbose=verbose)
|
|
600
605
|
sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
|
|
601
606
|
|
|
602
607
|
# Convert string to int
|
|
@@ -610,8 +615,8 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
|
|
|
610
615
|
out_of_range_chr = sumstats[chrom] < minchr
|
|
611
616
|
out_of_range_chr = out_of_range_chr.fillna(False)
|
|
612
617
|
if sum(out_of_range_chr)>0:
|
|
613
|
-
|
|
614
|
-
|
|
618
|
+
log.write(" -Sanity check for CHR...", verbose=verbose)
|
|
619
|
+
log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr), verbose=verbose)
|
|
615
620
|
sumstats = sumstats.loc[~out_of_range_chr,:]
|
|
616
621
|
|
|
617
622
|
finished(log,verbose,_end_line)
|
|
@@ -649,17 +654,17 @@ def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True, lower_
|
|
|
649
654
|
if str(sumstats[pos].dtype) == "string" or str(sumstats[pos].dtype) == "object":
|
|
650
655
|
sumstats[pos] = sumstats[pos].astype('string')
|
|
651
656
|
# if so, remove thousands separator
|
|
652
|
-
|
|
657
|
+
log.write(' -Removing thousands separator "," or underbar "_" ...', verbose=verbose)
|
|
653
658
|
sumstats.loc[~is_pos_na, pos] = sumstats.loc[~is_pos_na, pos].str.replace(r'[,_]', '' ,regex=True)
|
|
654
659
|
except:
|
|
655
660
|
pass
|
|
656
661
|
|
|
657
662
|
# convert POS to integer
|
|
658
663
|
try:
|
|
659
|
-
|
|
664
|
+
log.write(' -Converting to Int64 data type ...', verbose=verbose)
|
|
660
665
|
sumstats[pos] = sumstats[pos].astype('Int64')
|
|
661
666
|
except:
|
|
662
|
-
|
|
667
|
+
log.write(' -Force converting to Int64 data type ...', verbose=verbose)
|
|
663
668
|
sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
|
|
664
669
|
is_pos_fixed = ~sumstats[pos].isna()
|
|
665
670
|
is_pos_invalid = (~is_pos_na)&(~is_pos_fixed)
|
|
@@ -668,16 +673,16 @@ def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True, lower_
|
|
|
668
673
|
sumstats.loc[is_pos_invalid,status] = vchange_status(sumstats.loc[is_pos_invalid,status],4,"975","842")
|
|
669
674
|
|
|
670
675
|
# remove outlier, limit:250,000,000
|
|
671
|
-
|
|
676
|
+
log.write(" -Position bound:({} , {:,})".format(lower_limit, upper_limit), verbose=verbose)
|
|
672
677
|
is_pos_na = sumstats[pos].isna()
|
|
673
678
|
out_lier= ((sumstats[pos]<=lower_limit) | (sumstats[pos]>=upper_limit)) & (~is_pos_na)
|
|
674
|
-
|
|
679
|
+
log.write(" -Removed outliers:",sum(out_lier), verbose=verbose)
|
|
675
680
|
sumstats = sumstats.loc[~out_lier,:]
|
|
676
681
|
#remove na
|
|
677
682
|
if remove is True:
|
|
678
683
|
sumstats = sumstats.loc[~sumstats[pos].isna(),:]
|
|
679
684
|
remain_var_num = len(sumstats)
|
|
680
|
-
|
|
685
|
+
log.write(" -Removed "+str(all_var_num - remain_var_num)+" variants with bad positions.", verbose=verbose)
|
|
681
686
|
|
|
682
687
|
finished(log,verbose,_end_line)
|
|
683
688
|
return sumstats
|
|
@@ -706,15 +711,15 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
|
|
|
706
711
|
# ea_missing = sum(sumstats[ea].isna())
|
|
707
712
|
# nea_missing = sum(sumstats[nea].isna())
|
|
708
713
|
# if sum(ea_missing)>0:
|
|
709
|
-
#
|
|
714
|
+
# log.write(" -Converting {} missing EA to letter N.".format(ea_missing))
|
|
710
715
|
# sumstats[ea] = sumstats[ea].add_categories("N").fillna("N")
|
|
711
716
|
# if sum(sumstats[nea].isna())>0:
|
|
712
|
-
#
|
|
717
|
+
# log.write(" -Converting {} missing NEA to letter N.".format(nea_missing))
|
|
713
718
|
# sumstats[nea] = sumstats[nea].add_categories("N").fillna("N")
|
|
714
719
|
#except:
|
|
715
720
|
# pass
|
|
716
721
|
|
|
717
|
-
|
|
722
|
+
log.write(" -Converted all bases to string datatype and UPPERCASE.", verbose=verbose)
|
|
718
723
|
categories = set(sumstats[ea].str.upper())|set(sumstats[nea].str.upper())|set("N")
|
|
719
724
|
categories = {x for x in categories if pd.notna(x)}
|
|
720
725
|
sumstats[ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
|
|
@@ -743,22 +748,21 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
|
|
|
743
748
|
|
|
744
749
|
exclude = bad_nea | bad_ea
|
|
745
750
|
|
|
746
|
-
if
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
log.write(" -A look at the non-ATCG NEA:",set(sumstats.loc[bad_nea,nea].head()),"...")
|
|
751
|
+
if len(set(sumstats.loc[bad_ea,ea].head())) >0:
|
|
752
|
+
log.write(" -A look at the non-ATCG EA:",set(sumstats.loc[bad_ea,ea].head()),"...", verbose=verbose)
|
|
753
|
+
if len(set(sumstats.loc[bad_nea,nea].head())) >0:
|
|
754
|
+
log.write(" -A look at the non-ATCG NEA:",set(sumstats.loc[bad_nea,nea].head()),"...", verbose=verbose)
|
|
751
755
|
|
|
752
756
|
if remove == True:
|
|
753
757
|
sumstats = sumstats.loc[(good_ea & good_nea),:].copy()
|
|
754
758
|
good_eanea_num = len(sumstats)
|
|
755
|
-
|
|
759
|
+
log.write(" -Removed "+str(all_var_num - good_eanea_num)+" variants with NA alleles or alleles that contain bases other than A/C/T/G.", verbose=verbose)
|
|
756
760
|
sumstats = sumstats.loc[(good_ea & good_nea & (~not_variant)),:].copy()
|
|
757
761
|
good_eanea_notsame_num = len(sumstats)
|
|
758
|
-
|
|
762
|
+
log.write(" -Removed "+str(good_eanea_num - good_eanea_notsame_num)+" variants with same allele for EA and NEA.", verbose=verbose)
|
|
759
763
|
else:
|
|
760
764
|
sumstats[[ea,nea]] = sumstats[[ea,nea]].fillna("N")
|
|
761
|
-
|
|
765
|
+
log.write(" -Detected "+str(sum(exclude))+" variants with alleles that contain bases other than A/C/T/G .", verbose=verbose)
|
|
762
766
|
categories = set(sumstats[ea].str.upper())|set(sumstats[nea].str.upper())|set("N")
|
|
763
767
|
sumstats[ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
|
|
764
768
|
sumstats[nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
|
|
@@ -811,8 +815,8 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
|
|
|
811
815
|
#r'\w\w\w\w[45]\w\w'
|
|
812
816
|
variants_to_check = sumstats[status].str[4].str.match(r'4|5', case=False, flags=0, na=False)
|
|
813
817
|
if sum(variants_to_check)==0:
|
|
814
|
-
|
|
815
|
-
|
|
818
|
+
log.write(" -No available variants to normalize..", verbose=verbose)
|
|
819
|
+
log.write("Finished normalizing variants successfully!", verbose=verbose)
|
|
816
820
|
return sumstats
|
|
817
821
|
###############################################################################################################
|
|
818
822
|
if sum(variants_to_check)>0:
|
|
@@ -827,29 +831,28 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
|
|
|
827
831
|
pool.join()
|
|
828
832
|
###############################################################################################################
|
|
829
833
|
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
if
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
before_normalize_id = sumstats.loc[variants_to_check,rsid]
|
|
838
|
-
else:
|
|
839
|
-
before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
|
|
840
|
-
|
|
841
|
-
log.write(" -Not normalized allele IDs:",end="")
|
|
842
|
-
for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
|
|
843
|
-
log.write(i,end=" ",show_time=False)
|
|
844
|
-
log.write("... \n",end="",show_time=False)
|
|
845
|
-
|
|
846
|
-
log.write(" -Not normalized allele:",end="")
|
|
847
|
-
for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
|
|
848
|
-
log.write(i,end="",show_time=False)
|
|
849
|
-
log.write("... \n",end="",show_time=False)
|
|
850
|
-
log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.")
|
|
834
|
+
before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
|
|
835
|
+
changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
|
|
836
|
+
if changed_num>0:
|
|
837
|
+
if snpid in sumstats.columns:
|
|
838
|
+
before_normalize_id = sumstats.loc[variants_to_check,snpid]
|
|
839
|
+
elif rsid in sumstats.columns:
|
|
840
|
+
before_normalize_id = sumstats.loc[variants_to_check,rsid]
|
|
851
841
|
else:
|
|
852
|
-
|
|
842
|
+
before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
|
|
843
|
+
|
|
844
|
+
log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
|
|
845
|
+
for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
|
|
846
|
+
log.write(i,end=" ",show_time=False)
|
|
847
|
+
log.write("... \n",end="",show_time=False, verbose=verbose)
|
|
848
|
+
|
|
849
|
+
log.write(" -Not normalized allele:",end="", verbose=verbose)
|
|
850
|
+
for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
|
|
851
|
+
log.write(i,end="",show_time=False, verbose=verbose)
|
|
852
|
+
log.write("... \n",end="",show_time=False, verbose=verbose)
|
|
853
|
+
log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
|
|
854
|
+
else:
|
|
855
|
+
log.write(" -All variants are already normalized..", verbose=verbose)
|
|
853
856
|
###################################################################################################################
|
|
854
857
|
categories = set(sumstats[ea])|set(sumstats[nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
|
|
855
858
|
sumstats[ea] = pd.Categorical(sumstats[ea],categories = categories)
|
|
@@ -931,13 +934,13 @@ def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, ver
|
|
|
931
934
|
if header in coltocheck and header in sumstats.columns:
|
|
932
935
|
cols_to_check.append(header)
|
|
933
936
|
if header=="STATUS":
|
|
934
|
-
|
|
937
|
+
log.write(" -Checking STATUS and converting STATUS to categories....", verbose=verbose)
|
|
935
938
|
categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
936
939
|
sumstats[header] = pd.Categorical(sumstats[header],categories=categories)
|
|
937
940
|
return sumstats
|
|
938
941
|
|
|
939
942
|
if dtype in ["Int64","Int32","int","int32","in64"]:
|
|
940
|
-
|
|
943
|
+
log.write(" -Checking if {} <= {} <= {} ...".format( var_range[0] ,header, var_range[1]), verbose=verbose)
|
|
941
944
|
sumstats[header] = np.floor(pd.to_numeric(sumstats[header], errors='coerce')).astype(dtype)
|
|
942
945
|
|
|
943
946
|
elif dtype in ["Float64","Float32","float","float64","float32"]:
|
|
@@ -1033,7 +1036,7 @@ def sanitycheckstats(sumstats,
|
|
|
1033
1036
|
if is_enough_info == False: return sumstats
|
|
1034
1037
|
############################################################################################
|
|
1035
1038
|
|
|
1036
|
-
|
|
1039
|
+
log.write(" -Comparison tolerance for floats: {}".format(float_tolerence), verbose=verbose)
|
|
1037
1040
|
eaf = add_tolerence(eaf, float_tolerence, "lr")
|
|
1038
1041
|
maf = add_tolerence(maf, float_tolerence, "lr")
|
|
1039
1042
|
beta = add_tolerence(beta, float_tolerence, "lr")
|
|
@@ -1166,7 +1169,7 @@ def _check_data_consistency(sumstats, beta="BETA", se="SE", p="P",mlog10p="MLOG1
|
|
|
1166
1169
|
check_status=1
|
|
1167
1170
|
|
|
1168
1171
|
if "N" in sumstats.columns and "N_CONTROL" in sumstats.columns and "N_CASE" in sumstats.columns:
|
|
1169
|
-
|
|
1172
|
+
log.write(" -Checking if N is consistent with N_CASE + N_CONTROL ...", verbose=verbose)
|
|
1170
1173
|
is_close = sumstats["N"] == sumstats["N_CASE"] + sumstats["N_CONTROL"]
|
|
1171
1174
|
#is_close = np.isclose(sumstats["N"], sumstats["N_CASE"] + sumstats["N_CONTROL"] , rtol=rtol, atol=atol, equal_nan=equal_nan)
|
|
1172
1175
|
diff = abs(sumstats["N"] - (sumstats["N_CASE"] + sumstats["N_CONTROL"] ))
|
|
@@ -1208,55 +1211,55 @@ def flip_direction(string):
|
|
|
1208
1211
|
|
|
1209
1212
|
def flip_by_swap(sumstats, matched_index, log, verbose):
    """Exchange the EA and NEA values on the rows selected by ``matched_index``.

    The swap only happens when both ``"NEA"`` and ``"EA"`` are columns of
    ``sumstats``; otherwise the frame is returned untouched.  The frame is
    modified in place and is also returned.

    Parameters
    ----------
    sumstats : DataFrame holding the summary statistics.
    matched_index : row selector passed to ``.loc`` (rows to swap).
    log : object exposing ``write(..., verbose=...)``.
    verbose : forwarded to ``log.write``.
    """
    present = sumstats.columns
    if "NEA" not in present or "EA" not in present:
        return sumstats

    log.write(" -Swapping column: NEA <=> EA...", verbose=verbose)
    # Read (EA, NEA) as a plain array so the assignment is positional and
    # not re-aligned on column labels.
    ea_then_nea = sumstats.loc[matched_index, ['EA', 'NEA']].values
    sumstats.loc[matched_index, ['NEA', 'EA']] = ea_then_nea
    return sumstats
|
|
1214
1217
|
|
|
1215
1218
|
def flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Invert ratio statistics (OR / HR and their 95% bounds) on selected rows.

    For each of ``OR`` and ``HR`` the point estimate becomes
    ``factor / value``.  The confidence bounds are inverted *and* exchanged:
    the new upper bound is ``factor / old lower`` and the new lower bound is
    ``factor / old upper``.  Both replacement arrays are computed from the
    original values before anything is written; assigning them one after the
    other (as was done previously) made the second assignment read the
    already-overwritten column, so the lower bound was never actually changed.

    Parameters
    ----------
    sumstats : DataFrame holding the summary statistics; modified in place.
    matched_index : row selector passed to ``.loc`` (rows to flip).
    log : object exposing ``write(..., verbose=...)``.
    verbose : forwarded to ``log.write``.
    cols : accepted for signature compatibility; not used by this function.
    factor : numerator of the inversion (default 1).

    Returns
    -------
    The same ``sumstats`` object.
    """
    for stat in ("OR", "HR"):
        lower = stat + "_95L"
        upper = stat + "_95U"

        if stat in sumstats.columns:
            log.write(" -Flipping column: {0} = 1 / {0}...".format(stat), verbose=verbose)
            sumstats.loc[matched_index, stat] = factor / sumstats.loc[matched_index, stat].values

        # Decide what exists and compute the replacements up front, from the
        # untouched columns, so the two writes below cannot feed each other.
        has_lower = lower in sumstats.columns
        has_upper = upper in sumstats.columns
        new_upper = factor / sumstats.loc[matched_index, lower].values if has_lower else None
        new_lower = factor / sumstats.loc[matched_index, upper].values if has_upper else None

        if has_lower:
            log.write(" -Flipping column: {0} = 1 / {1}...".format(upper, lower), verbose=verbose)
            sumstats.loc[matched_index, upper] = new_upper
        if has_upper:
            log.write(" -Flipping column: {0} = 1 / {1}...".format(lower, upper), verbose=verbose)
            sumstats.loc[matched_index, lower] = new_lower
    return sumstats
|
|
1235
1238
|
|
|
1236
1239
|
def flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Replace EAF by ``factor - EAF`` on the rows selected by ``matched_index``.

    Does nothing when ``sumstats`` has no ``"EAF"`` column.  The frame is
    modified in place and is also returned.

    Parameters
    ----------
    sumstats : DataFrame holding the summary statistics.
    matched_index : row selector passed to ``.loc`` (rows to flip).
    log : object exposing ``write(..., verbose=...)``.
    verbose : forwarded to ``log.write``.
    cols : accepted for signature compatibility; not used by this function.
    factor : value the frequency is subtracted from (default 1).
    """
    if "EAF" not in sumstats.columns:
        return sumstats

    log.write(" -Flipping column: EAF = 1 - EAF...", verbose=verbose)
    current_eaf = sumstats.loc[matched_index, "EAF"].values
    sumstats.loc[matched_index, "EAF"] = factor - current_eaf
    return sumstats
|
|
1241
1244
|
|
|
1242
1245
|
def flip_by_sign(sumstats, matched_index, log, verbose, cols=None):
    """Negate signed effect statistics on the rows selected by ``matched_index``.

    Handled columns (each only if present): ``BETA``, ``BETA_95L`` /
    ``BETA_95U``, ``Z``, ``T`` and ``DIRECTION``.

    * ``BETA``, ``Z`` and ``T`` are each replaced by their own negation.
      (``T`` used to be written into the ``Z`` column, which clobbered Z and
      left T unflipped.)
    * The BETA confidence bounds are negated and exchanged: new upper is
      ``-old lower`` and new lower is ``-old upper``.  Both replacements are
      computed from the original values before either column is written, so
      the second write cannot read the result of the first.
    * ``DIRECTION`` is mapped element-wise through ``flip_direction``.

    Parameters
    ----------
    sumstats : DataFrame holding the summary statistics; modified in place.
    matched_index : row selector passed to ``.loc`` (rows to flip).
    log : object exposing ``write(..., verbose=...)``.
    verbose : forwarded to ``log.write``.
    cols : accepted for signature compatibility; not used by this function.

    Returns
    -------
    The same ``sumstats`` object.
    """
    if "BETA" in sumstats.columns:
        log.write(" -Flipping column: BETA = - BETA...", verbose=verbose)
        sumstats.loc[matched_index, "BETA"] = - sumstats.loc[matched_index, "BETA"].values

    # Snapshot the negated bounds before writing either of them.
    has_lower = "BETA_95L" in sumstats.columns
    has_upper = "BETA_95U" in sumstats.columns
    new_upper = - sumstats.loc[matched_index, "BETA_95L"].values if has_lower else None
    new_lower = - sumstats.loc[matched_index, "BETA_95U"].values if has_upper else None
    if has_lower:
        log.write(" -Flipping column: BETA_95U = - BETA_95L...", verbose=verbose)
        sumstats.loc[matched_index, "BETA_95U"] = new_upper
    if has_upper:
        log.write(" -Flipping column: BETA_95L = - BETA_95U...", verbose=verbose)
        sumstats.loc[matched_index, "BETA_95L"] = new_lower

    if "Z" in sumstats.columns:
        log.write(" -Flipping column: Z = - Z...", verbose=verbose)
        sumstats.loc[matched_index, "Z"] = - sumstats.loc[matched_index, "Z"].values
    if "T" in sumstats.columns:
        log.write(" -Flipping column: T = - T...", verbose=verbose)
        # Write back to T itself, not to Z.
        sumstats.loc[matched_index, "T"] = - sumstats.loc[matched_index, "T"].values
    if "DIRECTION" in sumstats.columns:
        log.write(" -Flipping column: DIRECTION +-?0 <=> -+?0 ...", verbose=verbose)
        sumstats.loc[matched_index, "DIRECTION"] = sumstats.loc[matched_index, "DIRECTION"].apply(flip_direction)
    return sumstats
|
|
1262
1265
|
|
|
@@ -1265,7 +1268,7 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
|
1265
1268
|
_start_line = "adjust statistics based on STATUS code"
|
|
1266
1269
|
_end_line = "adjusting statistics based on STATUS code"
|
|
1267
1270
|
_start_cols =[]
|
|
1268
|
-
_start_function = ".
|
|
1271
|
+
_start_function = ".flip_allele_stats()"
|
|
1269
1272
|
_must_args ={}
|
|
1270
1273
|
|
|
1271
1274
|
is_enough_info = start_to(sumstats=sumstats,
|
|
@@ -1285,10 +1288,10 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
|
1285
1288
|
#matched_index = status_match(sumstats[status],6,[4,5]) #
|
|
1286
1289
|
matched_index = sumstats[status].str[5].str.match(r"4|5")
|
|
1287
1290
|
if sum(matched_index)>0:
|
|
1288
|
-
|
|
1289
|
-
|
|
1291
|
+
log.write("Start to convert alleles to reverse complement for SNPs with status xxxxx[45]x...{}".format(_get_version()), verbose=verbose)
|
|
1292
|
+
log.write(" -Flipping "+ str(sum(matched_index)) +" variants...", verbose=verbose)
|
|
1290
1293
|
if ("NEA" in sumstats.columns) and ("EA" in sumstats.columns) :
|
|
1291
|
-
|
|
1294
|
+
log.write(" -Converting to reverse complement : EA and NEA...", verbose=verbose)
|
|
1292
1295
|
reverse_complement_nea = sumstats.loc[matched_index,'NEA'].apply(lambda x :get_reverse_complementary_allele(x))
|
|
1293
1296
|
reverse_complement_ea = sumstats.loc[matched_index,'EA'].apply(lambda x :get_reverse_complementary_allele(x))
|
|
1294
1297
|
categories = set(sumstats['EA'])|set(sumstats['NEA']) |set(reverse_complement_ea) |set(reverse_complement_nea)
|
|
@@ -1297,15 +1300,15 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
|
1297
1300
|
sumstats.loc[matched_index,['NEA']] = reverse_complement_nea
|
|
1298
1301
|
sumstats.loc[matched_index,['EA']] = reverse_complement_ea
|
|
1299
1302
|
sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 6, "4","2")
|
|
1300
|
-
|
|
1303
|
+
log.write(" -Changed the status for flipped variants : xxxxx4x -> xxxxx2x", verbose=verbose)
|
|
1301
1304
|
if_stats_flipped = True
|
|
1302
1305
|
###################flip ref####################
|
|
1303
1306
|
pattern = r"\w\w\w\w\w[35]\w"
|
|
1304
1307
|
#matched_index = status_match(sumstats[status],6,[3,5]) #sumstats[status].str.match(pattern)
|
|
1305
1308
|
matched_index = sumstats[status].str[5].str.match(r"3|5")
|
|
1306
1309
|
if sum(matched_index)>0:
|
|
1307
|
-
|
|
1308
|
-
|
|
1310
|
+
log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x: ALT->EA , REF->NEA ...{}".format(_get_version()), verbose=verbose)
|
|
1311
|
+
log.write(" -Flipping "+ str(sum(matched_index)) +" variants...", verbose=verbose)
|
|
1309
1312
|
|
|
1310
1313
|
flip_by_swap(sumstats, matched_index, log, verbose)
|
|
1311
1314
|
flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
|
|
@@ -1313,7 +1316,7 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
|
1313
1316
|
flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1314
1317
|
|
|
1315
1318
|
#change status
|
|
1316
|
-
|
|
1319
|
+
log.write(" -Changed the status for flipped variants : xxxxx[35]x -> xxxxx[12]x", verbose=verbose)
|
|
1317
1320
|
sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 6, "35","12")
|
|
1318
1321
|
if_stats_flipped = True
|
|
1319
1322
|
|
|
@@ -1322,8 +1325,8 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
|
1322
1325
|
#matched_index = status_match(sumstats[status],6,[1,2,3])|status_match(sumstats[status],6,[6,7])|status_match(sumstats[status],7,6) #sumstats[status].str.match(pattern)
|
|
1323
1326
|
matched_index = sumstats[status].str[4:].str.match(r"[123][67]6")
|
|
1324
1327
|
if sum(matched_index)>0:
|
|
1325
|
-
|
|
1326
|
-
|
|
1328
|
+
log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]: ALT->EA , REF->NEA...{}".format(_get_version()), verbose=verbose)
|
|
1329
|
+
log.write(" -Flipping "+ str(sum(matched_index)) +" variants...", verbose=verbose)
|
|
1327
1330
|
|
|
1328
1331
|
flip_by_swap(sumstats, matched_index, log, verbose)
|
|
1329
1332
|
flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
|
|
@@ -1331,7 +1334,7 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
|
1331
1334
|
flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1332
1335
|
|
|
1333
1336
|
#change status
|
|
1334
|
-
|
|
1337
|
+
log.write(" -Changed the status for flipped variants xxxx[123][67]6 -> xxxx[123][67]4", verbose=verbose)
|
|
1335
1338
|
sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 7, "6","4")
|
|
1336
1339
|
if_stats_flipped = True
|
|
1337
1340
|
# flip ref
|
|
@@ -1340,24 +1343,23 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
|
1340
1343
|
#matched_index = status_match(sumstats[status],6,[0,1,2]) | status_match(sumstats[status],7,[5])#sumstats[status].str.match(pattern)
|
|
1341
1344
|
matched_index = sumstats[status].str[5:].str.match(r"05|15|25")
|
|
1342
1345
|
if sum(matched_index)>0:
|
|
1343
|
-
|
|
1344
|
-
|
|
1346
|
+
log.write("Start to flip allele-specific stats for palindromic SNPs with status xxxxx[12]5: (-)strand <=> (+)strand...{}".format(_get_version()), verbose=verbose)
|
|
1347
|
+
log.write(" -Flipping "+ str(sum(matched_index)) +" variants...", verbose=verbose)
|
|
1345
1348
|
|
|
1346
1349
|
flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
|
|
1347
1350
|
flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1348
1351
|
flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1349
1352
|
|
|
1350
1353
|
#change status
|
|
1351
|
-
|
|
1354
|
+
log.write(" -Changed the status for flipped variants: xxxxx[012]5: -> xxxxx[012]2", verbose=verbose)
|
|
1352
1355
|
sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 7, "5","2")
|
|
1353
1356
|
if_stats_flipped = True
|
|
1354
1357
|
|
|
1355
|
-
if if_stats_flipped
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1358
|
+
if if_stats_flipped != True:
|
|
1359
|
+
log.write(" -No statistics have been changed.")
|
|
1360
|
+
|
|
1361
|
+
finished(log, verbose, _end_line)
|
|
1359
1362
|
return sumstats
|
|
1360
|
-
""
|
|
1361
1363
|
|
|
1362
1364
|
|
|
1363
1365
|
###############################################################################################################
|
|
@@ -1414,12 +1416,12 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
|
|
|
1414
1416
|
if is_enough_info == False: return sumstats
|
|
1415
1417
|
############################################################################################
|
|
1416
1418
|
|
|
1417
|
-
|
|
1419
|
+
log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build, verbose=verbose)
|
|
1418
1420
|
# valid chr and pos
|
|
1419
1421
|
pattern = r"\w\w\w0\w\w\w"
|
|
1420
1422
|
to_lift = sumstats[status].str.match(pattern)
|
|
1421
1423
|
sumstats = sumstats.loc[to_lift,:].copy()
|
|
1422
|
-
|
|
1424
|
+
log.write(" -Converting variants with status code xxx0xxx :"+str(len(sumstats))+"...", verbose=verbose)
|
|
1423
1425
|
###########################################################################
|
|
1424
1426
|
if sum(to_lift)>0:
|
|
1425
1427
|
if sum(to_lift)<10000:
|
|
@@ -1438,7 +1440,7 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
|
|
|
1438
1440
|
unmap_num = len(sumstats.loc[sumstats[pos].isna(),:])
|
|
1439
1441
|
|
|
1440
1442
|
if remove is True:
|
|
1441
|
-
|
|
1443
|
+
log.write(" -Removed unmapped variants: "+str(unmap_num), verbose=verbose)
|
|
1442
1444
|
sumstats = sumstats.loc[~sumstats[pos].isna(),:]
|
|
1443
1445
|
|
|
1444
1446
|
# after liftover check chr and pos
|
|
@@ -1473,7 +1475,7 @@ def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=
|
|
|
1473
1475
|
if sumstats[pos].dtype == "Int64":
|
|
1474
1476
|
pass
|
|
1475
1477
|
else:
|
|
1476
|
-
|
|
1478
|
+
log.write(" -Force converting POS to Int64...", verbose=verbose)
|
|
1477
1479
|
sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
|
|
1478
1480
|
except:
|
|
1479
1481
|
pass
|
|
@@ -1511,7 +1513,7 @@ def sortcolumn(sumstats,verbose=True,log=Log(),order = None):
|
|
|
1511
1513
|
if i in sumstats.columns: output_columns.append(i)
|
|
1512
1514
|
for i in sumstats.columns:
|
|
1513
1515
|
if i not in order: output_columns.append(i)
|
|
1514
|
-
|
|
1516
|
+
log.write(" -Reordering columns to :", ",".join(output_columns), verbose=verbose)
|
|
1515
1517
|
sumstats = sumstats[ output_columns]
|
|
1516
1518
|
|
|
1517
1519
|
finished(log,verbose,_end_line)
|