gwaslab 3.4.37__py3-none-any.whl → 3.4.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +8 -0
- gwaslab/g_Sumstats.py +26 -147
- gwaslab/g_SumstatsPair.py +6 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +29 -15
- gwaslab/hm_harmonize_sumstats.py +291 -163
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +43 -37
- gwaslab/io_to_formats.py +428 -295
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +793 -682
- gwaslab/util_ex_calculate_ldmatrix.py +29 -11
- gwaslab/util_ex_gwascatalog.py +1 -1
- gwaslab/util_ex_ldproxyfinder.py +1 -1
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_convert_h2.py +1 -1
- gwaslab/util_in_fill_data.py +2 -2
- gwaslab/util_in_filter_value.py +122 -34
- gwaslab/util_in_get_density.py +2 -2
- gwaslab/util_in_get_sig.py +41 -9
- gwaslab/viz_aux_quickfix.py +24 -19
- gwaslab/viz_aux_reposition_text.py +7 -4
- gwaslab/viz_aux_save_figure.py +6 -5
- gwaslab/viz_plot_compare_af.py +5 -5
- gwaslab/viz_plot_miamiplot2.py +28 -20
- gwaslab/viz_plot_mqqplot.py +109 -72
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +3 -1
- gwaslab/viz_plot_trumpetplot.py +15 -6
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/METADATA +2 -2
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/RECORD +37 -37
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
gwaslab/qc_fix_sumstats.py
CHANGED
|
@@ -47,14 +47,14 @@ def _process_build(build,log,verbose):
|
|
|
47
47
|
log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
|
|
48
48
|
final_build = "38"
|
|
49
49
|
else:
|
|
50
|
-
log.
|
|
50
|
+
log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
|
|
51
51
|
final_build = "99"
|
|
52
52
|
return final_build
|
|
53
53
|
|
|
54
54
|
def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
|
|
55
55
|
build = _process_build(build,log=log,verbose=verbose)
|
|
56
|
-
sumstats
|
|
57
|
-
sumstats
|
|
56
|
+
sumstats[status] = vchange_status(sumstats[status], 1, "139",build[0]*3)
|
|
57
|
+
sumstats[status] = vchange_status(sumstats[status], 2, "89",build[1]*3)
|
|
58
58
|
return sumstats, build
|
|
59
59
|
|
|
60
60
|
def fixID(sumstats,
|
|
@@ -66,35 +66,49 @@ def fixID(sumstats,
|
|
|
66
66
|
2. fix chr and pos using snpid
|
|
67
67
|
3. checking rsid and chr:pos:nea:ea
|
|
68
68
|
'''
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
69
|
+
##start function with col checking##########################################################
|
|
70
|
+
_start_line = "check SNPID/rsID"
|
|
71
|
+
_end_line = "checking SNPID/rsID"
|
|
72
|
+
_start_cols =[]
|
|
73
|
+
_start_function = ".fix_id()"
|
|
74
|
+
_must_args ={}
|
|
75
|
+
|
|
76
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
77
|
+
log=log,
|
|
78
|
+
verbose=verbose,
|
|
79
|
+
start_line=_start_line,
|
|
80
|
+
end_line=_end_line,
|
|
81
|
+
start_cols=_start_cols,
|
|
82
|
+
start_function=_start_function,
|
|
83
|
+
**_must_args)
|
|
84
|
+
if is_enough_info == False: return sumstats
|
|
85
|
+
############################################################################################
|
|
72
86
|
|
|
73
87
|
############################ checking datatype ###################################################
|
|
74
88
|
if rsid in sumstats.columns:
|
|
75
89
|
# convert to string datatype
|
|
76
90
|
try:
|
|
77
91
|
log.write(" -Checking rsID data type...",verbose=verbose)
|
|
78
|
-
if sumstats
|
|
92
|
+
if sumstats[rsid].dtype == "string":
|
|
79
93
|
pass
|
|
80
94
|
else:
|
|
81
95
|
log.write(" -Converting rsID to pd.string data type...",verbose=verbose)
|
|
82
|
-
sumstats
|
|
96
|
+
sumstats[rsid] = sumstats[rsid].astype("string")
|
|
83
97
|
except:
|
|
84
98
|
log.write(" -Force converting rsID to pd.string data type...",verbose=verbose)
|
|
85
|
-
sumstats
|
|
99
|
+
sumstats[rsid] = sumstats[rsid].astype("string")
|
|
86
100
|
if snpid in sumstats.columns:
|
|
87
101
|
# convert to string datatype
|
|
88
102
|
try:
|
|
89
103
|
log.write(" -Checking SNPID data type...",verbose=verbose)
|
|
90
|
-
if sumstats
|
|
104
|
+
if sumstats[snpid].dtype == "string":
|
|
91
105
|
pass
|
|
92
106
|
else:
|
|
93
107
|
log.write(" -Converting SNPID to pd.string data type...",verbose=verbose)
|
|
94
|
-
sumstats
|
|
108
|
+
sumstats[snpid] = sumstats[snpid].astype("string")
|
|
95
109
|
except:
|
|
96
110
|
log.write(" -Force converting SNPID to pd.string data type...",verbose=verbose)
|
|
97
|
-
sumstats
|
|
111
|
+
sumstats[snpid] = sumstats[snpid].astype("string")
|
|
98
112
|
|
|
99
113
|
############################ checking ###################################################
|
|
100
114
|
if snpid in sumstats.columns:
|
|
@@ -141,7 +155,7 @@ def fixID(sumstats,
|
|
|
141
155
|
|
|
142
156
|
elif (chrom not in sumstats.columns) and (pos in sumstats.columns):
|
|
143
157
|
if verbose: log.write(" -Initiating CHR columns...")
|
|
144
|
-
sumstats
|
|
158
|
+
sumstats[chrom]=pd.Series(dtype="string")
|
|
145
159
|
to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
146
160
|
to_fix_num = sum(to_fix)
|
|
147
161
|
if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
@@ -149,7 +163,7 @@ def fixID(sumstats,
|
|
|
149
163
|
|
|
150
164
|
elif (chrom in sumstats.columns) and (pos not in sumstats.columns):
|
|
151
165
|
if verbose: log.write(" -Initiating CHR and POS column...")
|
|
152
|
-
sumstats
|
|
166
|
+
sumstats[pos]=pd.Series(dtype="Int64")
|
|
153
167
|
to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
154
168
|
to_fix_num = sum(to_fix)
|
|
155
169
|
if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
@@ -157,8 +171,8 @@ def fixID(sumstats,
|
|
|
157
171
|
|
|
158
172
|
else:
|
|
159
173
|
if verbose: log.write(" -Initiating CHR and POS columns...")
|
|
160
|
-
sumstats
|
|
161
|
-
sumstats
|
|
174
|
+
sumstats[chrom]=pd.Series(dtype="string")
|
|
175
|
+
sumstats[pos]=pd.Series(dtype="Int64")
|
|
162
176
|
to_fix = is_chrposrefalt
|
|
163
177
|
to_fix_num = sum(to_fix)
|
|
164
178
|
if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
@@ -187,20 +201,20 @@ def fixID(sumstats,
|
|
|
187
201
|
elif verbose: log.write(" -No fixable variants ...")
|
|
188
202
|
elif (chrom not in sumstats.columns) and (pos in sumstats.columns):
|
|
189
203
|
if verbose: log.write(" -Initiating CHR columns...")
|
|
190
|
-
sumstats
|
|
204
|
+
sumstats[chrom]=pd.Series(dtype="string")
|
|
191
205
|
to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
192
206
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
193
207
|
elif verbose: log.write(" -No fixable variants ...")
|
|
194
208
|
elif (chrom in sumstats.columns) and (pos not in sumstats.columns):
|
|
195
209
|
if verbose: log.write(" -Initiating CHR and POS column...")
|
|
196
|
-
sumstats
|
|
210
|
+
sumstats[pos]=pd.Series(dtype="Int64")
|
|
197
211
|
to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
198
212
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
199
213
|
elif verbose: log.write(" -No fixable variants ...")
|
|
200
214
|
else:
|
|
201
215
|
if verbose: log.write(" -Initiating CHR and POS columns...")
|
|
202
|
-
sumstats
|
|
203
|
-
sumstats
|
|
216
|
+
sumstats[chrom]=pd.Series(dtype="string")
|
|
217
|
+
sumstats[pos]=pd.Series(dtype="Int64")
|
|
204
218
|
to_fix = is_rs_chrpos
|
|
205
219
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
206
220
|
elif verbose: log.write(" -No fixable variants ...")
|
|
@@ -214,7 +228,7 @@ def fixID(sumstats,
|
|
|
214
228
|
|
|
215
229
|
############################ fixing chr pos###################################################
|
|
216
230
|
if fixeanea == True:
|
|
217
|
-
if verbose: log.
|
|
231
|
+
if verbose: log.warning("gwaslab assumes SNPID is in the format of CHR:POS:NEA:EA / CHR:POS:REF:ALT")
|
|
218
232
|
if overwrite is True:
|
|
219
233
|
if verbose: log.write(" -Overwrite mode is applied...")
|
|
220
234
|
to_fix = is_chrposrefalt
|
|
@@ -223,12 +237,12 @@ def fixID(sumstats,
|
|
|
223
237
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
224
238
|
elif (nea in sumstats.columns) and (ea not in sumstats.columns):
|
|
225
239
|
if verbose: log.write(" -Initiating EA columns...")
|
|
226
|
-
sumstats
|
|
240
|
+
sumstats[ea]=pd.Series(dtype="string")
|
|
227
241
|
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
228
242
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
229
243
|
elif (nea not in sumstats.columns) and (ea in sumstats.columns):
|
|
230
244
|
if verbose: log.write(" -Initiating NEA columns...")
|
|
231
|
-
sumstats
|
|
245
|
+
sumstats[nea]=pd.Series(dtype="string")
|
|
232
246
|
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
233
247
|
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
234
248
|
else:
|
|
@@ -260,21 +274,21 @@ def fixID(sumstats,
|
|
|
260
274
|
if fixsep == True:
|
|
261
275
|
if snpid in sumstats.columns:
|
|
262
276
|
if verbose: log.write(' -Replacing [_-] in SNPID with ":" ...')
|
|
263
|
-
sumstats
|
|
277
|
+
sumstats[snpid] = sumstats[snpid].str.replace(r"[_-]",":",regex=True)
|
|
264
278
|
|
|
265
279
|
if fixprefix == True:
|
|
266
280
|
if snpid in sumstats.columns:
|
|
267
281
|
if verbose: log.write(' -Removing /^chr/ in SNPID ...')
|
|
268
|
-
prefix_removed = sumstats
|
|
282
|
+
prefix_removed = sumstats[snpid].str.extract(r'^(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
269
283
|
sumstats.loc[~prefix_removed.isna(),snpid] = prefix_removed[~prefix_removed.isna()]
|
|
270
284
|
|
|
271
285
|
if fixid == True:
|
|
272
286
|
if snpid not in sumstats.columns:
|
|
273
287
|
# initiate a SNPID column
|
|
274
|
-
sumstats
|
|
288
|
+
sumstats[snpid]=pd.Series(dtype="string")
|
|
275
289
|
|
|
276
290
|
if (rsid in sumstats.columns) and (sum(is_rs_chrpos)>0) :
|
|
277
|
-
sumstats
|
|
291
|
+
sumstats[snpid]= sumstats.loc[is_rs_chrpos,rsid]
|
|
278
292
|
|
|
279
293
|
if (chrom in sumstats.columns) and (pos in sumstats.columns):
|
|
280
294
|
#only fix when CHR and POS is available
|
|
@@ -329,7 +343,8 @@ def fixID(sumstats,
|
|
|
329
343
|
after_number=sum(sumstats[snpid].isna())
|
|
330
344
|
if verbose: log.write(" -Fixed "+ str(pre_number - after_number) +" variants ID...")
|
|
331
345
|
elif verbose: log.write(" -ID unfixable: no CHR and POS columns or no SNPID. ")
|
|
332
|
-
|
|
346
|
+
|
|
347
|
+
finished(log,verbose,_end_line)
|
|
333
348
|
return sumstats
|
|
334
349
|
|
|
335
350
|
""
|
|
@@ -344,8 +359,25 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
344
359
|
remove duplicate SNPs based on 3. rsID
|
|
345
360
|
remove multiallelic SNPs based on 4. CHR, POS
|
|
346
361
|
'''
|
|
347
|
-
|
|
348
|
-
|
|
362
|
+
|
|
363
|
+
##start function with col checking##########################################################
|
|
364
|
+
_start_line = "remove duplicated/multiallelic variants"
|
|
365
|
+
_end_line = "removing duplicated/multiallelic variants"
|
|
366
|
+
_start_cols =[]
|
|
367
|
+
_start_function = ".remove_dup()"
|
|
368
|
+
_must_args ={}
|
|
369
|
+
|
|
370
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
371
|
+
log=log,
|
|
372
|
+
verbose=verbose,
|
|
373
|
+
start_line=_start_line,
|
|
374
|
+
end_line=_end_line,
|
|
375
|
+
start_cols=_start_cols,
|
|
376
|
+
start_function=_start_function,
|
|
377
|
+
**_must_args)
|
|
378
|
+
if is_enough_info == False: return sumstats
|
|
379
|
+
############################################################################################
|
|
380
|
+
|
|
349
381
|
if verbose: log.write(" -Removing mode:{}".format(mode))
|
|
350
382
|
# sort the variants using the specified column before removing
|
|
351
383
|
if keep_col is not None :
|
|
@@ -397,7 +429,7 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
397
429
|
if verbose: log.write("Start to remove multiallelic variants based on chr:pos...")
|
|
398
430
|
check_dataframe_shape(sumstats, log, verbose)
|
|
399
431
|
if verbose: log.write(" -Which variant to keep: ", keep )
|
|
400
|
-
sumstats = sumstats.loc[(~sumstats
|
|
432
|
+
sumstats = sumstats.loc[(~sumstats[[chrom,pos]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos], keep=keep)),:]
|
|
401
433
|
after_number=len(sumstats)
|
|
402
434
|
if verbose: log.write(" -Removed ",pre_number -after_number," multiallelic variants...")
|
|
403
435
|
after_number=len(sumstats)
|
|
@@ -435,300 +467,346 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
435
467
|
sumstats = sumstats.loc[~sumstats[specified_columns].isna().any(axis=1),:]
|
|
436
468
|
after_number=len(sumstats)
|
|
437
469
|
if verbose: log.write(" -Removed ",pre_number -after_number," variants with NA values in {} .".format(set(specified_columns)))
|
|
438
|
-
|
|
470
|
+
|
|
471
|
+
finished(log,verbose,_end_line)
|
|
439
472
|
return sumstats
|
|
440
473
|
|
|
441
474
|
###############################################################################################################
|
|
442
475
|
# 20230128
|
|
443
476
|
def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",24),mt=("MT",25), remove=False, verbose=True, chrom_list = None, minchr=1,log=Log()):
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
if verbose: log.write("Start to fix chromosome notation...{}".format(_get_version()))
|
|
451
|
-
check_dataframe_shape(sumstats, log, verbose)
|
|
452
|
-
|
|
453
|
-
# convert to string datatype
|
|
454
|
-
try:
|
|
455
|
-
if verbose: log.write(" -Checking CHR data type...")
|
|
456
|
-
if sumstats.loc[:,chrom].dtype == "string":
|
|
457
|
-
pass
|
|
458
|
-
else:
|
|
459
|
-
sumstats.loc[:,chrom] = sumstats.loc[:,chrom].astype("string")
|
|
460
|
-
except:
|
|
461
|
-
if verbose: log.write(" -Force converting to pd string data type...")
|
|
462
|
-
sumstats.loc[:,chrom] = sumstats.loc[:,chrom].astype("string")
|
|
463
|
-
|
|
464
|
-
# check if CHR is numeric
|
|
465
|
-
is_chr_fixed = sumstats[chrom].str.isnumeric()
|
|
466
|
-
# fill NAs with False
|
|
467
|
-
is_chr_fixed = is_chr_fixed.fillna(False)
|
|
468
|
-
if verbose: log.write(" -Variants with standardized chromosome notation:",sum(is_chr_fixed))
|
|
469
|
-
|
|
470
|
-
# if there are variants whose CHR need to be fixed
|
|
471
|
-
if sum(is_chr_fixed)<len(sumstats):
|
|
472
|
-
|
|
473
|
-
#extract the CHR number or X Y M MT
|
|
474
|
-
chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'^(chr)?(\d{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
477
|
+
##start function with col checking##########################################################
|
|
478
|
+
_start_line = "fix chromosome notation (CHR)"
|
|
479
|
+
_end_line = "fixing chromosome notation (CHR)"
|
|
480
|
+
_start_cols =[chrom,status]
|
|
481
|
+
_start_function = ".fix_chr()"
|
|
482
|
+
_must_args ={}
|
|
475
483
|
|
|
476
|
-
|
|
477
|
-
|
|
484
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
485
|
+
log=log,
|
|
486
|
+
verbose=verbose,
|
|
487
|
+
start_line=_start_line,
|
|
488
|
+
end_line=_end_line,
|
|
489
|
+
start_cols=_start_cols,
|
|
490
|
+
start_function=_start_function,
|
|
491
|
+
**_must_args)
|
|
492
|
+
if is_enough_info == False: return sumstats
|
|
493
|
+
############################################################################################
|
|
478
494
|
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
is_chr_invalid = (~is_chr_fixable)&(~is_chr_na)
|
|
486
|
-
if sum(is_chr_invalid)>0 and verbose:
|
|
487
|
-
log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid))
|
|
488
|
-
try:
|
|
489
|
-
log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()))
|
|
490
|
-
except:
|
|
491
|
-
pass
|
|
492
|
-
elif verbose:
|
|
493
|
-
log.write(" -No unrecognized chromosome notations...")
|
|
494
|
-
|
|
495
|
-
# Assign good chr back to sumstats
|
|
496
|
-
sumstats.loc[is_chr_fixable.index,chrom] = chr_extracted[is_chr_fixable.index]
|
|
495
|
+
#chrom_list = get_chr_list() #bottom
|
|
496
|
+
if chrom_list is None:
|
|
497
|
+
chrom_list = get_chr_list()
|
|
498
|
+
#if check_col(sumstats,chrom,status) is not True:
|
|
499
|
+
# if verbose: log.write(".fix_chr: Specified not detected..skipping...")
|
|
500
|
+
# return sumstats
|
|
497
501
|
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
# if sumstats contain sex CHR
|
|
505
|
-
if sum(sex_chr)>0:
|
|
506
|
-
if verbose: log.write(" -Identifying non-autosomal chromosomes : {}, {}, and {} ...".format(x[0],y[0],mt[0]))
|
|
507
|
-
if verbose: log.write(" -Identified ",str(sum(sex_chr))," variants on sex chromosomes...")
|
|
508
|
-
|
|
509
|
-
# convert "X, Y, MT" to numbers
|
|
510
|
-
convert_num_to_xymt={}
|
|
511
|
-
if x[0].lower() in sumstats[chrom].values or x[0].upper() in sumstats[chrom].values:
|
|
512
|
-
convert_num_to_xymt[x[0].lower()] = str(x[1])
|
|
513
|
-
convert_num_to_xymt[x[0].upper()] = str(x[1])
|
|
514
|
-
if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(x[0], x[1]))
|
|
515
|
-
if y[0].lower() in sumstats[chrom].values or y[0].upper() in sumstats[chrom].values:
|
|
516
|
-
convert_num_to_xymt[y[0].lower()] = str(y[1])
|
|
517
|
-
convert_num_to_xymt[y[0].upper()] = str(y[1])
|
|
518
|
-
if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(y[0], y[1]))
|
|
519
|
-
if mt[0].lower() in sumstats[chrom].values or mt[0].upper() in sumstats[chrom].values:
|
|
520
|
-
convert_num_to_xymt[mt[0].lower()] = str(mt[1])
|
|
521
|
-
convert_num_to_xymt[mt[0].upper()] = str(mt[1])
|
|
522
|
-
if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(mt[0], mt[1]))
|
|
523
|
-
sumstats.loc[sex_chr,chrom] =sumstats.loc[sex_chr,chrom].map(convert_num_to_xymt)
|
|
524
|
-
|
|
525
|
-
# change status code
|
|
526
|
-
sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
|
|
527
|
-
if len(is_chr_fixable.index)>0:
|
|
528
|
-
sumstats.loc[is_chr_fixable.index,status] = vchange_status(sumstats.loc[is_chr_fixable.index,status],4,"986","520")
|
|
529
|
-
if len(is_chr_fixable.index)>0:
|
|
530
|
-
sumstats.loc[is_chr_invalid.index,status] = vchange_status(sumstats.loc[is_chr_invalid.index,status],4,"986","743")
|
|
531
|
-
|
|
532
|
-
# check variants with unrecognized CHR
|
|
533
|
-
unrecognized_num = sum(~sumstats[chrom].isin(chrom_list))
|
|
534
|
-
if (remove is True) and unrecognized_num>0:
|
|
535
|
-
# remove variants with unrecognized CHR
|
|
536
|
-
try:
|
|
537
|
-
if verbose: log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])))
|
|
538
|
-
except:
|
|
539
|
-
pass
|
|
540
|
-
if verbose: log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.")
|
|
541
|
-
try:
|
|
542
|
-
log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()))
|
|
543
|
-
except:
|
|
544
|
-
pass
|
|
545
|
-
#sumstats = sumstats.loc[sumstats.index[sumstats[chrom].isin(chrom_list)],:]
|
|
546
|
-
good_chr = sumstats[chrom].isin(chrom_list)
|
|
547
|
-
sumstats = sumstats.loc[good_chr, :].copy()
|
|
502
|
+
|
|
503
|
+
# convert to string datatype
|
|
504
|
+
try:
|
|
505
|
+
if verbose: log.write(" -Checking CHR data type...")
|
|
506
|
+
if sumstats[chrom].dtype == "string":
|
|
507
|
+
pass
|
|
548
508
|
else:
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
if sum(out_of_range_chr)>0:
|
|
563
|
-
if verbose: log.write(" -Sanity check for CHR...")
|
|
564
|
-
if verbose:log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr))
|
|
565
|
-
sumstats = sumstats.loc[~out_of_range_chr,:]
|
|
566
|
-
|
|
567
|
-
if verbose: log.write("Finished fixing chromosome notation successfully!")
|
|
509
|
+
sumstats[chrom] = sumstats[chrom].astype("string")
|
|
510
|
+
except:
|
|
511
|
+
if verbose: log.write(" -Force converting to pd string data type...")
|
|
512
|
+
sumstats[chrom] = sumstats[chrom].astype("string")
|
|
513
|
+
|
|
514
|
+
# check if CHR is numeric
|
|
515
|
+
is_chr_fixed = sumstats[chrom].str.isnumeric()
|
|
516
|
+
# fill NAs with False
|
|
517
|
+
is_chr_fixed = is_chr_fixed.fillna(False)
|
|
518
|
+
if verbose: log.write(" -Variants with standardized chromosome notation:",sum(is_chr_fixed))
|
|
519
|
+
|
|
520
|
+
# if there are variants whose CHR need to be fixed
|
|
521
|
+
if sum(is_chr_fixed)<len(sumstats):
|
|
568
522
|
|
|
569
|
-
|
|
523
|
+
#extract the CHR number or X Y M MT
|
|
524
|
+
chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'^(chr)?(\d{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
570
525
|
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
if
|
|
577
|
-
|
|
578
|
-
return sumstats
|
|
579
|
-
if verbose: log.write("Start to fix basepair positions...{}".format(_get_version()))
|
|
580
|
-
check_dataframe_shape(sumstats, log, verbose)
|
|
526
|
+
is_chr_fixable = ~chr_extracted.isna()
|
|
527
|
+
if verbose: log.write(" -Variants with fixable chromosome notations:",sum(is_chr_fixable))
|
|
528
|
+
|
|
529
|
+
# For not fixed variants, check if na
|
|
530
|
+
is_chr_na = sumstats.loc[~is_chr_fixed, chrom].isna()
|
|
531
|
+
if sum(is_chr_na)>0 and verbose:
|
|
532
|
+
log.write(" -Variants with NA chromosome notations:",sum(is_chr_na))
|
|
581
533
|
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
534
|
+
# Check variants with CHR being not NA and not fixable
|
|
535
|
+
is_chr_invalid = (~is_chr_fixable)&(~is_chr_na)
|
|
536
|
+
if sum(is_chr_invalid)>0 and verbose:
|
|
537
|
+
log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid))
|
|
538
|
+
try:
|
|
539
|
+
log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()))
|
|
540
|
+
except:
|
|
541
|
+
pass
|
|
542
|
+
elif verbose:
|
|
543
|
+
log.write(" -No unrecognized chromosome notations...")
|
|
585
544
|
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
sumstats.loc[:,pos] = sumstats.loc[:,pos].astype('string')
|
|
589
|
-
# if so, remove thousands separator
|
|
590
|
-
if verbose: log.write(' -Removing thousands separator "," or underbar "_" ...')
|
|
591
|
-
sumstats.loc[~is_pos_na, pos] = sumstats.loc[~is_pos_na, pos].str.replace(r'[,_]', '' ,regex=True)
|
|
592
|
-
except:
|
|
593
|
-
pass
|
|
545
|
+
# Assign good chr back to sumstats
|
|
546
|
+
sumstats.loc[is_chr_fixable.index,chrom] = chr_extracted[is_chr_fixable.index]
|
|
594
547
|
|
|
595
|
-
#
|
|
596
|
-
|
|
597
|
-
if verbose: log.write(' -Converting to Int64 data type ...')
|
|
598
|
-
sumstats[pos] = sumstats[pos].astype('Int64')
|
|
599
|
-
except:
|
|
600
|
-
if verbose: log.write(' -Force converting to Int64 data type ...')
|
|
601
|
-
sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
|
|
602
|
-
is_pos_fixed = ~sumstats.loc[:,pos].isna()
|
|
603
|
-
is_pos_invalid = (~is_pos_na)&(~is_pos_fixed)
|
|
548
|
+
# X, Y, MT to 23,24,25
|
|
549
|
+
xymt_list = [x[0].lower(),y[0].lower(),mt[0].lower(),x[0].upper(),y[0].upper(),mt[0].upper()]
|
|
604
550
|
|
|
605
|
-
sumstats
|
|
606
|
-
|
|
551
|
+
# check if sumstats contain sex CHR
|
|
552
|
+
sex_chr = sumstats[chrom].isin(xymt_list)
|
|
607
553
|
|
|
608
|
-
#
|
|
609
|
-
if
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
554
|
+
# if sumstats contain sex CHR
|
|
555
|
+
if sum(sex_chr)>0:
|
|
556
|
+
if verbose: log.write(" -Identifying non-autosomal chromosomes : {}, {}, and {} ...".format(x[0],y[0],mt[0]))
|
|
557
|
+
if verbose: log.write(" -Identified ",str(sum(sex_chr))," variants on sex chromosomes...")
|
|
558
|
+
|
|
559
|
+
# convert "X, Y, MT" to numbers
|
|
560
|
+
convert_num_to_xymt={}
|
|
561
|
+
if x[0].lower() in sumstats[chrom].values or x[0].upper() in sumstats[chrom].values:
|
|
562
|
+
convert_num_to_xymt[x[0].lower()] = str(x[1])
|
|
563
|
+
convert_num_to_xymt[x[0].upper()] = str(x[1])
|
|
564
|
+
if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(x[0], x[1]))
|
|
565
|
+
if y[0].lower() in sumstats[chrom].values or y[0].upper() in sumstats[chrom].values:
|
|
566
|
+
convert_num_to_xymt[y[0].lower()] = str(y[1])
|
|
567
|
+
convert_num_to_xymt[y[0].upper()] = str(y[1])
|
|
568
|
+
if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(y[0], y[1]))
|
|
569
|
+
if mt[0].lower() in sumstats[chrom].values or mt[0].upper() in sumstats[chrom].values:
|
|
570
|
+
convert_num_to_xymt[mt[0].lower()] = str(mt[1])
|
|
571
|
+
convert_num_to_xymt[mt[0].upper()] = str(mt[1])
|
|
572
|
+
if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(mt[0], mt[1]))
|
|
573
|
+
sumstats.loc[sex_chr,chrom] =sumstats.loc[sex_chr,chrom].map(convert_num_to_xymt)
|
|
619
574
|
|
|
620
|
-
|
|
621
|
-
|
|
575
|
+
# change status code
|
|
576
|
+
sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
|
|
577
|
+
if len(is_chr_fixable.index)>0:
|
|
578
|
+
sumstats.loc[is_chr_fixable.index,status] = vchange_status(sumstats.loc[is_chr_fixable.index,status],4,"986","520")
|
|
579
|
+
if len(is_chr_fixable.index)>0:
|
|
580
|
+
sumstats.loc[is_chr_invalid.index,status] = vchange_status(sumstats.loc[is_chr_invalid.index,status],4,"986","743")
|
|
622
581
|
|
|
623
|
-
|
|
582
|
+
# check variants with unrecognized CHR
|
|
583
|
+
unrecognized_num = sum(~sumstats[chrom].isin(chrom_list))
|
|
584
|
+
if (remove is True) and unrecognized_num>0:
|
|
585
|
+
# remove variants with unrecognized CHR
|
|
586
|
+
try:
|
|
587
|
+
if verbose: log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])))
|
|
588
|
+
except:
|
|
589
|
+
pass
|
|
590
|
+
if verbose: log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.")
|
|
591
|
+
try:
|
|
592
|
+
log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()))
|
|
593
|
+
except:
|
|
594
|
+
pass
|
|
595
|
+
#sumstats = sumstats.loc[sumstats.index[sumstats[chrom].isin(chrom_list)],:]
|
|
596
|
+
good_chr = sumstats[chrom].isin(chrom_list)
|
|
597
|
+
sumstats = sumstats.loc[good_chr, :].copy()
|
|
598
|
+
else:
|
|
599
|
+
if verbose: log.write(" -All CHR are already fixed...")
|
|
600
|
+
sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
|
|
601
|
+
|
|
602
|
+
# Convert string to int
|
|
603
|
+
try:
|
|
604
|
+
sumstats[chrom] = sumstats[chrom].astype('Int64')
|
|
605
|
+
except:
|
|
606
|
+
# # force convert
|
|
607
|
+
sumstats[chrom] = np.floor(pd.to_numeric(sumstats[chrom], errors='coerce')).astype('Int64')
|
|
608
|
+
|
|
609
|
+
# filter out variants with CHR <=0
|
|
610
|
+
out_of_range_chr = sumstats[chrom] < minchr
|
|
611
|
+
out_of_range_chr = out_of_range_chr.fillna(False)
|
|
612
|
+
if sum(out_of_range_chr)>0:
|
|
613
|
+
if verbose: log.write(" -Sanity check for CHR...")
|
|
614
|
+
if verbose:log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr))
|
|
615
|
+
sumstats = sumstats.loc[~out_of_range_chr,:]
|
|
616
|
+
|
|
617
|
+
finished(log,verbose,_end_line)
|
|
618
|
+
return sumstats
|
|
619
|
+
|
|
620
|
+
###############################################################################################################
|
|
621
|
+
# 20230128
|
|
622
|
+
def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True, lower_limit=0 , upper_limit=None , limit=250000000, log=Log()):
|
|
623
|
+
##start function with col checking##########################################################
|
|
624
|
+
_start_line = "fix basepair positions (POS)"
|
|
625
|
+
_end_line = "fixing basepair positions (POS)"
|
|
626
|
+
_start_cols =[pos,status]
|
|
627
|
+
_start_function = ".fix_pos()"
|
|
628
|
+
_must_args ={}
|
|
629
|
+
|
|
630
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
631
|
+
log=log,
|
|
632
|
+
verbose=verbose,
|
|
633
|
+
start_line=_start_line,
|
|
634
|
+
end_line=_end_line,
|
|
635
|
+
start_cols=_start_cols,
|
|
636
|
+
start_function=_start_function,
|
|
637
|
+
**_must_args)
|
|
638
|
+
if is_enough_info == False: return sumstats
|
|
639
|
+
############################################################################################
|
|
640
|
+
|
|
641
|
+
if upper_limit is None:
|
|
642
|
+
upper_limit = limit
|
|
643
|
+
|
|
644
|
+
all_var_num = len(sumstats)
|
|
645
|
+
#convert to numeric
|
|
646
|
+
is_pos_na = sumstats[pos].isna()
|
|
647
|
+
|
|
648
|
+
try:
|
|
649
|
+
if str(sumstats[pos].dtype) == "string" or str(sumstats[pos].dtype) == "object":
|
|
650
|
+
sumstats[pos] = sumstats[pos].astype('string')
|
|
651
|
+
# if so, remove thousands separator
|
|
652
|
+
if verbose: log.write(' -Removing thousands separator "," or underbar "_" ...')
|
|
653
|
+
sumstats.loc[~is_pos_na, pos] = sumstats.loc[~is_pos_na, pos].str.replace(r'[,_]', '' ,regex=True)
|
|
654
|
+
except:
|
|
655
|
+
pass
|
|
656
|
+
|
|
657
|
+
# convert POS to integer
|
|
658
|
+
try:
|
|
659
|
+
if verbose: log.write(' -Converting to Int64 data type ...')
|
|
660
|
+
sumstats[pos] = sumstats[pos].astype('Int64')
|
|
661
|
+
except:
|
|
662
|
+
if verbose: log.write(' -Force converting to Int64 data type ...')
|
|
663
|
+
sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
|
|
664
|
+
is_pos_fixed = ~sumstats[pos].isna()
|
|
665
|
+
is_pos_invalid = (~is_pos_na)&(~is_pos_fixed)
|
|
666
|
+
|
|
667
|
+
sumstats.loc[is_pos_fixed,status] = vchange_status(sumstats.loc[is_pos_fixed,status] ,4,"975","630")
|
|
668
|
+
sumstats.loc[is_pos_invalid,status] = vchange_status(sumstats.loc[is_pos_invalid,status],4,"975","842")
|
|
669
|
+
|
|
670
|
+
# remove outlier, limit:250,000,000
|
|
671
|
+
if verbose: log.write(" -Position bound:({} , {:,})".format(lower_limit, upper_limit))
|
|
672
|
+
is_pos_na = sumstats[pos].isna()
|
|
673
|
+
out_lier= ((sumstats[pos]<=lower_limit) | (sumstats[pos]>=upper_limit)) & (~is_pos_na)
|
|
674
|
+
if verbose: log.write(" -Removed outliers:",sum(out_lier))
|
|
675
|
+
sumstats = sumstats.loc[~out_lier,:]
|
|
676
|
+
#remove na
|
|
677
|
+
if remove is True:
|
|
678
|
+
sumstats = sumstats.loc[~sumstats[pos].isna(),:]
|
|
679
|
+
remain_var_num = len(sumstats)
|
|
680
|
+
if verbose: log.write(" -Removed "+str(all_var_num - remain_var_num)+" variants with bad positions.")
|
|
681
|
+
|
|
682
|
+
finished(log,verbose,_end_line)
|
|
683
|
+
return sumstats
|
|
624
684
|
|
|
625
685
|
###############################################################################################################
|
|
626
686
|
# 20220514
|
|
627
687
|
def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=True,log=Log()):
    """Standardize the effect / non-effect allele columns and flag allele types in STATUS.

    Steps performed on ``sumstats`` (a pandas DataFrame):

    1. Upper-case both allele columns and store them as categoricals sharing
       one category set (which always contains ``"N"``).
    2. Flag alleles that are missing or contain characters other than A/C/T/G,
       and variants whose EA equals NEA.
    3. If ``remove`` is True, drop those variants; otherwise replace missing
       alleles with ``"N"``.
    4. Update the STATUS codes through ``vchange_status`` (defined elsewhere in
       this module; presumably it rewrites one digit of each status code --
       confirm against its definition) to record invalid / NA / SNP / indel /
       normalized / not-normalized alleles.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Summary statistics table; must contain the ``ea``, ``nea`` and
        ``status`` columns (checked by ``start_to``).
    ea, nea, status : str
        Column names of the effect allele, non-effect allele and status code.
    remove : bool
        Drop variants with bad alleles or identical EA/NEA when True.
    verbose : bool
        Forwarded to the logger.
    log : Log
        Logger object exposing ``write``.

    Returns
    -------
    pandas.DataFrame
        The updated table (a filtered copy when ``remove`` is True).
    """
    ##start function with col checking##########################################################
    _start_line = "fix alleles (EA and NEA)"
    _end_line = "fixing alleles (EA and NEA)"
    _start_cols = [ea, nea, status]
    _start_function = ".fix_allele()"
    _must_args = {}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False:
        return sumstats
    ############################################################################################

    log.write(" -Converted all bases to string datatype and UPPERCASE.", verbose=verbose)
    # Upper-case once and reuse the result for both the category set and the columns.
    ea_upper = sumstats[ea].str.upper()
    nea_upper = sumstats[nea].str.upper()
    categories = set(ea_upper) | set(nea_upper) | {"N"}
    categories = {x for x in categories if pd.notna(x)}
    sumstats[ea] = pd.Categorical(ea_upper, categories=categories)
    sumstats[nea] = pd.Categorical(nea_upper, categories=categories)
    all_var_num = len(sumstats)

    ## check ATCG (na=True makes missing alleles count as bad)
    bad_ea = sumstats[ea].str.contains("[^actgACTG]", na=True)
    bad_nea = sumstats[nea].str.contains("[^actgACTG]", na=True)
    good_ea = ~bad_ea
    good_nea = ~bad_nea

    log.write(" -Variants with bad EA : {}".format(bad_ea.sum()), verbose=verbose)
    log.write(" -Variants with bad NEA : {}".format(bad_nea.sum()), verbose=verbose)

    ## check NA
    is_eanea_na = sumstats[ea].isna() | sumstats[nea].isna()
    log.write(" -Variants with NA for EA or NEA: {}".format(is_eanea_na.sum()), verbose=verbose)

    ## check same alleles
    not_variant = sumstats[nea] == sumstats[ea]
    log.write(" -Variants with same EA and NEA: {}".format(not_variant.sum()), verbose=verbose)

    ## sum up invalid variants
    is_invalid = bad_ea | bad_nea | not_variant
    exclude = bad_nea | bad_ea

    if verbose:
        bad_ea_examples = set(sumstats.loc[bad_ea, ea].head())
        if len(bad_ea_examples) > 0:
            log.write(" -A look at the non-ATCG EA:", bad_ea_examples, "...")
        bad_nea_examples = set(sumstats.loc[bad_nea, nea].head())
        if len(bad_nea_examples) > 0:
            log.write(" -A look at the non-ATCG NEA:", bad_nea_examples, "...")

    if remove == True:
        is_good = good_ea & good_nea
        keep = is_good & (~not_variant)
        good_eanea_num = int(is_good.sum())
        sumstats = sumstats.loc[keep, :].copy()
        good_eanea_notsame_num = len(sumstats)
        log.write(" -Removed "+str(all_var_num - good_eanea_num)+" variants with NA alleles or alleles that contain bases other than A/C/T/G.", verbose=verbose)
        log.write(" -Removed "+str(good_eanea_num - good_eanea_notsame_num)+" variants with same allele for EA and NEA.", verbose=verbose)
        # The masks above were built on the unfiltered table. Subset them
        # positionally so that they line up row-for-row with the filtered
        # table instead of relying on implicit index alignment later on.
        kept = keep.to_numpy(dtype=bool)
        good_ea = good_ea[kept]
        good_nea = good_nea[kept]
        is_invalid = is_invalid[kept]
        is_eanea_na = is_eanea_na[kept]
    else:
        sumstats[[ea, nea]] = sumstats[[ea, nea]].fillna("N")
        log.write(" -Detected "+str(exclude.sum())+" variants with alleles that contain bases other than A/C/T/G .", verbose=verbose)

    # Rebuild the shared category set from the alleles that are still present.
    ea_upper = sumstats[ea].str.upper()
    nea_upper = sumstats[nea].str.upper()
    categories = set(ea_upper) | set(nea_upper) | {"N"}
    sumstats[ea] = pd.Categorical(ea_upper, categories=categories)
    sumstats[nea] = pd.Categorical(nea_upper, categories=categories)

    # Classify variants by allele length (computed once per column).
    ea_len = sumstats[ea].str.len()
    nea_len = sumstats[nea].str.len()
    is_eanea_fixed = good_ea | good_nea
    is_snp = (ea_len == 1) & (nea_len == 1)
    is_indel = ea_len != nea_len
    is_not_normalized = (ea_len > 1) & (nea_len > 1)
    is_normalized = is_indel & (((ea_len == 1) & (nea_len > 1)) | ((ea_len > 1) & (nea_len == 1)))

    # Order matters: later updates only touch rows whose current code still
    # matches the "before" pattern passed to vchange_status.
    status_updates = (
        (is_invalid, "9", "6"),
        (is_eanea_na, "9", "7"),
        (is_eanea_fixed & is_not_normalized, "9", "5"),
        (is_eanea_fixed & is_snp, "9", "0"),
        (is_eanea_fixed & is_indel, "9", "4"),
        (is_eanea_fixed & is_normalized, "4", "3"),
    )
    for mask, before, after in status_updates:
        if mask.any():
            sumstats.loc[mask, status] = vchange_status(sumstats.loc[mask, status], 5, before, after)

    finished(log,verbose,_end_line)
    return sumstats
|
|
721
787
|
|
|
722
788
|
###############################################################################################################
|
|
723
789
|
# 20220721
|
|
724
790
|
|
|
725
791
|
def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
792
|
+
##start function with col checking##########################################################
|
|
793
|
+
_start_line = "normalize indels"
|
|
794
|
+
_end_line = "normalizing indels"
|
|
795
|
+
_start_cols =[ea, nea,status]
|
|
796
|
+
_start_function = ".normalize()"
|
|
797
|
+
_must_args ={}
|
|
798
|
+
|
|
799
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
800
|
+
log=log,
|
|
801
|
+
verbose=verbose,
|
|
802
|
+
start_line=_start_line,
|
|
803
|
+
end_line=_end_line,
|
|
804
|
+
start_cols=_start_cols,
|
|
805
|
+
start_function=_start_function,
|
|
806
|
+
**_must_args)
|
|
807
|
+
if is_enough_info == False: return sumstats
|
|
808
|
+
############################################################################################
|
|
809
|
+
|
|
732
810
|
#variants_to_check = status_match(sumstats[status],5,[4,5]) #
|
|
733
811
|
#r'\w\w\w\w[45]\w\w'
|
|
734
812
|
variants_to_check = sumstats[status].str[4].str.match(r'4|5', case=False, flags=0, na=False)
|
|
@@ -742,7 +820,8 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
|
|
|
742
820
|
n_cores=1
|
|
743
821
|
pool = Pool(n_cores)
|
|
744
822
|
map_func = partial(normalizeallele,pos=pos,nea=nea,ea=ea,status=status)
|
|
745
|
-
df_split = np.array_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
|
|
823
|
+
#df_split = np.array_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
|
|
824
|
+
df_split = _df_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
|
|
746
825
|
normalized_pd = pd.concat(pool.map(map_func,df_split))
|
|
747
826
|
pool.close()
|
|
748
827
|
pool.join()
|
|
@@ -772,16 +851,16 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
|
|
|
772
851
|
else:
|
|
773
852
|
log.write(" -All variants are already normalized..")
|
|
774
853
|
###################################################################################################################
|
|
775
|
-
categories = set(sumstats
|
|
776
|
-
sumstats
|
|
777
|
-
sumstats
|
|
854
|
+
categories = set(sumstats[ea])|set(sumstats[nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
|
|
855
|
+
sumstats[ea] = pd.Categorical(sumstats[ea],categories = categories)
|
|
856
|
+
sumstats[nea] = pd.Categorical(sumstats[nea],categories = categories )
|
|
778
857
|
sumstats.loc[variants_to_check,[pos,nea,ea,status]] = normalized_pd.values
|
|
779
858
|
try:
|
|
780
|
-
sumstats
|
|
859
|
+
sumstats[pos] = sumstats[pos].astype('Int64')
|
|
781
860
|
except:
|
|
782
|
-
sumstats
|
|
861
|
+
sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
|
|
783
862
|
|
|
784
|
-
|
|
863
|
+
finished(log,verbose,_end_line)
|
|
785
864
|
return sumstats
|
|
786
865
|
|
|
787
866
|
def normalizeallele(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
|
|
@@ -846,6 +925,52 @@ def add_tolerence(stats, float_tolerence, mode):
|
|
|
846
925
|
stats = (stats[0] , stats[1] + float_tolerence if stats[0]!=float("Inf") else float("Inf"))
|
|
847
926
|
return stats
|
|
848
927
|
|
|
928
|
+
|
|
929
|
+
def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, verbose, dtype="Int64"):
    """Coerce one column to ``dtype`` and drop rows whose value is out of range.

    Nothing happens unless ``header`` is listed in ``coltocheck`` and is a
    column of ``sumstats``. For ``header == "STATUS"`` the column is only
    converted to a categorical and no rows are removed. For every other column
    the values are converted to numbers (unparsable values become missing) and
    rows that are missing or outside the inclusive interval
    ``[var_range[0], var_range[1]]`` are removed.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Table to check; the checked column is overwritten in place.
    var_range : tuple
        ``(lower, upper)`` bounds, both inclusive.
    header : str
        Name of the column to check.
    coltocheck : collection of str
        Columns the caller asked to check.
    cols_to_check : list
        Output list; ``header`` is appended when the column is checked.
    log : object
        Logger exposing ``write(..., verbose=...)`` and ``warning(...)``.
    verbose : bool
        Forwarded to ``log.write``.
    dtype : str
        Target dtype name. Integer dtypes floor the values before casting.

    Returns
    -------
    pandas.DataFrame
        ``sumstats`` itself, or the subset of rows that passed the check.
    """
    if not (header in coltocheck and header in sumstats.columns):
        return sumstats

    pre_number = len(sumstats)
    cols_to_check.append(header)

    if header == "STATUS":
        log.write(" -Checking STATUS and converting STATUS to categories....", verbose=verbose)
        categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
        sumstats[header] = pd.Categorical(sumstats[header], categories=categories)
        return sumstats

    # "in64" is a historical misspelling of "int64"; it is still accepted here
    # so the set of handled dtype names only grows.
    int_dtypes = ("Int64", "Int32", "int", "int32", "int64", "in64")
    float_dtypes = ("Float64", "Float32", "float", "float64", "float32")

    if dtype in int_dtypes:
        log.write(" -Checking if {} <= {} <= {} ...".format(var_range[0], header, var_range[1]), verbose=verbose)
        sumstats[header] = np.floor(pd.to_numeric(sumstats[header], errors='coerce')).astype(dtype)
    elif dtype in float_dtypes:
        log.write(" -Checking if {} < {} < {} ...".format(var_range[0], header, var_range[1]), verbose=verbose)
        sumstats[header] = pd.to_numeric(sumstats[header], errors='coerce').astype(dtype)

    # Missing values compare as NA/False, so they are treated as invalid.
    is_valid = (sumstats[header] >= var_range[0]) & (sumstats[header] <= var_range[1])
    is_valid = is_valid.fillna(False).astype(bool)
    is_bad = ~is_valid

    if header == "P":
        n_low_p = int((sumstats["P"] == 0).sum())
        if n_low_p > 0:
            log.warning("Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(n_low_p))
            log.warning("Please consider using MLOG10P instead.")

    if is_bad.any():
        # Showing examples is best-effort: a failure here must not abort the check.
        try:
            id_to_use = next((col for col in ("SNPID", "rsID") if col in sumstats.columns), None)
            if id_to_use is not None:
                invalid_ids = sumstats.loc[is_bad, id_to_use].head().astype("string").fillna("NA")
                log.write(" -Examples of invalid variants({}): {} ...".format(id_to_use, ",".join(invalid_ids.to_list()) ), verbose=verbose)
            invalid_values = sumstats.loc[is_bad, header].head().astype("string").fillna("NA")
            log.write(" -Examples of invalid values ({}): {} ...".format(header, ",".join(invalid_values.to_list()) ), verbose=verbose)
        except Exception as e:
            log.write(" -Could not list examples of invalid values: {}".format(e), verbose=verbose)

    sumstats = sumstats.loc[is_valid, :]
    after_number = len(sumstats)
    log.write(" -Removed {} variants with bad/na {}.".format(pre_number - after_number, header), verbose=verbose)
    return sumstats
|
|
973
|
+
|
|
849
974
|
def sanitycheckstats(sumstats,
|
|
850
975
|
coltocheck=None,
|
|
851
976
|
n=(0,2**31-1),
|
|
@@ -853,8 +978,10 @@ def sanitycheckstats(sumstats,
|
|
|
853
978
|
ncontrol=(0,2**31-1),
|
|
854
979
|
eaf=(0,1),
|
|
855
980
|
mac=(0,2**31-1),
|
|
981
|
+
maf=(0,0.5),
|
|
856
982
|
chisq=(0,float("Inf")),
|
|
857
983
|
z=(-9999,9999),
|
|
984
|
+
t=(-99999,99999),
|
|
858
985
|
f=(0,float("Inf")),
|
|
859
986
|
p=(0,1),
|
|
860
987
|
mlog10p=(0,9999),
|
|
@@ -885,10 +1012,30 @@ def sanitycheckstats(sumstats,
|
|
|
885
1012
|
HR_95U: float64 , HR_95L >0
|
|
886
1013
|
INFO: float32 , 1>=INFO>0
|
|
887
1014
|
Z float64 , -9999 < Z < 9999
|
|
1015
|
+
T float64 , -99999 < T < 99999
|
|
888
1016
|
F float64 , F > 0
|
|
889
1017
|
'''
|
|
1018
|
+
##start function with col checking##########################################################
|
|
1019
|
+
_start_line = "perform sanity check for statistics"
|
|
1020
|
+
_end_line = "sanity check for statistics"
|
|
1021
|
+
_start_cols =[]
|
|
1022
|
+
_start_function = ".check_sanity()"
|
|
1023
|
+
_must_args ={}
|
|
1024
|
+
|
|
1025
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
1026
|
+
log=log,
|
|
1027
|
+
verbose=verbose,
|
|
1028
|
+
start_line=_start_line,
|
|
1029
|
+
end_line=_end_line,
|
|
1030
|
+
start_cols=_start_cols,
|
|
1031
|
+
start_function=_start_function,
|
|
1032
|
+
**_must_args)
|
|
1033
|
+
if is_enough_info == False: return sumstats
|
|
1034
|
+
############################################################################################
|
|
890
1035
|
|
|
1036
|
+
if verbose: log.write(" -Comparison tolerance for floats: {}".format(float_tolerence))
|
|
891
1037
|
eaf = add_tolerence(eaf, float_tolerence, "lr")
|
|
1038
|
+
maf = add_tolerence(maf, float_tolerence, "lr")
|
|
892
1039
|
beta = add_tolerence(beta, float_tolerence, "lr")
|
|
893
1040
|
se = add_tolerence(se, float_tolerence, "lr")
|
|
894
1041
|
mlog10p = add_tolerence(mlog10p, float_tolerence, "lr")
|
|
@@ -903,233 +1050,83 @@ def sanitycheckstats(sumstats,
|
|
|
903
1050
|
p = add_tolerence(p, float_tolerence, "lr")
|
|
904
1051
|
f = add_tolerence(f, float_tolerence, "lr")
|
|
905
1052
|
chisq = add_tolerence(chisq, float_tolerence, "lr")
|
|
906
|
-
|
|
907
|
-
|
|
1053
|
+
############################################################################################
|
|
908
1054
|
## add direction
|
|
909
1055
|
if coltocheck is None:
|
|
910
1056
|
coltocheck = ["P","MLOG10P","INFO","Z","BETA","SE","EAF","CHISQ","F","N","N_CASE","N_CONTROL","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","STATUS"]
|
|
911
|
-
|
|
912
|
-
check_dataframe_shape(sumstats, log, verbose)
|
|
1057
|
+
|
|
913
1058
|
cols_to_check=[]
|
|
914
1059
|
oringinal_number=len(sumstats)
|
|
915
1060
|
sumstats = sumstats.copy()
|
|
916
1061
|
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
cols_to_check.append("N")
|
|
922
|
-
if verbose: log.write(" -Checking if ",n[0],"<=N<=",n[1]," ...")
|
|
923
|
-
sumstats.loc[:,"N"] = np.floor(pd.to_numeric(sumstats.loc[:,"N"], errors='coerce')).astype("Int64")
|
|
924
|
-
sumstats = sumstats.loc[(sumstats["N"]>=n[0]) & (sumstats["N"]<=n[1]),:]
|
|
925
|
-
after_number=len(sumstats)
|
|
926
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N.")
|
|
927
|
-
pre_number=len(sumstats)
|
|
928
|
-
if "N_CASE" in coltocheck and "N_CASE" in sumstats.columns:
|
|
929
|
-
cols_to_check.append("N_CASE")
|
|
930
|
-
if verbose: log.write(" -Checking if ",ncase[0],"<=N_CASE<=",ncase[1]," ...")
|
|
931
|
-
sumstats.loc[:,"N_CASE"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CASE"], errors='coerce')).astype("Int64")
|
|
932
|
-
sumstats = sumstats.loc[(sumstats["N_CASE"]>=ncase[0]) & (sumstats["N_CASE"]<=ncase[1]),:]
|
|
933
|
-
after_number=len(sumstats)
|
|
934
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N_CASE.")
|
|
935
|
-
pre_number=len(sumstats)
|
|
936
|
-
if "N_CONTROL" in coltocheck and "N_CONTROL" in sumstats.columns:
|
|
937
|
-
cols_to_check.append("N_CONTROL")
|
|
938
|
-
if verbose: log.write(" -Checking if ",ncontrol[0],"<=N_CONTROL<=",ncontrol[1]," ...")
|
|
939
|
-
sumstats.loc[:,"N_CONTROL"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CONTROL"], errors='coerce')).astype("Int64")
|
|
940
|
-
sumstats = sumstats.loc[(sumstats["N_CONTROL"]>=ncontrol[0]) & (sumstats["N_CONTROL"]<=ncontrol[1]),:]
|
|
941
|
-
after_number=len(sumstats)
|
|
942
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N_CONTROL.")
|
|
1062
|
+
###Int64 ################################################################################################################################################
|
|
1063
|
+
sumstats = check_range(sumstats, var_range=n, header="N", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
|
|
1064
|
+
sumstats = check_range(sumstats, var_range=ncase, header="N_CASE", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
|
|
1065
|
+
sumstats = check_range(sumstats, var_range=ncontrol, header="N_CONTROL", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
|
|
943
1066
|
|
|
1067
|
+
###float32 ################################################################################################################################################
|
|
1068
|
+
sumstats = check_range(sumstats, var_range=eaf, header="EAF", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
|
|
1069
|
+
sumstats = check_range(sumstats, var_range=maf, header="MAF", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
|
|
1070
|
+
sumstats = check_range(sumstats, var_range=info, header="INFO", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
|
|
944
1071
|
|
|
945
|
-
###
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
macu = ( sumstats["_MAC"] <= mac[1])
|
|
963
|
-
sumstats = sumstats.loc[macl&macu,:]
|
|
964
|
-
sumstats = sumstats.drop(labels=["_MAF","_MAC"],axis=1)
|
|
965
|
-
after_number=len(sumstats)
|
|
966
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad MAC.")
|
|
967
|
-
|
|
968
|
-
###TEST STATISTICS################################################################################################################################################
|
|
969
|
-
pre_number=len(sumstats)
|
|
970
|
-
if "CHISQ" in coltocheck and "CHISQ" in sumstats.columns:
|
|
971
|
-
cols_to_check.append("CHISQ")
|
|
972
|
-
if verbose: log.write(" -Checking if ",chisq[0],"<CHISQ<",chisq[1]," ...")
|
|
973
|
-
sumstats.loc[:,"CHISQ"] = pd.to_numeric(sumstats.loc[:,"CHISQ"], errors='coerce').astype("float64")
|
|
974
|
-
sumstats = sumstats.loc[(sumstats["CHISQ"]>chisq[0]) & (sumstats["CHISQ"]<chisq[1]),:]
|
|
975
|
-
after_number=len(sumstats)
|
|
976
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad CHISQ.")
|
|
977
|
-
|
|
978
|
-
pre_number=len(sumstats)
|
|
979
|
-
if "Z" in coltocheck and "Z" in sumstats.columns:
|
|
980
|
-
cols_to_check.append("Z")
|
|
981
|
-
if verbose: log.write(" -Checking if ",z[0],"<Z<",z[1]," ...")
|
|
982
|
-
sumstats.loc[:,"Z"] = pd.to_numeric(sumstats.loc[:,"Z"], errors='coerce').astype("float64")
|
|
983
|
-
sumstats = sumstats.loc[(sumstats["Z"]>z[0]) & (sumstats["Z"]<z[1]),:]
|
|
984
|
-
after_number=len(sumstats)
|
|
985
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad Z.")
|
|
986
|
-
|
|
987
|
-
pre_number=len(sumstats)
|
|
988
|
-
if "F" in coltocheck and "F" in sumstats.columns:
|
|
989
|
-
cols_to_check.append("F")
|
|
990
|
-
if verbose: log.write(" -Checking if ",f[0],"<F<",f[1]," ...")
|
|
991
|
-
sumstats.loc[:,"F"] = pd.to_numeric(sumstats.loc[:,"F"], errors='coerce').astype("float64")
|
|
992
|
-
sumstats = sumstats.loc[(sumstats["F"]>f[0]) & (sumstats["F"]<f[1]),:]
|
|
993
|
-
after_number=len(sumstats)
|
|
994
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad F.")
|
|
995
|
-
|
|
996
|
-
###P ################################################################################################################################################
|
|
997
|
-
pre_number=len(sumstats)
|
|
998
|
-
if "P" in coltocheck and "P" in sumstats.columns:
|
|
999
|
-
cols_to_check.append("P")
|
|
1000
|
-
if verbose: log.write(" -Checking if ",p[0],"< P <",p[1]," ...")
|
|
1001
|
-
sumstats.loc[:,"P"] = pd.to_numeric(sumstats.loc[:,"P"], errors='coerce').astype("float64")
|
|
1002
|
-
sumstats = sumstats.loc[(sumstats["P"]>p[0]) & (sumstats["P"]<p[1]),:]
|
|
1003
|
-
|
|
1004
|
-
is_low_p = sumstats["P"] == 0
|
|
1005
|
-
if sum(is_low_p) >0:
|
|
1006
|
-
log.write(" -WARNING! Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(sum(is_low_p)), verbose=verbose)
|
|
1007
|
-
log.write(" -WARNING! Please consider using MLOG10P instead.", verbose=verbose)
|
|
1008
|
-
after_number=len(sumstats)
|
|
1009
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad P.")
|
|
1010
|
-
|
|
1011
|
-
pre_number=len(sumstats)
|
|
1012
|
-
if "MLOG10P" in coltocheck and "MLOG10P" in sumstats.columns:
|
|
1013
|
-
cols_to_check.append("MLOG10P")
|
|
1014
|
-
if verbose: log.write(" -Checking if ",mlog10p[0],"<MLOG10P<",mlog10p[1]," ...")
|
|
1015
|
-
sumstats.loc[:,"MLOG10P"] = pd.to_numeric(sumstats.loc[:,"MLOG10P"], errors='coerce').astype("float64")
|
|
1016
|
-
sumstats = sumstats.loc[(sumstats["MLOG10P"]>mlog10p[0]) & (sumstats["MLOG10P"]<mlog10p[1]),:]
|
|
1017
|
-
after_number=len(sumstats)
|
|
1018
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad MLOG10P.")
|
|
1019
|
-
|
|
1020
|
-
###EFFECT ################################################################################################################################################
|
|
1021
|
-
pre_number=len(sumstats)
|
|
1022
|
-
if "BETA" in coltocheck and "BETA" in sumstats.columns:
|
|
1023
|
-
cols_to_check.append("BETA")
|
|
1024
|
-
if verbose: log.write(" -Checking if ",beta[0],"<BETA<",beta[1]," ...")
|
|
1025
|
-
sumstats.loc[:,"BETA"] = pd.to_numeric(sumstats.loc[:,"BETA"], errors='coerce').astype("float64")
|
|
1026
|
-
sumstats = sumstats.loc[(sumstats["BETA"]>beta[0]) & (sumstats["BETA"]<beta[1]),:]
|
|
1027
|
-
after_number=len(sumstats)
|
|
1028
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad BETA.")
|
|
1029
|
-
|
|
1030
|
-
pre_number=len(sumstats)
|
|
1031
|
-
if "SE" in coltocheck and "SE" in sumstats.columns:
|
|
1032
|
-
cols_to_check.append("SE")
|
|
1033
|
-
if verbose: log.write(" -Checking if ",se[0],"<SE<",se[1]," ...")
|
|
1034
|
-
sumstats.loc[:,"SE"] = pd.to_numeric(sumstats.loc[:,"SE"], errors='coerce').astype("float64")
|
|
1035
|
-
sumstats = sumstats.loc[(sumstats["SE"]>se[0]) & (sumstats["SE"]<se[1]),:]
|
|
1036
|
-
after_number=len(sumstats)
|
|
1037
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad SE.")
|
|
1038
|
-
|
|
1039
|
-
pre_number=len(sumstats)
|
|
1040
|
-
if "OR" in coltocheck and "OR" in sumstats.columns:
|
|
1041
|
-
cols_to_check.append("OR")
|
|
1042
|
-
if verbose: log.write(" -Checking if ",OR[0],"<log(OR)<",OR[1]," ...")
|
|
1043
|
-
sumstats.loc[:,"OR"] = pd.to_numeric(sumstats.loc[:,"OR"], errors='coerce').astype("float64")
|
|
1044
|
-
sumstats = sumstats.loc[(np.log(sumstats["OR"])>OR[0]) & (np.log(sumstats["OR"])<OR[1]),:]
|
|
1045
|
-
after_number=len(sumstats)
|
|
1046
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad OR.")
|
|
1047
|
-
|
|
1048
|
-
pre_number=len(sumstats)
|
|
1049
|
-
if "OR_95L" in coltocheck and "OR_95L" in sumstats.columns:
|
|
1050
|
-
cols_to_check.append("OR_95L")
|
|
1051
|
-
if verbose: log.write(" -Checking if ",OR_95L[0],"<OR_95L<",OR_95L[1]," ...")
|
|
1052
|
-
sumstats.loc[:,"OR_95L"] = pd.to_numeric(sumstats.loc[:,"OR_95L"], errors='coerce').astype("float64")
|
|
1053
|
-
sumstats = sumstats.loc[(sumstats["OR_95L"]>OR_95L[0]) & (sumstats["OR_95L"]<OR_95L[1]),:]
|
|
1054
|
-
after_number=len(sumstats)
|
|
1055
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad OR_95L.")
|
|
1056
|
-
|
|
1057
|
-
pre_number=len(sumstats)
|
|
1058
|
-
if "OR_95U" in coltocheck and "OR_95U" in sumstats.columns:
|
|
1059
|
-
cols_to_check.append("OR_95U")
|
|
1060
|
-
if verbose: log.write(" -Checking if ",OR_95U[0],"<OR_95U<",OR_95U[1]," ...")
|
|
1061
|
-
sumstats.loc[:,"OR_95U"] = pd.to_numeric(sumstats.loc[:,"OR_95U"], errors='coerce').astype("float64")
|
|
1062
|
-
sumstats = sumstats.loc[(sumstats["OR_95U"]>OR_95U[0]) & (sumstats["OR_95U"]<OR_95U[1]),:]
|
|
1063
|
-
after_number=len(sumstats)
|
|
1064
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad OR_95U.")
|
|
1065
|
-
|
|
1066
|
-
pre_number=len(sumstats)
|
|
1067
|
-
if "HR" in coltocheck and "HR" in sumstats.columns:
|
|
1068
|
-
cols_to_check.append("HR")
|
|
1069
|
-
if verbose: log.write(" -Checking if ",HR[0],"<log(HR)<",HR[1]," ...")
|
|
1070
|
-
sumstats.loc[:,"HR"] = pd.to_numeric(sumstats.loc[:,"HR"], errors='coerce').astype("float64")
|
|
1071
|
-
sumstats = sumstats.loc[(np.log(sumstats["HR"])>HR[0]) & (np.log(sumstats["HR"])<HR[1]),:]
|
|
1072
|
-
after_number=len(sumstats)
|
|
1073
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad HR.")
|
|
1074
|
-
|
|
1075
|
-
pre_number=len(sumstats)
|
|
1076
|
-
if "HR_95L" in coltocheck and "HR_95L" in sumstats.columns:
|
|
1077
|
-
cols_to_check.append("HR_95L")
|
|
1078
|
-
if verbose: log.write(" -Checking if ",HR_95L[0],"<HR_95L<",HR_95L[1]," ...")
|
|
1079
|
-
sumstats.loc[:,"HR_95L"] = pd.to_numeric(sumstats.loc[:,"HR_95L"], errors='coerce').astype("float64")
|
|
1080
|
-
sumstats = sumstats.loc[(sumstats["HR_95L"]>HR_95L[0]) & (sumstats["HR_95L"]<HR_95L[1]),:]
|
|
1081
|
-
after_number=len(sumstats)
|
|
1082
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad HR_95L.")
|
|
1083
|
-
|
|
1084
|
-
pre_number=len(sumstats)
|
|
1085
|
-
if "HR_95U" in coltocheck and "HR_95U" in sumstats.columns:
|
|
1086
|
-
cols_to_check.append("HR_95U")
|
|
1087
|
-
if verbose: log.write(" -Checking if ",HR_95U[0],"<HR_95U<",HR_95U[1]," ...")
|
|
1088
|
-
sumstats.loc[:,"HR_95U"] = pd.to_numeric(sumstats.loc[:,"HR_95U"], errors='coerce').astype("float64")
|
|
1089
|
-
sumstats = sumstats.loc[(sumstats["HR_95U"]>HR_95U[0]) & (sumstats["HR_95U"]<HR_95U[1]),:]
|
|
1090
|
-
after_number=len(sumstats)
|
|
1091
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad HR_95U.")
|
|
1092
|
-
#INFO #################################################################################################################
|
|
1093
|
-
pre_number=len(sumstats)
|
|
1094
|
-
if "INFO" in coltocheck and "INFO" in sumstats.columns:
|
|
1095
|
-
cols_to_check.append("INFO")
|
|
1096
|
-
if verbose: log.write(" -Checking if ",info[0],"<INFO<",info[1]," ...")
|
|
1097
|
-
sumstats.loc[:,"INFO"] = pd.to_numeric(sumstats.loc[:,"INFO"], errors='coerce').astype("float32")
|
|
1098
|
-
sumstats = sumstats.loc[(sumstats["INFO"]>info[0]) & (sumstats["INFO"]<info[1]),:]
|
|
1099
|
-
after_number=len(sumstats)
|
|
1100
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad INFO.")
|
|
1101
|
-
###STATUS ################################################################################################################################################
|
|
1102
|
-
pre_number=len(sumstats)
|
|
1103
|
-
if "STATUS" in coltocheck and "STATUS" in sumstats.columns:
|
|
1104
|
-
cols_to_check.append("STATUS")
|
|
1105
|
-
if verbose: log.write(" -Checking STATUS and converting STATUS to categories....")
|
|
1106
|
-
categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
1107
|
-
sumstats.loc[:,"STATUS"] = pd.Categorical(sumstats["STATUS"],categories=categories)
|
|
1108
|
-
|
|
1109
|
-
#pre_number=len(sumstats)
|
|
1110
|
-
#sumstats = sumstats.dropna(subset=cols_to_check)
|
|
1111
|
-
after_number=len(sumstats)
|
|
1112
|
-
#if verbose:log.write(" -Removed {} variants with NAs in the checked columns...".format(pre_number - after_number))
|
|
1072
|
+
###float64 ################################################################################################################################################
|
|
1073
|
+
sumstats = check_range(sumstats, var_range=chisq, header="CHISQ", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1074
|
+
sumstats = check_range(sumstats, var_range=z, header="Z", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1075
|
+
sumstats = check_range(sumstats, var_range=t, header="T", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1076
|
+
sumstats = check_range(sumstats, var_range=f, header="F", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1077
|
+
sumstats = check_range(sumstats, var_range=p, header="P", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1078
|
+
sumstats = check_range(sumstats, var_range=mlog10p, header="MLOG10P", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1079
|
+
sumstats = check_range(sumstats, var_range=beta, header="BETA", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1080
|
+
sumstats = check_range(sumstats, var_range=se, header="SE", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1081
|
+
sumstats = check_range(sumstats, var_range=OR, header="OR", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1082
|
+
sumstats = check_range(sumstats, var_range=OR_95L, header="OR_95L", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1083
|
+
sumstats = check_range(sumstats, var_range=OR_95U, header="OR_95U", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1084
|
+
sumstats = check_range(sumstats, var_range=HR, header="HR", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1085
|
+
sumstats = check_range(sumstats, var_range=HR_95L, header="HR_95L", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1086
|
+
sumstats = check_range(sumstats, var_range=HR_95U, header="HR_95U", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
|
|
1087
|
+
###STATUS ###############################################################################################################################################
|
|
1088
|
+
sumstats = check_range(sumstats, var_range=None, header="STATUS", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="category")
|
|
1113
1089
|
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1090
|
+
after_number=len(sumstats)
|
|
1091
|
+
log.write(" -Removed "+str(oringinal_number - after_number)+" variants with bad statistics in total.",verbose=verbose)
|
|
1092
|
+
log.write(" -Data types for each column:",verbose=verbose)
|
|
1093
|
+
check_datatype(sumstats,verbose=verbose, log=log)
|
|
1094
|
+
finished(log,verbose,_end_line)
|
|
1119
1095
|
return sumstats
|
|
1120
1096
|
|
|
1121
1097
|
### check consistency #############################################################################################################################################
|
|
1122
1098
|
|
|
1123
|
-
def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verbose=True,log=Log()):
|
|
1124
|
-
|
|
1125
|
-
|
|
1099
|
+
def _check_data_consistency(sumstats, beta="BETA", se="SE", p="P",mlog10p="MLOG10P",rtol=1e-3, atol=1e-3, equal_nan=True, verbose=True,log=Log()):
|
|
1100
|
+
##start function with col checking##########################################################
|
|
1101
|
+
_start_line = "check data consistency across columns"
|
|
1102
|
+
_end_line = "checking data consistency across columns"
|
|
1103
|
+
_start_cols =[]
|
|
1104
|
+
_start_function = ".check_data_consistency()"
|
|
1105
|
+
_must_args ={}
|
|
1106
|
+
|
|
1107
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
1108
|
+
log=log,
|
|
1109
|
+
verbose=verbose,
|
|
1110
|
+
start_line=_start_line,
|
|
1111
|
+
end_line=_end_line,
|
|
1112
|
+
start_cols=_start_cols,
|
|
1113
|
+
start_function=_start_function,
|
|
1114
|
+
**_must_args)
|
|
1115
|
+
if is_enough_info == False: return sumstats
|
|
1116
|
+
############################################################################################
|
|
1117
|
+
|
|
1126
1118
|
log.write(" -Tolerance: {} (Relative) and {} (Absolute)".format(rtol, atol),verbose=verbose)
|
|
1119
|
+
check_status = 0
|
|
1127
1120
|
|
|
1128
|
-
|
|
1129
|
-
|
|
1121
|
+
if "SNPID" in sumstats.columns:
|
|
1122
|
+
id_to_use = "SNPID"
|
|
1123
|
+
elif "rsID" in sumstats.columns:
|
|
1130
1124
|
id_to_use = "rsID"
|
|
1131
1125
|
else:
|
|
1132
|
-
|
|
1126
|
+
log.write(" -SNPID/rsID not available...SKipping",verbose=verbose)
|
|
1127
|
+
log.write("Finished checking data consistency across columns.",verbose=verbose)
|
|
1128
|
+
return 0
|
|
1129
|
+
|
|
1133
1130
|
|
|
1134
1131
|
if "BETA" in sumstats.columns and "SE" in sumstats.columns:
|
|
1135
1132
|
if "MLOG10P" in sumstats.columns:
|
|
@@ -1138,10 +1135,11 @@ def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verb
|
|
|
1138
1135
|
is_close = np.isclose(betase_derived_mlog10p, sumstats["MLOG10P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
|
|
1139
1136
|
diff = betase_derived_mlog10p - sumstats["MLOG10P"]
|
|
1140
1137
|
if sum(~is_close)>0:
|
|
1141
|
-
log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose)
|
|
1142
|
-
log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose)
|
|
1138
|
+
log.write(" -Not consistent: {} variant(s)".format(sum(~is_close)),verbose=verbose)
|
|
1139
|
+
log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max()),verbose=verbose)
|
|
1143
1140
|
else:
|
|
1144
1141
|
log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
|
|
1142
|
+
check_status=1
|
|
1145
1143
|
|
|
1146
1144
|
if "P" in sumstats.columns:
|
|
1147
1145
|
log.write(" -Checking if BETA/SE-derived-P is consistent with P...",verbose=verbose)
|
|
@@ -1149,10 +1147,11 @@ def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verb
|
|
|
1149
1147
|
is_close = np.isclose(betase_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
|
|
1150
1148
|
diff = betase_derived_p - sumstats["P"]
|
|
1151
1149
|
if sum(~is_close)>0:
|
|
1152
|
-
log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose)
|
|
1153
|
-
log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose)
|
|
1150
|
+
log.write(" -Not consistent: {} variant(s)".format(sum(~is_close)),verbose=verbose)
|
|
1151
|
+
log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max()),verbose=verbose)
|
|
1154
1152
|
else:
|
|
1155
1153
|
log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
|
|
1154
|
+
check_status=1
|
|
1156
1155
|
|
|
1157
1156
|
if "MLOG10P" in sumstats.columns and "P" in sumstats.columns:
|
|
1158
1157
|
log.write(" -Checking if MLOG10P-derived-P is consistent with P...",verbose=verbose)
|
|
@@ -1160,25 +1159,30 @@ def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verb
|
|
|
1160
1159
|
is_close = np.isclose(mlog10p_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
|
|
1161
1160
|
diff = mlog10p_derived_p - sumstats["P"]
|
|
1162
1161
|
if sum(~is_close)>0:
|
|
1163
|
-
log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose)
|
|
1164
|
-
log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose)
|
|
1162
|
+
log.write(" -Not consistent: {} variant(s)".format(sum(~is_close)),verbose=verbose)
|
|
1163
|
+
log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max()),verbose=verbose)
|
|
1165
1164
|
else:
|
|
1166
1165
|
log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
|
|
1166
|
+
check_status=1
|
|
1167
1167
|
|
|
1168
1168
|
if "N" in sumstats.columns and "N_CONTROL" in sumstats.columns and "N_CASE" in sumstats.columns:
|
|
1169
1169
|
if verbose: log.write(" -Checking if N is consistent with N_CASE + N_CONTROL ...")
|
|
1170
|
-
is_close = sumstats
|
|
1171
|
-
#is_close = np.isclose(sumstats
|
|
1172
|
-
diff = abs(sumstats
|
|
1170
|
+
is_close = sumstats["N"] == sumstats["N_CASE"] + sumstats["N_CONTROL"]
|
|
1171
|
+
#is_close = np.isclose(sumstats["N"], sumstats["N_CASE"] + sumstats["N_CONTROL"] , rtol=rtol, atol=atol, equal_nan=equal_nan)
|
|
1172
|
+
diff = abs(sumstats["N"] - (sumstats["N_CASE"] + sumstats["N_CONTROL"] ))
|
|
1173
1173
|
if sum(~is_close)>0:
|
|
1174
|
-
log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose)
|
|
1175
|
-
log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose)
|
|
1174
|
+
log.write(" -Not consistent: {} variant(s)".format(sum(~is_close)),verbose=verbose)
|
|
1175
|
+
log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max()),verbose=verbose)
|
|
1176
1176
|
else:
|
|
1177
1177
|
log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
|
|
1178
|
+
check_status=1
|
|
1179
|
+
|
|
1180
|
+
if check_status==1:
|
|
1181
|
+
log.write(" -Note: if the max difference is greater than expected, please check your original sumstats.",verbose=verbose)
|
|
1182
|
+
else:
|
|
1183
|
+
log.write(" -No availalbe columns for data consistency checking...Skipping...",verbose=verbose)
|
|
1184
|
+
finished(log,verbose,_end_line)
|
|
1178
1185
|
|
|
1179
|
-
log.write(" -Note: if the max difference is greater than expected, please check your original sumstats.",verbose=verbose)
|
|
1180
|
-
|
|
1181
|
-
if verbose: log.write("Finished checking data consistency across columns.")
|
|
1182
1186
|
###############################################################################################################
|
|
1183
1187
|
# 20220426
|
|
1184
1188
|
def get_reverse_complementary_allele(a):
|
|
@@ -1201,11 +1205,81 @@ def flip_direction(string):
|
|
|
1201
1205
|
else: #sometime it is 0
|
|
1202
1206
|
flipped_string+=char
|
|
1203
1207
|
return flipped_string
|
|
1204
|
-
|
|
1208
|
+
|
|
1209
|
+
def flip_by_swap(sumstats, matched_index, log, verbose):
    """Exchange the NEA and EA values of the selected rows.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Summary statistics table; modified in place and also returned.
    matched_index :
        Row selector handed to ``.loc`` (callers in this file pass a boolean
        mask built from the STATUS column).
    log :
        Logger object exposing ``write``; only used when ``verbose`` is truthy.
    verbose : bool
        Whether to emit the log message.

    Returns
    -------
    pandas.DataFrame
        The same ``sumstats`` object. It is returned untouched when either
        the ``NEA`` or the ``EA`` column is missing.
    """
    # Guard clause: both allele columns are required for a swap.
    if "NEA" not in sumstats.columns or "EA" not in sumstats.columns:
        return sumstats

    if verbose:
        log.write(" -Swapping column: NEA <=> EA...")

    # Take the values in (EA, NEA) order as a plain array so the assignment
    # below is positional and not re-aligned on column labels.
    swapped_alleles = sumstats.loc[matched_index, ['EA', 'NEA']].values
    sumstats.loc[matched_index, ['NEA', 'EA']] = swapped_alleles
    return sumstats
|
|
1214
|
+
|
|
1215
|
+
def flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Flip ratio-scale statistics (OR / HR and their 95% bounds) by inversion.

    For the selected rows, ``OR`` and ``HR`` become ``factor / value``. The
    confidence bounds are inverted AND exchanged: the new upper bound is
    ``factor / old lower bound`` and the new lower bound is
    ``factor / old upper bound``.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Summary statistics table; modified in place and also returned.
    matched_index :
        Row selector handed to ``.loc`` (callers in this file pass a boolean
        mask built from the STATUS column).
    log :
        Logger object exposing ``write``; only used when ``verbose`` is truthy.
    verbose : bool
        Whether to emit log messages.
    cols :
        Accepted for interface compatibility; not used by this function.
    factor : numeric, default 1
        Numerator used for the inversion.

    Returns
    -------
    pandas.DataFrame
        The same ``sumstats`` object.

    Notes
    -----
    When only one of the two bounds of a pair is present, its inverse is
    written to the counterpart column (creating it) and the existing bound is
    left as it is.
    """
    for ratio in ("OR", "HR"):
        lower = ratio + "_95L"
        upper = ratio + "_95U"

        # Snapshot both bounds BEFORE writing either one. Computing the second
        # bound from the already-overwritten first bound would round-trip it
        # back to its old value instead of swapping the interval.
        old_bounds = {
            col: sumstats.loc[matched_index, col].values.copy()
            for col in (lower, upper)
            if col in sumstats.columns
        }

        if ratio in sumstats.columns:
            if verbose:
                log.write(" -Flipping column: {0} = 1 / {0}...".format(ratio))
            sumstats.loc[matched_index, ratio] = factor / sumstats.loc[matched_index, ratio].values

        if lower in old_bounds:
            if verbose:
                log.write(" -Flipping column: {} = 1 / {}...".format(upper, lower))
            sumstats.loc[matched_index, upper] = factor / old_bounds[lower]

        if upper in old_bounds:
            if verbose:
                log.write(" -Flipping column: {} = 1 / {}...".format(lower, upper))
            sumstats.loc[matched_index, lower] = factor / old_bounds[upper]

    return sumstats
|
|
1235
|
+
|
|
1236
|
+
def flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Flip the effect-allele frequency of the selected rows: ``factor - EAF``.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Summary statistics table; modified in place and also returned.
    matched_index :
        Row selector handed to ``.loc`` (callers in this file pass a boolean
        mask built from the STATUS column).
    log :
        Logger object exposing ``write``; only used when ``verbose`` is truthy.
    verbose : bool
        Whether to emit the log message.
    cols :
        Accepted for interface compatibility; not used by this function.
    factor : numeric, default 1
        Value the frequency is subtracted from.

    Returns
    -------
    pandas.DataFrame
        The same ``sumstats`` object, unchanged when there is no ``EAF`` column.
    """
    # Nothing to flip without an EAF column.
    if "EAF" not in sumstats.columns:
        return sumstats

    if verbose:
        log.write(" -Flipping column: EAF = 1 - EAF...")

    current_eaf = sumstats.loc[matched_index, "EAF"].values
    sumstats.loc[matched_index, "EAF"] = factor - current_eaf
    return sumstats
|
|
1241
|
+
|
|
1242
|
+
def flip_by_sign(sumstats, matched_index, log, verbose, cols=None):
    """Flip signed statistics of the selected rows by negation.

    ``BETA``, ``Z`` and ``T`` are negated in their own columns. The BETA
    confidence bounds are negated AND exchanged: the new upper bound is
    ``-old lower bound`` and the new lower bound is ``-old upper bound``.
    ``DIRECTION`` strings are passed through ``flip_direction``.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Summary statistics table; modified in place and also returned.
    matched_index :
        Row selector handed to ``.loc`` (callers in this file pass a boolean
        mask built from the STATUS column).
    log :
        Logger object exposing ``write``; only used when ``verbose`` is truthy.
    verbose : bool
        Whether to emit log messages.
    cols :
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        The same ``sumstats`` object.

    Notes
    -----
    When only one of ``BETA_95L`` / ``BETA_95U`` is present, its negation is
    written to the counterpart column (creating it) and the existing bound is
    left as it is.
    """
    # Snapshot both bounds BEFORE writing either one. Deriving the second
    # bound from the already-overwritten first bound would negate it twice
    # and leave it at its old value instead of swapping the interval.
    old_bounds = {
        col: sumstats.loc[matched_index, col].values.copy()
        for col in ("BETA_95L", "BETA_95U")
        if col in sumstats.columns
    }

    if "BETA" in sumstats.columns:
        if verbose:
            log.write(" -Flipping column: BETA = - BETA...")
        sumstats.loc[matched_index, "BETA"] = -sumstats.loc[matched_index, "BETA"].values

    if "BETA_95L" in old_bounds:
        if verbose:
            log.write(" -Flipping column: BETA_95U = - BETA_95L...")
        sumstats.loc[matched_index, "BETA_95U"] = -old_bounds["BETA_95L"]

    if "BETA_95U" in old_bounds:
        if verbose:
            log.write(" -Flipping column: BETA_95L = - BETA_95U...")
        sumstats.loc[matched_index, "BETA_95L"] = -old_bounds["BETA_95U"]

    # Each test statistic is negated in its OWN column; the T branch must not
    # write into Z (that would overwrite Z and leave T unflipped).
    for stat in ("Z", "T"):
        if stat in sumstats.columns:
            if verbose:
                log.write(" -Flipping column: {0} = - {0}...".format(stat))
            sumstats.loc[matched_index, stat] = -sumstats.loc[matched_index, stat].values

    if "DIRECTION" in sumstats.columns:
        if verbose:
            log.write(" -Flipping column: DIRECTION +-?0 <=> -+?0 ...")
        sumstats.loc[matched_index, "DIRECTION"] = sumstats.loc[matched_index, "DIRECTION"].apply(flip_direction)

    return sumstats
|
|
1262
|
+
|
|
1205
1263
|
def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1264
|
+
##start function with col checking##########################################################
|
|
1265
|
+
_start_line = "adjust statistics based on STATUS code"
|
|
1266
|
+
_end_line = "adjusting statistics based on STATUS code"
|
|
1267
|
+
_start_cols =[]
|
|
1268
|
+
_start_function = ".check_data_consistency()"
|
|
1269
|
+
_must_args ={}
|
|
1270
|
+
|
|
1271
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
1272
|
+
log=log,
|
|
1273
|
+
verbose=verbose,
|
|
1274
|
+
start_line=_start_line,
|
|
1275
|
+
end_line=_end_line,
|
|
1276
|
+
start_cols=_start_cols,
|
|
1277
|
+
start_function=_start_function,
|
|
1278
|
+
**_must_args)
|
|
1279
|
+
if is_enough_info == False: return sumstats
|
|
1280
|
+
############################################################################################
|
|
1281
|
+
|
|
1282
|
+
if_stats_flipped = False
|
|
1209
1283
|
###################get reverse complementary####################
|
|
1210
1284
|
pattern = r"\w\w\w\w\w[45]\w"
|
|
1211
1285
|
#matched_index = status_match(sumstats[status],6,[4,5]) #
|
|
@@ -1217,107 +1291,49 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
|
1217
1291
|
if verbose: log.write(" -Converting to reverse complement : EA and NEA...")
|
|
1218
1292
|
reverse_complement_nea = sumstats.loc[matched_index,'NEA'].apply(lambda x :get_reverse_complementary_allele(x))
|
|
1219
1293
|
reverse_complement_ea = sumstats.loc[matched_index,'EA'].apply(lambda x :get_reverse_complementary_allele(x))
|
|
1220
|
-
categories = set(sumstats
|
|
1221
|
-
sumstats
|
|
1222
|
-
sumstats
|
|
1294
|
+
categories = set(sumstats['EA'])|set(sumstats['NEA']) |set(reverse_complement_ea) |set(reverse_complement_nea)
|
|
1295
|
+
sumstats['EA']=pd.Categorical(sumstats['EA'],categories = categories)
|
|
1296
|
+
sumstats['NEA']=pd.Categorical(sumstats['NEA'],categories = categories )
|
|
1223
1297
|
sumstats.loc[matched_index,['NEA']] = reverse_complement_nea
|
|
1224
1298
|
sumstats.loc[matched_index,['EA']] = reverse_complement_ea
|
|
1225
1299
|
sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 6, "4","2")
|
|
1226
1300
|
if verbose: log.write(" -Changed the status for flipped variants : xxxxx4x -> xxxxx2x")
|
|
1227
|
-
|
|
1301
|
+
if_stats_flipped = True
|
|
1228
1302
|
###################flip ref####################
|
|
1229
1303
|
pattern = r"\w\w\w\w\w[35]\w"
|
|
1230
1304
|
#matched_index = status_match(sumstats[status],6,[3,5]) #sumstats[status].str.match(pattern)
|
|
1231
1305
|
matched_index = sumstats[status].str[5].str.match(r"3|5")
|
|
1232
1306
|
if sum(matched_index)>0:
|
|
1233
|
-
if verbose: log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x:
|
|
1307
|
+
if verbose: log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x: ALT->EA , REF->NEA ...{}".format(_get_version()))
|
|
1234
1308
|
if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
if "BETA_95L" in sumstats.columns:
|
|
1242
|
-
if verbose: log.write(" -Flipping column: BETA_95L = - BETA_95L...")
|
|
1243
|
-
sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
|
|
1244
|
-
if "BETA_95U" in sumstats.columns:
|
|
1245
|
-
if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
|
|
1246
|
-
sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
|
|
1247
|
-
if "EAF" in sumstats.columns:
|
|
1248
|
-
if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
|
|
1249
|
-
sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
|
|
1250
|
-
if "OR" in sumstats.columns:
|
|
1251
|
-
if verbose: log.write(" -Flipping column: OR = 1 / OR...")
|
|
1252
|
-
sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
|
|
1253
|
-
if "OR_95L" in sumstats.columns:
|
|
1254
|
-
if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
|
|
1255
|
-
sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
|
|
1256
|
-
if "OR_95U" in sumstats.columns:
|
|
1257
|
-
if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
|
|
1258
|
-
sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
|
|
1259
|
-
if "HR" in sumstats.columns:
|
|
1260
|
-
if verbose: log.write(" -Flipping column: HR = 1 / HR...")
|
|
1261
|
-
sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
|
|
1262
|
-
if "HR_95L" in sumstats.columns:
|
|
1263
|
-
if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
|
|
1264
|
-
sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
|
|
1265
|
-
if "HR_95U" in sumstats.columns:
|
|
1266
|
-
if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
|
|
1267
|
-
sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
|
|
1268
|
-
if "DIRECTION" in sumstats.columns:
|
|
1269
|
-
if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
|
|
1270
|
-
sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
|
|
1309
|
+
|
|
1310
|
+
flip_by_swap(sumstats, matched_index, log, verbose)
|
|
1311
|
+
flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
|
|
1312
|
+
flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1313
|
+
flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1314
|
+
|
|
1271
1315
|
#change status
|
|
1272
1316
|
if verbose: log.write(" -Changed the status for flipped variants : xxxxx[35]x -> xxxxx[12]x")
|
|
1273
1317
|
sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 6, "35","12")
|
|
1318
|
+
if_stats_flipped = True
|
|
1274
1319
|
|
|
1275
1320
|
###################flip ref for undistingushable indels####################
|
|
1276
1321
|
pattern = r"\w\w\w\w[123][67]6"
|
|
1277
1322
|
#matched_index = status_match(sumstats[status],6,[1,2,3])|status_match(sumstats[status],6,[6,7])|status_match(sumstats[status],7,6) #sumstats[status].str.match(pattern)
|
|
1278
1323
|
matched_index = sumstats[status].str[4:].str.match(r"[123][67]6")
|
|
1279
1324
|
if sum(matched_index)>0:
|
|
1280
|
-
if verbose: log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]:
|
|
1325
|
+
if verbose: log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]: ALT->EA , REF->NEA...{}".format(_get_version()))
|
|
1281
1326
|
if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
if "BETA_95L" in sumstats.columns:
|
|
1289
|
-
if verbose: log.write(" -Flipping column: BETA_95L = - BETA_95L...")
|
|
1290
|
-
sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
|
|
1291
|
-
if "BETA_95U" in sumstats.columns:
|
|
1292
|
-
if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
|
|
1293
|
-
sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
|
|
1294
|
-
if "EAF" in sumstats.columns:
|
|
1295
|
-
if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
|
|
1296
|
-
sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
|
|
1297
|
-
if "OR" in sumstats.columns:
|
|
1298
|
-
if verbose: log.write(" -Flipping column: OR = 1 / OR...")
|
|
1299
|
-
sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
|
|
1300
|
-
if "OR_95L" in sumstats.columns:
|
|
1301
|
-
if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
|
|
1302
|
-
sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
|
|
1303
|
-
if "OR_95U" in sumstats.columns:
|
|
1304
|
-
if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
|
|
1305
|
-
sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
|
|
1306
|
-
if "HR" in sumstats.columns:
|
|
1307
|
-
if verbose: log.write(" -Flipping column: HR = 1 / HR...")
|
|
1308
|
-
sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
|
|
1309
|
-
if "HR_95L" in sumstats.columns:
|
|
1310
|
-
if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
|
|
1311
|
-
sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
|
|
1312
|
-
if "HR_95U" in sumstats.columns:
|
|
1313
|
-
if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
|
|
1314
|
-
sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
|
|
1315
|
-
if "DIRECTION" in sumstats.columns:
|
|
1316
|
-
if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
|
|
1317
|
-
sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
|
|
1327
|
+
|
|
1328
|
+
flip_by_swap(sumstats, matched_index, log, verbose)
|
|
1329
|
+
flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
|
|
1330
|
+
flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1331
|
+
flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1332
|
+
|
|
1318
1333
|
#change status
|
|
1319
1334
|
if verbose: log.write(" -Changed the status for flipped variants xxxx[123][67]6 -> xxxx[123][67]4")
|
|
1320
1335
|
sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 7, "6","4")
|
|
1336
|
+
if_stats_flipped = True
|
|
1321
1337
|
# flip ref
|
|
1322
1338
|
###################flip statistics for reverse strand panlindromic variants####################
|
|
1323
1339
|
pattern = r"\w\w\w\w\w[012]5"
|
|
@@ -1326,43 +1342,20 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
|
1326
1342
|
if sum(matched_index)>0:
|
|
1327
1343
|
if verbose: log.write("Start to flip allele-specific stats for palindromic SNPs with status xxxxx[12]5: (-)strand <=> (+)strand...{}".format(_get_version()))
|
|
1328
1344
|
if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
|
|
1335
|
-
if "BETA_95U" in sumstats.columns:
|
|
1336
|
-
if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
|
|
1337
|
-
sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
|
|
1338
|
-
if "EAF" in sumstats.columns:
|
|
1339
|
-
if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
|
|
1340
|
-
sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
|
|
1341
|
-
if "OR" in sumstats.columns:
|
|
1342
|
-
if verbose: log.write(" -Flipping column: OR = 1 / OR...")
|
|
1343
|
-
sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
|
|
1344
|
-
if "OR_95L" in sumstats.columns:
|
|
1345
|
-
if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
|
|
1346
|
-
sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
|
|
1347
|
-
if "OR_95U" in sumstats.columns:
|
|
1348
|
-
if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
|
|
1349
|
-
sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
|
|
1350
|
-
if "HR" in sumstats.columns:
|
|
1351
|
-
if verbose: log.write(" -Flipping column: HR = 1 / HR...")
|
|
1352
|
-
sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
|
|
1353
|
-
if "HR_95L" in sumstats.columns:
|
|
1354
|
-
if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
|
|
1355
|
-
sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
|
|
1356
|
-
if "HR_95U" in sumstats.columns:
|
|
1357
|
-
if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
|
|
1358
|
-
sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
|
|
1359
|
-
if "DIRECTION" in sumstats.columns:
|
|
1360
|
-
if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
|
|
1361
|
-
sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
|
|
1345
|
+
|
|
1346
|
+
flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
|
|
1347
|
+
flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1348
|
+
flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
|
|
1349
|
+
|
|
1362
1350
|
#change status
|
|
1363
1351
|
if verbose: log.write(" -Changed the status for flipped variants: xxxxx[012]5: -> xxxxx[012]2")
|
|
1364
1352
|
sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 7, "5","2")
|
|
1365
|
-
|
|
1353
|
+
if_stats_flipped = True
|
|
1354
|
+
|
|
1355
|
+
if if_stats_flipped == True:
|
|
1356
|
+
finished(log, verbose, "adjusting")
|
|
1357
|
+
else:
|
|
1358
|
+
finished(log, verbose, "adjusting with no statistics changed.")
|
|
1366
1359
|
return sumstats
|
|
1367
1360
|
""
|
|
1368
1361
|
|
|
@@ -1371,8 +1364,8 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
|
1371
1364
|
# 20220426
|
|
1372
1365
|
def liftover_snv(row,chrom,converter,to_build):
|
|
1373
1366
|
status_pre=""
|
|
1374
|
-
status_end=row[1][2]+"9"+row[1][4]+"99"
|
|
1375
|
-
pos_0_based = int(row[0]) - 1
|
|
1367
|
+
status_end=row.iloc[1][2]+"9"+row.iloc[1][4]+"99"
|
|
1368
|
+
pos_0_based = int(row.iloc[0]) - 1
|
|
1376
1369
|
results = converter[chrom][pos_0_based]
|
|
1377
1370
|
if converter[chrom][pos_0_based]:
|
|
1378
1371
|
# return chrom, pos_1_based
|
|
@@ -1402,13 +1395,25 @@ def liftover_variant(sumstats,
|
|
|
1402
1395
|
return sumstats
|
|
1403
1396
|
|
|
1404
1397
|
def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True, verbose=True,log=Log()):
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1398
|
+
##start function with col checking##########################################################
|
|
1399
|
+
_start_line = "perform liftover"
|
|
1400
|
+
_end_line = "liftover"
|
|
1401
|
+
_start_cols =[chrom,pos,status]
|
|
1402
|
+
_start_function = ".liftover()"
|
|
1403
|
+
_must_args ={}
|
|
1404
|
+
|
|
1405
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
1406
|
+
log=log,
|
|
1407
|
+
verbose=verbose,
|
|
1408
|
+
start_line=_start_line,
|
|
1409
|
+
end_line=_end_line,
|
|
1410
|
+
start_cols=_start_cols,
|
|
1411
|
+
start_function=_start_function,
|
|
1412
|
+
n_cores=n_cores,
|
|
1413
|
+
**_must_args)
|
|
1414
|
+
if is_enough_info == False: return sumstats
|
|
1415
|
+
############################################################################################
|
|
1416
|
+
|
|
1412
1417
|
if verbose: log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build)
|
|
1413
1418
|
# valid chr and pos
|
|
1414
1419
|
pattern = r"\w\w\w0\w\w\w"
|
|
@@ -1420,11 +1425,12 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
|
|
|
1420
1425
|
if sum(to_lift)<10000:
|
|
1421
1426
|
n_cores=1
|
|
1422
1427
|
|
|
1423
|
-
df_split = np.array_split(sumstats
|
|
1428
|
+
#df_split = np.array_split(sumstats[[chrom,pos,status]], n_cores)
|
|
1429
|
+
df_split = _df_split(sumstats[[chrom,pos,status]], n_cores)
|
|
1424
1430
|
pool = Pool(n_cores)
|
|
1425
1431
|
#df = pd.concat(pool.starmap(func, df_split))
|
|
1426
1432
|
func=liftover_variant
|
|
1427
|
-
sumstats
|
|
1433
|
+
sumstats[[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status),df_split))
|
|
1428
1434
|
pool.close()
|
|
1429
1435
|
pool.join()
|
|
1430
1436
|
############################################################################
|
|
@@ -1439,18 +1445,29 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
|
|
|
1439
1445
|
sumstats = fixchr(sumstats,chrom=chrom,add_prefix="",remove=remove, verbose=True)
|
|
1440
1446
|
sumstats = fixpos(sumstats,pos=pos,remove=remove, verbose=True)
|
|
1441
1447
|
|
|
1442
|
-
|
|
1448
|
+
finished(log,verbose,_end_line)
|
|
1443
1449
|
return sumstats
|
|
1444
1450
|
|
|
1445
1451
|
###############################################################################################################
|
|
1446
1452
|
# 20220426
|
|
1447
1453
|
def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=Log()):
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
+
##start function with col checking##########################################################
|
|
1455
|
+
_start_line = "sort the genome coordinates"
|
|
1456
|
+
_end_line = "sorting coordinates"
|
|
1457
|
+
_start_cols =[chrom,pos]
|
|
1458
|
+
_start_function = ".sort_coordinate()"
|
|
1459
|
+
_must_args ={}
|
|
1460
|
+
|
|
1461
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
1462
|
+
log=log,
|
|
1463
|
+
verbose=verbose,
|
|
1464
|
+
start_line=_start_line,
|
|
1465
|
+
end_line=_end_line,
|
|
1466
|
+
start_cols=_start_cols,
|
|
1467
|
+
start_function=_start_function,
|
|
1468
|
+
**_must_args)
|
|
1469
|
+
if is_enough_info == False: return sumstats
|
|
1470
|
+
############################################################################################
|
|
1454
1471
|
|
|
1455
1472
|
try:
|
|
1456
1473
|
if sumstats[pos].dtype == "Int64":
|
|
@@ -1460,50 +1477,144 @@ def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=
|
|
|
1460
1477
|
sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
|
|
1461
1478
|
except:
|
|
1462
1479
|
pass
|
|
1463
|
-
|
|
1464
|
-
if verbose: log.write(" -Sorting genome coordinates...")
|
|
1465
1480
|
sumstats = sumstats.sort_values(by=[chrom,pos],ascending=True,ignore_index=True)
|
|
1466
|
-
|
|
1467
|
-
|
|
1481
|
+
|
|
1482
|
+
finished(log,verbose,_end_line)
|
|
1468
1483
|
return sumstats
|
|
1469
1484
|
###############################################################################################################
|
|
1470
1485
|
# 20230430 added HR HR_95 BETA_95 N_CASE N_CONTROL
|
|
1471
|
-
def sortcolumn(sumstats,verbose=True,log=Log(),order =
|
|
1486
|
+
def sortcolumn(sumstats,verbose=True,log=Log(),order = None):
    """Reorder the columns of ``sumstats`` so that well-known columns come first.

    Columns listed in ``order`` that are present in ``sumstats`` are placed
    first, in the sequence given by ``order``; all remaining columns follow in
    the relative order they already have in ``sumstats``.

    Parameters
    ----------
    sumstats : object exposing ``.columns`` and list-of-names indexing
        The summary statistics table (presumably a pandas DataFrame).
    verbose : bool
        Forwarded to the logging helpers; also gates the column-order message.
    log : Log
        Logger used for the start / progress / finish messages.
    order : list of str, optional
        Preferred column order. When None, the built-in list below is used.

    Returns
    -------
    The table restricted to (and ordered by) the computed column list, or the
    input unchanged when ``start_to`` reports that the step must be skipped.
    """
    step_name = "reordering the columns"

    # Shared preamble: logs the start message and validates required columns
    # (none are required for this step).
    ready = start_to(sumstats=sumstats,
                     log=log,
                     verbose=verbose,
                     start_line="reorder the columns",
                     end_line=step_name,
                     start_cols=[],
                     start_function=".sort_column()")
    if ready == False:
        return sumstats

    if order is None:
        order = [
            "SNPID","rsID", "CHR", "POS", "EA", "NEA", "EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z","T","F",
            "CHISQ", "P", "MLOG10P", "OR", "OR_95L", "OR_95U","HR", "HR_95L", "HR_95U","INFO", "N","N_CASE","N_CONTROL","DIRECTION","I2","P_HET","DOF","SNPR2","STATUS"]

    # Known columns first (in the preferred order), then everything else.
    leading = [name for name in order if name in sumstats.columns]
    trailing = [name for name in sumstats.columns if name not in order]
    output_columns = leading + trailing

    if verbose:
        log.write(" -Reordering columns to :", ",".join(output_columns))
    sumstats = sumstats[output_columns]

    finished(log, verbose, step_name)
    return sumstats
|
|
1487
1519
|
|
|
1488
|
-
|
|
1520
|
+
|
|
1521
|
+
###############################################################################################################
|
|
1522
|
+
def start_to(sumstats,
             log,
             verbose,
             start_line,
             end_line,
             start_cols,
             start_function,
             ref_vcf=None,
             ref_fasta=None,
             n_cores=None,
             ref_tsv=None,
             **args
             ):
    """Common preamble for a processing step: log, validate, decide whether to run.

    Writes the "Start to ..." line, reports the table shape, checks that the
    columns in ``start_cols`` are present, logs the optional resources that were
    supplied, and verifies that every extra keyword argument is not None.

    Parameters
    ----------
    sumstats : object exposing ``.columns``
        The table the step will operate on.
    log : Log
        Logger receiving all messages.
    verbose : bool
        Forwarded to every ``log.write`` call made here and to the helpers.
    start_line : str
        Text inserted into the "Start to {}..." message.
    end_line : str
        Text used for the "Skipped {}." message when the step cannot run.
    start_cols : list
        Required columns, in the format accepted by ``check_col``.
    start_function : str
        Name of the calling method, used in warning messages.
    ref_vcf, ref_fasta, n_cores, ref_tsv : optional
        Only logged here when not None; they are not validated.
    **args
        "Must-have" arguments of the calling step; each value is checked with
        ``check_arg`` and a None value makes the step be skipped.

    Returns
    -------
    bool
        True when all required columns and arguments are available.
    """
    log.write("Start to {}...{}".format(start_line,_get_version()), verbose=verbose)

    check_dataframe_shape(sumstats=sumstats,
                          log=log,
                          verbose=verbose)

    is_enough_col = check_col(sumstats.columns,
                              verbose=verbose,
                              log=log,
                              cols=start_cols,
                              function=start_function)

    if is_enough_col:
        # Pass ``verbose`` through so these lines obey the caller's setting,
        # like the "Start to" message above.
        if n_cores is not None:
            log.write(" -Number of threads/cores to use: {}".format(n_cores), verbose=verbose)
        if ref_vcf is not None:
            log.write(" -Reference VCF: {}".format(ref_vcf), verbose=verbose)
        if ref_fasta is not None:
            log.write(" -Reference FASTA: {}".format(ref_fasta), verbose=verbose)
        if ref_tsv is not None:
            log.write(" -Reference TSV: {}".format(ref_tsv), verbose=verbose)

    # Evaluate every argument (no short-circuit) so that each missing one gets
    # its own warning, not just the first.
    arg_checks = [check_arg(log, verbose, key, value, start_function)
                  for key, value in args.items()]
    is_enough_col = bool(is_enough_col) and all(arg_checks)

    if not is_enough_col:
        # NOTE(review): check_col also writes a "Skipped" line when columns are
        # missing, so two skip messages may appear in that case.
        skipped(log, verbose, end_line)

    return is_enough_col
|
|
1567
|
+
|
|
1568
|
+
def finished(log, verbose, end_line):
    """Log that a step has completed, then trigger garbage collection.

    ``end_line`` is the step description inserted into "Finished {}.";
    ``verbose`` is forwarded to ``log.write``.
    """
    message = "Finished {}.".format(end_line)
    log.write(message, verbose=verbose)
    # Collect after each step, as the original does.
    gc.collect()
|
|
1571
|
+
|
|
1572
|
+
def skipped(log, verbose, end_line):
    """Log that a step was skipped, then trigger garbage collection.

    ``end_line`` is the step description inserted into "Skipped {}.";
    ``verbose`` is forwarded to ``log.write``.
    """
    message = "Skipped {}.".format(end_line)
    log.write(message, verbose=verbose)
    gc.collect()
|
|
1575
|
+
|
|
1576
|
+
def check_arg(log, verbose, key, value, function):
    """Return True when a required argument was supplied (is not None).

    When ``value`` is None a warning naming the argument (``key``) and the
    calling ``function`` is written to ``log`` and False is returned.
    ``verbose`` is accepted for signature symmetry but is not used here.
    """
    if value is not None:
        return True
    log.warning("Necessary argument {} for {} is not provided!".format(key, function))
    return False
|
|
1581
|
+
|
|
1582
|
+
def check_col(df_col_names, verbose=True, log=Log(), cols=None, function=None):
    """Check that every required column name is present.

    Parameters
    ----------
    df_col_names : container of str
        Column names that are available (anything supporting ``in``).
    verbose : bool
        Forwarded to ``skipped`` when columns are missing.
    log : Log
        Logger that receives the warning about missing columns.
    cols : iterable, optional
        Required columns. Each entry is either a single column name (str) or
        an iterable of names that must all be present. ``None`` means there
        are no requirements.
    function : str, optional
        Name of the calling method, used in the warning text.

    Returns
    -------
    bool
        True when nothing is missing; otherwise a warning listing the missing
        names is logged, the skip message is written, and False is returned.
    """
    # ``cols`` defaults to None; treat that as "nothing required" instead of
    # failing when trying to iterate over it.
    if cols is None:
        return True

    not_in_df = []
    for entry in cols:
        # single check: a plain string is one required column;
        # paired check: any other entry is a group whose members are all required.
        group = [entry] if isinstance(entry, str) else entry
        not_in_df.extend(name for name in group if name not in df_col_names)

    if not not_in_df:
        return True

    if function is None:
        to_show_title = " "
    else:
        to_show_title = " for {} ".format(function)
    log.warning("Necessary columns{}were not detected:{}".format(to_show_title, ",".join(not_in_df)))
    skipped(log, verbose, end_line=function)
    return False
|
|
1509
1609
|
|
|
1610
|
+
###############################################################################################################
|
|
1611
|
+
def _df_split(dataframe, n):
|
|
1612
|
+
chunks = []
|
|
1613
|
+
chunk_size = int(dataframe.shape[0] // n)+1
|
|
1614
|
+
|
|
1615
|
+
for index in range(0, dataframe.shape[0], chunk_size):
|
|
1616
|
+
chunks.append(
|
|
1617
|
+
dataframe.iloc[index:index + chunk_size]
|
|
1618
|
+
)
|
|
1619
|
+
|
|
1620
|
+
return chunks
|