gwaslab 3.4.36__py3-none-any.whl → 3.4.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/__init__.py +1 -1
- gwaslab/g_Sumstats.py +54 -31
- gwaslab/g_meta.py +13 -3
- gwaslab/g_version.py +2 -2
- gwaslab/hm_harmonize_sumstats.py +43 -18
- gwaslab/io_preformat_input.py +3 -0
- gwaslab/qc_check_datatype.py +14 -0
- gwaslab/qc_fix_sumstats.py +217 -91
- gwaslab/util_ex_process_h5.py +26 -17
- gwaslab/util_in_fill_data.py +42 -3
- gwaslab/viz_aux_quickfix.py +2 -2
- gwaslab/viz_plot_compare_effect.py +22 -5
- gwaslab/viz_plot_mqqplot.py +127 -48
- gwaslab/viz_plot_regionalplot.py +13 -8
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.37.dist-info}/METADATA +2 -2
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.37.dist-info}/RECORD +19 -19
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.37.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.37.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.37.dist-info}/top_level.txt +0 -0
gwaslab/qc_fix_sumstats.py
CHANGED
|
@@ -14,7 +14,12 @@ from gwaslab.bd_common_data import get_chr_to_number
|
|
|
14
14
|
from gwaslab.bd_common_data import get_number_to_chr
|
|
15
15
|
from gwaslab.bd_common_data import get_chr_list
|
|
16
16
|
from gwaslab.qc_check_datatype import check_datatype
|
|
17
|
+
from gwaslab.qc_check_datatype import check_dataframe_shape
|
|
17
18
|
from gwaslab.g_version import _get_version
|
|
19
|
+
from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
|
|
20
|
+
from gwaslab.util_in_fill_data import _convert_betase_to_p
|
|
21
|
+
from gwaslab.util_in_fill_data import _convert_mlog10p_to_p
|
|
22
|
+
#process build
|
|
18
23
|
#setbuild
|
|
19
24
|
#fixID
|
|
20
25
|
#rsidtochrpos
|
|
@@ -26,6 +31,7 @@ from gwaslab.g_version import _get_version
|
|
|
26
31
|
#normalizevariant
|
|
27
32
|
#checkref
|
|
28
33
|
#sanitycheckstats
|
|
34
|
+
#_check_data_consistency
|
|
29
35
|
#flipallelestats
|
|
30
36
|
#parallelizeassignrsid
|
|
31
37
|
#sortcoordinate
|
|
@@ -41,7 +47,7 @@ def _process_build(build,log,verbose):
|
|
|
41
47
|
log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
|
|
42
48
|
final_build = "38"
|
|
43
49
|
else:
|
|
44
|
-
log.write(" -Version of genomic coordinates
|
|
50
|
+
log.write(" -WARNING! Version of genomic coordinates is unknown...", verbose=verbose)
|
|
45
51
|
final_build = "99"
|
|
46
52
|
return final_build
|
|
47
53
|
|
|
@@ -49,10 +55,10 @@ def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
|
|
|
49
55
|
build = _process_build(build,log=log,verbose=verbose)
|
|
50
56
|
sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status], 1, "139",build[0]*3)
|
|
51
57
|
sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status], 2, "89",build[1]*3)
|
|
52
|
-
return sumstats
|
|
58
|
+
return sumstats, build
|
|
53
59
|
|
|
54
60
|
def fixID(sumstats,
|
|
55
|
-
snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",
|
|
61
|
+
snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",fixprefix=False,
|
|
56
62
|
fixchrpos=False,fixid=False,fixeanea=False,fixeanea_flip=False,fixsep=False,
|
|
57
63
|
overwrite=False,verbose=True,forcefixid=False,log=Log()):
|
|
58
64
|
'''
|
|
@@ -61,37 +67,64 @@ def fixID(sumstats,
|
|
|
61
67
|
3. checking rsid and chr:pos:nea:ea
|
|
62
68
|
'''
|
|
63
69
|
if verbose: log.write("Start to check IDs...{}".format(_get_version()))
|
|
64
|
-
|
|
65
|
-
|
|
70
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
66
71
|
check_col(sumstats,[snpid,rsid],status)
|
|
67
|
-
|
|
72
|
+
|
|
73
|
+
############################ checking datatype ###################################################
|
|
74
|
+
if rsid in sumstats.columns:
|
|
75
|
+
# convert to string datatype
|
|
76
|
+
try:
|
|
77
|
+
log.write(" -Checking rsID data type...",verbose=verbose)
|
|
78
|
+
if sumstats.loc[:,rsid].dtype == "string":
|
|
79
|
+
pass
|
|
80
|
+
else:
|
|
81
|
+
log.write(" -Converting rsID to pd.string data type...",verbose=verbose)
|
|
82
|
+
sumstats.loc[:,rsid] = sumstats.loc[:,rsid].astype("string")
|
|
83
|
+
except:
|
|
84
|
+
log.write(" -Force converting rsID to pd.string data type...",verbose=verbose)
|
|
85
|
+
sumstats.loc[:,rsid] = sumstats.loc[:,rsid].astype("string")
|
|
86
|
+
if snpid in sumstats.columns:
|
|
87
|
+
# convert to string datatype
|
|
88
|
+
try:
|
|
89
|
+
log.write(" -Checking SNPID data type...",verbose=verbose)
|
|
90
|
+
if sumstats.loc[:,snpid].dtype == "string":
|
|
91
|
+
pass
|
|
92
|
+
else:
|
|
93
|
+
log.write(" -Converting SNPID to pd.string data type...",verbose=verbose)
|
|
94
|
+
sumstats.loc[:,snpid] = sumstats.loc[:,snpid].astype("string")
|
|
95
|
+
except:
|
|
96
|
+
log.write(" -Force converting SNPID to pd.string data type...",verbose=verbose)
|
|
97
|
+
sumstats.loc[:,snpid] = sumstats.loc[:,snpid].astype("string")
|
|
98
|
+
|
|
68
99
|
############################ checking ###################################################
|
|
69
100
|
if snpid in sumstats.columns:
|
|
70
|
-
|
|
71
|
-
#
|
|
101
|
+
log.write(" -Checking if SNPID is CHR:POS:NEA:EA...(separator: - ,: , _)",verbose=verbose)
|
|
102
|
+
# check if SNPID is CHR:POS:EA:NEA
|
|
72
103
|
is_chrposrefalt = sumstats[snpid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
|
|
104
|
+
# check if SNPID is NA
|
|
73
105
|
is_snpid_na = sumstats[snpid].isna()
|
|
106
|
+
|
|
107
|
+
# change STATUS code
|
|
74
108
|
sumstats.loc[ is_chrposrefalt,status] = vchange_status(sumstats.loc[ is_chrposrefalt,status],3 ,"975" ,"630")
|
|
75
109
|
sumstats.loc[(~is_chrposrefalt)&(~is_snpid_na),status] = vchange_status(sumstats.loc[(~is_chrposrefalt)&(~is_snpid_na),status],3 ,"975" ,"842")
|
|
76
110
|
|
|
77
111
|
if rsid in sumstats.columns:
|
|
78
|
-
|
|
79
|
-
is_rsid = sumstats[rsid].str.
|
|
112
|
+
log.write(" -Checking if rsID is rsxxxxxx...", verbose=verbose)
|
|
113
|
+
is_rsid = sumstats[rsid].str.match(r'^rs\d+$', case=False, flags=0, na=False)
|
|
80
114
|
|
|
81
115
|
sumstats.loc[ is_rsid,status] = vchange_status(sumstats.loc[ is_rsid,status], 3, "986","520")
|
|
82
116
|
sumstats.loc[~is_rsid,status] = vchange_status(sumstats.loc[~is_rsid,status], 3, "986","743")
|
|
83
117
|
|
|
84
|
-
if verbose: log.write(" -Checking if
|
|
85
|
-
is_rs_chrpos = sumstats[rsid].str.match(r'^\w+[:_-]\
|
|
86
|
-
#is_rs_chrpos = sumstats[rsid].str.match(r'(chr)?([0-9XYMT]+)[:_-]([0-9]+)[:_-]([ATCG]+)[:_-]([ATCG]+)', case=False, flags=0, na=False)
|
|
118
|
+
if verbose: log.write(" -Checking if CHR:POS:NEA:EA is mixed in rsID column ...")
|
|
119
|
+
is_rs_chrpos = sumstats[rsid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
|
|
87
120
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
121
|
+
log.write(" -Number of CHR:POS:NEA:EA mixed in rsID column :",sum(is_rs_chrpos), verbose=verbose)
|
|
122
|
+
log.write(" -Number of Unrecognized rsID :",len(sumstats) - sum(is_rs_chrpos) - sum(is_rsid) , verbose=verbose)
|
|
123
|
+
log.write(" -A look at the unrecognized rsID :",set(sumstats.loc[(~is_rsid)&(~is_rs_chrpos),rsid].head()),"...", verbose=verbose)
|
|
91
124
|
|
|
92
125
|
############################ fixing chr pos###################################################
|
|
93
|
-
if fixchrpos
|
|
94
|
-
# from snpid or rsid, extract
|
|
126
|
+
if fixchrpos == True:
|
|
127
|
+
# from snpid or rsid, extract CHR:POS to fix CHR and POS
|
|
95
128
|
if snpid in sumstats.columns:
|
|
96
129
|
if verbose: log.write(" -Fixing CHR and POS...")
|
|
97
130
|
if overwrite is True:
|
|
@@ -99,8 +132,8 @@ def fixID(sumstats,
|
|
|
99
132
|
# fix all
|
|
100
133
|
to_fix = is_chrposrefalt
|
|
101
134
|
|
|
102
|
-
#fix variants with chr and pos being empty
|
|
103
135
|
elif (chrom in sumstats.columns) and (pos in sumstats.columns) :
|
|
136
|
+
#fix variants with chr and pos being NA
|
|
104
137
|
to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
|
|
105
138
|
to_fix_num = sum(to_fix)
|
|
106
139
|
if to_fix_num and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
@@ -121,6 +154,7 @@ def fixID(sumstats,
|
|
|
121
154
|
to_fix_num = sum(to_fix)
|
|
122
155
|
if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
|
|
123
156
|
elif verbose: log.write(" -No fixable variants. ...")
|
|
157
|
+
|
|
124
158
|
else:
|
|
125
159
|
if verbose: log.write(" -Initiating CHR and POS columns...")
|
|
126
160
|
sumstats.loc[:,chrom]=pd.Series(dtype="string")
|
|
@@ -134,8 +168,8 @@ def fixID(sumstats,
|
|
|
134
168
|
if verbose: log.write(" -Filling CHR and POS columns using valid SNPID's chr:pos...")
|
|
135
169
|
# format and qc filled chr and pos
|
|
136
170
|
|
|
137
|
-
sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.
|
|
138
|
-
sumstats.loc[to_fix,pos] = sumstats.loc[to_fix,snpid].str.
|
|
171
|
+
sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
172
|
+
sumstats.loc[to_fix,pos] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[2]
|
|
139
173
|
|
|
140
174
|
#sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.split(':|_|-').str[0].str.strip("chrCHR").astype("string")
|
|
141
175
|
#sumstats.loc[to_fix,pos] =np.floor(pd.to_numeric(sumstats.loc[to_fix,snpid].str.split(':|_|-').str[1], errors='coerce')).astype('Int64')
|
|
@@ -179,55 +213,62 @@ def fixID(sumstats,
|
|
|
179
213
|
#sumstats.loc[to_fix,status] = vchange_status(sumstats.loc[to_fix,status], 4, "98765432","00000000").astype("string")
|
|
180
214
|
|
|
181
215
|
############################ fixing chr pos###################################################
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
216
|
+
if fixeanea == True:
|
|
217
|
+
if verbose: log.write(" -WARNING! gwaslab assumes SNPID is in the format of CHR:POS:NEA:EA / CHR:POS:REF:ALT")
|
|
218
|
+
if overwrite is True:
|
|
219
|
+
if verbose: log.write(" -Overwrite mode is applied...")
|
|
220
|
+
to_fix = is_chrposrefalt
|
|
221
|
+
elif (nea in sumstats.columns) and (nea in sumstats.columns):
|
|
222
|
+
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
223
|
+
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
224
|
+
elif (nea in sumstats.columns) and (ea not in sumstats.columns):
|
|
225
|
+
if verbose: log.write(" -Initiating EA columns...")
|
|
226
|
+
sumstats.loc[:,ea]=pd.Series(dtype="string")
|
|
227
|
+
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
228
|
+
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
229
|
+
elif (nea not in sumstats.columns) and (ea in sumstats.columns):
|
|
230
|
+
if verbose: log.write(" -Initiating NEA columns...")
|
|
231
|
+
sumstats.loc[:,nea]=pd.Series(dtype="string")
|
|
232
|
+
to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
|
|
233
|
+
if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
234
|
+
else:
|
|
235
|
+
if verbose: log.write(" -Initiating EA and NEA columns...")
|
|
236
|
+
sumstats[nea]=pd.Series(dtype="string")
|
|
237
|
+
sumstats[ea]=pd.Series(dtype="string")
|
|
238
|
+
to_fix = is_chrposrefalt
|
|
239
|
+
if sum(to_fix)>0:
|
|
240
|
+
if verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
|
|
207
241
|
#
|
|
208
|
-
|
|
209
|
-
|
|
242
|
+
if sum(to_fix)>0:
|
|
243
|
+
if verbose: log.write(" -Filling "+str(sum(to_fix))+" EA and NEA columns using SNPID's CHR:POS:NEA:EA...")
|
|
210
244
|
#
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
245
|
+
if fixeanea_flip == True:
|
|
246
|
+
if verbose: log.write(" -Flipped : CHR:POS:NEA:EA -> CHR:POS:EA:NEA ")
|
|
247
|
+
sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
|
|
248
|
+
sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
|
|
249
|
+
else:
|
|
250
|
+
if verbose: log.write(" -Chr:pos:a1:a2...a1->EA , a2->NEA ")
|
|
251
|
+
sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
|
|
252
|
+
sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
|
|
253
|
+
|
|
219
254
|
# #to_change_status = sumstats[status].str.match(r"\w\w\w[45]\w\w\w")
|
|
220
255
|
# #sumstats.loc[to_fix&to_change_status,status] = vchange_status(sumstats.loc[to_fix&to_change_status,status],4,"2")
|
|
221
256
|
# #sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[1]).astype("string")
|
|
222
257
|
# #sumstats.loc[to_fix,rsid].apply(lambda x:re.split(':|_|-',x)[1]).astype("Int64")
|
|
223
258
|
|
|
224
259
|
############################ fixing id ###################################################
|
|
225
|
-
if fixsep
|
|
260
|
+
if fixsep == True:
|
|
226
261
|
if snpid in sumstats.columns:
|
|
227
262
|
if verbose: log.write(' -Replacing [_-] in SNPID with ":" ...')
|
|
228
263
|
sumstats.loc[:,snpid] = sumstats.loc[:,snpid].str.replace(r"[_-]",":",regex=True)
|
|
264
|
+
|
|
265
|
+
if fixprefix == True:
|
|
266
|
+
if snpid in sumstats.columns:
|
|
267
|
+
if verbose: log.write(' -Removing /^chr/ in SNPID ...')
|
|
268
|
+
prefix_removed = sumstats.loc[:,snpid].str.extract(r'^(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
|
|
269
|
+
sumstats.loc[~prefix_removed.isna(),snpid] = prefix_removed[~prefix_removed.isna()]
|
|
229
270
|
|
|
230
|
-
if fixid
|
|
271
|
+
if fixid == True:
|
|
231
272
|
if snpid not in sumstats.columns:
|
|
232
273
|
# initiate a SNPID column
|
|
233
274
|
sumstats.loc[:,snpid]=pd.Series(dtype="string")
|
|
@@ -304,19 +345,21 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
304
345
|
remove multiallelic SNPs based on 4. CHR, POS
|
|
305
346
|
'''
|
|
306
347
|
|
|
348
|
+
if verbose: log.write("Start to remove duplicated/multiallelic variants...{}".format(_get_version()))
|
|
349
|
+
if verbose: log.write(" -Removing mode:{}".format(mode))
|
|
307
350
|
# sort the variants using the specified column before removing
|
|
308
351
|
if keep_col is not None :
|
|
309
352
|
if keep_col in sumstats.columns:
|
|
310
|
-
if verbose: log.write("Start to sort the sumstats using "
|
|
353
|
+
if verbose: log.write("Start to sort the sumstats using {}...".format(keep_col))
|
|
311
354
|
sumstats = sumstats.sort_values(by=keep_col,ascending=keep_ascend)
|
|
312
355
|
else:
|
|
313
356
|
if verbose: log.write("Column" + keep_col +" was not detected... skipping... ")
|
|
314
357
|
total_number = len(sumstats)
|
|
315
358
|
|
|
316
359
|
# remove by duplicated SNPID
|
|
317
|
-
if (snpid in sumstats.columns) and "d" in mode:
|
|
360
|
+
if (snpid in sumstats.columns) and ("d" in mode or "s" in mode):
|
|
318
361
|
if verbose: log.write("Start to remove duplicated variants based on snpid...{}".format(_get_version()))
|
|
319
|
-
|
|
362
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
320
363
|
if verbose: log.write(" -Which variant to keep: ", keep )
|
|
321
364
|
pre_number =len(sumstats)
|
|
322
365
|
if snpid in sumstats.columns:
|
|
@@ -326,18 +369,19 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
326
369
|
if verbose: log.write(" -Removed ",pre_number -after_number ," based on SNPID...")
|
|
327
370
|
|
|
328
371
|
# remove by duplicated rsID
|
|
329
|
-
if (rsid in sumstats.columns) and ("d" in mode):
|
|
372
|
+
if (rsid in sumstats.columns) and ("d" in mode or "r" in mode):
|
|
330
373
|
# keep na and remove duplicated
|
|
331
374
|
pre_number =len(sumstats)
|
|
332
375
|
if verbose: log.write("Start to remove duplicated variants based on rsID...")
|
|
376
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
333
377
|
sumstats = sumstats.loc[sumstats[rsid].isna() | (~sumstats.duplicated(subset=rsid, keep=keep)),:]
|
|
334
378
|
after_number=len(sumstats)
|
|
335
379
|
if verbose: log.write(" -Removed ",pre_number -after_number ," based on rsID...")
|
|
336
380
|
|
|
337
381
|
# remove by duplicated variants by CHR:POS:NEA:EA
|
|
338
|
-
if (chrom in sumstats.columns) and (pos in sumstats.columns) and (nea in sumstats.columns) and (ea in sumstats.columns) and "d" in mode:
|
|
382
|
+
if (chrom in sumstats.columns) and (pos in sumstats.columns) and (nea in sumstats.columns) and (ea in sumstats.columns) and ("d" in mode or "c" in mode):
|
|
339
383
|
if verbose: log.write("Start to remove duplicated variants based on CHR,POS,EA and NEA...")
|
|
340
|
-
|
|
384
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
341
385
|
if verbose: log.write(" -Which variant to keep: ", keep )
|
|
342
386
|
pre_number =len(sumstats)
|
|
343
387
|
if snpid in sumstats.columns:
|
|
@@ -351,6 +395,7 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
351
395
|
# keep na and remove duplicated
|
|
352
396
|
pre_number =len(sumstats)
|
|
353
397
|
if verbose: log.write("Start to remove multiallelic variants based on chr:pos...")
|
|
398
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
354
399
|
if verbose: log.write(" -Which variant to keep: ", keep )
|
|
355
400
|
sumstats = sumstats.loc[(~sumstats.loc[:,[chrom,pos]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos], keep=keep)),:]
|
|
356
401
|
after_number=len(sumstats)
|
|
@@ -360,17 +405,37 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
360
405
|
# resort the coordinates
|
|
361
406
|
if verbose: log.write(" -Removed ",total_number -after_number," variants in total.")
|
|
362
407
|
if keep_col is not None :
|
|
363
|
-
if verbose: log.write(" -Sort the coordinates...")
|
|
408
|
+
if verbose: log.write(" -Sort the coordinates based on CHR and POS...")
|
|
364
409
|
sumstats = sortcoordinate(sumstats,verbose=False)
|
|
365
410
|
|
|
366
|
-
if
|
|
411
|
+
if "n" in mode or remove==True:
|
|
367
412
|
# if remove==True, remove NAs
|
|
368
413
|
if verbose: log.write(" -Removing NAs...")
|
|
369
414
|
pre_number =len(sumstats)
|
|
370
|
-
|
|
415
|
+
specified_columns = []
|
|
416
|
+
if "d" in mode:
|
|
417
|
+
specified_columns.append(rsid)
|
|
418
|
+
specified_columns.append(snpid)
|
|
419
|
+
specified_columns.append(chrom)
|
|
420
|
+
specified_columns.append(pos)
|
|
421
|
+
specified_columns.append(ea)
|
|
422
|
+
specified_columns.append(nea)
|
|
423
|
+
if "r" in mode:
|
|
424
|
+
specified_columns.append(rsid)
|
|
425
|
+
if "s" in mode:
|
|
426
|
+
specified_columns.append(snpid)
|
|
427
|
+
if "m" in mode:
|
|
428
|
+
specified_columns.append(chrom)
|
|
429
|
+
specified_columns.append(pos)
|
|
430
|
+
if "c" in mode:
|
|
431
|
+
specified_columns.append(chrom)
|
|
432
|
+
specified_columns.append(pos)
|
|
433
|
+
specified_columns.append(ea)
|
|
434
|
+
specified_columns.append(nea)
|
|
435
|
+
sumstats = sumstats.loc[~sumstats[specified_columns].isna().any(axis=1),:]
|
|
371
436
|
after_number=len(sumstats)
|
|
372
|
-
if verbose: log.write(" -Removed ",pre_number -after_number," variants with NA values.")
|
|
373
|
-
if verbose: log.write("Finished removing successfully!")
|
|
437
|
+
if verbose: log.write(" -Removed ",pre_number -after_number," variants with NA values in {} .".format(set(specified_columns)))
|
|
438
|
+
if verbose: log.write("Finished removing duplicated/multiallelic variants successfully!")
|
|
374
439
|
return sumstats
|
|
375
440
|
|
|
376
441
|
###############################################################################################################
|
|
@@ -383,7 +448,7 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
|
|
|
383
448
|
if verbose: log.write(".fix_chr: Specified not detected..skipping...")
|
|
384
449
|
return sumstats
|
|
385
450
|
if verbose: log.write("Start to fix chromosome notation...{}".format(_get_version()))
|
|
386
|
-
|
|
451
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
387
452
|
|
|
388
453
|
# convert to string datatype
|
|
389
454
|
try:
|
|
@@ -512,7 +577,7 @@ def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True, lower_
|
|
|
512
577
|
if verbose: log.write(".fix_pos: Specified not detected..skipping...")
|
|
513
578
|
return sumstats
|
|
514
579
|
if verbose: log.write("Start to fix basepair positions...{}".format(_get_version()))
|
|
515
|
-
|
|
580
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
516
581
|
|
|
517
582
|
all_var_num = len(sumstats)
|
|
518
583
|
#convert to numeric
|
|
@@ -565,7 +630,7 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
|
|
|
565
630
|
if verbose: log.write("EA and NEA not detected..skipping...")
|
|
566
631
|
return sumstats
|
|
567
632
|
if verbose: log.write("Start to fix alleles...{}".format(_get_version()))
|
|
568
|
-
|
|
633
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
569
634
|
|
|
570
635
|
#if (ea not in sumstats.columns) or (nea not in sumstats.columns):
|
|
571
636
|
if verbose: log.write(" -Converted all bases to string datatype and UPPERCASE.")
|
|
@@ -659,11 +724,11 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
|
|
|
659
724
|
|
|
660
725
|
def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
|
|
661
726
|
if check_col(sumstats,pos,ea,nea,status) is not True:
|
|
662
|
-
if verbose: log.write("WARNING
|
|
727
|
+
if verbose: log.write("WARNING! .normalize(): specified columns not detected..skipping...")
|
|
663
728
|
return sumstats
|
|
664
729
|
|
|
665
730
|
if verbose: log.write("Start to normalize variants...{}".format(_get_version()))
|
|
666
|
-
|
|
731
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
667
732
|
#variants_to_check = status_match(sumstats[status],5,[4,5]) #
|
|
668
733
|
#r'\w\w\w\w[45]\w\w'
|
|
669
734
|
variants_to_check = sumstats[status].str[4].str.match(r'4|5', case=False, flags=0, na=False)
|
|
@@ -844,7 +909,7 @@ def sanitycheckstats(sumstats,
|
|
|
844
909
|
if coltocheck is None:
|
|
845
910
|
coltocheck = ["P","MLOG10P","INFO","Z","BETA","SE","EAF","CHISQ","F","N","N_CASE","N_CONTROL","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","STATUS"]
|
|
846
911
|
if verbose: log.write("Start sanity check for statistics...{}".format(_get_version()))
|
|
847
|
-
|
|
912
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
848
913
|
cols_to_check=[]
|
|
849
914
|
oringinal_number=len(sumstats)
|
|
850
915
|
sumstats = sumstats.copy()
|
|
@@ -875,13 +940,7 @@ def sanitycheckstats(sumstats,
|
|
|
875
940
|
sumstats = sumstats.loc[(sumstats["N_CONTROL"]>=ncontrol[0]) & (sumstats["N_CONTROL"]<=ncontrol[1]),:]
|
|
876
941
|
after_number=len(sumstats)
|
|
877
942
|
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N_CONTROL.")
|
|
878
|
-
|
|
879
|
-
if "N" in coltocheck and "N" in sumstats.columns and "N_CONTROL" in coltocheck and "N_CONTROL" in sumstats.columns and "N_CASE" in coltocheck and "N_CASE" in sumstats.columns:
|
|
880
|
-
if verbose: log.write(" -Checking if N = N_CASE + N_CONTROL ...")
|
|
881
|
-
matched_n = sumstats.loc[:,"N"] == sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"]
|
|
882
|
-
sumstats = sumstats.loc[matched_n,:]
|
|
883
|
-
after_number=len(sumstats)
|
|
884
|
-
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with N != N_CASE + N_CONTROL.")
|
|
943
|
+
|
|
885
944
|
|
|
886
945
|
###ALLELE FREQUENCY################################################################################################################################################
|
|
887
946
|
pre_number=len(sumstats)
|
|
@@ -941,6 +1000,11 @@ def sanitycheckstats(sumstats,
|
|
|
941
1000
|
if verbose: log.write(" -Checking if ",p[0],"< P <",p[1]," ...")
|
|
942
1001
|
sumstats.loc[:,"P"] = pd.to_numeric(sumstats.loc[:,"P"], errors='coerce').astype("float64")
|
|
943
1002
|
sumstats = sumstats.loc[(sumstats["P"]>p[0]) & (sumstats["P"]<p[1]),:]
|
|
1003
|
+
|
|
1004
|
+
is_low_p = sumstats["P"] == 0
|
|
1005
|
+
if sum(is_low_p) >0:
|
|
1006
|
+
log.write(" -WARNING! Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(sum(is_low_p)), verbose=verbose)
|
|
1007
|
+
log.write(" -WARNING! Please consider using MLOG10P instead.", verbose=verbose)
|
|
944
1008
|
after_number=len(sumstats)
|
|
945
1009
|
if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad P.")
|
|
946
1010
|
|
|
@@ -1041,10 +1105,10 @@ def sanitycheckstats(sumstats,
|
|
|
1041
1105
|
if verbose: log.write(" -Checking STATUS and converting STATUS to categories....")
|
|
1042
1106
|
categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
1043
1107
|
sumstats.loc[:,"STATUS"] = pd.Categorical(sumstats["STATUS"],categories=categories)
|
|
1044
|
-
|
|
1108
|
+
|
|
1045
1109
|
#pre_number=len(sumstats)
|
|
1046
1110
|
#sumstats = sumstats.dropna(subset=cols_to_check)
|
|
1047
|
-
|
|
1111
|
+
after_number=len(sumstats)
|
|
1048
1112
|
#if verbose:log.write(" -Removed {} variants with NAs in the checked columns...".format(pre_number - after_number))
|
|
1049
1113
|
|
|
1050
1114
|
if verbose: log.write(" -Removed "+str(oringinal_number - after_number)+" variants with bad statistics in total.")
|
|
@@ -1054,6 +1118,67 @@ def sanitycheckstats(sumstats,
|
|
|
1054
1118
|
if verbose: log.write("Finished sanity check successfully!")
|
|
1055
1119
|
return sumstats
|
|
1056
1120
|
|
|
1121
|
+
### check consistency #############################################################################################################################################
|
|
1122
|
+
|
|
1123
|
+
def _report_inconsistency(sumstats, is_close, diff, id_to_use, log, verbose):
    """Log the outcome of one cross-column consistency check.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        The summary statistics being checked (used only to look up the
        identifier of the most deviating variant).
    is_close : array-like of bool
        True where the two compared quantities agree within tolerance.
    diff : pandas.Series
        Element-wise difference between the two compared quantities,
        sharing the index of ``sumstats``.
    id_to_use : str or None
        Name of the identifier column to report; ``None`` means no
        identifier column is available and the row index label is
        reported instead.
    log : Log
        gwaslab logger; every message is routed through ``log.write``.
    verbose : bool
        Passed to ``log.write`` so that the messages can be silenced.
    """
    # Vectorised count; the builtin sum() would iterate element by element.
    n_inconsistent = (~is_close).sum()
    if n_inconsistent > 0:
        log.write(" -Not consistent: {} variant(s)".format(n_inconsistent), verbose=verbose)
        # Rank by magnitude: a large negative deviation is as inconsistent as
        # a large positive one.
        abs_diff = abs(diff)
        # idxmax is meaningless when every difference is NaN (e.g. a value is
        # missing on one side only), so skip the detail line in that case.
        if abs_diff.notna().any():
            worst = abs_diff.idxmax()
            if id_to_use is None:
                id_label, variant = "index", worst
            else:
                id_label, variant = id_to_use, sumstats.loc[worst, id_to_use]
            log.write(" -Variant {} with max difference: {} with {}".format(id_label, variant, abs_diff.max()), verbose=verbose)
    else:
        log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)


def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verbose=True,log=Log()):
    """Check that redundant statistics columns agree with each other.

    The following checks are run when the required columns are present:

    * BETA/SE-derived MLOG10P  vs  MLOG10P
    * BETA/SE-derived P        vs  P
    * MLOG10P-derived P        vs  P
    * N                        vs  N_CASE + N_CONTROL (exact equality)

    Nothing is modified or removed; the results are only written to ``log``.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Summary statistics.
    rtol, atol : float
        Relative and absolute tolerance forwarded to ``numpy.isclose``
        (two values a, b are close when ``|a - b| <= atol + rtol * |b|``).
    equal_nan : bool
        Forwarded to ``numpy.isclose``; when True, NaN on both sides counts
        as consistent.
    verbose : bool
        If False, no message is written.
    log : Log
        gwaslab logger.

    Returns
    -------
    None
    """
    log.write("Start to check data consistency across columns...{}".format(_get_version()), verbose=verbose)
    check_dataframe_shape(sumstats, log, verbose)
    log.write(" -Tolerance: {} (Relative) and {} (Absolute)".format(rtol, atol),verbose=verbose)

    # Prefer SNPID, fall back to rsID, and finally to the row index so that a
    # missing identifier column cannot raise a KeyError while reporting.
    if "SNPID" in sumstats.columns:
        id_to_use = "SNPID"
    elif "rsID" in sumstats.columns:
        id_to_use = "rsID"
    else:
        id_to_use = None

    if "BETA" in sumstats.columns and "SE" in sumstats.columns:
        if "MLOG10P" in sumstats.columns:
            log.write(" -Checking if BETA/SE-derived-MLOG10P is consistent with MLOG10P...",verbose=verbose)
            betase_derived_mlog10p = _convert_betase_to_mlog10p(sumstats["BETA"], sumstats["SE"])
            is_close = np.isclose(betase_derived_mlog10p, sumstats["MLOG10P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
            diff = betase_derived_mlog10p - sumstats["MLOG10P"]
            _report_inconsistency(sumstats, is_close, diff, id_to_use, log, verbose)

        if "P" in sumstats.columns:
            log.write(" -Checking if BETA/SE-derived-P is consistent with P...",verbose=verbose)
            betase_derived_p = _convert_betase_to_p(sumstats["BETA"], sumstats["SE"])
            is_close = np.isclose(betase_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
            diff = betase_derived_p - sumstats["P"]
            _report_inconsistency(sumstats, is_close, diff, id_to_use, log, verbose)

    if "MLOG10P" in sumstats.columns and "P" in sumstats.columns:
        log.write(" -Checking if MLOG10P-derived-P is consistent with P...",verbose=verbose)
        mlog10p_derived_p = _convert_mlog10p_to_p(sumstats["MLOG10P"])
        is_close = np.isclose(mlog10p_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
        diff = mlog10p_derived_p - sumstats["P"]
        _report_inconsistency(sumstats, is_close, diff, id_to_use, log, verbose)

    if "N" in sumstats.columns and "N_CONTROL" in sumstats.columns and "N_CASE" in sumstats.columns:
        log.write(" -Checking if N is consistent with N_CASE + N_CONTROL ...", verbose=verbose)
        # Sample sizes are counts, so exact equality is required here rather
        # than the floating-point tolerance used for the checks above.
        n_sum = sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"]
        is_close = sumstats.loc[:,"N"] == n_sum
        diff = sumstats.loc[:,"N"] - n_sum
        _report_inconsistency(sumstats, is_close, diff, id_to_use, log, verbose)

    log.write(" -Note: if the max difference is greater than expected, please check your original sumstats.",verbose=verbose)

    log.write("Finished checking data consistency across columns.", verbose=verbose)
|
|
1057
1182
|
###############################################################################################################
|
|
1058
1183
|
# 20220426
|
|
1059
1184
|
def get_reverse_complementary_allele(a):
|
|
@@ -1079,7 +1204,7 @@ def flip_direction(string):
|
|
|
1079
1204
|
|
|
1080
1205
|
def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
|
|
1081
1206
|
|
|
1082
|
-
|
|
1207
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
1083
1208
|
|
|
1084
1209
|
###################get reverse complementary####################
|
|
1085
1210
|
pattern = r"\w\w\w\w\w[45]\w"
|
|
@@ -1278,10 +1403,10 @@ def liftover_variant(sumstats,
|
|
|
1278
1403
|
|
|
1279
1404
|
def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True, verbose=True,log=Log()):
|
|
1280
1405
|
if check_col(sumstats,chrom,pos,status) is not True:
|
|
1281
|
-
if verbose: log.write("WARNING
|
|
1406
|
+
if verbose: log.write("WARNING! .liftover(): specified columns not detected..skipping...")
|
|
1282
1407
|
return sumstats
|
|
1283
1408
|
if verbose: log.write("Start to perform liftover...{}".format(_get_version()))
|
|
1284
|
-
|
|
1409
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
1285
1410
|
if verbose: log.write(" -CPU Cores to use :",n_cores)
|
|
1286
1411
|
if verbose: log.write(" -Performing liftover ...")
|
|
1287
1412
|
if verbose: log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build)
|
|
@@ -1325,7 +1450,7 @@ def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=
|
|
|
1325
1450
|
return sumstats
|
|
1326
1451
|
|
|
1327
1452
|
if verbose: log.write("Start to sort the genome coordinates...{}".format(_get_version()))
|
|
1328
|
-
|
|
1453
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
1329
1454
|
|
|
1330
1455
|
try:
|
|
1331
1456
|
if sumstats[pos].dtype == "Int64":
|
|
@@ -1344,11 +1469,11 @@ def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=
|
|
|
1344
1469
|
###############################################################################################################
|
|
1345
1470
|
# 20230430 added HR HR_95 BETA_95 N_CASE N_CONTROL
|
|
1346
1471
|
def sortcolumn(sumstats,verbose=True,log=Log(),order = [
|
|
1347
|
-
"SNPID","rsID", "CHR", "POS", "EA", "NEA", "EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z",
|
|
1472
|
+
"SNPID","rsID", "CHR", "POS", "EA", "NEA", "EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z","T","F",
|
|
1348
1473
|
"CHISQ", "P", "MLOG10P", "OR", "OR_95L", "OR_95U","HR", "HR_95L", "HR_95U","INFO", "N","N_CASE","N_CONTROL","DIRECTION","I2","P_HET","DOF","SNPR2","STATUS"
|
|
1349
1474
|
]):
|
|
1350
1475
|
if verbose: log.write("Start to reorder the columns...{}".format(_get_version()))
|
|
1351
|
-
|
|
1476
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
1352
1477
|
|
|
1353
1478
|
output_columns = []
|
|
1354
1479
|
for i in order:
|
|
@@ -1380,4 +1505,5 @@ def check_col(df,*args):
|
|
|
1380
1505
|
if len(not_in_df)>0:
|
|
1381
1506
|
return False
|
|
1382
1507
|
print(" -Specified columns names was not detected. Please check:"+",".join(not_in_df))
|
|
1383
|
-
return True
|
|
1508
|
+
return True
|
|
1509
|
+
|
gwaslab/util_ex_process_h5.py
CHANGED
|
@@ -3,31 +3,40 @@ import os
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
from gwaslab.g_Log import Log
|
|
5
5
|
|
|
6
|
-
def
|
|
6
|
+
def process_vcf_to_hfd5(vcf,
|
|
7
|
+
directory=None,
|
|
8
|
+
chr_dict=None,
|
|
9
|
+
group_size=20000000,
|
|
10
|
+
complevel=9,
|
|
11
|
+
chunksize=20000000,
|
|
12
|
+
log=Log(),
|
|
13
|
+
verbose=True):
|
|
14
|
+
|
|
7
15
|
#load vcf
|
|
8
|
-
log.write("Start
|
|
9
|
-
log.write(" -Reference VCF path:{}".format(vcf))
|
|
10
|
-
log.write(" -Output group size:{}".format(group_size))
|
|
11
|
-
log.write(" -Compression level:{}".format(complevel))
|
|
12
|
-
log.write(" -Loading chunksize:{}".format(chunksize))
|
|
16
|
+
log.write("Start to process VCF file to HDF5:", verbose=verbose)
|
|
17
|
+
log.write(" -Reference VCF path:{}".format(vcf), verbose=verbose)
|
|
18
|
+
log.write(" -Output group size:{}".format(group_size), verbose=verbose)
|
|
19
|
+
log.write(" -Compression level:{}".format(complevel), verbose=verbose)
|
|
20
|
+
log.write(" -Loading chunksize:{}".format(chunksize), verbose=verbose)
|
|
13
21
|
|
|
22
|
+
vcf_file_name = os.path.basename(vcf)
|
|
23
|
+
vcf_dir_path = os.path.dirname(vcf)
|
|
24
|
+
|
|
14
25
|
if directory is None:
|
|
15
|
-
directory=
|
|
16
|
-
|
|
26
|
+
# os.path.dirname() returns "" for a bare filename; fall back to "." so the "{}/{}..." output paths do not resolve under "/".
directory = vcf_dir_path or "."
|
|
17
27
|
elif directory[-1] == "/":
|
|
18
28
|
directory = directory.rstrip('/')
|
|
19
29
|
|
|
20
|
-
h5_path = "{}/rsID_CHR_POS_groups_{}.h5".format(directory,int(group_size))
|
|
21
|
-
log_path = "{}/rsID_CHR_POS_groups_{}.log".format(directory,int(group_size))
|
|
22
|
-
log.write(" -HDF5 Output path: {}".format(h5_path))
|
|
23
|
-
log.write(" -Log output path: {}".format(log_path))
|
|
30
|
+
h5_path = "{}/{}.rsID_CHR_POS_groups_{}.h5".format(directory,vcf_file_name,int(group_size))
|
|
31
|
+
log_path = "{}/{}.rsID_CHR_POS_groups_{}.log".format(directory,vcf_file_name, int(group_size))
|
|
32
|
+
log.write(" -HDF5 Output path: {}".format(h5_path), verbose=verbose)
|
|
33
|
+
log.write(" -Log output path: {}".format(log_path), verbose=verbose)
|
|
24
34
|
df = pd.read_table(vcf,comment="#",usecols=[0,1,2],header=None,chunksize=chunksize)
|
|
25
35
|
|
|
26
|
-
|
|
27
|
-
log.write(" -Processing chunk: ",end="")
|
|
36
|
+
log.write(" -Processing chunk: ",end="", verbose=verbose)
|
|
28
37
|
|
|
29
38
|
for index,chunk in enumerate(df):
|
|
30
|
-
log.write(index,end=" ",show_time=False)
|
|
39
|
+
log.write(index,end=" ",show_time=False, verbose=verbose)
|
|
31
40
|
chunk = chunk.rename(columns={0:"CHR",1:"POS",2:"rsn"})
|
|
32
41
|
if chr_dict is not None:
|
|
33
42
|
chunk["CHR"] = chunk["CHR"].map(chr_dict)
|
|
@@ -47,5 +56,5 @@ def process_ref_vcf(vcf, directory=None, chr_dict=None, group_size=20000000,comp
|
|
|
47
56
|
dropna=True,
|
|
48
57
|
format="table",
|
|
49
58
|
complevel=complevel)
|
|
50
|
-
log.write("Processing finished!")
|
|
51
|
-
log.save(log_path, verbose=
|
|
59
|
+
log.write("Processing finished!", verbose=verbose)
|
|
60
|
+
log.save(log_path, verbose=verbose)
|