gwaslab 3.4.36__py3-none-any.whl → 3.4.37__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

@@ -14,7 +14,12 @@ from gwaslab.bd_common_data import get_chr_to_number
14
14
  from gwaslab.bd_common_data import get_number_to_chr
15
15
  from gwaslab.bd_common_data import get_chr_list
16
16
  from gwaslab.qc_check_datatype import check_datatype
17
+ from gwaslab.qc_check_datatype import check_dataframe_shape
17
18
  from gwaslab.g_version import _get_version
19
+ from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
20
+ from gwaslab.util_in_fill_data import _convert_betase_to_p
21
+ from gwaslab.util_in_fill_data import _convert_mlog10p_to_p
22
+ #process build
18
23
  #setbuild
19
24
  #fixID
20
25
  #rsidtochrpos
@@ -26,6 +31,7 @@ from gwaslab.g_version import _get_version
26
31
  #normalizevariant
27
32
  #checkref
28
33
  #sanitycheckstats
34
+ #_check_data_consistency
29
35
  #flipallelestats
30
36
  #parallelizeassignrsid
31
37
  #sortcoordinate
@@ -41,7 +47,7 @@ def _process_build(build,log,verbose):
41
47
  log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
42
48
  final_build = "38"
43
49
  else:
44
- log.write(" -Version of genomic coordinates are unknown...", verbose=verbose)
50
+ log.write(" -WARNING! Version of genomic coordinates is unknown...", verbose=verbose)
45
51
  final_build = "99"
46
52
  return final_build
47
53
 
@@ -49,10 +55,10 @@ def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
49
55
  build = _process_build(build,log=log,verbose=verbose)
50
56
  sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status], 1, "139",build[0]*3)
51
57
  sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status], 2, "89",build[1]*3)
52
- return sumstats
58
+ return sumstats, build
53
59
 
54
60
  def fixID(sumstats,
55
- snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",
61
+ snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",fixprefix=False,
56
62
  fixchrpos=False,fixid=False,fixeanea=False,fixeanea_flip=False,fixsep=False,
57
63
  overwrite=False,verbose=True,forcefixid=False,log=Log()):
58
64
  '''
@@ -61,37 +67,64 @@ def fixID(sumstats,
61
67
  3. checking rsid and chr:pos:nea:ea
62
68
  '''
63
69
  if verbose: log.write("Start to check IDs...{}".format(_get_version()))
64
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
65
-
70
+ check_dataframe_shape(sumstats, log, verbose)
66
71
  check_col(sumstats,[snpid,rsid],status)
67
-
72
+
73
+ ############################ checking datatype ###################################################
74
+ if rsid in sumstats.columns:
75
+ # convert to string datatype
76
+ try:
77
+ log.write(" -Checking rsID data type...",verbose=verbose)
78
+ if sumstats.loc[:,rsid].dtype == "string":
79
+ pass
80
+ else:
81
+ log.write(" -Converting rsID to pd.string data type...",verbose=verbose)
82
+ sumstats.loc[:,rsid] = sumstats.loc[:,rsid].astype("string")
83
+ except:
84
+ log.write(" -Force converting rsID to pd.string data type...",verbose=verbose)
85
+ sumstats.loc[:,rsid] = sumstats.loc[:,rsid].astype("string")
86
+ if snpid in sumstats.columns:
87
+ # convert to string datatype
88
+ try:
89
+ log.write(" -Checking SNPID data type...",verbose=verbose)
90
+ if sumstats.loc[:,snpid].dtype == "string":
91
+ pass
92
+ else:
93
+ log.write(" -Converting SNPID to pd.string data type...",verbose=verbose)
94
+ sumstats.loc[:,snpid] = sumstats.loc[:,snpid].astype("string")
95
+ except:
96
+ log.write(" -Force converting SNPID to pd.string data type...",verbose=verbose)
97
+ sumstats.loc[:,snpid] = sumstats.loc[:,snpid].astype("string")
98
+
68
99
  ############################ checking ###################################################
69
100
  if snpid in sumstats.columns:
70
- if verbose: log.write(" -Checking if SNPID is chr:pos:ref:alt...(separator: - ,: , _)")
71
- #is_chrposrefalt = sumstats[snpid].str.match(r'(chr)?([0-9XYMT]+)[:_-]([0-9]+)[:_-]([ATCG]+)[:_-]([ATCG]+)', case=False, flags=0, na=False)
101
+ log.write(" -Checking if SNPID is CHR:POS:NEA:EA...(separator: - ,: , _)",verbose=verbose)
102
+ # check if SNPID is CHR:POS:EA:NEA
72
103
  is_chrposrefalt = sumstats[snpid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
104
+ # check if SNPID is NA
73
105
  is_snpid_na = sumstats[snpid].isna()
106
+
107
+ # change STATUS code
74
108
  sumstats.loc[ is_chrposrefalt,status] = vchange_status(sumstats.loc[ is_chrposrefalt,status],3 ,"975" ,"630")
75
109
  sumstats.loc[(~is_chrposrefalt)&(~is_snpid_na),status] = vchange_status(sumstats.loc[(~is_chrposrefalt)&(~is_snpid_na),status],3 ,"975" ,"842")
76
110
 
77
111
  if rsid in sumstats.columns:
78
- if verbose: log.write(" -Checking if rsID is rsxxxxxx or RSxxxxxxx...")
79
- is_rsid = sumstats[rsid].str.startswith(r'rs',na=False)
112
+ log.write(" -Checking if rsID is rsxxxxxx...", verbose=verbose)
113
+ is_rsid = sumstats[rsid].str.match(r'^rs\d+$', case=False, flags=0, na=False)
80
114
 
81
115
  sumstats.loc[ is_rsid,status] = vchange_status(sumstats.loc[ is_rsid,status], 3, "986","520")
82
116
  sumstats.loc[~is_rsid,status] = vchange_status(sumstats.loc[~is_rsid,status], 3, "986","743")
83
117
 
84
- if verbose: log.write(" -Checking if chr:pos:ref:alt is mixed in rsID column ...")
85
- is_rs_chrpos = sumstats[rsid].str.match(r'^\w+[:_-]\w+[:_-]\w+[:_-]\w+$', case=False, flags=0, na=False)
86
- #is_rs_chrpos = sumstats[rsid].str.match(r'(chr)?([0-9XYMT]+)[:_-]([0-9]+)[:_-]([ATCG]+)[:_-]([ATCG]+)', case=False, flags=0, na=False)
118
+ if verbose: log.write(" -Checking if CHR:POS:NEA:EA is mixed in rsID column ...")
119
+ is_rs_chrpos = sumstats[rsid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
87
120
 
88
- if verbose: log.write(" -Number of chr:pos:ref:alt mixed in rsID column :",sum(is_rs_chrpos))
89
- if verbose: log.write(" -Number of Unrecognized rsID :",len(sumstats) - sum(is_rs_chrpos) - sum(is_rsid) )
90
- if verbose: log.write(" -A look at the unrecognized rsID :",set(sumstats.loc[(~is_rsid)&(~is_rs_chrpos),rsid].head()),"...")
121
+ log.write(" -Number of CHR:POS:NEA:EA mixed in rsID column :",sum(is_rs_chrpos), verbose=verbose)
122
+ log.write(" -Number of Unrecognized rsID :",len(sumstats) - sum(is_rs_chrpos) - sum(is_rsid) , verbose=verbose)
123
+ log.write(" -A look at the unrecognized rsID :",set(sumstats.loc[(~is_rsid)&(~is_rs_chrpos),rsid].head()),"...", verbose=verbose)
91
124
 
92
125
  ############################ fixing chr pos###################################################
93
- if fixchrpos is True:
94
- # from snpid or rsid, extract chr:pos to fix CHR and POS
126
+ if fixchrpos == True:
127
+ # from snpid or rsid, extract CHR:POS to fix CHR and POS
95
128
  if snpid in sumstats.columns:
96
129
  if verbose: log.write(" -Fixing CHR and POS...")
97
130
  if overwrite is True:
@@ -99,8 +132,8 @@ def fixID(sumstats,
99
132
  # fix all
100
133
  to_fix = is_chrposrefalt
101
134
 
102
- #fix variants with chr and pos being empty
103
135
  elif (chrom in sumstats.columns) and (pos in sumstats.columns) :
136
+ #fix variants with chr and pos being NA
104
137
  to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
105
138
  to_fix_num = sum(to_fix)
106
139
  if to_fix_num and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
@@ -121,6 +154,7 @@ def fixID(sumstats,
121
154
  to_fix_num = sum(to_fix)
122
155
  if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
123
156
  elif verbose: log.write(" -No fixable variants. ...")
157
+
124
158
  else:
125
159
  if verbose: log.write(" -Initiating CHR and POS columns...")
126
160
  sumstats.loc[:,chrom]=pd.Series(dtype="string")
@@ -134,8 +168,8 @@ def fixID(sumstats,
134
168
  if verbose: log.write(" -Filling CHR and POS columns using valid SNPID's chr:pos...")
135
169
  # format and qc filled chr and pos
136
170
 
137
- sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.split(':|_|-',n=2).str.get(0)
138
- sumstats.loc[to_fix,pos] = sumstats.loc[to_fix,snpid].str.split(':|_|-',n=2).str.get(1)
171
+ sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
172
+ sumstats.loc[to_fix,pos] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[2]
139
173
 
140
174
  #sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.split(':|_|-').str[0].str.strip("chrCHR").astype("string")
141
175
  #sumstats.loc[to_fix,pos] =np.floor(pd.to_numeric(sumstats.loc[to_fix,snpid].str.split(':|_|-').str[1], errors='coerce')).astype('Int64')
@@ -179,55 +213,62 @@ def fixID(sumstats,
179
213
  #sumstats.loc[to_fix,status] = vchange_status(sumstats.loc[to_fix,status], 4, "98765432","00000000").astype("string")
180
214
 
181
215
  ############################ fixing chr pos###################################################
182
- #if fixeanea is True:
183
- # if verbose: log.write(" -Warning: Please make sure a1 is ref or not in Chr:pos:a1:a2")
184
- # if overwrite is True:
185
- # if verbose: log.write(" -Overwrite is applied...")
186
- # to_fix = is_chrposrefalt
187
- # elif (nea in sumstats.columns) and (nea in sumstats.columns):
188
- # to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
189
- # if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
190
- # elif (nea in sumstats.columns) and (ea not in sumstats.columns):
191
- # if verbose: log.write(" -Initiating EA columns...")
192
- # sumstats[ea]=pd.Series(dtype="string")
193
- # to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
194
- # if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
195
- # elif (nea not in sumstats.columns) and (ea in sumstats.columns):
196
- # if verbose: log.write(" -Initiating NEA columns...")
197
- # sumstats[nea]=pd.Series(dtype="string")
198
- # to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
199
- # if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
200
- # else:
201
- # if verbose: log.write(" -Initiating EA and NEA columns...")
202
- # sumstats[nea]=pd.Series(dtype="string")
203
- # sumstats[ea]=pd.Series(dtype="string")
204
- # to_fix = is_chrposrefalt
205
- # if sum(to_fix)>0:
206
- # if verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
216
+ if fixeanea == True:
217
+ if verbose: log.write(" -WARNING! gwaslab assumes SNPID is in the format of CHR:POS:NEA:EA / CHR:POS:REF:ALT")
218
+ if overwrite is True:
219
+ if verbose: log.write(" -Overwrite mode is applied...")
220
+ to_fix = is_chrposrefalt
221
+ elif (nea in sumstats.columns) and (nea in sumstats.columns):
222
+ to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
223
+ if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
224
+ elif (nea in sumstats.columns) and (ea not in sumstats.columns):
225
+ if verbose: log.write(" -Initiating EA columns...")
226
+ sumstats.loc[:,ea]=pd.Series(dtype="string")
227
+ to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
228
+ if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
229
+ elif (nea not in sumstats.columns) and (ea in sumstats.columns):
230
+ if verbose: log.write(" -Initiating NEA columns...")
231
+ sumstats.loc[:,nea]=pd.Series(dtype="string")
232
+ to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
233
+ if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
234
+ else:
235
+ if verbose: log.write(" -Initiating EA and NEA columns...")
236
+ sumstats[nea]=pd.Series(dtype="string")
237
+ sumstats[ea]=pd.Series(dtype="string")
238
+ to_fix = is_chrposrefalt
239
+ if sum(to_fix)>0:
240
+ if verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
207
241
  #
208
- # if sum(to_fix)>0:
209
- # if verbose: log.write(" -Filling "+str(sum(to_fix))+" EA and NEA columns using SNPID's chr:pos:nea:ea...")
242
+ if sum(to_fix)>0:
243
+ if verbose: log.write(" -Filling "+str(sum(to_fix))+" EA and NEA columns using SNPID's CHR:POS:NEA:EA...")
210
244
  #
211
- # if fixeanea_flip is True:
212
- # if verbose: log.write(" -Flipped : chr:pos:a1:a2...a1->EA , a2->NEA ")
213
- # sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[2]).astype("string")
214
- # sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[3]).astype("string")
215
- # else:
216
- # if verbose: log.write(" -Chr:pos:a1:a2...a1->EA , a2->NEA ")
217
- # sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[3]).astype("string")
218
- # sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[2]).astype("string")
245
+ if fixeanea_flip == True:
246
+ if verbose: log.write(" -Flipped : CHR:POS:NEA:EA -> CHR:POS:EA:NEA ")
247
+ sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
248
+ sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
249
+ else:
250
+ if verbose: log.write(" -Chr:pos:a1:a2...a1->EA , a2->NEA ")
251
+ sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
252
+ sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
253
+
219
254
  # #to_change_status = sumstats[status].str.match(r"\w\w\w[45]\w\w\w")
220
255
  # #sumstats.loc[to_fix&to_change_status,status] = vchange_status(sumstats.loc[to_fix&to_change_status,status],4,"2")
221
256
  # #sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[1]).astype("string")
222
257
  # #sumstats.loc[to_fix,rsid].apply(lambda x:re.split(':|_|-',x)[1]).astype("Int64")
223
258
 
224
259
  ############################ fixing id ###################################################
225
- if fixsep is True:
260
+ if fixsep == True:
226
261
  if snpid in sumstats.columns:
227
262
  if verbose: log.write(' -Replacing [_-] in SNPID with ":" ...')
228
263
  sumstats.loc[:,snpid] = sumstats.loc[:,snpid].str.replace(r"[_-]",":",regex=True)
264
+
265
+ if fixprefix == True:
266
+ if snpid in sumstats.columns:
267
+ if verbose: log.write(' -Removing /^chr/ in SNPID ...')
268
+ prefix_removed = sumstats.loc[:,snpid].str.extract(r'^(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
269
+ sumstats.loc[~prefix_removed.isna(),snpid] = prefix_removed[~prefix_removed.isna()]
229
270
 
230
- if fixid is True:
271
+ if fixid == True:
231
272
  if snpid not in sumstats.columns:
232
273
  # initiate a SNPID column
233
274
  sumstats.loc[:,snpid]=pd.Series(dtype="string")
@@ -304,19 +345,21 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
304
345
  remove multiallelic SNPs based on 4. CHR, POS
305
346
  '''
306
347
 
348
+ if verbose: log.write("Start to remove duplicated/multiallelic variants...{}".format(_get_version()))
349
+ if verbose: log.write(" -Removing mode:{}".format(mode))
307
350
  # sort the variants using the specified column before removing
308
351
  if keep_col is not None :
309
352
  if keep_col in sumstats.columns:
310
- if verbose: log.write("Start to sort the sumstats using " + keep_col +"...")
353
+ if verbose: log.write("Start to sort the sumstats using {}...".format(keep_col))
311
354
  sumstats = sumstats.sort_values(by=keep_col,ascending=keep_ascend)
312
355
  else:
313
356
  if verbose: log.write("Column" + keep_col +" was not detected... skipping... ")
314
357
  total_number = len(sumstats)
315
358
 
316
359
  # remove by duplicated SNPID
317
- if (snpid in sumstats.columns) and "d" in mode:
360
+ if (snpid in sumstats.columns) and ("d" in mode or "s" in mode):
318
361
  if verbose: log.write("Start to remove duplicated variants based on snpid...{}".format(_get_version()))
319
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
362
+ check_dataframe_shape(sumstats, log, verbose)
320
363
  if verbose: log.write(" -Which variant to keep: ", keep )
321
364
  pre_number =len(sumstats)
322
365
  if snpid in sumstats.columns:
@@ -326,18 +369,19 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
326
369
  if verbose: log.write(" -Removed ",pre_number -after_number ," based on SNPID...")
327
370
 
328
371
  # remove by duplicated rsID
329
- if (rsid in sumstats.columns) and ("d" in mode):
372
+ if (rsid in sumstats.columns) and ("d" in mode or "r" in mode):
330
373
  # keep na and remove duplicated
331
374
  pre_number =len(sumstats)
332
375
  if verbose: log.write("Start to remove duplicated variants based on rsID...")
376
+ check_dataframe_shape(sumstats, log, verbose)
333
377
  sumstats = sumstats.loc[sumstats[rsid].isna() | (~sumstats.duplicated(subset=rsid, keep=keep)),:]
334
378
  after_number=len(sumstats)
335
379
  if verbose: log.write(" -Removed ",pre_number -after_number ," based on rsID...")
336
380
 
337
381
  # remove by duplicated variants by CHR:POS:NEA:EA
338
- if (chrom in sumstats.columns) and (pos in sumstats.columns) and (nea in sumstats.columns) and (ea in sumstats.columns) and "d" in mode:
382
+ if (chrom in sumstats.columns) and (pos in sumstats.columns) and (nea in sumstats.columns) and (ea in sumstats.columns) and ("d" in mode or "c" in mode):
339
383
  if verbose: log.write("Start to remove duplicated variants based on CHR,POS,EA and NEA...")
340
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
384
+ check_dataframe_shape(sumstats, log, verbose)
341
385
  if verbose: log.write(" -Which variant to keep: ", keep )
342
386
  pre_number =len(sumstats)
343
387
  if snpid in sumstats.columns:
@@ -351,6 +395,7 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
351
395
  # keep na and remove duplicated
352
396
  pre_number =len(sumstats)
353
397
  if verbose: log.write("Start to remove multiallelic variants based on chr:pos...")
398
+ check_dataframe_shape(sumstats, log, verbose)
354
399
  if verbose: log.write(" -Which variant to keep: ", keep )
355
400
  sumstats = sumstats.loc[(~sumstats.loc[:,[chrom,pos]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos], keep=keep)),:]
356
401
  after_number=len(sumstats)
@@ -360,17 +405,37 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
360
405
  # resort the coordinates
361
406
  if verbose: log.write(" -Removed ",total_number -after_number," variants in total.")
362
407
  if keep_col is not None :
363
- if verbose: log.write(" -Sort the coordinates...")
408
+ if verbose: log.write(" -Sort the coordinates based on CHR and POS...")
364
409
  sumstats = sortcoordinate(sumstats,verbose=False)
365
410
 
366
- if remove is True:
411
+ if "n" in mode or remove==True:
367
412
  # if remove==True, remove NAs
368
413
  if verbose: log.write(" -Removing NAs...")
369
414
  pre_number =len(sumstats)
370
- sumstats = sumstats.loc[~sumstats.isna().any(axis=1),:]
415
+ specified_columns = []
416
+ if "d" in mode:
417
+ specified_columns.append(rsid)
418
+ specified_columns.append(snpid)
419
+ specified_columns.append(chrom)
420
+ specified_columns.append(pos)
421
+ specified_columns.append(ea)
422
+ specified_columns.append(nea)
423
+ if "r" in mode:
424
+ specified_columns.append(rsid)
425
+ if "s" in mode:
426
+ specified_columns.append(snpid)
427
+ if "m" in mode:
428
+ specified_columns.append(chrom)
429
+ specified_columns.append(pos)
430
+ if "c" in mode:
431
+ specified_columns.append(chrom)
432
+ specified_columns.append(pos)
433
+ specified_columns.append(ea)
434
+ specified_columns.append(nea)
435
+ sumstats = sumstats.loc[~sumstats[specified_columns].isna().any(axis=1),:]
371
436
  after_number=len(sumstats)
372
- if verbose: log.write(" -Removed ",pre_number -after_number," variants with NA values.")
373
- if verbose: log.write("Finished removing successfully!")
437
+ if verbose: log.write(" -Removed ",pre_number -after_number," variants with NA values in {} .".format(set(specified_columns)))
438
+ if verbose: log.write("Finished removing duplicated/multiallelic variants successfully!")
374
439
  return sumstats
375
440
 
376
441
  ###############################################################################################################
@@ -383,7 +448,7 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
383
448
  if verbose: log.write(".fix_chr: Specified not detected..skipping...")
384
449
  return sumstats
385
450
  if verbose: log.write("Start to fix chromosome notation...{}".format(_get_version()))
386
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
451
+ check_dataframe_shape(sumstats, log, verbose)
387
452
 
388
453
  # convert to string datatype
389
454
  try:
@@ -512,7 +577,7 @@ def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True, lower_
512
577
  if verbose: log.write(".fix_pos: Specified not detected..skipping...")
513
578
  return sumstats
514
579
  if verbose: log.write("Start to fix basepair positions...{}".format(_get_version()))
515
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
580
+ check_dataframe_shape(sumstats, log, verbose)
516
581
 
517
582
  all_var_num = len(sumstats)
518
583
  #convert to numeric
@@ -565,7 +630,7 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
565
630
  if verbose: log.write("EA and NEA not detected..skipping...")
566
631
  return sumstats
567
632
  if verbose: log.write("Start to fix alleles...{}".format(_get_version()))
568
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
633
+ check_dataframe_shape(sumstats, log, verbose)
569
634
 
570
635
  #if (ea not in sumstats.columns) or (nea not in sumstats.columns):
571
636
  if verbose: log.write(" -Converted all bases to string datatype and UPPERCASE.")
@@ -659,11 +724,11 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
659
724
 
660
725
  def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
661
726
  if check_col(sumstats,pos,ea,nea,status) is not True:
662
- if verbose: log.write("WARNING:.normalize(): specified columns not detected..skipping...")
727
+ if verbose: log.write("WARNING! .normalize(): specified columns not detected..skipping...")
663
728
  return sumstats
664
729
 
665
730
  if verbose: log.write("Start to normalize variants...{}".format(_get_version()))
666
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
731
+ check_dataframe_shape(sumstats, log, verbose)
667
732
  #variants_to_check = status_match(sumstats[status],5,[4,5]) #
668
733
  #r'\w\w\w\w[45]\w\w'
669
734
  variants_to_check = sumstats[status].str[4].str.match(r'4|5', case=False, flags=0, na=False)
@@ -844,7 +909,7 @@ def sanitycheckstats(sumstats,
844
909
  if coltocheck is None:
845
910
  coltocheck = ["P","MLOG10P","INFO","Z","BETA","SE","EAF","CHISQ","F","N","N_CASE","N_CONTROL","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","STATUS"]
846
911
  if verbose: log.write("Start sanity check for statistics...{}".format(_get_version()))
847
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
912
+ check_dataframe_shape(sumstats, log, verbose)
848
913
  cols_to_check=[]
849
914
  oringinal_number=len(sumstats)
850
915
  sumstats = sumstats.copy()
@@ -875,13 +940,7 @@ def sanitycheckstats(sumstats,
875
940
  sumstats = sumstats.loc[(sumstats["N_CONTROL"]>=ncontrol[0]) & (sumstats["N_CONTROL"]<=ncontrol[1]),:]
876
941
  after_number=len(sumstats)
877
942
  if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N_CONTROL.")
878
- pre_number=len(sumstats)
879
- if "N" in coltocheck and "N" in sumstats.columns and "N_CONTROL" in coltocheck and "N_CONTROL" in sumstats.columns and "N_CASE" in coltocheck and "N_CASE" in sumstats.columns:
880
- if verbose: log.write(" -Checking if N = N_CASE + N_CONTROL ...")
881
- matched_n = sumstats.loc[:,"N"] == sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"]
882
- sumstats = sumstats.loc[matched_n,:]
883
- after_number=len(sumstats)
884
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with N != N_CASE + N_CONTROL.")
943
+
885
944
 
886
945
  ###ALLELE FREQUENCY################################################################################################################################################
887
946
  pre_number=len(sumstats)
@@ -941,6 +1000,11 @@ def sanitycheckstats(sumstats,
941
1000
  if verbose: log.write(" -Checking if ",p[0],"< P <",p[1]," ...")
942
1001
  sumstats.loc[:,"P"] = pd.to_numeric(sumstats.loc[:,"P"], errors='coerce').astype("float64")
943
1002
  sumstats = sumstats.loc[(sumstats["P"]>p[0]) & (sumstats["P"]<p[1]),:]
1003
+
1004
+ is_low_p = sumstats["P"] == 0
1005
+ if sum(is_low_p) >0:
1006
+ log.write(" -WARNING! Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(sum(is_low_p)), verbose=verbose)
1007
+ log.write(" -WARNING! Please consider using MLOG10P instead.", verbose=verbose)
944
1008
  after_number=len(sumstats)
945
1009
  if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad P.")
946
1010
 
@@ -1041,10 +1105,10 @@ def sanitycheckstats(sumstats,
1041
1105
  if verbose: log.write(" -Checking STATUS and converting STATUS to categories....")
1042
1106
  categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
1043
1107
  sumstats.loc[:,"STATUS"] = pd.Categorical(sumstats["STATUS"],categories=categories)
1044
-
1108
+
1045
1109
  #pre_number=len(sumstats)
1046
1110
  #sumstats = sumstats.dropna(subset=cols_to_check)
1047
- #after_number=len(sumstats)
1111
+ after_number=len(sumstats)
1048
1112
  #if verbose:log.write(" -Removed {} variants with NAs in the checked columns...".format(pre_number - after_number))
1049
1113
 
1050
1114
  if verbose: log.write(" -Removed "+str(oringinal_number - after_number)+" variants with bad statistics in total.")
@@ -1054,6 +1118,67 @@ def sanitycheckstats(sumstats,
1054
1118
  if verbose: log.write("Finished sanity check successfully!")
1055
1119
  return sumstats
1056
1120
 
1121
+ ### check consistency #############################################################################################################################################
1122
+
1123
+ def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verbose=True,log=Log()):
1124
+ if verbose: log.write("Start to check data consistency across columns...{}".format(_get_version()))
1125
+ check_dataframe_shape(sumstats, log, verbose)
1126
+ log.write(" -Tolerance: {} (Relative) and {} (Absolute)".format(rtol, atol),verbose=verbose)
1127
+
1128
+
1129
+ if "SNPID" not in sumstats.columns:
1130
+ id_to_use = "rsID"
1131
+ else:
1132
+ id_to_use = "SNPID"
1133
+
1134
+ if "BETA" in sumstats.columns and "SE" in sumstats.columns:
1135
+ if "MLOG10P" in sumstats.columns:
1136
+ log.write(" -Checking if BETA/SE-derived-MLOG10P is consistent with MLOG10P...",verbose=verbose)
1137
+ betase_derived_mlog10p = _convert_betase_to_mlog10p(sumstats["BETA"], sumstats["SE"])
1138
+ is_close = np.isclose(betase_derived_mlog10p, sumstats["MLOG10P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
1139
+ diff = betase_derived_mlog10p - sumstats["MLOG10P"]
1140
+ if sum(~is_close)>0:
1141
+ log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose))
1142
+ log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose))
1143
+ else:
1144
+ log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
1145
+
1146
+ if "P" in sumstats.columns:
1147
+ log.write(" -Checking if BETA/SE-derived-P is consistent with P...",verbose=verbose)
1148
+ betase_derived_p = _convert_betase_to_p(sumstats["BETA"], sumstats["SE"])
1149
+ is_close = np.isclose(betase_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
1150
+ diff = betase_derived_p - sumstats["P"]
1151
+ if sum(~is_close)>0:
1152
+ log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose))
1153
+ log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose))
1154
+ else:
1155
+ log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
1156
+
1157
+ if "MLOG10P" in sumstats.columns and "P" in sumstats.columns:
1158
+ log.write(" -Checking if MLOG10P-derived-P is consistent with P...",verbose=verbose)
1159
+ mlog10p_derived_p = _convert_mlog10p_to_p(sumstats["MLOG10P"])
1160
+ is_close = np.isclose(mlog10p_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
1161
+ diff = mlog10p_derived_p - sumstats["P"]
1162
+ if sum(~is_close)>0:
1163
+ log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose))
1164
+ log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose))
1165
+ else:
1166
+ log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
1167
+
1168
+ if "N" in sumstats.columns and "N_CONTROL" in sumstats.columns and "N_CASE" in sumstats.columns:
1169
+ if verbose: log.write(" -Checking if N is consistent with N_CASE + N_CONTROL ...")
1170
+ is_close = sumstats.loc[:,"N"] == sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"]
1171
+ #is_close = np.isclose(sumstats.loc[:,"N"], sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"] , rtol=rtol, atol=atol, equal_nan=equal_nan)
1172
+ diff = abs(sumstats.loc[:,"N"] - (sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"] ))
1173
+ if sum(~is_close)>0:
1174
+ log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose))
1175
+ log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose))
1176
+ else:
1177
+ log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
1178
+
1179
+ log.write(" -Note: if the max difference is greater than expected, please check your original sumstats.",verbose=verbose)
1180
+
1181
+ if verbose: log.write("Finished checking data consistency across columns.")
1057
1182
  ###############################################################################################################
1058
1183
  # 20220426
1059
1184
  def get_reverse_complementary_allele(a):
@@ -1079,7 +1204,7 @@ def flip_direction(string):
1079
1204
 
1080
1205
  def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1081
1206
 
1082
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
1207
+ check_dataframe_shape(sumstats, log, verbose)
1083
1208
 
1084
1209
  ###################get reverse complementary####################
1085
1210
  pattern = r"\w\w\w\w\w[45]\w"
@@ -1278,10 +1403,10 @@ def liftover_variant(sumstats,
1278
1403
 
1279
1404
  def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True, verbose=True,log=Log()):
1280
1405
  if check_col(sumstats,chrom,pos,status) is not True:
1281
- if verbose: log.write("WARNING:.liftover(): specified columns not detected..skipping...")
1406
+ if verbose: log.write("WARNING! .liftover(): specified columns not detected..skipping...")
1282
1407
  return sumstats
1283
1408
  if verbose: log.write("Start to perform liftover...{}".format(_get_version()))
1284
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
1409
+ check_dataframe_shape(sumstats, log, verbose)
1285
1410
  if verbose: log.write(" -CPU Cores to use :",n_cores)
1286
1411
  if verbose: log.write(" -Performing liftover ...")
1287
1412
  if verbose: log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build)
@@ -1325,7 +1450,7 @@ def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=
1325
1450
  return sumstats
1326
1451
 
1327
1452
  if verbose: log.write("Start to sort the genome coordinates...{}".format(_get_version()))
1328
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
1453
+ check_dataframe_shape(sumstats, log, verbose)
1329
1454
 
1330
1455
  try:
1331
1456
  if sumstats[pos].dtype == "Int64":
@@ -1344,11 +1469,11 @@ def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=
1344
1469
  ###############################################################################################################
1345
1470
  # 20230430 added HR HR_95 BETA_95 N_CASE N_CONTROL
1346
1471
  def sortcolumn(sumstats,verbose=True,log=Log(),order = [
1347
- "SNPID","rsID", "CHR", "POS", "EA", "NEA", "EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z",
1472
+ "SNPID","rsID", "CHR", "POS", "EA", "NEA", "EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z","T","F",
1348
1473
  "CHISQ", "P", "MLOG10P", "OR", "OR_95L", "OR_95U","HR", "HR_95L", "HR_95U","INFO", "N","N_CASE","N_CONTROL","DIRECTION","I2","P_HET","DOF","SNPR2","STATUS"
1349
1474
  ]):
1350
1475
  if verbose: log.write("Start to reorder the columns...{}".format(_get_version()))
1351
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
1476
+ check_dataframe_shape(sumstats, log, verbose)
1352
1477
 
1353
1478
  output_columns = []
1354
1479
  for i in order:
@@ -1380,4 +1505,5 @@ def check_col(df,*args):
1380
1505
  if len(not_in_df)>0:
1381
1506
  return False
1382
1507
  print(" -Specified columns names was not detected. Please check:"+",".join(not_in_df))
1383
- return True
1508
+ return True
1509
+
@@ -3,31 +3,40 @@ import os
3
3
  import numpy as np
4
4
  from gwaslab.g_Log import Log
5
5
 
6
- def process_ref_vcf(vcf, directory=None, chr_dict=None, group_size=20000000,complevel=9,chunksize=20000000,log=Log()):
6
+ def process_vcf_to_hfd5(vcf,
7
+ directory=None,
8
+ chr_dict=None,
9
+ group_size=20000000,
10
+ complevel=9,
11
+ chunksize=20000000,
12
+ log=Log(),
13
+ verbose=True):
14
+
7
15
  #load vcf
8
- log.write("Start processing VCF files:")
9
- log.write(" -Reference VCF path:{}".format(vcf))
10
- log.write(" -Output group size:{}".format(group_size))
11
- log.write(" -Compression level:{}".format(complevel))
12
- log.write(" -Loading chunksize:{}".format(chunksize))
16
+ log.write("Start to process VCF file to HDF5:", verbose=verbose)
17
+ log.write(" -Reference VCF path:{}".format(vcf), verbose=verbose)
18
+ log.write(" -Output group size:{}".format(group_size), verbose=verbose)
19
+ log.write(" -Compression level:{}".format(complevel), verbose=verbose)
20
+ log.write(" -Loading chunksize:{}".format(chunksize), verbose=verbose)
13
21
 
22
+ vcf_file_name = os.path.basename(vcf)
23
+ vcf_dir_path = os.path.dirname(vcf)
24
+
14
25
  if directory is None:
15
- directory="./"
16
-
26
+ directory = vcf_dir_path
17
27
  elif directory[-1] == "/":
18
28
  directory = directory.rstrip('/')
19
29
 
20
- h5_path = "{}/rsID_CHR_POS_groups_{}.h5".format(directory,int(group_size))
21
- log_path = "{}/rsID_CHR_POS_groups_{}.log".format(directory,int(group_size))
22
- log.write(" -HDF5 Output path: {}".format(h5_path))
23
- log.write(" -Log output path: {}".format(log_path))
30
+ h5_path = "{}/{}.rsID_CHR_POS_groups_{}.h5".format(directory,vcf_file_name,int(group_size))
31
+ log_path = "{}/{}.rsID_CHR_POS_groups_{}.log".format(directory,vcf_file_name, int(group_size))
32
+ log.write(" -HDF5 Output path: {}".format(h5_path), verbose=verbose)
33
+ log.write(" -Log output path: {}".format(log_path), verbose=verbose)
24
34
  df = pd.read_table(vcf,comment="#",usecols=[0,1,2],header=None,chunksize=chunksize)
25
35
 
26
-
27
- log.write(" -Processing chunk: ",end="")
36
+ log.write(" -Processing chunk: ",end="", verbose=verbose)
28
37
 
29
38
  for index,chunk in enumerate(df):
30
- log.write(index,end=" ",show_time=False)
39
+ log.write(index,end=" ",show_time=False, verbose=verbose)
31
40
  chunk = chunk.rename(columns={0:"CHR",1:"POS",2:"rsn"})
32
41
  if chr_dict is not None:
33
42
  chunk["CHR"] = chunk["CHR"].map(chr_dict)
@@ -47,5 +56,5 @@ def process_ref_vcf(vcf, directory=None, chr_dict=None, group_size=20000000,comp
47
56
  dropna=True,
48
57
  format="table",
49
58
  complevel=complevel)
50
- log.write("Processing finished!")
51
- log.save(log_path, verbose=False)
59
+ log.write("Processing finished!", verbose=verbose)
60
+ log.save(log_path, verbose=verbose)