gwaslab: 3.4.35 (py3-none-any.whl) → 3.4.37 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. (The original page linked to more details; the link is not preserved in this text.)

@@ -14,7 +14,12 @@ from gwaslab.bd_common_data import get_chr_to_number
14
14
  from gwaslab.bd_common_data import get_number_to_chr
15
15
  from gwaslab.bd_common_data import get_chr_list
16
16
  from gwaslab.qc_check_datatype import check_datatype
17
+ from gwaslab.qc_check_datatype import check_dataframe_shape
17
18
  from gwaslab.g_version import _get_version
19
+ from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
20
+ from gwaslab.util_in_fill_data import _convert_betase_to_p
21
+ from gwaslab.util_in_fill_data import _convert_mlog10p_to_p
22
+ #process build
18
23
  #setbuild
19
24
  #fixID
20
25
  #rsidtochrpos
@@ -26,6 +31,7 @@ from gwaslab.g_version import _get_version
26
31
  #normalizevariant
27
32
  #checkref
28
33
  #sanitycheckstats
34
+ #_check_data_consistency
29
35
  #flipallelestats
30
36
  #parallelizeassignrsid
31
37
  #sortcoordinate
@@ -41,7 +47,7 @@ def _process_build(build,log,verbose):
41
47
  log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
42
48
  final_build = "38"
43
49
  else:
44
- log.write(" -Version of genomic coordinates are unknown...", verbose=verbose)
50
+ log.write(" -WARNING! Version of genomic coordinates is unknown...", verbose=verbose)
45
51
  final_build = "99"
46
52
  return final_build
47
53
 
@@ -49,10 +55,10 @@ def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
49
55
  build = _process_build(build,log=log,verbose=verbose)
50
56
  sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status], 1, "139",build[0]*3)
51
57
  sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status], 2, "89",build[1]*3)
52
- return sumstats
58
+ return sumstats, build
53
59
 
54
60
  def fixID(sumstats,
55
- snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",
61
+ snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",fixprefix=False,
56
62
  fixchrpos=False,fixid=False,fixeanea=False,fixeanea_flip=False,fixsep=False,
57
63
  overwrite=False,verbose=True,forcefixid=False,log=Log()):
58
64
  '''
@@ -61,37 +67,64 @@ def fixID(sumstats,
61
67
  3. checking rsid and chr:pos:nea:ea
62
68
  '''
63
69
  if verbose: log.write("Start to check IDs...{}".format(_get_version()))
64
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
65
-
70
+ check_dataframe_shape(sumstats, log, verbose)
66
71
  check_col(sumstats,[snpid,rsid],status)
67
-
72
+
73
+ ############################ checking datatype ###################################################
74
+ if rsid in sumstats.columns:
75
+ # convert to string datatype
76
+ try:
77
+ log.write(" -Checking rsID data type...",verbose=verbose)
78
+ if sumstats.loc[:,rsid].dtype == "string":
79
+ pass
80
+ else:
81
+ log.write(" -Converting rsID to pd.string data type...",verbose=verbose)
82
+ sumstats.loc[:,rsid] = sumstats.loc[:,rsid].astype("string")
83
+ except:
84
+ log.write(" -Force converting rsID to pd.string data type...",verbose=verbose)
85
+ sumstats.loc[:,rsid] = sumstats.loc[:,rsid].astype("string")
86
+ if snpid in sumstats.columns:
87
+ # convert to string datatype
88
+ try:
89
+ log.write(" -Checking SNPID data type...",verbose=verbose)
90
+ if sumstats.loc[:,snpid].dtype == "string":
91
+ pass
92
+ else:
93
+ log.write(" -Converting SNPID to pd.string data type...",verbose=verbose)
94
+ sumstats.loc[:,snpid] = sumstats.loc[:,snpid].astype("string")
95
+ except:
96
+ log.write(" -Force converting SNPID to pd.string data type...",verbose=verbose)
97
+ sumstats.loc[:,snpid] = sumstats.loc[:,snpid].astype("string")
98
+
68
99
  ############################ checking ###################################################
69
100
  if snpid in sumstats.columns:
70
- if verbose: log.write(" -Checking if SNPID is chr:pos:ref:alt...(separator: - ,: , _)")
71
- #is_chrposrefalt = sumstats[snpid].str.match(r'(chr)?([0-9XYMT]+)[:_-]([0-9]+)[:_-]([ATCG]+)[:_-]([ATCG]+)', case=False, flags=0, na=False)
101
+ log.write(" -Checking if SNPID is CHR:POS:NEA:EA...(separator: - ,: , _)",verbose=verbose)
102
+ # check if SNPID is CHR:POS:EA:NEA
72
103
  is_chrposrefalt = sumstats[snpid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
104
+ # check if SNPID is NA
73
105
  is_snpid_na = sumstats[snpid].isna()
106
+
107
+ # change STATUS code
74
108
  sumstats.loc[ is_chrposrefalt,status] = vchange_status(sumstats.loc[ is_chrposrefalt,status],3 ,"975" ,"630")
75
109
  sumstats.loc[(~is_chrposrefalt)&(~is_snpid_na),status] = vchange_status(sumstats.loc[(~is_chrposrefalt)&(~is_snpid_na),status],3 ,"975" ,"842")
76
110
 
77
111
  if rsid in sumstats.columns:
78
- if verbose: log.write(" -Checking if rsID is rsxxxxxx or RSxxxxxxx...")
79
- is_rsid = sumstats[rsid].str.startswith(r'rs',na=False)
112
+ log.write(" -Checking if rsID is rsxxxxxx...", verbose=verbose)
113
+ is_rsid = sumstats[rsid].str.match(r'^rs\d+$', case=False, flags=0, na=False)
80
114
 
81
115
  sumstats.loc[ is_rsid,status] = vchange_status(sumstats.loc[ is_rsid,status], 3, "986","520")
82
116
  sumstats.loc[~is_rsid,status] = vchange_status(sumstats.loc[~is_rsid,status], 3, "986","743")
83
117
 
84
- if verbose: log.write(" -Checking if chr:pos:ref:alt is mixed in rsID column ...")
85
- is_rs_chrpos = sumstats[rsid].str.match(r'^\w+[:_-]\w+[:_-]\w+[:_-]\w+$', case=False, flags=0, na=False)
86
- #is_rs_chrpos = sumstats[rsid].str.match(r'(chr)?([0-9XYMT]+)[:_-]([0-9]+)[:_-]([ATCG]+)[:_-]([ATCG]+)', case=False, flags=0, na=False)
118
+ if verbose: log.write(" -Checking if CHR:POS:NEA:EA is mixed in rsID column ...")
119
+ is_rs_chrpos = sumstats[rsid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
87
120
 
88
- if verbose: log.write(" -Number of chr:pos:ref:alt mixed in rsID column :",sum(is_rs_chrpos))
89
- if verbose: log.write(" -Number of Unrecognized rsID :",len(sumstats) - sum(is_rs_chrpos) - sum(is_rsid) )
90
- if verbose: log.write(" -A look at the unrecognized rsID :",set(sumstats.loc[(~is_rsid)&(~is_rs_chrpos),rsid].head()),"...")
121
+ log.write(" -Number of CHR:POS:NEA:EA mixed in rsID column :",sum(is_rs_chrpos), verbose=verbose)
122
+ log.write(" -Number of Unrecognized rsID :",len(sumstats) - sum(is_rs_chrpos) - sum(is_rsid) , verbose=verbose)
123
+ log.write(" -A look at the unrecognized rsID :",set(sumstats.loc[(~is_rsid)&(~is_rs_chrpos),rsid].head()),"...", verbose=verbose)
91
124
 
92
125
  ############################ fixing chr pos###################################################
93
- if fixchrpos is True:
94
- # from snpid or rsid, extract chr:pos to fix CHR and POS
126
+ if fixchrpos == True:
127
+ # from snpid or rsid, extract CHR:POS to fix CHR and POS
95
128
  if snpid in sumstats.columns:
96
129
  if verbose: log.write(" -Fixing CHR and POS...")
97
130
  if overwrite is True:
@@ -99,8 +132,8 @@ def fixID(sumstats,
99
132
  # fix all
100
133
  to_fix = is_chrposrefalt
101
134
 
102
- #fix variants with chr and pos being empty
103
135
  elif (chrom in sumstats.columns) and (pos in sumstats.columns) :
136
+ #fix variants with chr and pos being NA
104
137
  to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
105
138
  to_fix_num = sum(to_fix)
106
139
  if to_fix_num and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
@@ -121,6 +154,7 @@ def fixID(sumstats,
121
154
  to_fix_num = sum(to_fix)
122
155
  if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
123
156
  elif verbose: log.write(" -No fixable variants. ...")
157
+
124
158
  else:
125
159
  if verbose: log.write(" -Initiating CHR and POS columns...")
126
160
  sumstats.loc[:,chrom]=pd.Series(dtype="string")
@@ -134,8 +168,8 @@ def fixID(sumstats,
134
168
  if verbose: log.write(" -Filling CHR and POS columns using valid SNPID's chr:pos...")
135
169
  # format and qc filled chr and pos
136
170
 
137
- sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.split(':|_|-',n=2).str.get(0)
138
- sumstats.loc[to_fix,pos] = sumstats.loc[to_fix,snpid].str.split(':|_|-',n=2).str.get(1)
171
+ sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
172
+ sumstats.loc[to_fix,pos] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[2]
139
173
 
140
174
  #sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.split(':|_|-').str[0].str.strip("chrCHR").astype("string")
141
175
  #sumstats.loc[to_fix,pos] =np.floor(pd.to_numeric(sumstats.loc[to_fix,snpid].str.split(':|_|-').str[1], errors='coerce')).astype('Int64')
@@ -179,55 +213,62 @@ def fixID(sumstats,
179
213
  #sumstats.loc[to_fix,status] = vchange_status(sumstats.loc[to_fix,status], 4, "98765432","00000000").astype("string")
180
214
 
181
215
  ############################ fixing chr pos###################################################
182
- #if fixeanea is True:
183
- # if verbose: log.write(" -Warning: Please make sure a1 is ref or not in Chr:pos:a1:a2")
184
- # if overwrite is True:
185
- # if verbose: log.write(" -Overwrite is applied...")
186
- # to_fix = is_chrposrefalt
187
- # elif (nea in sumstats.columns) and (nea in sumstats.columns):
188
- # to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
189
- # if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
190
- # elif (nea in sumstats.columns) and (ea not in sumstats.columns):
191
- # if verbose: log.write(" -Initiating EA columns...")
192
- # sumstats[ea]=pd.Series(dtype="string")
193
- # to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
194
- # if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
195
- # elif (nea not in sumstats.columns) and (ea in sumstats.columns):
196
- # if verbose: log.write(" -Initiating NEA columns...")
197
- # sumstats[nea]=pd.Series(dtype="string")
198
- # to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
199
- # if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
200
- # else:
201
- # if verbose: log.write(" -Initiating EA and NEA columns...")
202
- # sumstats[nea]=pd.Series(dtype="string")
203
- # sumstats[ea]=pd.Series(dtype="string")
204
- # to_fix = is_chrposrefalt
205
- # if sum(to_fix)>0:
206
- # if verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
216
+ if fixeanea == True:
217
+ if verbose: log.write(" -WARNING! gwaslab assumes SNPID is in the format of CHR:POS:NEA:EA / CHR:POS:REF:ALT")
218
+ if overwrite is True:
219
+ if verbose: log.write(" -Overwrite mode is applied...")
220
+ to_fix = is_chrposrefalt
221
+ elif (nea in sumstats.columns) and (nea in sumstats.columns):
222
+ to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
223
+ if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
224
+ elif (nea in sumstats.columns) and (ea not in sumstats.columns):
225
+ if verbose: log.write(" -Initiating EA columns...")
226
+ sumstats.loc[:,ea]=pd.Series(dtype="string")
227
+ to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
228
+ if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
229
+ elif (nea not in sumstats.columns) and (ea in sumstats.columns):
230
+ if verbose: log.write(" -Initiating NEA columns...")
231
+ sumstats.loc[:,nea]=pd.Series(dtype="string")
232
+ to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
233
+ if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
234
+ else:
235
+ if verbose: log.write(" -Initiating EA and NEA columns...")
236
+ sumstats[nea]=pd.Series(dtype="string")
237
+ sumstats[ea]=pd.Series(dtype="string")
238
+ to_fix = is_chrposrefalt
239
+ if sum(to_fix)>0:
240
+ if verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
207
241
  #
208
- # if sum(to_fix)>0:
209
- # if verbose: log.write(" -Filling "+str(sum(to_fix))+" EA and NEA columns using SNPID's chr:pos:nea:ea...")
242
+ if sum(to_fix)>0:
243
+ if verbose: log.write(" -Filling "+str(sum(to_fix))+" EA and NEA columns using SNPID's CHR:POS:NEA:EA...")
210
244
  #
211
- # if fixeanea_flip is True:
212
- # if verbose: log.write(" -Flipped : chr:pos:a1:a2...a1->EA , a2->NEA ")
213
- # sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[2]).astype("string")
214
- # sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[3]).astype("string")
215
- # else:
216
- # if verbose: log.write(" -Chr:pos:a1:a2...a1->EA , a2->NEA ")
217
- # sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[3]).astype("string")
218
- # sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[2]).astype("string")
245
+ if fixeanea_flip == True:
246
+ if verbose: log.write(" -Flipped : CHR:POS:NEA:EA -> CHR:POS:EA:NEA ")
247
+ sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
248
+ sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
249
+ else:
250
+ if verbose: log.write(" -Chr:pos:a1:a2...a1->EA , a2->NEA ")
251
+ sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
252
+ sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
253
+
219
254
  # #to_change_status = sumstats[status].str.match(r"\w\w\w[45]\w\w\w")
220
255
  # #sumstats.loc[to_fix&to_change_status,status] = vchange_status(sumstats.loc[to_fix&to_change_status,status],4,"2")
221
256
  # #sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[1]).astype("string")
222
257
  # #sumstats.loc[to_fix,rsid].apply(lambda x:re.split(':|_|-',x)[1]).astype("Int64")
223
258
 
224
259
  ############################ fixing id ###################################################
225
- if fixsep is True:
260
+ if fixsep == True:
226
261
  if snpid in sumstats.columns:
227
262
  if verbose: log.write(' -Replacing [_-] in SNPID with ":" ...')
228
263
  sumstats.loc[:,snpid] = sumstats.loc[:,snpid].str.replace(r"[_-]",":",regex=True)
264
+
265
+ if fixprefix == True:
266
+ if snpid in sumstats.columns:
267
+ if verbose: log.write(' -Removing /^chr/ in SNPID ...')
268
+ prefix_removed = sumstats.loc[:,snpid].str.extract(r'^(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
269
+ sumstats.loc[~prefix_removed.isna(),snpid] = prefix_removed[~prefix_removed.isna()]
229
270
 
230
- if fixid is True:
271
+ if fixid == True:
231
272
  if snpid not in sumstats.columns:
232
273
  # initiate a SNPID column
233
274
  sumstats.loc[:,snpid]=pd.Series(dtype="string")
@@ -304,19 +345,21 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
304
345
  remove multiallelic SNPs based on 4. CHR, POS
305
346
  '''
306
347
 
348
+ if verbose: log.write("Start to remove duplicated/multiallelic variants...{}".format(_get_version()))
349
+ if verbose: log.write(" -Removing mode:{}".format(mode))
307
350
  # sort the variants using the specified column before removing
308
351
  if keep_col is not None :
309
352
  if keep_col in sumstats.columns:
310
- if verbose: log.write("Start to sort the sumstats using " + keep_col +"...")
353
+ if verbose: log.write("Start to sort the sumstats using {}...".format(keep_col))
311
354
  sumstats = sumstats.sort_values(by=keep_col,ascending=keep_ascend)
312
355
  else:
313
356
  if verbose: log.write("Column" + keep_col +" was not detected... skipping... ")
314
357
  total_number = len(sumstats)
315
358
 
316
359
  # remove by duplicated SNPID
317
- if (snpid in sumstats.columns) and "d" in mode:
360
+ if (snpid in sumstats.columns) and ("d" in mode or "s" in mode):
318
361
  if verbose: log.write("Start to remove duplicated variants based on snpid...{}".format(_get_version()))
319
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
362
+ check_dataframe_shape(sumstats, log, verbose)
320
363
  if verbose: log.write(" -Which variant to keep: ", keep )
321
364
  pre_number =len(sumstats)
322
365
  if snpid in sumstats.columns:
@@ -326,18 +369,19 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
326
369
  if verbose: log.write(" -Removed ",pre_number -after_number ," based on SNPID...")
327
370
 
328
371
  # remove by duplicated rsID
329
- if (rsid in sumstats.columns) and ("d" in mode):
372
+ if (rsid in sumstats.columns) and ("d" in mode or "r" in mode):
330
373
  # keep na and remove duplicated
331
374
  pre_number =len(sumstats)
332
375
  if verbose: log.write("Start to remove duplicated variants based on rsID...")
376
+ check_dataframe_shape(sumstats, log, verbose)
333
377
  sumstats = sumstats.loc[sumstats[rsid].isna() | (~sumstats.duplicated(subset=rsid, keep=keep)),:]
334
378
  after_number=len(sumstats)
335
379
  if verbose: log.write(" -Removed ",pre_number -after_number ," based on rsID...")
336
380
 
337
381
  # remove by duplicated variants by CHR:POS:NEA:EA
338
- if (chrom in sumstats.columns) and (pos in sumstats.columns) and (nea in sumstats.columns) and (ea in sumstats.columns) and "d" in mode:
382
+ if (chrom in sumstats.columns) and (pos in sumstats.columns) and (nea in sumstats.columns) and (ea in sumstats.columns) and ("d" in mode or "c" in mode):
339
383
  if verbose: log.write("Start to remove duplicated variants based on CHR,POS,EA and NEA...")
340
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
384
+ check_dataframe_shape(sumstats, log, verbose)
341
385
  if verbose: log.write(" -Which variant to keep: ", keep )
342
386
  pre_number =len(sumstats)
343
387
  if snpid in sumstats.columns:
@@ -351,6 +395,7 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
351
395
  # keep na and remove duplicated
352
396
  pre_number =len(sumstats)
353
397
  if verbose: log.write("Start to remove multiallelic variants based on chr:pos...")
398
+ check_dataframe_shape(sumstats, log, verbose)
354
399
  if verbose: log.write(" -Which variant to keep: ", keep )
355
400
  sumstats = sumstats.loc[(~sumstats.loc[:,[chrom,pos]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos], keep=keep)),:]
356
401
  after_number=len(sumstats)
@@ -360,17 +405,37 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
360
405
  # resort the coordinates
361
406
  if verbose: log.write(" -Removed ",total_number -after_number," variants in total.")
362
407
  if keep_col is not None :
363
- if verbose: log.write(" -Sort the coordinates...")
408
+ if verbose: log.write(" -Sort the coordinates based on CHR and POS...")
364
409
  sumstats = sortcoordinate(sumstats,verbose=False)
365
410
 
366
- if remove is True:
411
+ if "n" in mode or remove==True:
367
412
  # if remove==True, remove NAs
368
413
  if verbose: log.write(" -Removing NAs...")
369
414
  pre_number =len(sumstats)
370
- sumstats = sumstats.loc[~sumstats.isna().any(axis=1),:]
415
+ specified_columns = []
416
+ if "d" in mode:
417
+ specified_columns.append(rsid)
418
+ specified_columns.append(snpid)
419
+ specified_columns.append(chrom)
420
+ specified_columns.append(pos)
421
+ specified_columns.append(ea)
422
+ specified_columns.append(nea)
423
+ if "r" in mode:
424
+ specified_columns.append(rsid)
425
+ if "s" in mode:
426
+ specified_columns.append(snpid)
427
+ if "m" in mode:
428
+ specified_columns.append(chrom)
429
+ specified_columns.append(pos)
430
+ if "c" in mode:
431
+ specified_columns.append(chrom)
432
+ specified_columns.append(pos)
433
+ specified_columns.append(ea)
434
+ specified_columns.append(nea)
435
+ sumstats = sumstats.loc[~sumstats[specified_columns].isna().any(axis=1),:]
371
436
  after_number=len(sumstats)
372
- if verbose: log.write(" -Removed ",pre_number -after_number," variants with NA values.")
373
- if verbose: log.write("Finished removing successfully!")
437
+ if verbose: log.write(" -Removed ",pre_number -after_number," variants with NA values in {} .".format(set(specified_columns)))
438
+ if verbose: log.write("Finished removing duplicated/multiallelic variants successfully!")
374
439
  return sumstats
375
440
 
376
441
  ###############################################################################################################
@@ -383,7 +448,7 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
383
448
  if verbose: log.write(".fix_chr: Specified not detected..skipping...")
384
449
  return sumstats
385
450
  if verbose: log.write("Start to fix chromosome notation...{}".format(_get_version()))
386
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
451
+ check_dataframe_shape(sumstats, log, verbose)
387
452
 
388
453
  # convert to string datatype
389
454
  try:
@@ -406,7 +471,8 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
406
471
  if sum(is_chr_fixed)<len(sumstats):
407
472
 
408
473
  #extract the CHR number or X Y M MT
409
- chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'(chr)?([0-9]{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
474
+ chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'^(chr)?(\d{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
475
+
410
476
  is_chr_fixable = ~chr_extracted.isna()
411
477
  if verbose: log.write(" -Variants with fixable chromosome notations:",sum(is_chr_fixable))
412
478
 
@@ -419,7 +485,10 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
419
485
  is_chr_invalid = (~is_chr_fixable)&(~is_chr_na)
420
486
  if sum(is_chr_invalid)>0 and verbose:
421
487
  log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid))
422
- log.write(" -Invalid chromosome notations converted to NA :" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()))
488
+ try:
489
+ log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()))
490
+ except:
491
+ pass
423
492
  elif verbose:
424
493
  log.write(" -No unrecognized chromosome notations...")
425
494
 
@@ -464,7 +533,15 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
464
533
  unrecognized_num = sum(~sumstats[chrom].isin(chrom_list))
465
534
  if (remove is True) and unrecognized_num>0:
466
535
  # remove variants with unrecognized CHR
467
- if verbose: log.write(" -Removed "+ str(unrecognized_num)+ " variants with unrecognized chromosome notations.")
536
+ try:
537
+ if verbose: log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])))
538
+ except:
539
+ pass
540
+ if verbose: log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.")
541
+ try:
542
+ log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()))
543
+ except:
544
+ pass
468
545
  #sumstats = sumstats.loc[sumstats.index[sumstats[chrom].isin(chrom_list)],:]
469
546
  good_chr = sumstats[chrom].isin(chrom_list)
470
547
  sumstats = sumstats.loc[good_chr, :].copy()
@@ -480,45 +557,48 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
480
557
  sumstats.loc[:,chrom] = np.floor(pd.to_numeric(sumstats.loc[:,chrom], errors='coerce')).astype('Int64')
481
558
 
482
559
  # filter out variants with CHR <=0
483
- if verbose: log.write(" -Sanity check for CHR...")
484
-
485
560
  out_of_range_chr = sumstats[chrom] < minchr
486
561
  out_of_range_chr = out_of_range_chr.fillna(False)
487
-
488
- if verbose:log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr))
489
-
490
- sumstats = sumstats.loc[~out_of_range_chr,:]
562
+ if sum(out_of_range_chr)>0:
563
+ if verbose: log.write(" -Sanity check for CHR...")
564
+ if verbose:log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr))
565
+ sumstats = sumstats.loc[~out_of_range_chr,:]
491
566
 
492
567
  if verbose: log.write("Finished fixing chromosome notation successfully!")
568
+
493
569
  return sumstats
494
570
 
495
571
  ###############################################################################################################
496
572
  # 20230128
497
- def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True,limit=250000000, log=Log()):
573
+ def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True, lower_limit=0 , upper_limit=None , limit=250000000, log=Log()):
574
+ if upper_limit is None:
575
+ upper_limit = limit
498
576
  if check_col(sumstats,pos,status) is not True:
499
577
  if verbose: log.write(".fix_pos: Specified not detected..skipping...")
500
578
  return sumstats
501
579
  if verbose: log.write("Start to fix basepair positions...{}".format(_get_version()))
502
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
580
+ check_dataframe_shape(sumstats, log, verbose)
503
581
 
504
582
  all_var_num = len(sumstats)
505
583
  #convert to numeric
506
584
  is_pos_na = sumstats.loc[:,pos].isna()
507
585
 
508
- # check if POS is string
509
- if pd.api.types.is_string_dtype(sumstats[pos]):
510
- # if so, remove thousands separator
511
- if verbose: log.write(' -Removing thousands separator "," or underbar "_" ...')
512
- sumstats.loc[~is_pos_na, pos] = sumstats.loc[~is_pos_na, pos].astype("string").str.replace(",|_", "",regex=True)
586
+ try:
587
+ if str(sumstats[pos].dtype) == "string" or str(sumstats[pos].dtype) == "object":
588
+ sumstats.loc[:,pos] = sumstats.loc[:,pos].astype('string')
589
+ # if so, remove thousands separator
590
+ if verbose: log.write(' -Removing thousands separator "," or underbar "_" ...')
591
+ sumstats.loc[~is_pos_na, pos] = sumstats.loc[~is_pos_na, pos].str.replace(r'[,_]', '' ,regex=True)
592
+ except:
593
+ pass
513
594
 
514
595
  # convert POS to integer
515
596
  try:
516
597
  if verbose: log.write(' -Converting to Int64 data type ...')
517
- sumstats.loc[:,pos] = sumstats.loc[:,pos].astype('Int64')
598
+ sumstats[pos] = sumstats[pos].astype('Int64')
518
599
  except:
519
600
  if verbose: log.write(' -Force converting to Int64 data type ...')
520
- sumstats.loc[:,pos] = np.floor(pd.to_numeric(sumstats.loc[:,pos], errors='coerce')).astype('Int64')
521
-
601
+ sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
522
602
  is_pos_fixed = ~sumstats.loc[:,pos].isna()
523
603
  is_pos_invalid = (~is_pos_na)&(~is_pos_fixed)
524
604
 
@@ -526,11 +606,11 @@ def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True,limit=2
526
606
  sumstats.loc[is_pos_invalid,status] = vchange_status(sumstats.loc[is_pos_invalid,status],4,"975","842")
527
607
 
528
608
  # remove outlier, limit:250,000,000
529
- if verbose: log.write(" -Position upper_bound is: " + "{:,}".format(limit))
530
- out_lier=(sumstats[pos]>limit) & (~is_pos_na)
531
- if verbose: log.write(" -Remove outliers:",sum(out_lier))
609
+ if verbose: log.write(" -Position bound:({} , {:,})".format(lower_limit, upper_limit))
610
+ is_pos_na = sumstats.loc[:,pos].isna()
611
+ out_lier= ((sumstats[pos]<=lower_limit) | (sumstats[pos]>=upper_limit)) & (~is_pos_na)
612
+ if verbose: log.write(" -Removed outliers:",sum(out_lier))
532
613
  sumstats = sumstats.loc[~out_lier,:]
533
-
534
614
  #remove na
535
615
  if remove is True:
536
616
  sumstats = sumstats.loc[~sumstats[pos].isna(),:]
@@ -539,6 +619,7 @@ def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True,limit=2
539
619
 
540
620
  if verbose: log.write(" -Converted all position to datatype Int64.")
541
621
  if verbose: log.write("Finished fixing basepair position successfully!")
622
+
542
623
  return sumstats
543
624
 
544
625
  ###############################################################################################################
@@ -549,11 +630,26 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
549
630
  if verbose: log.write("EA and NEA not detected..skipping...")
550
631
  return sumstats
551
632
  if verbose: log.write("Start to fix alleles...{}".format(_get_version()))
552
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
633
+ check_dataframe_shape(sumstats, log, verbose)
553
634
 
554
635
  #if (ea not in sumstats.columns) or (nea not in sumstats.columns):
555
636
  if verbose: log.write(" -Converted all bases to string datatype and UPPERCASE.")
637
+
638
+ #try:
639
+ # ea_missing = sum(sumstats[ea].isna())
640
+ # nea_missing = sum(sumstats[nea].isna())
641
+ # if sum(ea_missing)>0:
642
+ # if verbose: log.write(" -Converting {} missing EA to letter N.".format(ea_missing))
643
+ # sumstats.loc[:,ea] = sumstats.loc[:,ea].add_categories("N").fillna("N")
644
+ # if sum(sumstats[nea].isna())>0:
645
+ # if verbose: log.write(" -Converting {} missing NEA to letter N.".format(nea_missing))
646
+ # sumstats.loc[:,nea] = sumstats.loc[:,nea].add_categories("N").fillna("N")
647
+ #except:
648
+ # pass
649
+
556
650
  categories = set(sumstats.loc[:,ea].str.upper())|set(sumstats.loc[:,nea].str.upper())|set("N")
651
+ categories = {x for x in categories if pd.notna(x)}
652
+
557
653
  sumstats.loc[:,ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
558
654
  sumstats.loc[:,nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
559
655
  all_var_num = len(sumstats)
@@ -620,6 +716,7 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
620
716
  sumstats.loc[is_eanea_fixed&is_normalized,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_normalized, status], 5,"4","3")
621
717
  gc.collect()
622
718
  if verbose: log.write("Finished fixing allele successfully!")
719
+
623
720
  return sumstats
624
721
 
625
722
  ###############################################################################################################
@@ -627,11 +724,11 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
627
724
 
628
725
  def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
629
726
  if check_col(sumstats,pos,ea,nea,status) is not True:
630
- if verbose: log.write("WARNING:.normalize(): specified columns not detected..skipping...")
727
+ if verbose: log.write("WARNING! .normalize(): specified columns not detected..skipping...")
631
728
  return sumstats
632
729
 
633
730
  if verbose: log.write("Start to normalize variants...{}".format(_get_version()))
634
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
731
+ check_dataframe_shape(sumstats, log, verbose)
635
732
  #variants_to_check = status_match(sumstats[status],5,[4,5]) #
636
733
  #r'\w\w\w\w[45]\w\w'
637
734
  variants_to_check = sumstats[status].str[4].str.match(r'4|5', case=False, flags=0, na=False)
@@ -689,7 +786,8 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
689
786
 
690
787
  def normalizeallele(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
691
788
  #single df
692
- normalized = sumstats.apply(lambda x: normalizevariant(x[0],x[1],x[2],x[3]),axis=1)
789
+ #normalized = sumstats.apply(lambda x: normalizevariant(x[0],x[1],x[2],x[3]),axis=1)
790
+ normalized = sumstats.apply(lambda x: normalizevariant(x[pos],x[nea],x[ea],x[status]),axis=1)
693
791
  sumstats = pd.DataFrame(normalized.to_list(), columns=[pos,nea,ea,status],index=sumstats.index)
694
792
  return sumstats
695
793
 
@@ -811,7 +909,7 @@ def sanitycheckstats(sumstats,
811
909
  if coltocheck is None:
812
910
  coltocheck = ["P","MLOG10P","INFO","Z","BETA","SE","EAF","CHISQ","F","N","N_CASE","N_CONTROL","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","STATUS"]
813
911
  if verbose: log.write("Start sanity check for statistics...{}".format(_get_version()))
814
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
912
+ check_dataframe_shape(sumstats, log, verbose)
815
913
  cols_to_check=[]
816
914
  oringinal_number=len(sumstats)
817
915
  sumstats = sumstats.copy()
@@ -822,7 +920,7 @@ def sanitycheckstats(sumstats,
822
920
  if "N" in coltocheck and "N" in sumstats.columns:
823
921
  cols_to_check.append("N")
824
922
  if verbose: log.write(" -Checking if ",n[0],"<=N<=",n[1]," ...")
825
- sumstats.loc[:,"N"] = np.floor(pd.to_numeric(sumstats.loc[:,"N"], errors='coerce')).astype("Int32")
923
+ sumstats.loc[:,"N"] = np.floor(pd.to_numeric(sumstats.loc[:,"N"], errors='coerce')).astype("Int64")
826
924
  sumstats = sumstats.loc[(sumstats["N"]>=n[0]) & (sumstats["N"]<=n[1]),:]
827
925
  after_number=len(sumstats)
828
926
  if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N.")
@@ -830,7 +928,7 @@ def sanitycheckstats(sumstats,
830
928
  if "N_CASE" in coltocheck and "N_CASE" in sumstats.columns:
831
929
  cols_to_check.append("N_CASE")
832
930
  if verbose: log.write(" -Checking if ",ncase[0],"<=N_CASE<=",ncase[1]," ...")
833
- sumstats.loc[:,"N_CASE"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CASE"], errors='coerce')).astype("Int32")
931
+ sumstats.loc[:,"N_CASE"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CASE"], errors='coerce')).astype("Int64")
834
932
  sumstats = sumstats.loc[(sumstats["N_CASE"]>=ncase[0]) & (sumstats["N_CASE"]<=ncase[1]),:]
835
933
  after_number=len(sumstats)
836
934
  if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N_CASE.")
@@ -838,17 +936,11 @@ def sanitycheckstats(sumstats,
838
936
  if "N_CONTROL" in coltocheck and "N_CONTROL" in sumstats.columns:
839
937
  cols_to_check.append("N_CONTROL")
840
938
  if verbose: log.write(" -Checking if ",ncontrol[0],"<=N_CONTROL<=",ncontrol[1]," ...")
841
- sumstats.loc[:,"N_CONTROL"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CONTROL"], errors='coerce')).astype("Int32")
939
+ sumstats.loc[:,"N_CONTROL"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CONTROL"], errors='coerce')).astype("Int64")
842
940
  sumstats = sumstats.loc[(sumstats["N_CONTROL"]>=ncontrol[0]) & (sumstats["N_CONTROL"]<=ncontrol[1]),:]
843
941
  after_number=len(sumstats)
844
942
  if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N_CONTROL.")
845
- pre_number=len(sumstats)
846
- if "N" in coltocheck and "N" in sumstats.columns and "N_CONTROL" in coltocheck and "N_CONTROL" in sumstats.columns and "N_CASE" in coltocheck and "N_CASE" in sumstats.columns:
847
- if verbose: log.write(" -Checking if N = N_CASE + N_CONTROL ...")
848
- matched_n = sumstats.loc[:,"N"] == sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"]
849
- sumstats = sumstats.loc[matched_n,:]
850
- after_number=len(sumstats)
851
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with N != N_CASE + N_CONTROL.")
943
+
852
944
 
853
945
  ###ALLELE FREQUENCY################################################################################################################################################
854
946
  pre_number=len(sumstats)
@@ -908,6 +1000,11 @@ def sanitycheckstats(sumstats,
908
1000
  if verbose: log.write(" -Checking if ",p[0],"< P <",p[1]," ...")
909
1001
  sumstats.loc[:,"P"] = pd.to_numeric(sumstats.loc[:,"P"], errors='coerce').astype("float64")
910
1002
  sumstats = sumstats.loc[(sumstats["P"]>p[0]) & (sumstats["P"]<p[1]),:]
1003
+
1004
+ is_low_p = sumstats["P"] == 0
1005
+ if sum(is_low_p) >0:
1006
+ log.write(" -WARNING! Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(sum(is_low_p)), verbose=verbose)
1007
+ log.write(" -WARNING! Please consider using MLOG10P instead.", verbose=verbose)
911
1008
  after_number=len(sumstats)
912
1009
  if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad P.")
913
1010
 
@@ -1008,11 +1105,11 @@ def sanitycheckstats(sumstats,
1008
1105
  if verbose: log.write(" -Checking STATUS and converting STATUS to categories....")
1009
1106
  categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
1010
1107
  sumstats.loc[:,"STATUS"] = pd.Categorical(sumstats["STATUS"],categories=categories)
1011
-
1012
- pre_number=len(sumstats)
1013
- sumstats = sumstats.dropna(subset=cols_to_check)
1108
+
1109
+ #pre_number=len(sumstats)
1110
+ #sumstats = sumstats.dropna(subset=cols_to_check)
1014
1111
  after_number=len(sumstats)
1015
- if verbose:log.write(" -Removed {} variants with NAs in the checked columns...".format(pre_number - after_number))
1112
+ #if verbose:log.write(" -Removed {} variants with NAs in the checked columns...".format(pre_number - after_number))
1016
1113
 
1017
1114
  if verbose: log.write(" -Removed "+str(oringinal_number - after_number)+" variants with bad statistics in total.")
1018
1115
  if verbose:
@@ -1021,6 +1118,67 @@ def sanitycheckstats(sumstats,
1021
1118
  if verbose: log.write("Finished sanity check successfully!")
1022
1119
  return sumstats
1023
1120
 
1121
+ ### check consistency #############################################################################################################################################
1122
+
1123
def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verbose=True,log=Log()):
    """Report whether redundant statistics columns of ``sumstats`` agree with each other.

    The following checks run, each only when every column it needs is present:

    * MLOG10P derived from BETA/SE  vs.  MLOG10P
    * P derived from BETA/SE        vs.  P
    * P derived from MLOG10P        vs.  P
    * N                             vs.  N_CASE + N_CONTROL (exact equality)

    For every check the number of inconsistent variants is logged, together with
    the variant showing the largest absolute difference. Nothing is returned and
    ``sumstats`` is not modified.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Summary statistics table.
    rtol, atol : float
        Relative and absolute tolerance forwarded to ``numpy.isclose`` for the
        floating-point checks.
    equal_nan : bool
        Forwarded to ``numpy.isclose``; for the N check, rows where both N and
        N_CASE + N_CONTROL are missing count as consistent when this is True.
    verbose : bool
        Passed on to ``log.write`` to control printing.
    log : Log
        Logger that receives the messages.
    """
    if verbose: log.write("Start to check data consistency across columns...{}".format(_get_version()))
    check_dataframe_shape(sumstats, log, verbose)
    log.write(" -Tolerance: {} (Relative) and {} (Absolute)".format(rtol, atol),verbose=verbose)

    columns = sumstats.columns
    id_to_use = "SNPID" if "SNPID" in columns else "rsID"

    if "BETA" in columns and "SE" in columns:
        if "MLOG10P" in columns:
            log.write(" -Checking if BETA/SE-derived-MLOG10P is consistent with MLOG10P...",verbose=verbose)
            betase_derived_mlog10p = _convert_betase_to_mlog10p(sumstats["BETA"], sumstats["SE"])
            is_close = np.isclose(betase_derived_mlog10p, sumstats["MLOG10P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
            diff = betase_derived_mlog10p - sumstats["MLOG10P"]
            _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose)

        if "P" in columns:
            log.write(" -Checking if BETA/SE-derived-P is consistent with P...",verbose=verbose)
            betase_derived_p = _convert_betase_to_p(sumstats["BETA"], sumstats["SE"])
            is_close = np.isclose(betase_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
            diff = betase_derived_p - sumstats["P"]
            _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose)

    if "MLOG10P" in columns and "P" in columns:
        log.write(" -Checking if MLOG10P-derived-P is consistent with P...",verbose=verbose)
        mlog10p_derived_p = _convert_mlog10p_to_p(sumstats["MLOG10P"])
        is_close = np.isclose(mlog10p_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
        diff = mlog10p_derived_p - sumstats["P"]
        _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose)

    if "N" in columns and "N_CONTROL" in columns and "N_CASE" in columns:
        if verbose: log.write(" -Checking if N is consistent with N_CASE + N_CONTROL ...")
        n_total = sumstats.loc[:,"N"]
        n_summed = sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"]
        # Comparing nullable integer columns yields <NA> for missing values; turn
        # those into plain False so that counting cannot hit bool(pd.NA).
        is_close = (n_total == n_summed).to_numpy(dtype=bool, na_value=False)
        if equal_nan:
            # Mirror numpy.isclose(equal_nan=True): missing on both sides is consistent.
            is_close = is_close | (n_total.isna() & n_summed.isna()).to_numpy(dtype=bool)
        diff = n_total - n_summed
        _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose)

    log.write(" -Note: if the max difference is greater than expected, please check your original sumstats.",verbose=verbose)

    if verbose: log.write("Finished checking data consistency across columns.")


def _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose):
    """Log the outcome of one consistency check.

    ``is_close`` is a boolean array (one entry per row of ``sumstats``) and
    ``diff`` holds the per-row difference between the two compared quantities,
    in the same row order as ``sumstats``.
    """
    # Vectorised count instead of the builtin sum(), which loops in Python.
    n_inconsistent = int(np.count_nonzero(~np.asarray(is_close, dtype=bool)))
    if n_inconsistent == 0:
        log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
        return

    log.write(" -Not consistent: {} variant(s)".format(n_inconsistent), verbose=verbose)

    # Rank by absolute difference so large negative deviations are found too.
    diff = pd.Series(diff)
    abs_diff = diff.abs().to_numpy(dtype="float64", na_value=np.nan)
    if np.isnan(abs_diff).all():
        # Every difference is missing, so there is no "largest" one to report.
        return

    # Positional lookup: safe with duplicate index labels and missing values.
    worst = int(np.nanargmax(abs_diff))
    if id_to_use in sumstats.columns:
        id_label, variant_id = id_to_use, sumstats[id_to_use].iloc[worst]
    else:
        # Neither SNPID nor rsID is available; fall back to the row label.
        id_label, variant_id = "index", sumstats.index[worst]
    log.write(" -Variant {} with max difference: {} with {}".format(id_label, variant_id, abs(diff.iloc[worst])), verbose=verbose)
1024
1182
  ###############################################################################################################
1025
1183
  # 20220426
1026
1184
  def get_reverse_complementary_allele(a):
@@ -1046,7 +1204,7 @@ def flip_direction(string):
1046
1204
 
1047
1205
  def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1048
1206
 
1049
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
1207
+ check_dataframe_shape(sumstats, log, verbose)
1050
1208
 
1051
1209
  ###################get reverse complementary####################
1052
1210
  pattern = r"\w\w\w\w\w[45]\w"
@@ -1245,10 +1403,10 @@ def liftover_variant(sumstats,
1245
1403
 
1246
1404
  def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True, verbose=True,log=Log()):
1247
1405
  if check_col(sumstats,chrom,pos,status) is not True:
1248
- if verbose: log.write("WARNING:.liftover(): specified columns not detected..skipping...")
1406
+ if verbose: log.write("WARNING! .liftover(): specified columns not detected..skipping...")
1249
1407
  return sumstats
1250
1408
  if verbose: log.write("Start to perform liftover...{}".format(_get_version()))
1251
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
1409
+ check_dataframe_shape(sumstats, log, verbose)
1252
1410
  if verbose: log.write(" -CPU Cores to use :",n_cores)
1253
1411
  if verbose: log.write(" -Performing liftover ...")
1254
1412
  if verbose: log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build)
@@ -1292,7 +1450,7 @@ def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=
1292
1450
  return sumstats
1293
1451
 
1294
1452
  if verbose: log.write("Start to sort the genome coordinates...{}".format(_get_version()))
1295
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
1453
+ check_dataframe_shape(sumstats, log, verbose)
1296
1454
 
1297
1455
  try:
1298
1456
  if sumstats[pos].dtype == "Int64":
@@ -1311,11 +1469,11 @@ def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=
1311
1469
  ###############################################################################################################
1312
1470
  # 20230430 added HR HR_95 BETA_95 N_CASE N_CONTROL
1313
1471
  def sortcolumn(sumstats,verbose=True,log=Log(),order = [
1314
- "SNPID","rsID", "CHR", "POS", "EA", "NEA", "EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z",
1472
+ "SNPID","rsID", "CHR", "POS", "EA", "NEA", "EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z","T","F",
1315
1473
  "CHISQ", "P", "MLOG10P", "OR", "OR_95L", "OR_95U","HR", "HR_95L", "HR_95U","INFO", "N","N_CASE","N_CONTROL","DIRECTION","I2","P_HET","DOF","SNPR2","STATUS"
1316
1474
  ]):
1317
1475
  if verbose: log.write("Start to reorder the columns...{}".format(_get_version()))
1318
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
1476
+ check_dataframe_shape(sumstats, log, verbose)
1319
1477
 
1320
1478
  output_columns = []
1321
1479
  for i in order:
@@ -1347,4 +1505,5 @@ def check_col(df,*args):
1347
1505
  if len(not_in_df)>0:
1348
1506
  return False
1349
1507
  print(" -Specified columns names was not detected. Please check:"+",".join(not_in_df))
1350
- return True
1508
+ return True
1509
+