gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (51)
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/g_Log.py +14 -5
  5. gwaslab/g_Sumstats.py +86 -18
  6. gwaslab/g_SumstatsPair.py +70 -23
  7. gwaslab/g_SumstatsT.py +2 -2
  8. gwaslab/g_version.py +10 -10
  9. gwaslab/hm_casting.py +9 -4
  10. gwaslab/hm_harmonize_sumstats.py +88 -83
  11. gwaslab/io_preformat_input.py +14 -14
  12. gwaslab/io_read_ldsc.py +49 -1
  13. gwaslab/ldsc_irwls.py +198 -0
  14. gwaslab/ldsc_jackknife.py +514 -0
  15. gwaslab/ldsc_ldscore.py +417 -0
  16. gwaslab/ldsc_parse.py +294 -0
  17. gwaslab/ldsc_regressions.py +747 -0
  18. gwaslab/ldsc_sumstats.py +629 -0
  19. gwaslab/qc_check_datatype.py +1 -1
  20. gwaslab/qc_fix_sumstats.py +163 -161
  21. gwaslab/util_ex_calculate_ldmatrix.py +2 -2
  22. gwaslab/util_ex_gwascatalog.py +24 -24
  23. gwaslab/util_ex_ldproxyfinder.py +9 -9
  24. gwaslab/util_ex_ldsc.py +189 -0
  25. gwaslab/util_in_calculate_gc.py +6 -6
  26. gwaslab/util_in_calculate_power.py +42 -43
  27. gwaslab/util_in_convert_h2.py +8 -8
  28. gwaslab/util_in_fill_data.py +28 -28
  29. gwaslab/util_in_filter_value.py +91 -52
  30. gwaslab/util_in_get_density.py +8 -8
  31. gwaslab/util_in_get_sig.py +407 -65
  32. gwaslab/viz_aux_annotate_plot.py +12 -12
  33. gwaslab/viz_aux_quickfix.py +18 -18
  34. gwaslab/viz_aux_reposition_text.py +3 -3
  35. gwaslab/viz_aux_save_figure.py +14 -5
  36. gwaslab/viz_plot_compare_af.py +29 -30
  37. gwaslab/viz_plot_compare_effect.py +63 -71
  38. gwaslab/viz_plot_miamiplot2.py +6 -6
  39. gwaslab/viz_plot_mqqplot.py +17 -3
  40. gwaslab/viz_plot_qqplot.py +1 -1
  41. gwaslab/viz_plot_regionalplot.py +33 -32
  42. gwaslab/viz_plot_rg_heatmap.py +28 -26
  43. gwaslab/viz_plot_stackedregional.py +40 -21
  44. gwaslab/viz_plot_trumpetplot.py +50 -55
  45. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  46. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
  47. gwaslab-3.4.39.dist-info/RECORD +80 -0
  48. gwaslab-3.4.38.dist-info/RECORD +0 -72
  49. /gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  50. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  51. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
@@ -129,7 +129,7 @@ def fixID(sumstats,
129
129
  sumstats.loc[ is_rsid,status] = vchange_status(sumstats.loc[ is_rsid,status], 3, "986","520")
130
130
  sumstats.loc[~is_rsid,status] = vchange_status(sumstats.loc[~is_rsid,status], 3, "986","743")
131
131
 
132
- if verbose: log.write(" -Checking if CHR:POS:NEA:EA is mixed in rsID column ...")
132
+ log.write(" -Checking if CHR:POS:NEA:EA is mixed in rsID column ...", verbose=verbose)
133
133
  is_rs_chrpos = sumstats[rsid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
134
134
 
135
135
  log.write(" -Number of CHR:POS:NEA:EA mixed in rsID column :",sum(is_rs_chrpos), verbose=verbose)
@@ -140,9 +140,9 @@ def fixID(sumstats,
140
140
  if fixchrpos == True:
141
141
  # from snpid or rsid, extract CHR:POS to fix CHR and POS
142
142
  if snpid in sumstats.columns:
143
- if verbose: log.write(" -Fixing CHR and POS...")
143
+ log.write(" -Fixing CHR and POS...", verbose=verbose)
144
144
  if overwrite is True:
145
- if verbose: log.write(" -Overwrite is applied...")
145
+ log.write(" -Overwrite is applied...", verbose=verbose)
146
146
  # fix all
147
147
  to_fix = is_chrposrefalt
148
148
 
@@ -151,35 +151,39 @@ def fixID(sumstats,
151
151
  to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
152
152
  to_fix_num = sum(to_fix)
153
153
  if to_fix_num and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
154
- elif verbose: log.write(" -No fixable variants. ...")
154
+ else:
155
+ log.write(" -No fixable variants. ...", verbose=verbose)
155
156
 
156
157
  elif (chrom not in sumstats.columns) and (pos in sumstats.columns):
157
- if verbose: log.write(" -Initiating CHR columns...")
158
+ log.write(" -Initiating CHR columns...", verbose=verbose)
158
159
  sumstats[chrom]=pd.Series(dtype="string")
159
160
  to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
160
161
  to_fix_num = sum(to_fix)
161
162
  if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
162
- elif verbose: log.write(" -No fixable variants. ...")
163
+ else:
164
+ log.write(" -No fixable variants. ...", verbose=verbose)
163
165
 
164
166
  elif (chrom in sumstats.columns) and (pos not in sumstats.columns):
165
- if verbose: log.write(" -Initiating CHR and POS column...")
167
+ log.write(" -Initiating CHR and POS column...", verbose=verbose)
166
168
  sumstats[pos]=pd.Series(dtype="Int64")
167
169
  to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
168
170
  to_fix_num = sum(to_fix)
169
171
  if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
170
- elif verbose: log.write(" -No fixable variants. ...")
172
+ else:
173
+ log.write(" -No fixable variants. ...", verbose=verbose)
171
174
 
172
175
  else:
173
- if verbose: log.write(" -Initiating CHR and POS columns...")
176
+ log.write(" -Initiating CHR and POS columns...", verbose=verbose)
174
177
  sumstats[chrom]=pd.Series(dtype="string")
175
178
  sumstats[pos]=pd.Series(dtype="Int64")
176
179
  to_fix = is_chrposrefalt
177
180
  to_fix_num = sum(to_fix)
178
181
  if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
179
- elif verbose: log.write(" -No fixable variants. ...")
182
+ else:
183
+ log.write(" -No fixable variants. ...", verbose=verbose)
180
184
 
181
185
  if sum(to_fix)>0:
182
- if verbose: log.write(" -Filling CHR and POS columns using valid SNPID's chr:pos...")
186
+ log.write(" -Filling CHR and POS columns using valid SNPID's chr:pos...", verbose=verbose)
183
187
  # format and qc filled chr and pos
184
188
 
185
189
  sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
@@ -191,36 +195,40 @@ def fixID(sumstats,
191
195
  #sumstats.loc[to_fix,status] = vchange_status(sumstats.loc[to_fix,status], 4, "98765432","00000000")
192
196
 
193
197
  if rsid in sumstats.columns:
194
- if verbose: log.write(" -Fixing CHR and POS using chr:pos:ref:alt format variants in rsID column...")
198
+ log.write(" -Fixing CHR and POS using chr:pos:ref:alt format variants in rsID column...", verbose=verbose)
195
199
  if overwrite is True:
196
- if verbose: log.write(" -Overwrite is applied...")
200
+ log.write(" -Overwrite is applied...", verbose=verbose)
197
201
  to_fix = is_rs_chrpos
198
202
  elif (chrom in sumstats.columns) and (pos in sumstats.columns) :
199
203
  to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
200
204
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
201
- elif verbose: log.write(" -No fixable variants ...")
205
+ else:
206
+ log.write(" -No fixable variants ...", verbose=verbose)
202
207
  elif (chrom not in sumstats.columns) and (pos in sumstats.columns):
203
- if verbose: log.write(" -Initiating CHR columns...")
208
+ log.write(" -Initiating CHR columns...", verbose=verbose)
204
209
  sumstats[chrom]=pd.Series(dtype="string")
205
210
  to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
206
211
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
207
- elif verbose: log.write(" -No fixable variants ...")
212
+ else:
213
+ log.write(" -No fixable variants ...", verbose=verbose)
208
214
  elif (chrom in sumstats.columns) and (pos not in sumstats.columns):
209
- if verbose: log.write(" -Initiating CHR and POS column...")
215
+ log.write(" -Initiating CHR and POS column...", verbose=verbose)
210
216
  sumstats[pos]=pd.Series(dtype="Int64")
211
217
  to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
212
218
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
213
- elif verbose: log.write(" -No fixable variants ...")
219
+ else:
220
+ log.write(" -No fixable variants ...", verbose=verbose)
214
221
  else:
215
- if verbose: log.write(" -Initiating CHR and POS columns...")
222
+ log.write(" -Initiating CHR and POS columns...", verbose=verbose)
216
223
  sumstats[chrom]=pd.Series(dtype="string")
217
224
  sumstats[pos]=pd.Series(dtype="Int64")
218
225
  to_fix = is_rs_chrpos
219
226
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
220
- elif verbose: log.write(" -No fixable variants ...")
227
+ else:
228
+ log.write(" -No fixable variants ...", verbose=verbose)
221
229
 
222
230
  if sum(to_fix)>0:
223
- if verbose: log.write(" -Filling CHR and POS columns using chr:pos:ref:alt format variants in rsID column...")
231
+ log.write(" -Filling CHR and POS columns using chr:pos:ref:alt format variants in rsID column...", verbose=verbose)
224
232
  sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,rsid].str.split(':|_|-',n=2).str[0]
225
233
  sumstats.loc[to_fix,pos] = sumstats.loc[to_fix,rsid].str.split(':|_|-',n=2).str[1]
226
234
  #sumstats.loc[to_fix,pos] = np.floor(pd.to_numeric(sumstats.loc[to_fix,rsid].str.split(':|_|-',x).get(1), errors='coerce')).astype('Int64')
@@ -228,40 +236,40 @@ def fixID(sumstats,
228
236
 
229
237
  ############################ fixing chr pos###################################################
230
238
  if fixeanea == True:
231
- if verbose: log.warning("gwaslab assumes SNPID is in the format of CHR:POS:NEA:EA / CHR:POS:REF:ALT")
239
+ log.warning("gwaslab assumes SNPID is in the format of CHR:POS:NEA:EA / CHR:POS:REF:ALT", verbose=verbose)
232
240
  if overwrite is True:
233
- if verbose: log.write(" -Overwrite mode is applied...")
241
+ log.write(" -Overwrite mode is applied...", verbose=verbose)
234
242
  to_fix = is_chrposrefalt
235
243
  elif (nea in sumstats.columns) and (nea in sumstats.columns):
236
244
  to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
237
245
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
238
246
  elif (nea in sumstats.columns) and (ea not in sumstats.columns):
239
- if verbose: log.write(" -Initiating EA columns...")
247
+ log.write(" -Initiating EA columns...", verbose=verbose)
240
248
  sumstats[ea]=pd.Series(dtype="string")
241
249
  to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
242
250
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
243
251
  elif (nea not in sumstats.columns) and (ea in sumstats.columns):
244
- if verbose: log.write(" -Initiating NEA columns...")
252
+ log.write(" -Initiating NEA columns...", verbose=verbose)
245
253
  sumstats[nea]=pd.Series(dtype="string")
246
254
  to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
247
255
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
248
256
  else:
249
- if verbose: log.write(" -Initiating EA and NEA columns...")
257
+ log.write(" -Initiating EA and NEA columns...", verbose=verbose)
250
258
  sumstats[nea]=pd.Series(dtype="string")
251
259
  sumstats[ea]=pd.Series(dtype="string")
252
260
  to_fix = is_chrposrefalt
253
261
  if sum(to_fix)>0:
254
- if verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
262
+ log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...", verbose=verbose)
255
263
  #
256
264
  if sum(to_fix)>0:
257
- if verbose: log.write(" -Filling "+str(sum(to_fix))+" EA and NEA columns using SNPID's CHR:POS:NEA:EA...")
265
+ log.write(" -Filling "+str(sum(to_fix))+" EA and NEA columns using SNPID's CHR:POS:NEA:EA...", verbose=verbose)
258
266
  #
259
267
  if fixeanea_flip == True:
260
- if verbose: log.write(" -Flipped : CHR:POS:NEA:EA -> CHR:POS:EA:NEA ")
268
+ log.write(" -Flipped : CHR:POS:NEA:EA -> CHR:POS:EA:NEA ", verbose=verbose)
261
269
  sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
262
270
  sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
263
271
  else:
264
- if verbose: log.write(" -Chr:pos:a1:a2...a1->EA , a2->NEA ")
272
+ log.write(" -Chr:pos:a1:a2...a1->EA , a2->NEA ", verbose=verbose)
265
273
  sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
266
274
  sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
267
275
 
@@ -273,12 +281,12 @@ def fixID(sumstats,
273
281
  ############################ fixing id ###################################################
274
282
  if fixsep == True:
275
283
  if snpid in sumstats.columns:
276
- if verbose: log.write(' -Replacing [_-] in SNPID with ":" ...')
284
+ log.write(' -Replacing [_-] in SNPID with ":" ...', verbose=verbose)
277
285
  sumstats[snpid] = sumstats[snpid].str.replace(r"[_-]",":",regex=True)
278
286
 
279
287
  if fixprefix == True:
280
288
  if snpid in sumstats.columns:
281
- if verbose: log.write(' -Removing /^chr/ in SNPID ...')
289
+ log.write(' -Removing /^chr/ in SNPID ...', verbose=verbose)
282
290
  prefix_removed = sumstats[snpid].str.extract(r'^(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
283
291
  sumstats.loc[~prefix_removed.isna(),snpid] = prefix_removed[~prefix_removed.isna()]
284
292
 
@@ -327,22 +335,23 @@ def fixID(sumstats,
327
335
  sumstats.loc[to_part_fix,snpid] = sumstats.loc[to_part_fix,chrom].astype("string") + ":"+sumstats.loc[to_part_fix,pos].astype("string")
328
336
  if sum(to_full_fix)>0:
329
337
  sumstats.loc[to_full_fix,snpid] = sumstats.loc[to_full_fix,chrom].astype("string") + ":"+sumstats.loc[to_full_fix,pos].astype("string") +":"+ sumstats.loc[to_full_fix,nea].astype("string") +":"+ sumstats.loc[to_full_fix,ea].astype("string")
330
- if verbose: log.write(" -Filling "+str(sum(to_part_fix)-sum(to_full_fix)) +" SNPID using CHR:POS...")
331
- if verbose: log.write(" -Filling "+str(sum(to_full_fix)) +" SNPID using CHR:POS:NEA:EA...")
338
+ log.write(" -Filling "+str(sum(to_part_fix)-sum(to_full_fix)) +" SNPID using CHR:POS...", verbose=verbose)
339
+ log.write(" -Filling "+str(sum(to_full_fix)) +" SNPID using CHR:POS:NEA:EA...", verbose=verbose)
332
340
  sumstats.loc[(to_full_fix),status] = vchange_status(sumstats.loc[(to_full_fix),status],3,"975","630")
333
341
  sumstats.loc[(to_part_fix),status] = vchange_status(sumstats.loc[(to_part_fix),status],3,"975","842")
334
342
 
335
343
  else:
336
344
  #when these is no ea or ena, just fix to chr:pos
337
345
  to_part_fix = to_fix & sumstats[chrom].notnull() & sumstats[pos].notnull()
338
- if verbose: log.write(" -Filling "+str(sum(to_part_fix)) +" SNPID using CHR POS...")
346
+ log.write(" -Filling "+str(sum(to_part_fix)) +" SNPID using CHR POS...", verbose=verbose)
339
347
  if sum(to_part_fix)>0:
340
348
  sumstats.loc[to_part_fix,snpid] = sumstats.loc[to_part_fix,chrom].astype("string") + ":"+sumstats.loc[to_part_fix,pos].astype("string")
341
349
  sumstats.loc[to_part_fix,status] = vchange_status(sumstats.loc[(to_part_fix),status],3,"975","842")
342
350
 
343
351
  after_number=sum(sumstats[snpid].isna())
344
- if verbose: log.write(" -Fixed "+ str(pre_number - after_number) +" variants ID...")
345
- elif verbose: log.write(" -ID unfixable: no CHR and POS columns or no SNPID. ")
352
+ log.write(" -Fixed "+ str(pre_number - after_number) +" variants ID...", verbose=verbose)
353
+ else:
354
+ log.write(" -ID unfixable: no CHR and POS columns or no SNPID. ", verbose=verbose)
346
355
 
347
356
  finished(log,verbose,_end_line)
348
357
  return sumstats
@@ -378,71 +387,71 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
378
387
  if is_enough_info == False: return sumstats
379
388
  ############################################################################################
380
389
 
381
- if verbose: log.write(" -Removing mode:{}".format(mode))
390
+ log.write(" -Removing mode:{}".format(mode), verbose=verbose)
382
391
  # sort the variants using the specified column before removing
383
392
  if keep_col is not None :
384
393
  if keep_col in sumstats.columns:
385
- if verbose: log.write("Start to sort the sumstats using {}...".format(keep_col))
394
+ log.write("Start to sort the sumstats using {}...".format(keep_col), verbose=verbose)
386
395
  sumstats = sumstats.sort_values(by=keep_col,ascending=keep_ascend)
387
396
  else:
388
- if verbose: log.write("Column" + keep_col +" was not detected... skipping... ")
397
+ log.write("Column" + keep_col +" was not detected... skipping... ", verbose=verbose)
389
398
  total_number = len(sumstats)
390
399
 
391
400
  # remove by duplicated SNPID
392
401
  if (snpid in sumstats.columns) and ("d" in mode or "s" in mode):
393
- if verbose: log.write("Start to remove duplicated variants based on snpid...{}".format(_get_version()))
402
+ log.write("Start to remove duplicated variants based on snpid...{}".format(_get_version()), verbose=verbose)
394
403
  check_dataframe_shape(sumstats, log, verbose)
395
- if verbose: log.write(" -Which variant to keep: ", keep )
404
+ log.write(" -Which variant to keep: ", keep , verbose=verbose)
396
405
  pre_number =len(sumstats)
397
406
  if snpid in sumstats.columns:
398
407
  # keep na and remove duplicated
399
408
  sumstats = sumstats.loc[sumstats[snpid].isna() | (~sumstats.duplicated(subset=[snpid], keep=keep)),:]
400
409
  after_number=len(sumstats)
401
- if verbose: log.write(" -Removed ",pre_number -after_number ," based on SNPID...")
410
+ log.write(" -Removed ",pre_number -after_number ," based on SNPID...", verbose=verbose)
402
411
 
403
412
  # remove by duplicated rsID
404
413
  if (rsid in sumstats.columns) and ("d" in mode or "r" in mode):
405
414
  # keep na and remove duplicated
406
415
  pre_number =len(sumstats)
407
- if verbose: log.write("Start to remove duplicated variants based on rsID...")
416
+ log.write("Start to remove duplicated variants based on rsID...", verbose=verbose)
408
417
  check_dataframe_shape(sumstats, log, verbose)
409
418
  sumstats = sumstats.loc[sumstats[rsid].isna() | (~sumstats.duplicated(subset=rsid, keep=keep)),:]
410
419
  after_number=len(sumstats)
411
- if verbose: log.write(" -Removed ",pre_number -after_number ," based on rsID...")
420
+ log.write(" -Removed ",pre_number -after_number ," based on rsID...", verbose=verbose)
412
421
 
413
422
  # remove by duplicated variants by CHR:POS:NEA:EA
414
423
  if (chrom in sumstats.columns) and (pos in sumstats.columns) and (nea in sumstats.columns) and (ea in sumstats.columns) and ("d" in mode or "c" in mode):
415
- if verbose: log.write("Start to remove duplicated variants based on CHR,POS,EA and NEA...")
424
+ log.write("Start to remove duplicated variants based on CHR,POS,EA and NEA...", verbose=verbose)
416
425
  check_dataframe_shape(sumstats, log, verbose)
417
- if verbose: log.write(" -Which variant to keep: ", keep )
426
+ log.write(" -Which variant to keep: ", keep , verbose=verbose)
418
427
  pre_number =len(sumstats)
419
428
  if snpid in sumstats.columns:
420
429
  # keep na and remove duplicated
421
430
  sumstats = sumstats.loc[(~sumstats[[chrom,pos,ea,nea]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos,ea,nea], keep=keep)),:]
422
431
  after_number=len(sumstats)
423
- if verbose: log.write(" -Removed ",pre_number -after_number ," based on CHR,POS,EA and NEA...")
432
+ log.write(" -Removed ",pre_number -after_number ," based on CHR,POS,EA and NEA...", verbose=verbose)
424
433
 
425
434
  # remove by multiallelic variants by CHR:POS
426
435
  if (chrom in sumstats.columns) and (pos in sumstats.columns) and "m" in mode:
427
436
  # keep na and remove duplicated
428
437
  pre_number =len(sumstats)
429
- if verbose: log.write("Start to remove multiallelic variants based on chr:pos...")
438
+ log.write("Start to remove multiallelic variants based on chr:pos...", verbose=verbose)
430
439
  check_dataframe_shape(sumstats, log, verbose)
431
- if verbose: log.write(" -Which variant to keep: ", keep )
440
+ log.write(" -Which variant to keep: ", keep , verbose=verbose)
432
441
  sumstats = sumstats.loc[(~sumstats[[chrom,pos]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos], keep=keep)),:]
433
442
  after_number=len(sumstats)
434
- if verbose: log.write(" -Removed ",pre_number -after_number," multiallelic variants...")
443
+ log.write(" -Removed ",pre_number -after_number," multiallelic variants...", verbose=verbose)
435
444
  after_number=len(sumstats)
436
445
 
437
446
  # resort the coordinates
438
- if verbose: log.write(" -Removed ",total_number -after_number," variants in total.")
447
+ log.write(" -Removed ",total_number -after_number," variants in total.", verbose=verbose)
439
448
  if keep_col is not None :
440
- if verbose: log.write(" -Sort the coordinates based on CHR and POS...")
449
+ log.write(" -Sort the coordinates based on CHR and POS...", verbose=verbose)
441
450
  sumstats = sortcoordinate(sumstats,verbose=False)
442
451
 
443
452
  if "n" in mode or remove==True:
444
453
  # if remove==True, remove NAs
445
- if verbose: log.write(" -Removing NAs...")
454
+ log.write(" -Removing NAs...", verbose=verbose)
446
455
  pre_number =len(sumstats)
447
456
  specified_columns = []
448
457
  if "d" in mode:
@@ -466,7 +475,7 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
466
475
  specified_columns.append(nea)
467
476
  sumstats = sumstats.loc[~sumstats[specified_columns].isna().any(axis=1),:]
468
477
  after_number=len(sumstats)
469
- if verbose: log.write(" -Removed ",pre_number -after_number," variants with NA values in {} .".format(set(specified_columns)))
478
+ log.write(" -Removed ",pre_number -after_number," variants with NA values in {} .".format(set(specified_columns)), verbose=verbose)
470
479
 
471
480
  finished(log,verbose,_end_line)
472
481
  return sumstats
@@ -495,27 +504,23 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
495
504
  #chrom_list = get_chr_list() #bottom
496
505
  if chrom_list is None:
497
506
  chrom_list = get_chr_list()
498
- #if check_col(sumstats,chrom,status) is not True:
499
- # if verbose: log.write(".fix_chr: Specified not detected..skipping...")
500
- # return sumstats
501
-
502
507
 
503
508
  # convert to string datatype
504
509
  try:
505
- if verbose: log.write(" -Checking CHR data type...")
510
+ log.write(" -Checking CHR data type...", verbose=verbose)
506
511
  if sumstats[chrom].dtype == "string":
507
512
  pass
508
513
  else:
509
514
  sumstats[chrom] = sumstats[chrom].astype("string")
510
515
  except:
511
- if verbose: log.write(" -Force converting to pd string data type...")
516
+ log.write(" -Force converting to pd string data type...", verbose=verbose)
512
517
  sumstats[chrom] = sumstats[chrom].astype("string")
513
518
 
514
519
  # check if CHR is numeric
515
520
  is_chr_fixed = sumstats[chrom].str.isnumeric()
516
521
  # fill NAs with False
517
522
  is_chr_fixed = is_chr_fixed.fillna(False)
518
- if verbose: log.write(" -Variants with standardized chromosome notation:",sum(is_chr_fixed))
523
+ log.write(" -Variants with standardized chromosome notation:",sum(is_chr_fixed), verbose=verbose)
519
524
 
520
525
  # if there are variants whose CHR need to be fixed
521
526
  if sum(is_chr_fixed)<len(sumstats):
@@ -524,7 +529,7 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
524
529
  chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'^(chr)?(\d{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
525
530
 
526
531
  is_chr_fixable = ~chr_extracted.isna()
527
- if verbose: log.write(" -Variants with fixable chromosome notations:",sum(is_chr_fixable))
532
+ log.write(" -Variants with fixable chromosome notations:",sum(is_chr_fixable), verbose=verbose)
528
533
 
529
534
  # For not fixed variants, check if na
530
535
  is_chr_na = sumstats.loc[~is_chr_fixed, chrom].isna()
@@ -534,13 +539,13 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
534
539
  # Check variants with CHR being not NA and not fixable
535
540
  is_chr_invalid = (~is_chr_fixable)&(~is_chr_na)
536
541
  if sum(is_chr_invalid)>0 and verbose:
537
- log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid))
542
+ log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid), verbose=verbose)
538
543
  try:
539
- log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()))
544
+ log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()), verbose=verbose)
540
545
  except:
541
546
  pass
542
- elif verbose:
543
- log.write(" -No unrecognized chromosome notations...")
547
+ else:
548
+ log.write(" -No unrecognized chromosome notations...", verbose=verbose)
544
549
 
545
550
  # Assign good chr back to sumstats
546
551
  sumstats.loc[is_chr_fixable.index,chrom] = chr_extracted[is_chr_fixable.index]
@@ -553,23 +558,23 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
553
558
 
554
559
  # if sumstats contain sex CHR
555
560
  if sum(sex_chr)>0:
556
- if verbose: log.write(" -Identifying non-autosomal chromosomes : {}, {}, and {} ...".format(x[0],y[0],mt[0]))
557
- if verbose: log.write(" -Identified ",str(sum(sex_chr))," variants on sex chromosomes...")
561
+ log.write(" -Identifying non-autosomal chromosomes : {}, {}, and {} ...".format(x[0],y[0],mt[0]), verbose=verbose)
562
+ log.write(" -Identified ",str(sum(sex_chr))," variants on sex chromosomes...", verbose=verbose)
558
563
 
559
564
  # convert "X, Y, MT" to numbers
560
565
  convert_num_to_xymt={}
561
566
  if x[0].lower() in sumstats[chrom].values or x[0].upper() in sumstats[chrom].values:
562
567
  convert_num_to_xymt[x[0].lower()] = str(x[1])
563
568
  convert_num_to_xymt[x[0].upper()] = str(x[1])
564
- if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(x[0], x[1]))
569
+ log.write(" -Standardizing sex chromosome notations: {} to {}...".format(x[0], x[1]), verbose=verbose)
565
570
  if y[0].lower() in sumstats[chrom].values or y[0].upper() in sumstats[chrom].values:
566
571
  convert_num_to_xymt[y[0].lower()] = str(y[1])
567
572
  convert_num_to_xymt[y[0].upper()] = str(y[1])
568
- if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(y[0], y[1]))
573
+ log.write(" -Standardizing sex chromosome notations: {} to {}...".format(y[0], y[1]), verbose=verbose)
569
574
  if mt[0].lower() in sumstats[chrom].values or mt[0].upper() in sumstats[chrom].values:
570
575
  convert_num_to_xymt[mt[0].lower()] = str(mt[1])
571
576
  convert_num_to_xymt[mt[0].upper()] = str(mt[1])
572
- if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(mt[0], mt[1]))
577
+ log.write(" -Standardizing sex chromosome notations: {} to {}...".format(mt[0], mt[1]), verbose=verbose)
573
578
  sumstats.loc[sex_chr,chrom] =sumstats.loc[sex_chr,chrom].map(convert_num_to_xymt)
574
579
 
575
580
  # change status code
@@ -584,19 +589,19 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
584
589
  if (remove is True) and unrecognized_num>0:
585
590
  # remove variants with unrecognized CHR
586
591
  try:
587
- if verbose: log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])))
592
+ log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])), verbose=verbose)
588
593
  except:
589
594
  pass
590
- if verbose: log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.")
595
+ log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.", verbose=verbose)
591
596
  try:
592
- log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()))
597
+ log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()), verbose=verbose)
593
598
  except:
594
599
  pass
595
600
  #sumstats = sumstats.loc[sumstats.index[sumstats[chrom].isin(chrom_list)],:]
596
601
  good_chr = sumstats[chrom].isin(chrom_list)
597
602
  sumstats = sumstats.loc[good_chr, :].copy()
598
603
  else:
599
- if verbose: log.write(" -All CHR are already fixed...")
604
+ log.write(" -All CHR are already fixed...", verbose=verbose)
600
605
  sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
601
606
 
602
607
  # Convert string to int
@@ -610,8 +615,8 @@ def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",
610
615
  out_of_range_chr = sumstats[chrom] < minchr
611
616
  out_of_range_chr = out_of_range_chr.fillna(False)
612
617
  if sum(out_of_range_chr)>0:
613
- if verbose: log.write(" -Sanity check for CHR...")
614
- if verbose:log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr))
618
+ log.write(" -Sanity check for CHR...", verbose=verbose)
619
+ log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr), verbose=verbose)
615
620
  sumstats = sumstats.loc[~out_of_range_chr,:]
616
621
 
617
622
  finished(log,verbose,_end_line)
@@ -649,17 +654,17 @@ def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True, lower_
649
654
  if str(sumstats[pos].dtype) == "string" or str(sumstats[pos].dtype) == "object":
650
655
  sumstats[pos] = sumstats[pos].astype('string')
651
656
  # if so, remove thousands separator
652
- if verbose: log.write(' -Removing thousands separator "," or underbar "_" ...')
657
+ log.write(' -Removing thousands separator "," or underbar "_" ...', verbose=verbose)
653
658
  sumstats.loc[~is_pos_na, pos] = sumstats.loc[~is_pos_na, pos].str.replace(r'[,_]', '' ,regex=True)
654
659
  except:
655
660
  pass
656
661
 
657
662
  # convert POS to integer
658
663
  try:
659
- if verbose: log.write(' -Converting to Int64 data type ...')
664
+ log.write(' -Converting to Int64 data type ...', verbose=verbose)
660
665
  sumstats[pos] = sumstats[pos].astype('Int64')
661
666
  except:
662
- if verbose: log.write(' -Force converting to Int64 data type ...')
667
+ log.write(' -Force converting to Int64 data type ...', verbose=verbose)
663
668
  sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
664
669
  is_pos_fixed = ~sumstats[pos].isna()
665
670
  is_pos_invalid = (~is_pos_na)&(~is_pos_fixed)
@@ -668,16 +673,16 @@ def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True, lower_
668
673
  sumstats.loc[is_pos_invalid,status] = vchange_status(sumstats.loc[is_pos_invalid,status],4,"975","842")
669
674
 
670
675
  # remove outlier, limit:250,000,000
671
- if verbose: log.write(" -Position bound:({} , {:,})".format(lower_limit, upper_limit))
676
+ log.write(" -Position bound:({} , {:,})".format(lower_limit, upper_limit), verbose=verbose)
672
677
  is_pos_na = sumstats[pos].isna()
673
678
  out_lier= ((sumstats[pos]<=lower_limit) | (sumstats[pos]>=upper_limit)) & (~is_pos_na)
674
- if verbose: log.write(" -Removed outliers:",sum(out_lier))
679
+ log.write(" -Removed outliers:",sum(out_lier), verbose=verbose)
675
680
  sumstats = sumstats.loc[~out_lier,:]
676
681
  #remove na
677
682
  if remove is True:
678
683
  sumstats = sumstats.loc[~sumstats[pos].isna(),:]
679
684
  remain_var_num = len(sumstats)
680
- if verbose: log.write(" -Removed "+str(all_var_num - remain_var_num)+" variants with bad positions.")
685
+ log.write(" -Removed "+str(all_var_num - remain_var_num)+" variants with bad positions.", verbose=verbose)
681
686
 
682
687
  finished(log,verbose,_end_line)
683
688
  return sumstats
@@ -706,15 +711,15 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
706
711
  # ea_missing = sum(sumstats[ea].isna())
707
712
  # nea_missing = sum(sumstats[nea].isna())
708
713
  # if sum(ea_missing)>0:
709
- # if verbose: log.write(" -Converting {} missing EA to letter N.".format(ea_missing))
714
+ # log.write(" -Converting {} missing EA to letter N.".format(ea_missing))
710
715
  # sumstats[ea] = sumstats[ea].add_categories("N").fillna("N")
711
716
  # if sum(sumstats[nea].isna())>0:
712
- # if verbose: log.write(" -Converting {} missing NEA to letter N.".format(nea_missing))
717
+ # log.write(" -Converting {} missing NEA to letter N.".format(nea_missing))
713
718
  # sumstats[nea] = sumstats[nea].add_categories("N").fillna("N")
714
719
  #except:
715
720
  # pass
716
721
 
717
- if verbose: log.write(" -Converted all bases to string datatype and UPPERCASE.")
722
+ log.write(" -Converted all bases to string datatype and UPPERCASE.", verbose=verbose)
718
723
  categories = set(sumstats[ea].str.upper())|set(sumstats[nea].str.upper())|set("N")
719
724
  categories = {x for x in categories if pd.notna(x)}
720
725
  sumstats[ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
@@ -743,22 +748,21 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
743
748
 
744
749
  exclude = bad_nea | bad_ea
745
750
 
746
- if verbose:
747
- if len(set(sumstats.loc[bad_ea,ea].head())) >0:
748
- log.write(" -A look at the non-ATCG EA:",set(sumstats.loc[bad_ea,ea].head()),"...")
749
- if len(set(sumstats.loc[bad_nea,nea].head())) >0:
750
- log.write(" -A look at the non-ATCG NEA:",set(sumstats.loc[bad_nea,nea].head()),"...")
751
+ if len(set(sumstats.loc[bad_ea,ea].head())) >0:
752
+ log.write(" -A look at the non-ATCG EA:",set(sumstats.loc[bad_ea,ea].head()),"...", verbose=verbose)
753
+ if len(set(sumstats.loc[bad_nea,nea].head())) >0:
754
+ log.write(" -A look at the non-ATCG NEA:",set(sumstats.loc[bad_nea,nea].head()),"...", verbose=verbose)
751
755
 
752
756
  if remove == True:
753
757
  sumstats = sumstats.loc[(good_ea & good_nea),:].copy()
754
758
  good_eanea_num = len(sumstats)
755
- if verbose: log.write(" -Removed "+str(all_var_num - good_eanea_num)+" variants with NA alleles or alleles that contain bases other than A/C/T/G.")
759
+ log.write(" -Removed "+str(all_var_num - good_eanea_num)+" variants with NA alleles or alleles that contain bases other than A/C/T/G.", verbose=verbose)
756
760
  sumstats = sumstats.loc[(good_ea & good_nea & (~not_variant)),:].copy()
757
761
  good_eanea_notsame_num = len(sumstats)
758
- if verbose: log.write(" -Removed "+str(good_eanea_num - good_eanea_notsame_num)+" variants with same allele for EA and NEA.")
762
+ log.write(" -Removed "+str(good_eanea_num - good_eanea_notsame_num)+" variants with same allele for EA and NEA.", verbose=verbose)
759
763
  else:
760
764
  sumstats[[ea,nea]] = sumstats[[ea,nea]].fillna("N")
761
- if verbose: log.write(" -Detected "+str(sum(exclude))+" variants with alleles that contain bases other than A/C/T/G .")
765
+ log.write(" -Detected "+str(sum(exclude))+" variants with alleles that contain bases other than A/C/T/G .", verbose=verbose)
762
766
  categories = set(sumstats[ea].str.upper())|set(sumstats[nea].str.upper())|set("N")
763
767
  sumstats[ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
764
768
  sumstats[nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
@@ -811,8 +815,8 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
811
815
  #r'\w\w\w\w[45]\w\w'
812
816
  variants_to_check = sumstats[status].str[4].str.match(r'4|5', case=False, flags=0, na=False)
813
817
  if sum(variants_to_check)==0:
814
- if verbose: log.write(" -No available variants to normalize..")
815
- if verbose: log.write("Finished normalizing variants successfully!")
818
+ log.write(" -No available variants to normalize..", verbose=verbose)
819
+ log.write("Finished normalizing variants successfully!", verbose=verbose)
816
820
  return sumstats
817
821
  ###############################################################################################################
818
822
  if sum(variants_to_check)>0:
@@ -827,29 +831,28 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
827
831
  pool.join()
828
832
  ###############################################################################################################
829
833
 
830
- if verbose:
831
- before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
832
- changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
833
- if changed_num>0:
834
- if snpid in sumstats.columns:
835
- before_normalize_id = sumstats.loc[variants_to_check,snpid]
836
- elif rsid in sumstats.columns:
837
- before_normalize_id = sumstats.loc[variants_to_check,rsid]
838
- else:
839
- before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
840
-
841
- log.write(" -Not normalized allele IDs:",end="")
842
- for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
843
- log.write(i,end=" ",show_time=False)
844
- log.write("... \n",end="",show_time=False)
845
-
846
- log.write(" -Not normalized allele:",end="")
847
- for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
848
- log.write(i,end="",show_time=False)
849
- log.write("... \n",end="",show_time=False)
850
- log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.")
834
+ before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
835
+ changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
836
+ if changed_num>0:
837
+ if snpid in sumstats.columns:
838
+ before_normalize_id = sumstats.loc[variants_to_check,snpid]
839
+ elif rsid in sumstats.columns:
840
+ before_normalize_id = sumstats.loc[variants_to_check,rsid]
851
841
  else:
852
- log.write(" -All variants are already normalized..")
842
+ before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
843
+
844
+ log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
845
+ for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
846
+ log.write(i,end=" ",show_time=False)
847
+ log.write("... \n",end="",show_time=False, verbose=verbose)
848
+
849
+ log.write(" -Not normalized allele:",end="", verbose=verbose)
850
+ for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
851
+ log.write(i,end="",show_time=False, verbose=verbose)
852
+ log.write("... \n",end="",show_time=False, verbose=verbose)
853
+ log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
854
+ else:
855
+ log.write(" -All variants are already normalized..", verbose=verbose)
853
856
  ###################################################################################################################
854
857
  categories = set(sumstats[ea])|set(sumstats[nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
855
858
  sumstats[ea] = pd.Categorical(sumstats[ea],categories = categories)
@@ -931,13 +934,13 @@ def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, ver
931
934
  if header in coltocheck and header in sumstats.columns:
932
935
  cols_to_check.append(header)
933
936
  if header=="STATUS":
934
- if verbose: log.write(" -Checking STATUS and converting STATUS to categories....")
937
+ log.write(" -Checking STATUS and converting STATUS to categories....", verbose=verbose)
935
938
  categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
936
939
  sumstats[header] = pd.Categorical(sumstats[header],categories=categories)
937
940
  return sumstats
938
941
 
939
942
  if dtype in ["Int64","Int32","int","int32","in64"]:
940
- if verbose: log.write(" -Checking if {} <= {} <= {} ...".format( var_range[0] ,header, var_range[1]))
943
+ log.write(" -Checking if {} <= {} <= {} ...".format( var_range[0] ,header, var_range[1]), verbose=verbose)
941
944
  sumstats[header] = np.floor(pd.to_numeric(sumstats[header], errors='coerce')).astype(dtype)
942
945
 
943
946
  elif dtype in ["Float64","Float32","float","float64","float32"]:
@@ -1033,7 +1036,7 @@ def sanitycheckstats(sumstats,
1033
1036
  if is_enough_info == False: return sumstats
1034
1037
  ############################################################################################
1035
1038
 
1036
- if verbose: log.write(" -Comparison tolerance for floats: {}".format(float_tolerence))
1039
+ log.write(" -Comparison tolerance for floats: {}".format(float_tolerence), verbose=verbose)
1037
1040
  eaf = add_tolerence(eaf, float_tolerence, "lr")
1038
1041
  maf = add_tolerence(maf, float_tolerence, "lr")
1039
1042
  beta = add_tolerence(beta, float_tolerence, "lr")
@@ -1166,7 +1169,7 @@ def _check_data_consistency(sumstats, beta="BETA", se="SE", p="P",mlog10p="MLOG1
1166
1169
  check_status=1
1167
1170
 
1168
1171
  if "N" in sumstats.columns and "N_CONTROL" in sumstats.columns and "N_CASE" in sumstats.columns:
1169
- if verbose: log.write(" -Checking if N is consistent with N_CASE + N_CONTROL ...")
1172
+ log.write(" -Checking if N is consistent with N_CASE + N_CONTROL ...", verbose=verbose)
1170
1173
  is_close = sumstats["N"] == sumstats["N_CASE"] + sumstats["N_CONTROL"]
1171
1174
  #is_close = np.isclose(sumstats["N"], sumstats["N_CASE"] + sumstats["N_CONTROL"] , rtol=rtol, atol=atol, equal_nan=equal_nan)
1172
1175
  diff = abs(sumstats["N"] - (sumstats["N_CASE"] + sumstats["N_CONTROL"] ))
@@ -1208,55 +1211,55 @@ def flip_direction(string):
1208
1211
 
1209
1212
  def flip_by_swap(sumstats, matched_index, log, verbose):
1210
1213
  if ("NEA" in sumstats.columns) and ("EA" in sumstats.columns) :
1211
- if verbose: log.write(" -Swapping column: NEA <=> EA...")
1214
+ log.write(" -Swapping column: NEA <=> EA...", verbose=verbose)
1212
1215
  sumstats.loc[matched_index,['NEA','EA']] = sumstats.loc[matched_index,['EA','NEA']].values
1213
1216
  return sumstats
1214
1217
 
1215
1218
  def flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1):
1216
1219
  if "OR" in sumstats.columns:
1217
- if verbose: log.write(" -Flipping column: OR = 1 / OR...")
1220
+ log.write(" -Flipping column: OR = 1 / OR...", verbose=verbose)
1218
1221
  sumstats.loc[matched_index,"OR"] = factor / sumstats.loc[matched_index,"OR"].values
1219
1222
  if "OR_95L" in sumstats.columns:
1220
- if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95L...")
1223
+ log.write(" -Flipping column: OR_95U = 1 / OR_95L...", verbose=verbose)
1221
1224
  sumstats.loc[matched_index,"OR_95U"] = factor / sumstats.loc[matched_index,"OR_95L"].values
1222
1225
  if "OR_95U" in sumstats.columns:
1223
- if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95U...")
1226
+ log.write(" -Flipping column: OR_95L = 1 / OR_95U...", verbose=verbose)
1224
1227
  sumstats.loc[matched_index,"OR_95L"] = factor / sumstats.loc[matched_index,"OR_95U"].values
1225
1228
  if "HR" in sumstats.columns:
1226
- if verbose: log.write(" -Flipping column: HR = 1 / HR...")
1229
+ log.write(" -Flipping column: HR = 1 / HR...", verbose=verbose)
1227
1230
  sumstats.loc[matched_index,"HR"] = factor / sumstats.loc[matched_index,"HR"].values
1228
1231
  if "HR_95L" in sumstats.columns:
1229
- if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95L...")
1232
+ log.write(" -Flipping column: HR_95U = 1 / HR_95L...", verbose=verbose)
1230
1233
  sumstats.loc[matched_index,"HR_95U"] = factor / sumstats.loc[matched_index,"HR_95L"].values
1231
1234
  if "HR_95U" in sumstats.columns:
1232
- if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95U...")
1235
+ log.write(" -Flipping column: HR_95L = 1 / HR_95U...", verbose=verbose)
1233
1236
  sumstats.loc[matched_index,"HR_95L"] = factor / sumstats.loc[matched_index,"HR_95U"].values
1234
1237
  return sumstats
1235
1238
 
1236
1239
  def flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1):
1237
1240
  if "EAF" in sumstats.columns:
1238
- if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
1241
+ log.write(" -Flipping column: EAF = 1 - EAF...", verbose=verbose)
1239
1242
  sumstats.loc[matched_index,"EAF"] = factor - sumstats.loc[matched_index,"EAF"].values
1240
1243
  return sumstats
1241
1244
 
1242
1245
  def flip_by_sign(sumstats, matched_index, log, verbose, cols=None):
1243
1246
  if "BETA" in sumstats.columns:
1244
- if verbose: log.write(" -Flipping column: BETA = - BETA...")
1247
+ log.write(" -Flipping column: BETA = - BETA...", verbose=verbose)
1245
1248
  sumstats.loc[matched_index,"BETA"] = - sumstats.loc[matched_index,"BETA"].values
1246
1249
  if "BETA_95L" in sumstats.columns:
1247
- if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95L...")
1250
+ log.write(" -Flipping column: BETA_95U = - BETA_95L...", verbose=verbose)
1248
1251
  sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95L"].values
1249
1252
  if "BETA_95U" in sumstats.columns:
1250
- if verbose: log.write(" -Flipping column: BETA_95L = - BETA_95U...")
1253
+ log.write(" -Flipping column: BETA_95L = - BETA_95U...", verbose=verbose)
1251
1254
  sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95U"].values
1252
1255
  if "Z" in sumstats.columns:
1253
- if verbose: log.write(" -Flipping column: Z = - Z...")
1256
+ log.write(" -Flipping column: Z = - Z...", verbose=verbose)
1254
1257
  sumstats.loc[matched_index,"Z"] = - sumstats.loc[matched_index,"Z"].values
1255
1258
  if "T" in sumstats.columns:
1256
- if verbose: log.write(" -Flipping column: T = - T...")
1259
+ log.write(" -Flipping column: T = - T...", verbose=verbose)
1257
1260
  sumstats.loc[matched_index,"Z"] = - sumstats.loc[matched_index,"T"].values
1258
1261
  if "DIRECTION" in sumstats.columns:
1259
- if verbose: log.write(" -Flipping column: DIRECTION +-?0 <=> -+?0 ...")
1262
+ log.write(" -Flipping column: DIRECTION +-?0 <=> -+?0 ...", verbose=verbose)
1260
1263
  sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
1261
1264
  return sumstats
1262
1265
 
@@ -1265,7 +1268,7 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1265
1268
  _start_line = "adjust statistics based on STATUS code"
1266
1269
  _end_line = "adjusting statistics based on STATUS code"
1267
1270
  _start_cols =[]
1268
- _start_function = ".check_data_consistency()"
1271
+ _start_function = ".flip_allele_stats()"
1269
1272
  _must_args ={}
1270
1273
 
1271
1274
  is_enough_info = start_to(sumstats=sumstats,
@@ -1285,10 +1288,10 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1285
1288
  #matched_index = status_match(sumstats[status],6,[4,5]) #
1286
1289
  matched_index = sumstats[status].str[5].str.match(r"4|5")
1287
1290
  if sum(matched_index)>0:
1288
- if verbose: log.write("Start to convert alleles to reverse complement for SNPs with status xxxxx[45]x...{}".format(_get_version()))
1289
- if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
1291
+ log.write("Start to convert alleles to reverse complement for SNPs with status xxxxx[45]x...{}".format(_get_version()), verbose=verbose)
1292
+ log.write(" -Flipping "+ str(sum(matched_index)) +" variants...", verbose=verbose)
1290
1293
  if ("NEA" in sumstats.columns) and ("EA" in sumstats.columns) :
1291
- if verbose: log.write(" -Converting to reverse complement : EA and NEA...")
1294
+ log.write(" -Converting to reverse complement : EA and NEA...", verbose=verbose)
1292
1295
  reverse_complement_nea = sumstats.loc[matched_index,'NEA'].apply(lambda x :get_reverse_complementary_allele(x))
1293
1296
  reverse_complement_ea = sumstats.loc[matched_index,'EA'].apply(lambda x :get_reverse_complementary_allele(x))
1294
1297
  categories = set(sumstats['EA'])|set(sumstats['NEA']) |set(reverse_complement_ea) |set(reverse_complement_nea)
@@ -1297,15 +1300,15 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1297
1300
  sumstats.loc[matched_index,['NEA']] = reverse_complement_nea
1298
1301
  sumstats.loc[matched_index,['EA']] = reverse_complement_ea
1299
1302
  sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 6, "4","2")
1300
- if verbose: log.write(" -Changed the status for flipped variants : xxxxx4x -> xxxxx2x")
1303
+ log.write(" -Changed the status for flipped variants : xxxxx4x -> xxxxx2x", verbose=verbose)
1301
1304
  if_stats_flipped = True
1302
1305
  ###################flip ref####################
1303
1306
  pattern = r"\w\w\w\w\w[35]\w"
1304
1307
  #matched_index = status_match(sumstats[status],6,[3,5]) #sumstats[status].str.match(pattern)
1305
1308
  matched_index = sumstats[status].str[5].str.match(r"3|5")
1306
1309
  if sum(matched_index)>0:
1307
- if verbose: log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x: ALT->EA , REF->NEA ...{}".format(_get_version()))
1308
- if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
1310
+ log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x: ALT->EA , REF->NEA ...{}".format(_get_version()), verbose=verbose)
1311
+ log.write(" -Flipping "+ str(sum(matched_index)) +" variants...", verbose=verbose)
1309
1312
 
1310
1313
  flip_by_swap(sumstats, matched_index, log, verbose)
1311
1314
  flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
@@ -1313,7 +1316,7 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1313
1316
  flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
1314
1317
 
1315
1318
  #change status
1316
- if verbose: log.write(" -Changed the status for flipped variants : xxxxx[35]x -> xxxxx[12]x")
1319
+ log.write(" -Changed the status for flipped variants : xxxxx[35]x -> xxxxx[12]x", verbose=verbose)
1317
1320
  sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 6, "35","12")
1318
1321
  if_stats_flipped = True
1319
1322
 
@@ -1322,8 +1325,8 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1322
1325
  #matched_index = status_match(sumstats[status],6,[1,2,3])|status_match(sumstats[status],6,[6,7])|status_match(sumstats[status],7,6) #sumstats[status].str.match(pattern)
1323
1326
  matched_index = sumstats[status].str[4:].str.match(r"[123][67]6")
1324
1327
  if sum(matched_index)>0:
1325
- if verbose: log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]: ALT->EA , REF->NEA...{}".format(_get_version()))
1326
- if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
1328
+ log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]: ALT->EA , REF->NEA...{}".format(_get_version()), verbose=verbose)
1329
+ log.write(" -Flipping "+ str(sum(matched_index)) +" variants...", verbose=verbose)
1327
1330
 
1328
1331
  flip_by_swap(sumstats, matched_index, log, verbose)
1329
1332
  flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
@@ -1331,7 +1334,7 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1331
1334
  flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
1332
1335
 
1333
1336
  #change status
1334
- if verbose: log.write(" -Changed the status for flipped variants xxxx[123][67]6 -> xxxx[123][67]4")
1337
+ log.write(" -Changed the status for flipped variants xxxx[123][67]6 -> xxxx[123][67]4", verbose=verbose)
1335
1338
  sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 7, "6","4")
1336
1339
  if_stats_flipped = True
1337
1340
  # flip ref
@@ -1340,24 +1343,23 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1340
1343
  #matched_index = status_match(sumstats[status],6,[0,1,2]) | status_match(sumstats[status],7,[5])#sumstats[status].str.match(pattern)
1341
1344
  matched_index = sumstats[status].str[5:].str.match(r"05|15|25")
1342
1345
  if sum(matched_index)>0:
1343
- if verbose: log.write("Start to flip allele-specific stats for palindromic SNPs with status xxxxx[12]5: (-)strand <=> (+)strand...{}".format(_get_version()))
1344
- if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
1346
+ log.write("Start to flip allele-specific stats for palindromic SNPs with status xxxxx[12]5: (-)strand <=> (+)strand...{}".format(_get_version()), verbose=verbose)
1347
+ log.write(" -Flipping "+ str(sum(matched_index)) +" variants...", verbose=verbose)
1345
1348
 
1346
1349
  flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
1347
1350
  flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
1348
1351
  flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
1349
1352
 
1350
1353
  #change status
1351
- if verbose: log.write(" -Changed the status for flipped variants: xxxxx[012]5: -> xxxxx[012]2")
1354
+ log.write(" -Changed the status for flipped variants: xxxxx[012]5: -> xxxxx[012]2", verbose=verbose)
1352
1355
  sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 7, "5","2")
1353
1356
  if_stats_flipped = True
1354
1357
 
1355
- if if_stats_flipped == True:
1356
- finished(log, verbose, "adjusting")
1357
- else:
1358
- finished(log, verbose, "adjusting with no statistics changed.")
1358
+ if if_stats_flipped != True:
1359
+ log.write(" -No statistics have been changed.")
1360
+
1361
+ finished(log, verbose, _end_line)
1359
1362
  return sumstats
1360
- ""
1361
1363
 
1362
1364
 
1363
1365
  ###############################################################################################################
@@ -1414,12 +1416,12 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
1414
1416
  if is_enough_info == False: return sumstats
1415
1417
  ############################################################################################
1416
1418
 
1417
- if verbose: log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build)
1419
+ log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build, verbose=verbose)
1418
1420
  # valid chr and pos
1419
1421
  pattern = r"\w\w\w0\w\w\w"
1420
1422
  to_lift = sumstats[status].str.match(pattern)
1421
1423
  sumstats = sumstats.loc[to_lift,:].copy()
1422
- if verbose: log.write(" -Converting variants with status code xxx0xxx :"+str(len(sumstats))+"...")
1424
+ log.write(" -Converting variants with status code xxx0xxx :"+str(len(sumstats))+"...", verbose=verbose)
1423
1425
  ###########################################################################
1424
1426
  if sum(to_lift)>0:
1425
1427
  if sum(to_lift)<10000:
@@ -1438,7 +1440,7 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
1438
1440
  unmap_num = len(sumstats.loc[sumstats[pos].isna(),:])
1439
1441
 
1440
1442
  if remove is True:
1441
- if verbose: log.write(" -Removed unmapped variants: "+str(unmap_num))
1443
+ log.write(" -Removed unmapped variants: "+str(unmap_num), verbose=verbose)
1442
1444
  sumstats = sumstats.loc[~sumstats[pos].isna(),:]
1443
1445
 
1444
1446
  # after liftover check chr and pos
@@ -1473,7 +1475,7 @@ def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=
1473
1475
  if sumstats[pos].dtype == "Int64":
1474
1476
  pass
1475
1477
  else:
1476
- if verbose: log.write(" -Force converting POS to Int64...")
1478
+ log.write(" -Force converting POS to Int64...", verbose=verbose)
1477
1479
  sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
1478
1480
  except:
1479
1481
  pass
@@ -1511,7 +1513,7 @@ def sortcolumn(sumstats,verbose=True,log=Log(),order = None):
1511
1513
  if i in sumstats.columns: output_columns.append(i)
1512
1514
  for i in sumstats.columns:
1513
1515
  if i not in order: output_columns.append(i)
1514
- if verbose: log.write(" -Reordering columns to :", ",".join(output_columns))
1516
+ log.write(" -Reordering columns to :", ",".join(output_columns), verbose=verbose)
1515
1517
  sumstats = sumstats[ output_columns]
1516
1518
 
1517
1519
  finished(log,verbose,_end_line)