gwaslab 3.4.37__py3-none-any.whl → 3.4.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (37)
  1. gwaslab/data/formatbook.json +722 -721
  2. gwaslab/g_Log.py +8 -0
  3. gwaslab/g_Sumstats.py +26 -147
  4. gwaslab/g_SumstatsPair.py +6 -2
  5. gwaslab/g_Sumstats_summary.py +3 -3
  6. gwaslab/g_version.py +2 -2
  7. gwaslab/hm_casting.py +29 -15
  8. gwaslab/hm_harmonize_sumstats.py +291 -163
  9. gwaslab/hm_rsid_to_chrpos.py +1 -1
  10. gwaslab/io_preformat_input.py +43 -37
  11. gwaslab/io_to_formats.py +428 -295
  12. gwaslab/qc_check_datatype.py +3 -3
  13. gwaslab/qc_fix_sumstats.py +793 -682
  14. gwaslab/util_ex_calculate_ldmatrix.py +29 -11
  15. gwaslab/util_ex_gwascatalog.py +1 -1
  16. gwaslab/util_ex_ldproxyfinder.py +1 -1
  17. gwaslab/util_ex_process_ref.py +3 -3
  18. gwaslab/util_ex_run_coloc.py +26 -4
  19. gwaslab/util_in_convert_h2.py +1 -1
  20. gwaslab/util_in_fill_data.py +2 -2
  21. gwaslab/util_in_filter_value.py +122 -34
  22. gwaslab/util_in_get_density.py +2 -2
  23. gwaslab/util_in_get_sig.py +41 -9
  24. gwaslab/viz_aux_quickfix.py +24 -19
  25. gwaslab/viz_aux_reposition_text.py +7 -4
  26. gwaslab/viz_aux_save_figure.py +6 -5
  27. gwaslab/viz_plot_compare_af.py +5 -5
  28. gwaslab/viz_plot_miamiplot2.py +28 -20
  29. gwaslab/viz_plot_mqqplot.py +109 -72
  30. gwaslab/viz_plot_qqplot.py +11 -8
  31. gwaslab/viz_plot_regionalplot.py +3 -1
  32. gwaslab/viz_plot_trumpetplot.py +15 -6
  33. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/METADATA +2 -2
  34. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/RECORD +37 -37
  35. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
  36. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
  37. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
@@ -47,14 +47,14 @@ def _process_build(build,log,verbose):
47
47
  log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
48
48
  final_build = "38"
49
49
  else:
50
- log.write(" -WARNING! Version of genomic coordinates is unknown...", verbose=verbose)
50
+ log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
51
51
  final_build = "99"
52
52
  return final_build
53
53
 
54
54
  def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
55
55
  build = _process_build(build,log=log,verbose=verbose)
56
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status], 1, "139",build[0]*3)
57
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status], 2, "89",build[1]*3)
56
+ sumstats[status] = vchange_status(sumstats[status], 1, "139",build[0]*3)
57
+ sumstats[status] = vchange_status(sumstats[status], 2, "89",build[1]*3)
58
58
  return sumstats, build
59
59
 
60
60
  def fixID(sumstats,
@@ -66,35 +66,49 @@ def fixID(sumstats,
66
66
  2. fix chr and pos using snpid
67
67
  3. checking rsid and chr:pos:nea:ea
68
68
  '''
69
- if verbose: log.write("Start to check IDs...{}".format(_get_version()))
70
- check_dataframe_shape(sumstats, log, verbose)
71
- check_col(sumstats,[snpid,rsid],status)
69
+ ##start function with col checking##########################################################
70
+ _start_line = "check SNPID/rsID"
71
+ _end_line = "checking SNPID/rsID"
72
+ _start_cols =[]
73
+ _start_function = ".fix_id()"
74
+ _must_args ={}
75
+
76
+ is_enough_info = start_to(sumstats=sumstats,
77
+ log=log,
78
+ verbose=verbose,
79
+ start_line=_start_line,
80
+ end_line=_end_line,
81
+ start_cols=_start_cols,
82
+ start_function=_start_function,
83
+ **_must_args)
84
+ if is_enough_info == False: return sumstats
85
+ ############################################################################################
72
86
 
73
87
  ############################ checking datatype ###################################################
74
88
  if rsid in sumstats.columns:
75
89
  # convert to string datatype
76
90
  try:
77
91
  log.write(" -Checking rsID data type...",verbose=verbose)
78
- if sumstats.loc[:,rsid].dtype == "string":
92
+ if sumstats[rsid].dtype == "string":
79
93
  pass
80
94
  else:
81
95
  log.write(" -Converting rsID to pd.string data type...",verbose=verbose)
82
- sumstats.loc[:,rsid] = sumstats.loc[:,rsid].astype("string")
96
+ sumstats[rsid] = sumstats[rsid].astype("string")
83
97
  except:
84
98
  log.write(" -Force converting rsID to pd.string data type...",verbose=verbose)
85
- sumstats.loc[:,rsid] = sumstats.loc[:,rsid].astype("string")
99
+ sumstats[rsid] = sumstats[rsid].astype("string")
86
100
  if snpid in sumstats.columns:
87
101
  # convert to string datatype
88
102
  try:
89
103
  log.write(" -Checking SNPID data type...",verbose=verbose)
90
- if sumstats.loc[:,snpid].dtype == "string":
104
+ if sumstats[snpid].dtype == "string":
91
105
  pass
92
106
  else:
93
107
  log.write(" -Converting SNPID to pd.string data type...",verbose=verbose)
94
- sumstats.loc[:,snpid] = sumstats.loc[:,snpid].astype("string")
108
+ sumstats[snpid] = sumstats[snpid].astype("string")
95
109
  except:
96
110
  log.write(" -Force converting SNPID to pd.string data type...",verbose=verbose)
97
- sumstats.loc[:,snpid] = sumstats.loc[:,snpid].astype("string")
111
+ sumstats[snpid] = sumstats[snpid].astype("string")
98
112
 
99
113
  ############################ checking ###################################################
100
114
  if snpid in sumstats.columns:
@@ -141,7 +155,7 @@ def fixID(sumstats,
141
155
 
142
156
  elif (chrom not in sumstats.columns) and (pos in sumstats.columns):
143
157
  if verbose: log.write(" -Initiating CHR columns...")
144
- sumstats.loc[:,chrom]=pd.Series(dtype="string")
158
+ sumstats[chrom]=pd.Series(dtype="string")
145
159
  to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
146
160
  to_fix_num = sum(to_fix)
147
161
  if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
@@ -149,7 +163,7 @@ def fixID(sumstats,
149
163
 
150
164
  elif (chrom in sumstats.columns) and (pos not in sumstats.columns):
151
165
  if verbose: log.write(" -Initiating CHR and POS column...")
152
- sumstats.loc[:,pos]=pd.Series(dtype="Int64")
166
+ sumstats[pos]=pd.Series(dtype="Int64")
153
167
  to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
154
168
  to_fix_num = sum(to_fix)
155
169
  if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
@@ -157,8 +171,8 @@ def fixID(sumstats,
157
171
 
158
172
  else:
159
173
  if verbose: log.write(" -Initiating CHR and POS columns...")
160
- sumstats.loc[:,chrom]=pd.Series(dtype="string")
161
- sumstats.loc[:,pos]=pd.Series(dtype="Int64")
174
+ sumstats[chrom]=pd.Series(dtype="string")
175
+ sumstats[pos]=pd.Series(dtype="Int64")
162
176
  to_fix = is_chrposrefalt
163
177
  to_fix_num = sum(to_fix)
164
178
  if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
@@ -187,20 +201,20 @@ def fixID(sumstats,
187
201
  elif verbose: log.write(" -No fixable variants ...")
188
202
  elif (chrom not in sumstats.columns) and (pos in sumstats.columns):
189
203
  if verbose: log.write(" -Initiating CHR columns...")
190
- sumstats.loc[:,chrom]=pd.Series(dtype="string")
204
+ sumstats[chrom]=pd.Series(dtype="string")
191
205
  to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
192
206
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
193
207
  elif verbose: log.write(" -No fixable variants ...")
194
208
  elif (chrom in sumstats.columns) and (pos not in sumstats.columns):
195
209
  if verbose: log.write(" -Initiating CHR and POS column...")
196
- sumstats.loc[:,pos]=pd.Series(dtype="Int64")
210
+ sumstats[pos]=pd.Series(dtype="Int64")
197
211
  to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
198
212
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
199
213
  elif verbose: log.write(" -No fixable variants ...")
200
214
  else:
201
215
  if verbose: log.write(" -Initiating CHR and POS columns...")
202
- sumstats.loc[:,chrom]=pd.Series(dtype="string")
203
- sumstats.loc[:,pos]=pd.Series(dtype="Int64")
216
+ sumstats[chrom]=pd.Series(dtype="string")
217
+ sumstats[pos]=pd.Series(dtype="Int64")
204
218
  to_fix = is_rs_chrpos
205
219
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
206
220
  elif verbose: log.write(" -No fixable variants ...")
@@ -214,7 +228,7 @@ def fixID(sumstats,
214
228
 
215
229
  ############################ fixing chr pos###################################################
216
230
  if fixeanea == True:
217
- if verbose: log.write(" -WARNING! gwaslab assumes SNPID is in the format of CHR:POS:NEA:EA / CHR:POS:REF:ALT")
231
+ if verbose: log.warning("gwaslab assumes SNPID is in the format of CHR:POS:NEA:EA / CHR:POS:REF:ALT")
218
232
  if overwrite is True:
219
233
  if verbose: log.write(" -Overwrite mode is applied...")
220
234
  to_fix = is_chrposrefalt
@@ -223,12 +237,12 @@ def fixID(sumstats,
223
237
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
224
238
  elif (nea in sumstats.columns) and (ea not in sumstats.columns):
225
239
  if verbose: log.write(" -Initiating EA columns...")
226
- sumstats.loc[:,ea]=pd.Series(dtype="string")
240
+ sumstats[ea]=pd.Series(dtype="string")
227
241
  to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
228
242
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
229
243
  elif (nea not in sumstats.columns) and (ea in sumstats.columns):
230
244
  if verbose: log.write(" -Initiating NEA columns...")
231
- sumstats.loc[:,nea]=pd.Series(dtype="string")
245
+ sumstats[nea]=pd.Series(dtype="string")
232
246
  to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
233
247
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
234
248
  else:
@@ -260,21 +274,21 @@ def fixID(sumstats,
260
274
  if fixsep == True:
261
275
  if snpid in sumstats.columns:
262
276
  if verbose: log.write(' -Replacing [_-] in SNPID with ":" ...')
263
- sumstats.loc[:,snpid] = sumstats.loc[:,snpid].str.replace(r"[_-]",":",regex=True)
277
+ sumstats[snpid] = sumstats[snpid].str.replace(r"[_-]",":",regex=True)
264
278
 
265
279
  if fixprefix == True:
266
280
  if snpid in sumstats.columns:
267
281
  if verbose: log.write(' -Removing /^chr/ in SNPID ...')
268
- prefix_removed = sumstats.loc[:,snpid].str.extract(r'^(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
282
+ prefix_removed = sumstats[snpid].str.extract(r'^(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
269
283
  sumstats.loc[~prefix_removed.isna(),snpid] = prefix_removed[~prefix_removed.isna()]
270
284
 
271
285
  if fixid == True:
272
286
  if snpid not in sumstats.columns:
273
287
  # initiate a SNPID column
274
- sumstats.loc[:,snpid]=pd.Series(dtype="string")
288
+ sumstats[snpid]=pd.Series(dtype="string")
275
289
 
276
290
  if (rsid in sumstats.columns) and (sum(is_rs_chrpos)>0) :
277
- sumstats.loc[:,snpid]= sumstats.loc[is_rs_chrpos,rsid]
291
+ sumstats[snpid]= sumstats.loc[is_rs_chrpos,rsid]
278
292
 
279
293
  if (chrom in sumstats.columns) and (pos in sumstats.columns):
280
294
  #only fix when CHR and POS is available
@@ -329,7 +343,8 @@ def fixID(sumstats,
329
343
  after_number=sum(sumstats[snpid].isna())
330
344
  if verbose: log.write(" -Fixed "+ str(pre_number - after_number) +" variants ID...")
331
345
  elif verbose: log.write(" -ID unfixable: no CHR and POS columns or no SNPID. ")
332
- if verbose: log.write("Finished checking IDs successfully!")
346
+
347
+ finished(log,verbose,_end_line)
333
348
  return sumstats
334
349
 
335
350
  ""
@@ -344,8 +359,25 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
344
359
  remove duplicate SNPs based on 3. rsID
345
360
  remove multiallelic SNPs based on 4. CHR, POS
346
361
  '''
347
-
348
- if verbose: log.write("Start to remove duplicated/multiallelic variants...{}".format(_get_version()))
362
+
363
+ ##start function with col checking##########################################################
364
+ _start_line = "remove duplicated/multiallelic variants"
365
+ _end_line = "removing duplicated/multiallelic variants"
366
+ _start_cols =[]
367
+ _start_function = ".remove_dup()"
368
+ _must_args ={}
369
+
370
+ is_enough_info = start_to(sumstats=sumstats,
371
+ log=log,
372
+ verbose=verbose,
373
+ start_line=_start_line,
374
+ end_line=_end_line,
375
+ start_cols=_start_cols,
376
+ start_function=_start_function,
377
+ **_must_args)
378
+ if is_enough_info == False: return sumstats
379
+ ############################################################################################
380
+
349
381
  if verbose: log.write(" -Removing mode:{}".format(mode))
350
382
  # sort the variants using the specified column before removing
351
383
  if keep_col is not None :
@@ -397,7 +429,7 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
397
429
  if verbose: log.write("Start to remove multiallelic variants based on chr:pos...")
398
430
  check_dataframe_shape(sumstats, log, verbose)
399
431
  if verbose: log.write(" -Which variant to keep: ", keep )
400
- sumstats = sumstats.loc[(~sumstats.loc[:,[chrom,pos]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos], keep=keep)),:]
432
+ sumstats = sumstats.loc[(~sumstats[[chrom,pos]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos], keep=keep)),:]
401
433
  after_number=len(sumstats)
402
434
  if verbose: log.write(" -Removed ",pre_number -after_number," multiallelic variants...")
403
435
  after_number=len(sumstats)
@@ -435,300 +467,346 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
435
467
  sumstats = sumstats.loc[~sumstats[specified_columns].isna().any(axis=1),:]
436
468
  after_number=len(sumstats)
437
469
  if verbose: log.write(" -Removed ",pre_number -after_number," variants with NA values in {} .".format(set(specified_columns)))
438
- if verbose: log.write("Finished removing duplicated/multiallelic variants successfully!")
470
+
471
+ finished(log,verbose,_end_line)
439
472
  return sumstats
440
473
 
441
474
  ###############################################################################################################
442
475
  # 20230128
443
476
  def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",24),mt=("MT",25), remove=False, verbose=True, chrom_list = None, minchr=1,log=Log()):
444
- #chrom_list = get_chr_list() #bottom
445
- if chrom_list is None:
446
- chrom_list = get_chr_list()
447
- if check_col(sumstats,chrom,status) is not True:
448
- if verbose: log.write(".fix_chr: Specified not detected..skipping...")
449
- return sumstats
450
- if verbose: log.write("Start to fix chromosome notation...{}".format(_get_version()))
451
- check_dataframe_shape(sumstats, log, verbose)
452
-
453
- # convert to string datatype
454
- try:
455
- if verbose: log.write(" -Checking CHR data type...")
456
- if sumstats.loc[:,chrom].dtype == "string":
457
- pass
458
- else:
459
- sumstats.loc[:,chrom] = sumstats.loc[:,chrom].astype("string")
460
- except:
461
- if verbose: log.write(" -Force converting to pd string data type...")
462
- sumstats.loc[:,chrom] = sumstats.loc[:,chrom].astype("string")
463
-
464
- # check if CHR is numeric
465
- is_chr_fixed = sumstats[chrom].str.isnumeric()
466
- # fill NAs with False
467
- is_chr_fixed = is_chr_fixed.fillna(False)
468
- if verbose: log.write(" -Variants with standardized chromosome notation:",sum(is_chr_fixed))
469
-
470
- # if there are variants whose CHR need to be fixed
471
- if sum(is_chr_fixed)<len(sumstats):
472
-
473
- #extract the CHR number or X Y M MT
474
- chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'^(chr)?(\d{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
477
+ ##start function with col checking##########################################################
478
+ _start_line = "fix chromosome notation (CHR)"
479
+ _end_line = "fixing chromosome notation (CHR)"
480
+ _start_cols =[chrom,status]
481
+ _start_function = ".fix_chr()"
482
+ _must_args ={}
475
483
 
476
- is_chr_fixable = ~chr_extracted.isna()
477
- if verbose: log.write(" -Variants with fixable chromosome notations:",sum(is_chr_fixable))
484
+ is_enough_info = start_to(sumstats=sumstats,
485
+ log=log,
486
+ verbose=verbose,
487
+ start_line=_start_line,
488
+ end_line=_end_line,
489
+ start_cols=_start_cols,
490
+ start_function=_start_function,
491
+ **_must_args)
492
+ if is_enough_info == False: return sumstats
493
+ ############################################################################################
478
494
 
479
- # For not fixed variants, check if na
480
- is_chr_na = sumstats.loc[~is_chr_fixed, chrom].isna()
481
- if sum(is_chr_na)>0 and verbose:
482
- log.write(" -Variants with NA chromosome notations:",sum(is_chr_na))
483
-
484
- # Check variants with CHR being not NA and not fixable
485
- is_chr_invalid = (~is_chr_fixable)&(~is_chr_na)
486
- if sum(is_chr_invalid)>0 and verbose:
487
- log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid))
488
- try:
489
- log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()))
490
- except:
491
- pass
492
- elif verbose:
493
- log.write(" -No unrecognized chromosome notations...")
494
-
495
- # Assign good chr back to sumstats
496
- sumstats.loc[is_chr_fixable.index,chrom] = chr_extracted[is_chr_fixable.index]
495
+ #chrom_list = get_chr_list() #bottom
496
+ if chrom_list is None:
497
+ chrom_list = get_chr_list()
498
+ #if check_col(sumstats,chrom,status) is not True:
499
+ # if verbose: log.write(".fix_chr: Specified not detected..skipping...")
500
+ # return sumstats
497
501
 
498
- # X, Y, MT to 23,24,25
499
- xymt_list = [x[0].lower(),y[0].lower(),mt[0].lower(),x[0].upper(),y[0].upper(),mt[0].upper()]
500
-
501
- # check if sumstats contain sex CHR
502
- sex_chr = sumstats[chrom].isin(xymt_list)
503
-
504
- # if sumstats contain sex CHR
505
- if sum(sex_chr)>0:
506
- if verbose: log.write(" -Identifying non-autosomal chromosomes : {}, {}, and {} ...".format(x[0],y[0],mt[0]))
507
- if verbose: log.write(" -Identified ",str(sum(sex_chr))," variants on sex chromosomes...")
508
-
509
- # convert "X, Y, MT" to numbers
510
- convert_num_to_xymt={}
511
- if x[0].lower() in sumstats[chrom].values or x[0].upper() in sumstats[chrom].values:
512
- convert_num_to_xymt[x[0].lower()] = str(x[1])
513
- convert_num_to_xymt[x[0].upper()] = str(x[1])
514
- if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(x[0], x[1]))
515
- if y[0].lower() in sumstats[chrom].values or y[0].upper() in sumstats[chrom].values:
516
- convert_num_to_xymt[y[0].lower()] = str(y[1])
517
- convert_num_to_xymt[y[0].upper()] = str(y[1])
518
- if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(y[0], y[1]))
519
- if mt[0].lower() in sumstats[chrom].values or mt[0].upper() in sumstats[chrom].values:
520
- convert_num_to_xymt[mt[0].lower()] = str(mt[1])
521
- convert_num_to_xymt[mt[0].upper()] = str(mt[1])
522
- if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(mt[0], mt[1]))
523
- sumstats.loc[sex_chr,chrom] =sumstats.loc[sex_chr,chrom].map(convert_num_to_xymt)
524
-
525
- # change status code
526
- sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
527
- if len(is_chr_fixable.index)>0:
528
- sumstats.loc[is_chr_fixable.index,status] = vchange_status(sumstats.loc[is_chr_fixable.index,status],4,"986","520")
529
- if len(is_chr_fixable.index)>0:
530
- sumstats.loc[is_chr_invalid.index,status] = vchange_status(sumstats.loc[is_chr_invalid.index,status],4,"986","743")
531
-
532
- # check variants with unrecognized CHR
533
- unrecognized_num = sum(~sumstats[chrom].isin(chrom_list))
534
- if (remove is True) and unrecognized_num>0:
535
- # remove variants with unrecognized CHR
536
- try:
537
- if verbose: log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])))
538
- except:
539
- pass
540
- if verbose: log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.")
541
- try:
542
- log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()))
543
- except:
544
- pass
545
- #sumstats = sumstats.loc[sumstats.index[sumstats[chrom].isin(chrom_list)],:]
546
- good_chr = sumstats[chrom].isin(chrom_list)
547
- sumstats = sumstats.loc[good_chr, :].copy()
502
+
503
+ # convert to string datatype
504
+ try:
505
+ if verbose: log.write(" -Checking CHR data type...")
506
+ if sumstats[chrom].dtype == "string":
507
+ pass
548
508
  else:
549
- if verbose: log.write(" -All CHR are already fixed...")
550
- sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
551
-
552
- # Convert string to int
553
- try:
554
- sumstats.loc[:,chrom] = sumstats.loc[:,chrom].astype('Int64')
555
- except:
556
- # force convert
557
- sumstats.loc[:,chrom] = np.floor(pd.to_numeric(sumstats.loc[:,chrom], errors='coerce')).astype('Int64')
558
-
559
- # filter out variants with CHR <=0
560
- out_of_range_chr = sumstats[chrom] < minchr
561
- out_of_range_chr = out_of_range_chr.fillna(False)
562
- if sum(out_of_range_chr)>0:
563
- if verbose: log.write(" -Sanity check for CHR...")
564
- if verbose:log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr))
565
- sumstats = sumstats.loc[~out_of_range_chr,:]
566
-
567
- if verbose: log.write("Finished fixing chromosome notation successfully!")
509
+ sumstats[chrom] = sumstats[chrom].astype("string")
510
+ except:
511
+ if verbose: log.write(" -Force converting to pd string data type...")
512
+ sumstats[chrom] = sumstats[chrom].astype("string")
513
+
514
+ # check if CHR is numeric
515
+ is_chr_fixed = sumstats[chrom].str.isnumeric()
516
+ # fill NAs with False
517
+ is_chr_fixed = is_chr_fixed.fillna(False)
518
+ if verbose: log.write(" -Variants with standardized chromosome notation:",sum(is_chr_fixed))
519
+
520
+ # if there are variants whose CHR need to be fixed
521
+ if sum(is_chr_fixed)<len(sumstats):
568
522
 
569
- return sumstats
523
+ #extract the CHR number or X Y M MT
524
+ chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'^(chr)?(\d{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
570
525
 
571
- ###############################################################################################################
572
- # 20230128
573
- def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True, lower_limit=0 , upper_limit=None , limit=250000000, log=Log()):
574
- if upper_limit is None:
575
- upper_limit = limit
576
- if check_col(sumstats,pos,status) is not True:
577
- if verbose: log.write(".fix_pos: Specified not detected..skipping...")
578
- return sumstats
579
- if verbose: log.write("Start to fix basepair positions...{}".format(_get_version()))
580
- check_dataframe_shape(sumstats, log, verbose)
526
+ is_chr_fixable = ~chr_extracted.isna()
527
+ if verbose: log.write(" -Variants with fixable chromosome notations:",sum(is_chr_fixable))
528
+
529
+ # For not fixed variants, check if na
530
+ is_chr_na = sumstats.loc[~is_chr_fixed, chrom].isna()
531
+ if sum(is_chr_na)>0 and verbose:
532
+ log.write(" -Variants with NA chromosome notations:",sum(is_chr_na))
581
533
 
582
- all_var_num = len(sumstats)
583
- #convert to numeric
584
- is_pos_na = sumstats.loc[:,pos].isna()
534
+ # Check variants with CHR being not NA and not fixable
535
+ is_chr_invalid = (~is_chr_fixable)&(~is_chr_na)
536
+ if sum(is_chr_invalid)>0 and verbose:
537
+ log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid))
538
+ try:
539
+ log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()))
540
+ except:
541
+ pass
542
+ elif verbose:
543
+ log.write(" -No unrecognized chromosome notations...")
585
544
 
586
- try:
587
- if str(sumstats[pos].dtype) == "string" or str(sumstats[pos].dtype) == "object":
588
- sumstats.loc[:,pos] = sumstats.loc[:,pos].astype('string')
589
- # if so, remove thousands separator
590
- if verbose: log.write(' -Removing thousands separator "," or underbar "_" ...')
591
- sumstats.loc[~is_pos_na, pos] = sumstats.loc[~is_pos_na, pos].str.replace(r'[,_]', '' ,regex=True)
592
- except:
593
- pass
545
+ # Assign good chr back to sumstats
546
+ sumstats.loc[is_chr_fixable.index,chrom] = chr_extracted[is_chr_fixable.index]
594
547
 
595
- # convert POS to integer
596
- try:
597
- if verbose: log.write(' -Converting to Int64 data type ...')
598
- sumstats[pos] = sumstats[pos].astype('Int64')
599
- except:
600
- if verbose: log.write(' -Force converting to Int64 data type ...')
601
- sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
602
- is_pos_fixed = ~sumstats.loc[:,pos].isna()
603
- is_pos_invalid = (~is_pos_na)&(~is_pos_fixed)
548
+ # X, Y, MT to 23,24,25
549
+ xymt_list = [x[0].lower(),y[0].lower(),mt[0].lower(),x[0].upper(),y[0].upper(),mt[0].upper()]
604
550
 
605
- sumstats.loc[is_pos_fixed,status] = vchange_status(sumstats.loc[is_pos_fixed,status] ,4,"975","630")
606
- sumstats.loc[is_pos_invalid,status] = vchange_status(sumstats.loc[is_pos_invalid,status],4,"975","842")
551
+ # check if sumstats contain sex CHR
552
+ sex_chr = sumstats[chrom].isin(xymt_list)
607
553
 
608
- # remove outlier, limit:250,000,000
609
- if verbose: log.write(" -Position bound:({} , {:,})".format(lower_limit, upper_limit))
610
- is_pos_na = sumstats.loc[:,pos].isna()
611
- out_lier= ((sumstats[pos]<=lower_limit) | (sumstats[pos]>=upper_limit)) & (~is_pos_na)
612
- if verbose: log.write(" -Removed outliers:",sum(out_lier))
613
- sumstats = sumstats.loc[~out_lier,:]
614
- #remove na
615
- if remove is True:
616
- sumstats = sumstats.loc[~sumstats[pos].isna(),:]
617
- remain_var_num = len(sumstats)
618
- if verbose: log.write(" -Removed "+str(all_var_num - remain_var_num)+" variants with bad positions.")
554
+ # if sumstats contain sex CHR
555
+ if sum(sex_chr)>0:
556
+ if verbose: log.write(" -Identifying non-autosomal chromosomes : {}, {}, and {} ...".format(x[0],y[0],mt[0]))
557
+ if verbose: log.write(" -Identified ",str(sum(sex_chr))," variants on sex chromosomes...")
558
+
559
+ # convert "X, Y, MT" to numbers
560
+ convert_num_to_xymt={}
561
+ if x[0].lower() in sumstats[chrom].values or x[0].upper() in sumstats[chrom].values:
562
+ convert_num_to_xymt[x[0].lower()] = str(x[1])
563
+ convert_num_to_xymt[x[0].upper()] = str(x[1])
564
+ if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(x[0], x[1]))
565
+ if y[0].lower() in sumstats[chrom].values or y[0].upper() in sumstats[chrom].values:
566
+ convert_num_to_xymt[y[0].lower()] = str(y[1])
567
+ convert_num_to_xymt[y[0].upper()] = str(y[1])
568
+ if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(y[0], y[1]))
569
+ if mt[0].lower() in sumstats[chrom].values or mt[0].upper() in sumstats[chrom].values:
570
+ convert_num_to_xymt[mt[0].lower()] = str(mt[1])
571
+ convert_num_to_xymt[mt[0].upper()] = str(mt[1])
572
+ if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(mt[0], mt[1]))
573
+ sumstats.loc[sex_chr,chrom] =sumstats.loc[sex_chr,chrom].map(convert_num_to_xymt)
619
574
 
620
- if verbose: log.write(" -Converted all position to datatype Int64.")
621
- if verbose: log.write("Finished fixing basepair position successfully!")
575
+ # change status code
576
+ sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
577
+ if len(is_chr_fixable.index)>0:
578
+ sumstats.loc[is_chr_fixable.index,status] = vchange_status(sumstats.loc[is_chr_fixable.index,status],4,"986","520")
579
+ if len(is_chr_fixable.index)>0:
580
+ sumstats.loc[is_chr_invalid.index,status] = vchange_status(sumstats.loc[is_chr_invalid.index,status],4,"986","743")
622
581
 
623
- return sumstats
582
+ # check variants with unrecognized CHR
583
+ unrecognized_num = sum(~sumstats[chrom].isin(chrom_list))
584
+ if (remove is True) and unrecognized_num>0:
585
+ # remove variants with unrecognized CHR
586
+ try:
587
+ if verbose: log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])))
588
+ except:
589
+ pass
590
+ if verbose: log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.")
591
+ try:
592
+ log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()))
593
+ except:
594
+ pass
595
+ #sumstats = sumstats.loc[sumstats.index[sumstats[chrom].isin(chrom_list)],:]
596
+ good_chr = sumstats[chrom].isin(chrom_list)
597
+ sumstats = sumstats.loc[good_chr, :].copy()
598
+ else:
599
+ if verbose: log.write(" -All CHR are already fixed...")
600
+ sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
601
+
602
+ # Convert string to int
603
+ try:
604
+ sumstats[chrom] = sumstats[chrom].astype('Int64')
605
+ except:
606
+ # # force convert
607
+ sumstats[chrom] = np.floor(pd.to_numeric(sumstats[chrom], errors='coerce')).astype('Int64')
608
+
609
+ # filter out variants with CHR <=0
610
+ out_of_range_chr = sumstats[chrom] < minchr
611
+ out_of_range_chr = out_of_range_chr.fillna(False)
612
+ if sum(out_of_range_chr)>0:
613
+ if verbose: log.write(" -Sanity check for CHR...")
614
+ if verbose:log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr))
615
+ sumstats = sumstats.loc[~out_of_range_chr,:]
616
+
617
+ finished(log,verbose,_end_line)
618
+ return sumstats
619
+
620
+ ###############################################################################################################
621
+ # 20230128
622
+ def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True, lower_limit=0 , upper_limit=None , limit=250000000, log=Log()):
623
+ ##start function with col checking##########################################################
624
+ _start_line = "fix basepair positions (POS)"
625
+ _end_line = "fixing basepair positions (POS)"
626
+ _start_cols =[pos,status]
627
+ _start_function = ".fix_pos()"
628
+ _must_args ={}
629
+
630
+ is_enough_info = start_to(sumstats=sumstats,
631
+ log=log,
632
+ verbose=verbose,
633
+ start_line=_start_line,
634
+ end_line=_end_line,
635
+ start_cols=_start_cols,
636
+ start_function=_start_function,
637
+ **_must_args)
638
+ if is_enough_info == False: return sumstats
639
+ ############################################################################################
640
+
641
+ if upper_limit is None:
642
+ upper_limit = limit
643
+
644
+ all_var_num = len(sumstats)
645
+ #convert to numeric
646
+ is_pos_na = sumstats[pos].isna()
647
+
648
+ try:
649
+ if str(sumstats[pos].dtype) == "string" or str(sumstats[pos].dtype) == "object":
650
+ sumstats[pos] = sumstats[pos].astype('string')
651
+ # if so, remove thousands separator
652
+ if verbose: log.write(' -Removing thousands separator "," or underbar "_" ...')
653
+ sumstats.loc[~is_pos_na, pos] = sumstats.loc[~is_pos_na, pos].str.replace(r'[,_]', '' ,regex=True)
654
+ except:
655
+ pass
656
+
657
+ # convert POS to integer
658
+ try:
659
+ if verbose: log.write(' -Converting to Int64 data type ...')
660
+ sumstats[pos] = sumstats[pos].astype('Int64')
661
+ except:
662
+ if verbose: log.write(' -Force converting to Int64 data type ...')
663
+ sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
664
+ is_pos_fixed = ~sumstats[pos].isna()
665
+ is_pos_invalid = (~is_pos_na)&(~is_pos_fixed)
666
+
667
+ sumstats.loc[is_pos_fixed,status] = vchange_status(sumstats.loc[is_pos_fixed,status] ,4,"975","630")
668
+ sumstats.loc[is_pos_invalid,status] = vchange_status(sumstats.loc[is_pos_invalid,status],4,"975","842")
669
+
670
+ # remove outlier, limit:250,000,000
671
+ if verbose: log.write(" -Position bound:({} , {:,})".format(lower_limit, upper_limit))
672
+ is_pos_na = sumstats[pos].isna()
673
+ out_lier= ((sumstats[pos]<=lower_limit) | (sumstats[pos]>=upper_limit)) & (~is_pos_na)
674
+ if verbose: log.write(" -Removed outliers:",sum(out_lier))
675
+ sumstats = sumstats.loc[~out_lier,:]
676
+ #remove na
677
+ if remove is True:
678
+ sumstats = sumstats.loc[~sumstats[pos].isna(),:]
679
+ remain_var_num = len(sumstats)
680
+ if verbose: log.write(" -Removed "+str(all_var_num - remain_var_num)+" variants with bad positions.")
681
+
682
+ finished(log,verbose,_end_line)
683
+ return sumstats
624
684
 
625
685
  ###############################################################################################################
626
686
  # 20220514
627
687
  def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=True,log=Log()):
628
- # remove variants with alleles other than actgACTG
629
- if check_col(sumstats,ea,nea,status) is not True:
630
- if verbose: log.write("EA and NEA not detected..skipping...")
631
- return sumstats
632
- if verbose: log.write("Start to fix alleles...{}".format(_get_version()))
633
- check_dataframe_shape(sumstats, log, verbose)
634
-
635
- #if (ea not in sumstats.columns) or (nea not in sumstats.columns):
636
- if verbose: log.write(" -Converted all bases to string datatype and UPPERCASE.")
637
-
638
- #try:
639
- # ea_missing = sum(sumstats[ea].isna())
640
- # nea_missing = sum(sumstats[nea].isna())
641
- # if sum(ea_missing)>0:
642
- # if verbose: log.write(" -Converting {} missing EA to letter N.".format(ea_missing))
643
- # sumstats.loc[:,ea] = sumstats.loc[:,ea].add_categories("N").fillna("N")
644
- # if sum(sumstats[nea].isna())>0:
645
- # if verbose: log.write(" -Converting {} missing NEA to letter N.".format(nea_missing))
646
- # sumstats.loc[:,nea] = sumstats.loc[:,nea].add_categories("N").fillna("N")
647
- #except:
648
- # pass
649
-
650
- categories = set(sumstats.loc[:,ea].str.upper())|set(sumstats.loc[:,nea].str.upper())|set("N")
651
- categories = {x for x in categories if pd.notna(x)}
652
-
653
- sumstats.loc[:,ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
654
- sumstats.loc[:,nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
655
- all_var_num = len(sumstats)
656
-
657
- ## check ATCG
658
- bad_ea = sumstats[ea].str.contains("[^actgACTG]",na=True)
659
- bad_nea = sumstats[nea].str.contains("[^actgACTG]",na=True)
660
- good_ea = ~bad_ea
661
- good_nea = ~bad_nea
662
-
663
- log.write(" -Variants with bad EA : {}".format(sum(bad_ea)), verbose=verbose)
664
- log.write(" -Variants with bad NEA : {}".format(sum(bad_nea)), verbose=verbose)
665
-
666
- ## check NA
667
- is_eanea_na = sumstats[ea].isna() | sumstats[nea].isna()
668
- log.write(" -Variants with NA for EA or NEA: {}".format(sum(is_eanea_na)), verbose=verbose)
669
-
670
- ## check same alleles
671
- not_variant = sumstats[nea] == sumstats[ea]
672
- log.write(" -Variants with same EA and NEA: {}".format(sum(not_variant)), verbose=verbose)
688
+ ##start function with col checking##########################################################
689
+ _start_line = "fix alleles (EA and NEA)"
690
+ _end_line = "fixing alleles (EA and NEA)"
691
+ _start_cols =[ea, nea,status]
692
+ _start_function = ".fix_allele()"
693
+ _must_args ={}
673
694
 
674
- ## sum up invalid variants
675
- is_invalid = bad_ea | bad_nea | not_variant
676
-
677
- exclude = bad_nea | bad_ea
678
-
679
- if verbose:
680
- if len(set(sumstats.loc[bad_ea,ea].head())) >0:
681
- log.write(" -A look at the non-ATCG EA:",set(sumstats.loc[bad_ea,ea].head()),"...")
682
- if len(set(sumstats.loc[bad_nea,nea].head())) >0:
683
- log.write(" -A look at the non-ATCG NEA:",set(sumstats.loc[bad_nea,nea].head()),"...")
684
-
685
- if remove == True:
686
- sumstats = sumstats.loc[(good_ea & good_nea),:].copy()
687
- good_eanea_num = len(sumstats)
688
- if verbose: log.write(" -Removed "+str(all_var_num - good_eanea_num)+" variants with NA alleles or alleles that contain bases other than A/C/T/G.")
689
- sumstats = sumstats.loc[(good_ea & good_nea & (~not_variant)),:].copy()
690
- good_eanea_notsame_num = len(sumstats)
691
- if verbose: log.write(" -Removed "+str(good_eanea_num - good_eanea_notsame_num)+" variants with same allele for EA and NEA.")
692
- else:
693
- sumstats.loc[:,[ea,nea]] = sumstats.loc[:,[ea,nea]].fillna("N")
694
- if verbose: log.write(" -Detected "+str(sum(exclude))+" variants with alleles that contain bases other than A/C/T/G .")
695
- categories = set(sumstats.loc[:,ea].str.upper())|set(sumstats.loc[:,nea].str.upper())|set("N")
696
- sumstats.loc[:,ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
697
- sumstats.loc[:,nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
698
-
699
- is_eanea_fixed = good_ea | good_nea
700
- is_snp = (sumstats[ea].str.len()==1) &(sumstats[nea].str.len()==1)
701
- is_indel = (sumstats[ea].str.len()!=sumstats[nea].str.len())
702
- is_not_normalized = (sumstats[ea].str.len()>1) &(sumstats[nea].str.len()>1)
703
- is_normalized = is_indel &( (sumstats[ea].str.len()==1) &(sumstats[nea].str.len()>1) | (sumstats[ea].str.len()>1) &(sumstats[nea].str.len()==1) )
704
-
705
- if sum(is_invalid)>0:
706
- sumstats.loc[is_invalid, status] = vchange_status(sumstats.loc[is_invalid,status], 5,"9","6")
707
- if sum(is_eanea_na)>0:
708
- sumstats.loc[is_eanea_na,status] = vchange_status(sumstats.loc[is_eanea_na, status], 5,"9","7")
709
- if sum(is_eanea_fixed&is_not_normalized)>0:
710
- sumstats.loc[is_eanea_fixed&is_not_normalized,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_not_normalized,status], 5,"9","5")
711
- if sum(is_eanea_fixed&is_snp)>0:
712
- sumstats.loc[is_eanea_fixed&is_snp, status] = vchange_status(sumstats.loc[is_eanea_fixed&is_snp,status], 5,"9","0")
713
- if sum(is_eanea_fixed&is_indel)>0:
714
- sumstats.loc[is_eanea_fixed&is_indel,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_indel, status], 5,"9","4")
715
- if sum(is_eanea_fixed&is_normalized)>0:
716
- sumstats.loc[is_eanea_fixed&is_normalized,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_normalized, status], 5,"4","3")
717
- gc.collect()
718
- if verbose: log.write("Finished fixing allele successfully!")
719
-
720
- return sumstats
695
+ is_enough_info = start_to(sumstats=sumstats,
696
+ log=log,
697
+ verbose=verbose,
698
+ start_line=_start_line,
699
+ end_line=_end_line,
700
+ start_cols=_start_cols,
701
+ start_function=_start_function,
702
+ **_must_args)
703
+ if is_enough_info == False: return sumstats
704
+ ############################################################################################
705
+ #try:
706
+ # ea_missing = sum(sumstats[ea].isna())
707
+ # nea_missing = sum(sumstats[nea].isna())
708
+ # if sum(ea_missing)>0:
709
+ # if verbose: log.write(" -Converting {} missing EA to letter N.".format(ea_missing))
710
+ # sumstats[ea] = sumstats[ea].add_categories("N").fillna("N")
711
+ # if sum(sumstats[nea].isna())>0:
712
+ # if verbose: log.write(" -Converting {} missing NEA to letter N.".format(nea_missing))
713
+ # sumstats[nea] = sumstats[nea].add_categories("N").fillna("N")
714
+ #except:
715
+ # pass
716
+
717
+ if verbose: log.write(" -Converted all bases to string datatype and UPPERCASE.")
718
+ categories = set(sumstats[ea].str.upper())|set(sumstats[nea].str.upper())|set("N")
719
+ categories = {x for x in categories if pd.notna(x)}
720
+ sumstats[ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
721
+ sumstats[nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
722
+ all_var_num = len(sumstats)
723
+
724
+ ## check ATCG
725
+ bad_ea = sumstats[ea].str.contains("[^actgACTG]",na=True)
726
+ bad_nea = sumstats[nea].str.contains("[^actgACTG]",na=True)
727
+ good_ea = ~bad_ea
728
+ good_nea = ~bad_nea
729
+
730
+ log.write(" -Variants with bad EA : {}".format(sum(bad_ea)), verbose=verbose)
731
+ log.write(" -Variants with bad NEA : {}".format(sum(bad_nea)), verbose=verbose)
732
+
733
+ ## check NA
734
+ is_eanea_na = sumstats[ea].isna() | sumstats[nea].isna()
735
+ log.write(" -Variants with NA for EA or NEA: {}".format(sum(is_eanea_na)), verbose=verbose)
736
+
737
+ ## check same alleles
738
+ not_variant = sumstats[nea] == sumstats[ea]
739
+ log.write(" -Variants with same EA and NEA: {}".format(sum(not_variant)), verbose=verbose)
740
+
741
+ ## sum up invalid variants
742
+ is_invalid = bad_ea | bad_nea | not_variant
743
+
744
+ exclude = bad_nea | bad_ea
745
+
746
+ if verbose:
747
+ if len(set(sumstats.loc[bad_ea,ea].head())) >0:
748
+ log.write(" -A look at the non-ATCG EA:",set(sumstats.loc[bad_ea,ea].head()),"...")
749
+ if len(set(sumstats.loc[bad_nea,nea].head())) >0:
750
+ log.write(" -A look at the non-ATCG NEA:",set(sumstats.loc[bad_nea,nea].head()),"...")
751
+
752
+ if remove == True:
753
+ sumstats = sumstats.loc[(good_ea & good_nea),:].copy()
754
+ good_eanea_num = len(sumstats)
755
+ if verbose: log.write(" -Removed "+str(all_var_num - good_eanea_num)+" variants with NA alleles or alleles that contain bases other than A/C/T/G.")
756
+ sumstats = sumstats.loc[(good_ea & good_nea & (~not_variant)),:].copy()
757
+ good_eanea_notsame_num = len(sumstats)
758
+ if verbose: log.write(" -Removed "+str(good_eanea_num - good_eanea_notsame_num)+" variants with same allele for EA and NEA.")
759
+ else:
760
+ sumstats[[ea,nea]] = sumstats[[ea,nea]].fillna("N")
761
+ if verbose: log.write(" -Detected "+str(sum(exclude))+" variants with alleles that contain bases other than A/C/T/G .")
762
+ categories = set(sumstats[ea].str.upper())|set(sumstats[nea].str.upper())|set("N")
763
+ sumstats[ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
764
+ sumstats[nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
765
+
766
+ is_eanea_fixed = good_ea | good_nea
767
+ is_snp = (sumstats[ea].str.len()==1) &(sumstats[nea].str.len()==1)
768
+ is_indel = (sumstats[ea].str.len()!=sumstats[nea].str.len())
769
+ is_not_normalized = (sumstats[ea].str.len()>1) &(sumstats[nea].str.len()>1)
770
+ is_normalized = is_indel &( (sumstats[ea].str.len()==1) &(sumstats[nea].str.len()>1) | (sumstats[ea].str.len()>1) &(sumstats[nea].str.len()==1) )
771
+
772
+ if sum(is_invalid)>0:
773
+ sumstats.loc[is_invalid, status] = vchange_status(sumstats.loc[is_invalid,status], 5,"9","6")
774
+ if sum(is_eanea_na)>0:
775
+ sumstats.loc[is_eanea_na,status] = vchange_status(sumstats.loc[is_eanea_na, status], 5,"9","7")
776
+ if sum(is_eanea_fixed&is_not_normalized)>0:
777
+ sumstats.loc[is_eanea_fixed&is_not_normalized,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_not_normalized,status], 5,"9","5")
778
+ if sum(is_eanea_fixed&is_snp)>0:
779
+ sumstats.loc[is_eanea_fixed&is_snp, status] = vchange_status(sumstats.loc[is_eanea_fixed&is_snp,status], 5,"9","0")
780
+ if sum(is_eanea_fixed&is_indel)>0:
781
+ sumstats.loc[is_eanea_fixed&is_indel,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_indel, status], 5,"9","4")
782
+ if sum(is_eanea_fixed&is_normalized)>0:
783
+ sumstats.loc[is_eanea_fixed&is_normalized,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_normalized, status], 5,"4","3")
784
+
785
+ finished(log,verbose,_end_line)
786
+ return sumstats
721
787
 
722
788
  ###############################################################################################################
723
789
  # 20220721
724
790
 
725
791
  def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
726
- if check_col(sumstats,pos,ea,nea,status) is not True:
727
- if verbose: log.write("WARNING! .normalize(): specified columns not detected..skipping...")
728
- return sumstats
729
-
730
- if verbose: log.write("Start to normalize variants...{}".format(_get_version()))
731
- check_dataframe_shape(sumstats, log, verbose)
792
+ ##start function with col checking##########################################################
793
+ _start_line = "normalize indels"
794
+ _end_line = "normalizing indels"
795
+ _start_cols =[ea, nea,status]
796
+ _start_function = ".normalize()"
797
+ _must_args ={}
798
+
799
+ is_enough_info = start_to(sumstats=sumstats,
800
+ log=log,
801
+ verbose=verbose,
802
+ start_line=_start_line,
803
+ end_line=_end_line,
804
+ start_cols=_start_cols,
805
+ start_function=_start_function,
806
+ **_must_args)
807
+ if is_enough_info == False: return sumstats
808
+ ############################################################################################
809
+
732
810
  #variants_to_check = status_match(sumstats[status],5,[4,5]) #
733
811
  #r'\w\w\w\w[45]\w\w'
734
812
  variants_to_check = sumstats[status].str[4].str.match(r'4|5', case=False, flags=0, na=False)
@@ -742,7 +820,8 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
742
820
  n_cores=1
743
821
  pool = Pool(n_cores)
744
822
  map_func = partial(normalizeallele,pos=pos,nea=nea,ea=ea,status=status)
745
- df_split = np.array_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
823
+ #df_split = np.array_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
824
+ df_split = _df_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
746
825
  normalized_pd = pd.concat(pool.map(map_func,df_split))
747
826
  pool.close()
748
827
  pool.join()
@@ -772,16 +851,16 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
772
851
  else:
773
852
  log.write(" -All variants are already normalized..")
774
853
  ###################################################################################################################
775
- categories = set(sumstats.loc[:,ea])|set(sumstats.loc[:,nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
776
- sumstats.loc[:,ea] = pd.Categorical(sumstats.loc[:,ea],categories = categories)
777
- sumstats.loc[:,nea] = pd.Categorical(sumstats.loc[:,nea],categories = categories )
854
+ categories = set(sumstats[ea])|set(sumstats[nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
855
+ sumstats[ea] = pd.Categorical(sumstats[ea],categories = categories)
856
+ sumstats[nea] = pd.Categorical(sumstats[nea],categories = categories )
778
857
  sumstats.loc[variants_to_check,[pos,nea,ea,status]] = normalized_pd.values
779
858
  try:
780
- sumstats.loc[:,pos] = sumstats.loc[:,pos].astype('Int64')
859
+ sumstats[pos] = sumstats[pos].astype('Int64')
781
860
  except:
782
- sumstats.loc[:,pos] = np.floor(pd.to_numeric(sumstats.loc[:,pos], errors='coerce')).astype('Int64')
861
+ sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
783
862
 
784
- if verbose: log.write("Finished normalizing variants successfully!")
863
+ finished(log,verbose,_end_line)
785
864
  return sumstats
786
865
 
787
866
  def normalizeallele(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
@@ -846,6 +925,52 @@ def add_tolerence(stats, float_tolerence, mode):
846
925
  stats = (stats[0] , stats[1] + float_tolerence if stats[0]!=float("Inf") else float("Inf"))
847
926
  return stats
848
927
 
928
+
929
+ def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, verbose, dtype="Int64"):
930
+ pre_number=len(sumstats)
931
+ if header in coltocheck and header in sumstats.columns:
932
+ cols_to_check.append(header)
933
+ if header=="STATUS":
934
+ if verbose: log.write(" -Checking STATUS and converting STATUS to categories....")
935
+ categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
936
+ sumstats[header] = pd.Categorical(sumstats[header],categories=categories)
937
+ return sumstats
938
+
939
+ if dtype in ["Int64","Int32","int","int32","in64"]:
940
+ if verbose: log.write(" -Checking if {} <= {} <= {} ...".format( var_range[0] ,header, var_range[1]))
941
+ sumstats[header] = np.floor(pd.to_numeric(sumstats[header], errors='coerce')).astype(dtype)
942
+
943
+ elif dtype in ["Float64","Float32","float","float64","float32"]:
944
+ log.write(" -Checking if {} < {} < {} ...".format( var_range[0] ,header, var_range[1]),verbose=verbose)
945
+ sumstats[header] = pd.to_numeric(sumstats[header], errors='coerce').astype(dtype)
946
+
947
+ is_valid = (sumstats[header]>=var_range[0]) & (sumstats[header]<=var_range[1])
948
+ is_valid = is_valid.fillna(False)
949
+
950
+ if header=="P":
951
+ is_low_p = sumstats["P"] == 0
952
+ if sum(is_low_p) >0:
953
+ log.warning("Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(sum(is_low_p)))
954
+ log.warning("Please consider using MLOG10P instead.")
955
+
956
+ if sum(~is_valid)>0:
957
+ try:
958
+ if "SNPID" in sumstats.columns:
959
+ id_to_use = "SNPID"
960
+ elif "rsID" in sumstats.columns:
961
+ id_to_use = "rsID"
962
+ invalid_ids = sumstats.loc[~is_valid, id_to_use].head().astype("string")
963
+ invalid_values = sumstats.loc[~is_valid, header].head().astype("string").fillna("NA")
964
+ log.write(" -Examples of invalid variants({}): {} ...".format(id_to_use, ",".join(invalid_ids.to_list()) ), verbose=verbose)
965
+ log.write(" -Examples of invalid values ({}): {} ...".format(header, ",".join(invalid_values.to_list()) ), verbose=verbose)
966
+ except:
967
+ pass
968
+
969
+ sumstats = sumstats.loc[is_valid,:]
970
+ after_number=len(sumstats)
971
+ log.write(" -Removed {} variants with bad/na {}.".format(pre_number - after_number, header), verbose=verbose)
972
+ return sumstats
973
+
849
974
  def sanitycheckstats(sumstats,
850
975
  coltocheck=None,
851
976
  n=(0,2**31-1),
@@ -853,8 +978,10 @@ def sanitycheckstats(sumstats,
853
978
  ncontrol=(0,2**31-1),
854
979
  eaf=(0,1),
855
980
  mac=(0,2**31-1),
981
+ maf=(0,0.5),
856
982
  chisq=(0,float("Inf")),
857
983
  z=(-9999,9999),
984
+ t=(-99999,99999),
858
985
  f=(0,float("Inf")),
859
986
  p=(0,1),
860
987
  mlog10p=(0,9999),
@@ -885,10 +1012,30 @@ def sanitycheckstats(sumstats,
885
1012
  HR_95U: float64 , HR_95L >0
886
1013
  INFO: float32 , 1>=INFO>0
887
1014
  Z float64 , -9999 < Z < 9999
1015
+ T float64 , -99999 < T < 99999
888
1016
  F float64 , F > 0
889
1017
  '''
1018
+ ##start function with col checking##########################################################
1019
+ _start_line = "perform sanity check for statistics"
1020
+ _end_line = "sanity check for statistics"
1021
+ _start_cols =[]
1022
+ _start_function = ".check_sanity()"
1023
+ _must_args ={}
1024
+
1025
+ is_enough_info = start_to(sumstats=sumstats,
1026
+ log=log,
1027
+ verbose=verbose,
1028
+ start_line=_start_line,
1029
+ end_line=_end_line,
1030
+ start_cols=_start_cols,
1031
+ start_function=_start_function,
1032
+ **_must_args)
1033
+ if is_enough_info == False: return sumstats
1034
+ ############################################################################################
890
1035
 
1036
+ if verbose: log.write(" -Comparison tolerance for floats: {}".format(float_tolerence))
891
1037
  eaf = add_tolerence(eaf, float_tolerence, "lr")
1038
+ maf = add_tolerence(maf, float_tolerence, "lr")
892
1039
  beta = add_tolerence(beta, float_tolerence, "lr")
893
1040
  se = add_tolerence(se, float_tolerence, "lr")
894
1041
  mlog10p = add_tolerence(mlog10p, float_tolerence, "lr")
@@ -903,233 +1050,83 @@ def sanitycheckstats(sumstats,
903
1050
  p = add_tolerence(p, float_tolerence, "lr")
904
1051
  f = add_tolerence(f, float_tolerence, "lr")
905
1052
  chisq = add_tolerence(chisq, float_tolerence, "lr")
906
-
907
-
1053
+ ############################################################################################
908
1054
  ## add direction
909
1055
  if coltocheck is None:
910
1056
  coltocheck = ["P","MLOG10P","INFO","Z","BETA","SE","EAF","CHISQ","F","N","N_CASE","N_CONTROL","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","STATUS"]
911
- if verbose: log.write("Start sanity check for statistics...{}".format(_get_version()))
912
- check_dataframe_shape(sumstats, log, verbose)
1057
+
913
1058
  cols_to_check=[]
914
1059
  oringinal_number=len(sumstats)
915
1060
  sumstats = sumstats.copy()
916
1061
 
917
- if verbose: log.write(" -Comparison tolerance for floats: {}".format(float_tolerence))
918
- ###SAMPLE SIZE################################################################################################################################################
919
- pre_number=len(sumstats)
920
- if "N" in coltocheck and "N" in sumstats.columns:
921
- cols_to_check.append("N")
922
- if verbose: log.write(" -Checking if ",n[0],"<=N<=",n[1]," ...")
923
- sumstats.loc[:,"N"] = np.floor(pd.to_numeric(sumstats.loc[:,"N"], errors='coerce')).astype("Int64")
924
- sumstats = sumstats.loc[(sumstats["N"]>=n[0]) & (sumstats["N"]<=n[1]),:]
925
- after_number=len(sumstats)
926
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N.")
927
- pre_number=len(sumstats)
928
- if "N_CASE" in coltocheck and "N_CASE" in sumstats.columns:
929
- cols_to_check.append("N_CASE")
930
- if verbose: log.write(" -Checking if ",ncase[0],"<=N_CASE<=",ncase[1]," ...")
931
- sumstats.loc[:,"N_CASE"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CASE"], errors='coerce')).astype("Int64")
932
- sumstats = sumstats.loc[(sumstats["N_CASE"]>=ncase[0]) & (sumstats["N_CASE"]<=ncase[1]),:]
933
- after_number=len(sumstats)
934
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N_CASE.")
935
- pre_number=len(sumstats)
936
- if "N_CONTROL" in coltocheck and "N_CONTROL" in sumstats.columns:
937
- cols_to_check.append("N_CONTROL")
938
- if verbose: log.write(" -Checking if ",ncontrol[0],"<=N_CONTROL<=",ncontrol[1]," ...")
939
- sumstats.loc[:,"N_CONTROL"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CONTROL"], errors='coerce')).astype("Int64")
940
- sumstats = sumstats.loc[(sumstats["N_CONTROL"]>=ncontrol[0]) & (sumstats["N_CONTROL"]<=ncontrol[1]),:]
941
- after_number=len(sumstats)
942
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N_CONTROL.")
1062
+ ###Int64 ################################################################################################################################################
1063
+ sumstats = check_range(sumstats, var_range=n, header="N", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
1064
+ sumstats = check_range(sumstats, var_range=ncase, header="N_CASE", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
1065
+ sumstats = check_range(sumstats, var_range=ncontrol, header="N_CONTROL", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
943
1066
 
1067
+ ###float32 ################################################################################################################################################
1068
+ sumstats = check_range(sumstats, var_range=eaf, header="EAF", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
1069
+ sumstats = check_range(sumstats, var_range=maf, header="MAF", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
1070
+ sumstats = check_range(sumstats, var_range=info, header="INFO", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
944
1071
 
945
- ###ALLELE FREQUENCY################################################################################################################################################
946
- pre_number=len(sumstats)
947
- if "EAF" in coltocheck and "EAF" in sumstats.columns:
948
- cols_to_check.append("EAF")
949
- if verbose: log.write(" -Checking if ",eaf[0],"<EAF<",eaf[1]," ...")
950
- sumstats.loc[:,"EAF"] = pd.to_numeric(sumstats.loc[:,"EAF"], errors='coerce').astype("float32")
951
- sumstats = sumstats.loc[(sumstats["EAF"]>eaf[0]) & (sumstats["EAF"]<eaf[1]),:]
952
- after_number=len(sumstats)
953
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad EAF.")
954
-
955
- pre_number=len(sumstats)
956
- if "EAF" in coltocheck and "EAF" in sumstats.columns and "N" in coltocheck and "N" in sumstats.columns:
957
- if verbose: log.write(" -Checking if ",mac[0],"<=MAC<=",mac[1]," ...")
958
- sumstats["_MAF"]=sumstats["EAF"]
959
- sumstats.loc[sumstats["EAF"]>0.5,"_MAF"] = 1 - sumstats.loc[sumstats["EAF"]>0.5,"EAF"]
960
- sumstats["_MAC"] = np.floor(pd.to_numeric(sumstats.loc[:,"_MAF"] * sumstats.loc[:,"N"], errors='coerce')).astype("int64")
961
- macl = ( sumstats["_MAC"] >= mac[0])
962
- macu = ( sumstats["_MAC"] <= mac[1])
963
- sumstats = sumstats.loc[macl&macu,:]
964
- sumstats = sumstats.drop(labels=["_MAF","_MAC"],axis=1)
965
- after_number=len(sumstats)
966
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad MAC.")
967
-
968
- ###TEST STATISTICS################################################################################################################################################
969
- pre_number=len(sumstats)
970
- if "CHISQ" in coltocheck and "CHISQ" in sumstats.columns:
971
- cols_to_check.append("CHISQ")
972
- if verbose: log.write(" -Checking if ",chisq[0],"<CHISQ<",chisq[1]," ...")
973
- sumstats.loc[:,"CHISQ"] = pd.to_numeric(sumstats.loc[:,"CHISQ"], errors='coerce').astype("float64")
974
- sumstats = sumstats.loc[(sumstats["CHISQ"]>chisq[0]) & (sumstats["CHISQ"]<chisq[1]),:]
975
- after_number=len(sumstats)
976
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad CHISQ.")
977
-
978
- pre_number=len(sumstats)
979
- if "Z" in coltocheck and "Z" in sumstats.columns:
980
- cols_to_check.append("Z")
981
- if verbose: log.write(" -Checking if ",z[0],"<Z<",z[1]," ...")
982
- sumstats.loc[:,"Z"] = pd.to_numeric(sumstats.loc[:,"Z"], errors='coerce').astype("float64")
983
- sumstats = sumstats.loc[(sumstats["Z"]>z[0]) & (sumstats["Z"]<z[1]),:]
984
- after_number=len(sumstats)
985
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad Z.")
986
-
987
- pre_number=len(sumstats)
988
- if "F" in coltocheck and "F" in sumstats.columns:
989
- cols_to_check.append("F")
990
- if verbose: log.write(" -Checking if ",f[0],"<F<",f[1]," ...")
991
- sumstats.loc[:,"F"] = pd.to_numeric(sumstats.loc[:,"F"], errors='coerce').astype("float64")
992
- sumstats = sumstats.loc[(sumstats["F"]>f[0]) & (sumstats["F"]<f[1]),:]
993
- after_number=len(sumstats)
994
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad F.")
995
-
996
- ###P ################################################################################################################################################
997
- pre_number=len(sumstats)
998
- if "P" in coltocheck and "P" in sumstats.columns:
999
- cols_to_check.append("P")
1000
- if verbose: log.write(" -Checking if ",p[0],"< P <",p[1]," ...")
1001
- sumstats.loc[:,"P"] = pd.to_numeric(sumstats.loc[:,"P"], errors='coerce').astype("float64")
1002
- sumstats = sumstats.loc[(sumstats["P"]>p[0]) & (sumstats["P"]<p[1]),:]
1003
-
1004
- is_low_p = sumstats["P"] == 0
1005
- if sum(is_low_p) >0:
1006
- log.write(" -WARNING! Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(sum(is_low_p)), verbose=verbose)
1007
- log.write(" -WARNING! Please consider using MLOG10P instead.", verbose=verbose)
1008
- after_number=len(sumstats)
1009
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad P.")
1010
-
1011
- pre_number=len(sumstats)
1012
- if "MLOG10P" in coltocheck and "MLOG10P" in sumstats.columns:
1013
- cols_to_check.append("MLOG10P")
1014
- if verbose: log.write(" -Checking if ",mlog10p[0],"<MLOG10P<",mlog10p[1]," ...")
1015
- sumstats.loc[:,"MLOG10P"] = pd.to_numeric(sumstats.loc[:,"MLOG10P"], errors='coerce').astype("float64")
1016
- sumstats = sumstats.loc[(sumstats["MLOG10P"]>mlog10p[0]) & (sumstats["MLOG10P"]<mlog10p[1]),:]
1017
- after_number=len(sumstats)
1018
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad MLOG10P.")
1019
-
1020
- ###EFFECT ################################################################################################################################################
1021
- pre_number=len(sumstats)
1022
- if "BETA" in coltocheck and "BETA" in sumstats.columns:
1023
- cols_to_check.append("BETA")
1024
- if verbose: log.write(" -Checking if ",beta[0],"<BETA<",beta[1]," ...")
1025
- sumstats.loc[:,"BETA"] = pd.to_numeric(sumstats.loc[:,"BETA"], errors='coerce').astype("float64")
1026
- sumstats = sumstats.loc[(sumstats["BETA"]>beta[0]) & (sumstats["BETA"]<beta[1]),:]
1027
- after_number=len(sumstats)
1028
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad BETA.")
1029
-
1030
- pre_number=len(sumstats)
1031
- if "SE" in coltocheck and "SE" in sumstats.columns:
1032
- cols_to_check.append("SE")
1033
- if verbose: log.write(" -Checking if ",se[0],"<SE<",se[1]," ...")
1034
- sumstats.loc[:,"SE"] = pd.to_numeric(sumstats.loc[:,"SE"], errors='coerce').astype("float64")
1035
- sumstats = sumstats.loc[(sumstats["SE"]>se[0]) & (sumstats["SE"]<se[1]),:]
1036
- after_number=len(sumstats)
1037
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad SE.")
1038
-
1039
- pre_number=len(sumstats)
1040
- if "OR" in coltocheck and "OR" in sumstats.columns:
1041
- cols_to_check.append("OR")
1042
- if verbose: log.write(" -Checking if ",OR[0],"<log(OR)<",OR[1]," ...")
1043
- sumstats.loc[:,"OR"] = pd.to_numeric(sumstats.loc[:,"OR"], errors='coerce').astype("float64")
1044
- sumstats = sumstats.loc[(np.log(sumstats["OR"])>OR[0]) & (np.log(sumstats["OR"])<OR[1]),:]
1045
- after_number=len(sumstats)
1046
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad OR.")
1047
-
1048
- pre_number=len(sumstats)
1049
- if "OR_95L" in coltocheck and "OR_95L" in sumstats.columns:
1050
- cols_to_check.append("OR_95L")
1051
- if verbose: log.write(" -Checking if ",OR_95L[0],"<OR_95L<",OR_95L[1]," ...")
1052
- sumstats.loc[:,"OR_95L"] = pd.to_numeric(sumstats.loc[:,"OR_95L"], errors='coerce').astype("float64")
1053
- sumstats = sumstats.loc[(sumstats["OR_95L"]>OR_95L[0]) & (sumstats["OR_95L"]<OR_95L[1]),:]
1054
- after_number=len(sumstats)
1055
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad OR_95L.")
1056
-
1057
- pre_number=len(sumstats)
1058
- if "OR_95U" in coltocheck and "OR_95U" in sumstats.columns:
1059
- cols_to_check.append("OR_95U")
1060
- if verbose: log.write(" -Checking if ",OR_95U[0],"<OR_95U<",OR_95U[1]," ...")
1061
- sumstats.loc[:,"OR_95U"] = pd.to_numeric(sumstats.loc[:,"OR_95U"], errors='coerce').astype("float64")
1062
- sumstats = sumstats.loc[(sumstats["OR_95U"]>OR_95U[0]) & (sumstats["OR_95U"]<OR_95U[1]),:]
1063
- after_number=len(sumstats)
1064
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad OR_95U.")
1065
-
1066
- pre_number=len(sumstats)
1067
- if "HR" in coltocheck and "HR" in sumstats.columns:
1068
- cols_to_check.append("HR")
1069
- if verbose: log.write(" -Checking if ",HR[0],"<log(HR)<",HR[1]," ...")
1070
- sumstats.loc[:,"HR"] = pd.to_numeric(sumstats.loc[:,"HR"], errors='coerce').astype("float64")
1071
- sumstats = sumstats.loc[(np.log(sumstats["HR"])>HR[0]) & (np.log(sumstats["HR"])<HR[1]),:]
1072
- after_number=len(sumstats)
1073
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad HR.")
1074
-
1075
- pre_number=len(sumstats)
1076
- if "HR_95L" in coltocheck and "HR_95L" in sumstats.columns:
1077
- cols_to_check.append("HR_95L")
1078
- if verbose: log.write(" -Checking if ",HR_95L[0],"<HR_95L<",HR_95L[1]," ...")
1079
- sumstats.loc[:,"HR_95L"] = pd.to_numeric(sumstats.loc[:,"HR_95L"], errors='coerce').astype("float64")
1080
- sumstats = sumstats.loc[(sumstats["HR_95L"]>HR_95L[0]) & (sumstats["HR_95L"]<HR_95L[1]),:]
1081
- after_number=len(sumstats)
1082
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad HR_95L.")
1083
-
1084
- pre_number=len(sumstats)
1085
- if "HR_95U" in coltocheck and "HR_95U" in sumstats.columns:
1086
- cols_to_check.append("HR_95U")
1087
- if verbose: log.write(" -Checking if ",HR_95U[0],"<HR_95U<",HR_95U[1]," ...")
1088
- sumstats.loc[:,"HR_95U"] = pd.to_numeric(sumstats.loc[:,"HR_95U"], errors='coerce').astype("float64")
1089
- sumstats = sumstats.loc[(sumstats["HR_95U"]>HR_95U[0]) & (sumstats["HR_95U"]<HR_95U[1]),:]
1090
- after_number=len(sumstats)
1091
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad HR_95U.")
1092
- #INFO #################################################################################################################
1093
- pre_number=len(sumstats)
1094
- if "INFO" in coltocheck and "INFO" in sumstats.columns:
1095
- cols_to_check.append("INFO")
1096
- if verbose: log.write(" -Checking if ",info[0],"<INFO<",info[1]," ...")
1097
- sumstats.loc[:,"INFO"] = pd.to_numeric(sumstats.loc[:,"INFO"], errors='coerce').astype("float32")
1098
- sumstats = sumstats.loc[(sumstats["INFO"]>info[0]) & (sumstats["INFO"]<info[1]),:]
1099
- after_number=len(sumstats)
1100
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad INFO.")
1101
- ###STATUS ################################################################################################################################################
1102
- pre_number=len(sumstats)
1103
- if "STATUS" in coltocheck and "STATUS" in sumstats.columns:
1104
- cols_to_check.append("STATUS")
1105
- if verbose: log.write(" -Checking STATUS and converting STATUS to categories....")
1106
- categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
1107
- sumstats.loc[:,"STATUS"] = pd.Categorical(sumstats["STATUS"],categories=categories)
1108
-
1109
- #pre_number=len(sumstats)
1110
- #sumstats = sumstats.dropna(subset=cols_to_check)
1111
- after_number=len(sumstats)
1112
- #if verbose:log.write(" -Removed {} variants with NAs in the checked columns...".format(pre_number - after_number))
1072
+ ###float64 ################################################################################################################################################
1073
+ sumstats = check_range(sumstats, var_range=chisq, header="CHISQ", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1074
+ sumstats = check_range(sumstats, var_range=z, header="Z", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1075
+ sumstats = check_range(sumstats, var_range=t, header="T", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1076
+ sumstats = check_range(sumstats, var_range=f, header="F", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1077
+ sumstats = check_range(sumstats, var_range=p, header="P", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1078
+ sumstats = check_range(sumstats, var_range=mlog10p, header="MLOG10P", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1079
+ sumstats = check_range(sumstats, var_range=beta, header="BETA", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1080
+ sumstats = check_range(sumstats, var_range=se, header="SE", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1081
+ sumstats = check_range(sumstats, var_range=OR, header="OR", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1082
+ sumstats = check_range(sumstats, var_range=OR_95L, header="OR_95L", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1083
+ sumstats = check_range(sumstats, var_range=OR_95U, header="OR_95U", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1084
+ sumstats = check_range(sumstats, var_range=HR, header="HR", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1085
+ sumstats = check_range(sumstats, var_range=HR_95L, header="HR_95L", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1086
+ sumstats = check_range(sumstats, var_range=HR_95U, header="HR_95U", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1087
+ ###STATUS ###############################################################################################################################################
1088
+ sumstats = check_range(sumstats, var_range=None, header="STATUS", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="category")
1113
1089
 
1114
- if verbose: log.write(" -Removed "+str(oringinal_number - after_number)+" variants with bad statistics in total.")
1115
- if verbose:
1116
- log.write(" -Data types for each column:")
1117
- check_datatype(sumstats,verbose=verbose, log=log)
1118
- if verbose: log.write("Finished sanity check successfully!")
1090
+ after_number=len(sumstats)
1091
+ log.write(" -Removed "+str(oringinal_number - after_number)+" variants with bad statistics in total.",verbose=verbose)
1092
+ log.write(" -Data types for each column:",verbose=verbose)
1093
+ check_datatype(sumstats,verbose=verbose, log=log)
1094
+ finished(log,verbose,_end_line)
1119
1095
  return sumstats
1120
1096
 
1121
1097
  ### check consistency #############################################################################################################################################
1122
1098
 
1123
- def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verbose=True,log=Log()):
1124
- if verbose: log.write("Start to check data consistency across columns...{}".format(_get_version()))
1125
- check_dataframe_shape(sumstats, log, verbose)
1099
+ def _check_data_consistency(sumstats, beta="BETA", se="SE", p="P",mlog10p="MLOG10P",rtol=1e-3, atol=1e-3, equal_nan=True, verbose=True,log=Log()):
1100
+ ##start function with col checking##########################################################
1101
+ _start_line = "check data consistency across columns"
1102
+ _end_line = "checking data consistency across columns"
1103
+ _start_cols =[]
1104
+ _start_function = ".check_data_consistency()"
1105
+ _must_args ={}
1106
+
1107
+ is_enough_info = start_to(sumstats=sumstats,
1108
+ log=log,
1109
+ verbose=verbose,
1110
+ start_line=_start_line,
1111
+ end_line=_end_line,
1112
+ start_cols=_start_cols,
1113
+ start_function=_start_function,
1114
+ **_must_args)
1115
+ if is_enough_info == False: return sumstats
1116
+ ############################################################################################
1117
+
1126
1118
  log.write(" -Tolerance: {} (Relative) and {} (Absolute)".format(rtol, atol),verbose=verbose)
1119
+ check_status = 0
1127
1120
 
1128
-
1129
- if "SNPID" not in sumstats.columns:
1121
+ if "SNPID" in sumstats.columns:
1122
+ id_to_use = "SNPID"
1123
+ elif "rsID" in sumstats.columns:
1130
1124
  id_to_use = "rsID"
1131
1125
  else:
1132
- id_to_use = "SNPID"
1126
+ log.write(" -SNPID/rsID not available...SKipping",verbose=verbose)
1127
+ log.write("Finished checking data consistency across columns.",verbose=verbose)
1128
+ return 0
1129
+
1133
1130
 
1134
1131
  if "BETA" in sumstats.columns and "SE" in sumstats.columns:
1135
1132
  if "MLOG10P" in sumstats.columns:
@@ -1138,10 +1135,11 @@ def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verb
1138
1135
  is_close = np.isclose(betase_derived_mlog10p, sumstats["MLOG10P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
1139
1136
  diff = betase_derived_mlog10p - sumstats["MLOG10P"]
1140
1137
  if sum(~is_close)>0:
1141
- log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose))
1142
- log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose))
1138
+ log.write(" -Not consistent: {} variant(s)".format(sum(~is_close)),verbose=verbose)
1139
+ log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max()),verbose=verbose)
1143
1140
  else:
1144
1141
  log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
1142
+ check_status=1
1145
1143
 
1146
1144
  if "P" in sumstats.columns:
1147
1145
  log.write(" -Checking if BETA/SE-derived-P is consistent with P...",verbose=verbose)
@@ -1149,10 +1147,11 @@ def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verb
1149
1147
  is_close = np.isclose(betase_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
1150
1148
  diff = betase_derived_p - sumstats["P"]
1151
1149
  if sum(~is_close)>0:
1152
- log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose))
1153
- log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose))
1150
+ log.write(" -Not consistent: {} variant(s)".format(sum(~is_close)),verbose=verbose)
1151
+ log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max()),verbose=verbose)
1154
1152
  else:
1155
1153
  log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
1154
+ check_status=1
1156
1155
 
1157
1156
  if "MLOG10P" in sumstats.columns and "P" in sumstats.columns:
1158
1157
  log.write(" -Checking if MLOG10P-derived-P is consistent with P...",verbose=verbose)
@@ -1160,25 +1159,30 @@ def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verb
1160
1159
  is_close = np.isclose(mlog10p_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
1161
1160
  diff = mlog10p_derived_p - sumstats["P"]
1162
1161
  if sum(~is_close)>0:
1163
- log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose))
1164
- log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose))
1162
+ log.write(" -Not consistent: {} variant(s)".format(sum(~is_close)),verbose=verbose)
1163
+ log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max()),verbose=verbose)
1165
1164
  else:
1166
1165
  log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
1166
+ check_status=1
1167
1167
 
1168
1168
  if "N" in sumstats.columns and "N_CONTROL" in sumstats.columns and "N_CASE" in sumstats.columns:
1169
1169
  if verbose: log.write(" -Checking if N is consistent with N_CASE + N_CONTROL ...")
1170
- is_close = sumstats.loc[:,"N"] == sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"]
1171
- #is_close = np.isclose(sumstats.loc[:,"N"], sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"] , rtol=rtol, atol=atol, equal_nan=equal_nan)
1172
- diff = abs(sumstats.loc[:,"N"] - (sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"] ))
1170
+ is_close = sumstats["N"] == sumstats["N_CASE"] + sumstats["N_CONTROL"]
1171
+ #is_close = np.isclose(sumstats["N"], sumstats["N_CASE"] + sumstats["N_CONTROL"] , rtol=rtol, atol=atol, equal_nan=equal_nan)
1172
+ diff = abs(sumstats["N"] - (sumstats["N_CASE"] + sumstats["N_CONTROL"] ))
1173
1173
  if sum(~is_close)>0:
1174
- log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose))
1175
- log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose))
1174
+ log.write(" -Not consistent: {} variant(s)".format(sum(~is_close)),verbose=verbose)
1175
+ log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max()),verbose=verbose)
1176
1176
  else:
1177
1177
  log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
1178
+ check_status=1
1179
+
1180
+ if check_status==1:
1181
+ log.write(" -Note: if the max difference is greater than expected, please check your original sumstats.",verbose=verbose)
1182
+ else:
1183
+ log.write(" -No availalbe columns for data consistency checking...Skipping...",verbose=verbose)
1184
+ finished(log,verbose,_end_line)
1178
1185
 
1179
- log.write(" -Note: if the max difference is greater than expected, please check your original sumstats.",verbose=verbose)
1180
-
1181
- if verbose: log.write("Finished checking data consistency across columns.")
1182
1186
  ###############################################################################################################
1183
1187
  # 20220426
1184
1188
  def get_reverse_complementary_allele(a):
@@ -1201,11 +1205,81 @@ def flip_direction(string):
1201
1205
  else: #sometime it is 0
1202
1206
  flipped_string+=char
1203
1207
  return flipped_string
1204
-
1208
+
1209
def flip_by_swap(sumstats, matched_index, log, verbose):
    """Exchange the EA and NEA values on the selected rows.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Summary statistics table; modified in place.
    matched_index :
        Row selector passed straight to ``DataFrame.loc`` (callers in this
        file pass a boolean Series).
    log :
        Logger exposing ``write(str)``; only used when ``verbose`` is truthy.
    verbose : bool
        Whether to emit the progress message.

    Returns
    -------
    pandas.DataFrame
        The same ``sumstats`` object that was passed in.
    """
    present = sumstats.columns
    # Nothing to do unless both allele columns exist.
    if "NEA" not in present or "EA" not in present:
        return sumstats

    if verbose:
        log.write(" -Swapping column: NEA <=> EA...")

    # Materialise the right-hand side as a plain array first so that pandas
    # assigns positionally instead of re-aligning on column labels (label
    # alignment would turn the swap into a no-op).
    exchanged = sumstats.loc[matched_index, ["EA", "NEA"]].values
    sumstats.loc[matched_index, ["NEA", "EA"]] = exchanged
    return sumstats
1214
+
1215
def flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Invert ratio statistics (OR / HR and their 95% bounds) on selected rows.

    For each of ``OR`` and ``HR`` the point estimate becomes
    ``factor / value``. The confidence bounds are inverted *and* exchanged:
    the new upper bound is ``factor / old lower`` and the new lower bound is
    ``factor / old upper``.

    Both bounds are read before either is written. Writing ``*_95U`` first
    and then computing ``*_95L`` from the freshly written ``*_95U`` would
    merely restore the original lower bound and lose the original upper one.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Summary statistics table; modified in place.
    matched_index :
        Row selector passed straight to ``DataFrame.loc`` (callers in this
        file pass a boolean Series).
    log :
        Logger exposing ``write(str)``; only used when ``verbose`` is truthy.
    verbose : bool
        Whether to emit progress messages.
    cols :
        Accepted for interface compatibility; not used by this function.
    factor : numeric, default 1
        Numerator of the inversion.

    Returns
    -------
    pandas.DataFrame
        The same ``sumstats`` object that was passed in.
    """
    for stat in ("OR", "HR"):
        lower = stat + "_95L"
        upper = stat + "_95U"

        if stat in sumstats.columns:
            if verbose:
                log.write(" -Flipping column: {0} = 1 / {0}...".format(stat))
            sumstats.loc[matched_index, stat] = factor / sumstats.loc[matched_index, stat].values

        # Snapshot column presence and values BEFORE any write so the two
        # assignments below cannot observe each other's results.
        has_lower = lower in sumstats.columns
        has_upper = upper in sumstats.columns
        old_lower = sumstats.loc[matched_index, lower].values.copy() if has_lower else None
        old_upper = sumstats.loc[matched_index, upper].values.copy() if has_upper else None

        # NOTE(review): when only one bound column exists, the opposite
        # column is created (as the pre-fix code did) and the existing one
        # is left untouched for the matched rows — confirm this is the
        # desired handling of one-sided intervals.
        if has_lower:
            if verbose:
                log.write(" -Flipping column: {} = 1 / {}...".format(upper, lower))
            sumstats.loc[matched_index, upper] = factor / old_lower
        if has_upper:
            if verbose:
                log.write(" -Flipping column: {} = 1 / {}...".format(lower, upper))
            sumstats.loc[matched_index, lower] = factor / old_upper
    return sumstats
1235
+
1236
def flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Replace EAF with ``factor - EAF`` on the selected rows.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Summary statistics table; modified in place.
    matched_index :
        Row selector passed straight to ``DataFrame.loc`` (callers in this
        file pass a boolean Series).
    log :
        Logger exposing ``write(str)``; only used when ``verbose`` is truthy.
    verbose : bool
        Whether to emit the progress message.
    cols :
        Accepted for interface compatibility; not used by this function.
    factor : numeric, default 1
        Value the current EAF is subtracted from.

    Returns
    -------
    pandas.DataFrame
        The same ``sumstats`` object that was passed in.
    """
    # Guard: frames without an EAF column are returned untouched.
    if "EAF" not in sumstats.columns:
        return sumstats

    if verbose:
        log.write(" -Flipping column: EAF = 1 - EAF...")

    current_eaf = sumstats.loc[matched_index, "EAF"].values
    sumstats.loc[matched_index, "EAF"] = factor - current_eaf
    return sumstats
1241
+
1242
def flip_by_sign(sumstats, matched_index, log, verbose, cols=None):
    """Negate signed effect statistics on the selected rows.

    ``BETA``, ``Z`` and ``T`` are negated in place. The ``BETA`` confidence
    bounds are negated *and* exchanged (new upper = ``-old lower``, new lower
    = ``-old upper``). ``DIRECTION`` strings are transformed with
    ``flip_direction``.

    Two defects of the previous implementation are fixed here:

    * the bounds were written one after the other, so the second assignment
      read the value just written by the first and restored the original
      lower bound; both bounds are now read before either is written;
    * the ``T`` branch stored ``-T`` into column ``"Z"``, overwriting Z and
      leaving T unflipped; it now writes back to ``"T"``.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Summary statistics table; modified in place.
    matched_index :
        Row selector passed straight to ``DataFrame.loc`` (callers in this
        file pass a boolean Series).
    log :
        Logger exposing ``write(str)``; only used when ``verbose`` is truthy.
    verbose : bool
        Whether to emit progress messages.
    cols :
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        The same ``sumstats`` object that was passed in.
    """
    if "BETA" in sumstats.columns:
        if verbose:
            log.write(" -Flipping column: BETA = - BETA...")
        sumstats.loc[matched_index, "BETA"] = -sumstats.loc[matched_index, "BETA"].values

    # Snapshot column presence and values BEFORE any write so the two bound
    # assignments below cannot observe each other's results.
    has_lower = "BETA_95L" in sumstats.columns
    has_upper = "BETA_95U" in sumstats.columns
    old_lower = sumstats.loc[matched_index, "BETA_95L"].values.copy() if has_lower else None
    old_upper = sumstats.loc[matched_index, "BETA_95U"].values.copy() if has_upper else None

    # NOTE(review): when only one bound column exists, the opposite column is
    # created (as the pre-fix code did) and the existing one is left
    # untouched for the matched rows — confirm this is the desired handling.
    if has_lower:
        if verbose:
            log.write(" -Flipping column: BETA_95U = - BETA_95L...")
        sumstats.loc[matched_index, "BETA_95U"] = -old_lower
    if has_upper:
        if verbose:
            log.write(" -Flipping column: BETA_95L = - BETA_95U...")
        sumstats.loc[matched_index, "BETA_95L"] = -old_upper

    # Test statistics carry the sign of the effect; each is negated into its
    # OWN column.
    for stat in ("Z", "T"):
        if stat in sumstats.columns:
            if verbose:
                log.write(" -Flipping column: {0} = - {0}...".format(stat))
            sumstats.loc[matched_index, stat] = -sumstats.loc[matched_index, stat].values

    if "DIRECTION" in sumstats.columns:
        if verbose:
            log.write(" -Flipping column: DIRECTION +-?0 <=> -+?0 ...")
        sumstats.loc[matched_index, "DIRECTION"] = sumstats.loc[matched_index, "DIRECTION"].apply(flip_direction)
    return sumstats
1262
+
1205
1263
  def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1206
-
1207
- check_dataframe_shape(sumstats, log, verbose)
1208
-
1264
+ ##start function with col checking##########################################################
1265
+ _start_line = "adjust statistics based on STATUS code"
1266
+ _end_line = "adjusting statistics based on STATUS code"
1267
+ _start_cols =[]
1268
+ _start_function = ".check_data_consistency()"
1269
+ _must_args ={}
1270
+
1271
+ is_enough_info = start_to(sumstats=sumstats,
1272
+ log=log,
1273
+ verbose=verbose,
1274
+ start_line=_start_line,
1275
+ end_line=_end_line,
1276
+ start_cols=_start_cols,
1277
+ start_function=_start_function,
1278
+ **_must_args)
1279
+ if is_enough_info == False: return sumstats
1280
+ ############################################################################################
1281
+
1282
+ if_stats_flipped = False
1209
1283
  ###################get reverse complementary####################
1210
1284
  pattern = r"\w\w\w\w\w[45]\w"
1211
1285
  #matched_index = status_match(sumstats[status],6,[4,5]) #
@@ -1217,107 +1291,49 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1217
1291
  if verbose: log.write(" -Converting to reverse complement : EA and NEA...")
1218
1292
  reverse_complement_nea = sumstats.loc[matched_index,'NEA'].apply(lambda x :get_reverse_complementary_allele(x))
1219
1293
  reverse_complement_ea = sumstats.loc[matched_index,'EA'].apply(lambda x :get_reverse_complementary_allele(x))
1220
- categories = set(sumstats.loc[:,'EA'])|set(sumstats.loc[:,'NEA']) |set(reverse_complement_ea) |set(reverse_complement_nea)
1221
- sumstats.loc[:,'EA']=pd.Categorical(sumstats.loc[:,'EA'],categories = categories)
1222
- sumstats.loc[:,'NEA']=pd.Categorical(sumstats.loc[:,'NEA'],categories = categories )
1294
+ categories = set(sumstats['EA'])|set(sumstats['NEA']) |set(reverse_complement_ea) |set(reverse_complement_nea)
1295
+ sumstats['EA']=pd.Categorical(sumstats['EA'],categories = categories)
1296
+ sumstats['NEA']=pd.Categorical(sumstats['NEA'],categories = categories )
1223
1297
  sumstats.loc[matched_index,['NEA']] = reverse_complement_nea
1224
1298
  sumstats.loc[matched_index,['EA']] = reverse_complement_ea
1225
1299
  sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 6, "4","2")
1226
1300
  if verbose: log.write(" -Changed the status for flipped variants : xxxxx4x -> xxxxx2x")
1227
-
1301
+ if_stats_flipped = True
1228
1302
  ###################flip ref####################
1229
1303
  pattern = r"\w\w\w\w\w[35]\w"
1230
1304
  #matched_index = status_match(sumstats[status],6,[3,5]) #sumstats[status].str.match(pattern)
1231
1305
  matched_index = sumstats[status].str[5].str.match(r"3|5")
1232
1306
  if sum(matched_index)>0:
1233
- if verbose: log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x: alt->ea , ref->nea ...{}".format(_get_version()))
1307
+ if verbose: log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x: ALT->EA , REF->NEA ...{}".format(_get_version()))
1234
1308
  if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
1235
- if ("NEA" in sumstats.columns) and ("EA" in sumstats.columns) :
1236
- if verbose: log.write(" -Swapping column: NEA <=> EA...")
1237
- sumstats.loc[matched_index,['NEA','EA']] = sumstats.loc[matched_index,['EA','NEA']].values
1238
- if "BETA" in sumstats.columns:
1239
- if verbose: log.write(" -Flipping column: BETA = - BETA...")
1240
- sumstats.loc[matched_index,"BETA"] = - sumstats.loc[matched_index,"BETA"].values
1241
- if "BETA_95L" in sumstats.columns:
1242
- if verbose: log.write(" -Flipping column: BETA_95L = - BETA_95L...")
1243
- sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
1244
- if "BETA_95U" in sumstats.columns:
1245
- if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
1246
- sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
1247
- if "EAF" in sumstats.columns:
1248
- if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
1249
- sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
1250
- if "OR" in sumstats.columns:
1251
- if verbose: log.write(" -Flipping column: OR = 1 / OR...")
1252
- sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
1253
- if "OR_95L" in sumstats.columns:
1254
- if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
1255
- sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
1256
- if "OR_95U" in sumstats.columns:
1257
- if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
1258
- sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
1259
- if "HR" in sumstats.columns:
1260
- if verbose: log.write(" -Flipping column: HR = 1 / HR...")
1261
- sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
1262
- if "HR_95L" in sumstats.columns:
1263
- if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
1264
- sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
1265
- if "HR_95U" in sumstats.columns:
1266
- if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
1267
- sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
1268
- if "DIRECTION" in sumstats.columns:
1269
- if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
1270
- sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
1309
+
1310
+ flip_by_swap(sumstats, matched_index, log, verbose)
1311
+ flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
1312
+ flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
1313
+ flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
1314
+
1271
1315
  #change status
1272
1316
  if verbose: log.write(" -Changed the status for flipped variants : xxxxx[35]x -> xxxxx[12]x")
1273
1317
  sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 6, "35","12")
1318
+ if_stats_flipped = True
1274
1319
 
1275
1320
  ###################flip ref for undistingushable indels####################
1276
1321
  pattern = r"\w\w\w\w[123][67]6"
1277
1322
  #matched_index = status_match(sumstats[status],6,[1,2,3])|status_match(sumstats[status],6,[6,7])|status_match(sumstats[status],7,6) #sumstats[status].str.match(pattern)
1278
1323
  matched_index = sumstats[status].str[4:].str.match(r"[123][67]6")
1279
1324
  if sum(matched_index)>0:
1280
- if verbose: log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]: alt->ea , ref->nea...{}".format(_get_version()))
1325
+ if verbose: log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]: ALT->EA , REF->NEA...{}".format(_get_version()))
1281
1326
  if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
1282
- if ("NEA" in sumstats.columns) and ("EA" in sumstats.columns) :
1283
- if verbose: log.write(" -Swapping column: NEA <=> EA...")
1284
- sumstats.loc[matched_index,['NEA','EA']] = sumstats.loc[matched_index,['EA','NEA']].values
1285
- if "BETA" in sumstats.columns:
1286
- if verbose: log.write(" -Flipping column: BETA = - BETA...")
1287
- sumstats.loc[matched_index,"BETA"] = - sumstats.loc[matched_index,"BETA"].values
1288
- if "BETA_95L" in sumstats.columns:
1289
- if verbose: log.write(" -Flipping column: BETA_95L = - BETA_95L...")
1290
- sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
1291
- if "BETA_95U" in sumstats.columns:
1292
- if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
1293
- sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
1294
- if "EAF" in sumstats.columns:
1295
- if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
1296
- sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
1297
- if "OR" in sumstats.columns:
1298
- if verbose: log.write(" -Flipping column: OR = 1 / OR...")
1299
- sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
1300
- if "OR_95L" in sumstats.columns:
1301
- if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
1302
- sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
1303
- if "OR_95U" in sumstats.columns:
1304
- if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
1305
- sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
1306
- if "HR" in sumstats.columns:
1307
- if verbose: log.write(" -Flipping column: HR = 1 / HR...")
1308
- sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
1309
- if "HR_95L" in sumstats.columns:
1310
- if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
1311
- sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
1312
- if "HR_95U" in sumstats.columns:
1313
- if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
1314
- sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
1315
- if "DIRECTION" in sumstats.columns:
1316
- if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
1317
- sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
1327
+
1328
+ flip_by_swap(sumstats, matched_index, log, verbose)
1329
+ flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
1330
+ flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
1331
+ flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
1332
+
1318
1333
  #change status
1319
1334
  if verbose: log.write(" -Changed the status for flipped variants xxxx[123][67]6 -> xxxx[123][67]4")
1320
1335
  sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 7, "6","4")
1336
+ if_stats_flipped = True
1321
1337
  # flip ref
1322
1338
  ###################flip statistics for reverse strand panlindromic variants####################
1323
1339
  pattern = r"\w\w\w\w\w[012]5"
@@ -1326,43 +1342,20 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1326
1342
  if sum(matched_index)>0:
1327
1343
  if verbose: log.write("Start to flip allele-specific stats for palindromic SNPs with status xxxxx[12]5: (-)strand <=> (+)strand...{}".format(_get_version()))
1328
1344
  if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
1329
- if "BETA" in sumstats.columns:
1330
- if verbose: log.write(" -Flipping column: BETA = - BETA...")
1331
- sumstats.loc[matched_index,"BETA"] = - sumstats.loc[matched_index,"BETA"].values
1332
- if "BETA_95L" in sumstats.columns:
1333
- if verbose: log.write(" -Flipping column: BETA_95L = - BETA_95L...")
1334
- sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
1335
- if "BETA_95U" in sumstats.columns:
1336
- if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
1337
- sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
1338
- if "EAF" in sumstats.columns:
1339
- if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
1340
- sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
1341
- if "OR" in sumstats.columns:
1342
- if verbose: log.write(" -Flipping column: OR = 1 / OR...")
1343
- sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
1344
- if "OR_95L" in sumstats.columns:
1345
- if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
1346
- sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
1347
- if "OR_95U" in sumstats.columns:
1348
- if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
1349
- sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
1350
- if "HR" in sumstats.columns:
1351
- if verbose: log.write(" -Flipping column: HR = 1 / HR...")
1352
- sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
1353
- if "HR_95L" in sumstats.columns:
1354
- if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
1355
- sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
1356
- if "HR_95U" in sumstats.columns:
1357
- if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
1358
- sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
1359
- if "DIRECTION" in sumstats.columns:
1360
- if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
1361
- sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
1345
+
1346
+ flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
1347
+ flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
1348
+ flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
1349
+
1362
1350
  #change status
1363
1351
  if verbose: log.write(" -Changed the status for flipped variants: xxxxx[012]5: -> xxxxx[012]2")
1364
1352
  sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 7, "5","2")
1365
- if verbose: log.write("Finished converting successfully!")
1353
+ if_stats_flipped = True
1354
+
1355
+ if if_stats_flipped == True:
1356
+ finished(log, verbose, "adjusting")
1357
+ else:
1358
+ finished(log, verbose, "adjusting with no statistics changed.")
1366
1359
  return sumstats
1367
1360
  ""
1368
1361
 
@@ -1371,8 +1364,8 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1371
1364
  # 20220426
1372
1365
  def liftover_snv(row,chrom,converter,to_build):
1373
1366
  status_pre=""
1374
- status_end=row[1][2]+"9"+row[1][4]+"99"
1375
- pos_0_based = int(row[0]) - 1
1367
+ status_end=row.iloc[1][2]+"9"+row.iloc[1][4]+"99"
1368
+ pos_0_based = int(row.iloc[0]) - 1
1376
1369
  results = converter[chrom][pos_0_based]
1377
1370
  if converter[chrom][pos_0_based]:
1378
1371
  # return chrom, pos_1_based
@@ -1402,13 +1395,25 @@ def liftover_variant(sumstats,
1402
1395
  return sumstats
1403
1396
 
1404
1397
  def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True, verbose=True,log=Log()):
1405
- if check_col(sumstats,chrom,pos,status) is not True:
1406
- if verbose: log.write("WARNING! .liftover(): specified columns not detected..skipping...")
1407
- return sumstats
1408
- if verbose: log.write("Start to perform liftover...{}".format(_get_version()))
1409
- check_dataframe_shape(sumstats, log, verbose)
1410
- if verbose: log.write(" -CPU Cores to use :",n_cores)
1411
- if verbose: log.write(" -Performing liftover ...")
1398
+ ##start function with col checking##########################################################
1399
+ _start_line = "perform liftover"
1400
+ _end_line = "liftover"
1401
+ _start_cols =[chrom,pos,status]
1402
+ _start_function = ".liftover()"
1403
+ _must_args ={}
1404
+
1405
+ is_enough_info = start_to(sumstats=sumstats,
1406
+ log=log,
1407
+ verbose=verbose,
1408
+ start_line=_start_line,
1409
+ end_line=_end_line,
1410
+ start_cols=_start_cols,
1411
+ start_function=_start_function,
1412
+ n_cores=n_cores,
1413
+ **_must_args)
1414
+ if is_enough_info == False: return sumstats
1415
+ ############################################################################################
1416
+
1412
1417
  if verbose: log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build)
1413
1418
  # valid chr and pos
1414
1419
  pattern = r"\w\w\w0\w\w\w"
@@ -1420,11 +1425,12 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
1420
1425
  if sum(to_lift)<10000:
1421
1426
  n_cores=1
1422
1427
 
1423
- df_split = np.array_split(sumstats.loc[:,[chrom,pos,status]], n_cores)
1428
+ #df_split = np.array_split(sumstats[[chrom,pos,status]], n_cores)
1429
+ df_split = _df_split(sumstats[[chrom,pos,status]], n_cores)
1424
1430
  pool = Pool(n_cores)
1425
1431
  #df = pd.concat(pool.starmap(func, df_split))
1426
1432
  func=liftover_variant
1427
- sumstats.loc[:,[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status),df_split))
1433
+ sumstats[[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status),df_split))
1428
1434
  pool.close()
1429
1435
  pool.join()
1430
1436
  ############################################################################
@@ -1439,18 +1445,29 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
1439
1445
  sumstats = fixchr(sumstats,chrom=chrom,add_prefix="",remove=remove, verbose=True)
1440
1446
  sumstats = fixpos(sumstats,pos=pos,remove=remove, verbose=True)
1441
1447
 
1442
- if verbose: log.write("Finished liftover successfully!")
1448
+ finished(log,verbose,_end_line)
1443
1449
  return sumstats
1444
1450
 
1445
1451
  ###############################################################################################################
1446
1452
  # 20220426
1447
1453
  def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=Log()):
1448
- if check_col(sumstats,chrom,pos) is not True:
1449
- if verbose: log.write(".liftover(): specified columns not detected..skipping...")
1450
- return sumstats
1451
-
1452
- if verbose: log.write("Start to sort the genome coordinates...{}".format(_get_version()))
1453
- check_dataframe_shape(sumstats, log, verbose)
1454
+ ##start function with col checking##########################################################
1455
+ _start_line = "sort the genome coordinates"
1456
+ _end_line = "sorting coordinates"
1457
+ _start_cols =[chrom,pos]
1458
+ _start_function = ".sort_coordinate()"
1459
+ _must_args ={}
1460
+
1461
+ is_enough_info = start_to(sumstats=sumstats,
1462
+ log=log,
1463
+ verbose=verbose,
1464
+ start_line=_start_line,
1465
+ end_line=_end_line,
1466
+ start_cols=_start_cols,
1467
+ start_function=_start_function,
1468
+ **_must_args)
1469
+ if is_enough_info == False: return sumstats
1470
+ ############################################################################################
1454
1471
 
1455
1472
  try:
1456
1473
  if sumstats[pos].dtype == "Int64":
@@ -1460,50 +1477,144 @@ def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=
1460
1477
  sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
1461
1478
  except:
1462
1479
  pass
1463
-
1464
- if verbose: log.write(" -Sorting genome coordinates...")
1465
1480
  sumstats = sumstats.sort_values(by=[chrom,pos],ascending=True,ignore_index=True)
1466
- if verbose: log.write("Finished sorting genome coordinates successfully!")
1467
- gc.collect()
1481
+
1482
+ finished(log,verbose,_end_line)
1468
1483
  return sumstats
1469
1484
  ###############################################################################################################
1470
1485
  # 20230430 added HR HR_95 BETA_95 N_CASE N_CONTROL
1471
def sortcolumn(sumstats,verbose=True,log=Log(),order = None):
    """Reorder the columns of ``sumstats``.

    Columns named in ``order`` come first, in the sequence given there;
    every other column follows in its current relative position.

    Parameters
    ----------
    sumstats : DataFrame-like object exposing ``.columns`` and ``[]`` selection.
    verbose : passed to the logging helpers; also gates the column-list message.
    log : logger object with a ``write`` method.
    order : preferred column order; ``None`` selects the built-in default list.

    Returns
    -------
    The re-ordered ``sumstats``, or the untouched input when ``start_to``
    reports that the step should not run.
    """
    # Shared start-up: start_to performs the logging and the checks it implements.
    can_run = start_to(sumstats=sumstats,
                       log=log,
                       verbose=verbose,
                       start_line="reorder the columns",
                       end_line="reordering the columns",
                       start_cols=[],
                       start_function=".sort_column()")
    if can_run == False:
        return sumstats

    if order is None:
        order = [
            "SNPID","rsID", "CHR", "POS", "EA", "NEA", "EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z","T","F",
            "CHISQ", "P", "MLOG10P", "OR", "OR_95L", "OR_95U","HR", "HR_95L", "HR_95U","INFO", "N","N_CASE","N_CONTROL","DIRECTION","I2","P_HET","DOF","SNPR2","STATUS"]

    # Known columns first (in the preferred order), then everything else.
    leading = [name for name in order if name in sumstats.columns]
    trailing = [name for name in sumstats.columns if name not in order]
    output_columns = leading + trailing

    if verbose:
        log.write(" -Reordering columns to :", ",".join(output_columns))
    sumstats = sumstats[output_columns]

    finished(log, verbose, "reordering the columns")
    return sumstats
1487
1519
 
1488
- def check_col(df,*args):
1520
+
1521
+ ###############################################################################################################
1522
def start_to(sumstats,
             log,
             verbose,
             start_line,
             end_line,
             start_cols,
             start_function,
             ref_vcf=None,
             ref_fasta=None,
             n_cores=None,
             ref_tsv=None,
             **args
             ):
    """Common preamble for a processing step: log the start and validate inputs.

    Parameters
    ----------
    sumstats : object exposing ``.columns``; also handed to ``check_dataframe_shape``.
    log : logger object with ``write`` (accepting ``verbose=``).
    verbose : forwarded to every logging/checking helper called here.
    start_line : text inserted into the "Start to ..." message.
    end_line : text passed to ``skipped`` when the step cannot run.
    start_cols : required columns, forwarded to ``check_col``.
    start_function : caller's display name, used in warnings.
    ref_vcf, ref_fasta, n_cores, ref_tsv : optional settings; each one that is
        not ``None`` is echoed to the log once the column check has passed.
    **args : required arguments; every entry is passed to ``check_arg`` and a
        failing one makes the step invalid.

    Returns
    -------
    bool
        ``True`` when the required columns and all required arguments are
        available, ``False`` otherwise (after logging the skip).
    """
    log.write("Start to {}...{}".format(start_line,_get_version()), verbose=verbose)

    check_dataframe_shape(sumstats=sumstats,
                          log=log,
                          verbose=verbose)

    is_enough_col = check_col(sumstats.columns,
                              verbose=verbose,
                              log=log,
                              cols=start_cols,
                              function=start_function)

    if is_enough_col:
        optional_settings = (
            (" -Number of threads/cores to use: {}", n_cores),
            (" -Reference VCF: {}", ref_vcf),
            (" -Reference FASTA: {}", ref_fasta),
            (" -Reference TSV: {}", ref_tsv),
        )
        for template, setting in optional_settings:
            if setting is not None:
                # Pass verbose like the start message does, so these lines
                # follow the caller's verbosity setting.
                log.write(template.format(setting), verbose=verbose)

    # NOTE(review): the required-argument check is taken to run regardless of
    # the column check result — confirm against the intended nesting.
    # A list (not a generator) so that every missing argument is reported.
    arg_checks = [check_arg(log, verbose, key, value, start_function)
                  for key, value in args.items()]
    is_enough_col = bool(is_enough_col) and all(arg_checks)

    if not is_enough_col:
        skipped(log, verbose, end_line)

    return is_enough_col
1567
+
1568
def finished(log, verbose, end_line):
    """Write the "Finished <end_line>." message and run a garbage collection.

    ``verbose`` is forwarded to ``log.write``; nothing is returned.
    """
    message = "Finished {}.".format(end_line)
    log.write(message, verbose=verbose)
    gc.collect()
1571
+
1572
def skipped(log, verbose, end_line):
    """Write the "Skipped <end_line>." message and run a garbage collection.

    ``verbose`` is forwarded to ``log.write``; nothing is returned.
    """
    message = "Skipped {}.".format(end_line)
    log.write(message, verbose=verbose)
    gc.collect()
1575
+
1576
def check_arg(log, verbose, key, value, function):
    """Report whether a required argument was supplied.

    Returns ``True`` when ``value`` is not ``None``. Otherwise a warning that
    names the argument (``key``) and the calling ``function`` is sent to
    ``log.warning`` and ``False`` is returned. ``verbose`` is accepted but
    not used here.
    """
    if value is not None:
        return True
    log.warning("Necessary argument {} for {} is not provided!".format(key, function))
    return False
1581
+
1582
def check_col(df_col_names, verbose=True, log=Log(), cols=None, function=None):
    """Check that every required column name is present.

    Parameters
    ----------
    df_col_names : container of existing column names (tested with ``in``).
    verbose : forwarded to ``skipped`` when something is missing.
    log : logger object with a ``warning`` method.
    cols : iterable of requirements. A string is a single required column;
        any other iterable is a group whose members must all be present.
        ``None`` means there are no requirements.
    function : display name of the caller, used in the warning text and
        passed to ``skipped``.

    Returns
    -------
    bool
        ``True`` if nothing is missing; otherwise ``False`` after logging a
        warning that lists the missing names.
    """
    not_in_df = []
    # cols defaults to None; treat that as "no requirements" rather than
    # failing on iteration.
    for requirement in (cols if cols is not None else ()):
        # isinstance (not an exact type test) so that str subclasses are
        # handled as one column name instead of being iterated per character.
        if isinstance(requirement, str):
            # single check
            group = (requirement,)
        else:
            # grouped check: every member of the group is required
            group = requirement
        not_in_df.extend(name for name in group if name not in df_col_names)

    if len(not_in_df)>0:
        if function is None:
            to_show_title=" "
        else:
            to_show_title = " for {} ".format(function)
        log.warning("Necessary columns{}were not detected:{}".format(to_show_title, ",".join(not_in_df)))
        skipped(log, verbose, end_line=function)
        return False

    return True
1509
1609
 
1610
+ ###############################################################################################################
1611
def _df_split(dataframe, n):
    """Split ``dataframe`` row-wise into at most ``n`` contiguous chunks.

    Chunk sizes differ by at most one row (the leading chunks take the
    extra rows), and concatenating the chunks in order reproduces the input.

    Parameters
    ----------
    dataframe : object with ``.shape`` and positional ``.iloc`` slicing.
    n : requested number of chunks; values below 1 are treated as 1.

    Returns
    -------
    list
        ``min(n, number_of_rows)`` non-empty chunks. An empty input yields a
        single empty chunk, so that ``pd.concat`` over the result still works
        (it raises on an empty list).
    """
    n_rows = dataframe.shape[0]
    # Never more chunks than rows, and always at least one chunk.
    n_chunks = max(1, min(int(n), n_rows))
    base_size, n_larger = divmod(n_rows, n_chunks)

    chunks = []
    start = 0
    for chunk_index in range(n_chunks):
        # The first `n_larger` chunks absorb the remainder, one row each.
        stop = start + base_size + (1 if chunk_index < n_larger else 0)
        chunks.append(dataframe.iloc[start:stop])
        start = stop

    return chunks