gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (57)
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/data/formatbook.json +722 -721
  5. gwaslab/g_Log.py +22 -5
  6. gwaslab/g_Sumstats.py +110 -163
  7. gwaslab/g_SumstatsPair.py +76 -25
  8. gwaslab/g_SumstatsT.py +2 -2
  9. gwaslab/g_Sumstats_summary.py +3 -3
  10. gwaslab/g_version.py +10 -10
  11. gwaslab/hm_casting.py +36 -17
  12. gwaslab/hm_harmonize_sumstats.py +354 -221
  13. gwaslab/hm_rsid_to_chrpos.py +1 -1
  14. gwaslab/io_preformat_input.py +49 -43
  15. gwaslab/io_read_ldsc.py +49 -1
  16. gwaslab/io_to_formats.py +428 -295
  17. gwaslab/ldsc_irwls.py +198 -0
  18. gwaslab/ldsc_jackknife.py +514 -0
  19. gwaslab/ldsc_ldscore.py +417 -0
  20. gwaslab/ldsc_parse.py +294 -0
  21. gwaslab/ldsc_regressions.py +747 -0
  22. gwaslab/ldsc_sumstats.py +629 -0
  23. gwaslab/qc_check_datatype.py +3 -3
  24. gwaslab/qc_fix_sumstats.py +891 -778
  25. gwaslab/util_ex_calculate_ldmatrix.py +31 -13
  26. gwaslab/util_ex_gwascatalog.py +25 -25
  27. gwaslab/util_ex_ldproxyfinder.py +10 -10
  28. gwaslab/util_ex_ldsc.py +189 -0
  29. gwaslab/util_ex_process_ref.py +3 -3
  30. gwaslab/util_ex_run_coloc.py +26 -4
  31. gwaslab/util_in_calculate_gc.py +6 -6
  32. gwaslab/util_in_calculate_power.py +42 -43
  33. gwaslab/util_in_convert_h2.py +8 -8
  34. gwaslab/util_in_fill_data.py +30 -30
  35. gwaslab/util_in_filter_value.py +201 -74
  36. gwaslab/util_in_get_density.py +10 -10
  37. gwaslab/util_in_get_sig.py +445 -71
  38. gwaslab/viz_aux_annotate_plot.py +12 -12
  39. gwaslab/viz_aux_quickfix.py +42 -37
  40. gwaslab/viz_aux_reposition_text.py +10 -7
  41. gwaslab/viz_aux_save_figure.py +18 -8
  42. gwaslab/viz_plot_compare_af.py +32 -33
  43. gwaslab/viz_plot_compare_effect.py +63 -71
  44. gwaslab/viz_plot_miamiplot2.py +34 -26
  45. gwaslab/viz_plot_mqqplot.py +126 -75
  46. gwaslab/viz_plot_qqplot.py +11 -8
  47. gwaslab/viz_plot_regionalplot.py +36 -33
  48. gwaslab/viz_plot_rg_heatmap.py +28 -26
  49. gwaslab/viz_plot_stackedregional.py +40 -21
  50. gwaslab/viz_plot_trumpetplot.py +65 -61
  51. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  52. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
  53. gwaslab-3.4.39.dist-info/RECORD +80 -0
  54. gwaslab-3.4.37.dist-info/RECORD +0 -72
  55. /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  56. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  57. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
@@ -47,14 +47,14 @@ def _process_build(build,log,verbose):
47
47
  log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
48
48
  final_build = "38"
49
49
  else:
50
- log.write(" -WARNING! Version of genomic coordinates is unknown...", verbose=verbose)
50
+ log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
51
51
  final_build = "99"
52
52
  return final_build
53
53
 
54
54
  def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
55
55
  build = _process_build(build,log=log,verbose=verbose)
56
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status], 1, "139",build[0]*3)
57
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status], 2, "89",build[1]*3)
56
+ sumstats[status] = vchange_status(sumstats[status], 1, "139",build[0]*3)
57
+ sumstats[status] = vchange_status(sumstats[status], 2, "89",build[1]*3)
58
58
  return sumstats, build
59
59
 
60
60
  def fixID(sumstats,
@@ -66,35 +66,49 @@ def fixID(sumstats,
66
66
  2. fix chr and pos using snpid
67
67
  3. checking rsid and chr:pos:nea:ea
68
68
  '''
69
- if verbose: log.write("Start to check IDs...{}".format(_get_version()))
70
- check_dataframe_shape(sumstats, log, verbose)
71
- check_col(sumstats,[snpid,rsid],status)
69
+ ##start function with col checking##########################################################
70
+ _start_line = "check SNPID/rsID"
71
+ _end_line = "checking SNPID/rsID"
72
+ _start_cols =[]
73
+ _start_function = ".fix_id()"
74
+ _must_args ={}
75
+
76
+ is_enough_info = start_to(sumstats=sumstats,
77
+ log=log,
78
+ verbose=verbose,
79
+ start_line=_start_line,
80
+ end_line=_end_line,
81
+ start_cols=_start_cols,
82
+ start_function=_start_function,
83
+ **_must_args)
84
+ if is_enough_info == False: return sumstats
85
+ ############################################################################################
72
86
 
73
87
  ############################ checking datatype ###################################################
74
88
  if rsid in sumstats.columns:
75
89
  # convert to string datatype
76
90
  try:
77
91
  log.write(" -Checking rsID data type...",verbose=verbose)
78
- if sumstats.loc[:,rsid].dtype == "string":
92
+ if sumstats[rsid].dtype == "string":
79
93
  pass
80
94
  else:
81
95
  log.write(" -Converting rsID to pd.string data type...",verbose=verbose)
82
- sumstats.loc[:,rsid] = sumstats.loc[:,rsid].astype("string")
96
+ sumstats[rsid] = sumstats[rsid].astype("string")
83
97
  except:
84
98
  log.write(" -Force converting rsID to pd.string data type...",verbose=verbose)
85
- sumstats.loc[:,rsid] = sumstats.loc[:,rsid].astype("string")
99
+ sumstats[rsid] = sumstats[rsid].astype("string")
86
100
  if snpid in sumstats.columns:
87
101
  # convert to string datatype
88
102
  try:
89
103
  log.write(" -Checking SNPID data type...",verbose=verbose)
90
- if sumstats.loc[:,snpid].dtype == "string":
104
+ if sumstats[snpid].dtype == "string":
91
105
  pass
92
106
  else:
93
107
  log.write(" -Converting SNPID to pd.string data type...",verbose=verbose)
94
- sumstats.loc[:,snpid] = sumstats.loc[:,snpid].astype("string")
108
+ sumstats[snpid] = sumstats[snpid].astype("string")
95
109
  except:
96
110
  log.write(" -Force converting SNPID to pd.string data type...",verbose=verbose)
97
- sumstats.loc[:,snpid] = sumstats.loc[:,snpid].astype("string")
111
+ sumstats[snpid] = sumstats[snpid].astype("string")
98
112
 
99
113
  ############################ checking ###################################################
100
114
  if snpid in sumstats.columns:
@@ -115,7 +129,7 @@ def fixID(sumstats,
115
129
  sumstats.loc[ is_rsid,status] = vchange_status(sumstats.loc[ is_rsid,status], 3, "986","520")
116
130
  sumstats.loc[~is_rsid,status] = vchange_status(sumstats.loc[~is_rsid,status], 3, "986","743")
117
131
 
118
- if verbose: log.write(" -Checking if CHR:POS:NEA:EA is mixed in rsID column ...")
132
+ log.write(" -Checking if CHR:POS:NEA:EA is mixed in rsID column ...", verbose=verbose)
119
133
  is_rs_chrpos = sumstats[rsid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
120
134
 
121
135
  log.write(" -Number of CHR:POS:NEA:EA mixed in rsID column :",sum(is_rs_chrpos), verbose=verbose)
@@ -126,9 +140,9 @@ def fixID(sumstats,
126
140
  if fixchrpos == True:
127
141
  # from snpid or rsid, extract CHR:POS to fix CHR and POS
128
142
  if snpid in sumstats.columns:
129
- if verbose: log.write(" -Fixing CHR and POS...")
143
+ log.write(" -Fixing CHR and POS...", verbose=verbose)
130
144
  if overwrite is True:
131
- if verbose: log.write(" -Overwrite is applied...")
145
+ log.write(" -Overwrite is applied...", verbose=verbose)
132
146
  # fix all
133
147
  to_fix = is_chrposrefalt
134
148
 
@@ -137,35 +151,39 @@ def fixID(sumstats,
137
151
  to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
138
152
  to_fix_num = sum(to_fix)
139
153
  if to_fix_num and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
140
- elif verbose: log.write(" -No fixable variants. ...")
154
+ else:
155
+ log.write(" -No fixable variants. ...", verbose=verbose)
141
156
 
142
157
  elif (chrom not in sumstats.columns) and (pos in sumstats.columns):
143
- if verbose: log.write(" -Initiating CHR columns...")
144
- sumstats.loc[:,chrom]=pd.Series(dtype="string")
158
+ log.write(" -Initiating CHR columns...", verbose=verbose)
159
+ sumstats[chrom]=pd.Series(dtype="string")
145
160
  to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
146
161
  to_fix_num = sum(to_fix)
147
162
  if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
148
- elif verbose: log.write(" -No fixable variants. ...")
163
+ else:
164
+ log.write(" -No fixable variants. ...", verbose=verbose)
149
165
 
150
166
  elif (chrom in sumstats.columns) and (pos not in sumstats.columns):
151
- if verbose: log.write(" -Initiating CHR and POS column...")
152
- sumstats.loc[:,pos]=pd.Series(dtype="Int64")
167
+ log.write(" -Initiating CHR and POS column...", verbose=verbose)
168
+ sumstats[pos]=pd.Series(dtype="Int64")
153
169
  to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
154
170
  to_fix_num = sum(to_fix)
155
171
  if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
156
- elif verbose: log.write(" -No fixable variants. ...")
172
+ else:
173
+ log.write(" -No fixable variants. ...", verbose=verbose)
157
174
 
158
175
  else:
159
- if verbose: log.write(" -Initiating CHR and POS columns...")
160
- sumstats.loc[:,chrom]=pd.Series(dtype="string")
161
- sumstats.loc[:,pos]=pd.Series(dtype="Int64")
176
+ log.write(" -Initiating CHR and POS columns...", verbose=verbose)
177
+ sumstats[chrom]=pd.Series(dtype="string")
178
+ sumstats[pos]=pd.Series(dtype="Int64")
162
179
  to_fix = is_chrposrefalt
163
180
  to_fix_num = sum(to_fix)
164
181
  if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
165
- elif verbose: log.write(" -No fixable variants. ...")
182
+ else:
183
+ log.write(" -No fixable variants. ...", verbose=verbose)
166
184
 
167
185
  if sum(to_fix)>0:
168
- if verbose: log.write(" -Filling CHR and POS columns using valid SNPID's chr:pos...")
186
+ log.write(" -Filling CHR and POS columns using valid SNPID's chr:pos...", verbose=verbose)
169
187
  # format and qc filled chr and pos
170
188
 
171
189
  sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
@@ -177,36 +195,40 @@ def fixID(sumstats,
177
195
  #sumstats.loc[to_fix,status] = vchange_status(sumstats.loc[to_fix,status], 4, "98765432","00000000")
178
196
 
179
197
  if rsid in sumstats.columns:
180
- if verbose: log.write(" -Fixing CHR and POS using chr:pos:ref:alt format variants in rsID column...")
198
+ log.write(" -Fixing CHR and POS using chr:pos:ref:alt format variants in rsID column...", verbose=verbose)
181
199
  if overwrite is True:
182
- if verbose: log.write(" -Overwrite is applied...")
200
+ log.write(" -Overwrite is applied...", verbose=verbose)
183
201
  to_fix = is_rs_chrpos
184
202
  elif (chrom in sumstats.columns) and (pos in sumstats.columns) :
185
203
  to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
186
204
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
187
- elif verbose: log.write(" -No fixable variants ...")
205
+ else:
206
+ log.write(" -No fixable variants ...", verbose=verbose)
188
207
  elif (chrom not in sumstats.columns) and (pos in sumstats.columns):
189
- if verbose: log.write(" -Initiating CHR columns...")
190
- sumstats.loc[:,chrom]=pd.Series(dtype="string")
208
+ log.write(" -Initiating CHR columns...", verbose=verbose)
209
+ sumstats[chrom]=pd.Series(dtype="string")
191
210
  to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
192
211
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
193
- elif verbose: log.write(" -No fixable variants ...")
212
+ else:
213
+ log.write(" -No fixable variants ...", verbose=verbose)
194
214
  elif (chrom in sumstats.columns) and (pos not in sumstats.columns):
195
- if verbose: log.write(" -Initiating CHR and POS column...")
196
- sumstats.loc[:,pos]=pd.Series(dtype="Int64")
215
+ log.write(" -Initiating CHR and POS column...", verbose=verbose)
216
+ sumstats[pos]=pd.Series(dtype="Int64")
197
217
  to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
198
218
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
199
- elif verbose: log.write(" -No fixable variants ...")
219
+ else:
220
+ log.write(" -No fixable variants ...", verbose=verbose)
200
221
  else:
201
- if verbose: log.write(" -Initiating CHR and POS columns...")
202
- sumstats.loc[:,chrom]=pd.Series(dtype="string")
203
- sumstats.loc[:,pos]=pd.Series(dtype="Int64")
222
+ log.write(" -Initiating CHR and POS columns...", verbose=verbose)
223
+ sumstats[chrom]=pd.Series(dtype="string")
224
+ sumstats[pos]=pd.Series(dtype="Int64")
204
225
  to_fix = is_rs_chrpos
205
226
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
206
- elif verbose: log.write(" -No fixable variants ...")
227
+ else:
228
+ log.write(" -No fixable variants ...", verbose=verbose)
207
229
 
208
230
  if sum(to_fix)>0:
209
- if verbose: log.write(" -Filling CHR and POS columns using chr:pos:ref:alt format variants in rsID column...")
231
+ log.write(" -Filling CHR and POS columns using chr:pos:ref:alt format variants in rsID column...", verbose=verbose)
210
232
  sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,rsid].str.split(':|_|-',n=2).str[0]
211
233
  sumstats.loc[to_fix,pos] = sumstats.loc[to_fix,rsid].str.split(':|_|-',n=2).str[1]
212
234
  #sumstats.loc[to_fix,pos] = np.floor(pd.to_numeric(sumstats.loc[to_fix,rsid].str.split(':|_|-',x).get(1), errors='coerce')).astype('Int64')
@@ -214,40 +236,40 @@ def fixID(sumstats,
214
236
 
215
237
  ############################ fixing chr pos###################################################
216
238
  if fixeanea == True:
217
- if verbose: log.write(" -WARNING! gwaslab assumes SNPID is in the format of CHR:POS:NEA:EA / CHR:POS:REF:ALT")
239
+ log.warning("gwaslab assumes SNPID is in the format of CHR:POS:NEA:EA / CHR:POS:REF:ALT", verbose=verbose)
218
240
  if overwrite is True:
219
- if verbose: log.write(" -Overwrite mode is applied...")
241
+ log.write(" -Overwrite mode is applied...", verbose=verbose)
220
242
  to_fix = is_chrposrefalt
221
243
  elif (nea in sumstats.columns) and (nea in sumstats.columns):
222
244
  to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
223
245
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
224
246
  elif (nea in sumstats.columns) and (ea not in sumstats.columns):
225
- if verbose: log.write(" -Initiating EA columns...")
226
- sumstats.loc[:,ea]=pd.Series(dtype="string")
247
+ log.write(" -Initiating EA columns...", verbose=verbose)
248
+ sumstats[ea]=pd.Series(dtype="string")
227
249
  to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
228
250
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
229
251
  elif (nea not in sumstats.columns) and (ea in sumstats.columns):
230
- if verbose: log.write(" -Initiating NEA columns...")
231
- sumstats.loc[:,nea]=pd.Series(dtype="string")
252
+ log.write(" -Initiating NEA columns...", verbose=verbose)
253
+ sumstats[nea]=pd.Series(dtype="string")
232
254
  to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
233
255
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
234
256
  else:
235
- if verbose: log.write(" -Initiating EA and NEA columns...")
257
+ log.write(" -Initiating EA and NEA columns...", verbose=verbose)
236
258
  sumstats[nea]=pd.Series(dtype="string")
237
259
  sumstats[ea]=pd.Series(dtype="string")
238
260
  to_fix = is_chrposrefalt
239
261
  if sum(to_fix)>0:
240
- if verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
262
+ log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...", verbose=verbose)
241
263
  #
242
264
  if sum(to_fix)>0:
243
- if verbose: log.write(" -Filling "+str(sum(to_fix))+" EA and NEA columns using SNPID's CHR:POS:NEA:EA...")
265
+ log.write(" -Filling "+str(sum(to_fix))+" EA and NEA columns using SNPID's CHR:POS:NEA:EA...", verbose=verbose)
244
266
  #
245
267
  if fixeanea_flip == True:
246
- if verbose: log.write(" -Flipped : CHR:POS:NEA:EA -> CHR:POS:EA:NEA ")
268
+ log.write(" -Flipped : CHR:POS:NEA:EA -> CHR:POS:EA:NEA ", verbose=verbose)
247
269
  sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
248
270
  sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
249
271
  else:
250
- if verbose: log.write(" -Chr:pos:a1:a2...a1->EA , a2->NEA ")
272
+ log.write(" -Chr:pos:a1:a2...a1->EA , a2->NEA ", verbose=verbose)
251
273
  sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
252
274
  sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
253
275
 
@@ -259,22 +281,22 @@ def fixID(sumstats,
259
281
  ############################ fixing id ###################################################
260
282
  if fixsep == True:
261
283
  if snpid in sumstats.columns:
262
- if verbose: log.write(' -Replacing [_-] in SNPID with ":" ...')
263
- sumstats.loc[:,snpid] = sumstats.loc[:,snpid].str.replace(r"[_-]",":",regex=True)
284
+ log.write(' -Replacing [_-] in SNPID with ":" ...', verbose=verbose)
285
+ sumstats[snpid] = sumstats[snpid].str.replace(r"[_-]",":",regex=True)
264
286
 
265
287
  if fixprefix == True:
266
288
  if snpid in sumstats.columns:
267
- if verbose: log.write(' -Removing /^chr/ in SNPID ...')
268
- prefix_removed = sumstats.loc[:,snpid].str.extract(r'^(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
289
+ log.write(' -Removing /^chr/ in SNPID ...', verbose=verbose)
290
+ prefix_removed = sumstats[snpid].str.extract(r'^(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
269
291
  sumstats.loc[~prefix_removed.isna(),snpid] = prefix_removed[~prefix_removed.isna()]
270
292
 
271
293
  if fixid == True:
272
294
  if snpid not in sumstats.columns:
273
295
  # initiate a SNPID column
274
- sumstats.loc[:,snpid]=pd.Series(dtype="string")
296
+ sumstats[snpid]=pd.Series(dtype="string")
275
297
 
276
298
  if (rsid in sumstats.columns) and (sum(is_rs_chrpos)>0) :
277
- sumstats.loc[:,snpid]= sumstats.loc[is_rs_chrpos,rsid]
299
+ sumstats[snpid]= sumstats.loc[is_rs_chrpos,rsid]
278
300
 
279
301
  if (chrom in sumstats.columns) and (pos in sumstats.columns):
280
302
  #only fix when CHR and POS is available
@@ -313,23 +335,25 @@ def fixID(sumstats,
313
335
  sumstats.loc[to_part_fix,snpid] = sumstats.loc[to_part_fix,chrom].astype("string") + ":"+sumstats.loc[to_part_fix,pos].astype("string")
314
336
  if sum(to_full_fix)>0:
315
337
  sumstats.loc[to_full_fix,snpid] = sumstats.loc[to_full_fix,chrom].astype("string") + ":"+sumstats.loc[to_full_fix,pos].astype("string") +":"+ sumstats.loc[to_full_fix,nea].astype("string") +":"+ sumstats.loc[to_full_fix,ea].astype("string")
316
- if verbose: log.write(" -Filling "+str(sum(to_part_fix)-sum(to_full_fix)) +" SNPID using CHR:POS...")
317
- if verbose: log.write(" -Filling "+str(sum(to_full_fix)) +" SNPID using CHR:POS:NEA:EA...")
338
+ log.write(" -Filling "+str(sum(to_part_fix)-sum(to_full_fix)) +" SNPID using CHR:POS...", verbose=verbose)
339
+ log.write(" -Filling "+str(sum(to_full_fix)) +" SNPID using CHR:POS:NEA:EA...", verbose=verbose)
318
340
  sumstats.loc[(to_full_fix),status] = vchange_status(sumstats.loc[(to_full_fix),status],3,"975","630")
319
341
  sumstats.loc[(to_part_fix),status] = vchange_status(sumstats.loc[(to_part_fix),status],3,"975","842")
320
342
 
321
343
  else:
322
344
  #when these is no ea or ena, just fix to chr:pos
323
345
  to_part_fix = to_fix & sumstats[chrom].notnull() & sumstats[pos].notnull()
324
- if verbose: log.write(" -Filling "+str(sum(to_part_fix)) +" SNPID using CHR POS...")
346
+ log.write(" -Filling "+str(sum(to_part_fix)) +" SNPID using CHR POS...", verbose=verbose)
325
347
  if sum(to_part_fix)>0:
326
348
  sumstats.loc[to_part_fix,snpid] = sumstats.loc[to_part_fix,chrom].astype("string") + ":"+sumstats.loc[to_part_fix,pos].astype("string")
327
349
  sumstats.loc[to_part_fix,status] = vchange_status(sumstats.loc[(to_part_fix),status],3,"975","842")
328
350
 
329
351
  after_number=sum(sumstats[snpid].isna())
330
- if verbose: log.write(" -Fixed "+ str(pre_number - after_number) +" variants ID...")
331
- elif verbose: log.write(" -ID unfixable: no CHR and POS columns or no SNPID. ")
332
- if verbose: log.write("Finished checking IDs successfully!")
352
+ log.write(" -Fixed "+ str(pre_number - after_number) +" variants ID...", verbose=verbose)
353
+ else:
354
+ log.write(" -ID unfixable: no CHR and POS columns or no SNPID. ", verbose=verbose)
355
+
356
+ finished(log,verbose,_end_line)
333
357
  return sumstats
334
358
 
335
359
  ""
@@ -344,73 +368,90 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
344
368
  remove duplicate SNPs based on 3. rsID
345
369
  remove multiallelic SNPs based on 4. CHR, POS
346
370
  '''
347
-
348
- if verbose: log.write("Start to remove duplicated/multiallelic variants...{}".format(_get_version()))
349
- if verbose: log.write(" -Removing mode:{}".format(mode))
371
+
372
+ ##start function with col checking##########################################################
373
+ _start_line = "remove duplicated/multiallelic variants"
374
+ _end_line = "removing duplicated/multiallelic variants"
375
+ _start_cols =[]
376
+ _start_function = ".remove_dup()"
377
+ _must_args ={}
378
+
379
+ is_enough_info = start_to(sumstats=sumstats,
380
+ log=log,
381
+ verbose=verbose,
382
+ start_line=_start_line,
383
+ end_line=_end_line,
384
+ start_cols=_start_cols,
385
+ start_function=_start_function,
386
+ **_must_args)
387
+ if is_enough_info == False: return sumstats
388
+ ############################################################################################
389
+
390
+ log.write(" -Removing mode:{}".format(mode), verbose=verbose)
350
391
  # sort the variants using the specified column before removing
351
392
  if keep_col is not None :
352
393
  if keep_col in sumstats.columns:
353
- if verbose: log.write("Start to sort the sumstats using {}...".format(keep_col))
394
+ log.write("Start to sort the sumstats using {}...".format(keep_col), verbose=verbose)
354
395
  sumstats = sumstats.sort_values(by=keep_col,ascending=keep_ascend)
355
396
  else:
356
- if verbose: log.write("Column" + keep_col +" was not detected... skipping... ")
397
+ log.write("Column" + keep_col +" was not detected... skipping... ", verbose=verbose)
357
398
  total_number = len(sumstats)
358
399
 
359
400
  # remove by duplicated SNPID
360
401
  if (snpid in sumstats.columns) and ("d" in mode or "s" in mode):
361
- if verbose: log.write("Start to remove duplicated variants based on snpid...{}".format(_get_version()))
402
+ log.write("Start to remove duplicated variants based on snpid...{}".format(_get_version()), verbose=verbose)
362
403
  check_dataframe_shape(sumstats, log, verbose)
363
- if verbose: log.write(" -Which variant to keep: ", keep )
404
+ log.write(" -Which variant to keep: ", keep , verbose=verbose)
364
405
  pre_number =len(sumstats)
365
406
  if snpid in sumstats.columns:
366
407
  # keep na and remove duplicated
367
408
  sumstats = sumstats.loc[sumstats[snpid].isna() | (~sumstats.duplicated(subset=[snpid], keep=keep)),:]
368
409
  after_number=len(sumstats)
369
- if verbose: log.write(" -Removed ",pre_number -after_number ," based on SNPID...")
410
+ log.write(" -Removed ",pre_number -after_number ," based on SNPID...", verbose=verbose)
370
411
 
371
412
  # remove by duplicated rsID
372
413
  if (rsid in sumstats.columns) and ("d" in mode or "r" in mode):
373
414
  # keep na and remove duplicated
374
415
  pre_number =len(sumstats)
375
- if verbose: log.write("Start to remove duplicated variants based on rsID...")
416
+ log.write("Start to remove duplicated variants based on rsID...", verbose=verbose)
376
417
  check_dataframe_shape(sumstats, log, verbose)
377
418
  sumstats = sumstats.loc[sumstats[rsid].isna() | (~sumstats.duplicated(subset=rsid, keep=keep)),:]
378
419
  after_number=len(sumstats)
379
- if verbose: log.write(" -Removed ",pre_number -after_number ," based on rsID...")
420
+ log.write(" -Removed ",pre_number -after_number ," based on rsID...", verbose=verbose)
380
421
 
381
422
  # remove by duplicated variants by CHR:POS:NEA:EA
382
423
  if (chrom in sumstats.columns) and (pos in sumstats.columns) and (nea in sumstats.columns) and (ea in sumstats.columns) and ("d" in mode or "c" in mode):
383
- if verbose: log.write("Start to remove duplicated variants based on CHR,POS,EA and NEA...")
424
+ log.write("Start to remove duplicated variants based on CHR,POS,EA and NEA...", verbose=verbose)
384
425
  check_dataframe_shape(sumstats, log, verbose)
385
- if verbose: log.write(" -Which variant to keep: ", keep )
426
+ log.write(" -Which variant to keep: ", keep , verbose=verbose)
386
427
  pre_number =len(sumstats)
387
428
  if snpid in sumstats.columns:
388
429
  # keep na and remove duplicated
389
430
  sumstats = sumstats.loc[(~sumstats[[chrom,pos,ea,nea]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos,ea,nea], keep=keep)),:]
390
431
  after_number=len(sumstats)
391
- if verbose: log.write(" -Removed ",pre_number -after_number ," based on CHR,POS,EA and NEA...")
432
+ log.write(" -Removed ",pre_number -after_number ," based on CHR,POS,EA and NEA...", verbose=verbose)
392
433
 
393
434
  # remove by multiallelic variants by CHR:POS
394
435
  if (chrom in sumstats.columns) and (pos in sumstats.columns) and "m" in mode:
395
436
  # keep na and remove duplicated
396
437
  pre_number =len(sumstats)
397
- if verbose: log.write("Start to remove multiallelic variants based on chr:pos...")
438
+ log.write("Start to remove multiallelic variants based on chr:pos...", verbose=verbose)
398
439
  check_dataframe_shape(sumstats, log, verbose)
399
- if verbose: log.write(" -Which variant to keep: ", keep )
400
- sumstats = sumstats.loc[(~sumstats.loc[:,[chrom,pos]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos], keep=keep)),:]
440
+ log.write(" -Which variant to keep: ", keep , verbose=verbose)
441
+ sumstats = sumstats.loc[(~sumstats[[chrom,pos]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos], keep=keep)),:]
401
442
  after_number=len(sumstats)
402
- if verbose: log.write(" -Removed ",pre_number -after_number," multiallelic variants...")
443
+ log.write(" -Removed ",pre_number -after_number," multiallelic variants...", verbose=verbose)
403
444
  after_number=len(sumstats)
404
445
 
405
446
  # resort the coordinates
406
- if verbose: log.write(" -Removed ",total_number -after_number," variants in total.")
447
+ log.write(" -Removed ",total_number -after_number," variants in total.", verbose=verbose)
407
448
  if keep_col is not None :
408
- if verbose: log.write(" -Sort the coordinates based on CHR and POS...")
449
+ log.write(" -Sort the coordinates based on CHR and POS...", verbose=verbose)
409
450
  sumstats = sortcoordinate(sumstats,verbose=False)
410
451
 
411
452
  if "n" in mode or remove==True:
412
453
  # if remove==True, remove NAs
413
- if verbose: log.write(" -Removing NAs...")
454
+ log.write(" -Removing NAs...", verbose=verbose)
414
455
  pre_number =len(sumstats)
415
456
  specified_columns = []
416
457
  if "d" in mode:
@@ -434,307 +475,348 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
434
475
  specified_columns.append(nea)
435
476
  sumstats = sumstats.loc[~sumstats[specified_columns].isna().any(axis=1),:]
436
477
  after_number=len(sumstats)
437
- if verbose: log.write(" -Removed ",pre_number -after_number," variants with NA values in {} .".format(set(specified_columns)))
438
- if verbose: log.write("Finished removing duplicated/multiallelic variants successfully!")
478
+ log.write(" -Removed ",pre_number -after_number," variants with NA values in {} .".format(set(specified_columns)), verbose=verbose)
479
+
480
+ finished(log,verbose,_end_line)
439
481
  return sumstats
440
482
 
441
483
  ###############################################################################################################
442
484
  # 20230128
443
485
  def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",24),mt=("MT",25), remove=False, verbose=True, chrom_list = None, minchr=1,log=Log()):
444
- #chrom_list = get_chr_list() #bottom
445
- if chrom_list is None:
446
- chrom_list = get_chr_list()
447
- if check_col(sumstats,chrom,status) is not True:
448
- if verbose: log.write(".fix_chr: Specified not detected..skipping...")
449
- return sumstats
450
- if verbose: log.write("Start to fix chromosome notation...{}".format(_get_version()))
451
- check_dataframe_shape(sumstats, log, verbose)
452
-
453
- # convert to string datatype
454
- try:
455
- if verbose: log.write(" -Checking CHR data type...")
456
- if sumstats.loc[:,chrom].dtype == "string":
457
- pass
458
- else:
459
- sumstats.loc[:,chrom] = sumstats.loc[:,chrom].astype("string")
460
- except:
461
- if verbose: log.write(" -Force converting to pd string data type...")
462
- sumstats.loc[:,chrom] = sumstats.loc[:,chrom].astype("string")
463
-
464
- # check if CHR is numeric
465
- is_chr_fixed = sumstats[chrom].str.isnumeric()
466
- # fill NAs with False
467
- is_chr_fixed = is_chr_fixed.fillna(False)
468
- if verbose: log.write(" -Variants with standardized chromosome notation:",sum(is_chr_fixed))
469
-
470
- # if there are variants whose CHR need to be fixed
471
- if sum(is_chr_fixed)<len(sumstats):
472
-
473
- #extract the CHR number or X Y M MT
474
- chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'^(chr)?(\d{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
486
+ ##start function with col checking##########################################################
487
+ _start_line = "fix chromosome notation (CHR)"
488
+ _end_line = "fixing chromosome notation (CHR)"
489
+ _start_cols =[chrom,status]
490
+ _start_function = ".fix_chr()"
491
+ _must_args ={}
475
492
 
476
- is_chr_fixable = ~chr_extracted.isna()
477
- if verbose: log.write(" -Variants with fixable chromosome notations:",sum(is_chr_fixable))
493
+ is_enough_info = start_to(sumstats=sumstats,
494
+ log=log,
495
+ verbose=verbose,
496
+ start_line=_start_line,
497
+ end_line=_end_line,
498
+ start_cols=_start_cols,
499
+ start_function=_start_function,
500
+ **_must_args)
501
+ if is_enough_info == False: return sumstats
502
+ ############################################################################################
478
503
 
479
- # For not fixed variants, check if na
480
- is_chr_na = sumstats.loc[~is_chr_fixed, chrom].isna()
481
- if sum(is_chr_na)>0 and verbose:
482
- log.write(" -Variants with NA chromosome notations:",sum(is_chr_na))
483
-
484
- # Check variants with CHR being not NA and not fixable
485
- is_chr_invalid = (~is_chr_fixable)&(~is_chr_na)
486
- if sum(is_chr_invalid)>0 and verbose:
487
- log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid))
488
- try:
489
- log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()))
490
- except:
491
- pass
492
- elif verbose:
493
- log.write(" -No unrecognized chromosome notations...")
494
-
495
- # Assign good chr back to sumstats
496
- sumstats.loc[is_chr_fixable.index,chrom] = chr_extracted[is_chr_fixable.index]
504
+ #chrom_list = get_chr_list() #bottom
505
+ if chrom_list is None:
506
+ chrom_list = get_chr_list()
507
+
508
+ # convert to string datatype
509
+ try:
510
+ log.write(" -Checking CHR data type...", verbose=verbose)
511
+ if sumstats[chrom].dtype == "string":
512
+ pass
513
+ else:
514
+ sumstats[chrom] = sumstats[chrom].astype("string")
515
+ except:
516
+ log.write(" -Force converting to pd string data type...", verbose=verbose)
517
+ sumstats[chrom] = sumstats[chrom].astype("string")
518
+
519
+ # check if CHR is numeric
520
+ is_chr_fixed = sumstats[chrom].str.isnumeric()
521
+ # fill NAs with False
522
+ is_chr_fixed = is_chr_fixed.fillna(False)
523
+ log.write(" -Variants with standardized chromosome notation:",sum(is_chr_fixed), verbose=verbose)
524
+
525
+ # if there are variants whose CHR need to be fixed
526
+ if sum(is_chr_fixed)<len(sumstats):
527
+
528
+ #extract the CHR number or X Y M MT
529
+ chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'^(chr)?(\d{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
497
530
 
498
- # X, Y, MT to 23,24,25
499
- xymt_list = [x[0].lower(),y[0].lower(),mt[0].lower(),x[0].upper(),y[0].upper(),mt[0].upper()]
500
-
501
- # check if sumstats contain sex CHR
502
- sex_chr = sumstats[chrom].isin(xymt_list)
503
-
504
- # if sumstats contain sex CHR
505
- if sum(sex_chr)>0:
506
- if verbose: log.write(" -Identifying non-autosomal chromosomes : {}, {}, and {} ...".format(x[0],y[0],mt[0]))
507
- if verbose: log.write(" -Identified ",str(sum(sex_chr))," variants on sex chromosomes...")
508
-
509
- # convert "X, Y, MT" to numbers
510
- convert_num_to_xymt={}
511
- if x[0].lower() in sumstats[chrom].values or x[0].upper() in sumstats[chrom].values:
512
- convert_num_to_xymt[x[0].lower()] = str(x[1])
513
- convert_num_to_xymt[x[0].upper()] = str(x[1])
514
- if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(x[0], x[1]))
515
- if y[0].lower() in sumstats[chrom].values or y[0].upper() in sumstats[chrom].values:
516
- convert_num_to_xymt[y[0].lower()] = str(y[1])
517
- convert_num_to_xymt[y[0].upper()] = str(y[1])
518
- if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(y[0], y[1]))
519
- if mt[0].lower() in sumstats[chrom].values or mt[0].upper() in sumstats[chrom].values:
520
- convert_num_to_xymt[mt[0].lower()] = str(mt[1])
521
- convert_num_to_xymt[mt[0].upper()] = str(mt[1])
522
- if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(mt[0], mt[1]))
523
- sumstats.loc[sex_chr,chrom] =sumstats.loc[sex_chr,chrom].map(convert_num_to_xymt)
524
-
525
- # change status code
526
- sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
527
- if len(is_chr_fixable.index)>0:
528
- sumstats.loc[is_chr_fixable.index,status] = vchange_status(sumstats.loc[is_chr_fixable.index,status],4,"986","520")
529
- if len(is_chr_fixable.index)>0:
530
- sumstats.loc[is_chr_invalid.index,status] = vchange_status(sumstats.loc[is_chr_invalid.index,status],4,"986","743")
531
-
532
- # check variants with unrecognized CHR
533
- unrecognized_num = sum(~sumstats[chrom].isin(chrom_list))
534
- if (remove is True) and unrecognized_num>0:
535
- # remove variants with unrecognized CHR
536
- try:
537
- if verbose: log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])))
538
- except:
539
- pass
540
- if verbose: log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.")
541
- try:
542
- log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()))
543
- except:
544
- pass
545
- #sumstats = sumstats.loc[sumstats.index[sumstats[chrom].isin(chrom_list)],:]
546
- good_chr = sumstats[chrom].isin(chrom_list)
547
- sumstats = sumstats.loc[good_chr, :].copy()
531
+ is_chr_fixable = ~chr_extracted.isna()
532
+ log.write(" -Variants with fixable chromosome notations:",sum(is_chr_fixable), verbose=verbose)
533
+
534
+ # For not fixed variants, check if na
535
+ is_chr_na = sumstats.loc[~is_chr_fixed, chrom].isna()
536
+ if sum(is_chr_na)>0 and verbose:
537
+ log.write(" -Variants with NA chromosome notations:",sum(is_chr_na))
538
+
539
+ # Check variants with CHR being not NA and not fixable
540
+ is_chr_invalid = (~is_chr_fixable)&(~is_chr_na)
541
+ if sum(is_chr_invalid)>0 and verbose:
542
+ log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid), verbose=verbose)
543
+ try:
544
+ log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()), verbose=verbose)
545
+ except:
546
+ pass
548
547
  else:
549
- if verbose: log.write(" -All CHR are already fixed...")
550
- sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
548
+ log.write(" -No unrecognized chromosome notations...", verbose=verbose)
551
549
 
552
- # Convert string to int
553
- try:
554
- sumstats.loc[:,chrom] = sumstats.loc[:,chrom].astype('Int64')
555
- except:
556
- # force convert
557
- sumstats.loc[:,chrom] = np.floor(pd.to_numeric(sumstats.loc[:,chrom], errors='coerce')).astype('Int64')
550
+ # Assign good chr back to sumstats
551
+ sumstats.loc[is_chr_fixable.index,chrom] = chr_extracted[is_chr_fixable.index]
552
+
553
+ # X, Y, MT to 23,24,25
554
+ xymt_list = [x[0].lower(),y[0].lower(),mt[0].lower(),x[0].upper(),y[0].upper(),mt[0].upper()]
558
555
 
559
- # filter out variants with CHR <=0
560
- out_of_range_chr = sumstats[chrom] < minchr
561
- out_of_range_chr = out_of_range_chr.fillna(False)
562
- if sum(out_of_range_chr)>0:
563
- if verbose: log.write(" -Sanity check for CHR...")
564
- if verbose:log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr))
565
- sumstats = sumstats.loc[~out_of_range_chr,:]
566
-
567
- if verbose: log.write("Finished fixing chromosome notation successfully!")
556
+ # check if sumstats contain sex CHR
557
+ sex_chr = sumstats[chrom].isin(xymt_list)
568
558
 
569
- return sumstats
559
+ # if sumstats contain sex CHR
560
+ if sum(sex_chr)>0:
561
+ log.write(" -Identifying non-autosomal chromosomes : {}, {}, and {} ...".format(x[0],y[0],mt[0]), verbose=verbose)
562
+ log.write(" -Identified ",str(sum(sex_chr))," variants on sex chromosomes...", verbose=verbose)
563
+
564
+ # convert "X, Y, MT" to numbers
565
+ convert_num_to_xymt={}
566
+ if x[0].lower() in sumstats[chrom].values or x[0].upper() in sumstats[chrom].values:
567
+ convert_num_to_xymt[x[0].lower()] = str(x[1])
568
+ convert_num_to_xymt[x[0].upper()] = str(x[1])
569
+ log.write(" -Standardizing sex chromosome notations: {} to {}...".format(x[0], x[1]), verbose=verbose)
570
+ if y[0].lower() in sumstats[chrom].values or y[0].upper() in sumstats[chrom].values:
571
+ convert_num_to_xymt[y[0].lower()] = str(y[1])
572
+ convert_num_to_xymt[y[0].upper()] = str(y[1])
573
+ log.write(" -Standardizing sex chromosome notations: {} to {}...".format(y[0], y[1]), verbose=verbose)
574
+ if mt[0].lower() in sumstats[chrom].values or mt[0].upper() in sumstats[chrom].values:
575
+ convert_num_to_xymt[mt[0].lower()] = str(mt[1])
576
+ convert_num_to_xymt[mt[0].upper()] = str(mt[1])
577
+ log.write(" -Standardizing sex chromosome notations: {} to {}...".format(mt[0], mt[1]), verbose=verbose)
578
+ sumstats.loc[sex_chr,chrom] =sumstats.loc[sex_chr,chrom].map(convert_num_to_xymt)
579
+
580
+ # change status code
581
+ sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
582
+ if len(is_chr_fixable.index)>0:
583
+ sumstats.loc[is_chr_fixable.index,status] = vchange_status(sumstats.loc[is_chr_fixable.index,status],4,"986","520")
584
+ if len(is_chr_fixable.index)>0:
585
+ sumstats.loc[is_chr_invalid.index,status] = vchange_status(sumstats.loc[is_chr_invalid.index,status],4,"986","743")
586
+
587
+ # check variants with unrecognized CHR
588
+ unrecognized_num = sum(~sumstats[chrom].isin(chrom_list))
589
+ if (remove is True) and unrecognized_num>0:
590
+ # remove variants with unrecognized CHR
591
+ try:
592
+ log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])), verbose=verbose)
593
+ except:
594
+ pass
595
+ log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.", verbose=verbose)
596
+ try:
597
+ log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()), verbose=verbose)
598
+ except:
599
+ pass
600
+ #sumstats = sumstats.loc[sumstats.index[sumstats[chrom].isin(chrom_list)],:]
601
+ good_chr = sumstats[chrom].isin(chrom_list)
602
+ sumstats = sumstats.loc[good_chr, :].copy()
603
+ else:
604
+ log.write(" -All CHR are already fixed...", verbose=verbose)
605
+ sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
606
+
607
+ # Convert string to int
608
+ try:
609
+ sumstats[chrom] = sumstats[chrom].astype('Int64')
610
+ except:
611
+ # # force convert
612
+ sumstats[chrom] = np.floor(pd.to_numeric(sumstats[chrom], errors='coerce')).astype('Int64')
613
+
614
+ # filter out variants with CHR <=0
615
+ out_of_range_chr = sumstats[chrom] < minchr
616
+ out_of_range_chr = out_of_range_chr.fillna(False)
617
+ if sum(out_of_range_chr)>0:
618
+ log.write(" -Sanity check for CHR...", verbose=verbose)
619
+ log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr), verbose=verbose)
620
+ sumstats = sumstats.loc[~out_of_range_chr,:]
621
+
622
+ finished(log,verbose,_end_line)
623
+ return sumstats
570
624
 
571
625
  ###############################################################################################################
572
626
  # 20230128
573
627
  def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True, lower_limit=0 , upper_limit=None , limit=250000000, log=Log()):
574
- if upper_limit is None:
575
- upper_limit = limit
576
- if check_col(sumstats,pos,status) is not True:
577
- if verbose: log.write(".fix_pos: Specified not detected..skipping...")
578
- return sumstats
579
- if verbose: log.write("Start to fix basepair positions...{}".format(_get_version()))
580
- check_dataframe_shape(sumstats, log, verbose)
581
-
582
- all_var_num = len(sumstats)
583
- #convert to numeric
584
- is_pos_na = sumstats.loc[:,pos].isna()
585
-
586
- try:
587
- if str(sumstats[pos].dtype) == "string" or str(sumstats[pos].dtype) == "object":
588
- sumstats.loc[:,pos] = sumstats.loc[:,pos].astype('string')
589
- # if so, remove thousands separator
590
- if verbose: log.write(' -Removing thousands separator "," or underbar "_" ...')
591
- sumstats.loc[~is_pos_na, pos] = sumstats.loc[~is_pos_na, pos].str.replace(r'[,_]', '' ,regex=True)
592
- except:
593
- pass
628
+ ##start function with col checking##########################################################
629
+ _start_line = "fix basepair positions (POS)"
630
+ _end_line = "fixing basepair positions (POS)"
631
+ _start_cols =[pos,status]
632
+ _start_function = ".fix_pos()"
633
+ _must_args ={}
594
634
 
595
- # convert POS to integer
596
- try:
597
- if verbose: log.write(' -Converting to Int64 data type ...')
598
- sumstats[pos] = sumstats[pos].astype('Int64')
599
- except:
600
- if verbose: log.write(' -Force converting to Int64 data type ...')
601
- sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
602
- is_pos_fixed = ~sumstats.loc[:,pos].isna()
603
- is_pos_invalid = (~is_pos_na)&(~is_pos_fixed)
604
-
605
- sumstats.loc[is_pos_fixed,status] = vchange_status(sumstats.loc[is_pos_fixed,status] ,4,"975","630")
606
- sumstats.loc[is_pos_invalid,status] = vchange_status(sumstats.loc[is_pos_invalid,status],4,"975","842")
607
-
608
- # remove outlier, limit:250,000,000
609
- if verbose: log.write(" -Position bound:({} , {:,})".format(lower_limit, upper_limit))
610
- is_pos_na = sumstats.loc[:,pos].isna()
611
- out_lier= ((sumstats[pos]<=lower_limit) | (sumstats[pos]>=upper_limit)) & (~is_pos_na)
612
- if verbose: log.write(" -Removed outliers:",sum(out_lier))
613
- sumstats = sumstats.loc[~out_lier,:]
614
- #remove na
615
- if remove is True:
616
- sumstats = sumstats.loc[~sumstats[pos].isna(),:]
617
- remain_var_num = len(sumstats)
618
- if verbose: log.write(" -Removed "+str(all_var_num - remain_var_num)+" variants with bad positions.")
619
-
620
- if verbose: log.write(" -Converted all position to datatype Int64.")
621
- if verbose: log.write("Finished fixing basepair position successfully!")
635
+ is_enough_info = start_to(sumstats=sumstats,
636
+ log=log,
637
+ verbose=verbose,
638
+ start_line=_start_line,
639
+ end_line=_end_line,
640
+ start_cols=_start_cols,
641
+ start_function=_start_function,
642
+ **_must_args)
643
+ if is_enough_info == False: return sumstats
644
+ ############################################################################################
645
+
646
+ if upper_limit is None:
647
+ upper_limit = limit
622
648
 
623
- return sumstats
649
+ all_var_num = len(sumstats)
650
+ #convert to numeric
651
+ is_pos_na = sumstats[pos].isna()
652
+
653
+ try:
654
+ if str(sumstats[pos].dtype) == "string" or str(sumstats[pos].dtype) == "object":
655
+ sumstats[pos] = sumstats[pos].astype('string')
656
+ # if so, remove thousands separator
657
+ log.write(' -Removing thousands separator "," or underbar "_" ...', verbose=verbose)
658
+ sumstats.loc[~is_pos_na, pos] = sumstats.loc[~is_pos_na, pos].str.replace(r'[,_]', '' ,regex=True)
659
+ except:
660
+ pass
661
+
662
+ # convert POS to integer
663
+ try:
664
+ log.write(' -Converting to Int64 data type ...', verbose=verbose)
665
+ sumstats[pos] = sumstats[pos].astype('Int64')
666
+ except:
667
+ log.write(' -Force converting to Int64 data type ...', verbose=verbose)
668
+ sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
669
+ is_pos_fixed = ~sumstats[pos].isna()
670
+ is_pos_invalid = (~is_pos_na)&(~is_pos_fixed)
671
+
672
+ sumstats.loc[is_pos_fixed,status] = vchange_status(sumstats.loc[is_pos_fixed,status] ,4,"975","630")
673
+ sumstats.loc[is_pos_invalid,status] = vchange_status(sumstats.loc[is_pos_invalid,status],4,"975","842")
674
+
675
+ # remove outlier, limit:250,000,000
676
+ log.write(" -Position bound:({} , {:,})".format(lower_limit, upper_limit), verbose=verbose)
677
+ is_pos_na = sumstats[pos].isna()
678
+ out_lier= ((sumstats[pos]<=lower_limit) | (sumstats[pos]>=upper_limit)) & (~is_pos_na)
679
+ log.write(" -Removed outliers:",sum(out_lier), verbose=verbose)
680
+ sumstats = sumstats.loc[~out_lier,:]
681
+ #remove na
682
+ if remove is True:
683
+ sumstats = sumstats.loc[~sumstats[pos].isna(),:]
684
+ remain_var_num = len(sumstats)
685
+ log.write(" -Removed "+str(all_var_num - remain_var_num)+" variants with bad positions.", verbose=verbose)
686
+
687
+ finished(log,verbose,_end_line)
688
+ return sumstats
624
689
 
625
690
  ###############################################################################################################
626
691
  # 20220514
627
692
  def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=True,log=Log()):
628
- # remove variants with alleles other than actgACTG
629
- if check_col(sumstats,ea,nea,status) is not True:
630
- if verbose: log.write("EA and NEA not detected..skipping...")
631
- return sumstats
632
- if verbose: log.write("Start to fix alleles...{}".format(_get_version()))
633
- check_dataframe_shape(sumstats, log, verbose)
634
-
635
- #if (ea not in sumstats.columns) or (nea not in sumstats.columns):
636
- if verbose: log.write(" -Converted all bases to string datatype and UPPERCASE.")
637
-
638
- #try:
639
- # ea_missing = sum(sumstats[ea].isna())
640
- # nea_missing = sum(sumstats[nea].isna())
641
- # if sum(ea_missing)>0:
642
- # if verbose: log.write(" -Converting {} missing EA to letter N.".format(ea_missing))
643
- # sumstats.loc[:,ea] = sumstats.loc[:,ea].add_categories("N").fillna("N")
644
- # if sum(sumstats[nea].isna())>0:
645
- # if verbose: log.write(" -Converting {} missing NEA to letter N.".format(nea_missing))
646
- # sumstats.loc[:,nea] = sumstats.loc[:,nea].add_categories("N").fillna("N")
647
- #except:
648
- # pass
649
-
650
- categories = set(sumstats.loc[:,ea].str.upper())|set(sumstats.loc[:,nea].str.upper())|set("N")
651
- categories = {x for x in categories if pd.notna(x)}
652
-
653
- sumstats.loc[:,ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
654
- sumstats.loc[:,nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
655
- all_var_num = len(sumstats)
656
-
657
- ## check ATCG
658
- bad_ea = sumstats[ea].str.contains("[^actgACTG]",na=True)
659
- bad_nea = sumstats[nea].str.contains("[^actgACTG]",na=True)
660
- good_ea = ~bad_ea
661
- good_nea = ~bad_nea
662
-
663
- log.write(" -Variants with bad EA : {}".format(sum(bad_ea)), verbose=verbose)
664
- log.write(" -Variants with bad NEA : {}".format(sum(bad_nea)), verbose=verbose)
665
-
666
- ## check NA
667
- is_eanea_na = sumstats[ea].isna() | sumstats[nea].isna()
668
- log.write(" -Variants with NA for EA or NEA: {}".format(sum(is_eanea_na)), verbose=verbose)
669
-
670
- ## check same alleles
671
- not_variant = sumstats[nea] == sumstats[ea]
672
- log.write(" -Variants with same EA and NEA: {}".format(sum(not_variant)), verbose=verbose)
693
+ ##start function with col checking##########################################################
694
+ _start_line = "fix alleles (EA and NEA)"
695
+ _end_line = "fixing alleles (EA and NEA)"
696
+ _start_cols =[ea, nea,status]
697
+ _start_function = ".fix_allele()"
698
+ _must_args ={}
673
699
 
674
- ## sum up invalid variants
675
- is_invalid = bad_ea | bad_nea | not_variant
676
-
677
- exclude = bad_nea | bad_ea
678
-
679
- if verbose:
680
- if len(set(sumstats.loc[bad_ea,ea].head())) >0:
681
- log.write(" -A look at the non-ATCG EA:",set(sumstats.loc[bad_ea,ea].head()),"...")
682
- if len(set(sumstats.loc[bad_nea,nea].head())) >0:
683
- log.write(" -A look at the non-ATCG NEA:",set(sumstats.loc[bad_nea,nea].head()),"...")
684
-
685
- if remove == True:
686
- sumstats = sumstats.loc[(good_ea & good_nea),:].copy()
687
- good_eanea_num = len(sumstats)
688
- if verbose: log.write(" -Removed "+str(all_var_num - good_eanea_num)+" variants with NA alleles or alleles that contain bases other than A/C/T/G.")
689
- sumstats = sumstats.loc[(good_ea & good_nea & (~not_variant)),:].copy()
690
- good_eanea_notsame_num = len(sumstats)
691
- if verbose: log.write(" -Removed "+str(good_eanea_num - good_eanea_notsame_num)+" variants with same allele for EA and NEA.")
692
- else:
693
- sumstats.loc[:,[ea,nea]] = sumstats.loc[:,[ea,nea]].fillna("N")
694
- if verbose: log.write(" -Detected "+str(sum(exclude))+" variants with alleles that contain bases other than A/C/T/G .")
695
- categories = set(sumstats.loc[:,ea].str.upper())|set(sumstats.loc[:,nea].str.upper())|set("N")
696
- sumstats.loc[:,ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
697
- sumstats.loc[:,nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
698
-
699
- is_eanea_fixed = good_ea | good_nea
700
- is_snp = (sumstats[ea].str.len()==1) &(sumstats[nea].str.len()==1)
701
- is_indel = (sumstats[ea].str.len()!=sumstats[nea].str.len())
702
- is_not_normalized = (sumstats[ea].str.len()>1) &(sumstats[nea].str.len()>1)
703
- is_normalized = is_indel &( (sumstats[ea].str.len()==1) &(sumstats[nea].str.len()>1) | (sumstats[ea].str.len()>1) &(sumstats[nea].str.len()==1) )
704
-
705
- if sum(is_invalid)>0:
706
- sumstats.loc[is_invalid, status] = vchange_status(sumstats.loc[is_invalid,status], 5,"9","6")
707
- if sum(is_eanea_na)>0:
708
- sumstats.loc[is_eanea_na,status] = vchange_status(sumstats.loc[is_eanea_na, status], 5,"9","7")
709
- if sum(is_eanea_fixed&is_not_normalized)>0:
710
- sumstats.loc[is_eanea_fixed&is_not_normalized,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_not_normalized,status], 5,"9","5")
711
- if sum(is_eanea_fixed&is_snp)>0:
712
- sumstats.loc[is_eanea_fixed&is_snp, status] = vchange_status(sumstats.loc[is_eanea_fixed&is_snp,status], 5,"9","0")
713
- if sum(is_eanea_fixed&is_indel)>0:
714
- sumstats.loc[is_eanea_fixed&is_indel,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_indel, status], 5,"9","4")
715
- if sum(is_eanea_fixed&is_normalized)>0:
716
- sumstats.loc[is_eanea_fixed&is_normalized,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_normalized, status], 5,"4","3")
717
- gc.collect()
718
- if verbose: log.write("Finished fixing allele successfully!")
719
-
720
- return sumstats
700
+ is_enough_info = start_to(sumstats=sumstats,
701
+ log=log,
702
+ verbose=verbose,
703
+ start_line=_start_line,
704
+ end_line=_end_line,
705
+ start_cols=_start_cols,
706
+ start_function=_start_function,
707
+ **_must_args)
708
+ if is_enough_info == False: return sumstats
709
+ ############################################################################################
710
+ #try:
711
+ # ea_missing = sum(sumstats[ea].isna())
712
+ # nea_missing = sum(sumstats[nea].isna())
713
+ # if sum(ea_missing)>0:
714
+ # log.write(" -Converting {} missing EA to letter N.".format(ea_missing))
715
+ # sumstats[ea] = sumstats[ea].add_categories("N").fillna("N")
716
+ # if sum(sumstats[nea].isna())>0:
717
+ # log.write(" -Converting {} missing NEA to letter N.".format(nea_missing))
718
+ # sumstats[nea] = sumstats[nea].add_categories("N").fillna("N")
719
+ #except:
720
+ # pass
721
+
722
+ log.write(" -Converted all bases to string datatype and UPPERCASE.", verbose=verbose)
723
+ categories = set(sumstats[ea].str.upper())|set(sumstats[nea].str.upper())|set("N")
724
+ categories = {x for x in categories if pd.notna(x)}
725
+ sumstats[ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
726
+ sumstats[nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
727
+ all_var_num = len(sumstats)
728
+
729
+ ## check ATCG
730
+ bad_ea = sumstats[ea].str.contains("[^actgACTG]",na=True)
731
+ bad_nea = sumstats[nea].str.contains("[^actgACTG]",na=True)
732
+ good_ea = ~bad_ea
733
+ good_nea = ~bad_nea
734
+
735
+ log.write(" -Variants with bad EA : {}".format(sum(bad_ea)), verbose=verbose)
736
+ log.write(" -Variants with bad NEA : {}".format(sum(bad_nea)), verbose=verbose)
737
+
738
+ ## check NA
739
+ is_eanea_na = sumstats[ea].isna() | sumstats[nea].isna()
740
+ log.write(" -Variants with NA for EA or NEA: {}".format(sum(is_eanea_na)), verbose=verbose)
741
+
742
+ ## check same alleles
743
+ not_variant = sumstats[nea] == sumstats[ea]
744
+ log.write(" -Variants with same EA and NEA: {}".format(sum(not_variant)), verbose=verbose)
745
+
746
+ ## sum up invalid variants
747
+ is_invalid = bad_ea | bad_nea | not_variant
748
+
749
+ exclude = bad_nea | bad_ea
750
+
751
+ if len(set(sumstats.loc[bad_ea,ea].head())) >0:
752
+ log.write(" -A look at the non-ATCG EA:",set(sumstats.loc[bad_ea,ea].head()),"...", verbose=verbose)
753
+ if len(set(sumstats.loc[bad_nea,nea].head())) >0:
754
+ log.write(" -A look at the non-ATCG NEA:",set(sumstats.loc[bad_nea,nea].head()),"...", verbose=verbose)
755
+
756
+ if remove == True:
757
+ sumstats = sumstats.loc[(good_ea & good_nea),:].copy()
758
+ good_eanea_num = len(sumstats)
759
+ log.write(" -Removed "+str(all_var_num - good_eanea_num)+" variants with NA alleles or alleles that contain bases other than A/C/T/G.", verbose=verbose)
760
+ sumstats = sumstats.loc[(good_ea & good_nea & (~not_variant)),:].copy()
761
+ good_eanea_notsame_num = len(sumstats)
762
+ log.write(" -Removed "+str(good_eanea_num - good_eanea_notsame_num)+" variants with same allele for EA and NEA.", verbose=verbose)
763
+ else:
764
+ sumstats[[ea,nea]] = sumstats[[ea,nea]].fillna("N")
765
+ log.write(" -Detected "+str(sum(exclude))+" variants with alleles that contain bases other than A/C/T/G .", verbose=verbose)
766
+ categories = set(sumstats[ea].str.upper())|set(sumstats[nea].str.upper())|set("N")
767
+ sumstats[ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
768
+ sumstats[nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
769
+
770
+ is_eanea_fixed = good_ea | good_nea
771
+ is_snp = (sumstats[ea].str.len()==1) &(sumstats[nea].str.len()==1)
772
+ is_indel = (sumstats[ea].str.len()!=sumstats[nea].str.len())
773
+ is_not_normalized = (sumstats[ea].str.len()>1) &(sumstats[nea].str.len()>1)
774
+ is_normalized = is_indel &( (sumstats[ea].str.len()==1) &(sumstats[nea].str.len()>1) | (sumstats[ea].str.len()>1) &(sumstats[nea].str.len()==1) )
775
+
776
+ if sum(is_invalid)>0:
777
+ sumstats.loc[is_invalid, status] = vchange_status(sumstats.loc[is_invalid,status], 5,"9","6")
778
+ if sum(is_eanea_na)>0:
779
+ sumstats.loc[is_eanea_na,status] = vchange_status(sumstats.loc[is_eanea_na, status], 5,"9","7")
780
+ if sum(is_eanea_fixed&is_not_normalized)>0:
781
+ sumstats.loc[is_eanea_fixed&is_not_normalized,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_not_normalized,status], 5,"9","5")
782
+ if sum(is_eanea_fixed&is_snp)>0:
783
+ sumstats.loc[is_eanea_fixed&is_snp, status] = vchange_status(sumstats.loc[is_eanea_fixed&is_snp,status], 5,"9","0")
784
+ if sum(is_eanea_fixed&is_indel)>0:
785
+ sumstats.loc[is_eanea_fixed&is_indel,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_indel, status], 5,"9","4")
786
+ if sum(is_eanea_fixed&is_normalized)>0:
787
+ sumstats.loc[is_eanea_fixed&is_normalized,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_normalized, status], 5,"4","3")
788
+
789
+ finished(log,verbose,_end_line)
790
+ return sumstats
721
791
 
722
792
  ###############################################################################################################
723
793
  # 20220721
724
794
 
725
795
  def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
726
- if check_col(sumstats,pos,ea,nea,status) is not True:
727
- if verbose: log.write("WARNING! .normalize(): specified columns not detected..skipping...")
728
- return sumstats
729
-
730
- if verbose: log.write("Start to normalize variants...{}".format(_get_version()))
731
- check_dataframe_shape(sumstats, log, verbose)
796
+ ##start function with col checking##########################################################
797
+ _start_line = "normalize indels"
798
+ _end_line = "normalizing indels"
799
+ _start_cols =[ea, nea,status]
800
+ _start_function = ".normalize()"
801
+ _must_args ={}
802
+
803
+ is_enough_info = start_to(sumstats=sumstats,
804
+ log=log,
805
+ verbose=verbose,
806
+ start_line=_start_line,
807
+ end_line=_end_line,
808
+ start_cols=_start_cols,
809
+ start_function=_start_function,
810
+ **_must_args)
811
+ if is_enough_info == False: return sumstats
812
+ ############################################################################################
813
+
732
814
  #variants_to_check = status_match(sumstats[status],5,[4,5]) #
733
815
  #r'\w\w\w\w[45]\w\w'
734
816
  variants_to_check = sumstats[status].str[4].str.match(r'4|5', case=False, flags=0, na=False)
735
817
  if sum(variants_to_check)==0:
736
- if verbose: log.write(" -No available variants to normalize..")
737
- if verbose: log.write("Finished normalizing variants successfully!")
818
+ log.write(" -No available variants to normalize..", verbose=verbose)
819
+ log.write("Finished normalizing variants successfully!", verbose=verbose)
738
820
  return sumstats
739
821
  ###############################################################################################################
740
822
  if sum(variants_to_check)>0:
@@ -742,46 +824,46 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
742
824
  n_cores=1
743
825
  pool = Pool(n_cores)
744
826
  map_func = partial(normalizeallele,pos=pos,nea=nea,ea=ea,status=status)
745
- df_split = np.array_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
827
+ #df_split = np.array_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
828
+ df_split = _df_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
746
829
  normalized_pd = pd.concat(pool.map(map_func,df_split))
747
830
  pool.close()
748
831
  pool.join()
749
832
  ###############################################################################################################
750
833
 
751
- if verbose:
752
- before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
753
- changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
754
- if changed_num>0:
755
- if snpid in sumstats.columns:
756
- before_normalize_id = sumstats.loc[variants_to_check,snpid]
757
- elif rsid in sumstats.columns:
758
- before_normalize_id = sumstats.loc[variants_to_check,rsid]
759
- else:
760
- before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
761
-
762
- log.write(" -Not normalized allele IDs:",end="")
763
- for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
764
- log.write(i,end=" ",show_time=False)
765
- log.write("... \n",end="",show_time=False)
766
-
767
- log.write(" -Not normalized allele:",end="")
768
- for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
769
- log.write(i,end="",show_time=False)
770
- log.write("... \n",end="",show_time=False)
771
- log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.")
834
+ before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
835
+ changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
836
+ if changed_num>0:
837
+ if snpid in sumstats.columns:
838
+ before_normalize_id = sumstats.loc[variants_to_check,snpid]
839
+ elif rsid in sumstats.columns:
840
+ before_normalize_id = sumstats.loc[variants_to_check,rsid]
772
841
  else:
773
- log.write(" -All variants are already normalized..")
842
+ before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
843
+
844
+ log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
845
+ for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
846
+ log.write(i,end=" ",show_time=False)
847
+ log.write("... \n",end="",show_time=False, verbose=verbose)
848
+
849
+ log.write(" -Not normalized allele:",end="", verbose=verbose)
850
+ for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
851
+ log.write(i,end="",show_time=False, verbose=verbose)
852
+ log.write("... \n",end="",show_time=False, verbose=verbose)
853
+ log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
854
+ else:
855
+ log.write(" -All variants are already normalized..", verbose=verbose)
774
856
  ###################################################################################################################
775
- categories = set(sumstats.loc[:,ea])|set(sumstats.loc[:,nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
776
- sumstats.loc[:,ea] = pd.Categorical(sumstats.loc[:,ea],categories = categories)
777
- sumstats.loc[:,nea] = pd.Categorical(sumstats.loc[:,nea],categories = categories )
857
+ categories = set(sumstats[ea])|set(sumstats[nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
858
+ sumstats[ea] = pd.Categorical(sumstats[ea],categories = categories)
859
+ sumstats[nea] = pd.Categorical(sumstats[nea],categories = categories )
778
860
  sumstats.loc[variants_to_check,[pos,nea,ea,status]] = normalized_pd.values
779
861
  try:
780
- sumstats.loc[:,pos] = sumstats.loc[:,pos].astype('Int64')
862
+ sumstats[pos] = sumstats[pos].astype('Int64')
781
863
  except:
782
- sumstats.loc[:,pos] = np.floor(pd.to_numeric(sumstats.loc[:,pos], errors='coerce')).astype('Int64')
864
+ sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
783
865
 
784
- if verbose: log.write("Finished normalizing variants successfully!")
866
+ finished(log,verbose,_end_line)
785
867
  return sumstats
786
868
 
787
869
  def normalizeallele(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
@@ -846,6 +928,52 @@ def add_tolerence(stats, float_tolerence, mode):
846
928
  stats = (stats[0] , stats[1] + float_tolerence if stats[0]!=float("Inf") else float("Inf"))
847
929
  return stats
848
930
 
931
+
932
+ def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, verbose, dtype="Int64"):
933
+ pre_number=len(sumstats)
934
+ if header in coltocheck and header in sumstats.columns:
935
+ cols_to_check.append(header)
936
+ if header=="STATUS":
937
+ log.write(" -Checking STATUS and converting STATUS to categories....", verbose=verbose)
938
+ categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
939
+ sumstats[header] = pd.Categorical(sumstats[header],categories=categories)
940
+ return sumstats
941
+
942
+ if dtype in ["Int64","Int32","int","int32","in64"]:
943
+ log.write(" -Checking if {} <= {} <= {} ...".format( var_range[0] ,header, var_range[1]), verbose=verbose)
944
+ sumstats[header] = np.floor(pd.to_numeric(sumstats[header], errors='coerce')).astype(dtype)
945
+
946
+ elif dtype in ["Float64","Float32","float","float64","float32"]:
947
+ log.write(" -Checking if {} < {} < {} ...".format( var_range[0] ,header, var_range[1]),verbose=verbose)
948
+ sumstats[header] = pd.to_numeric(sumstats[header], errors='coerce').astype(dtype)
949
+
950
+ is_valid = (sumstats[header]>=var_range[0]) & (sumstats[header]<=var_range[1])
951
+ is_valid = is_valid.fillna(False)
952
+
953
+ if header=="P":
954
+ is_low_p = sumstats["P"] == 0
955
+ if sum(is_low_p) >0:
956
+ log.warning("Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(sum(is_low_p)))
957
+ log.warning("Please consider using MLOG10P instead.")
958
+
959
+ if sum(~is_valid)>0:
960
+ try:
961
+ if "SNPID" in sumstats.columns:
962
+ id_to_use = "SNPID"
963
+ elif "rsID" in sumstats.columns:
964
+ id_to_use = "rsID"
965
+ invalid_ids = sumstats.loc[~is_valid, id_to_use].head().astype("string")
966
+ invalid_values = sumstats.loc[~is_valid, header].head().astype("string").fillna("NA")
967
+ log.write(" -Examples of invalid variants({}): {} ...".format(id_to_use, ",".join(invalid_ids.to_list()) ), verbose=verbose)
968
+ log.write(" -Examples of invalid values ({}): {} ...".format(header, ",".join(invalid_values.to_list()) ), verbose=verbose)
969
+ except:
970
+ pass
971
+
972
+ sumstats = sumstats.loc[is_valid,:]
973
+ after_number=len(sumstats)
974
+ log.write(" -Removed {} variants with bad/na {}.".format(pre_number - after_number, header), verbose=verbose)
975
+ return sumstats
976
+
849
977
  def sanitycheckstats(sumstats,
850
978
  coltocheck=None,
851
979
  n=(0,2**31-1),
@@ -853,8 +981,10 @@ def sanitycheckstats(sumstats,
853
981
  ncontrol=(0,2**31-1),
854
982
  eaf=(0,1),
855
983
  mac=(0,2**31-1),
984
+ maf=(0,0.5),
856
985
  chisq=(0,float("Inf")),
857
986
  z=(-9999,9999),
987
+ t=(-99999,99999),
858
988
  f=(0,float("Inf")),
859
989
  p=(0,1),
860
990
  mlog10p=(0,9999),
@@ -885,10 +1015,30 @@ def sanitycheckstats(sumstats,
885
1015
  HR_95U: float64 , HR_95L >0
886
1016
  INFO: float32 , 1>=INFO>0
887
1017
  Z float64 , -9999 < Z < 9999
1018
+ T float64 , -99999 < T < 99999
888
1019
  F float64 , F > 0
889
1020
  '''
1021
+ ##start function with col checking##########################################################
1022
+ _start_line = "perform sanity check for statistics"
1023
+ _end_line = "sanity check for statistics"
1024
+ _start_cols =[]
1025
+ _start_function = ".check_sanity()"
1026
+ _must_args ={}
1027
+
1028
+ is_enough_info = start_to(sumstats=sumstats,
1029
+ log=log,
1030
+ verbose=verbose,
1031
+ start_line=_start_line,
1032
+ end_line=_end_line,
1033
+ start_cols=_start_cols,
1034
+ start_function=_start_function,
1035
+ **_must_args)
1036
+ if is_enough_info == False: return sumstats
1037
+ ############################################################################################
890
1038
 
1039
+ log.write(" -Comparison tolerance for floats: {}".format(float_tolerence), verbose=verbose)
891
1040
  eaf = add_tolerence(eaf, float_tolerence, "lr")
1041
+ maf = add_tolerence(maf, float_tolerence, "lr")
892
1042
  beta = add_tolerence(beta, float_tolerence, "lr")
893
1043
  se = add_tolerence(se, float_tolerence, "lr")
894
1044
  mlog10p = add_tolerence(mlog10p, float_tolerence, "lr")
@@ -903,233 +1053,83 @@ def sanitycheckstats(sumstats,
903
1053
  p = add_tolerence(p, float_tolerence, "lr")
904
1054
  f = add_tolerence(f, float_tolerence, "lr")
905
1055
  chisq = add_tolerence(chisq, float_tolerence, "lr")
906
-
907
-
1056
+ ############################################################################################
908
1057
  ## add direction
909
1058
  if coltocheck is None:
910
1059
  coltocheck = ["P","MLOG10P","INFO","Z","BETA","SE","EAF","CHISQ","F","N","N_CASE","N_CONTROL","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","STATUS"]
911
- if verbose: log.write("Start sanity check for statistics...{}".format(_get_version()))
912
- check_dataframe_shape(sumstats, log, verbose)
1060
+
913
1061
  cols_to_check=[]
914
1062
  oringinal_number=len(sumstats)
915
1063
  sumstats = sumstats.copy()
916
1064
 
917
- if verbose: log.write(" -Comparison tolerance for floats: {}".format(float_tolerence))
918
- ###SAMPLE SIZE################################################################################################################################################
919
- pre_number=len(sumstats)
920
- if "N" in coltocheck and "N" in sumstats.columns:
921
- cols_to_check.append("N")
922
- if verbose: log.write(" -Checking if ",n[0],"<=N<=",n[1]," ...")
923
- sumstats.loc[:,"N"] = np.floor(pd.to_numeric(sumstats.loc[:,"N"], errors='coerce')).astype("Int64")
924
- sumstats = sumstats.loc[(sumstats["N"]>=n[0]) & (sumstats["N"]<=n[1]),:]
925
- after_number=len(sumstats)
926
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N.")
927
- pre_number=len(sumstats)
928
- if "N_CASE" in coltocheck and "N_CASE" in sumstats.columns:
929
- cols_to_check.append("N_CASE")
930
- if verbose: log.write(" -Checking if ",ncase[0],"<=N_CASE<=",ncase[1]," ...")
931
- sumstats.loc[:,"N_CASE"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CASE"], errors='coerce')).astype("Int64")
932
- sumstats = sumstats.loc[(sumstats["N_CASE"]>=ncase[0]) & (sumstats["N_CASE"]<=ncase[1]),:]
933
- after_number=len(sumstats)
934
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N_CASE.")
935
- pre_number=len(sumstats)
936
- if "N_CONTROL" in coltocheck and "N_CONTROL" in sumstats.columns:
937
- cols_to_check.append("N_CONTROL")
938
- if verbose: log.write(" -Checking if ",ncontrol[0],"<=N_CONTROL<=",ncontrol[1]," ...")
939
- sumstats.loc[:,"N_CONTROL"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CONTROL"], errors='coerce')).astype("Int64")
940
- sumstats = sumstats.loc[(sumstats["N_CONTROL"]>=ncontrol[0]) & (sumstats["N_CONTROL"]<=ncontrol[1]),:]
941
- after_number=len(sumstats)
942
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N_CONTROL.")
1065
+ ###Int64 ################################################################################################################################################
1066
+ sumstats = check_range(sumstats, var_range=n, header="N", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
1067
+ sumstats = check_range(sumstats, var_range=ncase, header="N_CASE", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
1068
+ sumstats = check_range(sumstats, var_range=ncontrol, header="N_CONTROL", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
943
1069
 
1070
+ ###float32 ################################################################################################################################################
1071
+ sumstats = check_range(sumstats, var_range=eaf, header="EAF", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
1072
+ sumstats = check_range(sumstats, var_range=maf, header="MAF", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
1073
+ sumstats = check_range(sumstats, var_range=info, header="INFO", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
944
1074
 
945
- ###ALLELE FREQUENCY################################################################################################################################################
946
- pre_number=len(sumstats)
947
- if "EAF" in coltocheck and "EAF" in sumstats.columns:
948
- cols_to_check.append("EAF")
949
- if verbose: log.write(" -Checking if ",eaf[0],"<EAF<",eaf[1]," ...")
950
- sumstats.loc[:,"EAF"] = pd.to_numeric(sumstats.loc[:,"EAF"], errors='coerce').astype("float32")
951
- sumstats = sumstats.loc[(sumstats["EAF"]>eaf[0]) & (sumstats["EAF"]<eaf[1]),:]
952
- after_number=len(sumstats)
953
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad EAF.")
954
-
955
- pre_number=len(sumstats)
956
- if "EAF" in coltocheck and "EAF" in sumstats.columns and "N" in coltocheck and "N" in sumstats.columns:
957
- if verbose: log.write(" -Checking if ",mac[0],"<=MAC<=",mac[1]," ...")
958
- sumstats["_MAF"]=sumstats["EAF"]
959
- sumstats.loc[sumstats["EAF"]>0.5,"_MAF"] = 1 - sumstats.loc[sumstats["EAF"]>0.5,"EAF"]
960
- sumstats["_MAC"] = np.floor(pd.to_numeric(sumstats.loc[:,"_MAF"] * sumstats.loc[:,"N"], errors='coerce')).astype("int64")
961
- macl = ( sumstats["_MAC"] >= mac[0])
962
- macu = ( sumstats["_MAC"] <= mac[1])
963
- sumstats = sumstats.loc[macl&macu,:]
964
- sumstats = sumstats.drop(labels=["_MAF","_MAC"],axis=1)
965
- after_number=len(sumstats)
966
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad MAC.")
967
-
968
- ###TEST STATISTICS################################################################################################################################################
969
- pre_number=len(sumstats)
970
- if "CHISQ" in coltocheck and "CHISQ" in sumstats.columns:
971
- cols_to_check.append("CHISQ")
972
- if verbose: log.write(" -Checking if ",chisq[0],"<CHISQ<",chisq[1]," ...")
973
- sumstats.loc[:,"CHISQ"] = pd.to_numeric(sumstats.loc[:,"CHISQ"], errors='coerce').astype("float64")
974
- sumstats = sumstats.loc[(sumstats["CHISQ"]>chisq[0]) & (sumstats["CHISQ"]<chisq[1]),:]
975
- after_number=len(sumstats)
976
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad CHISQ.")
977
-
978
- pre_number=len(sumstats)
979
- if "Z" in coltocheck and "Z" in sumstats.columns:
980
- cols_to_check.append("Z")
981
- if verbose: log.write(" -Checking if ",z[0],"<Z<",z[1]," ...")
982
- sumstats.loc[:,"Z"] = pd.to_numeric(sumstats.loc[:,"Z"], errors='coerce').astype("float64")
983
- sumstats = sumstats.loc[(sumstats["Z"]>z[0]) & (sumstats["Z"]<z[1]),:]
984
- after_number=len(sumstats)
985
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad Z.")
986
-
987
- pre_number=len(sumstats)
988
- if "F" in coltocheck and "F" in sumstats.columns:
989
- cols_to_check.append("F")
990
- if verbose: log.write(" -Checking if ",f[0],"<F<",f[1]," ...")
991
- sumstats.loc[:,"F"] = pd.to_numeric(sumstats.loc[:,"F"], errors='coerce').astype("float64")
992
- sumstats = sumstats.loc[(sumstats["F"]>f[0]) & (sumstats["F"]<f[1]),:]
993
- after_number=len(sumstats)
994
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad F.")
995
-
996
- ###P ################################################################################################################################################
997
- pre_number=len(sumstats)
998
- if "P" in coltocheck and "P" in sumstats.columns:
999
- cols_to_check.append("P")
1000
- if verbose: log.write(" -Checking if ",p[0],"< P <",p[1]," ...")
1001
- sumstats.loc[:,"P"] = pd.to_numeric(sumstats.loc[:,"P"], errors='coerce').astype("float64")
1002
- sumstats = sumstats.loc[(sumstats["P"]>p[0]) & (sumstats["P"]<p[1]),:]
1003
-
1004
- is_low_p = sumstats["P"] == 0
1005
- if sum(is_low_p) >0:
1006
- log.write(" -WARNING! Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(sum(is_low_p)), verbose=verbose)
1007
- log.write(" -WARNING! Please consider using MLOG10P instead.", verbose=verbose)
1008
- after_number=len(sumstats)
1009
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad P.")
1010
-
1011
- pre_number=len(sumstats)
1012
- if "MLOG10P" in coltocheck and "MLOG10P" in sumstats.columns:
1013
- cols_to_check.append("MLOG10P")
1014
- if verbose: log.write(" -Checking if ",mlog10p[0],"<MLOG10P<",mlog10p[1]," ...")
1015
- sumstats.loc[:,"MLOG10P"] = pd.to_numeric(sumstats.loc[:,"MLOG10P"], errors='coerce').astype("float64")
1016
- sumstats = sumstats.loc[(sumstats["MLOG10P"]>mlog10p[0]) & (sumstats["MLOG10P"]<mlog10p[1]),:]
1017
- after_number=len(sumstats)
1018
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad MLOG10P.")
1019
-
1020
- ###EFFECT ################################################################################################################################################
1021
- pre_number=len(sumstats)
1022
- if "BETA" in coltocheck and "BETA" in sumstats.columns:
1023
- cols_to_check.append("BETA")
1024
- if verbose: log.write(" -Checking if ",beta[0],"<BETA<",beta[1]," ...")
1025
- sumstats.loc[:,"BETA"] = pd.to_numeric(sumstats.loc[:,"BETA"], errors='coerce').astype("float64")
1026
- sumstats = sumstats.loc[(sumstats["BETA"]>beta[0]) & (sumstats["BETA"]<beta[1]),:]
1027
- after_number=len(sumstats)
1028
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad BETA.")
1029
-
1030
- pre_number=len(sumstats)
1031
- if "SE" in coltocheck and "SE" in sumstats.columns:
1032
- cols_to_check.append("SE")
1033
- if verbose: log.write(" -Checking if ",se[0],"<SE<",se[1]," ...")
1034
- sumstats.loc[:,"SE"] = pd.to_numeric(sumstats.loc[:,"SE"], errors='coerce').astype("float64")
1035
- sumstats = sumstats.loc[(sumstats["SE"]>se[0]) & (sumstats["SE"]<se[1]),:]
1036
- after_number=len(sumstats)
1037
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad SE.")
1038
-
1039
- pre_number=len(sumstats)
1040
- if "OR" in coltocheck and "OR" in sumstats.columns:
1041
- cols_to_check.append("OR")
1042
- if verbose: log.write(" -Checking if ",OR[0],"<log(OR)<",OR[1]," ...")
1043
- sumstats.loc[:,"OR"] = pd.to_numeric(sumstats.loc[:,"OR"], errors='coerce').astype("float64")
1044
- sumstats = sumstats.loc[(np.log(sumstats["OR"])>OR[0]) & (np.log(sumstats["OR"])<OR[1]),:]
1045
- after_number=len(sumstats)
1046
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad OR.")
1047
-
1048
- pre_number=len(sumstats)
1049
- if "OR_95L" in coltocheck and "OR_95L" in sumstats.columns:
1050
- cols_to_check.append("OR_95L")
1051
- if verbose: log.write(" -Checking if ",OR_95L[0],"<OR_95L<",OR_95L[1]," ...")
1052
- sumstats.loc[:,"OR_95L"] = pd.to_numeric(sumstats.loc[:,"OR_95L"], errors='coerce').astype("float64")
1053
- sumstats = sumstats.loc[(sumstats["OR_95L"]>OR_95L[0]) & (sumstats["OR_95L"]<OR_95L[1]),:]
1054
- after_number=len(sumstats)
1055
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad OR_95L.")
1056
-
1057
- pre_number=len(sumstats)
1058
- if "OR_95U" in coltocheck and "OR_95U" in sumstats.columns:
1059
- cols_to_check.append("OR_95U")
1060
- if verbose: log.write(" -Checking if ",OR_95U[0],"<OR_95U<",OR_95U[1]," ...")
1061
- sumstats.loc[:,"OR_95U"] = pd.to_numeric(sumstats.loc[:,"OR_95U"], errors='coerce').astype("float64")
1062
- sumstats = sumstats.loc[(sumstats["OR_95U"]>OR_95U[0]) & (sumstats["OR_95U"]<OR_95U[1]),:]
1063
- after_number=len(sumstats)
1064
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad OR_95U.")
1065
-
1066
- pre_number=len(sumstats)
1067
- if "HR" in coltocheck and "HR" in sumstats.columns:
1068
- cols_to_check.append("HR")
1069
- if verbose: log.write(" -Checking if ",HR[0],"<log(HR)<",HR[1]," ...")
1070
- sumstats.loc[:,"HR"] = pd.to_numeric(sumstats.loc[:,"HR"], errors='coerce').astype("float64")
1071
- sumstats = sumstats.loc[(np.log(sumstats["HR"])>HR[0]) & (np.log(sumstats["HR"])<HR[1]),:]
1072
- after_number=len(sumstats)
1073
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad HR.")
1074
-
1075
- pre_number=len(sumstats)
1076
- if "HR_95L" in coltocheck and "HR_95L" in sumstats.columns:
1077
- cols_to_check.append("HR_95L")
1078
- if verbose: log.write(" -Checking if ",HR_95L[0],"<HR_95L<",HR_95L[1]," ...")
1079
- sumstats.loc[:,"HR_95L"] = pd.to_numeric(sumstats.loc[:,"HR_95L"], errors='coerce').astype("float64")
1080
- sumstats = sumstats.loc[(sumstats["HR_95L"]>HR_95L[0]) & (sumstats["HR_95L"]<HR_95L[1]),:]
1081
- after_number=len(sumstats)
1082
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad HR_95L.")
1083
-
1084
- pre_number=len(sumstats)
1085
- if "HR_95U" in coltocheck and "HR_95U" in sumstats.columns:
1086
- cols_to_check.append("HR_95U")
1087
- if verbose: log.write(" -Checking if ",HR_95U[0],"<HR_95U<",HR_95U[1]," ...")
1088
- sumstats.loc[:,"HR_95U"] = pd.to_numeric(sumstats.loc[:,"HR_95U"], errors='coerce').astype("float64")
1089
- sumstats = sumstats.loc[(sumstats["HR_95U"]>HR_95U[0]) & (sumstats["HR_95U"]<HR_95U[1]),:]
1090
- after_number=len(sumstats)
1091
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad HR_95U.")
1092
- #INFO #################################################################################################################
1093
- pre_number=len(sumstats)
1094
- if "INFO" in coltocheck and "INFO" in sumstats.columns:
1095
- cols_to_check.append("INFO")
1096
- if verbose: log.write(" -Checking if ",info[0],"<INFO<",info[1]," ...")
1097
- sumstats.loc[:,"INFO"] = pd.to_numeric(sumstats.loc[:,"INFO"], errors='coerce').astype("float32")
1098
- sumstats = sumstats.loc[(sumstats["INFO"]>info[0]) & (sumstats["INFO"]<info[1]),:]
1099
- after_number=len(sumstats)
1100
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad INFO.")
1101
- ###STATUS ################################################################################################################################################
1102
- pre_number=len(sumstats)
1103
- if "STATUS" in coltocheck and "STATUS" in sumstats.columns:
1104
- cols_to_check.append("STATUS")
1105
- if verbose: log.write(" -Checking STATUS and converting STATUS to categories....")
1106
- categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
1107
- sumstats.loc[:,"STATUS"] = pd.Categorical(sumstats["STATUS"],categories=categories)
1108
-
1109
- #pre_number=len(sumstats)
1110
- #sumstats = sumstats.dropna(subset=cols_to_check)
1111
- after_number=len(sumstats)
1112
- #if verbose:log.write(" -Removed {} variants with NAs in the checked columns...".format(pre_number - after_number))
1075
+ ###float64 ################################################################################################################################################
1076
+ sumstats = check_range(sumstats, var_range=chisq, header="CHISQ", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1077
+ sumstats = check_range(sumstats, var_range=z, header="Z", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1078
+ sumstats = check_range(sumstats, var_range=t, header="T", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1079
+ sumstats = check_range(sumstats, var_range=f, header="F", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1080
+ sumstats = check_range(sumstats, var_range=p, header="P", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1081
+ sumstats = check_range(sumstats, var_range=mlog10p, header="MLOG10P", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1082
+ sumstats = check_range(sumstats, var_range=beta, header="BETA", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1083
+ sumstats = check_range(sumstats, var_range=se, header="SE", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1084
+ sumstats = check_range(sumstats, var_range=OR, header="OR", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1085
+ sumstats = check_range(sumstats, var_range=OR_95L, header="OR_95L", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1086
+ sumstats = check_range(sumstats, var_range=OR_95U, header="OR_95U", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1087
+ sumstats = check_range(sumstats, var_range=HR, header="HR", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1088
+ sumstats = check_range(sumstats, var_range=HR_95L, header="HR_95L", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1089
+ sumstats = check_range(sumstats, var_range=HR_95U, header="HR_95U", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1090
+ ###STATUS ###############################################################################################################################################
1091
+ sumstats = check_range(sumstats, var_range=None, header="STATUS", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="category")
1113
1092
 
1114
- if verbose: log.write(" -Removed "+str(oringinal_number - after_number)+" variants with bad statistics in total.")
1115
- if verbose:
1116
- log.write(" -Data types for each column:")
1117
- check_datatype(sumstats,verbose=verbose, log=log)
1118
- if verbose: log.write("Finished sanity check successfully!")
1093
+ after_number=len(sumstats)
1094
+ log.write(" -Removed "+str(oringinal_number - after_number)+" variants with bad statistics in total.",verbose=verbose)
1095
+ log.write(" -Data types for each column:",verbose=verbose)
1096
+ check_datatype(sumstats,verbose=verbose, log=log)
1097
+ finished(log,verbose,_end_line)
1119
1098
  return sumstats
1120
1099
 
1121
1100
  ### check consistency #############################################################################################################################################
1122
1101
 
1123
- def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verbose=True,log=Log()):
1124
- if verbose: log.write("Start to check data consistency across columns...{}".format(_get_version()))
1125
- check_dataframe_shape(sumstats, log, verbose)
1102
+ def _check_data_consistency(sumstats, beta="BETA", se="SE", p="P",mlog10p="MLOG10P",rtol=1e-3, atol=1e-3, equal_nan=True, verbose=True,log=Log()):
1103
+ ##start function with col checking##########################################################
1104
+ _start_line = "check data consistency across columns"
1105
+ _end_line = "checking data consistency across columns"
1106
+ _start_cols =[]
1107
+ _start_function = ".check_data_consistency()"
1108
+ _must_args ={}
1109
+
1110
+ is_enough_info = start_to(sumstats=sumstats,
1111
+ log=log,
1112
+ verbose=verbose,
1113
+ start_line=_start_line,
1114
+ end_line=_end_line,
1115
+ start_cols=_start_cols,
1116
+ start_function=_start_function,
1117
+ **_must_args)
1118
+ if is_enough_info == False: return sumstats
1119
+ ############################################################################################
1120
+
1126
1121
  log.write(" -Tolerance: {} (Relative) and {} (Absolute)".format(rtol, atol),verbose=verbose)
1122
+ check_status = 0
1127
1123
 
1128
-
1129
- if "SNPID" not in sumstats.columns:
1124
+ if "SNPID" in sumstats.columns:
1125
+ id_to_use = "SNPID"
1126
+ elif "rsID" in sumstats.columns:
1130
1127
  id_to_use = "rsID"
1131
1128
  else:
1132
- id_to_use = "SNPID"
1129
+ log.write(" -SNPID/rsID not available...SKipping",verbose=verbose)
1130
+ log.write("Finished checking data consistency across columns.",verbose=verbose)
1131
+ return 0
1132
+
1133
1133
 
1134
1134
  if "BETA" in sumstats.columns and "SE" in sumstats.columns:
1135
1135
  if "MLOG10P" in sumstats.columns:
@@ -1138,10 +1138,11 @@ def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verb
1138
1138
  is_close = np.isclose(betase_derived_mlog10p, sumstats["MLOG10P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
1139
1139
  diff = betase_derived_mlog10p - sumstats["MLOG10P"]
1140
1140
  if sum(~is_close)>0:
1141
- log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose))
1142
- log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose))
1141
+ log.write(" -Not consistent: {} variant(s)".format(sum(~is_close)),verbose=verbose)
1142
+ log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max()),verbose=verbose)
1143
1143
  else:
1144
1144
  log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
1145
+ check_status=1
1145
1146
 
1146
1147
  if "P" in sumstats.columns:
1147
1148
  log.write(" -Checking if BETA/SE-derived-P is consistent with P...",verbose=verbose)
@@ -1149,10 +1150,11 @@ def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verb
1149
1150
  is_close = np.isclose(betase_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
1150
1151
  diff = betase_derived_p - sumstats["P"]
1151
1152
  if sum(~is_close)>0:
1152
- log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose))
1153
- log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose))
1153
+ log.write(" -Not consistent: {} variant(s)".format(sum(~is_close)),verbose=verbose)
1154
+ log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max()),verbose=verbose)
1154
1155
  else:
1155
1156
  log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
1157
+ check_status=1
1156
1158
 
1157
1159
  if "MLOG10P" in sumstats.columns and "P" in sumstats.columns:
1158
1160
  log.write(" -Checking if MLOG10P-derived-P is consistent with P...",verbose=verbose)
@@ -1160,25 +1162,30 @@ def _check_data_consistency(sumstats, rtol=1e-3, atol=1e-3, equal_nan=True, verb
1160
1162
  is_close = np.isclose(mlog10p_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
1161
1163
  diff = mlog10p_derived_p - sumstats["P"]
1162
1164
  if sum(~is_close)>0:
1163
- log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose))
1164
- log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose))
1165
+ log.write(" -Not consistent: {} variant(s)".format(sum(~is_close)),verbose=verbose)
1166
+ log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max()),verbose=verbose)
1165
1167
  else:
1166
1168
  log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
1169
+ check_status=1
1167
1170
 
1168
1171
  if "N" in sumstats.columns and "N_CONTROL" in sumstats.columns and "N_CASE" in sumstats.columns:
1169
- if verbose: log.write(" -Checking if N is consistent with N_CASE + N_CONTROL ...")
1170
- is_close = sumstats.loc[:,"N"] == sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"]
1171
- #is_close = np.isclose(sumstats.loc[:,"N"], sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"] , rtol=rtol, atol=atol, equal_nan=equal_nan)
1172
- diff = abs(sumstats.loc[:,"N"] - (sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"] ))
1172
+ log.write(" -Checking if N is consistent with N_CASE + N_CONTROL ...", verbose=verbose)
1173
+ is_close = sumstats["N"] == sumstats["N_CASE"] + sumstats["N_CONTROL"]
1174
+ #is_close = np.isclose(sumstats["N"], sumstats["N_CASE"] + sumstats["N_CONTROL"] , rtol=rtol, atol=atol, equal_nan=equal_nan)
1175
+ diff = abs(sumstats["N"] - (sumstats["N_CASE"] + sumstats["N_CONTROL"] ))
1173
1176
  if sum(~is_close)>0:
1174
- log.write(" -Not consistent: {} variant(s)".format(sum(~is_close),verbose=verbose))
1175
- log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max(),verbose=verbose))
1177
+ log.write(" -Not consistent: {} variant(s)".format(sum(~is_close)),verbose=verbose)
1178
+ log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[diff.idxmax(),id_to_use], diff.max()),verbose=verbose)
1176
1179
  else:
1177
1180
  log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
1181
+ check_status=1
1182
+
1183
+ if check_status==1:
1184
+ log.write(" -Note: if the max difference is greater than expected, please check your original sumstats.",verbose=verbose)
1185
+ else:
1186
+ log.write(" -No availalbe columns for data consistency checking...Skipping...",verbose=verbose)
1187
+ finished(log,verbose,_end_line)
1178
1188
 
1179
- log.write(" -Note: if the max difference is greater than expected, please check your original sumstats.",verbose=verbose)
1180
-
1181
- if verbose: log.write("Finished checking data consistency across columns.")
1182
1189
  ###############################################################################################################
1183
1190
  # 20220426
1184
1191
  def get_reverse_complementary_allele(a):
@@ -1201,178 +1208,166 @@ def flip_direction(string):
1201
1208
  else: #sometime it is 0
1202
1209
  flipped_string+=char
1203
1210
  return flipped_string
1204
-
1211
+
1212
def flip_by_swap(sumstats, matched_index, log, verbose):
    """Exchange the EA and NEA values of the selected rows in place.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Table to modify; it is changed in place and also returned.
    matched_index
        Row selector passed to ``sumstats.loc``.
    log
        Logger object exposing ``write(message, verbose=...)``.
    verbose : bool
        Forwarded to ``log.write``.

    Returns
    -------
    pandas.DataFrame
        The same ``sumstats`` object.
    """
    allele_cols = ("NEA", "EA")
    # Nothing to swap unless both allele columns are present.
    if not all(col in sumstats.columns for col in allele_cols):
        return sumstats

    log.write(" -Swapping column: NEA <=> EA...", verbose=verbose)
    # Take a plain array of (EA, NEA) so the assignment is positional
    # rather than aligned on column labels.
    swapped = sumstats.loc[matched_index, ["EA", "NEA"]].values
    sumstats.loc[matched_index, ["NEA", "EA"]] = swapped
    return sumstats
1217
+
1218
def flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Flip ratio-scale statistics (OR, HR and their 95% bounds) in place.

    For the selected rows the point estimates become ``factor / value``.
    The confidence bounds are inverted *and* exchanged: the new upper bound
    is ``factor / old lower bound`` and the new lower bound is
    ``factor / old upper bound``.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Table to modify; it is changed in place and also returned.
    matched_index
        Row selector passed to ``sumstats.loc``.
    log
        Logger object exposing ``write(message, verbose=...)``.
    verbose : bool
        Forwarded to ``log.write``.
    cols
        Accepted for interface compatibility; currently not used.
    factor : numeric, default 1
        Numerator of the inversion.

    Returns
    -------
    pandas.DataFrame
        The same ``sumstats`` object.
    """
    for ratio in ("OR", "HR"):
        lower = ratio + "_95L"
        upper = ratio + "_95U"
        has_lower = lower in sumstats.columns
        has_upper = upper in sumstats.columns

        # Snapshot both bounds BEFORE writing anything. Computing the new
        # lower bound from an already-overwritten upper bound would simply
        # restore the original lower bound.
        new_upper = factor / sumstats.loc[matched_index, lower].values if has_lower else None
        new_lower = factor / sumstats.loc[matched_index, upper].values if has_upper else None

        if ratio in sumstats.columns:
            log.write(" -Flipping column: {0} = 1 / {0}...".format(ratio), verbose=verbose)
            sumstats.loc[matched_index, ratio] = factor / sumstats.loc[matched_index, ratio].values
        if has_lower:
            log.write(" -Flipping column: {0}_95U = 1 / {0}_95L...".format(ratio), verbose=verbose)
            sumstats.loc[matched_index, upper] = new_upper
        if has_upper:
            log.write(" -Flipping column: {0}_95L = 1 / {0}_95U...".format(ratio), verbose=verbose)
            sumstats.loc[matched_index, lower] = new_lower
    return sumstats
1238
+
1239
def flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Replace EAF with ``factor - EAF`` for the selected rows, in place.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Table to modify; it is changed in place and also returned.
    matched_index
        Row selector passed to ``sumstats.loc``.
    log
        Logger object exposing ``write(message, verbose=...)``.
    verbose : bool
        Forwarded to ``log.write``.
    cols
        Accepted but not used by this function.
    factor : numeric, default 1
        Value the current EAF is subtracted from.

    Returns
    -------
    pandas.DataFrame
        The same ``sumstats`` object.
    """
    if "EAF" not in sumstats.columns:
        return sumstats

    log.write(" -Flipping column: EAF = 1 - EAF...", verbose=verbose)
    current_eaf = sumstats.loc[matched_index, "EAF"].values
    sumstats.loc[matched_index, "EAF"] = factor - current_eaf
    return sumstats
1244
+
1245
def flip_by_sign(sumstats, matched_index, log, verbose, cols=None):
    """Negate signed statistics of the selected rows in place.

    BETA, Z and T are multiplied by -1. The BETA confidence bounds are
    negated *and* exchanged (new upper = -old lower, new lower = -old upper).
    DIRECTION strings are converted with ``flip_direction``.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Table to modify; it is changed in place and also returned.
    matched_index
        Row selector passed to ``sumstats.loc``.
    log
        Logger object exposing ``write(message, verbose=...)``.
    verbose : bool
        Forwarded to ``log.write``.
    cols
        Accepted for interface compatibility; currently not used.

    Returns
    -------
    pandas.DataFrame
        The same ``sumstats`` object.
    """
    if "BETA" in sumstats.columns:
        log.write(" -Flipping column: BETA = - BETA...", verbose=verbose)
        sumstats.loc[matched_index, "BETA"] = - sumstats.loc[matched_index, "BETA"].values

    has_lower = "BETA_95L" in sumstats.columns
    has_upper = "BETA_95U" in sumstats.columns
    # Snapshot both bounds before writing: deriving the new lower bound from
    # an already-overwritten upper bound would restore the old lower bound.
    new_upper = - sumstats.loc[matched_index, "BETA_95L"].values if has_lower else None
    new_lower = - sumstats.loc[matched_index, "BETA_95U"].values if has_upper else None
    if has_lower:
        log.write(" -Flipping column: BETA_95U = - BETA_95L...", verbose=verbose)
        sumstats.loc[matched_index, "BETA_95U"] = new_upper
    if has_upper:
        log.write(" -Flipping column: BETA_95L = - BETA_95U...", verbose=verbose)
        sumstats.loc[matched_index, "BETA_95L"] = new_lower

    # Z and T are negated into their own columns (T must not overwrite Z).
    for stat in ("Z", "T"):
        if stat in sumstats.columns:
            log.write(" -Flipping column: {0} = - {0}...".format(stat), verbose=verbose)
            sumstats.loc[matched_index, stat] = - sumstats.loc[matched_index, stat].values

    if "DIRECTION" in sumstats.columns:
        log.write(" -Flipping column: DIRECTION +-?0 <=> -+?0 ...", verbose=verbose)
        sumstats.loc[matched_index, "DIRECTION"] = sumstats.loc[matched_index, "DIRECTION"].apply(flip_direction)
    return sumstats
1265
+
1205
1266
  def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1206
-
1207
- check_dataframe_shape(sumstats, log, verbose)
1208
-
1267
+ ##start function with col checking##########################################################
1268
+ _start_line = "adjust statistics based on STATUS code"
1269
+ _end_line = "adjusting statistics based on STATUS code"
1270
+ _start_cols =[]
1271
+ _start_function = ".flip_allele_stats()"
1272
+ _must_args ={}
1273
+
1274
+ is_enough_info = start_to(sumstats=sumstats,
1275
+ log=log,
1276
+ verbose=verbose,
1277
+ start_line=_start_line,
1278
+ end_line=_end_line,
1279
+ start_cols=_start_cols,
1280
+ start_function=_start_function,
1281
+ **_must_args)
1282
+ if is_enough_info == False: return sumstats
1283
+ ############################################################################################
1284
+
1285
+ if_stats_flipped = False
1209
1286
  ###################get reverse complementary####################
1210
1287
  pattern = r"\w\w\w\w\w[45]\w"
1211
1288
  #matched_index = status_match(sumstats[status],6,[4,5]) #
1212
1289
  matched_index = sumstats[status].str[5].str.match(r"4|5")
1213
1290
  if sum(matched_index)>0:
1214
- if verbose: log.write("Start to convert alleles to reverse complement for SNPs with status xxxxx[45]x...{}".format(_get_version()))
1215
- if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
1291
+ log.write("Start to convert alleles to reverse complement for SNPs with status xxxxx[45]x...{}".format(_get_version()), verbose=verbose)
1292
+ log.write(" -Flipping "+ str(sum(matched_index)) +" variants...", verbose=verbose)
1216
1293
  if ("NEA" in sumstats.columns) and ("EA" in sumstats.columns) :
1217
- if verbose: log.write(" -Converting to reverse complement : EA and NEA...")
1294
+ log.write(" -Converting to reverse complement : EA and NEA...", verbose=verbose)
1218
1295
  reverse_complement_nea = sumstats.loc[matched_index,'NEA'].apply(lambda x :get_reverse_complementary_allele(x))
1219
1296
  reverse_complement_ea = sumstats.loc[matched_index,'EA'].apply(lambda x :get_reverse_complementary_allele(x))
1220
- categories = set(sumstats.loc[:,'EA'])|set(sumstats.loc[:,'NEA']) |set(reverse_complement_ea) |set(reverse_complement_nea)
1221
- sumstats.loc[:,'EA']=pd.Categorical(sumstats.loc[:,'EA'],categories = categories)
1222
- sumstats.loc[:,'NEA']=pd.Categorical(sumstats.loc[:,'NEA'],categories = categories )
1297
+ categories = set(sumstats['EA'])|set(sumstats['NEA']) |set(reverse_complement_ea) |set(reverse_complement_nea)
1298
+ sumstats['EA']=pd.Categorical(sumstats['EA'],categories = categories)
1299
+ sumstats['NEA']=pd.Categorical(sumstats['NEA'],categories = categories )
1223
1300
  sumstats.loc[matched_index,['NEA']] = reverse_complement_nea
1224
1301
  sumstats.loc[matched_index,['EA']] = reverse_complement_ea
1225
1302
  sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 6, "4","2")
1226
- if verbose: log.write(" -Changed the status for flipped variants : xxxxx4x -> xxxxx2x")
1227
-
1303
+ log.write(" -Changed the status for flipped variants : xxxxx4x -> xxxxx2x", verbose=verbose)
1304
+ if_stats_flipped = True
1228
1305
  ###################flip ref####################
1229
1306
  pattern = r"\w\w\w\w\w[35]\w"
1230
1307
  #matched_index = status_match(sumstats[status],6,[3,5]) #sumstats[status].str.match(pattern)
1231
1308
  matched_index = sumstats[status].str[5].str.match(r"3|5")
1232
1309
  if sum(matched_index)>0:
1233
- if verbose: log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x: alt->ea , ref->nea ...{}".format(_get_version()))
1234
- if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
1235
- if ("NEA" in sumstats.columns) and ("EA" in sumstats.columns) :
1236
- if verbose: log.write(" -Swapping column: NEA <=> EA...")
1237
- sumstats.loc[matched_index,['NEA','EA']] = sumstats.loc[matched_index,['EA','NEA']].values
1238
- if "BETA" in sumstats.columns:
1239
- if verbose: log.write(" -Flipping column: BETA = - BETA...")
1240
- sumstats.loc[matched_index,"BETA"] = - sumstats.loc[matched_index,"BETA"].values
1241
- if "BETA_95L" in sumstats.columns:
1242
- if verbose: log.write(" -Flipping column: BETA_95L = - BETA_95L...")
1243
- sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
1244
- if "BETA_95U" in sumstats.columns:
1245
- if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
1246
- sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
1247
- if "EAF" in sumstats.columns:
1248
- if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
1249
- sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
1250
- if "OR" in sumstats.columns:
1251
- if verbose: log.write(" -Flipping column: OR = 1 / OR...")
1252
- sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
1253
- if "OR_95L" in sumstats.columns:
1254
- if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
1255
- sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
1256
- if "OR_95U" in sumstats.columns:
1257
- if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
1258
- sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
1259
- if "HR" in sumstats.columns:
1260
- if verbose: log.write(" -Flipping column: HR = 1 / HR...")
1261
- sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
1262
- if "HR_95L" in sumstats.columns:
1263
- if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
1264
- sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
1265
- if "HR_95U" in sumstats.columns:
1266
- if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
1267
- sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
1268
- if "DIRECTION" in sumstats.columns:
1269
- if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
1270
- sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
1310
+ log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x: ALT->EA , REF->NEA ...{}".format(_get_version()), verbose=verbose)
1311
+ log.write(" -Flipping "+ str(sum(matched_index)) +" variants...", verbose=verbose)
1312
+
1313
+ flip_by_swap(sumstats, matched_index, log, verbose)
1314
+ flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
1315
+ flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
1316
+ flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
1317
+
1271
1318
  #change status
1272
- if verbose: log.write(" -Changed the status for flipped variants : xxxxx[35]x -> xxxxx[12]x")
1319
+ log.write(" -Changed the status for flipped variants : xxxxx[35]x -> xxxxx[12]x", verbose=verbose)
1273
1320
  sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 6, "35","12")
1321
+ if_stats_flipped = True
1274
1322
 
1275
1323
  ###################flip ref for undistingushable indels####################
1276
1324
  pattern = r"\w\w\w\w[123][67]6"
1277
1325
  #matched_index = status_match(sumstats[status],6,[1,2,3])|status_match(sumstats[status],6,[6,7])|status_match(sumstats[status],7,6) #sumstats[status].str.match(pattern)
1278
1326
  matched_index = sumstats[status].str[4:].str.match(r"[123][67]6")
1279
1327
  if sum(matched_index)>0:
1280
- if verbose: log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]: alt->ea , ref->nea...{}".format(_get_version()))
1281
- if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
1282
- if ("NEA" in sumstats.columns) and ("EA" in sumstats.columns) :
1283
- if verbose: log.write(" -Swapping column: NEA <=> EA...")
1284
- sumstats.loc[matched_index,['NEA','EA']] = sumstats.loc[matched_index,['EA','NEA']].values
1285
- if "BETA" in sumstats.columns:
1286
- if verbose: log.write(" -Flipping column: BETA = - BETA...")
1287
- sumstats.loc[matched_index,"BETA"] = - sumstats.loc[matched_index,"BETA"].values
1288
- if "BETA_95L" in sumstats.columns:
1289
- if verbose: log.write(" -Flipping column: BETA_95L = - BETA_95L...")
1290
- sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
1291
- if "BETA_95U" in sumstats.columns:
1292
- if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
1293
- sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
1294
- if "EAF" in sumstats.columns:
1295
- if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
1296
- sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
1297
- if "OR" in sumstats.columns:
1298
- if verbose: log.write(" -Flipping column: OR = 1 / OR...")
1299
- sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
1300
- if "OR_95L" in sumstats.columns:
1301
- if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
1302
- sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
1303
- if "OR_95U" in sumstats.columns:
1304
- if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
1305
- sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
1306
- if "HR" in sumstats.columns:
1307
- if verbose: log.write(" -Flipping column: HR = 1 / HR...")
1308
- sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
1309
- if "HR_95L" in sumstats.columns:
1310
- if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
1311
- sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
1312
- if "HR_95U" in sumstats.columns:
1313
- if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
1314
- sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
1315
- if "DIRECTION" in sumstats.columns:
1316
- if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
1317
- sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
1328
+ log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]: ALT->EA , REF->NEA...{}".format(_get_version()), verbose=verbose)
1329
+ log.write(" -Flipping "+ str(sum(matched_index)) +" variants...", verbose=verbose)
1330
+
1331
+ flip_by_swap(sumstats, matched_index, log, verbose)
1332
+ flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
1333
+ flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
1334
+ flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
1335
+
1318
1336
  #change status
1319
- if verbose: log.write(" -Changed the status for flipped variants xxxx[123][67]6 -> xxxx[123][67]4")
1337
+ log.write(" -Changed the status for flipped variants xxxx[123][67]6 -> xxxx[123][67]4", verbose=verbose)
1320
1338
  sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 7, "6","4")
1339
+ if_stats_flipped = True
1321
1340
  # flip ref
1322
1341
  ###################flip statistics for reverse strand panlindromic variants####################
1323
1342
  pattern = r"\w\w\w\w\w[012]5"
1324
1343
  #matched_index = status_match(sumstats[status],6,[0,1,2]) | status_match(sumstats[status],7,[5])#sumstats[status].str.match(pattern)
1325
1344
  matched_index = sumstats[status].str[5:].str.match(r"05|15|25")
1326
1345
  if sum(matched_index)>0:
1327
- if verbose: log.write("Start to flip allele-specific stats for palindromic SNPs with status xxxxx[12]5: (-)strand <=> (+)strand...{}".format(_get_version()))
1328
- if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
1329
- if "BETA" in sumstats.columns:
1330
- if verbose: log.write(" -Flipping column: BETA = - BETA...")
1331
- sumstats.loc[matched_index,"BETA"] = - sumstats.loc[matched_index,"BETA"].values
1332
- if "BETA_95L" in sumstats.columns:
1333
- if verbose: log.write(" -Flipping column: BETA_95L = - BETA_95L...")
1334
- sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
1335
- if "BETA_95U" in sumstats.columns:
1336
- if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
1337
- sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
1338
- if "EAF" in sumstats.columns:
1339
- if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
1340
- sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
1341
- if "OR" in sumstats.columns:
1342
- if verbose: log.write(" -Flipping column: OR = 1 / OR...")
1343
- sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
1344
- if "OR_95L" in sumstats.columns:
1345
- if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
1346
- sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
1347
- if "OR_95U" in sumstats.columns:
1348
- if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
1349
- sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
1350
- if "HR" in sumstats.columns:
1351
- if verbose: log.write(" -Flipping column: HR = 1 / HR...")
1352
- sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
1353
- if "HR_95L" in sumstats.columns:
1354
- if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
1355
- sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
1356
- if "HR_95U" in sumstats.columns:
1357
- if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
1358
- sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
1359
- if "DIRECTION" in sumstats.columns:
1360
- if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
1361
- sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
1346
+ log.write("Start to flip allele-specific stats for palindromic SNPs with status xxxxx[12]5: (-)strand <=> (+)strand...{}".format(_get_version()), verbose=verbose)
1347
+ log.write(" -Flipping "+ str(sum(matched_index)) +" variants...", verbose=verbose)
1348
+
1349
+ flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
1350
+ flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
1351
+ flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
1352
+
1362
1353
  #change status
1363
- if verbose: log.write(" -Changed the status for flipped variants: xxxxx[012]5: -> xxxxx[012]2")
1354
+ log.write(" -Changed the status for flipped variants: xxxxx[012]5: -> xxxxx[012]2", verbose=verbose)
1364
1355
  sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 7, "5","2")
1365
- if verbose: log.write("Finished converting successfully!")
1356
+ if_stats_flipped = True
1357
+
1358
+ if if_stats_flipped != True:
1359
+ log.write(" -No statistics have been changed.")
1360
+
1361
+ finished(log, verbose, _end_line)
1366
1362
  return sumstats
1367
- ""
1368
1363
 
1369
1364
 
1370
1365
  ###############################################################################################################
1371
1366
  # 20220426
1372
1367
  def liftover_snv(row,chrom,converter,to_build):
1373
1368
  status_pre=""
1374
- status_end=row[1][2]+"9"+row[1][4]+"99"
1375
- pos_0_based = int(row[0]) - 1
1369
+ status_end=row.iloc[1][2]+"9"+row.iloc[1][4]+"99"
1370
+ pos_0_based = int(row.iloc[0]) - 1
1376
1371
  results = converter[chrom][pos_0_based]
1377
1372
  if converter[chrom][pos_0_based]:
1378
1373
  # return chrom, pos_1_based
@@ -1402,29 +1397,42 @@ def liftover_variant(sumstats,
1402
1397
  return sumstats
1403
1398
 
1404
1399
  def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True, verbose=True,log=Log()):
1405
- if check_col(sumstats,chrom,pos,status) is not True:
1406
- if verbose: log.write("WARNING! .liftover(): specified columns not detected..skipping...")
1407
- return sumstats
1408
- if verbose: log.write("Start to perform liftover...{}".format(_get_version()))
1409
- check_dataframe_shape(sumstats, log, verbose)
1410
- if verbose: log.write(" -CPU Cores to use :",n_cores)
1411
- if verbose: log.write(" -Performing liftover ...")
1412
- if verbose: log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build)
1400
+ ##start function with col checking##########################################################
1401
+ _start_line = "perform liftover"
1402
+ _end_line = "liftover"
1403
+ _start_cols =[chrom,pos,status]
1404
+ _start_function = ".liftover()"
1405
+ _must_args ={}
1406
+
1407
+ is_enough_info = start_to(sumstats=sumstats,
1408
+ log=log,
1409
+ verbose=verbose,
1410
+ start_line=_start_line,
1411
+ end_line=_end_line,
1412
+ start_cols=_start_cols,
1413
+ start_function=_start_function,
1414
+ n_cores=n_cores,
1415
+ **_must_args)
1416
+ if is_enough_info == False: return sumstats
1417
+ ############################################################################################
1418
+
1419
+ log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build, verbose=verbose)
1413
1420
  # valid chr and pos
1414
1421
  pattern = r"\w\w\w0\w\w\w"
1415
1422
  to_lift = sumstats[status].str.match(pattern)
1416
1423
  sumstats = sumstats.loc[to_lift,:].copy()
1417
- if verbose: log.write(" -Converting variants with status code xxx0xxx :"+str(len(sumstats))+"...")
1424
+ log.write(" -Converting variants with status code xxx0xxx :"+str(len(sumstats))+"...", verbose=verbose)
1418
1425
  ###########################################################################
1419
1426
  if sum(to_lift)>0:
1420
1427
  if sum(to_lift)<10000:
1421
1428
  n_cores=1
1422
1429
 
1423
- df_split = np.array_split(sumstats.loc[:,[chrom,pos,status]], n_cores)
1430
+ #df_split = np.array_split(sumstats[[chrom,pos,status]], n_cores)
1431
+ df_split = _df_split(sumstats[[chrom,pos,status]], n_cores)
1424
1432
  pool = Pool(n_cores)
1425
1433
  #df = pd.concat(pool.starmap(func, df_split))
1426
1434
  func=liftover_variant
1427
- sumstats.loc[:,[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status),df_split))
1435
+ sumstats[[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status),df_split))
1428
1436
  pool.close()
1429
1437
  pool.join()
1430
1438
  ############################################################################
@@ -1432,78 +1440,183 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
1432
1440
  unmap_num = len(sumstats.loc[sumstats[pos].isna(),:])
1433
1441
 
1434
1442
  if remove is True:
1435
- if verbose: log.write(" -Removed unmapped variants: "+str(unmap_num))
1443
+ log.write(" -Removed unmapped variants: "+str(unmap_num), verbose=verbose)
1436
1444
  sumstats = sumstats.loc[~sumstats[pos].isna(),:]
1437
1445
 
1438
1446
  # after liftover check chr and pos
1439
1447
  sumstats = fixchr(sumstats,chrom=chrom,add_prefix="",remove=remove, verbose=True)
1440
1448
  sumstats = fixpos(sumstats,pos=pos,remove=remove, verbose=True)
1441
1449
 
1442
- if verbose: log.write("Finished liftover successfully!")
1450
+ finished(log,verbose,_end_line)
1443
1451
  return sumstats
1444
1452
 
1445
1453
  ###############################################################################################################
1446
1454
  # 20220426
1447
1455
  def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=Log()):
1448
- if check_col(sumstats,chrom,pos) is not True:
1449
- if verbose: log.write(".liftover(): specified columns not detected..skipping...")
1450
- return sumstats
1451
-
1452
- if verbose: log.write("Start to sort the genome coordinates...{}".format(_get_version()))
1453
- check_dataframe_shape(sumstats, log, verbose)
1456
+ ##start function with col checking##########################################################
1457
+ _start_line = "sort the genome coordinates"
1458
+ _end_line = "sorting coordinates"
1459
+ _start_cols =[chrom,pos]
1460
+ _start_function = ".sort_coordinate()"
1461
+ _must_args ={}
1462
+
1463
+ is_enough_info = start_to(sumstats=sumstats,
1464
+ log=log,
1465
+ verbose=verbose,
1466
+ start_line=_start_line,
1467
+ end_line=_end_line,
1468
+ start_cols=_start_cols,
1469
+ start_function=_start_function,
1470
+ **_must_args)
1471
+ if is_enough_info == False: return sumstats
1472
+ ############################################################################################
1454
1473
 
1455
1474
  try:
1456
1475
  if sumstats[pos].dtype == "Int64":
1457
1476
  pass
1458
1477
  else:
1459
- if verbose: log.write(" -Force converting POS to Int64...")
1478
+ log.write(" -Force converting POS to Int64...", verbose=verbose)
1460
1479
  sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
1461
1480
  except:
1462
1481
  pass
1463
-
1464
- if verbose: log.write(" -Sorting genome coordinates...")
1465
1482
  sumstats = sumstats.sort_values(by=[chrom,pos],ascending=True,ignore_index=True)
1466
- if verbose: log.write("Finished sorting genome coordinates successfully!")
1467
- gc.collect()
1483
+
1484
+ finished(log,verbose,_end_line)
1468
1485
  return sumstats
1469
1486
  ###############################################################################################################
1470
1487
  # 20230430 added HR HR_95 BETA_95 N_CASE N_CONTROL
1471
- def sortcolumn(sumstats,verbose=True,log=Log(),order = [
1488
+ def sortcolumn(sumstats,verbose=True,log=Log(),order = None):
1489
+ ##start function with col checking##########################################################
1490
+ _start_line = "reorder the columns"
1491
+ _end_line = "reordering the columns"
1492
+ _start_cols =[]
1493
+ _start_function = ".sort_column()"
1494
+ _must_args ={}
1495
+
1496
+ is_enough_info = start_to(sumstats=sumstats,
1497
+ log=log,
1498
+ verbose=verbose,
1499
+ start_line=_start_line,
1500
+ end_line=_end_line,
1501
+ start_cols=_start_cols,
1502
+ start_function=_start_function,
1503
+ **_must_args)
1504
+ if is_enough_info == False: return sumstats
1505
+ ############################################################################################
1506
+
1507
+ if order is None:
1508
+ order = [
1472
1509
  "SNPID","rsID", "CHR", "POS", "EA", "NEA", "EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z","T","F",
1473
- "CHISQ", "P", "MLOG10P", "OR", "OR_95L", "OR_95U","HR", "HR_95L", "HR_95U","INFO", "N","N_CASE","N_CONTROL","DIRECTION","I2","P_HET","DOF","SNPR2","STATUS"
1474
- ]):
1475
- if verbose: log.write("Start to reorder the columns...{}".format(_get_version()))
1476
- check_dataframe_shape(sumstats, log, verbose)
1477
-
1510
+ "CHISQ", "P", "MLOG10P", "OR", "OR_95L", "OR_95U","HR", "HR_95L", "HR_95U","INFO", "N","N_CASE","N_CONTROL","DIRECTION","I2","P_HET","DOF","SNPR2","STATUS"]
1478
1511
  output_columns = []
1479
1512
  for i in order:
1480
1513
  if i in sumstats.columns: output_columns.append(i)
1481
1514
  for i in sumstats.columns:
1482
1515
  if i not in order: output_columns.append(i)
1483
- if verbose: log.write(" -Reordering columns to :", ",".join(output_columns))
1484
- sumstats = sumstats.loc[:, output_columns]
1485
- if verbose: log.write("Finished sorting columns successfully!")
1516
+ log.write(" -Reordering columns to :", ",".join(output_columns), verbose=verbose)
1517
+ sumstats = sumstats[ output_columns]
1518
+
1519
+ finished(log,verbose,_end_line)
1486
1520
  return sumstats
1487
1521
 
1488
- def check_col(df,*args):
1522
+
1523
+ ###############################################################################################################
1524
def start_to(sumstats,
             log,
             verbose,
             start_line,
             end_line,
             start_cols,
             start_function,
             ref_vcf=None,
             ref_fasta=None,
             n_cores=None,
             ref_tsv=None,
             **args
             ):
    """Log the start of a processing step and check its prerequisites.

    Writes the "Start to ..." line, reports the dataframe shape via
    ``check_dataframe_shape``, verifies with ``check_col`` that the columns
    in ``start_cols`` exist, and verifies with ``check_arg`` that every extra
    keyword argument is not ``None``. If any check fails, ``skipped`` is
    called with ``end_line``.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Table whose ``columns`` are checked.
    log
        Logger object exposing ``write(message, verbose=...)``.
    verbose : bool
        Forwarded to every logging call made here.
    start_line, end_line : str
        Text fragments used in the "Start to ..." and "Skipped ..." messages.
    start_cols : list
        Column requirements handed to ``check_col``.
    start_function : str
        Name of the calling method, used in warnings.
    ref_vcf, ref_fasta, n_cores, ref_tsv
        Optional resources; each one that is not ``None`` is reported in the
        log when the column check succeeds.
    **args
        Mandatory arguments of the caller; a ``None`` value marks the step
        as not runnable.

    Returns
    -------
    bool
        True when all required columns and arguments are available.
    """
    log.write("Start to {}...{}".format(start_line,_get_version()), verbose=verbose)

    check_dataframe_shape(sumstats=sumstats,
                          log=log,
                          verbose=verbose)

    is_enough_col = check_col(sumstats.columns,
                              verbose=verbose,
                              log=log,
                              cols=start_cols,
                              function=start_function)

    if is_enough_col:
        # Report optional resources; honour `verbose` like every other message.
        resources = (
            (" -Number of threads/cores to use: {}", n_cores),
            (" -Reference VCF: {}", ref_vcf),
            (" -Reference FASTA: {}", ref_fasta),
            (" -Reference TSV: {}", ref_tsv),
        )
        for template, resource in resources:
            if resource is not None:
                log.write(template.format(resource), verbose=verbose)

    is_args_valid = True
    for key, value in args.items():
        # check_arg is evaluated for every argument (no short-circuit) so that
        # each missing one produces its own warning.
        is_args_valid = check_arg(log, verbose, key, value, start_function) and is_args_valid
    is_enough_col = bool(is_args_valid and is_enough_col)

    if not is_enough_col:
        skipped(log, verbose, end_line)

    return is_enough_col
1569
+
1570
def finished(log, verbose, end_line):
    """Write the "Finished ..." log line for a step, then run the garbage collector."""
    message = "Finished {}.".format(end_line)
    log.write(message, verbose=verbose)
    gc.collect()
1573
+
1574
def skipped(log, verbose, end_line):
    """Write the "Skipped ..." log line for a step, then run the garbage collector."""
    message = "Skipped {}.".format(end_line)
    log.write(message, verbose=verbose)
    gc.collect()
1577
+
1578
def check_arg(log, verbose, key, value, function):
    """Return True when ``value`` is provided; warn and return False when it is None.

    ``key`` is the argument name and ``function`` the step that needs it;
    both are only used to build the warning text. ``verbose`` is accepted
    for signature symmetry with the other helpers but is not used here.
    """
    if value is not None:
        return True
    warning_text = "Necessary argument {} for {} is not provided!".format(key, function)
    log.warning(warning_text)
    return False
1583
+
1584
def check_col(df_col_names, verbose=True, log=Log(), cols=None, function=None):
    """Check that every required column name is present.

    Parameters
    ----------
    df_col_names : container of available column names (membership-tested).
    verbose : forwarded to ``skipped`` when columns are missing.
    log : logger used for the warning and the "Skipped" line.
    cols : iterable of requirements. Each item is either a single column
        name (``str``) or an iterable of column names that are all required.
        ``None`` means there is nothing to check.
    function : optional name of the calling step, shown in the warning and
        passed to ``skipped`` as ``end_line``.

    Returns
    -------
    bool
        ``True`` when nothing is missing; ``False`` after logging a warning
        that lists the missing columns.
    """
    not_in_df = []
    # `cols=None` is the declared default; treat it as "no requirements"
    # instead of failing when iterating over None.
    for requirement in (cols if cols is not None else ()):
        if isinstance(requirement, str):
            # single check (isinstance so str subclasses are not iterated
            # character by character)
            candidates = (requirement,)
        else:
            # paired check: every member of the group must be present
            candidates = requirement
        not_in_df.extend(name for name in candidates if name not in df_col_names)

    if not not_in_df:
        return True

    if function is None:
        to_show_title = " "
    else:
        to_show_title = " for {} ".format(function)
    log.warning("Necessary columns{}were not detected:{}".format(to_show_title, ",".join(not_in_df)))
    skipped(log, verbose, end_line=function)
    return False
1509
1611
 
1612
+ ###############################################################################################################
1613
+ def _df_split(dataframe, n):
1614
+ chunks = []
1615
+ chunk_size = int(dataframe.shape[0] // n)+1
1616
+
1617
+ for index in range(0, dataframe.shape[0], chunk_size):
1618
+ chunks.append(
1619
+ dataframe.iloc[index:index + chunk_size]
1620
+ )
1621
+
1622
+ return chunks