gwaslab 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (42)
  1. gwaslab/__init__.py +1 -1
  2. gwaslab/data/formatbook.json +722 -721
  3. gwaslab/g_Log.py +8 -0
  4. gwaslab/g_Sumstats.py +80 -178
  5. gwaslab/g_SumstatsPair.py +6 -2
  6. gwaslab/g_Sumstats_summary.py +3 -3
  7. gwaslab/g_meta.py +13 -3
  8. gwaslab/g_version.py +2 -2
  9. gwaslab/hm_casting.py +29 -15
  10. gwaslab/hm_harmonize_sumstats.py +312 -159
  11. gwaslab/hm_rsid_to_chrpos.py +1 -1
  12. gwaslab/io_preformat_input.py +46 -37
  13. gwaslab/io_to_formats.py +428 -295
  14. gwaslab/qc_check_datatype.py +15 -1
  15. gwaslab/qc_fix_sumstats.py +956 -719
  16. gwaslab/util_ex_calculate_ldmatrix.py +29 -11
  17. gwaslab/util_ex_gwascatalog.py +1 -1
  18. gwaslab/util_ex_ldproxyfinder.py +1 -1
  19. gwaslab/util_ex_process_h5.py +26 -17
  20. gwaslab/util_ex_process_ref.py +3 -3
  21. gwaslab/util_ex_run_coloc.py +26 -4
  22. gwaslab/util_in_convert_h2.py +1 -1
  23. gwaslab/util_in_fill_data.py +44 -5
  24. gwaslab/util_in_filter_value.py +122 -34
  25. gwaslab/util_in_get_density.py +2 -2
  26. gwaslab/util_in_get_sig.py +41 -9
  27. gwaslab/viz_aux_quickfix.py +26 -21
  28. gwaslab/viz_aux_reposition_text.py +7 -4
  29. gwaslab/viz_aux_save_figure.py +6 -5
  30. gwaslab/viz_plot_compare_af.py +5 -5
  31. gwaslab/viz_plot_compare_effect.py +22 -5
  32. gwaslab/viz_plot_miamiplot2.py +28 -20
  33. gwaslab/viz_plot_mqqplot.py +214 -98
  34. gwaslab/viz_plot_qqplot.py +11 -8
  35. gwaslab/viz_plot_regionalplot.py +16 -9
  36. gwaslab/viz_plot_trumpetplot.py +15 -6
  37. {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/METADATA +3 -3
  38. gwaslab-3.4.38.dist-info/RECORD +72 -0
  39. gwaslab-3.4.36.dist-info/RECORD +0 -72
  40. {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
  41. {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
  42. {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
@@ -14,7 +14,12 @@ from gwaslab.bd_common_data import get_chr_to_number
14
14
  from gwaslab.bd_common_data import get_number_to_chr
15
15
  from gwaslab.bd_common_data import get_chr_list
16
16
  from gwaslab.qc_check_datatype import check_datatype
17
+ from gwaslab.qc_check_datatype import check_dataframe_shape
17
18
  from gwaslab.g_version import _get_version
19
+ from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
20
+ from gwaslab.util_in_fill_data import _convert_betase_to_p
21
+ from gwaslab.util_in_fill_data import _convert_mlog10p_to_p
22
+ #process build
18
23
  #setbuild
19
24
  #fixID
20
25
  #rsidtochrpos
@@ -26,6 +31,7 @@ from gwaslab.g_version import _get_version
26
31
  #normalizevariant
27
32
  #checkref
28
33
  #sanitycheckstats
34
+ #_check_data_consistency
29
35
  #flipallelestats
30
36
  #parallelizeassignrsid
31
37
  #sortcoordinate
@@ -41,18 +47,18 @@ def _process_build(build,log,verbose):
41
47
  log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
42
48
  final_build = "38"
43
49
  else:
44
- log.write(" -Version of genomic coordinates are unknown...", verbose=verbose)
50
+ log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
45
51
  final_build = "99"
46
52
  return final_build
47
53
 
48
54
  def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
49
55
  build = _process_build(build,log=log,verbose=verbose)
50
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status], 1, "139",build[0]*3)
51
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status], 2, "89",build[1]*3)
52
- return sumstats
56
+ sumstats[status] = vchange_status(sumstats[status], 1, "139",build[0]*3)
57
+ sumstats[status] = vchange_status(sumstats[status], 2, "89",build[1]*3)
58
+ return sumstats, build
53
59
 
54
60
  def fixID(sumstats,
55
- snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",
61
+ snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",fixprefix=False,
56
62
  fixchrpos=False,fixid=False,fixeanea=False,fixeanea_flip=False,fixsep=False,
57
63
  overwrite=False,verbose=True,forcefixid=False,log=Log()):
58
64
  '''
@@ -60,38 +66,79 @@ def fixID(sumstats,
60
66
  2. fix chr and pos using snpid
61
67
  3. checking rsid and chr:pos:nea:ea
62
68
  '''
63
- if verbose: log.write("Start to check IDs...{}".format(_get_version()))
64
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
65
-
66
- check_col(sumstats,[snpid,rsid],status)
67
-
69
+ ##start function with col checking##########################################################
70
+ _start_line = "check SNPID/rsID"
71
+ _end_line = "checking SNPID/rsID"
72
+ _start_cols =[]
73
+ _start_function = ".fix_id()"
74
+ _must_args ={}
75
+
76
+ is_enough_info = start_to(sumstats=sumstats,
77
+ log=log,
78
+ verbose=verbose,
79
+ start_line=_start_line,
80
+ end_line=_end_line,
81
+ start_cols=_start_cols,
82
+ start_function=_start_function,
83
+ **_must_args)
84
+ if is_enough_info == False: return sumstats
85
+ ############################################################################################
86
+
87
+ ############################ checking datatype ###################################################
88
+ if rsid in sumstats.columns:
89
+ # convert to string datatype
90
+ try:
91
+ log.write(" -Checking rsID data type...",verbose=verbose)
92
+ if sumstats[rsid].dtype == "string":
93
+ pass
94
+ else:
95
+ log.write(" -Converting rsID to pd.string data type...",verbose=verbose)
96
+ sumstats[rsid] = sumstats[rsid].astype("string")
97
+ except:
98
+ log.write(" -Force converting rsID to pd.string data type...",verbose=verbose)
99
+ sumstats[rsid] = sumstats[rsid].astype("string")
100
+ if snpid in sumstats.columns:
101
+ # convert to string datatype
102
+ try:
103
+ log.write(" -Checking SNPID data type...",verbose=verbose)
104
+ if sumstats[snpid].dtype == "string":
105
+ pass
106
+ else:
107
+ log.write(" -Converting SNPID to pd.string data type...",verbose=verbose)
108
+ sumstats[snpid] = sumstats[snpid].astype("string")
109
+ except:
110
+ log.write(" -Force converting SNPID to pd.string data type...",verbose=verbose)
111
+ sumstats[snpid] = sumstats[snpid].astype("string")
112
+
68
113
  ############################ checking ###################################################
69
114
  if snpid in sumstats.columns:
70
- if verbose: log.write(" -Checking if SNPID is chr:pos:ref:alt...(separator: - ,: , _)")
71
- #is_chrposrefalt = sumstats[snpid].str.match(r'(chr)?([0-9XYMT]+)[:_-]([0-9]+)[:_-]([ATCG]+)[:_-]([ATCG]+)', case=False, flags=0, na=False)
115
+ log.write(" -Checking if SNPID is CHR:POS:NEA:EA...(separator: - ,: , _)",verbose=verbose)
116
+ # check if SNPID is CHR:POS:EA:NEA
72
117
  is_chrposrefalt = sumstats[snpid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
118
+ # check if SNPID is NA
73
119
  is_snpid_na = sumstats[snpid].isna()
120
+
121
+ # change STATUS code
74
122
  sumstats.loc[ is_chrposrefalt,status] = vchange_status(sumstats.loc[ is_chrposrefalt,status],3 ,"975" ,"630")
75
123
  sumstats.loc[(~is_chrposrefalt)&(~is_snpid_na),status] = vchange_status(sumstats.loc[(~is_chrposrefalt)&(~is_snpid_na),status],3 ,"975" ,"842")
76
124
 
77
125
  if rsid in sumstats.columns:
78
- if verbose: log.write(" -Checking if rsID is rsxxxxxx or RSxxxxxxx...")
79
- is_rsid = sumstats[rsid].str.startswith(r'rs',na=False)
126
+ log.write(" -Checking if rsID is rsxxxxxx...", verbose=verbose)
127
+ is_rsid = sumstats[rsid].str.match(r'^rs\d+$', case=False, flags=0, na=False)
80
128
 
81
129
  sumstats.loc[ is_rsid,status] = vchange_status(sumstats.loc[ is_rsid,status], 3, "986","520")
82
130
  sumstats.loc[~is_rsid,status] = vchange_status(sumstats.loc[~is_rsid,status], 3, "986","743")
83
131
 
84
- if verbose: log.write(" -Checking if chr:pos:ref:alt is mixed in rsID column ...")
85
- is_rs_chrpos = sumstats[rsid].str.match(r'^\w+[:_-]\w+[:_-]\w+[:_-]\w+$', case=False, flags=0, na=False)
86
- #is_rs_chrpos = sumstats[rsid].str.match(r'(chr)?([0-9XYMT]+)[:_-]([0-9]+)[:_-]([ATCG]+)[:_-]([ATCG]+)', case=False, flags=0, na=False)
132
+ if verbose: log.write(" -Checking if CHR:POS:NEA:EA is mixed in rsID column ...")
133
+ is_rs_chrpos = sumstats[rsid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
87
134
 
88
- if verbose: log.write(" -Number of chr:pos:ref:alt mixed in rsID column :",sum(is_rs_chrpos))
89
- if verbose: log.write(" -Number of Unrecognized rsID :",len(sumstats) - sum(is_rs_chrpos) - sum(is_rsid) )
90
- if verbose: log.write(" -A look at the unrecognized rsID :",set(sumstats.loc[(~is_rsid)&(~is_rs_chrpos),rsid].head()),"...")
135
+ log.write(" -Number of CHR:POS:NEA:EA mixed in rsID column :",sum(is_rs_chrpos), verbose=verbose)
136
+ log.write(" -Number of Unrecognized rsID :",len(sumstats) - sum(is_rs_chrpos) - sum(is_rsid) , verbose=verbose)
137
+ log.write(" -A look at the unrecognized rsID :",set(sumstats.loc[(~is_rsid)&(~is_rs_chrpos),rsid].head()),"...", verbose=verbose)
91
138
 
92
139
  ############################ fixing chr pos###################################################
93
- if fixchrpos is True:
94
- # from snpid or rsid, extract chr:pos to fix CHR and POS
140
+ if fixchrpos == True:
141
+ # from snpid or rsid, extract CHR:POS to fix CHR and POS
95
142
  if snpid in sumstats.columns:
96
143
  if verbose: log.write(" -Fixing CHR and POS...")
97
144
  if overwrite is True:
@@ -99,8 +146,8 @@ def fixID(sumstats,
99
146
  # fix all
100
147
  to_fix = is_chrposrefalt
101
148
 
102
- #fix variants with chr and pos being empty
103
149
  elif (chrom in sumstats.columns) and (pos in sumstats.columns) :
150
+ #fix variants with chr and pos being NA
104
151
  to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
105
152
  to_fix_num = sum(to_fix)
106
153
  if to_fix_num and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
@@ -108,7 +155,7 @@ def fixID(sumstats,
108
155
 
109
156
  elif (chrom not in sumstats.columns) and (pos in sumstats.columns):
110
157
  if verbose: log.write(" -Initiating CHR columns...")
111
- sumstats.loc[:,chrom]=pd.Series(dtype="string")
158
+ sumstats[chrom]=pd.Series(dtype="string")
112
159
  to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
113
160
  to_fix_num = sum(to_fix)
114
161
  if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
@@ -116,15 +163,16 @@ def fixID(sumstats,
116
163
 
117
164
  elif (chrom in sumstats.columns) and (pos not in sumstats.columns):
118
165
  if verbose: log.write(" -Initiating CHR and POS column...")
119
- sumstats.loc[:,pos]=pd.Series(dtype="Int64")
166
+ sumstats[pos]=pd.Series(dtype="Int64")
120
167
  to_fix = is_chrposrefalt & sumstats[chrom].isna() & sumstats[pos].isna()
121
168
  to_fix_num = sum(to_fix)
122
169
  if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
123
170
  elif verbose: log.write(" -No fixable variants. ...")
171
+
124
172
  else:
125
173
  if verbose: log.write(" -Initiating CHR and POS columns...")
126
- sumstats.loc[:,chrom]=pd.Series(dtype="string")
127
- sumstats.loc[:,pos]=pd.Series(dtype="Int64")
174
+ sumstats[chrom]=pd.Series(dtype="string")
175
+ sumstats[pos]=pd.Series(dtype="Int64")
128
176
  to_fix = is_chrposrefalt
129
177
  to_fix_num = sum(to_fix)
130
178
  if to_fix_num>0 and verbose: log.write(" -Number of variants could be fixed: "+str(to_fix_num)+" ...")
@@ -134,8 +182,8 @@ def fixID(sumstats,
134
182
  if verbose: log.write(" -Filling CHR and POS columns using valid SNPID's chr:pos...")
135
183
  # format and qc filled chr and pos
136
184
 
137
- sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.split(':|_|-',n=2).str.get(0)
138
- sumstats.loc[to_fix,pos] = sumstats.loc[to_fix,snpid].str.split(':|_|-',n=2).str.get(1)
185
+ sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
186
+ sumstats.loc[to_fix,pos] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[2]
139
187
 
140
188
  #sumstats.loc[to_fix,chrom] = sumstats.loc[to_fix,snpid].str.split(':|_|-').str[0].str.strip("chrCHR").astype("string")
141
189
  #sumstats.loc[to_fix,pos] =np.floor(pd.to_numeric(sumstats.loc[to_fix,snpid].str.split(':|_|-').str[1], errors='coerce')).astype('Int64')
@@ -153,20 +201,20 @@ def fixID(sumstats,
153
201
  elif verbose: log.write(" -No fixable variants ...")
154
202
  elif (chrom not in sumstats.columns) and (pos in sumstats.columns):
155
203
  if verbose: log.write(" -Initiating CHR columns...")
156
- sumstats.loc[:,chrom]=pd.Series(dtype="string")
204
+ sumstats[chrom]=pd.Series(dtype="string")
157
205
  to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
158
206
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
159
207
  elif verbose: log.write(" -No fixable variants ...")
160
208
  elif (chrom in sumstats.columns) and (pos not in sumstats.columns):
161
209
  if verbose: log.write(" -Initiating CHR and POS column...")
162
- sumstats.loc[:,pos]=pd.Series(dtype="Int64")
210
+ sumstats[pos]=pd.Series(dtype="Int64")
163
211
  to_fix = is_rs_chrpos & sumstats[chrom].isna() & sumstats[pos].isna()
164
212
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
165
213
  elif verbose: log.write(" -No fixable variants ...")
166
214
  else:
167
215
  if verbose: log.write(" -Initiating CHR and POS columns...")
168
- sumstats.loc[:,chrom]=pd.Series(dtype="string")
169
- sumstats.loc[:,pos]=pd.Series(dtype="Int64")
216
+ sumstats[chrom]=pd.Series(dtype="string")
217
+ sumstats[pos]=pd.Series(dtype="Int64")
170
218
  to_fix = is_rs_chrpos
171
219
  if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
172
220
  elif verbose: log.write(" -No fixable variants ...")
@@ -179,61 +227,68 @@ def fixID(sumstats,
179
227
  #sumstats.loc[to_fix,status] = vchange_status(sumstats.loc[to_fix,status], 4, "98765432","00000000").astype("string")
180
228
 
181
229
  ############################ fixing chr pos###################################################
182
- #if fixeanea is True:
183
- # if verbose: log.write(" -Warning: Please make sure a1 is ref or not in Chr:pos:a1:a2")
184
- # if overwrite is True:
185
- # if verbose: log.write(" -Overwrite is applied...")
186
- # to_fix = is_chrposrefalt
187
- # elif (nea in sumstats.columns) and (nea in sumstats.columns):
188
- # to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
189
- # if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
190
- # elif (nea in sumstats.columns) and (ea not in sumstats.columns):
191
- # if verbose: log.write(" -Initiating EA columns...")
192
- # sumstats[ea]=pd.Series(dtype="string")
193
- # to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
194
- # if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
195
- # elif (nea not in sumstats.columns) and (ea in sumstats.columns):
196
- # if verbose: log.write(" -Initiating NEA columns...")
197
- # sumstats[nea]=pd.Series(dtype="string")
198
- # to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
199
- # if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
200
- # else:
201
- # if verbose: log.write(" -Initiating EA and NEA columns...")
202
- # sumstats[nea]=pd.Series(dtype="string")
203
- # sumstats[ea]=pd.Series(dtype="string")
204
- # to_fix = is_chrposrefalt
205
- # if sum(to_fix)>0:
206
- # if verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
230
+ if fixeanea == True:
231
+ if verbose: log.warning("gwaslab assumes SNPID is in the format of CHR:POS:NEA:EA / CHR:POS:REF:ALT")
232
+ if overwrite is True:
233
+ if verbose: log.write(" -Overwrite mode is applied...")
234
+ to_fix = is_chrposrefalt
235
+ elif (nea in sumstats.columns) and (nea in sumstats.columns):
236
+ to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
237
+ if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
238
+ elif (nea in sumstats.columns) and (ea not in sumstats.columns):
239
+ if verbose: log.write(" -Initiating EA columns...")
240
+ sumstats[ea]=pd.Series(dtype="string")
241
+ to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
242
+ if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
243
+ elif (nea not in sumstats.columns) and (ea in sumstats.columns):
244
+ if verbose: log.write(" -Initiating NEA columns...")
245
+ sumstats[nea]=pd.Series(dtype="string")
246
+ to_fix = is_chrposrefalt&(sumstats[nea].isna()|sumstats[ea].isna())
247
+ if sum(to_fix)>0 and verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
248
+ else:
249
+ if verbose: log.write(" -Initiating EA and NEA columns...")
250
+ sumstats[nea]=pd.Series(dtype="string")
251
+ sumstats[ea]=pd.Series(dtype="string")
252
+ to_fix = is_chrposrefalt
253
+ if sum(to_fix)>0:
254
+ if verbose: log.write(" -Number of variants could be fixed: "+str(sum(to_fix))+" ...")
207
255
  #
208
- # if sum(to_fix)>0:
209
- # if verbose: log.write(" -Filling "+str(sum(to_fix))+" EA and NEA columns using SNPID's chr:pos:nea:ea...")
256
+ if sum(to_fix)>0:
257
+ if verbose: log.write(" -Filling "+str(sum(to_fix))+" EA and NEA columns using SNPID's CHR:POS:NEA:EA...")
210
258
  #
211
- # if fixeanea_flip is True:
212
- # if verbose: log.write(" -Flipped : chr:pos:a1:a2...a1->EA , a2->NEA ")
213
- # sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[2]).astype("string")
214
- # sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[3]).astype("string")
215
- # else:
216
- # if verbose: log.write(" -Chr:pos:a1:a2...a1->EA , a2->NEA ")
217
- # sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[3]).astype("string")
218
- # sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[2]).astype("string")
259
+ if fixeanea_flip == True:
260
+ if verbose: log.write(" -Flipped : CHR:POS:NEA:EA -> CHR:POS:EA:NEA ")
261
+ sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
262
+ sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
263
+ else:
264
+ if verbose: log.write(" -Chr:pos:a1:a2...a1->EA , a2->NEA ")
265
+ sumstats.loc[to_fix,ea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4]
266
+ sumstats.loc[to_fix,nea] = sumstats.loc[to_fix,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3]
267
+
219
268
  # #to_change_status = sumstats[status].str.match(r"\w\w\w[45]\w\w\w")
220
269
  # #sumstats.loc[to_fix&to_change_status,status] = vchange_status(sumstats.loc[to_fix&to_change_status,status],4,"2")
221
270
  # #sumstats.loc[to_fix,snpid].apply(lambda x:re.split(':|_|-',x)[1]).astype("string")
222
271
  # #sumstats.loc[to_fix,rsid].apply(lambda x:re.split(':|_|-',x)[1]).astype("Int64")
223
272
 
224
273
  ############################ fixing id ###################################################
225
- if fixsep is True:
274
+ if fixsep == True:
226
275
  if snpid in sumstats.columns:
227
276
  if verbose: log.write(' -Replacing [_-] in SNPID with ":" ...')
228
- sumstats.loc[:,snpid] = sumstats.loc[:,snpid].str.replace(r"[_-]",":",regex=True)
277
+ sumstats[snpid] = sumstats[snpid].str.replace(r"[_-]",":",regex=True)
278
+
279
+ if fixprefix == True:
280
+ if snpid in sumstats.columns:
281
+ if verbose: log.write(' -Removing /^chr/ in SNPID ...')
282
+ prefix_removed = sumstats[snpid].str.extract(r'^(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1]
283
+ sumstats.loc[~prefix_removed.isna(),snpid] = prefix_removed[~prefix_removed.isna()]
229
284
 
230
- if fixid is True:
285
+ if fixid == True:
231
286
  if snpid not in sumstats.columns:
232
287
  # initiate a SNPID column
233
- sumstats.loc[:,snpid]=pd.Series(dtype="string")
288
+ sumstats[snpid]=pd.Series(dtype="string")
234
289
 
235
290
  if (rsid in sumstats.columns) and (sum(is_rs_chrpos)>0) :
236
- sumstats.loc[:,snpid]= sumstats.loc[is_rs_chrpos,rsid]
291
+ sumstats[snpid]= sumstats.loc[is_rs_chrpos,rsid]
237
292
 
238
293
  if (chrom in sumstats.columns) and (pos in sumstats.columns):
239
294
  #only fix when CHR and POS is available
@@ -288,7 +343,8 @@ def fixID(sumstats,
288
343
  after_number=sum(sumstats[snpid].isna())
289
344
  if verbose: log.write(" -Fixed "+ str(pre_number - after_number) +" variants ID...")
290
345
  elif verbose: log.write(" -ID unfixable: no CHR and POS columns or no SNPID. ")
291
- if verbose: log.write("Finished checking IDs successfully!")
346
+
347
+ finished(log,verbose,_end_line)
292
348
  return sumstats
293
349
 
294
350
  ""
@@ -303,20 +359,39 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
303
359
  remove duplicate SNPs based on 3. rsID
304
360
  remove multiallelic SNPs based on 4. CHR, POS
305
361
  '''
306
-
362
+
363
+ ##start function with col checking##########################################################
364
+ _start_line = "remove duplicated/multiallelic variants"
365
+ _end_line = "removing duplicated/multiallelic variants"
366
+ _start_cols =[]
367
+ _start_function = ".remove_dup()"
368
+ _must_args ={}
369
+
370
+ is_enough_info = start_to(sumstats=sumstats,
371
+ log=log,
372
+ verbose=verbose,
373
+ start_line=_start_line,
374
+ end_line=_end_line,
375
+ start_cols=_start_cols,
376
+ start_function=_start_function,
377
+ **_must_args)
378
+ if is_enough_info == False: return sumstats
379
+ ############################################################################################
380
+
381
+ if verbose: log.write(" -Removing mode:{}".format(mode))
307
382
  # sort the variants using the specified column before removing
308
383
  if keep_col is not None :
309
384
  if keep_col in sumstats.columns:
310
- if verbose: log.write("Start to sort the sumstats using " + keep_col +"...")
385
+ if verbose: log.write("Start to sort the sumstats using {}...".format(keep_col))
311
386
  sumstats = sumstats.sort_values(by=keep_col,ascending=keep_ascend)
312
387
  else:
313
388
  if verbose: log.write("Column" + keep_col +" was not detected... skipping... ")
314
389
  total_number = len(sumstats)
315
390
 
316
391
  # remove by duplicated SNPID
317
- if (snpid in sumstats.columns) and "d" in mode:
392
+ if (snpid in sumstats.columns) and ("d" in mode or "s" in mode):
318
393
  if verbose: log.write("Start to remove duplicated variants based on snpid...{}".format(_get_version()))
319
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
394
+ check_dataframe_shape(sumstats, log, verbose)
320
395
  if verbose: log.write(" -Which variant to keep: ", keep )
321
396
  pre_number =len(sumstats)
322
397
  if snpid in sumstats.columns:
@@ -326,18 +401,19 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
326
401
  if verbose: log.write(" -Removed ",pre_number -after_number ," based on SNPID...")
327
402
 
328
403
  # remove by duplicated rsID
329
- if (rsid in sumstats.columns) and ("d" in mode):
404
+ if (rsid in sumstats.columns) and ("d" in mode or "r" in mode):
330
405
  # keep na and remove duplicated
331
406
  pre_number =len(sumstats)
332
407
  if verbose: log.write("Start to remove duplicated variants based on rsID...")
408
+ check_dataframe_shape(sumstats, log, verbose)
333
409
  sumstats = sumstats.loc[sumstats[rsid].isna() | (~sumstats.duplicated(subset=rsid, keep=keep)),:]
334
410
  after_number=len(sumstats)
335
411
  if verbose: log.write(" -Removed ",pre_number -after_number ," based on rsID...")
336
412
 
337
413
  # remove by duplicated variants by CHR:POS:NEA:EA
338
- if (chrom in sumstats.columns) and (pos in sumstats.columns) and (nea in sumstats.columns) and (ea in sumstats.columns) and "d" in mode:
414
+ if (chrom in sumstats.columns) and (pos in sumstats.columns) and (nea in sumstats.columns) and (ea in sumstats.columns) and ("d" in mode or "c" in mode):
339
415
  if verbose: log.write("Start to remove duplicated variants based on CHR,POS,EA and NEA...")
340
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
416
+ check_dataframe_shape(sumstats, log, verbose)
341
417
  if verbose: log.write(" -Which variant to keep: ", keep )
342
418
  pre_number =len(sumstats)
343
419
  if snpid in sumstats.columns:
@@ -351,8 +427,9 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
351
427
  # keep na and remove duplicated
352
428
  pre_number =len(sumstats)
353
429
  if verbose: log.write("Start to remove multiallelic variants based on chr:pos...")
430
+ check_dataframe_shape(sumstats, log, verbose)
354
431
  if verbose: log.write(" -Which variant to keep: ", keep )
355
- sumstats = sumstats.loc[(~sumstats.loc[:,[chrom,pos]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos], keep=keep)),:]
432
+ sumstats = sumstats.loc[(~sumstats[[chrom,pos]].all(axis=1)) | (~sumstats.duplicated(subset=[chrom,pos], keep=keep)),:]
356
433
  after_number=len(sumstats)
357
434
  if verbose: log.write(" -Removed ",pre_number -after_number," multiallelic variants...")
358
435
  after_number=len(sumstats)
@@ -360,310 +437,376 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
360
437
  # resort the coordinates
361
438
  if verbose: log.write(" -Removed ",total_number -after_number," variants in total.")
362
439
  if keep_col is not None :
363
- if verbose: log.write(" -Sort the coordinates...")
440
+ if verbose: log.write(" -Sort the coordinates based on CHR and POS...")
364
441
  sumstats = sortcoordinate(sumstats,verbose=False)
365
442
 
366
- if remove is True:
443
+ if "n" in mode or remove==True:
367
444
  # if remove==True, remove NAs
368
445
  if verbose: log.write(" -Removing NAs...")
369
446
  pre_number =len(sumstats)
370
- sumstats = sumstats.loc[~sumstats.isna().any(axis=1),:]
447
+ specified_columns = []
448
+ if "d" in mode:
449
+ specified_columns.append(rsid)
450
+ specified_columns.append(snpid)
451
+ specified_columns.append(chrom)
452
+ specified_columns.append(pos)
453
+ specified_columns.append(ea)
454
+ specified_columns.append(nea)
455
+ if "r" in mode:
456
+ specified_columns.append(rsid)
457
+ if "s" in mode:
458
+ specified_columns.append(snpid)
459
+ if "m" in mode:
460
+ specified_columns.append(chrom)
461
+ specified_columns.append(pos)
462
+ if "c" in mode:
463
+ specified_columns.append(chrom)
464
+ specified_columns.append(pos)
465
+ specified_columns.append(ea)
466
+ specified_columns.append(nea)
467
+ sumstats = sumstats.loc[~sumstats[specified_columns].isna().any(axis=1),:]
371
468
  after_number=len(sumstats)
372
- if verbose: log.write(" -Removed ",pre_number -after_number," variants with NA values.")
373
- if verbose: log.write("Finished removing successfully!")
469
+ if verbose: log.write(" -Removed ",pre_number -after_number," variants with NA values in {} .".format(set(specified_columns)))
470
+
471
+ finished(log,verbose,_end_line)
374
472
  return sumstats
375
473
 
376
474
  ###############################################################################################################
377
475
  # 20230128
378
476
  def fixchr(sumstats,chrom="CHR",status="STATUS",add_prefix="",x=("X",23),y=("Y",24),mt=("MT",25), remove=False, verbose=True, chrom_list = None, minchr=1,log=Log()):
379
- #chrom_list = get_chr_list() #bottom
380
- if chrom_list is None:
381
- chrom_list = get_chr_list()
382
- if check_col(sumstats,chrom,status) is not True:
383
- if verbose: log.write(".fix_chr: Specified not detected..skipping...")
384
- return sumstats
385
- if verbose: log.write("Start to fix chromosome notation...{}".format(_get_version()))
386
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
387
-
388
- # convert to string datatype
389
- try:
390
- if verbose: log.write(" -Checking CHR data type...")
391
- if sumstats.loc[:,chrom].dtype == "string":
392
- pass
393
- else:
394
- sumstats.loc[:,chrom] = sumstats.loc[:,chrom].astype("string")
395
- except:
396
- if verbose: log.write(" -Force converting to pd string data type...")
397
- sumstats.loc[:,chrom] = sumstats.loc[:,chrom].astype("string")
398
-
399
- # check if CHR is numeric
400
- is_chr_fixed = sumstats[chrom].str.isnumeric()
401
- # fill NAs with False
402
- is_chr_fixed = is_chr_fixed.fillna(False)
403
- if verbose: log.write(" -Variants with standardized chromosome notation:",sum(is_chr_fixed))
404
-
405
- # if there are variants whose CHR need to be fixed
406
- if sum(is_chr_fixed)<len(sumstats):
407
-
408
- #extract the CHR number or X Y M MT
409
- chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'^(chr)?(\d{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
477
+ ##start function with col checking##########################################################
478
+ _start_line = "fix chromosome notation (CHR)"
479
+ _end_line = "fixing chromosome notation (CHR)"
480
+ _start_cols =[chrom,status]
481
+ _start_function = ".fix_chr()"
482
+ _must_args ={}
410
483
 
411
- is_chr_fixable = ~chr_extracted.isna()
412
- if verbose: log.write(" -Variants with fixable chromosome notations:",sum(is_chr_fixable))
484
+ is_enough_info = start_to(sumstats=sumstats,
485
+ log=log,
486
+ verbose=verbose,
487
+ start_line=_start_line,
488
+ end_line=_end_line,
489
+ start_cols=_start_cols,
490
+ start_function=_start_function,
491
+ **_must_args)
492
+ if is_enough_info == False: return sumstats
493
+ ############################################################################################
413
494
 
414
- # For not fixed variants, check if na
415
- is_chr_na = sumstats.loc[~is_chr_fixed, chrom].isna()
416
- if sum(is_chr_na)>0 and verbose:
417
- log.write(" -Variants with NA chromosome notations:",sum(is_chr_na))
418
-
419
- # Check variants with CHR being not NA and not fixable
420
- is_chr_invalid = (~is_chr_fixable)&(~is_chr_na)
421
- if sum(is_chr_invalid)>0 and verbose:
422
- log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid))
423
- try:
424
- log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()))
425
- except:
426
- pass
427
- elif verbose:
428
- log.write(" -No unrecognized chromosome notations...")
429
-
430
- # Assign good chr back to sumstats
431
- sumstats.loc[is_chr_fixable.index,chrom] = chr_extracted[is_chr_fixable.index]
495
+ #chrom_list = get_chr_list() #bottom
496
+ if chrom_list is None:
497
+ chrom_list = get_chr_list()
498
+ #if check_col(sumstats,chrom,status) is not True:
499
+ # if verbose: log.write(".fix_chr: Specified not detected..skipping...")
500
+ # return sumstats
432
501
 
433
- # X, Y, MT to 23,24,25
434
- xymt_list = [x[0].lower(),y[0].lower(),mt[0].lower(),x[0].upper(),y[0].upper(),mt[0].upper()]
435
-
436
- # check if sumstats contain sex CHR
437
- sex_chr = sumstats[chrom].isin(xymt_list)
438
-
439
- # if sumstats contain sex CHR
440
- if sum(sex_chr)>0:
441
- if verbose: log.write(" -Identifying non-autosomal chromosomes : {}, {}, and {} ...".format(x[0],y[0],mt[0]))
442
- if verbose: log.write(" -Identified ",str(sum(sex_chr))," variants on sex chromosomes...")
443
-
444
- # convert "X, Y, MT" to numbers
445
- convert_num_to_xymt={}
446
- if x[0].lower() in sumstats[chrom].values or x[0].upper() in sumstats[chrom].values:
447
- convert_num_to_xymt[x[0].lower()] = str(x[1])
448
- convert_num_to_xymt[x[0].upper()] = str(x[1])
449
- if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(x[0], x[1]))
450
- if y[0].lower() in sumstats[chrom].values or y[0].upper() in sumstats[chrom].values:
451
- convert_num_to_xymt[y[0].lower()] = str(y[1])
452
- convert_num_to_xymt[y[0].upper()] = str(y[1])
453
- if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(y[0], y[1]))
454
- if mt[0].lower() in sumstats[chrom].values or mt[0].upper() in sumstats[chrom].values:
455
- convert_num_to_xymt[mt[0].lower()] = str(mt[1])
456
- convert_num_to_xymt[mt[0].upper()] = str(mt[1])
457
- if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(mt[0], mt[1]))
458
- sumstats.loc[sex_chr,chrom] =sumstats.loc[sex_chr,chrom].map(convert_num_to_xymt)
459
-
460
- # change status code
461
- sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
462
- if len(is_chr_fixable.index)>0:
463
- sumstats.loc[is_chr_fixable.index,status] = vchange_status(sumstats.loc[is_chr_fixable.index,status],4,"986","520")
464
- if len(is_chr_fixable.index)>0:
465
- sumstats.loc[is_chr_invalid.index,status] = vchange_status(sumstats.loc[is_chr_invalid.index,status],4,"986","743")
466
-
467
- # check variants with unrecognized CHR
468
- unrecognized_num = sum(~sumstats[chrom].isin(chrom_list))
469
- if (remove is True) and unrecognized_num>0:
470
- # remove variants with unrecognized CHR
471
- try:
472
- if verbose: log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])))
473
- except:
474
- pass
475
- if verbose: log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.")
476
- try:
477
- log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()))
478
- except:
479
- pass
480
- #sumstats = sumstats.loc[sumstats.index[sumstats[chrom].isin(chrom_list)],:]
481
- good_chr = sumstats[chrom].isin(chrom_list)
482
- sumstats = sumstats.loc[good_chr, :].copy()
502
+
503
+ # convert to string datatype
504
+ try:
505
+ if verbose: log.write(" -Checking CHR data type...")
506
+ if sumstats[chrom].dtype == "string":
507
+ pass
483
508
  else:
484
- if verbose: log.write(" -All CHR are already fixed...")
485
- sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
486
-
487
- # Convert string to int
488
- try:
489
- sumstats.loc[:,chrom] = sumstats.loc[:,chrom].astype('Int64')
490
- except:
491
- # force convert
492
- sumstats.loc[:,chrom] = np.floor(pd.to_numeric(sumstats.loc[:,chrom], errors='coerce')).astype('Int64')
493
-
494
- # filter out variants with CHR <=0
495
- out_of_range_chr = sumstats[chrom] < minchr
496
- out_of_range_chr = out_of_range_chr.fillna(False)
497
- if sum(out_of_range_chr)>0:
498
- if verbose: log.write(" -Sanity check for CHR...")
499
- if verbose:log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr))
500
- sumstats = sumstats.loc[~out_of_range_chr,:]
501
-
502
- if verbose: log.write("Finished fixing chromosome notation successfully!")
509
+ sumstats[chrom] = sumstats[chrom].astype("string")
510
+ except:
511
+ if verbose: log.write(" -Force converting to pd string data type...")
512
+ sumstats[chrom] = sumstats[chrom].astype("string")
513
+
514
+ # check if CHR is numeric
515
+ is_chr_fixed = sumstats[chrom].str.isnumeric()
516
+ # fill NAs with False
517
+ is_chr_fixed = is_chr_fixed.fillna(False)
518
+ if verbose: log.write(" -Variants with standardized chromosome notation:",sum(is_chr_fixed))
519
+
520
+ # if there are variants whose CHR need to be fixed
521
+ if sum(is_chr_fixed)<len(sumstats):
503
522
 
504
- return sumstats
523
+ #extract the CHR number or X Y M MT
524
+ chr_extracted = sumstats.loc[~is_chr_fixed,chrom].str.extract(r'^(chr)?(\d{1,3}|[XYM]|MT)$',flags=re.IGNORECASE|re.ASCII)[1]
505
525
 
506
- ###############################################################################################################
507
- # 20230128
508
- def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True, lower_limit=0 , upper_limit=None , limit=250000000, log=Log()):
509
- if upper_limit is None:
510
- upper_limit = limit
511
- if check_col(sumstats,pos,status) is not True:
512
- if verbose: log.write(".fix_pos: Specified not detected..skipping...")
513
- return sumstats
514
- if verbose: log.write("Start to fix basepair positions...{}".format(_get_version()))
515
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
526
+ is_chr_fixable = ~chr_extracted.isna()
527
+ if verbose: log.write(" -Variants with fixable chromosome notations:",sum(is_chr_fixable))
528
+
529
+ # For not fixed variants, check if na
530
+ is_chr_na = sumstats.loc[~is_chr_fixed, chrom].isna()
531
+ if sum(is_chr_na)>0 and verbose:
532
+ log.write(" -Variants with NA chromosome notations:",sum(is_chr_na))
516
533
 
517
- all_var_num = len(sumstats)
518
- #convert to numeric
519
- is_pos_na = sumstats.loc[:,pos].isna()
534
+ # Check variants with CHR being not NA and not fixable
535
+ is_chr_invalid = (~is_chr_fixable)&(~is_chr_na)
536
+ if sum(is_chr_invalid)>0 and verbose:
537
+ log.write(" -Variants with invalid chromosome notations:",sum(is_chr_invalid))
538
+ try:
539
+ log.write(" -A look at invalid chromosome notations:" , set(sumstats.loc[~is_chr_fixed,chrom][is_chr_invalid].head()))
540
+ except:
541
+ pass
542
+ elif verbose:
543
+ log.write(" -No unrecognized chromosome notations...")
520
544
 
521
- try:
522
- if str(sumstats[pos].dtype) == "string" or str(sumstats[pos].dtype) == "object":
523
- sumstats.loc[:,pos] = sumstats.loc[:,pos].astype('string')
524
- # if so, remove thousands separator
525
- if verbose: log.write(' -Removing thousands separator "," or underbar "_" ...')
526
- sumstats.loc[~is_pos_na, pos] = sumstats.loc[~is_pos_na, pos].str.replace(r'[,_]', '' ,regex=True)
527
- except:
528
- pass
545
+ # Assign good chr back to sumstats
546
+ sumstats.loc[is_chr_fixable.index,chrom] = chr_extracted[is_chr_fixable.index]
529
547
 
530
- # convert POS to integer
531
- try:
532
- if verbose: log.write(' -Converting to Int64 data type ...')
533
- sumstats[pos] = sumstats[pos].astype('Int64')
534
- except:
535
- if verbose: log.write(' -Force converting to Int64 data type ...')
536
- sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
537
- is_pos_fixed = ~sumstats.loc[:,pos].isna()
538
- is_pos_invalid = (~is_pos_na)&(~is_pos_fixed)
548
+ # X, Y, MT to 23,24,25
549
+ xymt_list = [x[0].lower(),y[0].lower(),mt[0].lower(),x[0].upper(),y[0].upper(),mt[0].upper()]
539
550
 
540
- sumstats.loc[is_pos_fixed,status] = vchange_status(sumstats.loc[is_pos_fixed,status] ,4,"975","630")
541
- sumstats.loc[is_pos_invalid,status] = vchange_status(sumstats.loc[is_pos_invalid,status],4,"975","842")
551
+ # check if sumstats contain sex CHR
552
+ sex_chr = sumstats[chrom].isin(xymt_list)
542
553
 
543
- # remove outlier, limit:250,000,000
544
- if verbose: log.write(" -Position bound:({} , {:,})".format(lower_limit, upper_limit))
545
- is_pos_na = sumstats.loc[:,pos].isna()
546
- out_lier= ((sumstats[pos]<=lower_limit) | (sumstats[pos]>=upper_limit)) & (~is_pos_na)
547
- if verbose: log.write(" -Removed outliers:",sum(out_lier))
548
- sumstats = sumstats.loc[~out_lier,:]
549
- #remove na
550
- if remove is True:
551
- sumstats = sumstats.loc[~sumstats[pos].isna(),:]
552
- remain_var_num = len(sumstats)
553
- if verbose: log.write(" -Removed "+str(all_var_num - remain_var_num)+" variants with bad positions.")
554
+ # if sumstats contain sex CHR
555
+ if sum(sex_chr)>0:
556
+ if verbose: log.write(" -Identifying non-autosomal chromosomes : {}, {}, and {} ...".format(x[0],y[0],mt[0]))
557
+ if verbose: log.write(" -Identified ",str(sum(sex_chr))," variants on sex chromosomes...")
558
+
559
+ # convert "X, Y, MT" to numbers
560
+ convert_num_to_xymt={}
561
+ if x[0].lower() in sumstats[chrom].values or x[0].upper() in sumstats[chrom].values:
562
+ convert_num_to_xymt[x[0].lower()] = str(x[1])
563
+ convert_num_to_xymt[x[0].upper()] = str(x[1])
564
+ if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(x[0], x[1]))
565
+ if y[0].lower() in sumstats[chrom].values or y[0].upper() in sumstats[chrom].values:
566
+ convert_num_to_xymt[y[0].lower()] = str(y[1])
567
+ convert_num_to_xymt[y[0].upper()] = str(y[1])
568
+ if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(y[0], y[1]))
569
+ if mt[0].lower() in sumstats[chrom].values or mt[0].upper() in sumstats[chrom].values:
570
+ convert_num_to_xymt[mt[0].lower()] = str(mt[1])
571
+ convert_num_to_xymt[mt[0].upper()] = str(mt[1])
572
+ if verbose: log.write(" -Standardizing sex chromosome notations: {} to {}...".format(mt[0], mt[1]))
573
+ sumstats.loc[sex_chr,chrom] =sumstats.loc[sex_chr,chrom].map(convert_num_to_xymt)
554
574
 
555
- if verbose: log.write(" -Converted all position to datatype Int64.")
556
- if verbose: log.write("Finished fixing basepair position successfully!")
575
+ # change status code
576
+ sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
577
+ if len(is_chr_fixable.index)>0:
578
+ sumstats.loc[is_chr_fixable.index,status] = vchange_status(sumstats.loc[is_chr_fixable.index,status],4,"986","520")
579
+ if len(is_chr_fixable.index)>0:
580
+ sumstats.loc[is_chr_invalid.index,status] = vchange_status(sumstats.loc[is_chr_invalid.index,status],4,"986","743")
557
581
 
558
- return sumstats
582
+ # check variants with unrecognized CHR
583
+ unrecognized_num = sum(~sumstats[chrom].isin(chrom_list))
584
+ if (remove is True) and unrecognized_num>0:
585
+ # remove variants with unrecognized CHR
586
+ try:
587
+ if verbose: log.write(" -Valid CHR list: {} - {}".format(min([int(x) for x in chrom_list if x.isnumeric()]),max([int(x) for x in chrom_list if x.isnumeric()])))
588
+ except:
589
+ pass
590
+ if verbose: log.write(" -Removed "+ str(unrecognized_num)+ " variants with chromosome notations not in CHR list.")
591
+ try:
592
+ log.write(" -A look at chromosome notations not in CHR list:" , set(sumstats.loc[~sumstats[chrom].isin(chrom_list),chrom].head()))
593
+ except:
594
+ pass
595
+ #sumstats = sumstats.loc[sumstats.index[sumstats[chrom].isin(chrom_list)],:]
596
+ good_chr = sumstats[chrom].isin(chrom_list)
597
+ sumstats = sumstats.loc[good_chr, :].copy()
598
+ else:
599
+ if verbose: log.write(" -All CHR are already fixed...")
600
+ sumstats.loc[is_chr_fixed,status] = vchange_status(sumstats.loc[is_chr_fixed,status],4,"986","520")
601
+
602
+ # Convert string to int
603
+ try:
604
+ sumstats[chrom] = sumstats[chrom].astype('Int64')
605
+ except:
606
+ # # force convert
607
+ sumstats[chrom] = np.floor(pd.to_numeric(sumstats[chrom], errors='coerce')).astype('Int64')
608
+
609
+ # filter out variants with CHR <=0
610
+ out_of_range_chr = sumstats[chrom] < minchr
611
+ out_of_range_chr = out_of_range_chr.fillna(False)
612
+ if sum(out_of_range_chr)>0:
613
+ if verbose: log.write(" -Sanity check for CHR...")
614
+ if verbose:log.write(" -Removed {} variants with CHR < {}...".format(sum(out_of_range_chr),minchr))
615
+ sumstats = sumstats.loc[~out_of_range_chr,:]
616
+
617
+ finished(log,verbose,_end_line)
618
+ return sumstats
619
+
620
+ ###############################################################################################################
621
+ # 20230128
622
def fixpos(sumstats,pos="POS",status="STATUS",remove=False, verbose=True, lower_limit=0 , upper_limit=None , limit=250000000, log=Log()):
    """Convert the basepair position column to ``Int64`` and drop out-of-bound rows.

    Steps, in order:
      1. ``start_to`` checks that ``pos`` and ``status`` are available; when it
         reports ``False`` the input is returned untouched.
      2. For string/object columns, ``,`` and ``_`` separators are stripped.
      3. The column is cast to the nullable ``Int64`` dtype; values that cannot
         be parsed become ``<NA>`` through ``pd.to_numeric(errors='coerce')``.
      4. ``status`` is rewritten through ``vchange_status`` (digit argument 4)
         for rows whose position is now valid and for rows that had a value
         which could not be converted.
      5. Rows with ``pos <= lower_limit`` or ``pos >= upper_limit`` are removed.
      6. When ``remove`` is ``True``, rows whose position is ``<NA>`` are
         removed as well.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Table holding at least the ``pos`` and ``status`` columns.
    pos, status : str
        Column names of the position and the status code.
    remove : bool
        Also drop rows with a missing position.
    verbose : bool
        Forwarded to the logging calls.
    lower_limit : int
        Exclusive lower bound for a valid position.
    upper_limit : int or None
        Exclusive upper bound; falls back to ``limit`` when ``None``.
    limit : int
        Default upper bound.
    log : Log
        Logger receiving the progress messages.

    Returns
    -------
    pandas.DataFrame
        The filtered table (a ``.loc`` selection of the input).
    """
    ##start function with col checking##########################################################
    _start_line = "fix basepair positions (POS)"
    _end_line = "fixing basepair positions (POS)"
    _start_cols =[pos,status]
    _start_function = ".fix_pos()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return sumstats
    ############################################################################################

    if upper_limit is None:
        upper_limit = limit

    all_var_num = len(sumstats)
    # remember which positions were missing BEFORE conversion, so that values
    # lost during conversion can be told apart from genuinely missing ones
    is_pos_na = sumstats[pos].isna()

    try:
        if str(sumstats[pos].dtype) == "string" or str(sumstats[pos].dtype) == "object":
            sumstats[pos] = sumstats[pos].astype('string')
            # if so, remove thousands separator
            if verbose: log.write(' -Removing thousands separator "," or underbar "_" ...')
            sumstats.loc[~is_pos_na, pos] = sumstats.loc[~is_pos_na, pos].str.replace(r'[,_]', '' ,regex=True)
    except Exception:
        # best effort only: the coercing conversion below still handles
        # whatever could not be cleaned here
        pass

    # convert POS to integer
    try:
        if verbose: log.write(' -Converting to Int64 data type ...')
        sumstats[pos] = sumstats[pos].astype('Int64')
    except Exception:
        # direct cast failed (e.g. non-numeric text or fractional values):
        # coerce unparsable entries to NA and floor the rest
        if verbose: log.write(' -Force converting to Int64 data type ...')
        sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
    is_pos_fixed = ~sumstats[pos].isna()
    is_pos_invalid = (~is_pos_na)&(~is_pos_fixed)

    sumstats.loc[is_pos_fixed,status] = vchange_status(sumstats.loc[is_pos_fixed,status] ,4,"975","630")
    sumstats.loc[is_pos_invalid,status] = vchange_status(sumstats.loc[is_pos_invalid,status],4,"975","842")

    # remove outlier, limit:250,000,000
    if verbose: log.write(" -Position bound:({} , {:,})".format(lower_limit, upper_limit))
    is_pos_na = sumstats[pos].isna()
    # rows with a missing position are never counted as outliers
    out_lier= ((sumstats[pos]<=lower_limit) | (sumstats[pos]>=upper_limit)) & (~is_pos_na)
    if verbose: log.write(" -Removed outliers:",out_lier.sum())
    sumstats = sumstats.loc[~out_lier,:]
    #remove na
    if remove is True:
        sumstats = sumstats.loc[~sumstats[pos].isna(),:]
        remain_var_num = len(sumstats)
        if verbose: log.write(" -Removed "+str(all_var_num - remain_var_num)+" variants with bad positions.")

    finished(log,verbose,_end_line)
    return sumstats
559
684
 
560
685
  ###############################################################################################################
561
686
  # 20220514
562
687
  def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=True,log=Log()):
563
- # remove variants with alleles other than actgACTG
564
- if check_col(sumstats,ea,nea,status) is not True:
565
- if verbose: log.write("EA and NEA not detected..skipping...")
566
- return sumstats
567
- if verbose: log.write("Start to fix alleles...{}".format(_get_version()))
568
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
569
-
570
- #if (ea not in sumstats.columns) or (nea not in sumstats.columns):
571
- if verbose: log.write(" -Converted all bases to string datatype and UPPERCASE.")
572
-
573
- #try:
574
- # ea_missing = sum(sumstats[ea].isna())
575
- # nea_missing = sum(sumstats[nea].isna())
576
- # if sum(ea_missing)>0:
577
- # if verbose: log.write(" -Converting {} missing EA to letter N.".format(ea_missing))
578
- # sumstats.loc[:,ea] = sumstats.loc[:,ea].add_categories("N").fillna("N")
579
- # if sum(sumstats[nea].isna())>0:
580
- # if verbose: log.write(" -Converting {} missing NEA to letter N.".format(nea_missing))
581
- # sumstats.loc[:,nea] = sumstats.loc[:,nea].add_categories("N").fillna("N")
582
- #except:
583
- # pass
584
-
585
- categories = set(sumstats.loc[:,ea].str.upper())|set(sumstats.loc[:,nea].str.upper())|set("N")
586
- categories = {x for x in categories if pd.notna(x)}
587
-
588
- sumstats.loc[:,ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
589
- sumstats.loc[:,nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
590
- all_var_num = len(sumstats)
591
-
592
- ## check ATCG
593
- bad_ea = sumstats[ea].str.contains("[^actgACTG]",na=True)
594
- bad_nea = sumstats[nea].str.contains("[^actgACTG]",na=True)
595
- good_ea = ~bad_ea
596
- good_nea = ~bad_nea
597
-
598
- log.write(" -Variants with bad EA : {}".format(sum(bad_ea)), verbose=verbose)
599
- log.write(" -Variants with bad NEA : {}".format(sum(bad_nea)), verbose=verbose)
600
-
601
- ## check NA
602
- is_eanea_na = sumstats[ea].isna() | sumstats[nea].isna()
603
- log.write(" -Variants with NA for EA or NEA: {}".format(sum(is_eanea_na)), verbose=verbose)
604
-
605
- ## check same alleles
606
- not_variant = sumstats[nea] == sumstats[ea]
607
- log.write(" -Variants with same EA and NEA: {}".format(sum(not_variant)), verbose=verbose)
688
+ ##start function with col checking##########################################################
689
+ _start_line = "fix alleles (EA and NEA)"
690
+ _end_line = "fixing alleles (EA and NEA)"
691
+ _start_cols =[ea, nea,status]
692
+ _start_function = ".fix_allele()"
693
+ _must_args ={}
608
694
 
609
- ## sum up invalid variants
610
- is_invalid = bad_ea | bad_nea | not_variant
611
-
612
- exclude = bad_nea | bad_ea
613
-
614
- if verbose:
615
- if len(set(sumstats.loc[bad_ea,ea].head())) >0:
616
- log.write(" -A look at the non-ATCG EA:",set(sumstats.loc[bad_ea,ea].head()),"...")
617
- if len(set(sumstats.loc[bad_nea,nea].head())) >0:
618
- log.write(" -A look at the non-ATCG NEA:",set(sumstats.loc[bad_nea,nea].head()),"...")
619
-
620
- if remove == True:
621
- sumstats = sumstats.loc[(good_ea & good_nea),:].copy()
622
- good_eanea_num = len(sumstats)
623
- if verbose: log.write(" -Removed "+str(all_var_num - good_eanea_num)+" variants with NA alleles or alleles that contain bases other than A/C/T/G.")
624
- sumstats = sumstats.loc[(good_ea & good_nea & (~not_variant)),:].copy()
625
- good_eanea_notsame_num = len(sumstats)
626
- if verbose: log.write(" -Removed "+str(good_eanea_num - good_eanea_notsame_num)+" variants with same allele for EA and NEA.")
627
- else:
628
- sumstats.loc[:,[ea,nea]] = sumstats.loc[:,[ea,nea]].fillna("N")
629
- if verbose: log.write(" -Detected "+str(sum(exclude))+" variants with alleles that contain bases other than A/C/T/G .")
630
- categories = set(sumstats.loc[:,ea].str.upper())|set(sumstats.loc[:,nea].str.upper())|set("N")
631
- sumstats.loc[:,ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
632
- sumstats.loc[:,nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
633
-
634
- is_eanea_fixed = good_ea | good_nea
635
- is_snp = (sumstats[ea].str.len()==1) &(sumstats[nea].str.len()==1)
636
- is_indel = (sumstats[ea].str.len()!=sumstats[nea].str.len())
637
- is_not_normalized = (sumstats[ea].str.len()>1) &(sumstats[nea].str.len()>1)
638
- is_normalized = is_indel &( (sumstats[ea].str.len()==1) &(sumstats[nea].str.len()>1) | (sumstats[ea].str.len()>1) &(sumstats[nea].str.len()==1) )
639
-
640
- if sum(is_invalid)>0:
641
- sumstats.loc[is_invalid, status] = vchange_status(sumstats.loc[is_invalid,status], 5,"9","6")
642
- if sum(is_eanea_na)>0:
643
- sumstats.loc[is_eanea_na,status] = vchange_status(sumstats.loc[is_eanea_na, status], 5,"9","7")
644
- if sum(is_eanea_fixed&is_not_normalized)>0:
645
- sumstats.loc[is_eanea_fixed&is_not_normalized,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_not_normalized,status], 5,"9","5")
646
- if sum(is_eanea_fixed&is_snp)>0:
647
- sumstats.loc[is_eanea_fixed&is_snp, status] = vchange_status(sumstats.loc[is_eanea_fixed&is_snp,status], 5,"9","0")
648
- if sum(is_eanea_fixed&is_indel)>0:
649
- sumstats.loc[is_eanea_fixed&is_indel,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_indel, status], 5,"9","4")
650
- if sum(is_eanea_fixed&is_normalized)>0:
651
- sumstats.loc[is_eanea_fixed&is_normalized,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_normalized, status], 5,"4","3")
652
- gc.collect()
653
- if verbose: log.write("Finished fixing allele successfully!")
654
-
655
- return sumstats
695
+ is_enough_info = start_to(sumstats=sumstats,
696
+ log=log,
697
+ verbose=verbose,
698
+ start_line=_start_line,
699
+ end_line=_end_line,
700
+ start_cols=_start_cols,
701
+ start_function=_start_function,
702
+ **_must_args)
703
+ if is_enough_info == False: return sumstats
704
+ ############################################################################################
705
+ #try:
706
+ # ea_missing = sum(sumstats[ea].isna())
707
+ # nea_missing = sum(sumstats[nea].isna())
708
+ # if sum(ea_missing)>0:
709
+ # if verbose: log.write(" -Converting {} missing EA to letter N.".format(ea_missing))
710
+ # sumstats[ea] = sumstats[ea].add_categories("N").fillna("N")
711
+ # if sum(sumstats[nea].isna())>0:
712
+ # if verbose: log.write(" -Converting {} missing NEA to letter N.".format(nea_missing))
713
+ # sumstats[nea] = sumstats[nea].add_categories("N").fillna("N")
714
+ #except:
715
+ # pass
716
+
717
+ if verbose: log.write(" -Converted all bases to string datatype and UPPERCASE.")
718
+ categories = set(sumstats[ea].str.upper())|set(sumstats[nea].str.upper())|set("N")
719
+ categories = {x for x in categories if pd.notna(x)}
720
+ sumstats[ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
721
+ sumstats[nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
722
+ all_var_num = len(sumstats)
723
+
724
+ ## check ATCG
725
+ bad_ea = sumstats[ea].str.contains("[^actgACTG]",na=True)
726
+ bad_nea = sumstats[nea].str.contains("[^actgACTG]",na=True)
727
+ good_ea = ~bad_ea
728
+ good_nea = ~bad_nea
729
+
730
+ log.write(" -Variants with bad EA : {}".format(sum(bad_ea)), verbose=verbose)
731
+ log.write(" -Variants with bad NEA : {}".format(sum(bad_nea)), verbose=verbose)
732
+
733
+ ## check NA
734
+ is_eanea_na = sumstats[ea].isna() | sumstats[nea].isna()
735
+ log.write(" -Variants with NA for EA or NEA: {}".format(sum(is_eanea_na)), verbose=verbose)
736
+
737
+ ## check same alleles
738
+ not_variant = sumstats[nea] == sumstats[ea]
739
+ log.write(" -Variants with same EA and NEA: {}".format(sum(not_variant)), verbose=verbose)
740
+
741
+ ## sum up invalid variants
742
+ is_invalid = bad_ea | bad_nea | not_variant
743
+
744
+ exclude = bad_nea | bad_ea
745
+
746
+ if verbose:
747
+ if len(set(sumstats.loc[bad_ea,ea].head())) >0:
748
+ log.write(" -A look at the non-ATCG EA:",set(sumstats.loc[bad_ea,ea].head()),"...")
749
+ if len(set(sumstats.loc[bad_nea,nea].head())) >0:
750
+ log.write(" -A look at the non-ATCG NEA:",set(sumstats.loc[bad_nea,nea].head()),"...")
751
+
752
+ if remove == True:
753
+ sumstats = sumstats.loc[(good_ea & good_nea),:].copy()
754
+ good_eanea_num = len(sumstats)
755
+ if verbose: log.write(" -Removed "+str(all_var_num - good_eanea_num)+" variants with NA alleles or alleles that contain bases other than A/C/T/G.")
756
+ sumstats = sumstats.loc[(good_ea & good_nea & (~not_variant)),:].copy()
757
+ good_eanea_notsame_num = len(sumstats)
758
+ if verbose: log.write(" -Removed "+str(good_eanea_num - good_eanea_notsame_num)+" variants with same allele for EA and NEA.")
759
+ else:
760
+ sumstats[[ea,nea]] = sumstats[[ea,nea]].fillna("N")
761
+ if verbose: log.write(" -Detected "+str(sum(exclude))+" variants with alleles that contain bases other than A/C/T/G .")
762
+ categories = set(sumstats[ea].str.upper())|set(sumstats[nea].str.upper())|set("N")
763
+ sumstats[ea]=pd.Categorical(sumstats[ea].str.upper(),categories = categories)
764
+ sumstats[nea]=pd.Categorical(sumstats[nea].str.upper(),categories = categories)
765
+
766
+ is_eanea_fixed = good_ea | good_nea
767
+ is_snp = (sumstats[ea].str.len()==1) &(sumstats[nea].str.len()==1)
768
+ is_indel = (sumstats[ea].str.len()!=sumstats[nea].str.len())
769
+ is_not_normalized = (sumstats[ea].str.len()>1) &(sumstats[nea].str.len()>1)
770
+ is_normalized = is_indel &( (sumstats[ea].str.len()==1) &(sumstats[nea].str.len()>1) | (sumstats[ea].str.len()>1) &(sumstats[nea].str.len()==1) )
771
+
772
+ if sum(is_invalid)>0:
773
+ sumstats.loc[is_invalid, status] = vchange_status(sumstats.loc[is_invalid,status], 5,"9","6")
774
+ if sum(is_eanea_na)>0:
775
+ sumstats.loc[is_eanea_na,status] = vchange_status(sumstats.loc[is_eanea_na, status], 5,"9","7")
776
+ if sum(is_eanea_fixed&is_not_normalized)>0:
777
+ sumstats.loc[is_eanea_fixed&is_not_normalized,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_not_normalized,status], 5,"9","5")
778
+ if sum(is_eanea_fixed&is_snp)>0:
779
+ sumstats.loc[is_eanea_fixed&is_snp, status] = vchange_status(sumstats.loc[is_eanea_fixed&is_snp,status], 5,"9","0")
780
+ if sum(is_eanea_fixed&is_indel)>0:
781
+ sumstats.loc[is_eanea_fixed&is_indel,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_indel, status], 5,"9","4")
782
+ if sum(is_eanea_fixed&is_normalized)>0:
783
+ sumstats.loc[is_eanea_fixed&is_normalized,status] = vchange_status(sumstats.loc[is_eanea_fixed&is_normalized, status], 5,"4","3")
784
+
785
+ finished(log,verbose,_end_line)
786
+ return sumstats
656
787
 
657
788
  ###############################################################################################################
658
789
  # 20220721
659
790
 
660
791
  def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
661
- if check_col(sumstats,pos,ea,nea,status) is not True:
662
- if verbose: log.write("WARNING:.normalize(): specified columns not detected..skipping...")
663
- return sumstats
664
-
665
- if verbose: log.write("Start to normalize variants...{}".format(_get_version()))
666
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
792
+ ##start function with col checking##########################################################
793
+ _start_line = "normalize indels"
794
+ _end_line = "normalizing indels"
795
+ _start_cols =[ea, nea,status]
796
+ _start_function = ".normalize()"
797
+ _must_args ={}
798
+
799
+ is_enough_info = start_to(sumstats=sumstats,
800
+ log=log,
801
+ verbose=verbose,
802
+ start_line=_start_line,
803
+ end_line=_end_line,
804
+ start_cols=_start_cols,
805
+ start_function=_start_function,
806
+ **_must_args)
807
+ if is_enough_info == False: return sumstats
808
+ ############################################################################################
809
+
667
810
  #variants_to_check = status_match(sumstats[status],5,[4,5]) #
668
811
  #r'\w\w\w\w[45]\w\w'
669
812
  variants_to_check = sumstats[status].str[4].str.match(r'4|5', case=False, flags=0, na=False)
@@ -677,7 +820,8 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
677
820
  n_cores=1
678
821
  pool = Pool(n_cores)
679
822
  map_func = partial(normalizeallele,pos=pos,nea=nea,ea=ea,status=status)
680
- df_split = np.array_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
823
+ #df_split = np.array_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
824
+ df_split = _df_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
681
825
  normalized_pd = pd.concat(pool.map(map_func,df_split))
682
826
  pool.close()
683
827
  pool.join()
@@ -707,16 +851,16 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
707
851
  else:
708
852
  log.write(" -All variants are already normalized..")
709
853
  ###################################################################################################################
710
- categories = set(sumstats.loc[:,ea])|set(sumstats.loc[:,nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
711
- sumstats.loc[:,ea] = pd.Categorical(sumstats.loc[:,ea],categories = categories)
712
- sumstats.loc[:,nea] = pd.Categorical(sumstats.loc[:,nea],categories = categories )
854
+ categories = set(sumstats[ea])|set(sumstats[nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
855
+ sumstats[ea] = pd.Categorical(sumstats[ea],categories = categories)
856
+ sumstats[nea] = pd.Categorical(sumstats[nea],categories = categories )
713
857
  sumstats.loc[variants_to_check,[pos,nea,ea,status]] = normalized_pd.values
714
858
  try:
715
- sumstats.loc[:,pos] = sumstats.loc[:,pos].astype('Int64')
859
+ sumstats[pos] = sumstats[pos].astype('Int64')
716
860
  except:
717
- sumstats.loc[:,pos] = np.floor(pd.to_numeric(sumstats.loc[:,pos], errors='coerce')).astype('Int64')
861
+ sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
718
862
 
719
- if verbose: log.write("Finished normalizing variants successfully!")
863
+ finished(log,verbose,_end_line)
720
864
  return sumstats
721
865
 
722
866
  def normalizeallele(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
@@ -781,6 +925,52 @@ def add_tolerence(stats, float_tolerence, mode):
781
925
  stats = (stats[0] , stats[1] + float_tolerence if stats[0]!=float("Inf") else float("Inf"))
782
926
  return stats
783
927
 
928
+
929
def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, verbose, dtype="Int64"):
    """Coerce one column to ``dtype`` and drop the rows outside ``var_range``.

    The function is a no-op (``sumstats`` is returned as is) unless ``header``
    is both listed in ``coltocheck`` and present in ``sumstats``.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Table to check; the ``header`` column is overwritten in place with the
        converted values.
    var_range : tuple
        ``(lower, upper)`` bounds; both ends are treated as valid.
    header : str
        Name of the column to check. ``"STATUS"`` is special-cased: it is only
        converted to a categorical column and no range filtering is applied.
    coltocheck : collection of str
        Columns the caller asked to check.
    cols_to_check : list
        Output list; ``header`` is appended when the column is processed.
    log : Log-like
        Object exposing ``write(*msg, verbose=...)`` and ``warning(msg)``.
    verbose : bool
        Controls whether progress messages are emitted.
    dtype : str
        Target dtype name. Integer dtypes are floored after numeric coercion;
        float dtypes are coerced only. Any other name leaves the column
        unconverted before the range comparison.

    Returns
    -------
    pandas.DataFrame
        ``sumstats`` restricted to rows whose value lies in ``var_range``
        (missing or unparsable values are removed as well).
    """
    if header not in coltocheck or header not in sumstats.columns:
        return sumstats

    pre_number = len(sumstats)
    cols_to_check.append(header)

    if header == "STATUS":
        if verbose: log.write(" -Checking STATUS and converting STATUS to categories....")
        categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
        sumstats[header] = pd.Categorical(sumstats[header], categories=categories)
        return sumstats

    # "int64" was previously misspelled as "in64", so dtype="int64" silently
    # skipped the numeric coercion
    if dtype in ["Int64","Int32","int","int32","int64"]:
        if verbose: log.write(" -Checking if {} <= {} <= {} ...".format( var_range[0] ,header, var_range[1]))
        sumstats[header] = np.floor(pd.to_numeric(sumstats[header], errors='coerce')).astype(dtype)

    elif dtype in ["Float64","Float32","float","float64","float32"]:
        log.write(" -Checking if {} < {} < {} ...".format( var_range[0] ,header, var_range[1]),verbose=verbose)
        sumstats[header] = pd.to_numeric(sumstats[header], errors='coerce').astype(dtype)

    is_valid = (sumstats[header]>=var_range[0]) & (sumstats[header]<=var_range[1])
    # comparisons on nullable dtypes yield NA for missing values; treat as invalid
    is_valid = is_valid.fillna(False)

    if header == "P":
        is_low_p = sumstats["P"] == 0
        # Series.sum() skips NA, whereas the builtin sum() propagates pd.NA and
        # makes the comparison below raise for nullable float columns
        low_p_count = is_low_p.sum()
        if low_p_count > 0:
            log.warning("Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(low_p_count))
            log.warning("Please consider using MLOG10P instead.")

    if (~is_valid).sum() > 0:
        # the ID column is optional: without one, still show the invalid values
        id_to_use = next((col for col in ("SNPID", "rsID") if col in sumstats.columns), None)
        try:
            if id_to_use is not None:
                invalid_ids = sumstats.loc[~is_valid, id_to_use].head().astype("string").fillna("NA")
                log.write(" -Examples of invalid variants({}): {} ...".format(id_to_use, ",".join(invalid_ids.to_list()) ), verbose=verbose)
            invalid_values = sumstats.loc[~is_valid, header].head().astype("string").fillna("NA")
            log.write(" -Examples of invalid values ({}): {} ...".format(header, ",".join(invalid_values.to_list()) ), verbose=verbose)
        except (TypeError, ValueError):
            # the examples are informational only; never fail the check on them
            pass

    sumstats = sumstats.loc[is_valid,:]
    after_number = len(sumstats)
    log.write(" -Removed {} variants with bad/na {}.".format(pre_number - after_number, header), verbose=verbose)
    return sumstats
973
+
784
974
  def sanitycheckstats(sumstats,
785
975
  coltocheck=None,
786
976
  n=(0,2**31-1),
@@ -788,8 +978,10 @@ def sanitycheckstats(sumstats,
788
978
  ncontrol=(0,2**31-1),
789
979
  eaf=(0,1),
790
980
  mac=(0,2**31-1),
981
+ maf=(0,0.5),
791
982
  chisq=(0,float("Inf")),
792
983
  z=(-9999,9999),
984
+ t=(-99999,99999),
793
985
  f=(0,float("Inf")),
794
986
  p=(0,1),
795
987
  mlog10p=(0,9999),
@@ -820,10 +1012,30 @@ def sanitycheckstats(sumstats,
820
1012
  HR_95U: float64 , HR_95L >0
821
1013
  INFO: float32 , 1>=INFO>0
822
1014
  Z float64 , -9999 < Z < 9999
1015
+ T float64 , -99999 < T < 99999
823
1016
  F float64 , F > 0
824
1017
  '''
1018
+ ##start function with col checking##########################################################
1019
+ _start_line = "perform sanity check for statistics"
1020
+ _end_line = "sanity check for statistics"
1021
+ _start_cols =[]
1022
+ _start_function = ".check_sanity()"
1023
+ _must_args ={}
825
1024
 
1025
+ is_enough_info = start_to(sumstats=sumstats,
1026
+ log=log,
1027
+ verbose=verbose,
1028
+ start_line=_start_line,
1029
+ end_line=_end_line,
1030
+ start_cols=_start_cols,
1031
+ start_function=_start_function,
1032
+ **_must_args)
1033
+ if is_enough_info == False: return sumstats
1034
+ ############################################################################################
1035
+
1036
+ if verbose: log.write(" -Comparison tolerance for floats: {}".format(float_tolerence))
826
1037
  eaf = add_tolerence(eaf, float_tolerence, "lr")
1038
+ maf = add_tolerence(maf, float_tolerence, "lr")
827
1039
  beta = add_tolerence(beta, float_tolerence, "lr")
828
1040
  se = add_tolerence(se, float_tolerence, "lr")
829
1041
  mlog10p = add_tolerence(mlog10p, float_tolerence, "lr")
@@ -838,221 +1050,138 @@ def sanitycheckstats(sumstats,
838
1050
  p = add_tolerence(p, float_tolerence, "lr")
839
1051
  f = add_tolerence(f, float_tolerence, "lr")
840
1052
  chisq = add_tolerence(chisq, float_tolerence, "lr")
841
-
842
-
1053
+ ############################################################################################
843
1054
  ## add direction
844
1055
  if coltocheck is None:
845
1056
  coltocheck = ["P","MLOG10P","INFO","Z","BETA","SE","EAF","CHISQ","F","N","N_CASE","N_CONTROL","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","STATUS"]
846
- if verbose: log.write("Start sanity check for statistics...{}".format(_get_version()))
847
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
1057
+
848
1058
  cols_to_check=[]
849
1059
  oringinal_number=len(sumstats)
850
1060
  sumstats = sumstats.copy()
851
1061
 
852
- if verbose: log.write(" -Comparison tolerance for floats: {}".format(float_tolerence))
853
- ###SAMPLE SIZE################################################################################################################################################
854
- pre_number=len(sumstats)
855
- if "N" in coltocheck and "N" in sumstats.columns:
856
- cols_to_check.append("N")
857
- if verbose: log.write(" -Checking if ",n[0],"<=N<=",n[1]," ...")
858
- sumstats.loc[:,"N"] = np.floor(pd.to_numeric(sumstats.loc[:,"N"], errors='coerce')).astype("Int64")
859
- sumstats = sumstats.loc[(sumstats["N"]>=n[0]) & (sumstats["N"]<=n[1]),:]
860
- after_number=len(sumstats)
861
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N.")
862
- pre_number=len(sumstats)
863
- if "N_CASE" in coltocheck and "N_CASE" in sumstats.columns:
864
- cols_to_check.append("N_CASE")
865
- if verbose: log.write(" -Checking if ",ncase[0],"<=N_CASE<=",ncase[1]," ...")
866
- sumstats.loc[:,"N_CASE"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CASE"], errors='coerce')).astype("Int64")
867
- sumstats = sumstats.loc[(sumstats["N_CASE"]>=ncase[0]) & (sumstats["N_CASE"]<=ncase[1]),:]
868
- after_number=len(sumstats)
869
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N_CASE.")
870
- pre_number=len(sumstats)
871
- if "N_CONTROL" in coltocheck and "N_CONTROL" in sumstats.columns:
872
- cols_to_check.append("N_CONTROL")
873
- if verbose: log.write(" -Checking if ",ncontrol[0],"<=N_CONTROL<=",ncontrol[1]," ...")
874
- sumstats.loc[:,"N_CONTROL"] = np.floor(pd.to_numeric(sumstats.loc[:,"N_CONTROL"], errors='coerce')).astype("Int64")
875
- sumstats = sumstats.loc[(sumstats["N_CONTROL"]>=ncontrol[0]) & (sumstats["N_CONTROL"]<=ncontrol[1]),:]
876
- after_number=len(sumstats)
877
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad N_CONTROL.")
878
- pre_number=len(sumstats)
879
- if "N" in coltocheck and "N" in sumstats.columns and "N_CONTROL" in coltocheck and "N_CONTROL" in sumstats.columns and "N_CASE" in coltocheck and "N_CASE" in sumstats.columns:
880
- if verbose: log.write(" -Checking if N = N_CASE + N_CONTROL ...")
881
- matched_n = sumstats.loc[:,"N"] == sumstats.loc[:,"N_CASE"] + sumstats.loc[:,"N_CONTROL"]
882
- sumstats = sumstats.loc[matched_n,:]
883
- after_number=len(sumstats)
884
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with N != N_CASE + N_CONTROL.")
885
-
886
- ###ALLELE FREQUENCY################################################################################################################################################
887
- pre_number=len(sumstats)
888
- if "EAF" in coltocheck and "EAF" in sumstats.columns:
889
- cols_to_check.append("EAF")
890
- if verbose: log.write(" -Checking if ",eaf[0],"<EAF<",eaf[1]," ...")
891
- sumstats.loc[:,"EAF"] = pd.to_numeric(sumstats.loc[:,"EAF"], errors='coerce').astype("float32")
892
- sumstats = sumstats.loc[(sumstats["EAF"]>eaf[0]) & (sumstats["EAF"]<eaf[1]),:]
893
- after_number=len(sumstats)
894
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad EAF.")
895
-
896
- pre_number=len(sumstats)
897
- if "EAF" in coltocheck and "EAF" in sumstats.columns and "N" in coltocheck and "N" in sumstats.columns:
898
- if verbose: log.write(" -Checking if ",mac[0],"<=MAC<=",mac[1]," ...")
899
- sumstats["_MAF"]=sumstats["EAF"]
900
- sumstats.loc[sumstats["EAF"]>0.5,"_MAF"] = 1 - sumstats.loc[sumstats["EAF"]>0.5,"EAF"]
901
- sumstats["_MAC"] = np.floor(pd.to_numeric(sumstats.loc[:,"_MAF"] * sumstats.loc[:,"N"], errors='coerce')).astype("int64")
902
- macl = ( sumstats["_MAC"] >= mac[0])
903
- macu = ( sumstats["_MAC"] <= mac[1])
904
- sumstats = sumstats.loc[macl&macu,:]
905
- sumstats = sumstats.drop(labels=["_MAF","_MAC"],axis=1)
906
- after_number=len(sumstats)
907
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad MAC.")
908
-
909
- ###TEST STATISTICS################################################################################################################################################
910
- pre_number=len(sumstats)
911
- if "CHISQ" in coltocheck and "CHISQ" in sumstats.columns:
912
- cols_to_check.append("CHISQ")
913
- if verbose: log.write(" -Checking if ",chisq[0],"<CHISQ<",chisq[1]," ...")
914
- sumstats.loc[:,"CHISQ"] = pd.to_numeric(sumstats.loc[:,"CHISQ"], errors='coerce').astype("float64")
915
- sumstats = sumstats.loc[(sumstats["CHISQ"]>chisq[0]) & (sumstats["CHISQ"]<chisq[1]),:]
916
- after_number=len(sumstats)
917
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad CHISQ.")
918
-
919
- pre_number=len(sumstats)
920
- if "Z" in coltocheck and "Z" in sumstats.columns:
921
- cols_to_check.append("Z")
922
- if verbose: log.write(" -Checking if ",z[0],"<Z<",z[1]," ...")
923
- sumstats.loc[:,"Z"] = pd.to_numeric(sumstats.loc[:,"Z"], errors='coerce').astype("float64")
924
- sumstats = sumstats.loc[(sumstats["Z"]>z[0]) & (sumstats["Z"]<z[1]),:]
925
- after_number=len(sumstats)
926
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad Z.")
927
-
928
- pre_number=len(sumstats)
929
- if "F" in coltocheck and "F" in sumstats.columns:
930
- cols_to_check.append("F")
931
- if verbose: log.write(" -Checking if ",f[0],"<F<",f[1]," ...")
932
- sumstats.loc[:,"F"] = pd.to_numeric(sumstats.loc[:,"F"], errors='coerce').astype("float64")
933
- sumstats = sumstats.loc[(sumstats["F"]>f[0]) & (sumstats["F"]<f[1]),:]
934
- after_number=len(sumstats)
935
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad F.")
936
-
937
- ###P ################################################################################################################################################
938
- pre_number=len(sumstats)
939
- if "P" in coltocheck and "P" in sumstats.columns:
940
- cols_to_check.append("P")
941
- if verbose: log.write(" -Checking if ",p[0],"< P <",p[1]," ...")
942
- sumstats.loc[:,"P"] = pd.to_numeric(sumstats.loc[:,"P"], errors='coerce').astype("float64")
943
- sumstats = sumstats.loc[(sumstats["P"]>p[0]) & (sumstats["P"]<p[1]),:]
944
- after_number=len(sumstats)
945
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad P.")
1062
+ ###Int64 ################################################################################################################################################
1063
+ sumstats = check_range(sumstats, var_range=n, header="N", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
1064
+ sumstats = check_range(sumstats, var_range=ncase, header="N_CASE", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
1065
+ sumstats = check_range(sumstats, var_range=ncontrol, header="N_CONTROL", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="Int64")
1066
+
1067
+ ###float32 ################################################################################################################################################
1068
+ sumstats = check_range(sumstats, var_range=eaf, header="EAF", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
1069
+ sumstats = check_range(sumstats, var_range=maf, header="MAF", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
1070
+ sumstats = check_range(sumstats, var_range=info, header="INFO", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float32")
1071
+
1072
+ ###float64 ################################################################################################################################################
1073
+ sumstats = check_range(sumstats, var_range=chisq, header="CHISQ", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1074
+ sumstats = check_range(sumstats, var_range=z, header="Z", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1075
+ sumstats = check_range(sumstats, var_range=t, header="T", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1076
+ sumstats = check_range(sumstats, var_range=f, header="F", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1077
+ sumstats = check_range(sumstats, var_range=p, header="P", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1078
+ sumstats = check_range(sumstats, var_range=mlog10p, header="MLOG10P", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1079
+ sumstats = check_range(sumstats, var_range=beta, header="BETA", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1080
+ sumstats = check_range(sumstats, var_range=se, header="SE", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1081
+ sumstats = check_range(sumstats, var_range=OR, header="OR", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1082
+ sumstats = check_range(sumstats, var_range=OR_95L, header="OR_95L", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1083
+ sumstats = check_range(sumstats, var_range=OR_95U, header="OR_95U", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1084
+ sumstats = check_range(sumstats, var_range=HR, header="HR", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1085
+ sumstats = check_range(sumstats, var_range=HR_95L, header="HR_95L", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1086
+ sumstats = check_range(sumstats, var_range=HR_95U, header="HR_95U", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="float64")
1087
+ ###STATUS ###############################################################################################################################################
1088
+ sumstats = check_range(sumstats, var_range=None, header="STATUS", coltocheck=coltocheck, cols_to_check=cols_to_check, log=log, verbose=verbose, dtype="category")
1089
+
1090
+ after_number=len(sumstats)
1091
+ log.write(" -Removed "+str(oringinal_number - after_number)+" variants with bad statistics in total.",verbose=verbose)
1092
+ log.write(" -Data types for each column:",verbose=verbose)
1093
+ check_datatype(sumstats,verbose=verbose, log=log)
1094
+ finished(log,verbose,_end_line)
1095
+ return sumstats
1096
+
1097
+ ### check consistency #############################################################################################################################################
1098
+
1099
def _check_data_consistency(sumstats, beta="BETA", se="SE", p="P",mlog10p="MLOG10P",rtol=1e-3, atol=1e-3, equal_nan=True, verbose=True,log=Log()):
    """Report (without modifying anything) whether related statistics agree.

    Four checks are run, each only when the columns it needs are present:
    BETA/SE-derived MLOG10P vs MLOG10P, BETA/SE-derived P vs P,
    MLOG10P-derived P vs P, and N vs N_CASE + N_CONTROL. For every check the
    number of inconsistent variants and the variant with the largest
    absolute difference are written to the log.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Summary statistics; must contain "SNPID" or "rsID" to label variants.
    beta, se, p, mlog10p : str
        Accepted for interface compatibility. The checks below read the fixed
        column names "BETA", "SE", "P" and "MLOG10P"; these arguments are not
        consulted.
    rtol, atol : float
        Relative / absolute tolerance forwarded to ``numpy.isclose``.
    equal_nan : bool
        Forwarded to ``numpy.isclose`` for the floating-point checks.
    verbose : bool
        Forwarded to the logger.
    log : Log
        Logger receiving the report.

    Returns
    -------
    ``sumstats`` when ``start_to`` reports that the function cannot run,
    ``0`` when neither SNPID nor rsID is available, otherwise ``None``.
    """
    ##start function with col checking##########################################################
    _start_line = "check data consistency across columns"
    _end_line = "checking data consistency across columns"
    _start_cols =[]
    _start_function = ".check_data_consistency()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return sumstats
    ############################################################################################

    log.write(" -Tolerance: {} (Relative) and {} (Absolute)".format(rtol, atol),verbose=verbose)
    check_status = 0

    # A variant identifier is needed to name the worst offender in the report.
    if "SNPID" in sumstats.columns:
        id_to_use = "SNPID"
    elif "rsID" in sumstats.columns:
        id_to_use = "rsID"
    else:
        log.write(" -SNPID/rsID not available...SKipping",verbose=verbose)
        log.write("Finished checking data consistency across columns.",verbose=verbose)
        return 0

    if "BETA" in sumstats.columns and "SE" in sumstats.columns:
        if "MLOG10P" in sumstats.columns:
            log.write(" -Checking if BETA/SE-derived-MLOG10P is consistent with MLOG10P...",verbose=verbose)
            betase_derived_mlog10p = _convert_betase_to_mlog10p(sumstats["BETA"], sumstats["SE"])
            is_close = np.isclose(betase_derived_mlog10p, sumstats["MLOG10P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
            diff = betase_derived_mlog10p - sumstats["MLOG10P"]
            _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose)
            check_status=1

        if "P" in sumstats.columns:
            log.write(" -Checking if BETA/SE-derived-P is consistent with P...",verbose=verbose)
            betase_derived_p = _convert_betase_to_p(sumstats["BETA"], sumstats["SE"])
            is_close = np.isclose(betase_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
            diff = betase_derived_p - sumstats["P"]
            _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose)
            check_status=1

    if "MLOG10P" in sumstats.columns and "P" in sumstats.columns:
        log.write(" -Checking if MLOG10P-derived-P is consistent with P...",verbose=verbose)
        mlog10p_derived_p = _convert_mlog10p_to_p(sumstats["MLOG10P"])
        is_close = np.isclose(mlog10p_derived_p, sumstats["P"], rtol=rtol, atol=atol, equal_nan=equal_nan)
        diff = mlog10p_derived_p - sumstats["P"]
        _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose)
        check_status=1

    if "N" in sumstats.columns and "N_CONTROL" in sumstats.columns and "N_CASE" in sumstats.columns:
        log.write(" -Checking if N is consistent with N_CASE + N_CONTROL ...",verbose=verbose)
        n_expected = sumstats["N_CASE"] + sumstats["N_CONTROL"]
        # Sample sizes are compared exactly. With nullable integer columns the
        # comparison yields <NA>; count those rows as inconsistent instead of
        # letting the NA propagate into the truth test and raise.
        is_close = (sumstats["N"] == n_expected).fillna(False)
        diff = sumstats["N"] - n_expected
        _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose)
        check_status=1

    if check_status==1:
        log.write(" -Note: if the max difference is greater than expected, please check your original sumstats.",verbose=verbose)
    else:
        log.write(" -No availalbe columns for data consistency checking...Skipping...",verbose=verbose)
    finished(log,verbose,_end_line)


def _log_consistency_result(sumstats, is_close, diff, id_to_use, log, verbose):
    """Log how many variants failed one consistency check and the worst one."""
    n_inconsistent = int(np.count_nonzero(~np.asarray(is_close, dtype=bool)))
    if n_inconsistent == 0:
        log.write(" -Variants with inconsistent values were not detected." ,verbose=verbose)
        return
    log.write(" -Not consistent: {} variant(s)".format(n_inconsistent),verbose=verbose)
    # Rank by magnitude so that large negative discrepancies are reported too.
    abs_diff = diff.abs()
    if abs_diff.notna().any():
        worst = abs_diff.idxmax()
        log.write(" -Variant {} with max difference: {} with {}".format(id_to_use, sumstats.loc[worst,id_to_use], abs_diff.max()),verbose=verbose)
1056
1185
 
1057
1186
  ###############################################################################################################
1058
1187
  # 20220426
@@ -1076,11 +1205,81 @@ def flip_direction(string):
1076
1205
  else: #sometime it is 0
1077
1206
  flipped_string+=char
1078
1207
  return flipped_string
1079
-
1208
+
1209
def flip_by_swap(sumstats, matched_index, log, verbose):
    """Exchange the NEA and EA values of the selected rows.

    Nothing is done unless both "NEA" and "EA" columns exist. ``sumstats`` is
    modified in place and also returned.
    """
    has_both_alleles = {"NEA", "EA"}.issubset(sumstats.columns)
    if not has_both_alleles:
        return sumstats

    if verbose:
        log.write(" -Swapping column: NEA <=> EA...")
    # Read both columns in (EA, NEA) order first, then write them back in
    # (NEA, EA) order so the two values of every selected row trade places.
    ea_then_nea = sumstats.loc[matched_index, ["EA", "NEA"]].values
    sumstats.loc[matched_index, ["NEA", "EA"]] = ea_then_nea
    return sumstats
1214
+
1215
def flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Invert ratio statistics (OR, HR and their 95% bounds) for selected rows.

    For each of OR and HR that is present, the point estimate becomes
    ``factor / value``. The confidence bounds are inverted *and* exchanged:
    the new upper bound is ``factor / old lower`` and the new lower bound is
    ``factor / old upper``. Both new bounds are computed from the values the
    columns held on entry, so writing one never feeds into the other.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Modified in place and returned.
    matched_index : boolean mask or index accepted by ``DataFrame.loc``
        Rows to flip.
    log : object with ``write``; only used when ``verbose`` is true.
    verbose : bool
    cols : unused, kept for interface compatibility.
    factor : numerator of the inversion (default 1).
    """
    for ratio in ("OR", "HR"):
        if ratio in sumstats.columns:
            if verbose: log.write(" -Flipping column: {0} = 1 / {0}...".format(ratio))
            sumstats.loc[matched_index, ratio] = factor / sumstats.loc[matched_index, ratio].values
        _invert_and_swap_bounds(sumstats, matched_index, ratio + "_95L", ratio + "_95U", factor, log, verbose)
    return sumstats


def _invert_and_swap_bounds(sumstats, matched_index, lower, upper, factor, log, verbose):
    """Set upper = factor / old lower and lower = factor / old upper."""
    has_lower = lower in sumstats.columns
    has_upper = upper in sumstats.columns
    # Snapshot both bounds before any write; otherwise the second assignment
    # would read the value produced by the first and undo the flip.
    old_lower = sumstats.loc[matched_index, lower].values.copy() if has_lower else None
    old_upper = sumstats.loc[matched_index, upper].values.copy() if has_upper else None
    if has_lower:
        if verbose: log.write(" -Flipping column: {} = 1 / {}...".format(upper, lower))
        sumstats.loc[matched_index, upper] = factor / old_lower
    if has_upper:
        if verbose: log.write(" -Flipping column: {} = 1 / {}...".format(lower, upper))
        sumstats.loc[matched_index, lower] = factor / old_upper
1235
+
1236
def flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Replace EAF with ``factor - EAF`` for the selected rows.

    Does nothing when there is no "EAF" column. ``cols`` is accepted but not
    used. ``sumstats`` is modified in place and returned.
    """
    if "EAF" not in sumstats.columns:
        return sumstats

    if verbose:
        log.write(" -Flipping column: EAF = 1 - EAF...")
    current_eaf = sumstats.loc[matched_index, "EAF"].values
    sumstats.loc[matched_index, "EAF"] = factor - current_eaf
    return sumstats
1241
+
1242
def flip_by_sign(sumstats, matched_index, log, verbose, cols=None):
    """Negate signed statistics for the selected rows.

    Handles, when present: BETA, the BETA_95L/BETA_95U pair (negated and
    exchanged, using the values held on entry), Z, T, and DIRECTION (each
    string is passed through ``flip_direction``).

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Modified in place and returned.
    matched_index : boolean mask or index accepted by ``DataFrame.loc``
        Rows to flip.
    log : object with ``write``; only used when ``verbose`` is true.
    verbose : bool
    cols : unused, kept for interface compatibility.
    """
    if "BETA" in sumstats.columns:
        if verbose: log.write(" -Flipping column: BETA = - BETA...")
        sumstats.loc[matched_index,"BETA"] = - sumstats.loc[matched_index,"BETA"].values

    _negate_and_swap_bounds(sumstats, matched_index, "BETA_95L", "BETA_95U", log, verbose)

    if "Z" in sumstats.columns:
        if verbose: log.write(" -Flipping column: Z = - Z...")
        sumstats.loc[matched_index,"Z"] = - sumstats.loc[matched_index,"Z"].values
    if "T" in sumstats.columns:
        if verbose: log.write(" -Flipping column: T = - T...")
        # Write back into T itself; assigning to Z here would clobber (or
        # create) Z and leave T unflipped.
        sumstats.loc[matched_index,"T"] = - sumstats.loc[matched_index,"T"].values
    if "DIRECTION" in sumstats.columns:
        if verbose: log.write(" -Flipping column: DIRECTION +-?0 <=> -+?0 ...")
        sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
    return sumstats


def _negate_and_swap_bounds(sumstats, matched_index, lower, upper, log, verbose):
    """Set upper = -(old lower) and lower = -(old upper)."""
    has_lower = lower in sumstats.columns
    has_upper = upper in sumstats.columns
    # Snapshot both bounds before any write; otherwise the second assignment
    # would read the value produced by the first and undo the flip.
    old_lower = sumstats.loc[matched_index, lower].values.copy() if has_lower else None
    old_upper = sumstats.loc[matched_index, upper].values.copy() if has_upper else None
    if has_lower:
        if verbose: log.write(" -Flipping column: {} = - {}...".format(upper, lower))
        sumstats.loc[matched_index, upper] = - old_lower
    if has_upper:
        if verbose: log.write(" -Flipping column: {} = - {}...".format(lower, upper))
        sumstats.loc[matched_index, lower] = - old_upper
1262
+
1080
1263
  def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1081
-
1082
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
1083
-
1264
+ ##start function with col checking##########################################################
1265
+ _start_line = "adjust statistics based on STATUS code"
1266
+ _end_line = "adjusting statistics based on STATUS code"
1267
+ _start_cols =[]
1268
+ _start_function = ".check_data_consistency()"
1269
+ _must_args ={}
1270
+
1271
+ is_enough_info = start_to(sumstats=sumstats,
1272
+ log=log,
1273
+ verbose=verbose,
1274
+ start_line=_start_line,
1275
+ end_line=_end_line,
1276
+ start_cols=_start_cols,
1277
+ start_function=_start_function,
1278
+ **_must_args)
1279
+ if is_enough_info == False: return sumstats
1280
+ ############################################################################################
1281
+
1282
+ if_stats_flipped = False
1084
1283
  ###################get reverse complementary####################
1085
1284
  pattern = r"\w\w\w\w\w[45]\w"
1086
1285
  #matched_index = status_match(sumstats[status],6,[4,5]) #
@@ -1092,107 +1291,49 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1092
1291
  if verbose: log.write(" -Converting to reverse complement : EA and NEA...")
1093
1292
  reverse_complement_nea = sumstats.loc[matched_index,'NEA'].apply(lambda x :get_reverse_complementary_allele(x))
1094
1293
  reverse_complement_ea = sumstats.loc[matched_index,'EA'].apply(lambda x :get_reverse_complementary_allele(x))
1095
- categories = set(sumstats.loc[:,'EA'])|set(sumstats.loc[:,'NEA']) |set(reverse_complement_ea) |set(reverse_complement_nea)
1096
- sumstats.loc[:,'EA']=pd.Categorical(sumstats.loc[:,'EA'],categories = categories)
1097
- sumstats.loc[:,'NEA']=pd.Categorical(sumstats.loc[:,'NEA'],categories = categories )
1294
+ categories = set(sumstats['EA'])|set(sumstats['NEA']) |set(reverse_complement_ea) |set(reverse_complement_nea)
1295
+ sumstats['EA']=pd.Categorical(sumstats['EA'],categories = categories)
1296
+ sumstats['NEA']=pd.Categorical(sumstats['NEA'],categories = categories )
1098
1297
  sumstats.loc[matched_index,['NEA']] = reverse_complement_nea
1099
1298
  sumstats.loc[matched_index,['EA']] = reverse_complement_ea
1100
1299
  sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 6, "4","2")
1101
1300
  if verbose: log.write(" -Changed the status for flipped variants : xxxxx4x -> xxxxx2x")
1102
-
1301
+ if_stats_flipped = True
1103
1302
  ###################flip ref####################
1104
1303
  pattern = r"\w\w\w\w\w[35]\w"
1105
1304
  #matched_index = status_match(sumstats[status],6,[3,5]) #sumstats[status].str.match(pattern)
1106
1305
  matched_index = sumstats[status].str[5].str.match(r"3|5")
1107
1306
  if sum(matched_index)>0:
1108
- if verbose: log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x: alt->ea , ref->nea ...{}".format(_get_version()))
1307
+ if verbose: log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x: ALT->EA , REF->NEA ...{}".format(_get_version()))
1109
1308
  if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
1110
- if ("NEA" in sumstats.columns) and ("EA" in sumstats.columns) :
1111
- if verbose: log.write(" -Swapping column: NEA <=> EA...")
1112
- sumstats.loc[matched_index,['NEA','EA']] = sumstats.loc[matched_index,['EA','NEA']].values
1113
- if "BETA" in sumstats.columns:
1114
- if verbose: log.write(" -Flipping column: BETA = - BETA...")
1115
- sumstats.loc[matched_index,"BETA"] = - sumstats.loc[matched_index,"BETA"].values
1116
- if "BETA_95L" in sumstats.columns:
1117
- if verbose: log.write(" -Flipping column: BETA_95L = - BETA_95L...")
1118
- sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
1119
- if "BETA_95U" in sumstats.columns:
1120
- if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
1121
- sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
1122
- if "EAF" in sumstats.columns:
1123
- if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
1124
- sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
1125
- if "OR" in sumstats.columns:
1126
- if verbose: log.write(" -Flipping column: OR = 1 / OR...")
1127
- sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
1128
- if "OR_95L" in sumstats.columns:
1129
- if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
1130
- sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
1131
- if "OR_95U" in sumstats.columns:
1132
- if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
1133
- sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
1134
- if "HR" in sumstats.columns:
1135
- if verbose: log.write(" -Flipping column: HR = 1 / HR...")
1136
- sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
1137
- if "HR_95L" in sumstats.columns:
1138
- if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
1139
- sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
1140
- if "HR_95U" in sumstats.columns:
1141
- if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
1142
- sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
1143
- if "DIRECTION" in sumstats.columns:
1144
- if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
1145
- sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
1309
+
1310
+ flip_by_swap(sumstats, matched_index, log, verbose)
1311
+ flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
1312
+ flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
1313
+ flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
1314
+
1146
1315
  #change status
1147
1316
  if verbose: log.write(" -Changed the status for flipped variants : xxxxx[35]x -> xxxxx[12]x")
1148
1317
  sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 6, "35","12")
1318
+ if_stats_flipped = True
1149
1319
 
1150
1320
  ###################flip ref for undistingushable indels####################
1151
1321
  pattern = r"\w\w\w\w[123][67]6"
1152
1322
  #matched_index = status_match(sumstats[status],6,[1,2,3])|status_match(sumstats[status],6,[6,7])|status_match(sumstats[status],7,6) #sumstats[status].str.match(pattern)
1153
1323
  matched_index = sumstats[status].str[4:].str.match(r"[123][67]6")
1154
1324
  if sum(matched_index)>0:
1155
- if verbose: log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]: alt->ea , ref->nea...{}".format(_get_version()))
1325
+ if verbose: log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]: ALT->EA , REF->NEA...{}".format(_get_version()))
1156
1326
  if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
1157
- if ("NEA" in sumstats.columns) and ("EA" in sumstats.columns) :
1158
- if verbose: log.write(" -Swapping column: NEA <=> EA...")
1159
- sumstats.loc[matched_index,['NEA','EA']] = sumstats.loc[matched_index,['EA','NEA']].values
1160
- if "BETA" in sumstats.columns:
1161
- if verbose: log.write(" -Flipping column: BETA = - BETA...")
1162
- sumstats.loc[matched_index,"BETA"] = - sumstats.loc[matched_index,"BETA"].values
1163
- if "BETA_95L" in sumstats.columns:
1164
- if verbose: log.write(" -Flipping column: BETA_95L = - BETA_95L...")
1165
- sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
1166
- if "BETA_95U" in sumstats.columns:
1167
- if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
1168
- sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
1169
- if "EAF" in sumstats.columns:
1170
- if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
1171
- sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
1172
- if "OR" in sumstats.columns:
1173
- if verbose: log.write(" -Flipping column: OR = 1 / OR...")
1174
- sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
1175
- if "OR_95L" in sumstats.columns:
1176
- if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
1177
- sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
1178
- if "OR_95U" in sumstats.columns:
1179
- if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
1180
- sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
1181
- if "HR" in sumstats.columns:
1182
- if verbose: log.write(" -Flipping column: HR = 1 / HR...")
1183
- sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
1184
- if "HR_95L" in sumstats.columns:
1185
- if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
1186
- sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
1187
- if "HR_95U" in sumstats.columns:
1188
- if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
1189
- sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
1190
- if "DIRECTION" in sumstats.columns:
1191
- if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
1192
- sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
1327
+
1328
+ flip_by_swap(sumstats, matched_index, log, verbose)
1329
+ flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
1330
+ flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
1331
+ flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
1332
+
1193
1333
  #change status
1194
1334
  if verbose: log.write(" -Changed the status for flipped variants xxxx[123][67]6 -> xxxx[123][67]4")
1195
1335
  sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 7, "6","4")
1336
+ if_stats_flipped = True
1196
1337
  # flip ref
1197
1338
  ###################flip statistics for reverse strand panlindromic variants####################
1198
1339
  pattern = r"\w\w\w\w\w[012]5"
@@ -1201,43 +1342,20 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1201
1342
  if sum(matched_index)>0:
1202
1343
  if verbose: log.write("Start to flip allele-specific stats for palindromic SNPs with status xxxxx[12]5: (-)strand <=> (+)strand...{}".format(_get_version()))
1203
1344
  if verbose: log.write(" -Flipping "+ str(sum(matched_index)) +" variants...")
1204
- if "BETA" in sumstats.columns:
1205
- if verbose: log.write(" -Flipping column: BETA = - BETA...")
1206
- sumstats.loc[matched_index,"BETA"] = - sumstats.loc[matched_index,"BETA"].values
1207
- if "BETA_95L" in sumstats.columns:
1208
- if verbose: log.write(" -Flipping column: BETA_95L = - BETA_95L...")
1209
- sumstats.loc[matched_index,"BETA_95L"] = - sumstats.loc[matched_index,"BETA_95L"].values
1210
- if "BETA_95U" in sumstats.columns:
1211
- if verbose: log.write(" -Flipping column: BETA_95U = - BETA_95U...")
1212
- sumstats.loc[matched_index,"BETA_95U"] = - sumstats.loc[matched_index,"BETA_95U"].values
1213
- if "EAF" in sumstats.columns:
1214
- if verbose: log.write(" -Flipping column: EAF = 1 - EAF...")
1215
- sumstats.loc[matched_index,"EAF"] = 1 - sumstats.loc[matched_index,"EAF"].values
1216
- if "OR" in sumstats.columns:
1217
- if verbose: log.write(" -Flipping column: OR = 1 / OR...")
1218
- sumstats.loc[matched_index,"OR"] = 1 / sumstats.loc[matched_index,"OR"].values
1219
- if "OR_95L" in sumstats.columns:
1220
- if verbose: log.write(" -Flipping column: OR_95L = 1 / OR_95L...")
1221
- sumstats.loc[matched_index,"OR_95L"] = 1 / sumstats.loc[matched_index,"OR_95L"].values
1222
- if "OR_95U" in sumstats.columns:
1223
- if verbose: log.write(" -Flipping column: OR_95U = 1 / OR_95U...")
1224
- sumstats.loc[matched_index,"OR_95U"] = 1 / sumstats.loc[matched_index,"OR_95U"].values
1225
- if "HR" in sumstats.columns:
1226
- if verbose: log.write(" -Flipping column: HR = 1 / HR...")
1227
- sumstats.loc[matched_index,"HR"] = 1 / sumstats.loc[matched_index,"HR"].values
1228
- if "HR_95L" in sumstats.columns:
1229
- if verbose: log.write(" -Flipping column: HR_95L = 1 / HR_95L...")
1230
- sumstats.loc[matched_index,"HR_95L"] = 1 / sumstats.loc[matched_index,"HR_95L"].values
1231
- if "HR_95U" in sumstats.columns:
1232
- if verbose: log.write(" -Flipping column: HR_95U = 1 / HR_95U...")
1233
- sumstats.loc[matched_index,"HR_95U"] = 1 / sumstats.loc[matched_index,"HR_95U"].values
1234
- if "DIRECTION" in sumstats.columns:
1235
- if verbose: log.write(" -Flipping column: DIRECTION +-? <=> -+? ...")
1236
- sumstats.loc[matched_index,"DIRECTION"] = sumstats.loc[matched_index,"DIRECTION"].apply(flip_direction)
1345
+
1346
+ flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
1347
+ flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
1348
+ flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
1349
+
1237
1350
  #change status
1238
1351
  if verbose: log.write(" -Changed the status for flipped variants: xxxxx[012]5: -> xxxxx[012]2")
1239
1352
  sumstats.loc[matched_index,status] = vchange_status(sumstats.loc[matched_index,status], 7, "5","2")
1240
- if verbose: log.write("Finished converting successfully!")
1353
+ if_stats_flipped = True
1354
+
1355
+ if if_stats_flipped == True:
1356
+ finished(log, verbose, "adjusting")
1357
+ else:
1358
+ finished(log, verbose, "adjusting with no statistics changed.")
1241
1359
  return sumstats
1242
1360
  ""
1243
1361
 
@@ -1246,8 +1364,8 @@ def flipallelestats(sumstats,status="STATUS",verbose=True,log=Log()):
1246
1364
  # 20220426
1247
1365
  def liftover_snv(row,chrom,converter,to_build):
1248
1366
  status_pre=""
1249
- status_end=row[1][2]+"9"+row[1][4]+"99"
1250
- pos_0_based = int(row[0]) - 1
1367
+ status_end=row.iloc[1][2]+"9"+row.iloc[1][4]+"99"
1368
+ pos_0_based = int(row.iloc[0]) - 1
1251
1369
  results = converter[chrom][pos_0_based]
1252
1370
  if converter[chrom][pos_0_based]:
1253
1371
  # return chrom, pos_1_based
@@ -1277,13 +1395,25 @@ def liftover_variant(sumstats,
1277
1395
  return sumstats
1278
1396
 
1279
1397
  def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True, verbose=True,log=Log()):
1280
- if check_col(sumstats,chrom,pos,status) is not True:
1281
- if verbose: log.write("WARNING:.liftover(): specified columns not detected..skipping...")
1282
- return sumstats
1283
- if verbose: log.write("Start to perform liftover...{}".format(_get_version()))
1284
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
1285
- if verbose: log.write(" -CPU Cores to use :",n_cores)
1286
- if verbose: log.write(" -Performing liftover ...")
1398
+ ##start function with col checking##########################################################
1399
+ _start_line = "perform liftover"
1400
+ _end_line = "liftover"
1401
+ _start_cols =[chrom,pos,status]
1402
+ _start_function = ".liftover()"
1403
+ _must_args ={}
1404
+
1405
+ is_enough_info = start_to(sumstats=sumstats,
1406
+ log=log,
1407
+ verbose=verbose,
1408
+ start_line=_start_line,
1409
+ end_line=_end_line,
1410
+ start_cols=_start_cols,
1411
+ start_function=_start_function,
1412
+ n_cores=n_cores,
1413
+ **_must_args)
1414
+ if is_enough_info == False: return sumstats
1415
+ ############################################################################################
1416
+
1287
1417
  if verbose: log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build)
1288
1418
  # valid chr and pos
1289
1419
  pattern = r"\w\w\w0\w\w\w"
@@ -1295,11 +1425,12 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
1295
1425
  if sum(to_lift)<10000:
1296
1426
  n_cores=1
1297
1427
 
1298
- df_split = np.array_split(sumstats.loc[:,[chrom,pos,status]], n_cores)
1428
+ #df_split = np.array_split(sumstats[[chrom,pos,status]], n_cores)
1429
+ df_split = _df_split(sumstats[[chrom,pos,status]], n_cores)
1299
1430
  pool = Pool(n_cores)
1300
1431
  #df = pd.concat(pool.starmap(func, df_split))
1301
1432
  func=liftover_variant
1302
- sumstats.loc[:,[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status),df_split))
1433
+ sumstats[[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status),df_split))
1303
1434
  pool.close()
1304
1435
  pool.join()
1305
1436
  ############################################################################
@@ -1314,18 +1445,29 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
1314
1445
  sumstats = fixchr(sumstats,chrom=chrom,add_prefix="",remove=remove, verbose=True)
1315
1446
  sumstats = fixpos(sumstats,pos=pos,remove=remove, verbose=True)
1316
1447
 
1317
- if verbose: log.write("Finished liftover successfully!")
1448
+ finished(log,verbose,_end_line)
1318
1449
  return sumstats
1319
1450
 
1320
1451
  ###############################################################################################################
1321
1452
  # 20220426
1322
1453
  def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=Log()):
1323
- if check_col(sumstats,chrom,pos) is not True:
1324
- if verbose: log.write(".liftover(): specified columns not detected..skipping...")
1325
- return sumstats
1326
-
1327
- if verbose: log.write("Start to sort the genome coordinates...{}".format(_get_version()))
1328
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
1454
+ ##start function with col checking##########################################################
1455
+ _start_line = "sort the genome coordinates"
1456
+ _end_line = "sorting coordinates"
1457
+ _start_cols =[chrom,pos]
1458
+ _start_function = ".sort_coordinate()"
1459
+ _must_args ={}
1460
+
1461
+ is_enough_info = start_to(sumstats=sumstats,
1462
+ log=log,
1463
+ verbose=verbose,
1464
+ start_line=_start_line,
1465
+ end_line=_end_line,
1466
+ start_cols=_start_cols,
1467
+ start_function=_start_function,
1468
+ **_must_args)
1469
+ if is_enough_info == False: return sumstats
1470
+ ############################################################################################
1329
1471
 
1330
1472
  try:
1331
1473
  if sumstats[pos].dtype == "Int64":
@@ -1335,49 +1477,144 @@ def sortcoordinate(sumstats,chrom="CHR",pos="POS",reindex=True,verbose=True,log=
1335
1477
  sumstats[pos] = np.floor(pd.to_numeric(sumstats[pos], errors='coerce')).astype('Int64')
1336
1478
  except:
1337
1479
  pass
1338
-
1339
- if verbose: log.write(" -Sorting genome coordinates...")
1340
1480
  sumstats = sumstats.sort_values(by=[chrom,pos],ascending=True,ignore_index=True)
1341
- if verbose: log.write("Finished sorting genome coordinates successfully!")
1342
- gc.collect()
1481
+
1482
+ finished(log,verbose,_end_line)
1343
1483
  return sumstats
1344
1484
  ###############################################################################################################
1345
1485
  # 20230430 added HR HR_95 BETA_95 N_CASE N_CONTROL
1346
- def sortcolumn(sumstats,verbose=True,log=Log(),order = [
1347
- "SNPID","rsID", "CHR", "POS", "EA", "NEA", "EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z",
1348
- "CHISQ", "P", "MLOG10P", "OR", "OR_95L", "OR_95U","HR", "HR_95L", "HR_95U","INFO", "N","N_CASE","N_CONTROL","DIRECTION","I2","P_HET","DOF","SNPR2","STATUS"
1349
- ]):
1350
- if verbose: log.write("Start to reorder the columns...{}".format(_get_version()))
1351
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
1352
-
1486
def sortcolumn(sumstats,verbose=True,log=Log(),order = None):
    """Reorder the columns of ``sumstats``.

    Columns listed in ``order`` that exist in ``sumstats`` are placed first,
    in the sequence given by ``order``; all remaining columns follow in their
    current relative order.

    Parameters
    ----------
    sumstats : table whose columns are reordered (indexed with a list of
        column names, so presumably a pandas DataFrame).
    verbose : forwarded to the logging helpers and gates the
        "Reordering columns" message.
    log : logger object exposing ``write``.
    order : list of column names, or None to use the built-in ordering below.

    Returns
    -------
    ``sumstats`` indexed by the new column list, or the input unchanged when
    ``start_to`` reports that the step should not run.
    """
    end_message = "reordering the columns"

    # Shared start-up: log the start line and run the common checks.
    ready = start_to(sumstats=sumstats,
                     log=log,
                     verbose=verbose,
                     start_line="reorder the columns",
                     end_line=end_message,
                     start_cols=[],
                     start_function=".sort_column()")
    if not ready:
        return sumstats

    if order is None:
        order = [
            "SNPID","rsID", "CHR", "POS", "EA", "NEA", "EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z","T","F",
            "CHISQ", "P", "MLOG10P", "OR", "OR_95L", "OR_95U","HR", "HR_95L", "HR_95U","INFO", "N","N_CASE","N_CONTROL","DIRECTION","I2","P_HET","DOF","SNPR2","STATUS"]

    # Known columns first (in the requested order), then everything else.
    leading = [name for name in order if name in sumstats.columns]
    trailing = [name for name in sumstats.columns if name not in order]
    output_columns = leading + trailing

    if verbose:
        log.write(" -Reordering columns to :", ",".join(output_columns))
    sumstats = sumstats[output_columns]

    finished(log,verbose,end_message)
    return sumstats
1362
1519
 
1363
- def check_col(df,*args):
1520
+
1521
+ ###############################################################################################################
1522
def start_to(sumstats,
             log,
             verbose,
             start_line,
             end_line,
             start_cols,
             start_function,
             ref_vcf=None,
             ref_fasta=None,
             n_cores=None,
             ref_tsv=None,
             **args
             ):
    """Common prologue for a processing step: log the start and run checks.

    Logs "Start to <start_line>...", reports the dataframe shape via
    ``check_dataframe_shape``, verifies with ``check_col`` that ``start_cols``
    are present in ``sumstats.columns``, and validates every extra keyword
    argument with ``check_arg``.

    Parameters
    ----------
    sumstats : table being processed; only ``.columns`` is read here.
    log : logger object exposing ``write``.
    verbose : forwarded to every logging call made by this function.
    start_line : text inserted into the "Start to ..." message.
    end_line : text passed to ``skipped`` when the step will not run.
    start_cols : required column names, forwarded to ``check_col``.
    start_function : name of the calling method, used in warnings.
    ref_vcf, ref_fasta, n_cores, ref_tsv : optional values that are only
        logged (when not None and the column check passed).
    **args : required arguments; ``check_arg`` is called on each one.

    Returns
    -------
    True when the column check and all argument checks pass, otherwise False
    (after calling ``skipped``).
    """
    log.write("Start to {}...{}".format(start_line,_get_version()), verbose=verbose)

    check_dataframe_shape(sumstats=sumstats,
                          log=log,
                          verbose=verbose)

    is_enough_col = check_col(sumstats.columns,
                              verbose=verbose,
                              log=log,
                              cols=start_cols,
                              function=start_function)

    if is_enough_col:
        # These informational lines now honour `verbose`, like the
        # "Start to ..." line above; previously they were written without it.
        if n_cores is not None:
            log.write(" -Number of threads/cores to use: {}".format(n_cores), verbose=verbose)
        if ref_vcf is not None:
            log.write(" -Reference VCF: {}".format(ref_vcf), verbose=verbose)
        if ref_fasta is not None:
            log.write(" -Reference FASTA: {}".format(ref_fasta), verbose=verbose)
        if ref_tsv is not None:
            log.write(" -Reference TSV: {}".format(ref_tsv), verbose=verbose)

    # `&` (not `and`) on purpose: every argument is checked so that each
    # missing one produces its own warning.
    is_args_valid = True
    for key, value in args.items():
        is_args_valid = is_args_valid & check_arg(log, verbose, key, value, start_function)
    is_enough_col = is_args_valid & is_enough_col

    if not is_enough_col:
        skipped(log, verbose, end_line)

    return is_enough_col
1567
+
1568
def finished(log, verbose, end_line):
    """Log the "Finished <end_line>." message, then run a garbage collection.

    ``verbose`` is forwarded to ``log.write``.
    """
    message = "Finished {}.".format(end_line)
    log.write(message, verbose=verbose)
    gc.collect()
1571
+
1572
def skipped(log, verbose, end_line):
    """Log the "Skipped <end_line>." message, then run a garbage collection.

    ``verbose`` is forwarded to ``log.write``.
    """
    message = "Skipped {}.".format(end_line)
    log.write(message, verbose=verbose)
    gc.collect()
1575
+
1576
def check_arg(log, verbose, key, value, function):
    """Return True when a required argument was supplied (is not None).

    When ``value`` is None a warning naming the argument ``key`` and the
    calling ``function`` is emitted through ``log.warning`` and False is
    returned. ``verbose`` is accepted but not used by this check.
    """
    if value is not None:
        return True
    log.warning("Necessary argument {} for {} is not provided!".format(key, function))
    return False
1581
+
1582
def check_col(df_col_names, verbose=True, log=Log(), cols=None, function=None):
    """Check that every required column name is present.

    Parameters
    ----------
    df_col_names : container of available column names (membership-tested
        with ``in``).
    verbose : forwarded to ``skipped`` when columns are missing.
    log : logger object exposing ``warning``.
    cols : required columns. Each entry is either a single column name
        (str) or an iterable of names that are each checked individually.
        None is treated as "nothing required".
    function : name of the calling method, used in the warning text and
        passed to ``skipped``; may be None.

    Returns
    -------
    True when nothing is missing; otherwise a warning listing the missing
    names is logged, ``skipped`` is called, and False is returned.
    """
    # The declared default is None; iterating it directly raised TypeError.
    if cols is None:
        cols = []

    not_in_df = []
    for entry in cols:
        # isinstance (not an exact type match) so that str subclasses are
        # handled as one name rather than iterated character by character.
        if isinstance(entry, str):
            # single check
            if entry not in df_col_names:
                not_in_df.append(entry)
        else:
            # paired check: every member of the group must be present
            not_in_df.extend(name for name in entry if name not in df_col_names)

    if len(not_in_df) > 0:
        if function is None:
            to_show_title = " "
        else:
            to_show_title = " for {} ".format(function)
        log.warning("Necessary columns{}were not detected:{}".format(to_show_title, ",".join(not_in_df)))
        skipped(log, verbose, end_line=function)
        return False

    return True
1609
+
1610
+ ###############################################################################################################
1611
+ def _df_split(dataframe, n):
1612
+ chunks = []
1613
+ chunk_size = int(dataframe.shape[0] // n)+1
1614
+
1615
+ for index in range(0, dataframe.shape[0], chunk_size):
1616
+ chunks.append(
1617
+ dataframe.iloc[index:index + chunk_size]
1618
+ )
1619
+
1620
+ return chunks