gwaslab 3.4.37-py3-none-any.whl → 3.4.39-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (57)
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/data/formatbook.json +722 -721
  5. gwaslab/g_Log.py +22 -5
  6. gwaslab/g_Sumstats.py +110 -163
  7. gwaslab/g_SumstatsPair.py +76 -25
  8. gwaslab/g_SumstatsT.py +2 -2
  9. gwaslab/g_Sumstats_summary.py +3 -3
  10. gwaslab/g_version.py +10 -10
  11. gwaslab/hm_casting.py +36 -17
  12. gwaslab/hm_harmonize_sumstats.py +354 -221
  13. gwaslab/hm_rsid_to_chrpos.py +1 -1
  14. gwaslab/io_preformat_input.py +49 -43
  15. gwaslab/io_read_ldsc.py +49 -1
  16. gwaslab/io_to_formats.py +428 -295
  17. gwaslab/ldsc_irwls.py +198 -0
  18. gwaslab/ldsc_jackknife.py +514 -0
  19. gwaslab/ldsc_ldscore.py +417 -0
  20. gwaslab/ldsc_parse.py +294 -0
  21. gwaslab/ldsc_regressions.py +747 -0
  22. gwaslab/ldsc_sumstats.py +629 -0
  23. gwaslab/qc_check_datatype.py +3 -3
  24. gwaslab/qc_fix_sumstats.py +891 -778
  25. gwaslab/util_ex_calculate_ldmatrix.py +31 -13
  26. gwaslab/util_ex_gwascatalog.py +25 -25
  27. gwaslab/util_ex_ldproxyfinder.py +10 -10
  28. gwaslab/util_ex_ldsc.py +189 -0
  29. gwaslab/util_ex_process_ref.py +3 -3
  30. gwaslab/util_ex_run_coloc.py +26 -4
  31. gwaslab/util_in_calculate_gc.py +6 -6
  32. gwaslab/util_in_calculate_power.py +42 -43
  33. gwaslab/util_in_convert_h2.py +8 -8
  34. gwaslab/util_in_fill_data.py +30 -30
  35. gwaslab/util_in_filter_value.py +201 -74
  36. gwaslab/util_in_get_density.py +10 -10
  37. gwaslab/util_in_get_sig.py +445 -71
  38. gwaslab/viz_aux_annotate_plot.py +12 -12
  39. gwaslab/viz_aux_quickfix.py +42 -37
  40. gwaslab/viz_aux_reposition_text.py +10 -7
  41. gwaslab/viz_aux_save_figure.py +18 -8
  42. gwaslab/viz_plot_compare_af.py +32 -33
  43. gwaslab/viz_plot_compare_effect.py +63 -71
  44. gwaslab/viz_plot_miamiplot2.py +34 -26
  45. gwaslab/viz_plot_mqqplot.py +126 -75
  46. gwaslab/viz_plot_qqplot.py +11 -8
  47. gwaslab/viz_plot_regionalplot.py +36 -33
  48. gwaslab/viz_plot_rg_heatmap.py +28 -26
  49. gwaslab/viz_plot_stackedregional.py +40 -21
  50. gwaslab/viz_plot_trumpetplot.py +65 -61
  51. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  52. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
  53. gwaslab-3.4.39.dist-info/RECORD +80 -0
  54. gwaslab-3.4.37.dist-info/RECORD +0 -72
  55. /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  56. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  57. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
@@ -12,12 +12,18 @@ from gwaslab.g_Log import Log
  from gwaslab.qc_fix_sumstats import fixchr
  from gwaslab.qc_fix_sumstats import fixpos
  from gwaslab.qc_fix_sumstats import sortcolumn
+ from gwaslab.qc_fix_sumstats import _df_split
+ from gwaslab.qc_fix_sumstats import check_col
+ from gwaslab.qc_fix_sumstats import start_to
+ from gwaslab.qc_fix_sumstats import finished
+ from gwaslab.qc_fix_sumstats import skipped
  from gwaslab.qc_check_datatype import check_dataframe_shape
  from gwaslab.bd_common_data import get_number_to_chr
  from gwaslab.bd_common_data import get_chr_list
  from gwaslab.bd_common_data import get_chr_to_number
  from gwaslab.g_vchange_status import vchange_status
  from gwaslab.g_version import _get_version
+
  #rsidtochrpos
  #checkref
  #parallelizeassignrsid
@@ -35,20 +41,35 @@ def rsidtochrpos(sumstats,
  '''
  assign chr:pos based on rsID
  '''
- #########################################################################################################
- if verbose: log.write("Start to update chromosome and position information based on rsID...{}".format(_get_version()))
- check_dataframe_shape(sumstats, log, verbose)
- if verbose: log.write(" -rsID dictionary file: "+ path)
+ ##start function with col checking##########################################################
+ _start_line = "assign CHR and POS using rsIDs"
+ _end_line = "assigning CHR and POS using rsIDs"
+ _start_cols = [rsid,chrom,pos]
+ _start_function = ".rsid_to_chrpos()"
+ _must_args ={}
+
+ is_enough_info = start_to(sumstats=sumstats,
+ log=log,
+ verbose=verbose,
+ start_line=_start_line,
+ end_line=_end_line,
+ start_cols=_start_cols,
+ start_function=_start_function,
+ **_must_args)
+ if is_enough_info == False: return sumstats
+ ############################################################################################
+
+ log.write(" -rsID dictionary file: "+ path,verbose=verbose)

  if ref_rsid_to_chrpos_tsv is not None:
  path = ref_rsid_to_chrpos_tsv

  if snpid in sumstats.columns and sum(sumstats[rsid].isna())>0:
- if verbose: log.write(" -Filling na in rsID columns with SNPID...")
+ log.write(" -Filling na in rsID columns with SNPID...",verbose=verbose)
  sumstats.loc[sumstats[rsid].isna(),rsid] = sumstats.loc[sumstats[rsid].isna(),snpid]

  if sum(sumstats[rsid].isna())>0:
- if verbose: log.write(" -Filling na in rsID columns with NA_xxx for {} variants...".format(sum(sumstats[rsid].isna())))
+ log.write(" -Filling na in rsID columns with NA_xxx for {} variants...".format(sum(sumstats[rsid].isna())),verbose=verbose)
  sumstats.loc[sumstats[rsid].isna(),rsid] = ["NA_" + str(x+1) for x in range(len(sumstats.loc[sumstats[rsid].isna(),rsid]))]

  dic_chuncks = pd.read_csv(path,sep="\t",usecols=[ref_rsid,ref_chr,ref_pos],
@@ -63,8 +84,8 @@ def rsidtochrpos(sumstats,
  if pos not in sumstats.columns:
  sumstats[pos] =pd.Series(dtype="Int64")

- if verbose: log.write(" -Setting block size: ",chunksize)
- if verbose: log.write(" -Loading block: ",end="")
+ log.write(" -Setting block size: ",chunksize,verbose=verbose)
+ log.write(" -Loading block: ",end="",verbose=verbose)
  for i,dic in enumerate(dic_chuncks):
  dic_to_update = dic[dic.index.notnull()]
  log.write(i," ",end=" ",show_time=False)
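Throughout this file, `if verbose: log.write(...)` guards are replaced by `log.write(..., verbose=verbose)`, moving the verbosity check into the logger itself (gwaslab/g_Log.py also changed in this release, +22 -5, but its diff is not shown on this page). The sketch below is only an assumption about what such an interface could look like, with the signature inferred from the call sites in this file; it is not the actual gwaslab.g_Log.Log implementation.

from datetime import datetime

class VerboseLog:
    # Minimal sketch, not the real gwaslab.g_Log.Log: a logger whose write()
    # accepts the verbose/end/show_time keywords used by the call sites above.
    def write(self, *message, end="\n", show_time=True, verbose=True):
        if not verbose:
            return  # callers no longer need an "if verbose:" guard
        prefix = datetime.now().strftime("%Y/%m/%d %H:%M:%S ") if show_time else ""
        print(prefix + " ".join(str(m) for m in message), end=end)

    def warning(self, *message, verbose=True):
        # mirrors the log.warning(...) calls introduced elsewhere in this diff
        self.write("#WARNING!", *message, verbose=verbose)

log = VerboseLog()
log.write(" -Setting block size: ", 5000000, verbose=True)
log.write("0", " ", end=" ", show_time=False)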
@@ -74,13 +95,15 @@ def rsidtochrpos(sumstats,
  sumstats.update(dic_to_update,overwrite="True")
  gc.collect()

- if verbose: log.write("\n",end="",show_time=False)
+ log.write("\n",end="",show_time=False,verbose=verbose)
  sumstats = sumstats.reset_index()
  sumstats = sumstats.rename(columns = {'index':rsid})
- if verbose: log.write(" -Updating CHR and POS finished.Start to re-fixing CHR and POS... ")
+ log.write(" -Updating CHR and POS finished.Start to re-fixing CHR and POS... ",verbose=verbose)
  sumstats = fixchr(sumstats,verbose=verbose)
  sumstats = fixpos(sumstats,verbose=verbose)
  sumstats = sortcolumn(sumstats,verbose=verbose)
+
+ finished(log,verbose,_end_line)
  return sumstats
  ####################################################################################################

@@ -104,33 +127,48 @@ def merge_chrpos(sumstats_part,all_groups_max,path,build,status):

  def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None, ref_rsid_to_chrpos_vcf = None, ref_rsid_to_chrpos_hdf5 = None, build="99",status="STATUS",
  n_cores=4,block_size=20000000,verbose=True,log=Log()):
-
+
+ ##start function with col checking##########################################################
+ _start_line = "assign CHR and POS using rsIDs"
+ _end_line = "assigning CHR and POS using rsIDs"
+ _start_cols = [rsid,chrom,pos]
+ _start_function = ".rsid_to_chrpos2()"
+ _must_args ={}
+
+ is_enough_info = start_to(sumstats=sumstats,
+ log=log,
+ verbose=verbose,
+ start_line=_start_line,
+ end_line=_end_line,
+ start_cols=_start_cols,
+ start_function=_start_function,
+ **_must_args)
+ if is_enough_info == False: return sumstats
+ ############################################################################################
+
  if ref_rsid_to_chrpos_hdf5 is not None:
  path = ref_rsid_to_chrpos_hdf5
  elif ref_rsid_to_chrpos_vcf is not None:
  vcf_file_name = os.path.basename(ref_rsid_to_chrpos_vcf)
  vcf_dir_path = os.path.dirname(ref_rsid_to_chrpos_vcf)
  path = "{}/{}.rsID_CHR_POS_groups_{}.h5".format(vcf_dir_path,vcf_file_name,int(block_size))
-
- if verbose: log.write("Start to assign CHR and POS using rsIDs...{}".format(_get_version()))
- check_dataframe_shape(sumstats, log, verbose)
-
+
  if path is None:
  raise ValueError("Please provide path to hdf5 file.")

  sumstats["rsn"] = pd.to_numeric(sumstats[rsid].str.strip("rs"),errors="coerce").astype("Int64")

- if verbose: log.write(" -Source hdf5 file: ",path)
- if verbose: log.write(" -Cores to use : ",n_cores)
- if verbose: log.write(" -Blocksize (make sure it is the same as hdf5 file ): ",block_size)
+ log.write(" -Source hdf5 file: ",path,verbose=verbose)
+ log.write(" -Cores to use : ",n_cores,verbose=verbose)
+ log.write(" -Blocksize (make sure it is the same as hdf5 file ): ",block_size,verbose=verbose)

  input_columns= sumstats.columns
  sumstats_nonrs = sumstats.loc[sumstats["rsn"].isna()|sumstats["rsn"].duplicated(keep='first') ,:].copy()
  sumstats_rs = sumstats.loc[sumstats["rsn"].notnull(),:].copy()

- if verbose: log.write(" -Non-Valid rsIDs: ",sum(sumstats["rsn"].isna()))
- if verbose: log.write(" -Duplicated rsIDs except for the first occurrence: ",sum(sumstats.loc[~sumstats["rsn"].isna(), "rsn"].duplicated(keep='first')))
- if verbose: log.write(" -Valid rsIDs: ", len(sumstats_rs))
+ log.write(" -Non-Valid rsIDs: ",sum(sumstats["rsn"].isna()),verbose=verbose)
+ log.write(" -Duplicated rsIDs except for the first occurrence: ",sum(sumstats.loc[~sumstats["rsn"].isna(), "rsn"].duplicated(keep='first')),verbose=verbose)
+ log.write(" -Valid rsIDs: ", len(sumstats_rs),verbose=verbose)

  del sumstats
  gc.collect()
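The `start_to(...)` / `finished(...)` blocks added above (and repeated for the other functions below) replace the previous inline `log.write("Start to ...")` and `check_dataframe_shape` calls with shared helpers imported from gwaslab.qc_fix_sumstats, whose diff is not included on this page. Judging only from the call sites, the contract appears to be: log a "Start to ..." line, check that the required columns are present, and return a flag the caller uses to bail out early; `finished` then logs the matching end-of-step line. A hedged sketch of that contract (names and messages are assumptions, not the actual gwaslab code):

def start_to_sketch(sumstats, log, verbose, start_line, end_line,
                    start_cols=None, start_function=None, **kwargs):
    # Hedged sketch of the contract implied by the call sites in this diff;
    # the real start_to lives in gwaslab/qc_fix_sumstats.py (not shown here).
    log.write("Start to {}...".format(start_line), verbose=verbose)
    if start_function is not None:
        log.write(" -Function used: {}".format(start_function), verbose=verbose)
    missing = [col for col in (start_cols or []) if col not in sumstats.columns]
    if missing:
        log.warning("Not enough information: missing columns {}".format(missing))
        return False  # caller: if is_enough_info == False: return sumstats
    return True

def finished_sketch(log, verbose, end_line):
    # counterpart of the finished(log, verbose, _end_line) calls in this diff
    log.write("Finished {}.".format(end_line), verbose=verbose)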
@@ -147,16 +185,16 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
  #
  pool = Pool(n_cores)
  if chrom not in input_columns:
- if verbose: log.write(" -Initiating CHR ... ")
+ log.write(" -Initiating CHR ... ",verbose=verbose)
  sumstats_rs[chrom]=pd.Series(dtype="Int32")

  if pos not in input_columns:
- if verbose: log.write(" -Initiating POS ... ")
+ log.write(" -Initiating POS ... ",verbose=verbose)
  sumstats_rs[pos]=pd.Series(dtype="Int64")

  df_split=[y for x, y in sumstats_rs.groupby('group', as_index=False)]
- if verbose: log.write(" -Divided into groups: ",len(df_split))
- if verbose: log.write(" -",set(sumstats_rs.loc[:,"group"].unique()))
+ log.write(" -Divided into groups: ",len(df_split),verbose=verbose)
+ log.write(" -",set(sumstats_rs.loc[:,"group"].unique()),verbose=verbose)

  # check keys
  store = pd.HDFStore(path, 'r')
@@ -164,21 +202,21 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
  all_groups_len = len(all_groups)
  store.close()
  all_groups_max = max(map(lambda x: int(x.split("_")[1]), all_groups))
- if verbose: log.write(" -Number of groups in HDF5: ",all_groups_len)
- if verbose: log.write(" -Max index of groups in HDF5: ",all_groups_max)
+ log.write(" -Number of groups in HDF5: ",all_groups_len,verbose=verbose)
+ log.write(" -Max index of groups in HDF5: ",all_groups_max,verbose=verbose)

  # update CHR and POS using rsID with multiple threads
  sumstats_rs = pd.concat(pool.map(partial(merge_chrpos,all_groups_max=all_groups_max,path=path,build=build,status=status),df_split),ignore_index=True)
  sumstats_rs.loc[:,["CHR","POS"]] = sumstats_rs.loc[:,["CHR","POS"]].astype("Int64")
  del df_split
  gc.collect()
- if verbose: log.write(" -Merging group data... ")
+ log.write(" -Merging group data... ",verbose=verbose)
  # drop group and rsn
  sumstats_rs = sumstats_rs.drop(columns=["group"])
  sumstats_nonrs = sumstats_nonrs.drop(columns=["rsn"])

  # merge back
- if verbose: log.write(" -Append data... ")
+ log.write(" -Append data... ",verbose=verbose)
  sumstats = pd.concat([sumstats_rs,sumstats_nonrs],ignore_index=True)

  del sumstats_rs
@@ -192,8 +230,8 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None

  pool.close()
  pool.join()
- gc.collect()
- if verbose: log.write("Finished assigning CHR and POS using rsIDs.")
+
+ finished(log, verbose, _end_line)
  return sumstats
  ####################################################################################################################
  #20220426 check if non-effect allele is aligned with reference genome
@@ -211,15 +249,15 @@ def check_status(row,record):
  #8 / -----> not on ref genome
  #9 / ------> unchecked

- status_pre=row[3][:5]
- status_end=row[3][6:]
+ status_pre=row.iloc[3][:5]
+ status_end=row.iloc[3][6:]

  ## nea == ref
- if row[2] == record[row[0]-1: row[0]+len(row[2])-1].seq.upper():
+ if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
  ## ea == ref
- if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
+ if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
  ## len(nea) >len(ea):
- if len(row[2])!=len(row[1]):
+ if len(row.iloc[2])!=len(row.iloc[1]):
  # indels both on ref, unable to identify
  return status_pre+"6"+status_end
  else:
@@ -228,35 +266,50 @@ def check_status(row,record):
  ## nea!=ref
  else:
  # ea == ref_seq -> need to flip
- if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
+ if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
  return status_pre+"3"+status_end
  # ea !=ref
  else:
  #_reverse_complementary
- row[1] = get_reverse_complementary_allele(row[1])
- row[2] = get_reverse_complementary_allele(row[2])
+ row.iloc[1] = get_reverse_complementary_allele(row.iloc[1])
+ row.iloc[2] = get_reverse_complementary_allele(row.iloc[2])
  ## nea == ref
- if row[2] == record[row[0]-1: row[0]+len(row[2])-1].seq.upper():
+ if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
  ## ea == ref
- if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
+ if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
  ## len(nea) >len(ea):
- if len(row[2])!=len(row[1]):
+ if len(row.iloc[2])!=len(row.iloc[1]):
  return status_pre+"8"+status_end # indel reverse complementary
  else:
  return status_pre+"4"+status_end
  else:
  # ea == ref_seq -> need to flip
- if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
+ if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
  return status_pre+"5"+status_end
  # ea !=ref
  return status_pre+"8"+status_end


  def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="STATUS",chr_dict=get_chr_to_number(),remove=False,verbose=True,log=Log()):
- if verbose: log.write("Start to check if NEA is aligned with reference sequence...{}".format(_get_version()))
- check_dataframe_shape(sumstats, log, verbose)
- if verbose: log.write(" -Reference genome fasta file: "+ ref_path)
- if verbose: log.write(" -Checking records: ", end="")
+ ##start function with col checking##########################################################
+ _start_line = "check if NEA is aligned with reference sequence"
+ _end_line = "checking if NEA is aligned with reference sequence"
+ _start_cols = [chrom,pos,ea,nea,status]
+ _start_function = ".check_ref()"
+ _must_args ={}
+
+ is_enough_info = start_to(sumstats=sumstats,
+ log=log,
+ verbose=verbose,
+ start_line=_start_line,
+ end_line=_end_line,
+ start_cols=_start_cols,
+ start_function=_start_function,
+ **_must_args)
+ if is_enough_info == False: return sumstats
+ ############################################################################################
+ log.write(" -Reference genome FASTA file: "+ ref_path,verbose=verbose)
+ log.write(" -Checking records: ", end="",verbose=verbose)
  chromlist = get_chr_list(add_number=True)
  records = SeqIO.parse(ref_path, "fasta")
  for record in records:
@@ -268,13 +321,13 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
  else:
  i = record_chr
  if i in chromlist:
- if verbose: log.write(record_chr," ", end="",show_time=False)
+ log.write(record_chr," ", end="",show_time=False,verbose=verbose)
  to_check_ref = (sumstats[chrom]==i) & (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna())
  sumstats.loc[to_check_ref,status] = sumstats.loc[to_check_ref,[pos,ea,nea,status]].apply(lambda x:check_status(x,record),axis=1)

- if verbose: log.write("\n",end="",show_time=False)
+ log.write("\n",end="",show_time=False,verbose=verbose)

- sumstats.loc[:,status] = sumstats.loc[:,status].astype("string")
+ sumstats[status] = sumstats[status].astype("string")
  available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
  status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
  status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
@@ -284,26 +337,27 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
  #status_7=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[7]\w", case=False, flags=0, na=False))
  status_8=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w", case=False, flags=0, na=False))

- if verbose: log.write(" -Variants allele on given reference sequence : ",status_0)
- if verbose: log.write(" -Variants flipped : ",status_3)
+ log.write(" -Variants allele on given reference sequence : ",status_0,verbose=verbose)
+ log.write(" -Variants flipped : ",status_3,verbose=verbose)
  raw_matching_rate = (status_3+status_0)/available_to_check
  flip_rate = status_3/available_to_check
- if verbose: log.write(" -Raw Matching rate : ","{:.2f}%".format(raw_matching_rate*100))
+ log.write(" -Raw Matching rate : ","{:.2f}%".format(raw_matching_rate*100),verbose=verbose)
  if raw_matching_rate <0.8:
- if verbose: log.write(" -!!!Warning, matching rate is low, please check if the right reference genome is used.")
+ log.warning("Matching rate is low, please check if the right reference genome is used.")
  if flip_rate > 0.85 :
- if verbose: log.write(" -Flipping variants rate > 0.85, it is likely that the EA is aligned with REF in the original dataset.")
+ log.write(" -Flipping variants rate > 0.85, it is likely that the EA is aligned with REF in the original dataset.",verbose=verbose)

- if verbose: log.write(" -Variants inferred reverse_complement : ",status_4)
- if verbose: log.write(" -Variants inferred reverse_complement_flipped : ",status_5)
- if verbose: log.write(" -Both allele on genome + unable to distinguish : ",status_6)
- #if verbose: log.write(" -Reverse_complementary + both allele on genome + unable to distinguish: ",status_7)
- if verbose: log.write(" -Variants not on given reference sequence : ",status_8)
+ log.write(" -Variants inferred reverse_complement : ",status_4,verbose=verbose)
+ log.write(" -Variants inferred reverse_complement_flipped : ",status_5,verbose=verbose)
+ log.write(" -Both allele on genome + unable to distinguish : ",status_6,verbose=verbose)
+ #log.write(" -Reverse_complementary + both allele on genome + unable to distinguish: ",status_7)
+ log.write(" -Variants not on given reference sequence : ",status_8,verbose=verbose)

  if remove is True:
  sumstats = sumstats.loc[~sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w"),:]
- if verbose: log.write(" -Variants not on given reference sequence were removed.")
- gc.collect()
+ log.write(" -Variants not on given reference sequence were removed.",verbose=verbose)
+
+ finished(log, verbose, _end_line)
  return sumstats

  #######################################################################################################################################
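For readers unfamiliar with gwaslab's STATUS codes: the counting above works by matching individual digits of the 7-character STATUS string with regular expressions, here the 6th digit produced by checkref (0 = allele matches the reference sequence, 3 = flipped, 4/5 = reverse complement, 6 = ambiguous, 8 = not on the reference, as listed in the log messages in this diff). A small self-contained illustration of the same str.match idiom on a toy column:

import pandas as pd

status = pd.Series(["9999909", "9999939", "9999989"], dtype="string")

# Count variants whose 6th STATUS digit is 8 ("not on given reference
# sequence"), using the same str.match pattern style as the code above.
not_on_reference = status.str.match(r"\w\w\w\w\w[8]\w", na=False)
print(int(not_on_reference.sum()))  # -> 1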
@@ -333,7 +387,7 @@ def assign_rsid_single(sumstats,path,rsid="rsID",chr="CHR",pos="POS",ref="NEA",a
  ## single df assignment
  vcf_reader = VariantFile(path)
  def rsid_helper(x,vcf_reader,chr_dict):
- return chrposref_rsid(x[0],x[1],x[2],x[3],vcf_reader,chr_dict)
+ return chrposref_rsid(x.iloc[0],x.iloc[1],x.iloc[2],x.iloc[3],vcf_reader,chr_dict)
  map_func=partial(rsid_helper,vcf_reader=vcf_reader,chr_dict=chr_dict)
  rsID = sumstats.apply(map_func,axis=1)
  return rsID
@@ -346,19 +400,31 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
  all , overwrite rsid for all availalbe rsid
  invalid, only assign rsid for variants with invalid rsid
  empty only assign rsid for variants with na rsid
- '''
+ '''
+
  if ref_mode=="vcf":
  ###################################################################################################################
- if verbose: log.write("Start to assign rsID using vcf...{}".format(_get_version()))
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
- if verbose: log.write(" -CPU Cores to use :",n_cores)
- if verbose: log.write(" -Reference VCF file:", path)
-
+ ##start function with col checking##########################################################
+ _start_line = "assign rsID using reference VCF"
+ _end_line = "assign rsID using reference file"
+ _start_cols = [chr,pos,ref,alt,status]
+ _start_function = ".assign_rsid()"
+ _must_args ={}
+
+ is_enough_info = start_to(sumstats=sumstats,
+ log=log,
+ verbose=verbose,
+ start_line=_start_line,
+ end_line=_end_line,
+ start_cols=_start_cols,
+ start_function=_start_function,
+ n_cores=n_cores,
+ ref_vcf=path,
+ **_must_args)
+ if is_enough_info == False: return sumstats
+ ############################################################################################
  chr_dict = auto_check_vcf_chr_dict(path, chr_dict, verbose, log)
-
- if verbose: log.write(" -Assigning rsID based on chr:pos and ref:alt/alt:ref...")
-
-
+ log.write(" -Assigning rsID based on CHR:POS and REF:ALT/ALT:REF...",verbose=verbose)
  ##############################################
  if rsid not in sumstats.columns:
  sumstats[rsid]=pd.Series(dtype="string")
@@ -380,7 +446,8 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI

  if sum(to_assign)>0:
  if sum(to_assign)<10000: n_cores=1
- df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
+ #df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
+ df_split = _df_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
  pool = Pool(n_cores)
  map_func = partial(assign_rsid_single,path=path,chr=chr,pos=pos,ref=ref,alt=alt,chr_dict=chr_dict)
  assigned_rsid = pd.concat(pool.map(map_func,df_split))
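np.array_split applied directly to a DataFrame goes through the deprecated DataFrame.swapaxes path in recent pandas releases and raises a FutureWarning, which is presumably why the call above (and the analogous ones further down) now uses the gwaslab-internal `_df_split` imported from qc_fix_sumstats. Its implementation is not part of this excerpt; a hypothetical equivalent that splits by row position instead would be:

import numpy as np
import pandas as pd

def _df_split_sketch(df: pd.DataFrame, n: int) -> list:
    # Hypothetical stand-in for gwaslab's _df_split (real code not shown in
    # this diff): split a DataFrame into n roughly equal row-wise chunks.
    # Splitting the positional index and slicing with iloc avoids calling
    # np.array_split on the frame itself.
    return [df.iloc[chunk] for chunk in np.array_split(np.arange(len(df)), n)]

df = pd.DataFrame({"CHR": [1, 1, 2, 2, 3], "POS": [100, 200, 300, 400, 500]})
parts = _df_split_sketch(df, 2)
print([len(p) for p in parts])  # -> [3, 2]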
@@ -391,40 +458,57 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
  ##################################################################################################################

  after_number = sum(~sumstats[rsid].isna())
- if verbose: log.write(" -rsID Annotation for "+str(total_number - after_number) +" need to be fixed!")
- if verbose: log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!")
+ log.write(" -rsID Annotation for "+str(total_number - after_number) +" need to be fixed!",verbose=verbose)
+ log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!",verbose=verbose)

  ##################################################################################################################
  elif ref_mode=="tsv":
  '''
  assign rsID based on chr:pos
  '''
- if verbose: log.write("Start to annotate rsID based on chromosome and position information...{}".format(_get_version()))
- check_dataframe_shape(sumstats, log, verbose)
- if verbose: log.write(" -SNPID-rsID text file: "+ path)
+ ##start function with col checking##########################################################
+ _start_line = "assign rsID by matching SNPID with CHR:POS:REF:ALT in the reference TSV"
+ _end_line = "assign rsID using reference file"
+ _start_cols = [snpid,status]
+ _start_function = ".assign_rsid()"
+ _must_args ={}
+
+ is_enough_info = start_to(sumstats=sumstats,
+ log=log,
+ verbose=verbose,
+ start_line=_start_line,
+ end_line=_end_line,
+ start_cols=_start_cols,
+ start_function=_start_function,
+ n_cores=n_cores,
+ ref_tsv=path,
+ **_must_args)
+ if is_enough_info == False: return sumstats
+ ############################################################################################

- standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234][0126]\w", case=False, flags=0, na=False)
+ standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)

  if rsid not in sumstats.columns:
  sumstats[rsid]=pd.Series(dtype="string")

  if overwrite == "empty":
- to_assign = sumstats[rsid].isna()
+ to_assign = sumstats[rsid].isna() & standardized_normalized
  if overwrite=="all":
  to_assign = standardized_normalized
  if overwrite=="invalid":
  to_assign = (~sumstats[rsid].str.match(r'rs([0-9]+)', case=False, flags=0, na=False)) & standardized_normalized
+
  total_number= len(sumstats)
  pre_number = sum(~sumstats[rsid].isna())
- if verbose: log.write(" -"+str(sum(to_assign)) +" rsID could be possibly fixed...")
+ log.write(" -"+str(sum(to_assign)) +" rsID could be possibly fixed...",verbose=verbose)
  if sum(to_assign)>0:
  sumstats = sumstats.set_index(snpid)
  dic_chuncks = pd.read_csv(path,sep="\t",usecols=[ref_snpid,ref_rsid],
  chunksize=chunksize,index_col=ref_snpid,
  dtype={ref_snpid:"string",ref_rsid:"string"})

- if verbose: log.write(" -Setting block size: ",chunksize)
- if verbose: log.write(" -Loading block: ",end="")
+ log.write(" -Setting block size: ",chunksize,verbose=verbose)
+ log.write(" -Loading block: ",end="",verbose=verbose)
  for i,dic in enumerate(dic_chuncks):
  gc.collect()
  log.write(i," ",end=" ",show_time=False)
@@ -433,17 +517,18 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
  dic = dic.loc[~dic.index.duplicated(keep=False),:]
  sumstats.update(dic,overwrite=True)

- if verbose: log.write("\n",end="",show_time=False)
+ log.write("\n",end="",show_time=False,verbose=verbose)
  sumstats = sumstats.reset_index()
  sumstats = sumstats.rename(columns = {'index':snpid})

  after_number = sum(~sumstats[rsid].isna())
- if verbose: log.write(" -rsID Annotation for "+str(total_number - after_number) +" need to be fixed!")
- if verbose: log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!")
+ log.write(" -rsID annotation for "+str(total_number - after_number) +" needed to be fixed!",verbose=verbose)
+ log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!",verbose=verbose)
  else:
- if verbose: log.write(" -No rsID could be fixed...skipping...")
+ log.write(" -No rsID can be fixed...skipping...",verbose=verbose)
  ################################################################################################################
- gc.collect()
+
+ finished(log,verbose,_end_line)
  return sumstats
  #################################################################################################################################################
  #single record assignment
@@ -522,12 +607,12 @@ def is_palindromic(sumstats,a1="EA",a2="NEA"):

  def check_strand(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS"):
  vcf_reader = VariantFile(ref_infer)
- status_part = sumstats.apply(lambda x:check_strand_status(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,x[5],chr_dict),axis=1)
+ status_part = sumstats.apply(lambda x:check_strand_status(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict),axis=1)
  return status_part

  def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS",daf_tolerance=0.2):
  vcf_reader = VariantFile(ref_infer)
- status_part = sumstats.apply(lambda x:check_unkonwn_indel(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,x[5],chr_dict,daf_tolerance),axis=1)
+ status_part = sumstats.apply(lambda x:check_unkonwn_indel(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict,daf_tolerance),axis=1)
  return status_part

  ##################################################################################################################################################
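The `x[0]` → `x.iloc[0]` changes in the two lambdas above (and in check_status, rsid_helper, and the afapply helpers elsewhere in this file) make positional access explicit: indexing a pandas Series with a plain integer is treated as label-based lookup, and the positional fallback for `Series[int]` is deprecated in recent pandas. A short illustration:

import pandas as pd

row = pd.Series([1, 123456, "A", "T", 0.12, "9999999"],
                index=["CHR", "POS", "NEA", "EA", "EAF", "STATUS"])

# row[1] relies on the deprecated positional fallback of Series[int]
# (FutureWarning on recent pandas, slated to become label-based lookup);
# .iloc states the positional intent explicitly.
pos = row.iloc[1]   # -> 123456
nea = row.iloc[2]   # -> "A"
ea = row.iloc[3]    # -> "T"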
@@ -535,121 +620,141 @@ def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NE
  def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,daf_tolerance=0.20,remove_snp="",mode="pi",n_cores=1,remove_indel="",
  chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
  chr_dict=None,verbose=True,log=Log()):
- if verbose: log.write("Start to infer strand for palindromic SNPs...{}".format(_get_version()))
- check_dataframe_shape(sumstats, log, verbose)
- if verbose: log.write(" -Reference vcf file:", ref_infer)
+ ##start function with col checking##########################################################
+ _start_line = "infer strand for palindromic SNPs/align indistinguishable indels"
+ _end_line = "inferring strand for palindromic SNPs/align indistinguishable indels"
+ _start_cols = [chr,pos,ref,alt,eaf,status]
+ _start_function = ".infer_strand()"
+ _must_args ={"ref_alt_freq":ref_alt_freq}
+
+ is_enough_info = start_to(sumstats=sumstats,
+ log=log,
+ verbose=verbose,
+ start_line=_start_line,
+ end_line=_end_line,
+ start_cols=_start_cols,
+ start_function=_start_function,
+ n_cores=n_cores,
+ ref_vcf=ref_infer,
+ **_must_args)
+ if is_enough_info == False: return sumstats
+ ############################################################################################

  chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
+
+ log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)

- # check if the columns are complete
- if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
- raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
  if "p" in mode:
- # ref_alt_freq INFO in vcf was provided
- if ref_alt_freq is not None:
-
- if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
- ## checking \w\w\w\w[0]\w\w -> standardized and normalized snp
- good_chrpos = sumstats[status].str.match(r'\w\w\w[0][0]\w\w', case=False, flags=0, na=False)
- palindromic = good_chrpos & is_palindromic(sumstats[[ref,alt]],a1=ref,a2=alt)
- not_palindromic_snp = good_chrpos & (~palindromic)
-
- ##not palindromic : change status
- sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
- if verbose: log.write(" -Identified ", sum(palindromic)," palindromic SNPs...")
-
- #palindromic but can not infer
- maf_can_infer = (sumstats.loc[:,eaf] < maf_threshold) | (sumstats.loc[:,eaf] > 1 - maf_threshold)
-
- sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
-
- #palindromic WITH UNKNWON OR UNCHECKED STATUS
- unknow_palindromic = sumstats[status].str.match(r'\w\w\w\w\w[012][89]', case=False, flags=0, na=False)
+ ## checking \w\w\w\w[0]\w\w -> standardized and normalized snp
+ good_chrpos = sumstats[status].str.match(r'\w\w\w[0][0]\w\w', case=False, flags=0, na=False)
+ palindromic = good_chrpos & is_palindromic(sumstats[[ref,alt]],a1=ref,a2=alt)
+ not_palindromic_snp = good_chrpos & (~palindromic)
+
+ ##not palindromic : change status
+ sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
+ log.write(" -Identified ", sum(palindromic)," palindromic SNPs...",verbose=verbose)
+
+ #palindromic but can not infer
+ maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
+
+ sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
+
+ #palindromic WITH UNKNWON OR UNCHECKED STATUS
+ unknow_palindromic = sumstats[status].str.match(r'\w\w\w\w\w[012][89]', case=False, flags=0, na=False)

- unknow_palindromic_to_check = palindromic & maf_can_infer & unknow_palindromic
-
- if verbose: log.write(" -After filtering by MAF< {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknow_palindromic_to_check)))
+ unknow_palindromic_to_check = palindromic & maf_can_infer & unknow_palindromic
+
+ log.write(" -After filtering by MAF< {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknow_palindromic_to_check)),verbose=verbose)

- #########################################################################################
- if sum(unknow_palindromic_to_check)>0:
- if sum(unknow_palindromic_to_check)<10000:
- n_cores=1
- df_split = np.array_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
- pool = Pool(n_cores)
- map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
- status_inferred = pd.concat(pool.map(map_func,df_split))
- sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
+ #########################################################################################
+ if sum(unknow_palindromic_to_check)>0:
+ if sum(unknow_palindromic_to_check)<10000:
+ n_cores=1
+
+ #df_split = np.array_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
+ df_split = _df_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
+ pool = Pool(n_cores)
+ map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
+ status_inferred = pd.concat(pool.map(map_func,df_split))
+ sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
  pool.close()
  pool.join()
- #########################################################################################
- #0 Not palindromic SNPs
- #1 Palindromic +strand -> no need to flip
- #2 palindromic -strand -> need to flip -> fixed
- #3 Indel no need flip
- #4 Unknown Indel -> fixed
- #5 Palindromic -strand -> need to flip
- #6 Indel need flip
- #7 indistinguishable
- #8 Not matching or No information
- #9 Unchecked
-
- status0 = sumstats[status].str.match(r'\w\w\w\w\w\w[0]', case=False, flags=0, na=False)
- status1 = sumstats[status].str.match(r'\w\w\w\w\w\w[1]', case=False, flags=0, na=False)
- status5 = sumstats[status].str.match(r'\w\w\w\w\w\w[5]', case=False, flags=0, na=False)
- status7 = sumstats[status].str.match(r'\w\w\w\w\w\w[7]', case=False, flags=0, na=False)
- status8 = sumstats[status].str.match(r'\w\w\w\w\w[123][8]', case=False, flags=0, na=False)
-
- if verbose: log.write(" -Non-palindromic : ",sum(status0))
- if verbose: log.write(" -Palindromic SNPs on + strand: ",sum(status1))
- if verbose: log.write(" -Palindromic SNPs on - strand and need to be flipped:",sum(status5))
- if verbose: log.write(" -Palindromic SNPs with maf not available to infer : ",sum(status7))
- if verbose: log.write(" -Palindromic SNPs with no macthes or no information : ",sum(status8))
-
- if ("7" in remove_snp) and ("8" in remove_snp) :
- if verbose: log.write(" -Palindromic SNPs with maf not available to infer and with no macthes or no information will will be removed")
- sumstats = sumstats.loc[~(status7 | status8),:].copy()
- elif "8" in remove_snp:
- if verbose: log.write(" -Palindromic SNPs with no macthes or no information will be removed")
- sumstats = sumstats.loc[~status8,:].copy()
- elif "7" in remove_snp:
- if verbose: log.write(" -Palindromic SNPs with maf not available to infer will be removed")
- sumstats = sumstats.loc[~status7,:].copy()
+ else:
+ log.warning("No palindromic variants available for checking.")
+ #########################################################################################
+ #0 Not palindromic SNPs
+ #1 Palindromic +strand -> no need to flip
+ #2 palindromic -strand -> need to flip -> fixed
+ #3 Indel no need flip
+ #4 Unknown Indel -> fixed
+ #5 Palindromic -strand -> need to flip
+ #6 Indel need flip
+ #7 indistinguishable
+ #8 Not matching or No information
+ #9 Unchecked
+
+ status0 = sumstats[status].str.match(r'\w\w\w\w\w\w[0]', case=False, flags=0, na=False)
+ status1 = sumstats[status].str.match(r'\w\w\w\w\w\w[1]', case=False, flags=0, na=False)
+ status5 = sumstats[status].str.match(r'\w\w\w\w\w\w[5]', case=False, flags=0, na=False)
+ status7 = sumstats[status].str.match(r'\w\w\w\w\w\w[7]', case=False, flags=0, na=False)
+ status8 = sumstats[status].str.match(r'\w\w\w\w\w[123][8]', case=False, flags=0, na=False)
+
+ log.write(" -Non-palindromic : ",sum(status0),verbose=verbose)
+ log.write(" -Palindromic SNPs on + strand: ",sum(status1),verbose=verbose)
+ log.write(" -Palindromic SNPs on - strand and needed to be flipped:",sum(status5),verbose=verbose)
+ log.write(" -Palindromic SNPs with MAF not available to infer : ",sum(status7),verbose=verbose)
+ log.write(" -Palindromic SNPs with no macthes or no information : ",sum(status8),verbose=verbose)
+
+ if ("7" in remove_snp) and ("8" in remove_snp) :
+ log.write(" -Palindromic SNPs with MAF not available to infer and with no macthes or no information will will be removed",verbose=verbose)
+ sumstats = sumstats.loc[~(status7 | status8),:].copy()
+ elif "8" in remove_snp:
+ log.write(" -Palindromic SNPs with no macthes or no information will be removed",verbose=verbose)
+ sumstats = sumstats.loc[~status8,:].copy()
+ elif "7" in remove_snp:
+ log.write(" -Palindromic SNPs with MAF not available to infer will be removed",verbose=verbose)
+ sumstats = sumstats.loc[~status7,:].copy()

  ### unknow_indel
  if "i" in mode:
  unknow_indel = sumstats[status].str.match(r'\w\w\w\w\w[6][89]', case=False, flags=0, na=False)
- if verbose: log.write(" -Identified ", sum(unknow_indel)," indistinguishable Indels...")
+ log.write(" -Identified ", sum(unknow_indel)," indistinguishable Indels...",verbose=verbose)
  if sum(unknow_indel)>0:
- if verbose: log.write(" -Indistinguishable indels will be inferred from reference vcf ref and alt...")
+ log.write(" -Indistinguishable indels will be inferred from reference vcf REF and ALT...",verbose=verbose)
  #########################################################################################
  #with maf can not infer
- #maf_can_infer = (sumstats.loc[:,eaf] < maf_threshold) | (sumstats.loc[:,eaf] > 1 - maf_threshold)
+ #maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
  #sumstats.loc[unknow_indel&(~maf_can_infer),status] = vchange_status(sumstats.loc[unknow_indel&(~maf_can_infer),status],7,"9","8")
- if verbose: log.write(" -DAF tolerance: {}".format(daf_tolerance))
+ log.write(" -Difference in allele frequency (DAF) tolerance: {}".format(daf_tolerance),verbose=verbose)

  if sum(unknow_indel)>0:
  if sum(unknow_indel)<10000:
  n_cores=1
- df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
+ #df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
+ df_split = _df_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
  pool = Pool(n_cores)
  map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
  status_inferred = pd.concat(pool.map(map_func,df_split))
  sumstats.loc[unknow_indel,status] = status_inferred.values
- pool.close()
- pool.join()
+ pool.close()
+ pool.join()
+
  #########################################################################################

  status3 = sumstats[status].str.match(r'\w\w\w\w\w\w[3]', case=False, flags=0, na=False)
  status6 = sumstats[status].str.match(r'\w\w\w\w\w\w[6]', case=False, flags=0, na=False)
  status8 = sumstats[status].str.match(r'\w\w\w\w\w[6][8]', case=False, flags=0, na=False)

- if verbose: log.write(" -Indels ea/nea match reference : ",sum(status3))
- if verbose: log.write(" -Indels ea/nea need to be flipped : ",sum(status6))
- if verbose: log.write(" -Indels with no macthes or no information : ",sum(status8))
+ log.write(" -Indels ea/nea match reference : ",sum(status3),verbose=verbose)
+ log.write(" -Indels ea/nea need to be flipped : ",sum(status6),verbose=verbose)
+ log.write(" -Indels with no macthes or no information : ",sum(status8),verbose=verbose)
  if "8" in remove_indel:
- if verbose: log.write(" -Indels with no macthes or no information will be removed")
- sumstats = sumstats.loc[~status8,:].copy()
- gc.collect()
+ log.write(" -Indels with no macthes or no information will be removed",verbose=verbose)
+ sumstats = sumstats.loc[~status8,:].copy()
+ else:
+ log.warning("No indistinguishable indels available for checking.")
+
+ finished(log,verbose,_end_line)
  return sumstats


@@ -673,31 +778,45 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,

  ################################################################################################################
  def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,column_name="DAF",suffix="",n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
-
- if verbose: log.write("Start to check the difference between EAF and reference vcf alt frequency ...{}".format(_get_version()))
- check_dataframe_shape(sumstats, log, verbose)
- if verbose: log.write(" -Reference vcf file:", ref_infer)
- if verbose: log.write(" -CPU Cores to use :",n_cores)
-
+ ##start function with col checking##########################################################
+ _start_line = "check the difference between EAF and reference VCF ALT frequency"
+ _end_line = "checking the difference between EAF and reference VCF ALT frequency"
+ _start_cols = [chr,pos,ref,alt,eaf,status]
+ _start_function = ".check_daf()"
+ _must_args ={"ref_alt_freq":ref_alt_freq}
+
+ is_enough_info = start_to(sumstats=sumstats,
+ log=log,
+ verbose=verbose,
+ start_line=_start_line,
+ end_line=_end_line,
+ start_cols=_start_cols,
+ start_function=_start_function,
+ n_cores=n_cores,
+ ref_vcf=ref_infer,
+ **_must_args)
+ if is_enough_info == False: return sumstats
+ ############################################################################################
+
  chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)

  column_name = column_name + suffix
- # check if the columns are complete
- if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
- raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")

+
+
  # ref_alt_freq INFO in vcf was provided
  if ref_alt_freq is not None:
- if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
+ log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
  if not force:
  good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
- if verbose: log.write(" -Checking variants:", sum(good_chrpos))
+ log.write(" -Checking variants:", sum(good_chrpos),verbose=verbose)
  sumstats[column_name]=np.nan

  ########################
  if sum(~sumstats[eaf].isna())<10000:
  n_cores=1
- df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
+ #df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
+ df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
  pool = Pool(n_cores)
  if sum(~sumstats[eaf].isna())>0:
  map_func = partial(checkaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,column_name=column_name,chr_dict=chr_dict)
@@ -708,25 +827,25 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
  #status_inferred = sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]].apply(lambda x:check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict),axis=1)

  #sumstats.loc[good_chrpos,"DAF"] = status_inferred.values
- #sumstats.loc[:,"DAF"]=sumstats.loc[:,"DAF"].astype("float")
- if verbose: log.write(" - {} max:".format(column_name), np.nanmax(sumstats.loc[:,column_name]))
- if verbose: log.write(" - {} min:".format(column_name), np.nanmin(sumstats.loc[:,column_name]))
- if verbose: log.write(" - {} sd:".format(column_name), np.nanstd(sumstats.loc[:,column_name]))
- if verbose: log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats.loc[:,column_name])))
- if verbose: log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats.loc[:,column_name])))
- if verbose: log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats.loc[:,column_name])))
- if verbose: log.write("Finished allele frequency checking!")
+ #sumstats["DAF"]=sumstats["DAF"].astype("float")
+ log.write(" - {} max:".format(column_name), np.nanmax(sumstats[column_name]),verbose=verbose)
+ log.write(" - {} min:".format(column_name), np.nanmin(sumstats[column_name]),verbose=verbose)
+ log.write(" - {} sd:".format(column_name), np.nanstd(sumstats[column_name]),verbose=verbose)
+ log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats[column_name])),verbose=verbose)
+ log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats[column_name])),verbose=verbose)
+ log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats[column_name])),verbose=verbose)
+ log.write("Finished allele frequency checking!")
  return sumstats

  def checkaf(sumstats,ref_infer,ref_alt_freq=None,column_name="DAF",chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
  #vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
  vcf_reader = VariantFile(ref_infer)
  def afapply(x,vcf,alt_freq,chr_dict):
- return check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict)
+ return check_daf(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,chr_dict)
  map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
  status_inferred = sumstats.apply(map_func,axis=1)
- sumstats.loc[:,column_name] = status_inferred.values
- sumstats.loc[:,column_name]=sumstats.loc[:,column_name].astype("float")
+ sumstats[column_name] = status_inferred.values
+ sumstats[column_name]=sumstats[column_name].astype("float")
  return sumstats

  def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
@@ -741,33 +860,44 @@ def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
  ################################################################################################################

  def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
-
- if verbose: log.write("Start to infer the AF and reference vcf alt frequency ...{}".format(_get_version()))
- check_dataframe_shape(sumstats, log, verbose)
- if verbose: log.write(" -Reference vcf file:", ref_infer)
- if verbose: log.write(" -CPU Cores to use :",n_cores)
-
+ ##start function with col checking##########################################################
+ _start_line = "infer EAF using reference VCF ALT frequency"
+ _end_line = "inferring EAF using reference VCF ALT frequency"
+ _start_cols = [chr,pos,ref,alt,eaf,status]
+ _start_function = ".infer_af()"
+ _must_args ={"ref_alt_freq":ref_alt_freq}
+
+ is_enough_info = start_to(sumstats=sumstats,
+ log=log,
+ verbose=verbose,
+ start_line=_start_line,
+ end_line=_end_line,
+ start_cols=_start_cols,
+ start_function=_start_function,
+ n_cores=n_cores,
+ ref_vcf=ref_infer,
+ **_must_args)
+ if is_enough_info == False: return sumstats
+ ############################################################################################
  chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
-
- # check if the columns are complete
- if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
- raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")

  if eaf not in sumstats.columns:
  sumstats[eaf]=np.nan

  prenumber = sum(sumstats[eaf].isna())
+
  # ref_alt_freq INFO in vcf was provided
  if ref_alt_freq is not None:
- if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
+ log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
  if not force:
  good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
- if verbose: log.write(" -Checking variants:", sum(good_chrpos))
+ log.write(" -Checking variants:", sum(good_chrpos),verbose=verbose)

  ########################
  if sum(sumstats[eaf].isna())<10000:
  n_cores=1
- df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
+ #df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
+ df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
  pool = Pool(n_cores)
  map_func = partial(inferaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
  sumstats.loc[good_chrpos,[eaf]] = pd.concat(pool.map(map_func,df_split))
@@ -776,20 +906,21 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
  ###########################

  afternumber = sum(sumstats[eaf].isna())
- if verbose: log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber))
- if verbose: log.write(" -EAF is still missing for {} variants.".format(afternumber))
- if verbose: log.write("Finished allele frequency inferring!")
+ log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber),verbose=verbose)
+ log.write(" -EAF is still missing for {} variants.".format(afternumber),verbose=verbose)
+
+ finished(log,verbose,_end_line)
  return sumstats

  def inferaf(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
  #vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
  vcf_reader = VariantFile(ref_infer)
  def afapply(x,vcf,alt_freq,chr_dict):
- return infer_af(x[0],x[1]-1,x[1],x[2],x[3],vcf_reader,ref_alt_freq,chr_dict)
+ return infer_af(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],vcf_reader,ref_alt_freq,chr_dict)
  map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
  status_inferred = sumstats.apply(map_func,axis=1)
- sumstats.loc[:,eaf] = status_inferred.values
- sumstats.loc[:,eaf]=sumstats.loc[:,eaf].astype("float")
+ sumstats[eaf] = status_inferred.values
+ sumstats[eaf]=sumstats[eaf].astype("float")
  return sumstats

  def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
@@ -810,13 +941,13 @@ def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
  def auto_check_vcf_chr_dict(vcf_path, vcf_chr_dict, verbose, log):
  if vcf_path is not None:
  if vcf_chr_dict is None:
- if verbose: log.write(" -Checking prefix for chromosomes in vcf files..." )
+ log.write(" -Checking prefix for chromosomes in vcf files..." ,verbose=verbose)
  prefix = check_vcf_chr_prefix(vcf_path)
  if prefix is not None:
- if verbose: log.write(" -Prefix for chromosomes: ",prefix)
+ log.write(" -Prefix for chromosomes: ",prefix)
  vcf_chr_dict = get_number_to_chr(prefix=prefix)
  else:
- if verbose: log.write(" -No prefix for chromosomes in the VCF files." )
+ log.write(" -No prefix for chromosomes in the VCF files." ,verbose=verbose)
  vcf_chr_dict = get_number_to_chr()
  return vcf_chr_dict

@@ -827,4 +958,6 @@ def check_vcf_chr_prefix(vcf_bcf_path):
  if m is not None:
  return m.group(1)
  else:
- return None
+ return None
+
+