gwaslab 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in the supported public registry they were published to. It is provided for informational purposes only.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (42)
  1. gwaslab/__init__.py +1 -1
  2. gwaslab/data/formatbook.json +722 -721
  3. gwaslab/g_Log.py +8 -0
  4. gwaslab/g_Sumstats.py +80 -178
  5. gwaslab/g_SumstatsPair.py +6 -2
  6. gwaslab/g_Sumstats_summary.py +3 -3
  7. gwaslab/g_meta.py +13 -3
  8. gwaslab/g_version.py +2 -2
  9. gwaslab/hm_casting.py +29 -15
  10. gwaslab/hm_harmonize_sumstats.py +312 -159
  11. gwaslab/hm_rsid_to_chrpos.py +1 -1
  12. gwaslab/io_preformat_input.py +46 -37
  13. gwaslab/io_to_formats.py +428 -295
  14. gwaslab/qc_check_datatype.py +15 -1
  15. gwaslab/qc_fix_sumstats.py +956 -719
  16. gwaslab/util_ex_calculate_ldmatrix.py +29 -11
  17. gwaslab/util_ex_gwascatalog.py +1 -1
  18. gwaslab/util_ex_ldproxyfinder.py +1 -1
  19. gwaslab/util_ex_process_h5.py +26 -17
  20. gwaslab/util_ex_process_ref.py +3 -3
  21. gwaslab/util_ex_run_coloc.py +26 -4
  22. gwaslab/util_in_convert_h2.py +1 -1
  23. gwaslab/util_in_fill_data.py +44 -5
  24. gwaslab/util_in_filter_value.py +122 -34
  25. gwaslab/util_in_get_density.py +2 -2
  26. gwaslab/util_in_get_sig.py +41 -9
  27. gwaslab/viz_aux_quickfix.py +26 -21
  28. gwaslab/viz_aux_reposition_text.py +7 -4
  29. gwaslab/viz_aux_save_figure.py +6 -5
  30. gwaslab/viz_plot_compare_af.py +5 -5
  31. gwaslab/viz_plot_compare_effect.py +22 -5
  32. gwaslab/viz_plot_miamiplot2.py +28 -20
  33. gwaslab/viz_plot_mqqplot.py +214 -98
  34. gwaslab/viz_plot_qqplot.py +11 -8
  35. gwaslab/viz_plot_regionalplot.py +16 -9
  36. gwaslab/viz_plot_trumpetplot.py +15 -6
  37. {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/METADATA +3 -3
  38. gwaslab-3.4.38.dist-info/RECORD +72 -0
  39. gwaslab-3.4.36.dist-info/RECORD +0 -72
  40. {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
  41. {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
  42. {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
@@ -11,11 +11,19 @@ import gc
11
11
  from gwaslab.g_Log import Log
12
12
  from gwaslab.qc_fix_sumstats import fixchr
13
13
  from gwaslab.qc_fix_sumstats import fixpos
14
+ from gwaslab.qc_fix_sumstats import sortcolumn
15
+ from gwaslab.qc_fix_sumstats import _df_split
16
+ from gwaslab.qc_fix_sumstats import check_col
17
+ from gwaslab.qc_fix_sumstats import start_to
18
+ from gwaslab.qc_fix_sumstats import finished
19
+ from gwaslab.qc_fix_sumstats import skipped
20
+ from gwaslab.qc_check_datatype import check_dataframe_shape
14
21
  from gwaslab.bd_common_data import get_number_to_chr
15
22
  from gwaslab.bd_common_data import get_chr_list
16
23
  from gwaslab.bd_common_data import get_chr_to_number
17
24
  from gwaslab.g_vchange_status import vchange_status
18
25
  from gwaslab.g_version import _get_version
26
+
19
27
  #rsidtochrpos
20
28
  #checkref
21
29
  #parallelizeassignrsid
@@ -27,17 +35,35 @@ from gwaslab.g_version import _get_version
27
35
 
28
36
  ###~!!!!
29
37
  def rsidtochrpos(sumstats,
30
- path="", snpid="SNPID",
38
+ path=None, ref_rsid_to_chrpos_tsv=None, snpid="SNPID",
31
39
  rsid="rsID", chrom="CHR",pos="POS",ref_rsid="rsID",ref_chr="CHR",ref_pos="POS", build="19",
32
40
  overwrite=False,remove=False,chunksize=5000000,verbose=True,log=Log()):
33
41
  '''
34
42
  assign chr:pos based on rsID
35
43
  '''
36
- #########################################################################################################
37
- if verbose: log.write("Start to update chromosome and position information based on rsID...{}".format(_get_version()))
38
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
44
+ ##start function with col checking##########################################################
45
+ _start_line = "assign CHR and POS using rsIDs"
46
+ _end_line = "assigning CHR and POS using rsIDs"
47
+ _start_cols = [rsid,chrom,pos]
48
+ _start_function = ".rsid_to_chrpos()"
49
+ _must_args ={}
50
+
51
+ is_enough_info = start_to(sumstats=sumstats,
52
+ log=log,
53
+ verbose=verbose,
54
+ start_line=_start_line,
55
+ end_line=_end_line,
56
+ start_cols=_start_cols,
57
+ start_function=_start_function,
58
+ **_must_args)
59
+ if is_enough_info == False: return sumstats
60
+ ############################################################################################
61
+
39
62
  if verbose: log.write(" -rsID dictionary file: "+ path)
40
63
 
64
+ if ref_rsid_to_chrpos_tsv is not None:
65
+ path = ref_rsid_to_chrpos_tsv
66
+
41
67
  if snpid in sumstats.columns and sum(sumstats[rsid].isna())>0:
42
68
  if verbose: log.write(" -Filling na in rsID columns with SNPID...")
43
69
  sumstats.loc[sumstats[rsid].isna(),rsid] = sumstats.loc[sumstats[rsid].isna(),snpid]
@@ -75,6 +101,9 @@ def rsidtochrpos(sumstats,
75
101
  if verbose: log.write(" -Updating CHR and POS finished.Start to re-fixing CHR and POS... ")
76
102
  sumstats = fixchr(sumstats,verbose=verbose)
77
103
  sumstats = fixpos(sumstats,verbose=verbose)
104
+ sumstats = sortcolumn(sumstats,verbose=verbose)
105
+
106
+ finished(log,verbose,_end_line)
78
107
  return sumstats
79
108
  ####################################################################################################
80
109
 
@@ -96,9 +125,34 @@ def merge_chrpos(sumstats_part,all_groups_max,path,build,status):
96
125
  return sumstats_part
97
126
 
98
127
 
99
- def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None,build="99",status="STATUS",
128
+ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None, ref_rsid_to_chrpos_vcf = None, ref_rsid_to_chrpos_hdf5 = None, build="99",status="STATUS",
100
129
  n_cores=4,block_size=20000000,verbose=True,log=Log()):
101
- if verbose: log.write("Start to assign CHR and POS using rsIDs...{}".format(_get_version()))
130
+
131
+ ##start function with col checking##########################################################
132
+ _start_line = "assign CHR and POS using rsIDs"
133
+ _end_line = "assigning CHR and POS using rsIDs"
134
+ _start_cols = [rsid,chrom,pos]
135
+ _start_function = ".rsid_to_chrpos2()"
136
+ _must_args ={}
137
+
138
+ is_enough_info = start_to(sumstats=sumstats,
139
+ log=log,
140
+ verbose=verbose,
141
+ start_line=_start_line,
142
+ end_line=_end_line,
143
+ start_cols=_start_cols,
144
+ start_function=_start_function,
145
+ **_must_args)
146
+ if is_enough_info == False: return sumstats
147
+ ############################################################################################
148
+
149
+ if ref_rsid_to_chrpos_hdf5 is not None:
150
+ path = ref_rsid_to_chrpos_hdf5
151
+ elif ref_rsid_to_chrpos_vcf is not None:
152
+ vcf_file_name = os.path.basename(ref_rsid_to_chrpos_vcf)
153
+ vcf_dir_path = os.path.dirname(ref_rsid_to_chrpos_vcf)
154
+ path = "{}/{}.rsID_CHR_POS_groups_{}.h5".format(vcf_dir_path,vcf_file_name,int(block_size))
155
+
102
156
  if path is None:
103
157
  raise ValueError("Please provide path to hdf5 file.")
104
158
 
@@ -164,17 +218,20 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
164
218
  # merge back
165
219
  if verbose: log.write(" -Append data... ")
166
220
  sumstats = pd.concat([sumstats_rs,sumstats_nonrs],ignore_index=True)
221
+
167
222
  del sumstats_rs
168
223
  del sumstats_nonrs
169
224
  gc.collect()
170
225
 
171
226
  # check
172
- sumstats = fixchr(sumstats,verbose=True)
173
- sumstats = fixpos(sumstats,verbose=True)
227
+ sumstats = fixchr(sumstats,verbose=verbose)
228
+ sumstats = fixpos(sumstats,verbose=verbose)
229
+ sumstats = sortcolumn(sumstats,verbose=verbose)
230
+
174
231
  pool.close()
175
232
  pool.join()
176
- gc.collect()
177
- if verbose: log.write("Finished assigning CHR and POS using rsIDs.")
233
+
234
+ finished(log, verbose, _end_line)
178
235
  return sumstats
179
236
  ####################################################################################################################
180
237
  #20220426 check if non-effect allele is aligned with reference genome
@@ -192,15 +249,15 @@ def check_status(row,record):
192
249
  #8 / -----> not on ref genome
193
250
  #9 / ------> unchecked
194
251
 
195
- status_pre=row[3][:5]
196
- status_end=row[3][6:]
252
+ status_pre=row.iloc[3][:5]
253
+ status_end=row.iloc[3][6:]
197
254
 
198
255
  ## nea == ref
199
- if row[2] == record[row[0]-1: row[0]+len(row[2])-1].seq.upper():
256
+ if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
200
257
  ## ea == ref
201
- if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
258
+ if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
202
259
  ## len(nea) >len(ea):
203
- if len(row[2])!=len(row[1]):
260
+ if len(row.iloc[2])!=len(row.iloc[1]):
204
261
  # indels both on ref, unable to identify
205
262
  return status_pre+"6"+status_end
206
263
  else:
@@ -209,34 +266,49 @@ def check_status(row,record):
209
266
  ## nea!=ref
210
267
  else:
211
268
  # ea == ref_seq -> need to flip
212
- if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
269
+ if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
213
270
  return status_pre+"3"+status_end
214
271
  # ea !=ref
215
272
  else:
216
273
  #_reverse_complementary
217
- row[1] = get_reverse_complementary_allele(row[1])
218
- row[2] = get_reverse_complementary_allele(row[2])
274
+ row.iloc[1] = get_reverse_complementary_allele(row.iloc[1])
275
+ row.iloc[2] = get_reverse_complementary_allele(row.iloc[2])
219
276
  ## nea == ref
220
- if row[2] == record[row[0]-1: row[0]+len(row[2])-1].seq.upper():
277
+ if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
221
278
  ## ea == ref
222
- if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
279
+ if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
223
280
  ## len(nea) >len(ea):
224
- if len(row[2])!=len(row[1]):
281
+ if len(row.iloc[2])!=len(row.iloc[1]):
225
282
  return status_pre+"8"+status_end # indel reverse complementary
226
283
  else:
227
284
  return status_pre+"4"+status_end
228
285
  else:
229
286
  # ea == ref_seq -> need to flip
230
- if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
287
+ if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
231
288
  return status_pre+"5"+status_end
232
289
  # ea !=ref
233
290
  return status_pre+"8"+status_end
234
291
 
235
292
 
236
293
  def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="STATUS",chr_dict=get_chr_to_number(),remove=False,verbose=True,log=Log()):
237
- if verbose: log.write("Start to check if NEA is aligned with reference sequence...{}".format(_get_version()))
238
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
239
- if verbose: log.write(" -Reference genome fasta file: "+ ref_path)
294
+ ##start function with col checking##########################################################
295
+ _start_line = "check if NEA is aligned with reference sequence"
296
+ _end_line = "checking if NEA is aligned with reference sequence"
297
+ _start_cols = [chrom,pos,ea,nea,status]
298
+ _start_function = ".check_ref()"
299
+ _must_args ={}
300
+
301
+ is_enough_info = start_to(sumstats=sumstats,
302
+ log=log,
303
+ verbose=verbose,
304
+ start_line=_start_line,
305
+ end_line=_end_line,
306
+ start_cols=_start_cols,
307
+ start_function=_start_function,
308
+ **_must_args)
309
+ if is_enough_info == False: return sumstats
310
+ ############################################################################################
311
+ if verbose: log.write(" -Reference genome FASTA file: "+ ref_path)
240
312
  if verbose: log.write(" -Checking records: ", end="")
241
313
  chromlist = get_chr_list(add_number=True)
242
314
  records = SeqIO.parse(ref_path, "fasta")
@@ -255,7 +327,7 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
255
327
 
256
328
  if verbose: log.write("\n",end="",show_time=False)
257
329
 
258
- sumstats.loc[:,status] = sumstats.loc[:,status].astype("string")
330
+ sumstats[status] = sumstats[status].astype("string")
259
331
  available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
260
332
  status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
261
333
  status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
@@ -271,7 +343,7 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
271
343
  flip_rate = status_3/available_to_check
272
344
  if verbose: log.write(" -Raw Matching rate : ","{:.2f}%".format(raw_matching_rate*100))
273
345
  if raw_matching_rate <0.8:
274
- if verbose: log.write(" -!!!Warning, matching rate is low, please check if the right reference genome is used.")
346
+ if verbose: log.warning("Matching rate is low, please check if the right reference genome is used.")
275
347
  if flip_rate > 0.85 :
276
348
  if verbose: log.write(" -Flipping variants rate > 0.85, it is likely that the EA is aligned with REF in the original dataset.")
277
349
 
@@ -284,7 +356,8 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
284
356
  if remove is True:
285
357
  sumstats = sumstats.loc[~sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w"),:]
286
358
  if verbose: log.write(" -Variants not on given reference sequence were removed.")
287
- gc.collect()
359
+
360
+ finished(log, verbose, _end_line)
288
361
  return sumstats
289
362
 
290
363
  #######################################################################################################################################
@@ -314,7 +387,7 @@ def assign_rsid_single(sumstats,path,rsid="rsID",chr="CHR",pos="POS",ref="NEA",a
314
387
  ## single df assignment
315
388
  vcf_reader = VariantFile(path)
316
389
  def rsid_helper(x,vcf_reader,chr_dict):
317
- return chrposref_rsid(x[0],x[1],x[2],x[3],vcf_reader,chr_dict)
390
+ return chrposref_rsid(x.iloc[0],x.iloc[1],x.iloc[2],x.iloc[3],vcf_reader,chr_dict)
318
391
  map_func=partial(rsid_helper,vcf_reader=vcf_reader,chr_dict=chr_dict)
319
392
  rsID = sumstats.apply(map_func,axis=1)
320
393
  return rsID
@@ -327,19 +400,31 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
327
400
  all , overwrite rsid for all availalbe rsid
328
401
  invalid, only assign rsid for variants with invalid rsid
329
402
  empty only assign rsid for variants with na rsid
330
- '''
403
+ '''
404
+
331
405
  if ref_mode=="vcf":
332
406
  ###################################################################################################################
333
- if verbose: log.write("Start to assign rsID using vcf...{}".format(_get_version()))
334
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
335
- if verbose: log.write(" -CPU Cores to use :",n_cores)
336
- if verbose: log.write(" -Reference VCF file:", path)
337
-
407
+ ##start function with col checking##########################################################
408
+ _start_line = "assign rsID using reference VCF"
409
+ _end_line = "assign rsID using reference file"
410
+ _start_cols = [chr,pos,ref,alt,status]
411
+ _start_function = ".assign_rsid()"
412
+ _must_args ={}
413
+
414
+ is_enough_info = start_to(sumstats=sumstats,
415
+ log=log,
416
+ verbose=verbose,
417
+ start_line=_start_line,
418
+ end_line=_end_line,
419
+ start_cols=_start_cols,
420
+ start_function=_start_function,
421
+ n_cores=n_cores,
422
+ ref_vcf=path,
423
+ **_must_args)
424
+ if is_enough_info == False: return sumstats
425
+ ############################################################################################
338
426
  chr_dict = auto_check_vcf_chr_dict(path, chr_dict, verbose, log)
339
-
340
- if verbose: log.write(" -Assigning rsID based on chr:pos and ref:alt/alt:ref...")
341
-
342
-
427
+ if verbose: log.write(" -Assigning rsID based on CHR:POS and REF:ALT/ALT:REF...")
343
428
  ##############################################
344
429
  if rsid not in sumstats.columns:
345
430
  sumstats[rsid]=pd.Series(dtype="string")
@@ -361,7 +446,8 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
361
446
 
362
447
  if sum(to_assign)>0:
363
448
  if sum(to_assign)<10000: n_cores=1
364
- df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
449
+ #df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
450
+ df_split = _df_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
365
451
  pool = Pool(n_cores)
366
452
  map_func = partial(assign_rsid_single,path=path,chr=chr,pos=pos,ref=ref,alt=alt,chr_dict=chr_dict)
367
453
  assigned_rsid = pd.concat(pool.map(map_func,df_split))
@@ -380,9 +466,25 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
380
466
  '''
381
467
  assign rsID based on chr:pos
382
468
  '''
383
- if verbose: log.write("Start to annotate rsID based on chromosome and position information...{}".format(_get_version()))
384
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
385
- if verbose: log.write(" -SNPID-rsID text file: "+ path)
469
+ ##start function with col checking##########################################################
470
+ _start_line = "assign rsID by matching SNPID with CHR:POS:REF:ALT in the reference TSV"
471
+ _end_line = "assign rsID using reference file"
472
+ _start_cols = [snpid,status]
473
+ _start_function = ".assign_rsid()"
474
+ _must_args ={}
475
+
476
+ is_enough_info = start_to(sumstats=sumstats,
477
+ log=log,
478
+ verbose=verbose,
479
+ start_line=_start_line,
480
+ end_line=_end_line,
481
+ start_cols=_start_cols,
482
+ start_function=_start_function,
483
+ n_cores=n_cores,
484
+ ref_tsv=path,
485
+ **_must_args)
486
+ if is_enough_info == False: return sumstats
487
+ ############################################################################################
386
488
 
387
489
  standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234][0126]\w", case=False, flags=0, na=False)
388
490
 
@@ -390,11 +492,12 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
390
492
  sumstats[rsid]=pd.Series(dtype="string")
391
493
 
392
494
  if overwrite == "empty":
393
- to_assign = sumstats[rsid].isna()
495
+ to_assign = sumstats[rsid].isna() & standardized_normalized
394
496
  if overwrite=="all":
395
497
  to_assign = standardized_normalized
396
498
  if overwrite=="invalid":
397
499
  to_assign = (~sumstats[rsid].str.match(r'rs([0-9]+)', case=False, flags=0, na=False)) & standardized_normalized
500
+
398
501
  total_number= len(sumstats)
399
502
  pre_number = sum(~sumstats[rsid].isna())
400
503
  if verbose: log.write(" -"+str(sum(to_assign)) +" rsID could be possibly fixed...")
@@ -419,12 +522,13 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
419
522
  sumstats = sumstats.rename(columns = {'index':snpid})
420
523
 
421
524
  after_number = sum(~sumstats[rsid].isna())
422
- if verbose: log.write(" -rsID Annotation for "+str(total_number - after_number) +" need to be fixed!")
525
+ if verbose: log.write(" -rsID annotation for "+str(total_number - after_number) +" needed to be fixed!")
423
526
  if verbose: log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!")
424
527
  else:
425
- if verbose: log.write(" -No rsID could be fixed...skipping...")
528
+ if verbose: log.write(" -No rsID can be fixed...skipping...")
426
529
  ################################################################################################################
427
- gc.collect()
530
+
531
+ finished(log,verbose,_end_line)
428
532
  return sumstats
429
533
  #################################################################################################################################################
430
534
  #single record assignment
@@ -503,12 +607,12 @@ def is_palindromic(sumstats,a1="EA",a2="NEA"):
503
607
 
504
608
  def check_strand(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS"):
505
609
  vcf_reader = VariantFile(ref_infer)
506
- status_part = sumstats.apply(lambda x:check_strand_status(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,x[5],chr_dict),axis=1)
610
+ status_part = sumstats.apply(lambda x:check_strand_status(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict),axis=1)
507
611
  return status_part
508
612
 
509
613
  def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS",daf_tolerance=0.2):
510
614
  vcf_reader = VariantFile(ref_infer)
511
- status_part = sumstats.apply(lambda x:check_unkonwn_indel(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,x[5],chr_dict,daf_tolerance),axis=1)
615
+ status_part = sumstats.apply(lambda x:check_unkonwn_indel(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict,daf_tolerance),axis=1)
512
616
  return status_part
513
617
 
514
618
  ##################################################################################################################################################
@@ -516,79 +620,98 @@ def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NE
516
620
  def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,daf_tolerance=0.20,remove_snp="",mode="pi",n_cores=1,remove_indel="",
517
621
  chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
518
622
  chr_dict=None,verbose=True,log=Log()):
519
- if verbose: log.write("Start to infer strand for palindromic SNPs...{}".format(_get_version()))
520
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
521
- if verbose: log.write(" -Reference vcf file:", ref_infer)
623
+ ##start function with col checking##########################################################
624
+ _start_line = "infer strand for palindromic SNPs/align indistinguishable indels"
625
+ _end_line = "inferring strand for palindromic SNPs/align indistinguishable indels"
626
+ _start_cols = [chr,pos,ref,alt,eaf,status]
627
+ _start_function = ".infer_strand()"
628
+ _must_args ={"ref_alt_freq":ref_alt_freq}
629
+
630
+ is_enough_info = start_to(sumstats=sumstats,
631
+ log=log,
632
+ verbose=verbose,
633
+ start_line=_start_line,
634
+ end_line=_end_line,
635
+ start_cols=_start_cols,
636
+ start_function=_start_function,
637
+ n_cores=n_cores,
638
+ ref_vcf=ref_infer,
639
+ **_must_args)
640
+ if is_enough_info == False: return sumstats
641
+ ############################################################################################
522
642
 
523
643
  chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
644
+
645
+ log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
524
646
 
525
- # check if the columns are complete
526
- if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
527
- raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
528
647
  if "p" in mode:
529
- # ref_alt_freq INFO in vcf was provided
530
- if ref_alt_freq is not None:
531
- if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
532
-
533
- ## checking \w\w\w\w[0]\w\w -> standardized and normalized snp
534
- good_chrpos = sumstats[status].str.match(r'\w\w\w[0][0]\w\w', case=False, flags=0, na=False)
535
- palindromic = good_chrpos & is_palindromic(sumstats[[ref,alt]],a1=ref,a2=alt)
536
- not_palindromic_snp = good_chrpos & (~palindromic)
537
-
538
- ##not palindromic : change status
539
- sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
540
- if verbose: log.write(" -Identified ", sum(palindromic)," palindromic SNPs...")
541
-
542
- #palindromic but can not infer
543
- maf_can_infer = (sumstats.loc[:,eaf] < maf_threshold) | (sumstats.loc[:,eaf] > 1 - maf_threshold)
544
- sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
545
-
546
-
547
- if verbose: log.write(" -After filtering by MAF< ", maf_threshold ," , the strand of ", sum(palindromic & maf_can_infer)," palindromic SNPs will be inferred...")
548
- #########################################################################################
549
- if sum(palindromic & maf_can_infer)>0:
550
- if sum(palindromic & maf_can_infer)<10000:
551
- n_cores=1
552
- df_split = np.array_split(sumstats.loc[(palindromic & maf_can_infer),[chr,pos,ref,alt,eaf,status]], n_cores)
553
- pool = Pool(n_cores)
554
- map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
555
- status_inferred = pd.concat(pool.map(map_func,df_split))
556
- sumstats.loc[(palindromic & maf_can_infer),status] = status_inferred.values
557
- pool.close()
558
- pool.join()
559
- #########################################################################################
560
- #0 Not palindromic SNPs
561
- #1 Palindromic +strand -> no need to flip
562
- #2 palindromic -strand -> need to flip -> fixed
563
- #3 Indel no need flip
564
- #4 Unknown Indel -> fixed
565
- #5 Palindromic -strand -> need to flip
566
- #6 Indel need flip
567
- #7 indistinguishable
568
- #8 Not matching or No information
569
- #9 Unchecked
570
-
571
- status0 = sumstats[status].str.match(r'\w\w\w\w\w\w[0]', case=False, flags=0, na=False)
572
- status1 = sumstats[status].str.match(r'\w\w\w\w\w\w[1]', case=False, flags=0, na=False)
573
- status5 = sumstats[status].str.match(r'\w\w\w\w\w\w[5]', case=False, flags=0, na=False)
574
- status7 = sumstats[status].str.match(r'\w\w\w\w\w\w[7]', case=False, flags=0, na=False)
575
- status8 = sumstats[status].str.match(r'\w\w\w\w\w[123][8]', case=False, flags=0, na=False)
576
-
577
- if verbose: log.write(" -Non-palindromic : ",sum(status0))
578
- if verbose: log.write(" -Palindromic SNPs on + strand: ",sum(status1))
579
- if verbose: log.write(" -Palindromic SNPs on - strand and need to be flipped:",sum(status5))
580
- if verbose: log.write(" -Palindromic SNPs with maf not available to infer : ",sum(status7))
581
- if verbose: log.write(" -Palindromic SNPs with no macthes or no information : ",sum(status8))
582
-
583
- if ("7" in remove_snp) and ("8" in remove_snp) :
584
- if verbose: log.write(" -Palindromic SNPs with maf not available to infer and with no macthes or no information will will be removed")
585
- sumstats = sumstats.loc[~(status7 | status8),:].copy()
586
- elif "8" in remove_snp:
587
- if verbose: log.write(" -Palindromic SNPs with no macthes or no information will be removed")
588
- sumstats = sumstats.loc[~status8,:].copy()
589
- elif "7" in remove_snp:
590
- if verbose: log.write(" -Palindromic SNPs with maf not available to infer will be removed")
591
- sumstats = sumstats.loc[~status7,:].copy()
648
+ ## checking \w\w\w\w[0]\w\w -> standardized and normalized snp
649
+ good_chrpos = sumstats[status].str.match(r'\w\w\w[0][0]\w\w', case=False, flags=0, na=False)
650
+ palindromic = good_chrpos & is_palindromic(sumstats[[ref,alt]],a1=ref,a2=alt)
651
+ not_palindromic_snp = good_chrpos & (~palindromic)
652
+
653
+ ##not palindromic : change status
654
+ sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
655
+ if verbose: log.write(" -Identified ", sum(palindromic)," palindromic SNPs...")
656
+
657
+ #palindromic but can not infer
658
+ maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
659
+
660
+ sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
661
+
662
+ #palindromic WITH UNKNWON OR UNCHECKED STATUS
663
+ unknow_palindromic = sumstats[status].str.match(r'\w\w\w\w\w[012][89]', case=False, flags=0, na=False)
664
+
665
+ unknow_palindromic_to_check = palindromic & maf_can_infer & unknow_palindromic
666
+
667
+ if verbose: log.write(" -After filtering by MAF< {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknow_palindromic_to_check)))
668
+
669
+ #########################################################################################
670
+ if sum(unknow_palindromic_to_check)>0:
671
+ if sum(unknow_palindromic_to_check)<10000:
672
+ n_cores=1
673
+
674
+ #df_split = np.array_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
675
+ df_split = _df_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
676
+ pool = Pool(n_cores)
677
+ map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
678
+ status_inferred = pd.concat(pool.map(map_func,df_split))
679
+ sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
680
+ pool.close()
681
+ pool.join()
682
+ #########################################################################################
683
+ #0 Not palindromic SNPs
684
+ #1 Palindromic +strand -> no need to flip
685
+ #2 palindromic -strand -> need to flip -> fixed
686
+ #3 Indel no need flip
687
+ #4 Unknown Indel -> fixed
688
+ #5 Palindromic -strand -> need to flip
689
+ #6 Indel need flip
690
+ #7 indistinguishable
691
+ #8 Not matching or No information
692
+ #9 Unchecked
693
+
694
+ status0 = sumstats[status].str.match(r'\w\w\w\w\w\w[0]', case=False, flags=0, na=False)
695
+ status1 = sumstats[status].str.match(r'\w\w\w\w\w\w[1]', case=False, flags=0, na=False)
696
+ status5 = sumstats[status].str.match(r'\w\w\w\w\w\w[5]', case=False, flags=0, na=False)
697
+ status7 = sumstats[status].str.match(r'\w\w\w\w\w\w[7]', case=False, flags=0, na=False)
698
+ status8 = sumstats[status].str.match(r'\w\w\w\w\w[123][8]', case=False, flags=0, na=False)
699
+
700
+ if verbose: log.write(" -Non-palindromic : ",sum(status0))
701
+ if verbose: log.write(" -Palindromic SNPs on + strand: ",sum(status1))
702
+ if verbose: log.write(" -Palindromic SNPs on - strand and needed to be flipped:",sum(status5))
703
+ if verbose: log.write(" -Palindromic SNPs with MAF not available to infer : ",sum(status7))
704
+ if verbose: log.write(" -Palindromic SNPs with no macthes or no information : ",sum(status8))
705
+
706
+ if ("7" in remove_snp) and ("8" in remove_snp) :
707
+ if verbose: log.write(" -Palindromic SNPs with maf not available to infer and with no macthes or no information will will be removed")
708
+ sumstats = sumstats.loc[~(status7 | status8),:].copy()
709
+ elif "8" in remove_snp:
710
+ if verbose: log.write(" -Palindromic SNPs with no macthes or no information will be removed")
711
+ sumstats = sumstats.loc[~status8,:].copy()
712
+ elif "7" in remove_snp:
713
+ if verbose: log.write(" -Palindromic SNPs with maf not available to infer will be removed")
714
+ sumstats = sumstats.loc[~status7,:].copy()
592
715
 
593
716
  ### unknow_indel
594
717
  if "i" in mode:
@@ -598,14 +721,15 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
598
721
  if verbose: log.write(" -Indistinguishable indels will be inferred from reference vcf ref and alt...")
599
722
  #########################################################################################
600
723
  #with maf can not infer
601
- #maf_can_infer = (sumstats.loc[:,eaf] < maf_threshold) | (sumstats.loc[:,eaf] > 1 - maf_threshold)
724
+ #maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
602
725
  #sumstats.loc[unknow_indel&(~maf_can_infer),status] = vchange_status(sumstats.loc[unknow_indel&(~maf_can_infer),status],7,"9","8")
603
726
  if verbose: log.write(" -DAF tolerance: {}".format(daf_tolerance))
604
727
 
605
728
  if sum(unknow_indel)>0:
606
729
  if sum(unknow_indel)<10000:
607
730
  n_cores=1
608
- df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
731
+ #df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
732
+ df_split = _df_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
609
733
  pool = Pool(n_cores)
610
734
  map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
611
735
  status_inferred = pd.concat(pool.map(map_func,df_split))
@@ -624,7 +748,8 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
624
748
  if "8" in remove_indel:
625
749
  if verbose: log.write(" -Indels with no macthes or no information will be removed")
626
750
  sumstats = sumstats.loc[~status8,:].copy()
627
- gc.collect()
751
+
752
+ finished(log,verbose,_end_line)
628
753
  return sumstats
629
754
 
630
755
 
@@ -648,22 +773,35 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
648
773
 
649
774
  ################################################################################################################
650
775
  def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,column_name="DAF",suffix="",n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
651
-
652
- if verbose: log.write("Start to check the difference between EAF and reference vcf alt frequency ...{}".format(_get_version()))
653
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
654
- if verbose: log.write(" -Reference vcf file:", ref_infer)
655
- if verbose: log.write(" -CPU Cores to use :",n_cores)
656
-
776
+ ##start function with col checking##########################################################
777
+ _start_line = "check the difference between EAF and reference VCF ALT frequency"
778
+ _end_line = "checking the difference between EAF and reference VCF ALT frequency"
779
+ _start_cols = [chr,pos,ref,alt,eaf,status]
780
+ _start_function = ".check_daf()"
781
+ _must_args ={"ref_alt_freq":ref_alt_freq}
782
+
783
+ is_enough_info = start_to(sumstats=sumstats,
784
+ log=log,
785
+ verbose=verbose,
786
+ start_line=_start_line,
787
+ end_line=_end_line,
788
+ start_cols=_start_cols,
789
+ start_function=_start_function,
790
+ n_cores=n_cores,
791
+ ref_vcf=ref_infer,
792
+ **_must_args)
793
+ if is_enough_info == False: return sumstats
794
+ ############################################################################################
795
+
657
796
  chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
658
797
 
659
798
  column_name = column_name + suffix
660
- # check if the columns are complete
661
- if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
662
- raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
663
799
 
800
+
801
+
664
802
  # ref_alt_freq INFO in vcf was provided
665
803
  if ref_alt_freq is not None:
666
- if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
804
+ log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
667
805
  if not force:
668
806
  good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
669
807
  if verbose: log.write(" -Checking variants:", sum(good_chrpos))
@@ -672,7 +810,8 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
672
810
  ########################
673
811
  if sum(~sumstats[eaf].isna())<10000:
674
812
  n_cores=1
675
- df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
813
+ #df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
814
+ df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
676
815
  pool = Pool(n_cores)
677
816
  if sum(~sumstats[eaf].isna())>0:
678
817
  map_func = partial(checkaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,column_name=column_name,chr_dict=chr_dict)
@@ -683,13 +822,13 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
683
822
  #status_inferred = sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]].apply(lambda x:check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict),axis=1)
684
823
 
685
824
  #sumstats.loc[good_chrpos,"DAF"] = status_inferred.values
686
- #sumstats.loc[:,"DAF"]=sumstats.loc[:,"DAF"].astype("float")
687
- if verbose: log.write(" - {} max:".format(column_name), np.nanmax(sumstats.loc[:,column_name]))
688
- if verbose: log.write(" - {} min:".format(column_name), np.nanmin(sumstats.loc[:,column_name]))
689
- if verbose: log.write(" - {} sd:".format(column_name), np.nanstd(sumstats.loc[:,column_name]))
690
- if verbose: log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats.loc[:,column_name])))
691
- if verbose: log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats.loc[:,column_name])))
692
- if verbose: log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats.loc[:,column_name])))
825
+ #sumstats["DAF"]=sumstats["DAF"].astype("float")
826
+ if verbose: log.write(" - {} max:".format(column_name), np.nanmax(sumstats[column_name]))
827
+ if verbose: log.write(" - {} min:".format(column_name), np.nanmin(sumstats[column_name]))
828
+ if verbose: log.write(" - {} sd:".format(column_name), np.nanstd(sumstats[column_name]))
829
+ if verbose: log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats[column_name])))
830
+ if verbose: log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats[column_name])))
831
+ if verbose: log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats[column_name])))
693
832
  if verbose: log.write("Finished allele frequency checking!")
694
833
  return sumstats
695
834
 
@@ -697,11 +836,11 @@ def checkaf(sumstats,ref_infer,ref_alt_freq=None,column_name="DAF",chr="CHR",pos
697
836
  #vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
698
837
  vcf_reader = VariantFile(ref_infer)
699
838
  def afapply(x,vcf,alt_freq,chr_dict):
700
- return check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict)
839
+ return check_daf(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,chr_dict)
701
840
  map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
702
841
  status_inferred = sumstats.apply(map_func,axis=1)
703
- sumstats.loc[:,column_name] = status_inferred.values
704
- sumstats.loc[:,column_name]=sumstats.loc[:,column_name].astype("float")
842
+ sumstats[column_name] = status_inferred.values
843
+ sumstats[column_name]=sumstats[column_name].astype("float")
705
844
  return sumstats
706
845
 
707
846
  def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
@@ -716,25 +855,35 @@ def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
716
855
  ################################################################################################################
717
856
 
718
857
  def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
719
-
720
- if verbose: log.write("Start to infer the AF and reference vcf alt frequency ...{}".format(_get_version()))
721
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
722
- if verbose: log.write(" -Reference vcf file:", ref_infer)
723
- if verbose: log.write(" -CPU Cores to use :",n_cores)
724
-
858
+ ##start function with col checking##########################################################
859
+ _start_line = "infer EAF using reference VCF ALT frequency"
860
+ _end_line = "inferring EAF using reference VCF ALT frequency"
861
+ _start_cols = [chr,pos,ref,alt,eaf,status]
862
+ _start_function = ".infer_af()"
863
+ _must_args ={"ref_alt_freq":ref_alt_freq}
864
+
865
+ is_enough_info = start_to(sumstats=sumstats,
866
+ log=log,
867
+ verbose=verbose,
868
+ start_line=_start_line,
869
+ end_line=_end_line,
870
+ start_cols=_start_cols,
871
+ start_function=_start_function,
872
+ n_cores=n_cores,
873
+ ref_vcf=ref_infer,
874
+ **_must_args)
875
+ if is_enough_info == False: return sumstats
876
+ ############################################################################################
725
877
  chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
726
-
727
- # check if the columns are complete
728
- if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
729
- raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
730
878
 
731
879
  if eaf not in sumstats.columns:
732
880
  sumstats[eaf]=np.nan
733
881
 
734
882
  prenumber = sum(sumstats[eaf].isna())
883
+
735
884
  # ref_alt_freq INFO in vcf was provided
736
885
  if ref_alt_freq is not None:
737
- if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
886
+ log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
738
887
  if not force:
739
888
  good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
740
889
  if verbose: log.write(" -Checking variants:", sum(good_chrpos))
@@ -742,7 +891,8 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
742
891
  ########################
743
892
  if sum(sumstats[eaf].isna())<10000:
744
893
  n_cores=1
745
- df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
894
+ #df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
895
+ df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
746
896
  pool = Pool(n_cores)
747
897
  map_func = partial(inferaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
748
898
  sumstats.loc[good_chrpos,[eaf]] = pd.concat(pool.map(map_func,df_split))
@@ -753,18 +903,19 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
753
903
  afternumber = sum(sumstats[eaf].isna())
754
904
  if verbose: log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber))
755
905
  if verbose: log.write(" -EAF is still missing for {} variants.".format(afternumber))
756
- if verbose: log.write("Finished allele frequency inferring!")
906
+
907
+ finished(log,verbose,_end_line)
757
908
  return sumstats
758
909
 
759
910
  def inferaf(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
760
911
  #vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
761
912
  vcf_reader = VariantFile(ref_infer)
762
913
  def afapply(x,vcf,alt_freq,chr_dict):
763
- return infer_af(x[0],x[1]-1,x[1],x[2],x[3],vcf_reader,ref_alt_freq,chr_dict)
914
+ return infer_af(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],vcf_reader,ref_alt_freq,chr_dict)
764
915
  map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
765
916
  status_inferred = sumstats.apply(map_func,axis=1)
766
- sumstats.loc[:,eaf] = status_inferred.values
767
- sumstats.loc[:,eaf]=sumstats.loc[:,eaf].astype("float")
917
+ sumstats[eaf] = status_inferred.values
918
+ sumstats[eaf]=sumstats[eaf].astype("float")
768
919
  return sumstats
769
920
 
770
921
  def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
@@ -802,4 +953,6 @@ def check_vcf_chr_prefix(vcf_bcf_path):
802
953
  if m is not None:
803
954
  return m.group(1)
804
955
  else:
805
- return None
956
+ return None
957
+
958
+