gwaslab 3.4.37__py3-none-any.whl → 3.4.38__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (37)
  1. gwaslab/data/formatbook.json +722 -721
  2. gwaslab/g_Log.py +8 -0
  3. gwaslab/g_Sumstats.py +26 -147
  4. gwaslab/g_SumstatsPair.py +6 -2
  5. gwaslab/g_Sumstats_summary.py +3 -3
  6. gwaslab/g_version.py +2 -2
  7. gwaslab/hm_casting.py +29 -15
  8. gwaslab/hm_harmonize_sumstats.py +291 -163
  9. gwaslab/hm_rsid_to_chrpos.py +1 -1
  10. gwaslab/io_preformat_input.py +43 -37
  11. gwaslab/io_to_formats.py +428 -295
  12. gwaslab/qc_check_datatype.py +3 -3
  13. gwaslab/qc_fix_sumstats.py +793 -682
  14. gwaslab/util_ex_calculate_ldmatrix.py +29 -11
  15. gwaslab/util_ex_gwascatalog.py +1 -1
  16. gwaslab/util_ex_ldproxyfinder.py +1 -1
  17. gwaslab/util_ex_process_ref.py +3 -3
  18. gwaslab/util_ex_run_coloc.py +26 -4
  19. gwaslab/util_in_convert_h2.py +1 -1
  20. gwaslab/util_in_fill_data.py +2 -2
  21. gwaslab/util_in_filter_value.py +122 -34
  22. gwaslab/util_in_get_density.py +2 -2
  23. gwaslab/util_in_get_sig.py +41 -9
  24. gwaslab/viz_aux_quickfix.py +24 -19
  25. gwaslab/viz_aux_reposition_text.py +7 -4
  26. gwaslab/viz_aux_save_figure.py +6 -5
  27. gwaslab/viz_plot_compare_af.py +5 -5
  28. gwaslab/viz_plot_miamiplot2.py +28 -20
  29. gwaslab/viz_plot_mqqplot.py +109 -72
  30. gwaslab/viz_plot_qqplot.py +11 -8
  31. gwaslab/viz_plot_regionalplot.py +3 -1
  32. gwaslab/viz_plot_trumpetplot.py +15 -6
  33. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/METADATA +2 -2
  34. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/RECORD +37 -37
  35. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
  36. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
  37. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
@@ -12,12 +12,18 @@ from gwaslab.g_Log import Log
12
12
  from gwaslab.qc_fix_sumstats import fixchr
13
13
  from gwaslab.qc_fix_sumstats import fixpos
14
14
  from gwaslab.qc_fix_sumstats import sortcolumn
15
+ from gwaslab.qc_fix_sumstats import _df_split
16
+ from gwaslab.qc_fix_sumstats import check_col
17
+ from gwaslab.qc_fix_sumstats import start_to
18
+ from gwaslab.qc_fix_sumstats import finished
19
+ from gwaslab.qc_fix_sumstats import skipped
15
20
  from gwaslab.qc_check_datatype import check_dataframe_shape
16
21
  from gwaslab.bd_common_data import get_number_to_chr
17
22
  from gwaslab.bd_common_data import get_chr_list
18
23
  from gwaslab.bd_common_data import get_chr_to_number
19
24
  from gwaslab.g_vchange_status import vchange_status
20
25
  from gwaslab.g_version import _get_version
26
+
21
27
  #rsidtochrpos
22
28
  #checkref
23
29
  #parallelizeassignrsid
@@ -35,9 +41,24 @@ def rsidtochrpos(sumstats,
35
41
  '''
36
42
  assign chr:pos based on rsID
37
43
  '''
38
- #########################################################################################################
39
- if verbose: log.write("Start to update chromosome and position information based on rsID...{}".format(_get_version()))
40
- check_dataframe_shape(sumstats, log, verbose)
44
+ ##start function with col checking##########################################################
45
+ _start_line = "assign CHR and POS using rsIDs"
46
+ _end_line = "assigning CHR and POS using rsIDs"
47
+ _start_cols = [rsid,chrom,pos]
48
+ _start_function = ".rsid_to_chrpos()"
49
+ _must_args ={}
50
+
51
+ is_enough_info = start_to(sumstats=sumstats,
52
+ log=log,
53
+ verbose=verbose,
54
+ start_line=_start_line,
55
+ end_line=_end_line,
56
+ start_cols=_start_cols,
57
+ start_function=_start_function,
58
+ **_must_args)
59
+ if is_enough_info == False: return sumstats
60
+ ############################################################################################
61
+
41
62
  if verbose: log.write(" -rsID dictionary file: "+ path)
42
63
 
43
64
  if ref_rsid_to_chrpos_tsv is not None:
@@ -81,6 +102,8 @@ def rsidtochrpos(sumstats,
81
102
  sumstats = fixchr(sumstats,verbose=verbose)
82
103
  sumstats = fixpos(sumstats,verbose=verbose)
83
104
  sumstats = sortcolumn(sumstats,verbose=verbose)
105
+
106
+ finished(log,verbose,_end_line)
84
107
  return sumstats
85
108
  ####################################################################################################
86
109
 
@@ -104,17 +127,32 @@ def merge_chrpos(sumstats_part,all_groups_max,path,build,status):
104
127
 
105
128
  def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None, ref_rsid_to_chrpos_vcf = None, ref_rsid_to_chrpos_hdf5 = None, build="99",status="STATUS",
106
129
  n_cores=4,block_size=20000000,verbose=True,log=Log()):
107
-
130
+
131
+ ##start function with col checking##########################################################
132
+ _start_line = "assign CHR and POS using rsIDs"
133
+ _end_line = "assigning CHR and POS using rsIDs"
134
+ _start_cols = [rsid,chrom,pos]
135
+ _start_function = ".rsid_to_chrpos2()"
136
+ _must_args ={}
137
+
138
+ is_enough_info = start_to(sumstats=sumstats,
139
+ log=log,
140
+ verbose=verbose,
141
+ start_line=_start_line,
142
+ end_line=_end_line,
143
+ start_cols=_start_cols,
144
+ start_function=_start_function,
145
+ **_must_args)
146
+ if is_enough_info == False: return sumstats
147
+ ############################################################################################
148
+
108
149
  if ref_rsid_to_chrpos_hdf5 is not None:
109
150
  path = ref_rsid_to_chrpos_hdf5
110
151
  elif ref_rsid_to_chrpos_vcf is not None:
111
152
  vcf_file_name = os.path.basename(ref_rsid_to_chrpos_vcf)
112
153
  vcf_dir_path = os.path.dirname(ref_rsid_to_chrpos_vcf)
113
154
  path = "{}/{}.rsID_CHR_POS_groups_{}.h5".format(vcf_dir_path,vcf_file_name,int(block_size))
114
-
115
- if verbose: log.write("Start to assign CHR and POS using rsIDs...{}".format(_get_version()))
116
- check_dataframe_shape(sumstats, log, verbose)
117
-
155
+
118
156
  if path is None:
119
157
  raise ValueError("Please provide path to hdf5 file.")
120
158
 
@@ -192,8 +230,8 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
192
230
 
193
231
  pool.close()
194
232
  pool.join()
195
- gc.collect()
196
- if verbose: log.write("Finished assigning CHR and POS using rsIDs.")
233
+
234
+ finished(log, verbose, _end_line)
197
235
  return sumstats
198
236
  ####################################################################################################################
199
237
  #20220426 check if non-effect allele is aligned with reference genome
@@ -211,15 +249,15 @@ def check_status(row,record):
211
249
  #8 / -----> not on ref genome
212
250
  #9 / ------> unchecked
213
251
 
214
- status_pre=row[3][:5]
215
- status_end=row[3][6:]
252
+ status_pre=row.iloc[3][:5]
253
+ status_end=row.iloc[3][6:]
216
254
 
217
255
  ## nea == ref
218
- if row[2] == record[row[0]-1: row[0]+len(row[2])-1].seq.upper():
256
+ if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
219
257
  ## ea == ref
220
- if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
258
+ if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
221
259
  ## len(nea) >len(ea):
222
- if len(row[2])!=len(row[1]):
260
+ if len(row.iloc[2])!=len(row.iloc[1]):
223
261
  # indels both on ref, unable to identify
224
262
  return status_pre+"6"+status_end
225
263
  else:
@@ -228,34 +266,49 @@ def check_status(row,record):
228
266
  ## nea!=ref
229
267
  else:
230
268
  # ea == ref_seq -> need to flip
231
- if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
269
+ if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
232
270
  return status_pre+"3"+status_end
233
271
  # ea !=ref
234
272
  else:
235
273
  #_reverse_complementary
236
- row[1] = get_reverse_complementary_allele(row[1])
237
- row[2] = get_reverse_complementary_allele(row[2])
274
+ row.iloc[1] = get_reverse_complementary_allele(row.iloc[1])
275
+ row.iloc[2] = get_reverse_complementary_allele(row.iloc[2])
238
276
  ## nea == ref
239
- if row[2] == record[row[0]-1: row[0]+len(row[2])-1].seq.upper():
277
+ if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
240
278
  ## ea == ref
241
- if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
279
+ if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
242
280
  ## len(nea) >len(ea):
243
- if len(row[2])!=len(row[1]):
281
+ if len(row.iloc[2])!=len(row.iloc[1]):
244
282
  return status_pre+"8"+status_end # indel reverse complementary
245
283
  else:
246
284
  return status_pre+"4"+status_end
247
285
  else:
248
286
  # ea == ref_seq -> need to flip
249
- if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
287
+ if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
250
288
  return status_pre+"5"+status_end
251
289
  # ea !=ref
252
290
  return status_pre+"8"+status_end
253
291
 
254
292
 
255
293
  def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="STATUS",chr_dict=get_chr_to_number(),remove=False,verbose=True,log=Log()):
256
- if verbose: log.write("Start to check if NEA is aligned with reference sequence...{}".format(_get_version()))
257
- check_dataframe_shape(sumstats, log, verbose)
258
- if verbose: log.write(" -Reference genome fasta file: "+ ref_path)
294
+ ##start function with col checking##########################################################
295
+ _start_line = "check if NEA is aligned with reference sequence"
296
+ _end_line = "checking if NEA is aligned with reference sequence"
297
+ _start_cols = [chrom,pos,ea,nea,status]
298
+ _start_function = ".check_ref()"
299
+ _must_args ={}
300
+
301
+ is_enough_info = start_to(sumstats=sumstats,
302
+ log=log,
303
+ verbose=verbose,
304
+ start_line=_start_line,
305
+ end_line=_end_line,
306
+ start_cols=_start_cols,
307
+ start_function=_start_function,
308
+ **_must_args)
309
+ if is_enough_info == False: return sumstats
310
+ ############################################################################################
311
+ if verbose: log.write(" -Reference genome FASTA file: "+ ref_path)
259
312
  if verbose: log.write(" -Checking records: ", end="")
260
313
  chromlist = get_chr_list(add_number=True)
261
314
  records = SeqIO.parse(ref_path, "fasta")
@@ -274,7 +327,7 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
274
327
 
275
328
  if verbose: log.write("\n",end="",show_time=False)
276
329
 
277
- sumstats.loc[:,status] = sumstats.loc[:,status].astype("string")
330
+ sumstats[status] = sumstats[status].astype("string")
278
331
  available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
279
332
  status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
280
333
  status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
@@ -290,7 +343,7 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
290
343
  flip_rate = status_3/available_to_check
291
344
  if verbose: log.write(" -Raw Matching rate : ","{:.2f}%".format(raw_matching_rate*100))
292
345
  if raw_matching_rate <0.8:
293
- if verbose: log.write(" -!!!Warning, matching rate is low, please check if the right reference genome is used.")
346
+ if verbose: log.warning("Matching rate is low, please check if the right reference genome is used.")
294
347
  if flip_rate > 0.85 :
295
348
  if verbose: log.write(" -Flipping variants rate > 0.85, it is likely that the EA is aligned with REF in the original dataset.")
296
349
 
@@ -303,7 +356,8 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
303
356
  if remove is True:
304
357
  sumstats = sumstats.loc[~sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w"),:]
305
358
  if verbose: log.write(" -Variants not on given reference sequence were removed.")
306
- gc.collect()
359
+
360
+ finished(log, verbose, _end_line)
307
361
  return sumstats
308
362
 
309
363
  #######################################################################################################################################
@@ -333,7 +387,7 @@ def assign_rsid_single(sumstats,path,rsid="rsID",chr="CHR",pos="POS",ref="NEA",a
333
387
  ## single df assignment
334
388
  vcf_reader = VariantFile(path)
335
389
  def rsid_helper(x,vcf_reader,chr_dict):
336
- return chrposref_rsid(x[0],x[1],x[2],x[3],vcf_reader,chr_dict)
390
+ return chrposref_rsid(x.iloc[0],x.iloc[1],x.iloc[2],x.iloc[3],vcf_reader,chr_dict)
337
391
  map_func=partial(rsid_helper,vcf_reader=vcf_reader,chr_dict=chr_dict)
338
392
  rsID = sumstats.apply(map_func,axis=1)
339
393
  return rsID
@@ -346,19 +400,31 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
346
400
  all , overwrite rsid for all availalbe rsid
347
401
  invalid, only assign rsid for variants with invalid rsid
348
402
  empty only assign rsid for variants with na rsid
349
- '''
403
+ '''
404
+
350
405
  if ref_mode=="vcf":
351
406
  ###################################################################################################################
352
- if verbose: log.write("Start to assign rsID using vcf...{}".format(_get_version()))
353
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
354
- if verbose: log.write(" -CPU Cores to use :",n_cores)
355
- if verbose: log.write(" -Reference VCF file:", path)
356
-
407
+ ##start function with col checking##########################################################
408
+ _start_line = "assign rsID using reference VCF"
409
+ _end_line = "assign rsID using reference file"
410
+ _start_cols = [chr,pos,ref,alt,status]
411
+ _start_function = ".assign_rsid()"
412
+ _must_args ={}
413
+
414
+ is_enough_info = start_to(sumstats=sumstats,
415
+ log=log,
416
+ verbose=verbose,
417
+ start_line=_start_line,
418
+ end_line=_end_line,
419
+ start_cols=_start_cols,
420
+ start_function=_start_function,
421
+ n_cores=n_cores,
422
+ ref_vcf=path,
423
+ **_must_args)
424
+ if is_enough_info == False: return sumstats
425
+ ############################################################################################
357
426
  chr_dict = auto_check_vcf_chr_dict(path, chr_dict, verbose, log)
358
-
359
- if verbose: log.write(" -Assigning rsID based on chr:pos and ref:alt/alt:ref...")
360
-
361
-
427
+ if verbose: log.write(" -Assigning rsID based on CHR:POS and REF:ALT/ALT:REF...")
362
428
  ##############################################
363
429
  if rsid not in sumstats.columns:
364
430
  sumstats[rsid]=pd.Series(dtype="string")
@@ -380,7 +446,8 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
380
446
 
381
447
  if sum(to_assign)>0:
382
448
  if sum(to_assign)<10000: n_cores=1
383
- df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
449
+ #df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
450
+ df_split = _df_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
384
451
  pool = Pool(n_cores)
385
452
  map_func = partial(assign_rsid_single,path=path,chr=chr,pos=pos,ref=ref,alt=alt,chr_dict=chr_dict)
386
453
  assigned_rsid = pd.concat(pool.map(map_func,df_split))
@@ -399,9 +466,25 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
399
466
  '''
400
467
  assign rsID based on chr:pos
401
468
  '''
402
- if verbose: log.write("Start to annotate rsID based on chromosome and position information...{}".format(_get_version()))
403
- check_dataframe_shape(sumstats, log, verbose)
404
- if verbose: log.write(" -SNPID-rsID text file: "+ path)
469
+ ##start function with col checking##########################################################
470
+ _start_line = "assign rsID by matching SNPID with CHR:POS:REF:ALT in the reference TSV"
471
+ _end_line = "assign rsID using reference file"
472
+ _start_cols = [snpid,status]
473
+ _start_function = ".assign_rsid()"
474
+ _must_args ={}
475
+
476
+ is_enough_info = start_to(sumstats=sumstats,
477
+ log=log,
478
+ verbose=verbose,
479
+ start_line=_start_line,
480
+ end_line=_end_line,
481
+ start_cols=_start_cols,
482
+ start_function=_start_function,
483
+ n_cores=n_cores,
484
+ ref_tsv=path,
485
+ **_must_args)
486
+ if is_enough_info == False: return sumstats
487
+ ############################################################################################
405
488
 
406
489
  standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234][0126]\w", case=False, flags=0, na=False)
407
490
 
@@ -409,11 +492,12 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
409
492
  sumstats[rsid]=pd.Series(dtype="string")
410
493
 
411
494
  if overwrite == "empty":
412
- to_assign = sumstats[rsid].isna()
495
+ to_assign = sumstats[rsid].isna() & standardized_normalized
413
496
  if overwrite=="all":
414
497
  to_assign = standardized_normalized
415
498
  if overwrite=="invalid":
416
499
  to_assign = (~sumstats[rsid].str.match(r'rs([0-9]+)', case=False, flags=0, na=False)) & standardized_normalized
500
+
417
501
  total_number= len(sumstats)
418
502
  pre_number = sum(~sumstats[rsid].isna())
419
503
  if verbose: log.write(" -"+str(sum(to_assign)) +" rsID could be possibly fixed...")
@@ -438,12 +522,13 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
438
522
  sumstats = sumstats.rename(columns = {'index':snpid})
439
523
 
440
524
  after_number = sum(~sumstats[rsid].isna())
441
- if verbose: log.write(" -rsID Annotation for "+str(total_number - after_number) +" need to be fixed!")
525
+ if verbose: log.write(" -rsID annotation for "+str(total_number - after_number) +" needed to be fixed!")
442
526
  if verbose: log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!")
443
527
  else:
444
- if verbose: log.write(" -No rsID could be fixed...skipping...")
528
+ if verbose: log.write(" -No rsID can be fixed...skipping...")
445
529
  ################################################################################################################
446
- gc.collect()
530
+
531
+ finished(log,verbose,_end_line)
447
532
  return sumstats
448
533
  #################################################################################################################################################
449
534
  #single record assignment
@@ -522,12 +607,12 @@ def is_palindromic(sumstats,a1="EA",a2="NEA"):
522
607
 
523
608
  def check_strand(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS"):
524
609
  vcf_reader = VariantFile(ref_infer)
525
- status_part = sumstats.apply(lambda x:check_strand_status(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,x[5],chr_dict),axis=1)
610
+ status_part = sumstats.apply(lambda x:check_strand_status(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict),axis=1)
526
611
  return status_part
527
612
 
528
613
  def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS",daf_tolerance=0.2):
529
614
  vcf_reader = VariantFile(ref_infer)
530
- status_part = sumstats.apply(lambda x:check_unkonwn_indel(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,x[5],chr_dict,daf_tolerance),axis=1)
615
+ status_part = sumstats.apply(lambda x:check_unkonwn_indel(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict,daf_tolerance),axis=1)
531
616
  return status_part
532
617
 
533
618
  ##################################################################################################################################################
@@ -535,85 +620,98 @@ def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NE
535
620
  def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,daf_tolerance=0.20,remove_snp="",mode="pi",n_cores=1,remove_indel="",
536
621
  chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
537
622
  chr_dict=None,verbose=True,log=Log()):
538
- if verbose: log.write("Start to infer strand for palindromic SNPs...{}".format(_get_version()))
539
- check_dataframe_shape(sumstats, log, verbose)
540
- if verbose: log.write(" -Reference vcf file:", ref_infer)
623
+ ##start function with col checking##########################################################
624
+ _start_line = "infer strand for palindromic SNPs/align indistinguishable indels"
625
+ _end_line = "inferring strand for palindromic SNPs/align indistinguishable indels"
626
+ _start_cols = [chr,pos,ref,alt,eaf,status]
627
+ _start_function = ".infer_strand()"
628
+ _must_args ={"ref_alt_freq":ref_alt_freq}
629
+
630
+ is_enough_info = start_to(sumstats=sumstats,
631
+ log=log,
632
+ verbose=verbose,
633
+ start_line=_start_line,
634
+ end_line=_end_line,
635
+ start_cols=_start_cols,
636
+ start_function=_start_function,
637
+ n_cores=n_cores,
638
+ ref_vcf=ref_infer,
639
+ **_must_args)
640
+ if is_enough_info == False: return sumstats
641
+ ############################################################################################
541
642
 
542
643
  chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
644
+
645
+ log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
543
646
 
544
- # check if the columns are complete
545
- if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
546
- raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
547
647
  if "p" in mode:
548
- # ref_alt_freq INFO in vcf was provided
549
- if ref_alt_freq is not None:
550
-
551
- if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
552
- ## checking \w\w\w\w[0]\w\w -> standardized and normalized snp
553
- good_chrpos = sumstats[status].str.match(r'\w\w\w[0][0]\w\w', case=False, flags=0, na=False)
554
- palindromic = good_chrpos & is_palindromic(sumstats[[ref,alt]],a1=ref,a2=alt)
555
- not_palindromic_snp = good_chrpos & (~palindromic)
556
-
557
- ##not palindromic : change status
558
- sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
559
- if verbose: log.write(" -Identified ", sum(palindromic)," palindromic SNPs...")
560
-
561
- #palindromic but can not infer
562
- maf_can_infer = (sumstats.loc[:,eaf] < maf_threshold) | (sumstats.loc[:,eaf] > 1 - maf_threshold)
563
-
564
- sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
565
-
566
- #palindromic WITH UNKNWON OR UNCHECKED STATUS
567
- unknow_palindromic = sumstats[status].str.match(r'\w\w\w\w\w[012][89]', case=False, flags=0, na=False)
648
+ ## checking \w\w\w\w[0]\w\w -> standardized and normalized snp
649
+ good_chrpos = sumstats[status].str.match(r'\w\w\w[0][0]\w\w', case=False, flags=0, na=False)
650
+ palindromic = good_chrpos & is_palindromic(sumstats[[ref,alt]],a1=ref,a2=alt)
651
+ not_palindromic_snp = good_chrpos & (~palindromic)
652
+
653
+ ##not palindromic : change status
654
+ sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
655
+ if verbose: log.write(" -Identified ", sum(palindromic)," palindromic SNPs...")
656
+
657
+ #palindromic but can not infer
658
+ maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
659
+
660
+ sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
661
+
662
+ #palindromic WITH UNKNWON OR UNCHECKED STATUS
663
+ unknow_palindromic = sumstats[status].str.match(r'\w\w\w\w\w[012][89]', case=False, flags=0, na=False)
568
664
 
569
- unknow_palindromic_to_check = palindromic & maf_can_infer & unknow_palindromic
570
-
571
- if verbose: log.write(" -After filtering by MAF< {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknow_palindromic_to_check)))
665
+ unknow_palindromic_to_check = palindromic & maf_can_infer & unknow_palindromic
666
+
667
+ if verbose: log.write(" -After filtering by MAF< {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknow_palindromic_to_check)))
572
668
 
573
- #########################################################################################
574
- if sum(unknow_palindromic_to_check)>0:
575
- if sum(unknow_palindromic_to_check)<10000:
576
- n_cores=1
577
- df_split = np.array_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
578
- pool = Pool(n_cores)
579
- map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
580
- status_inferred = pd.concat(pool.map(map_func,df_split))
581
- sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
582
- pool.close()
583
- pool.join()
584
- #########################################################################################
585
- #0 Not palindromic SNPs
586
- #1 Palindromic +strand -> no need to flip
587
- #2 palindromic -strand -> need to flip -> fixed
588
- #3 Indel no need flip
589
- #4 Unknown Indel -> fixed
590
- #5 Palindromic -strand -> need to flip
591
- #6 Indel need flip
592
- #7 indistinguishable
593
- #8 Not matching or No information
594
- #9 Unchecked
595
-
596
- status0 = sumstats[status].str.match(r'\w\w\w\w\w\w[0]', case=False, flags=0, na=False)
597
- status1 = sumstats[status].str.match(r'\w\w\w\w\w\w[1]', case=False, flags=0, na=False)
598
- status5 = sumstats[status].str.match(r'\w\w\w\w\w\w[5]', case=False, flags=0, na=False)
599
- status7 = sumstats[status].str.match(r'\w\w\w\w\w\w[7]', case=False, flags=0, na=False)
600
- status8 = sumstats[status].str.match(r'\w\w\w\w\w[123][8]', case=False, flags=0, na=False)
601
-
602
- if verbose: log.write(" -Non-palindromic : ",sum(status0))
603
- if verbose: log.write(" -Palindromic SNPs on + strand: ",sum(status1))
604
- if verbose: log.write(" -Palindromic SNPs on - strand and need to be flipped:",sum(status5))
605
- if verbose: log.write(" -Palindromic SNPs with maf not available to infer : ",sum(status7))
606
- if verbose: log.write(" -Palindromic SNPs with no macthes or no information : ",sum(status8))
607
-
608
- if ("7" in remove_snp) and ("8" in remove_snp) :
609
- if verbose: log.write(" -Palindromic SNPs with maf not available to infer and with no macthes or no information will will be removed")
610
- sumstats = sumstats.loc[~(status7 | status8),:].copy()
611
- elif "8" in remove_snp:
612
- if verbose: log.write(" -Palindromic SNPs with no macthes or no information will be removed")
613
- sumstats = sumstats.loc[~status8,:].copy()
614
- elif "7" in remove_snp:
615
- if verbose: log.write(" -Palindromic SNPs with maf not available to infer will be removed")
616
- sumstats = sumstats.loc[~status7,:].copy()
669
+ #########################################################################################
670
+ if sum(unknow_palindromic_to_check)>0:
671
+ if sum(unknow_palindromic_to_check)<10000:
672
+ n_cores=1
673
+
674
+ #df_split = np.array_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
675
+ df_split = _df_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
676
+ pool = Pool(n_cores)
677
+ map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
678
+ status_inferred = pd.concat(pool.map(map_func,df_split))
679
+ sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
680
+ pool.close()
681
+ pool.join()
682
+ #########################################################################################
683
+ #0 Not palindromic SNPs
684
+ #1 Palindromic +strand -> no need to flip
685
+ #2 palindromic -strand -> need to flip -> fixed
686
+ #3 Indel no need flip
687
+ #4 Unknown Indel -> fixed
688
+ #5 Palindromic -strand -> need to flip
689
+ #6 Indel need flip
690
+ #7 indistinguishable
691
+ #8 Not matching or No information
692
+ #9 Unchecked
693
+
694
+ status0 = sumstats[status].str.match(r'\w\w\w\w\w\w[0]', case=False, flags=0, na=False)
695
+ status1 = sumstats[status].str.match(r'\w\w\w\w\w\w[1]', case=False, flags=0, na=False)
696
+ status5 = sumstats[status].str.match(r'\w\w\w\w\w\w[5]', case=False, flags=0, na=False)
697
+ status7 = sumstats[status].str.match(r'\w\w\w\w\w\w[7]', case=False, flags=0, na=False)
698
+ status8 = sumstats[status].str.match(r'\w\w\w\w\w[123][8]', case=False, flags=0, na=False)
699
+
700
+ if verbose: log.write(" -Non-palindromic : ",sum(status0))
701
+ if verbose: log.write(" -Palindromic SNPs on + strand: ",sum(status1))
702
+ if verbose: log.write(" -Palindromic SNPs on - strand and needed to be flipped:",sum(status5))
703
+ if verbose: log.write(" -Palindromic SNPs with MAF not available to infer : ",sum(status7))
704
+ if verbose: log.write(" -Palindromic SNPs with no macthes or no information : ",sum(status8))
705
+
706
+ if ("7" in remove_snp) and ("8" in remove_snp) :
707
+ if verbose: log.write(" -Palindromic SNPs with maf not available to infer and with no macthes or no information will will be removed")
708
+ sumstats = sumstats.loc[~(status7 | status8),:].copy()
709
+ elif "8" in remove_snp:
710
+ if verbose: log.write(" -Palindromic SNPs with no macthes or no information will be removed")
711
+ sumstats = sumstats.loc[~status8,:].copy()
712
+ elif "7" in remove_snp:
713
+ if verbose: log.write(" -Palindromic SNPs with maf not available to infer will be removed")
714
+ sumstats = sumstats.loc[~status7,:].copy()
617
715
 
618
716
  ### unknow_indel
619
717
  if "i" in mode:
@@ -623,14 +721,15 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
623
721
  if verbose: log.write(" -Indistinguishable indels will be inferred from reference vcf ref and alt...")
624
722
  #########################################################################################
625
723
  #with maf can not infer
626
- #maf_can_infer = (sumstats.loc[:,eaf] < maf_threshold) | (sumstats.loc[:,eaf] > 1 - maf_threshold)
724
+ #maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
627
725
  #sumstats.loc[unknow_indel&(~maf_can_infer),status] = vchange_status(sumstats.loc[unknow_indel&(~maf_can_infer),status],7,"9","8")
628
726
  if verbose: log.write(" -DAF tolerance: {}".format(daf_tolerance))
629
727
 
630
728
  if sum(unknow_indel)>0:
631
729
  if sum(unknow_indel)<10000:
632
730
  n_cores=1
633
- df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
731
+ #df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
732
+ df_split = _df_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
634
733
  pool = Pool(n_cores)
635
734
  map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
636
735
  status_inferred = pd.concat(pool.map(map_func,df_split))
@@ -649,7 +748,8 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
649
748
  if "8" in remove_indel:
650
749
  if verbose: log.write(" -Indels with no macthes or no information will be removed")
651
750
  sumstats = sumstats.loc[~status8,:].copy()
652
- gc.collect()
751
+
752
+ finished(log,verbose,_end_line)
653
753
  return sumstats
654
754
 
655
755
 
@@ -673,22 +773,35 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
673
773
 
674
774
  ################################################################################################################
675
775
  def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,column_name="DAF",suffix="",n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
676
-
677
- if verbose: log.write("Start to check the difference between EAF and reference vcf alt frequency ...{}".format(_get_version()))
678
- check_dataframe_shape(sumstats, log, verbose)
679
- if verbose: log.write(" -Reference vcf file:", ref_infer)
680
- if verbose: log.write(" -CPU Cores to use :",n_cores)
681
-
776
+ ##start function with col checking##########################################################
777
+ _start_line = "check the difference between EAF and reference VCF ALT frequency"
778
+ _end_line = "checking the difference between EAF and reference VCF ALT frequency"
779
+ _start_cols = [chr,pos,ref,alt,eaf,status]
780
+ _start_function = ".check_daf()"
781
+ _must_args ={"ref_alt_freq":ref_alt_freq}
782
+
783
+ is_enough_info = start_to(sumstats=sumstats,
784
+ log=log,
785
+ verbose=verbose,
786
+ start_line=_start_line,
787
+ end_line=_end_line,
788
+ start_cols=_start_cols,
789
+ start_function=_start_function,
790
+ n_cores=n_cores,
791
+ ref_vcf=ref_infer,
792
+ **_must_args)
793
+ if is_enough_info == False: return sumstats
794
+ ############################################################################################
795
+
682
796
  chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
683
797
 
684
798
  column_name = column_name + suffix
685
- # check if the columns are complete
686
- if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
687
- raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
688
799
 
800
+
801
+
689
802
  # ref_alt_freq INFO in vcf was provided
690
803
  if ref_alt_freq is not None:
691
- if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
804
+ log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
692
805
  if not force:
693
806
  good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
694
807
  if verbose: log.write(" -Checking variants:", sum(good_chrpos))
@@ -697,7 +810,8 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
697
810
  ########################
698
811
  if sum(~sumstats[eaf].isna())<10000:
699
812
  n_cores=1
700
- df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
813
+ #df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
814
+ df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
701
815
  pool = Pool(n_cores)
702
816
  if sum(~sumstats[eaf].isna())>0:
703
817
  map_func = partial(checkaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,column_name=column_name,chr_dict=chr_dict)
@@ -708,13 +822,13 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
708
822
  #status_inferred = sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]].apply(lambda x:check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict),axis=1)
709
823
 
710
824
  #sumstats.loc[good_chrpos,"DAF"] = status_inferred.values
711
- #sumstats.loc[:,"DAF"]=sumstats.loc[:,"DAF"].astype("float")
712
- if verbose: log.write(" - {} max:".format(column_name), np.nanmax(sumstats.loc[:,column_name]))
713
- if verbose: log.write(" - {} min:".format(column_name), np.nanmin(sumstats.loc[:,column_name]))
714
- if verbose: log.write(" - {} sd:".format(column_name), np.nanstd(sumstats.loc[:,column_name]))
715
- if verbose: log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats.loc[:,column_name])))
716
- if verbose: log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats.loc[:,column_name])))
717
- if verbose: log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats.loc[:,column_name])))
825
+ #sumstats["DAF"]=sumstats["DAF"].astype("float")
826
+ if verbose: log.write(" - {} max:".format(column_name), np.nanmax(sumstats[column_name]))
827
+ if verbose: log.write(" - {} min:".format(column_name), np.nanmin(sumstats[column_name]))
828
+ if verbose: log.write(" - {} sd:".format(column_name), np.nanstd(sumstats[column_name]))
829
+ if verbose: log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats[column_name])))
830
+ if verbose: log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats[column_name])))
831
+ if verbose: log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats[column_name])))
718
832
  if verbose: log.write("Finished allele frequency checking!")
719
833
  return sumstats
720
834
 
@@ -722,11 +836,11 @@ def checkaf(sumstats,ref_infer,ref_alt_freq=None,column_name="DAF",chr="CHR",pos
722
836
  #vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
723
837
  vcf_reader = VariantFile(ref_infer)
724
838
  def afapply(x,vcf,alt_freq,chr_dict):
725
- return check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict)
839
+ return check_daf(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,chr_dict)
726
840
  map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
727
841
  status_inferred = sumstats.apply(map_func,axis=1)
728
- sumstats.loc[:,column_name] = status_inferred.values
729
- sumstats.loc[:,column_name]=sumstats.loc[:,column_name].astype("float")
842
+ sumstats[column_name] = status_inferred.values
843
+ sumstats[column_name]=sumstats[column_name].astype("float")
730
844
  return sumstats
731
845
 
732
846
  def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
@@ -741,25 +855,35 @@ def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
741
855
  ################################################################################################################
742
856
 
743
857
  def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
744
-
745
- if verbose: log.write("Start to infer the AF and reference vcf alt frequency ...{}".format(_get_version()))
746
- check_dataframe_shape(sumstats, log, verbose)
747
- if verbose: log.write(" -Reference vcf file:", ref_infer)
748
- if verbose: log.write(" -CPU Cores to use :",n_cores)
749
-
858
+ ##start function with col checking##########################################################
859
+ _start_line = "infer EAF using reference VCF ALT frequency"
860
+ _end_line = "inferring EAF using reference VCF ALT frequency"
861
+ _start_cols = [chr,pos,ref,alt,eaf,status]
862
+ _start_function = ".infer_af()"
863
+ _must_args ={"ref_alt_freq":ref_alt_freq}
864
+
865
+ is_enough_info = start_to(sumstats=sumstats,
866
+ log=log,
867
+ verbose=verbose,
868
+ start_line=_start_line,
869
+ end_line=_end_line,
870
+ start_cols=_start_cols,
871
+ start_function=_start_function,
872
+ n_cores=n_cores,
873
+ ref_vcf=ref_infer,
874
+ **_must_args)
875
+ if is_enough_info == False: return sumstats
876
+ ############################################################################################
750
877
  chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
751
-
752
- # check if the columns are complete
753
- if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
754
- raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
755
878
 
756
879
  if eaf not in sumstats.columns:
757
880
  sumstats[eaf]=np.nan
758
881
 
759
882
  prenumber = sum(sumstats[eaf].isna())
883
+
760
884
  # ref_alt_freq INFO in vcf was provided
761
885
  if ref_alt_freq is not None:
762
- if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
886
+ log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
763
887
  if not force:
764
888
  good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
765
889
  if verbose: log.write(" -Checking variants:", sum(good_chrpos))
@@ -767,7 +891,8 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
767
891
  ########################
768
892
  if sum(sumstats[eaf].isna())<10000:
769
893
  n_cores=1
770
- df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
894
+ #df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
895
+ df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
771
896
  pool = Pool(n_cores)
772
897
  map_func = partial(inferaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
773
898
  sumstats.loc[good_chrpos,[eaf]] = pd.concat(pool.map(map_func,df_split))
@@ -778,18 +903,19 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
778
903
  afternumber = sum(sumstats[eaf].isna())
779
904
  if verbose: log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber))
780
905
  if verbose: log.write(" -EAF is still missing for {} variants.".format(afternumber))
781
- if verbose: log.write("Finished allele frequency inferring!")
906
+
907
+ finished(log,verbose,_end_line)
782
908
  return sumstats
783
909
 
784
910
  def inferaf(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
785
911
  #vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
786
912
  vcf_reader = VariantFile(ref_infer)
787
913
  def afapply(x,vcf,alt_freq,chr_dict):
788
- return infer_af(x[0],x[1]-1,x[1],x[2],x[3],vcf_reader,ref_alt_freq,chr_dict)
914
+ return infer_af(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],vcf_reader,ref_alt_freq,chr_dict)
789
915
  map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
790
916
  status_inferred = sumstats.apply(map_func,axis=1)
791
- sumstats.loc[:,eaf] = status_inferred.values
792
- sumstats.loc[:,eaf]=sumstats.loc[:,eaf].astype("float")
917
+ sumstats[eaf] = status_inferred.values
918
+ sumstats[eaf]=sumstats[eaf].astype("float")
793
919
  return sumstats
794
920
 
795
921
  def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
@@ -827,4 +953,6 @@ def check_vcf_chr_prefix(vcf_bcf_path):
827
953
  if m is not None:
828
954
  return m.group(1)
829
955
  else:
830
- return None
956
+ return None
957
+
958
+