gwaslab 3.4.37__py3-none-any.whl → 3.4.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +8 -0
- gwaslab/g_Sumstats.py +26 -147
- gwaslab/g_SumstatsPair.py +6 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +29 -15
- gwaslab/hm_harmonize_sumstats.py +291 -163
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +43 -37
- gwaslab/io_to_formats.py +428 -295
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +793 -682
- gwaslab/util_ex_calculate_ldmatrix.py +29 -11
- gwaslab/util_ex_gwascatalog.py +1 -1
- gwaslab/util_ex_ldproxyfinder.py +1 -1
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_convert_h2.py +1 -1
- gwaslab/util_in_fill_data.py +2 -2
- gwaslab/util_in_filter_value.py +122 -34
- gwaslab/util_in_get_density.py +2 -2
- gwaslab/util_in_get_sig.py +41 -9
- gwaslab/viz_aux_quickfix.py +24 -19
- gwaslab/viz_aux_reposition_text.py +7 -4
- gwaslab/viz_aux_save_figure.py +6 -5
- gwaslab/viz_plot_compare_af.py +5 -5
- gwaslab/viz_plot_miamiplot2.py +28 -20
- gwaslab/viz_plot_mqqplot.py +109 -72
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +3 -1
- gwaslab/viz_plot_trumpetplot.py +15 -6
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/METADATA +2 -2
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/RECORD +37 -37
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
gwaslab/hm_harmonize_sumstats.py
CHANGED
|
@@ -12,12 +12,18 @@ from gwaslab.g_Log import Log
|
|
|
12
12
|
from gwaslab.qc_fix_sumstats import fixchr
|
|
13
13
|
from gwaslab.qc_fix_sumstats import fixpos
|
|
14
14
|
from gwaslab.qc_fix_sumstats import sortcolumn
|
|
15
|
+
from gwaslab.qc_fix_sumstats import _df_split
|
|
16
|
+
from gwaslab.qc_fix_sumstats import check_col
|
|
17
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
18
|
+
from gwaslab.qc_fix_sumstats import finished
|
|
19
|
+
from gwaslab.qc_fix_sumstats import skipped
|
|
15
20
|
from gwaslab.qc_check_datatype import check_dataframe_shape
|
|
16
21
|
from gwaslab.bd_common_data import get_number_to_chr
|
|
17
22
|
from gwaslab.bd_common_data import get_chr_list
|
|
18
23
|
from gwaslab.bd_common_data import get_chr_to_number
|
|
19
24
|
from gwaslab.g_vchange_status import vchange_status
|
|
20
25
|
from gwaslab.g_version import _get_version
|
|
26
|
+
|
|
21
27
|
#rsidtochrpos
|
|
22
28
|
#checkref
|
|
23
29
|
#parallelizeassignrsid
|
|
@@ -35,9 +41,24 @@ def rsidtochrpos(sumstats,
|
|
|
35
41
|
'''
|
|
36
42
|
assign chr:pos based on rsID
|
|
37
43
|
'''
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
44
|
+
##start function with col checking##########################################################
|
|
45
|
+
_start_line = "assign CHR and POS using rsIDs"
|
|
46
|
+
_end_line = "assigning CHR and POS using rsIDs"
|
|
47
|
+
_start_cols = [rsid,chrom,pos]
|
|
48
|
+
_start_function = ".rsid_to_chrpos()"
|
|
49
|
+
_must_args ={}
|
|
50
|
+
|
|
51
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
52
|
+
log=log,
|
|
53
|
+
verbose=verbose,
|
|
54
|
+
start_line=_start_line,
|
|
55
|
+
end_line=_end_line,
|
|
56
|
+
start_cols=_start_cols,
|
|
57
|
+
start_function=_start_function,
|
|
58
|
+
**_must_args)
|
|
59
|
+
if is_enough_info == False: return sumstats
|
|
60
|
+
############################################################################################
|
|
61
|
+
|
|
41
62
|
if verbose: log.write(" -rsID dictionary file: "+ path)
|
|
42
63
|
|
|
43
64
|
if ref_rsid_to_chrpos_tsv is not None:
|
|
@@ -81,6 +102,8 @@ def rsidtochrpos(sumstats,
|
|
|
81
102
|
sumstats = fixchr(sumstats,verbose=verbose)
|
|
82
103
|
sumstats = fixpos(sumstats,verbose=verbose)
|
|
83
104
|
sumstats = sortcolumn(sumstats,verbose=verbose)
|
|
105
|
+
|
|
106
|
+
finished(log,verbose,_end_line)
|
|
84
107
|
return sumstats
|
|
85
108
|
####################################################################################################
|
|
86
109
|
|
|
@@ -104,17 +127,32 @@ def merge_chrpos(sumstats_part,all_groups_max,path,build,status):
|
|
|
104
127
|
|
|
105
128
|
def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None, ref_rsid_to_chrpos_vcf = None, ref_rsid_to_chrpos_hdf5 = None, build="99",status="STATUS",
|
|
106
129
|
n_cores=4,block_size=20000000,verbose=True,log=Log()):
|
|
107
|
-
|
|
130
|
+
|
|
131
|
+
##start function with col checking##########################################################
|
|
132
|
+
_start_line = "assign CHR and POS using rsIDs"
|
|
133
|
+
_end_line = "assigning CHR and POS using rsIDs"
|
|
134
|
+
_start_cols = [rsid,chrom,pos]
|
|
135
|
+
_start_function = ".rsid_to_chrpos2()"
|
|
136
|
+
_must_args ={}
|
|
137
|
+
|
|
138
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
139
|
+
log=log,
|
|
140
|
+
verbose=verbose,
|
|
141
|
+
start_line=_start_line,
|
|
142
|
+
end_line=_end_line,
|
|
143
|
+
start_cols=_start_cols,
|
|
144
|
+
start_function=_start_function,
|
|
145
|
+
**_must_args)
|
|
146
|
+
if is_enough_info == False: return sumstats
|
|
147
|
+
############################################################################################
|
|
148
|
+
|
|
108
149
|
if ref_rsid_to_chrpos_hdf5 is not None:
|
|
109
150
|
path = ref_rsid_to_chrpos_hdf5
|
|
110
151
|
elif ref_rsid_to_chrpos_vcf is not None:
|
|
111
152
|
vcf_file_name = os.path.basename(ref_rsid_to_chrpos_vcf)
|
|
112
153
|
vcf_dir_path = os.path.dirname(ref_rsid_to_chrpos_vcf)
|
|
113
154
|
path = "{}/{}.rsID_CHR_POS_groups_{}.h5".format(vcf_dir_path,vcf_file_name,int(block_size))
|
|
114
|
-
|
|
115
|
-
if verbose: log.write("Start to assign CHR and POS using rsIDs...{}".format(_get_version()))
|
|
116
|
-
check_dataframe_shape(sumstats, log, verbose)
|
|
117
|
-
|
|
155
|
+
|
|
118
156
|
if path is None:
|
|
119
157
|
raise ValueError("Please provide path to hdf5 file.")
|
|
120
158
|
|
|
@@ -192,8 +230,8 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
|
|
|
192
230
|
|
|
193
231
|
pool.close()
|
|
194
232
|
pool.join()
|
|
195
|
-
|
|
196
|
-
|
|
233
|
+
|
|
234
|
+
finished(log, verbose, _end_line)
|
|
197
235
|
return sumstats
|
|
198
236
|
####################################################################################################################
|
|
199
237
|
#20220426 check if non-effect allele is aligned with reference genome
|
|
@@ -211,15 +249,15 @@ def check_status(row,record):
|
|
|
211
249
|
#8 / -----> not on ref genome
|
|
212
250
|
#9 / ------> unchecked
|
|
213
251
|
|
|
214
|
-
status_pre=row[3][:5]
|
|
215
|
-
status_end=row[3][6:]
|
|
252
|
+
status_pre=row.iloc[3][:5]
|
|
253
|
+
status_end=row.iloc[3][6:]
|
|
216
254
|
|
|
217
255
|
## nea == ref
|
|
218
|
-
if row[2] == record[row[0]-1: row[0]+len(row[2])-1].seq.upper():
|
|
256
|
+
if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
|
|
219
257
|
## ea == ref
|
|
220
|
-
if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
|
|
258
|
+
if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
|
|
221
259
|
## len(nea) >len(ea):
|
|
222
|
-
if len(row[2])!=len(row[1]):
|
|
260
|
+
if len(row.iloc[2])!=len(row.iloc[1]):
|
|
223
261
|
# indels both on ref, unable to identify
|
|
224
262
|
return status_pre+"6"+status_end
|
|
225
263
|
else:
|
|
@@ -228,34 +266,49 @@ def check_status(row,record):
|
|
|
228
266
|
## nea!=ref
|
|
229
267
|
else:
|
|
230
268
|
# ea == ref_seq -> need to flip
|
|
231
|
-
if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
|
|
269
|
+
if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
|
|
232
270
|
return status_pre+"3"+status_end
|
|
233
271
|
# ea !=ref
|
|
234
272
|
else:
|
|
235
273
|
#_reverse_complementary
|
|
236
|
-
row[1] = get_reverse_complementary_allele(row[1])
|
|
237
|
-
row[2] = get_reverse_complementary_allele(row[2])
|
|
274
|
+
row.iloc[1] = get_reverse_complementary_allele(row.iloc[1])
|
|
275
|
+
row.iloc[2] = get_reverse_complementary_allele(row.iloc[2])
|
|
238
276
|
## nea == ref
|
|
239
|
-
if row[2] == record[row[0]-1: row[0]+len(row[2])-1].seq.upper():
|
|
277
|
+
if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
|
|
240
278
|
## ea == ref
|
|
241
|
-
if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
|
|
279
|
+
if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
|
|
242
280
|
## len(nea) >len(ea):
|
|
243
|
-
if len(row[2])!=len(row[1]):
|
|
281
|
+
if len(row.iloc[2])!=len(row.iloc[1]):
|
|
244
282
|
return status_pre+"8"+status_end # indel reverse complementary
|
|
245
283
|
else:
|
|
246
284
|
return status_pre+"4"+status_end
|
|
247
285
|
else:
|
|
248
286
|
# ea == ref_seq -> need to flip
|
|
249
|
-
if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
|
|
287
|
+
if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
|
|
250
288
|
return status_pre+"5"+status_end
|
|
251
289
|
# ea !=ref
|
|
252
290
|
return status_pre+"8"+status_end
|
|
253
291
|
|
|
254
292
|
|
|
255
293
|
def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="STATUS",chr_dict=get_chr_to_number(),remove=False,verbose=True,log=Log()):
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
294
|
+
##start function with col checking##########################################################
|
|
295
|
+
_start_line = "check if NEA is aligned with reference sequence"
|
|
296
|
+
_end_line = "checking if NEA is aligned with reference sequence"
|
|
297
|
+
_start_cols = [chrom,pos,ea,nea,status]
|
|
298
|
+
_start_function = ".check_ref()"
|
|
299
|
+
_must_args ={}
|
|
300
|
+
|
|
301
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
302
|
+
log=log,
|
|
303
|
+
verbose=verbose,
|
|
304
|
+
start_line=_start_line,
|
|
305
|
+
end_line=_end_line,
|
|
306
|
+
start_cols=_start_cols,
|
|
307
|
+
start_function=_start_function,
|
|
308
|
+
**_must_args)
|
|
309
|
+
if is_enough_info == False: return sumstats
|
|
310
|
+
############################################################################################
|
|
311
|
+
if verbose: log.write(" -Reference genome FASTA file: "+ ref_path)
|
|
259
312
|
if verbose: log.write(" -Checking records: ", end="")
|
|
260
313
|
chromlist = get_chr_list(add_number=True)
|
|
261
314
|
records = SeqIO.parse(ref_path, "fasta")
|
|
@@ -274,7 +327,7 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
|
|
|
274
327
|
|
|
275
328
|
if verbose: log.write("\n",end="",show_time=False)
|
|
276
329
|
|
|
277
|
-
sumstats
|
|
330
|
+
sumstats[status] = sumstats[status].astype("string")
|
|
278
331
|
available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
|
|
279
332
|
status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
|
|
280
333
|
status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
|
|
@@ -290,7 +343,7 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
|
|
|
290
343
|
flip_rate = status_3/available_to_check
|
|
291
344
|
if verbose: log.write(" -Raw Matching rate : ","{:.2f}%".format(raw_matching_rate*100))
|
|
292
345
|
if raw_matching_rate <0.8:
|
|
293
|
-
if verbose: log.
|
|
346
|
+
if verbose: log.warning("Matching rate is low, please check if the right reference genome is used.")
|
|
294
347
|
if flip_rate > 0.85 :
|
|
295
348
|
if verbose: log.write(" -Flipping variants rate > 0.85, it is likely that the EA is aligned with REF in the original dataset.")
|
|
296
349
|
|
|
@@ -303,7 +356,8 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
|
|
|
303
356
|
if remove is True:
|
|
304
357
|
sumstats = sumstats.loc[~sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w"),:]
|
|
305
358
|
if verbose: log.write(" -Variants not on given reference sequence were removed.")
|
|
306
|
-
|
|
359
|
+
|
|
360
|
+
finished(log, verbose, _end_line)
|
|
307
361
|
return sumstats
|
|
308
362
|
|
|
309
363
|
#######################################################################################################################################
|
|
@@ -333,7 +387,7 @@ def assign_rsid_single(sumstats,path,rsid="rsID",chr="CHR",pos="POS",ref="NEA",a
|
|
|
333
387
|
## single df assignment
|
|
334
388
|
vcf_reader = VariantFile(path)
|
|
335
389
|
def rsid_helper(x,vcf_reader,chr_dict):
|
|
336
|
-
return chrposref_rsid(x[0],x[1],x[2],x[3],vcf_reader,chr_dict)
|
|
390
|
+
return chrposref_rsid(x.iloc[0],x.iloc[1],x.iloc[2],x.iloc[3],vcf_reader,chr_dict)
|
|
337
391
|
map_func=partial(rsid_helper,vcf_reader=vcf_reader,chr_dict=chr_dict)
|
|
338
392
|
rsID = sumstats.apply(map_func,axis=1)
|
|
339
393
|
return rsID
|
|
@@ -346,19 +400,31 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
346
400
|
all , overwrite rsid for all availalbe rsid
|
|
347
401
|
invalid, only assign rsid for variants with invalid rsid
|
|
348
402
|
empty only assign rsid for variants with na rsid
|
|
349
|
-
'''
|
|
403
|
+
'''
|
|
404
|
+
|
|
350
405
|
if ref_mode=="vcf":
|
|
351
406
|
###################################################################################################################
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
407
|
+
##start function with col checking##########################################################
|
|
408
|
+
_start_line = "assign rsID using reference VCF"
|
|
409
|
+
_end_line = "assign rsID using reference file"
|
|
410
|
+
_start_cols = [chr,pos,ref,alt,status]
|
|
411
|
+
_start_function = ".assign_rsid()"
|
|
412
|
+
_must_args ={}
|
|
413
|
+
|
|
414
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
415
|
+
log=log,
|
|
416
|
+
verbose=verbose,
|
|
417
|
+
start_line=_start_line,
|
|
418
|
+
end_line=_end_line,
|
|
419
|
+
start_cols=_start_cols,
|
|
420
|
+
start_function=_start_function,
|
|
421
|
+
n_cores=n_cores,
|
|
422
|
+
ref_vcf=path,
|
|
423
|
+
**_must_args)
|
|
424
|
+
if is_enough_info == False: return sumstats
|
|
425
|
+
############################################################################################
|
|
357
426
|
chr_dict = auto_check_vcf_chr_dict(path, chr_dict, verbose, log)
|
|
358
|
-
|
|
359
|
-
if verbose: log.write(" -Assigning rsID based on chr:pos and ref:alt/alt:ref...")
|
|
360
|
-
|
|
361
|
-
|
|
427
|
+
if verbose: log.write(" -Assigning rsID based on CHR:POS and REF:ALT/ALT:REF...")
|
|
362
428
|
##############################################
|
|
363
429
|
if rsid not in sumstats.columns:
|
|
364
430
|
sumstats[rsid]=pd.Series(dtype="string")
|
|
@@ -380,7 +446,8 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
380
446
|
|
|
381
447
|
if sum(to_assign)>0:
|
|
382
448
|
if sum(to_assign)<10000: n_cores=1
|
|
383
|
-
df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
|
|
449
|
+
#df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
|
|
450
|
+
df_split = _df_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
|
|
384
451
|
pool = Pool(n_cores)
|
|
385
452
|
map_func = partial(assign_rsid_single,path=path,chr=chr,pos=pos,ref=ref,alt=alt,chr_dict=chr_dict)
|
|
386
453
|
assigned_rsid = pd.concat(pool.map(map_func,df_split))
|
|
@@ -399,9 +466,25 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
399
466
|
'''
|
|
400
467
|
assign rsID based on chr:pos
|
|
401
468
|
'''
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
469
|
+
##start function with col checking##########################################################
|
|
470
|
+
_start_line = "assign rsID by matching SNPID with CHR:POS:REF:ALT in the reference TSV"
|
|
471
|
+
_end_line = "assign rsID using reference file"
|
|
472
|
+
_start_cols = [snpid,status]
|
|
473
|
+
_start_function = ".assign_rsid()"
|
|
474
|
+
_must_args ={}
|
|
475
|
+
|
|
476
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
477
|
+
log=log,
|
|
478
|
+
verbose=verbose,
|
|
479
|
+
start_line=_start_line,
|
|
480
|
+
end_line=_end_line,
|
|
481
|
+
start_cols=_start_cols,
|
|
482
|
+
start_function=_start_function,
|
|
483
|
+
n_cores=n_cores,
|
|
484
|
+
ref_tsv=path,
|
|
485
|
+
**_must_args)
|
|
486
|
+
if is_enough_info == False: return sumstats
|
|
487
|
+
############################################################################################
|
|
405
488
|
|
|
406
489
|
standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234][0126]\w", case=False, flags=0, na=False)
|
|
407
490
|
|
|
@@ -409,11 +492,12 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
409
492
|
sumstats[rsid]=pd.Series(dtype="string")
|
|
410
493
|
|
|
411
494
|
if overwrite == "empty":
|
|
412
|
-
to_assign = sumstats[rsid].isna()
|
|
495
|
+
to_assign = sumstats[rsid].isna() & standardized_normalized
|
|
413
496
|
if overwrite=="all":
|
|
414
497
|
to_assign = standardized_normalized
|
|
415
498
|
if overwrite=="invalid":
|
|
416
499
|
to_assign = (~sumstats[rsid].str.match(r'rs([0-9]+)', case=False, flags=0, na=False)) & standardized_normalized
|
|
500
|
+
|
|
417
501
|
total_number= len(sumstats)
|
|
418
502
|
pre_number = sum(~sumstats[rsid].isna())
|
|
419
503
|
if verbose: log.write(" -"+str(sum(to_assign)) +" rsID could be possibly fixed...")
|
|
@@ -438,12 +522,13 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
438
522
|
sumstats = sumstats.rename(columns = {'index':snpid})
|
|
439
523
|
|
|
440
524
|
after_number = sum(~sumstats[rsid].isna())
|
|
441
|
-
if verbose: log.write(" -rsID
|
|
525
|
+
if verbose: log.write(" -rsID annotation for "+str(total_number - after_number) +" needed to be fixed!")
|
|
442
526
|
if verbose: log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!")
|
|
443
527
|
else:
|
|
444
|
-
if verbose: log.write(" -No rsID
|
|
528
|
+
if verbose: log.write(" -No rsID can be fixed...skipping...")
|
|
445
529
|
################################################################################################################
|
|
446
|
-
|
|
530
|
+
|
|
531
|
+
finished(log,verbose,_end_line)
|
|
447
532
|
return sumstats
|
|
448
533
|
#################################################################################################################################################
|
|
449
534
|
#single record assignment
|
|
@@ -522,12 +607,12 @@ def is_palindromic(sumstats,a1="EA",a2="NEA"):
|
|
|
522
607
|
|
|
523
608
|
def check_strand(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS"):
|
|
524
609
|
vcf_reader = VariantFile(ref_infer)
|
|
525
|
-
status_part = sumstats.apply(lambda x:check_strand_status(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,x[5],chr_dict),axis=1)
|
|
610
|
+
status_part = sumstats.apply(lambda x:check_strand_status(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict),axis=1)
|
|
526
611
|
return status_part
|
|
527
612
|
|
|
528
613
|
def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS",daf_tolerance=0.2):
|
|
529
614
|
vcf_reader = VariantFile(ref_infer)
|
|
530
|
-
status_part = sumstats.apply(lambda x:check_unkonwn_indel(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,x[5],chr_dict,daf_tolerance),axis=1)
|
|
615
|
+
status_part = sumstats.apply(lambda x:check_unkonwn_indel(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict,daf_tolerance),axis=1)
|
|
531
616
|
return status_part
|
|
532
617
|
|
|
533
618
|
##################################################################################################################################################
|
|
@@ -535,85 +620,98 @@ def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NE
|
|
|
535
620
|
def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,daf_tolerance=0.20,remove_snp="",mode="pi",n_cores=1,remove_indel="",
|
|
536
621
|
chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
|
|
537
622
|
chr_dict=None,verbose=True,log=Log()):
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
623
|
+
##start function with col checking##########################################################
|
|
624
|
+
_start_line = "infer strand for palindromic SNPs/align indistinguishable indels"
|
|
625
|
+
_end_line = "inferring strand for palindromic SNPs/align indistinguishable indels"
|
|
626
|
+
_start_cols = [chr,pos,ref,alt,eaf,status]
|
|
627
|
+
_start_function = ".infer_strand()"
|
|
628
|
+
_must_args ={"ref_alt_freq":ref_alt_freq}
|
|
629
|
+
|
|
630
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
631
|
+
log=log,
|
|
632
|
+
verbose=verbose,
|
|
633
|
+
start_line=_start_line,
|
|
634
|
+
end_line=_end_line,
|
|
635
|
+
start_cols=_start_cols,
|
|
636
|
+
start_function=_start_function,
|
|
637
|
+
n_cores=n_cores,
|
|
638
|
+
ref_vcf=ref_infer,
|
|
639
|
+
**_must_args)
|
|
640
|
+
if is_enough_info == False: return sumstats
|
|
641
|
+
############################################################################################
|
|
541
642
|
|
|
542
643
|
chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
|
|
644
|
+
|
|
645
|
+
log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
|
|
543
646
|
|
|
544
|
-
# check if the columns are complete
|
|
545
|
-
if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
|
|
546
|
-
raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
|
|
547
647
|
if "p" in mode:
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
|
|
565
|
-
|
|
566
|
-
#palindromic WITH UNKNWON OR UNCHECKED STATUS
|
|
567
|
-
unknow_palindromic = sumstats[status].str.match(r'\w\w\w\w\w[012][89]', case=False, flags=0, na=False)
|
|
648
|
+
## checking \w\w\w\w[0]\w\w -> standardized and normalized snp
|
|
649
|
+
good_chrpos = sumstats[status].str.match(r'\w\w\w[0][0]\w\w', case=False, flags=0, na=False)
|
|
650
|
+
palindromic = good_chrpos & is_palindromic(sumstats[[ref,alt]],a1=ref,a2=alt)
|
|
651
|
+
not_palindromic_snp = good_chrpos & (~palindromic)
|
|
652
|
+
|
|
653
|
+
##not palindromic : change status
|
|
654
|
+
sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
|
|
655
|
+
if verbose: log.write(" -Identified ", sum(palindromic)," palindromic SNPs...")
|
|
656
|
+
|
|
657
|
+
#palindromic but can not infer
|
|
658
|
+
maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
|
|
659
|
+
|
|
660
|
+
sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
|
|
661
|
+
|
|
662
|
+
#palindromic WITH UNKNWON OR UNCHECKED STATUS
|
|
663
|
+
unknow_palindromic = sumstats[status].str.match(r'\w\w\w\w\w[012][89]', case=False, flags=0, na=False)
|
|
568
664
|
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
665
|
+
unknow_palindromic_to_check = palindromic & maf_can_infer & unknow_palindromic
|
|
666
|
+
|
|
667
|
+
if verbose: log.write(" -After filtering by MAF< {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknow_palindromic_to_check)))
|
|
572
668
|
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
pool.
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
669
|
+
#########################################################################################
|
|
670
|
+
if sum(unknow_palindromic_to_check)>0:
|
|
671
|
+
if sum(unknow_palindromic_to_check)<10000:
|
|
672
|
+
n_cores=1
|
|
673
|
+
|
|
674
|
+
#df_split = np.array_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
|
|
675
|
+
df_split = _df_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
|
|
676
|
+
pool = Pool(n_cores)
|
|
677
|
+
map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
|
|
678
|
+
status_inferred = pd.concat(pool.map(map_func,df_split))
|
|
679
|
+
sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
|
|
680
|
+
pool.close()
|
|
681
|
+
pool.join()
|
|
682
|
+
#########################################################################################
|
|
683
|
+
#0 Not palindromic SNPs
|
|
684
|
+
#1 Palindromic +strand -> no need to flip
|
|
685
|
+
#2 palindromic -strand -> need to flip -> fixed
|
|
686
|
+
#3 Indel no need flip
|
|
687
|
+
#4 Unknown Indel -> fixed
|
|
688
|
+
#5 Palindromic -strand -> need to flip
|
|
689
|
+
#6 Indel need flip
|
|
690
|
+
#7 indistinguishable
|
|
691
|
+
#8 Not matching or No information
|
|
692
|
+
#9 Unchecked
|
|
693
|
+
|
|
694
|
+
status0 = sumstats[status].str.match(r'\w\w\w\w\w\w[0]', case=False, flags=0, na=False)
|
|
695
|
+
status1 = sumstats[status].str.match(r'\w\w\w\w\w\w[1]', case=False, flags=0, na=False)
|
|
696
|
+
status5 = sumstats[status].str.match(r'\w\w\w\w\w\w[5]', case=False, flags=0, na=False)
|
|
697
|
+
status7 = sumstats[status].str.match(r'\w\w\w\w\w\w[7]', case=False, flags=0, na=False)
|
|
698
|
+
status8 = sumstats[status].str.match(r'\w\w\w\w\w[123][8]', case=False, flags=0, na=False)
|
|
699
|
+
|
|
700
|
+
if verbose: log.write(" -Non-palindromic : ",sum(status0))
|
|
701
|
+
if verbose: log.write(" -Palindromic SNPs on + strand: ",sum(status1))
|
|
702
|
+
if verbose: log.write(" -Palindromic SNPs on - strand and needed to be flipped:",sum(status5))
|
|
703
|
+
if verbose: log.write(" -Palindromic SNPs with MAF not available to infer : ",sum(status7))
|
|
704
|
+
if verbose: log.write(" -Palindromic SNPs with no macthes or no information : ",sum(status8))
|
|
705
|
+
|
|
706
|
+
if ("7" in remove_snp) and ("8" in remove_snp) :
|
|
707
|
+
if verbose: log.write(" -Palindromic SNPs with maf not available to infer and with no macthes or no information will will be removed")
|
|
708
|
+
sumstats = sumstats.loc[~(status7 | status8),:].copy()
|
|
709
|
+
elif "8" in remove_snp:
|
|
710
|
+
if verbose: log.write(" -Palindromic SNPs with no macthes or no information will be removed")
|
|
711
|
+
sumstats = sumstats.loc[~status8,:].copy()
|
|
712
|
+
elif "7" in remove_snp:
|
|
713
|
+
if verbose: log.write(" -Palindromic SNPs with maf not available to infer will be removed")
|
|
714
|
+
sumstats = sumstats.loc[~status7,:].copy()
|
|
617
715
|
|
|
618
716
|
### unknow_indel
|
|
619
717
|
if "i" in mode:
|
|
@@ -623,14 +721,15 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
623
721
|
if verbose: log.write(" -Indistinguishable indels will be inferred from reference vcf ref and alt...")
|
|
624
722
|
#########################################################################################
|
|
625
723
|
#with maf can not infer
|
|
626
|
-
#maf_can_infer = (sumstats
|
|
724
|
+
#maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
|
|
627
725
|
#sumstats.loc[unknow_indel&(~maf_can_infer),status] = vchange_status(sumstats.loc[unknow_indel&(~maf_can_infer),status],7,"9","8")
|
|
628
726
|
if verbose: log.write(" -DAF tolerance: {}".format(daf_tolerance))
|
|
629
727
|
|
|
630
728
|
if sum(unknow_indel)>0:
|
|
631
729
|
if sum(unknow_indel)<10000:
|
|
632
730
|
n_cores=1
|
|
633
|
-
df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
|
|
731
|
+
#df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
|
|
732
|
+
df_split = _df_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
|
|
634
733
|
pool = Pool(n_cores)
|
|
635
734
|
map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
|
|
636
735
|
status_inferred = pd.concat(pool.map(map_func,df_split))
|
|
@@ -649,7 +748,8 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
649
748
|
if "8" in remove_indel:
|
|
650
749
|
if verbose: log.write(" -Indels with no macthes or no information will be removed")
|
|
651
750
|
sumstats = sumstats.loc[~status8,:].copy()
|
|
652
|
-
|
|
751
|
+
|
|
752
|
+
finished(log,verbose,_end_line)
|
|
653
753
|
return sumstats
|
|
654
754
|
|
|
655
755
|
|
|
@@ -673,22 +773,35 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
673
773
|
|
|
674
774
|
################################################################################################################
|
|
675
775
|
def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,column_name="DAF",suffix="",n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
776
|
+
##start function with col checking##########################################################
|
|
777
|
+
_start_line = "check the difference between EAF and reference VCF ALT frequency"
|
|
778
|
+
_end_line = "checking the difference between EAF and reference VCF ALT frequency"
|
|
779
|
+
_start_cols = [chr,pos,ref,alt,eaf,status]
|
|
780
|
+
_start_function = ".check_daf()"
|
|
781
|
+
_must_args ={"ref_alt_freq":ref_alt_freq}
|
|
782
|
+
|
|
783
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
784
|
+
log=log,
|
|
785
|
+
verbose=verbose,
|
|
786
|
+
start_line=_start_line,
|
|
787
|
+
end_line=_end_line,
|
|
788
|
+
start_cols=_start_cols,
|
|
789
|
+
start_function=_start_function,
|
|
790
|
+
n_cores=n_cores,
|
|
791
|
+
ref_vcf=ref_infer,
|
|
792
|
+
**_must_args)
|
|
793
|
+
if is_enough_info == False: return sumstats
|
|
794
|
+
############################################################################################
|
|
795
|
+
|
|
682
796
|
chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
|
|
683
797
|
|
|
684
798
|
column_name = column_name + suffix
|
|
685
|
-
# check if the columns are complete
|
|
686
|
-
if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
|
|
687
|
-
raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
|
|
688
799
|
|
|
800
|
+
|
|
801
|
+
|
|
689
802
|
# ref_alt_freq INFO in vcf was provided
|
|
690
803
|
if ref_alt_freq is not None:
|
|
691
|
-
|
|
804
|
+
log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
|
|
692
805
|
if not force:
|
|
693
806
|
good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
|
|
694
807
|
if verbose: log.write(" -Checking variants:", sum(good_chrpos))
|
|
@@ -697,7 +810,8 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
|
|
|
697
810
|
########################
|
|
698
811
|
if sum(~sumstats[eaf].isna())<10000:
|
|
699
812
|
n_cores=1
|
|
700
|
-
df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
|
|
813
|
+
#df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
|
|
814
|
+
df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
|
|
701
815
|
pool = Pool(n_cores)
|
|
702
816
|
if sum(~sumstats[eaf].isna())>0:
|
|
703
817
|
map_func = partial(checkaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,column_name=column_name,chr_dict=chr_dict)
|
|
@@ -708,13 +822,13 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
|
|
|
708
822
|
#status_inferred = sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]].apply(lambda x:check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict),axis=1)
|
|
709
823
|
|
|
710
824
|
#sumstats.loc[good_chrpos,"DAF"] = status_inferred.values
|
|
711
|
-
#sumstats
|
|
712
|
-
if verbose: log.write(" - {} max:".format(column_name), np.nanmax(sumstats
|
|
713
|
-
if verbose: log.write(" - {} min:".format(column_name), np.nanmin(sumstats
|
|
714
|
-
if verbose: log.write(" - {} sd:".format(column_name), np.nanstd(sumstats
|
|
715
|
-
if verbose: log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats
|
|
716
|
-
if verbose: log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats
|
|
717
|
-
if verbose: log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats
|
|
825
|
+
#sumstats["DAF"]=sumstats["DAF"].astype("float")
|
|
826
|
+
if verbose: log.write(" - {} max:".format(column_name), np.nanmax(sumstats[column_name]))
|
|
827
|
+
if verbose: log.write(" - {} min:".format(column_name), np.nanmin(sumstats[column_name]))
|
|
828
|
+
if verbose: log.write(" - {} sd:".format(column_name), np.nanstd(sumstats[column_name]))
|
|
829
|
+
if verbose: log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats[column_name])))
|
|
830
|
+
if verbose: log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats[column_name])))
|
|
831
|
+
if verbose: log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats[column_name])))
|
|
718
832
|
if verbose: log.write("Finished allele frequency checking!")
|
|
719
833
|
return sumstats
|
|
720
834
|
|
|
@@ -722,11 +836,11 @@ def checkaf(sumstats,ref_infer,ref_alt_freq=None,column_name="DAF",chr="CHR",pos
|
|
|
722
836
|
#vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
|
|
723
837
|
vcf_reader = VariantFile(ref_infer)
|
|
724
838
|
def afapply(x,vcf,alt_freq,chr_dict):
|
|
725
|
-
return check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict)
|
|
839
|
+
return check_daf(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,chr_dict)
|
|
726
840
|
map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
|
|
727
841
|
status_inferred = sumstats.apply(map_func,axis=1)
|
|
728
|
-
sumstats
|
|
729
|
-
sumstats
|
|
842
|
+
sumstats[column_name] = status_inferred.values
|
|
843
|
+
sumstats[column_name]=sumstats[column_name].astype("float")
|
|
730
844
|
return sumstats
|
|
731
845
|
|
|
732
846
|
def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
|
|
@@ -741,25 +855,35 @@ def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
|
|
|
741
855
|
################################################################################################################
|
|
742
856
|
|
|
743
857
|
def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
858
|
+
##start function with col checking##########################################################
|
|
859
|
+
_start_line = "infer EAF using reference VCF ALT frequency"
|
|
860
|
+
_end_line = "inferring EAF using reference VCF ALT frequency"
|
|
861
|
+
_start_cols = [chr,pos,ref,alt,eaf,status]
|
|
862
|
+
_start_function = ".infer_af()"
|
|
863
|
+
_must_args ={"ref_alt_freq":ref_alt_freq}
|
|
864
|
+
|
|
865
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
866
|
+
log=log,
|
|
867
|
+
verbose=verbose,
|
|
868
|
+
start_line=_start_line,
|
|
869
|
+
end_line=_end_line,
|
|
870
|
+
start_cols=_start_cols,
|
|
871
|
+
start_function=_start_function,
|
|
872
|
+
n_cores=n_cores,
|
|
873
|
+
ref_vcf=ref_infer,
|
|
874
|
+
**_must_args)
|
|
875
|
+
if is_enough_info == False: return sumstats
|
|
876
|
+
############################################################################################
|
|
750
877
|
chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
|
|
751
|
-
|
|
752
|
-
# check if the columns are complete
|
|
753
|
-
if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
|
|
754
|
-
raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
|
|
755
878
|
|
|
756
879
|
if eaf not in sumstats.columns:
|
|
757
880
|
sumstats[eaf]=np.nan
|
|
758
881
|
|
|
759
882
|
prenumber = sum(sumstats[eaf].isna())
|
|
883
|
+
|
|
760
884
|
# ref_alt_freq INFO in vcf was provided
|
|
761
885
|
if ref_alt_freq is not None:
|
|
762
|
-
|
|
886
|
+
log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
|
|
763
887
|
if not force:
|
|
764
888
|
good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
|
|
765
889
|
if verbose: log.write(" -Checking variants:", sum(good_chrpos))
|
|
@@ -767,7 +891,8 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
|
|
|
767
891
|
########################
|
|
768
892
|
if sum(sumstats[eaf].isna())<10000:
|
|
769
893
|
n_cores=1
|
|
770
|
-
df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
|
|
894
|
+
#df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
|
|
895
|
+
df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
|
|
771
896
|
pool = Pool(n_cores)
|
|
772
897
|
map_func = partial(inferaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
|
|
773
898
|
sumstats.loc[good_chrpos,[eaf]] = pd.concat(pool.map(map_func,df_split))
|
|
@@ -778,18 +903,19 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
|
|
|
778
903
|
afternumber = sum(sumstats[eaf].isna())
|
|
779
904
|
if verbose: log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber))
|
|
780
905
|
if verbose: log.write(" -EAF is still missing for {} variants.".format(afternumber))
|
|
781
|
-
|
|
906
|
+
|
|
907
|
+
finished(log,verbose,_end_line)
|
|
782
908
|
return sumstats
|
|
783
909
|
|
|
784
910
|
def inferaf(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
|
|
785
911
|
#vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
|
|
786
912
|
vcf_reader = VariantFile(ref_infer)
|
|
787
913
|
def afapply(x,vcf,alt_freq,chr_dict):
|
|
788
|
-
return infer_af(x[0],x[1]-1,x[1],x[2],x[3],vcf_reader,ref_alt_freq,chr_dict)
|
|
914
|
+
return infer_af(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],vcf_reader,ref_alt_freq,chr_dict)
|
|
789
915
|
map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
|
|
790
916
|
status_inferred = sumstats.apply(map_func,axis=1)
|
|
791
|
-
sumstats
|
|
792
|
-
sumstats
|
|
917
|
+
sumstats[eaf] = status_inferred.values
|
|
918
|
+
sumstats[eaf]=sumstats[eaf].astype("float")
|
|
793
919
|
return sumstats
|
|
794
920
|
|
|
795
921
|
def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
|
|
@@ -827,4 +953,6 @@ def check_vcf_chr_prefix(vcf_bcf_path):
|
|
|
827
953
|
if m is not None:
|
|
828
954
|
return m.group(1)
|
|
829
955
|
else:
|
|
830
|
-
return None
|
|
956
|
+
return None
|
|
957
|
+
|
|
958
|
+
|