gwaslab 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/__init__.py +1 -1
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +8 -0
- gwaslab/g_Sumstats.py +80 -178
- gwaslab/g_SumstatsPair.py +6 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_meta.py +13 -3
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +29 -15
- gwaslab/hm_harmonize_sumstats.py +312 -159
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +46 -37
- gwaslab/io_to_formats.py +428 -295
- gwaslab/qc_check_datatype.py +15 -1
- gwaslab/qc_fix_sumstats.py +956 -719
- gwaslab/util_ex_calculate_ldmatrix.py +29 -11
- gwaslab/util_ex_gwascatalog.py +1 -1
- gwaslab/util_ex_ldproxyfinder.py +1 -1
- gwaslab/util_ex_process_h5.py +26 -17
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_convert_h2.py +1 -1
- gwaslab/util_in_fill_data.py +44 -5
- gwaslab/util_in_filter_value.py +122 -34
- gwaslab/util_in_get_density.py +2 -2
- gwaslab/util_in_get_sig.py +41 -9
- gwaslab/viz_aux_quickfix.py +26 -21
- gwaslab/viz_aux_reposition_text.py +7 -4
- gwaslab/viz_aux_save_figure.py +6 -5
- gwaslab/viz_plot_compare_af.py +5 -5
- gwaslab/viz_plot_compare_effect.py +22 -5
- gwaslab/viz_plot_miamiplot2.py +28 -20
- gwaslab/viz_plot_mqqplot.py +214 -98
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +16 -9
- gwaslab/viz_plot_trumpetplot.py +15 -6
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/METADATA +3 -3
- gwaslab-3.4.38.dist-info/RECORD +72 -0
- gwaslab-3.4.36.dist-info/RECORD +0 -72
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
gwaslab/hm_harmonize_sumstats.py
CHANGED
|
@@ -11,11 +11,19 @@ import gc
|
|
|
11
11
|
from gwaslab.g_Log import Log
|
|
12
12
|
from gwaslab.qc_fix_sumstats import fixchr
|
|
13
13
|
from gwaslab.qc_fix_sumstats import fixpos
|
|
14
|
+
from gwaslab.qc_fix_sumstats import sortcolumn
|
|
15
|
+
from gwaslab.qc_fix_sumstats import _df_split
|
|
16
|
+
from gwaslab.qc_fix_sumstats import check_col
|
|
17
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
18
|
+
from gwaslab.qc_fix_sumstats import finished
|
|
19
|
+
from gwaslab.qc_fix_sumstats import skipped
|
|
20
|
+
from gwaslab.qc_check_datatype import check_dataframe_shape
|
|
14
21
|
from gwaslab.bd_common_data import get_number_to_chr
|
|
15
22
|
from gwaslab.bd_common_data import get_chr_list
|
|
16
23
|
from gwaslab.bd_common_data import get_chr_to_number
|
|
17
24
|
from gwaslab.g_vchange_status import vchange_status
|
|
18
25
|
from gwaslab.g_version import _get_version
|
|
26
|
+
|
|
19
27
|
#rsidtochrpos
|
|
20
28
|
#checkref
|
|
21
29
|
#parallelizeassignrsid
|
|
@@ -27,17 +35,35 @@ from gwaslab.g_version import _get_version
|
|
|
27
35
|
|
|
28
36
|
###~!!!!
|
|
29
37
|
def rsidtochrpos(sumstats,
|
|
30
|
-
path=
|
|
38
|
+
path=None, ref_rsid_to_chrpos_tsv=None, snpid="SNPID",
|
|
31
39
|
rsid="rsID", chrom="CHR",pos="POS",ref_rsid="rsID",ref_chr="CHR",ref_pos="POS", build="19",
|
|
32
40
|
overwrite=False,remove=False,chunksize=5000000,verbose=True,log=Log()):
|
|
33
41
|
'''
|
|
34
42
|
assign chr:pos based on rsID
|
|
35
43
|
'''
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
44
|
+
##start function with col checking##########################################################
|
|
45
|
+
_start_line = "assign CHR and POS using rsIDs"
|
|
46
|
+
_end_line = "assigning CHR and POS using rsIDs"
|
|
47
|
+
_start_cols = [rsid,chrom,pos]
|
|
48
|
+
_start_function = ".rsid_to_chrpos()"
|
|
49
|
+
_must_args ={}
|
|
50
|
+
|
|
51
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
52
|
+
log=log,
|
|
53
|
+
verbose=verbose,
|
|
54
|
+
start_line=_start_line,
|
|
55
|
+
end_line=_end_line,
|
|
56
|
+
start_cols=_start_cols,
|
|
57
|
+
start_function=_start_function,
|
|
58
|
+
**_must_args)
|
|
59
|
+
if is_enough_info == False: return sumstats
|
|
60
|
+
############################################################################################
|
|
61
|
+
|
|
39
62
|
if verbose: log.write(" -rsID dictionary file: "+ path)
|
|
40
63
|
|
|
64
|
+
if ref_rsid_to_chrpos_tsv is not None:
|
|
65
|
+
path = ref_rsid_to_chrpos_tsv
|
|
66
|
+
|
|
41
67
|
if snpid in sumstats.columns and sum(sumstats[rsid].isna())>0:
|
|
42
68
|
if verbose: log.write(" -Filling na in rsID columns with SNPID...")
|
|
43
69
|
sumstats.loc[sumstats[rsid].isna(),rsid] = sumstats.loc[sumstats[rsid].isna(),snpid]
|
|
@@ -75,6 +101,9 @@ def rsidtochrpos(sumstats,
|
|
|
75
101
|
if verbose: log.write(" -Updating CHR and POS finished.Start to re-fixing CHR and POS... ")
|
|
76
102
|
sumstats = fixchr(sumstats,verbose=verbose)
|
|
77
103
|
sumstats = fixpos(sumstats,verbose=verbose)
|
|
104
|
+
sumstats = sortcolumn(sumstats,verbose=verbose)
|
|
105
|
+
|
|
106
|
+
finished(log,verbose,_end_line)
|
|
78
107
|
return sumstats
|
|
79
108
|
####################################################################################################
|
|
80
109
|
|
|
@@ -96,9 +125,34 @@ def merge_chrpos(sumstats_part,all_groups_max,path,build,status):
|
|
|
96
125
|
return sumstats_part
|
|
97
126
|
|
|
98
127
|
|
|
99
|
-
def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None,build="99",status="STATUS",
|
|
128
|
+
def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None, ref_rsid_to_chrpos_vcf = None, ref_rsid_to_chrpos_hdf5 = None, build="99",status="STATUS",
|
|
100
129
|
n_cores=4,block_size=20000000,verbose=True,log=Log()):
|
|
101
|
-
|
|
130
|
+
|
|
131
|
+
##start function with col checking##########################################################
|
|
132
|
+
_start_line = "assign CHR and POS using rsIDs"
|
|
133
|
+
_end_line = "assigning CHR and POS using rsIDs"
|
|
134
|
+
_start_cols = [rsid,chrom,pos]
|
|
135
|
+
_start_function = ".rsid_to_chrpos2()"
|
|
136
|
+
_must_args ={}
|
|
137
|
+
|
|
138
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
139
|
+
log=log,
|
|
140
|
+
verbose=verbose,
|
|
141
|
+
start_line=_start_line,
|
|
142
|
+
end_line=_end_line,
|
|
143
|
+
start_cols=_start_cols,
|
|
144
|
+
start_function=_start_function,
|
|
145
|
+
**_must_args)
|
|
146
|
+
if is_enough_info == False: return sumstats
|
|
147
|
+
############################################################################################
|
|
148
|
+
|
|
149
|
+
if ref_rsid_to_chrpos_hdf5 is not None:
|
|
150
|
+
path = ref_rsid_to_chrpos_hdf5
|
|
151
|
+
elif ref_rsid_to_chrpos_vcf is not None:
|
|
152
|
+
vcf_file_name = os.path.basename(ref_rsid_to_chrpos_vcf)
|
|
153
|
+
vcf_dir_path = os.path.dirname(ref_rsid_to_chrpos_vcf)
|
|
154
|
+
path = "{}/{}.rsID_CHR_POS_groups_{}.h5".format(vcf_dir_path,vcf_file_name,int(block_size))
|
|
155
|
+
|
|
102
156
|
if path is None:
|
|
103
157
|
raise ValueError("Please provide path to hdf5 file.")
|
|
104
158
|
|
|
@@ -164,17 +218,20 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
|
|
|
164
218
|
# merge back
|
|
165
219
|
if verbose: log.write(" -Append data... ")
|
|
166
220
|
sumstats = pd.concat([sumstats_rs,sumstats_nonrs],ignore_index=True)
|
|
221
|
+
|
|
167
222
|
del sumstats_rs
|
|
168
223
|
del sumstats_nonrs
|
|
169
224
|
gc.collect()
|
|
170
225
|
|
|
171
226
|
# check
|
|
172
|
-
sumstats = fixchr(sumstats,verbose=
|
|
173
|
-
sumstats = fixpos(sumstats,verbose=
|
|
227
|
+
sumstats = fixchr(sumstats,verbose=verbose)
|
|
228
|
+
sumstats = fixpos(sumstats,verbose=verbose)
|
|
229
|
+
sumstats = sortcolumn(sumstats,verbose=verbose)
|
|
230
|
+
|
|
174
231
|
pool.close()
|
|
175
232
|
pool.join()
|
|
176
|
-
|
|
177
|
-
|
|
233
|
+
|
|
234
|
+
finished(log, verbose, _end_line)
|
|
178
235
|
return sumstats
|
|
179
236
|
####################################################################################################################
|
|
180
237
|
#20220426 check if non-effect allele is aligned with reference genome
|
|
@@ -192,15 +249,15 @@ def check_status(row,record):
|
|
|
192
249
|
#8 / -----> not on ref genome
|
|
193
250
|
#9 / ------> unchecked
|
|
194
251
|
|
|
195
|
-
status_pre=row[3][:5]
|
|
196
|
-
status_end=row[3][6:]
|
|
252
|
+
status_pre=row.iloc[3][:5]
|
|
253
|
+
status_end=row.iloc[3][6:]
|
|
197
254
|
|
|
198
255
|
## nea == ref
|
|
199
|
-
if row[2] == record[row[0]-1: row[0]+len(row[2])-1].seq.upper():
|
|
256
|
+
if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
|
|
200
257
|
## ea == ref
|
|
201
|
-
if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
|
|
258
|
+
if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
|
|
202
259
|
## len(nea) >len(ea):
|
|
203
|
-
if len(row[2])!=len(row[1]):
|
|
260
|
+
if len(row.iloc[2])!=len(row.iloc[1]):
|
|
204
261
|
# indels both on ref, unable to identify
|
|
205
262
|
return status_pre+"6"+status_end
|
|
206
263
|
else:
|
|
@@ -209,34 +266,49 @@ def check_status(row,record):
|
|
|
209
266
|
## nea!=ref
|
|
210
267
|
else:
|
|
211
268
|
# ea == ref_seq -> need to flip
|
|
212
|
-
if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
|
|
269
|
+
if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
|
|
213
270
|
return status_pre+"3"+status_end
|
|
214
271
|
# ea !=ref
|
|
215
272
|
else:
|
|
216
273
|
#_reverse_complementary
|
|
217
|
-
row[1] = get_reverse_complementary_allele(row[1])
|
|
218
|
-
row[2] = get_reverse_complementary_allele(row[2])
|
|
274
|
+
row.iloc[1] = get_reverse_complementary_allele(row.iloc[1])
|
|
275
|
+
row.iloc[2] = get_reverse_complementary_allele(row.iloc[2])
|
|
219
276
|
## nea == ref
|
|
220
|
-
if row[2] == record[row[0]-1: row[0]+len(row[2])-1].seq.upper():
|
|
277
|
+
if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
|
|
221
278
|
## ea == ref
|
|
222
|
-
if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
|
|
279
|
+
if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
|
|
223
280
|
## len(nea) >len(ea):
|
|
224
|
-
if len(row[2])!=len(row[1]):
|
|
281
|
+
if len(row.iloc[2])!=len(row.iloc[1]):
|
|
225
282
|
return status_pre+"8"+status_end # indel reverse complementary
|
|
226
283
|
else:
|
|
227
284
|
return status_pre+"4"+status_end
|
|
228
285
|
else:
|
|
229
286
|
# ea == ref_seq -> need to flip
|
|
230
|
-
if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
|
|
287
|
+
if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
|
|
231
288
|
return status_pre+"5"+status_end
|
|
232
289
|
# ea !=ref
|
|
233
290
|
return status_pre+"8"+status_end
|
|
234
291
|
|
|
235
292
|
|
|
236
293
|
def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="STATUS",chr_dict=get_chr_to_number(),remove=False,verbose=True,log=Log()):
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
294
|
+
##start function with col checking##########################################################
|
|
295
|
+
_start_line = "check if NEA is aligned with reference sequence"
|
|
296
|
+
_end_line = "checking if NEA is aligned with reference sequence"
|
|
297
|
+
_start_cols = [chrom,pos,ea,nea,status]
|
|
298
|
+
_start_function = ".check_ref()"
|
|
299
|
+
_must_args ={}
|
|
300
|
+
|
|
301
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
302
|
+
log=log,
|
|
303
|
+
verbose=verbose,
|
|
304
|
+
start_line=_start_line,
|
|
305
|
+
end_line=_end_line,
|
|
306
|
+
start_cols=_start_cols,
|
|
307
|
+
start_function=_start_function,
|
|
308
|
+
**_must_args)
|
|
309
|
+
if is_enough_info == False: return sumstats
|
|
310
|
+
############################################################################################
|
|
311
|
+
if verbose: log.write(" -Reference genome FASTA file: "+ ref_path)
|
|
240
312
|
if verbose: log.write(" -Checking records: ", end="")
|
|
241
313
|
chromlist = get_chr_list(add_number=True)
|
|
242
314
|
records = SeqIO.parse(ref_path, "fasta")
|
|
@@ -255,7 +327,7 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
|
|
|
255
327
|
|
|
256
328
|
if verbose: log.write("\n",end="",show_time=False)
|
|
257
329
|
|
|
258
|
-
sumstats
|
|
330
|
+
sumstats[status] = sumstats[status].astype("string")
|
|
259
331
|
available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
|
|
260
332
|
status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
|
|
261
333
|
status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
|
|
@@ -271,7 +343,7 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
|
|
|
271
343
|
flip_rate = status_3/available_to_check
|
|
272
344
|
if verbose: log.write(" -Raw Matching rate : ","{:.2f}%".format(raw_matching_rate*100))
|
|
273
345
|
if raw_matching_rate <0.8:
|
|
274
|
-
if verbose: log.
|
|
346
|
+
if verbose: log.warning("Matching rate is low, please check if the right reference genome is used.")
|
|
275
347
|
if flip_rate > 0.85 :
|
|
276
348
|
if verbose: log.write(" -Flipping variants rate > 0.85, it is likely that the EA is aligned with REF in the original dataset.")
|
|
277
349
|
|
|
@@ -284,7 +356,8 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
|
|
|
284
356
|
if remove is True:
|
|
285
357
|
sumstats = sumstats.loc[~sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w"),:]
|
|
286
358
|
if verbose: log.write(" -Variants not on given reference sequence were removed.")
|
|
287
|
-
|
|
359
|
+
|
|
360
|
+
finished(log, verbose, _end_line)
|
|
288
361
|
return sumstats
|
|
289
362
|
|
|
290
363
|
#######################################################################################################################################
|
|
@@ -314,7 +387,7 @@ def assign_rsid_single(sumstats,path,rsid="rsID",chr="CHR",pos="POS",ref="NEA",a
|
|
|
314
387
|
## single df assignment
|
|
315
388
|
vcf_reader = VariantFile(path)
|
|
316
389
|
def rsid_helper(x,vcf_reader,chr_dict):
|
|
317
|
-
return chrposref_rsid(x[0],x[1],x[2],x[3],vcf_reader,chr_dict)
|
|
390
|
+
return chrposref_rsid(x.iloc[0],x.iloc[1],x.iloc[2],x.iloc[3],vcf_reader,chr_dict)
|
|
318
391
|
map_func=partial(rsid_helper,vcf_reader=vcf_reader,chr_dict=chr_dict)
|
|
319
392
|
rsID = sumstats.apply(map_func,axis=1)
|
|
320
393
|
return rsID
|
|
@@ -327,19 +400,31 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
327
400
|
all , overwrite rsid for all availalbe rsid
|
|
328
401
|
invalid, only assign rsid for variants with invalid rsid
|
|
329
402
|
empty only assign rsid for variants with na rsid
|
|
330
|
-
'''
|
|
403
|
+
'''
|
|
404
|
+
|
|
331
405
|
if ref_mode=="vcf":
|
|
332
406
|
###################################################################################################################
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
407
|
+
##start function with col checking##########################################################
|
|
408
|
+
_start_line = "assign rsID using reference VCF"
|
|
409
|
+
_end_line = "assign rsID using reference file"
|
|
410
|
+
_start_cols = [chr,pos,ref,alt,status]
|
|
411
|
+
_start_function = ".assign_rsid()"
|
|
412
|
+
_must_args ={}
|
|
413
|
+
|
|
414
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
415
|
+
log=log,
|
|
416
|
+
verbose=verbose,
|
|
417
|
+
start_line=_start_line,
|
|
418
|
+
end_line=_end_line,
|
|
419
|
+
start_cols=_start_cols,
|
|
420
|
+
start_function=_start_function,
|
|
421
|
+
n_cores=n_cores,
|
|
422
|
+
ref_vcf=path,
|
|
423
|
+
**_must_args)
|
|
424
|
+
if is_enough_info == False: return sumstats
|
|
425
|
+
############################################################################################
|
|
338
426
|
chr_dict = auto_check_vcf_chr_dict(path, chr_dict, verbose, log)
|
|
339
|
-
|
|
340
|
-
if verbose: log.write(" -Assigning rsID based on chr:pos and ref:alt/alt:ref...")
|
|
341
|
-
|
|
342
|
-
|
|
427
|
+
if verbose: log.write(" -Assigning rsID based on CHR:POS and REF:ALT/ALT:REF...")
|
|
343
428
|
##############################################
|
|
344
429
|
if rsid not in sumstats.columns:
|
|
345
430
|
sumstats[rsid]=pd.Series(dtype="string")
|
|
@@ -361,7 +446,8 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
361
446
|
|
|
362
447
|
if sum(to_assign)>0:
|
|
363
448
|
if sum(to_assign)<10000: n_cores=1
|
|
364
|
-
df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
|
|
449
|
+
#df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
|
|
450
|
+
df_split = _df_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
|
|
365
451
|
pool = Pool(n_cores)
|
|
366
452
|
map_func = partial(assign_rsid_single,path=path,chr=chr,pos=pos,ref=ref,alt=alt,chr_dict=chr_dict)
|
|
367
453
|
assigned_rsid = pd.concat(pool.map(map_func,df_split))
|
|
@@ -380,9 +466,25 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
380
466
|
'''
|
|
381
467
|
assign rsID based on chr:pos
|
|
382
468
|
'''
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
469
|
+
##start function with col checking##########################################################
|
|
470
|
+
_start_line = "assign rsID by matching SNPID with CHR:POS:REF:ALT in the reference TSV"
|
|
471
|
+
_end_line = "assign rsID using reference file"
|
|
472
|
+
_start_cols = [snpid,status]
|
|
473
|
+
_start_function = ".assign_rsid()"
|
|
474
|
+
_must_args ={}
|
|
475
|
+
|
|
476
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
477
|
+
log=log,
|
|
478
|
+
verbose=verbose,
|
|
479
|
+
start_line=_start_line,
|
|
480
|
+
end_line=_end_line,
|
|
481
|
+
start_cols=_start_cols,
|
|
482
|
+
start_function=_start_function,
|
|
483
|
+
n_cores=n_cores,
|
|
484
|
+
ref_tsv=path,
|
|
485
|
+
**_must_args)
|
|
486
|
+
if is_enough_info == False: return sumstats
|
|
487
|
+
############################################################################################
|
|
386
488
|
|
|
387
489
|
standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234][0126]\w", case=False, flags=0, na=False)
|
|
388
490
|
|
|
@@ -390,11 +492,12 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
390
492
|
sumstats[rsid]=pd.Series(dtype="string")
|
|
391
493
|
|
|
392
494
|
if overwrite == "empty":
|
|
393
|
-
to_assign = sumstats[rsid].isna()
|
|
495
|
+
to_assign = sumstats[rsid].isna() & standardized_normalized
|
|
394
496
|
if overwrite=="all":
|
|
395
497
|
to_assign = standardized_normalized
|
|
396
498
|
if overwrite=="invalid":
|
|
397
499
|
to_assign = (~sumstats[rsid].str.match(r'rs([0-9]+)', case=False, flags=0, na=False)) & standardized_normalized
|
|
500
|
+
|
|
398
501
|
total_number= len(sumstats)
|
|
399
502
|
pre_number = sum(~sumstats[rsid].isna())
|
|
400
503
|
if verbose: log.write(" -"+str(sum(to_assign)) +" rsID could be possibly fixed...")
|
|
@@ -419,12 +522,13 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
419
522
|
sumstats = sumstats.rename(columns = {'index':snpid})
|
|
420
523
|
|
|
421
524
|
after_number = sum(~sumstats[rsid].isna())
|
|
422
|
-
if verbose: log.write(" -rsID
|
|
525
|
+
if verbose: log.write(" -rsID annotation for "+str(total_number - after_number) +" needed to be fixed!")
|
|
423
526
|
if verbose: log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!")
|
|
424
527
|
else:
|
|
425
|
-
if verbose: log.write(" -No rsID
|
|
528
|
+
if verbose: log.write(" -No rsID can be fixed...skipping...")
|
|
426
529
|
################################################################################################################
|
|
427
|
-
|
|
530
|
+
|
|
531
|
+
finished(log,verbose,_end_line)
|
|
428
532
|
return sumstats
|
|
429
533
|
#################################################################################################################################################
|
|
430
534
|
#single record assignment
|
|
@@ -503,12 +607,12 @@ def is_palindromic(sumstats,a1="EA",a2="NEA"):
|
|
|
503
607
|
|
|
504
608
|
def check_strand(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS"):
|
|
505
609
|
vcf_reader = VariantFile(ref_infer)
|
|
506
|
-
status_part = sumstats.apply(lambda x:check_strand_status(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,x[5],chr_dict),axis=1)
|
|
610
|
+
status_part = sumstats.apply(lambda x:check_strand_status(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict),axis=1)
|
|
507
611
|
return status_part
|
|
508
612
|
|
|
509
613
|
def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS",daf_tolerance=0.2):
|
|
510
614
|
vcf_reader = VariantFile(ref_infer)
|
|
511
|
-
status_part = sumstats.apply(lambda x:check_unkonwn_indel(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,x[5],chr_dict,daf_tolerance),axis=1)
|
|
615
|
+
status_part = sumstats.apply(lambda x:check_unkonwn_indel(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict,daf_tolerance),axis=1)
|
|
512
616
|
return status_part
|
|
513
617
|
|
|
514
618
|
##################################################################################################################################################
|
|
@@ -516,79 +620,98 @@ def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NE
|
|
|
516
620
|
def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,daf_tolerance=0.20,remove_snp="",mode="pi",n_cores=1,remove_indel="",
|
|
517
621
|
chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
|
|
518
622
|
chr_dict=None,verbose=True,log=Log()):
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
623
|
+
##start function with col checking##########################################################
|
|
624
|
+
_start_line = "infer strand for palindromic SNPs/align indistinguishable indels"
|
|
625
|
+
_end_line = "inferring strand for palindromic SNPs/align indistinguishable indels"
|
|
626
|
+
_start_cols = [chr,pos,ref,alt,eaf,status]
|
|
627
|
+
_start_function = ".infer_strand()"
|
|
628
|
+
_must_args ={"ref_alt_freq":ref_alt_freq}
|
|
629
|
+
|
|
630
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
631
|
+
log=log,
|
|
632
|
+
verbose=verbose,
|
|
633
|
+
start_line=_start_line,
|
|
634
|
+
end_line=_end_line,
|
|
635
|
+
start_cols=_start_cols,
|
|
636
|
+
start_function=_start_function,
|
|
637
|
+
n_cores=n_cores,
|
|
638
|
+
ref_vcf=ref_infer,
|
|
639
|
+
**_must_args)
|
|
640
|
+
if is_enough_info == False: return sumstats
|
|
641
|
+
############################################################################################
|
|
522
642
|
|
|
523
643
|
chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
|
|
644
|
+
|
|
645
|
+
log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
|
|
524
646
|
|
|
525
|
-
# check if the columns are complete
|
|
526
|
-
if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
|
|
527
|
-
raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
|
|
528
647
|
if "p" in mode:
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
pool
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
648
|
+
## checking \w\w\w\w[0]\w\w -> standardized and normalized snp
|
|
649
|
+
good_chrpos = sumstats[status].str.match(r'\w\w\w[0][0]\w\w', case=False, flags=0, na=False)
|
|
650
|
+
palindromic = good_chrpos & is_palindromic(sumstats[[ref,alt]],a1=ref,a2=alt)
|
|
651
|
+
not_palindromic_snp = good_chrpos & (~palindromic)
|
|
652
|
+
|
|
653
|
+
##not palindromic : change status
|
|
654
|
+
sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
|
|
655
|
+
if verbose: log.write(" -Identified ", sum(palindromic)," palindromic SNPs...")
|
|
656
|
+
|
|
657
|
+
#palindromic but can not infer
|
|
658
|
+
maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
|
|
659
|
+
|
|
660
|
+
sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
|
|
661
|
+
|
|
662
|
+
#palindromic WITH UNKNWON OR UNCHECKED STATUS
|
|
663
|
+
unknow_palindromic = sumstats[status].str.match(r'\w\w\w\w\w[012][89]', case=False, flags=0, na=False)
|
|
664
|
+
|
|
665
|
+
unknow_palindromic_to_check = palindromic & maf_can_infer & unknow_palindromic
|
|
666
|
+
|
|
667
|
+
if verbose: log.write(" -After filtering by MAF< {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknow_palindromic_to_check)))
|
|
668
|
+
|
|
669
|
+
#########################################################################################
|
|
670
|
+
if sum(unknow_palindromic_to_check)>0:
|
|
671
|
+
if sum(unknow_palindromic_to_check)<10000:
|
|
672
|
+
n_cores=1
|
|
673
|
+
|
|
674
|
+
#df_split = np.array_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
|
|
675
|
+
df_split = _df_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
|
|
676
|
+
pool = Pool(n_cores)
|
|
677
|
+
map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
|
|
678
|
+
status_inferred = pd.concat(pool.map(map_func,df_split))
|
|
679
|
+
sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
|
|
680
|
+
pool.close()
|
|
681
|
+
pool.join()
|
|
682
|
+
#########################################################################################
|
|
683
|
+
#0 Not palindromic SNPs
|
|
684
|
+
#1 Palindromic +strand -> no need to flip
|
|
685
|
+
#2 palindromic -strand -> need to flip -> fixed
|
|
686
|
+
#3 Indel no need flip
|
|
687
|
+
#4 Unknown Indel -> fixed
|
|
688
|
+
#5 Palindromic -strand -> need to flip
|
|
689
|
+
#6 Indel need flip
|
|
690
|
+
#7 indistinguishable
|
|
691
|
+
#8 Not matching or No information
|
|
692
|
+
#9 Unchecked
|
|
693
|
+
|
|
694
|
+
status0 = sumstats[status].str.match(r'\w\w\w\w\w\w[0]', case=False, flags=0, na=False)
|
|
695
|
+
status1 = sumstats[status].str.match(r'\w\w\w\w\w\w[1]', case=False, flags=0, na=False)
|
|
696
|
+
status5 = sumstats[status].str.match(r'\w\w\w\w\w\w[5]', case=False, flags=0, na=False)
|
|
697
|
+
status7 = sumstats[status].str.match(r'\w\w\w\w\w\w[7]', case=False, flags=0, na=False)
|
|
698
|
+
status8 = sumstats[status].str.match(r'\w\w\w\w\w[123][8]', case=False, flags=0, na=False)
|
|
699
|
+
|
|
700
|
+
if verbose: log.write(" -Non-palindromic : ",sum(status0))
|
|
701
|
+
if verbose: log.write(" -Palindromic SNPs on + strand: ",sum(status1))
|
|
702
|
+
if verbose: log.write(" -Palindromic SNPs on - strand and needed to be flipped:",sum(status5))
|
|
703
|
+
if verbose: log.write(" -Palindromic SNPs with MAF not available to infer : ",sum(status7))
|
|
704
|
+
if verbose: log.write(" -Palindromic SNPs with no macthes or no information : ",sum(status8))
|
|
705
|
+
|
|
706
|
+
if ("7" in remove_snp) and ("8" in remove_snp) :
|
|
707
|
+
if verbose: log.write(" -Palindromic SNPs with maf not available to infer and with no macthes or no information will will be removed")
|
|
708
|
+
sumstats = sumstats.loc[~(status7 | status8),:].copy()
|
|
709
|
+
elif "8" in remove_snp:
|
|
710
|
+
if verbose: log.write(" -Palindromic SNPs with no macthes or no information will be removed")
|
|
711
|
+
sumstats = sumstats.loc[~status8,:].copy()
|
|
712
|
+
elif "7" in remove_snp:
|
|
713
|
+
if verbose: log.write(" -Palindromic SNPs with maf not available to infer will be removed")
|
|
714
|
+
sumstats = sumstats.loc[~status7,:].copy()
|
|
592
715
|
|
|
593
716
|
### unknow_indel
|
|
594
717
|
if "i" in mode:
|
|
@@ -598,14 +721,15 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
598
721
|
if verbose: log.write(" -Indistinguishable indels will be inferred from reference vcf ref and alt...")
|
|
599
722
|
#########################################################################################
|
|
600
723
|
#with maf can not infer
|
|
601
|
-
#maf_can_infer = (sumstats
|
|
724
|
+
#maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
|
|
602
725
|
#sumstats.loc[unknow_indel&(~maf_can_infer),status] = vchange_status(sumstats.loc[unknow_indel&(~maf_can_infer),status],7,"9","8")
|
|
603
726
|
if verbose: log.write(" -DAF tolerance: {}".format(daf_tolerance))
|
|
604
727
|
|
|
605
728
|
if sum(unknow_indel)>0:
|
|
606
729
|
if sum(unknow_indel)<10000:
|
|
607
730
|
n_cores=1
|
|
608
|
-
df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
|
|
731
|
+
#df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
|
|
732
|
+
df_split = _df_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
|
|
609
733
|
pool = Pool(n_cores)
|
|
610
734
|
map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
|
|
611
735
|
status_inferred = pd.concat(pool.map(map_func,df_split))
|
|
@@ -624,7 +748,8 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
624
748
|
if "8" in remove_indel:
|
|
625
749
|
if verbose: log.write(" -Indels with no macthes or no information will be removed")
|
|
626
750
|
sumstats = sumstats.loc[~status8,:].copy()
|
|
627
|
-
|
|
751
|
+
|
|
752
|
+
finished(log,verbose,_end_line)
|
|
628
753
|
return sumstats
|
|
629
754
|
|
|
630
755
|
|
|
@@ -648,22 +773,35 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
648
773
|
|
|
649
774
|
################################################################################################################
|
|
650
775
|
def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,column_name="DAF",suffix="",n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
776
|
+
##start function with col checking##########################################################
|
|
777
|
+
_start_line = "check the difference between EAF and reference VCF ALT frequency"
|
|
778
|
+
_end_line = "checking the difference between EAF and reference VCF ALT frequency"
|
|
779
|
+
_start_cols = [chr,pos,ref,alt,eaf,status]
|
|
780
|
+
_start_function = ".check_daf()"
|
|
781
|
+
_must_args ={"ref_alt_freq":ref_alt_freq}
|
|
782
|
+
|
|
783
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
784
|
+
log=log,
|
|
785
|
+
verbose=verbose,
|
|
786
|
+
start_line=_start_line,
|
|
787
|
+
end_line=_end_line,
|
|
788
|
+
start_cols=_start_cols,
|
|
789
|
+
start_function=_start_function,
|
|
790
|
+
n_cores=n_cores,
|
|
791
|
+
ref_vcf=ref_infer,
|
|
792
|
+
**_must_args)
|
|
793
|
+
if is_enough_info == False: return sumstats
|
|
794
|
+
############################################################################################
|
|
795
|
+
|
|
657
796
|
chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
|
|
658
797
|
|
|
659
798
|
column_name = column_name + suffix
|
|
660
|
-
# check if the columns are complete
|
|
661
|
-
if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
|
|
662
|
-
raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
|
|
663
799
|
|
|
800
|
+
|
|
801
|
+
|
|
664
802
|
# ref_alt_freq INFO in vcf was provided
|
|
665
803
|
if ref_alt_freq is not None:
|
|
666
|
-
|
|
804
|
+
log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
|
|
667
805
|
if not force:
|
|
668
806
|
good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
|
|
669
807
|
if verbose: log.write(" -Checking variants:", sum(good_chrpos))
|
|
@@ -672,7 +810,8 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
|
|
|
672
810
|
########################
|
|
673
811
|
if sum(~sumstats[eaf].isna())<10000:
|
|
674
812
|
n_cores=1
|
|
675
|
-
df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
|
|
813
|
+
#df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
|
|
814
|
+
df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
|
|
676
815
|
pool = Pool(n_cores)
|
|
677
816
|
if sum(~sumstats[eaf].isna())>0:
|
|
678
817
|
map_func = partial(checkaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,column_name=column_name,chr_dict=chr_dict)
|
|
@@ -683,13 +822,13 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
|
|
|
683
822
|
#status_inferred = sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]].apply(lambda x:check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict),axis=1)
|
|
684
823
|
|
|
685
824
|
#sumstats.loc[good_chrpos,"DAF"] = status_inferred.values
|
|
686
|
-
#sumstats
|
|
687
|
-
if verbose: log.write(" - {} max:".format(column_name), np.nanmax(sumstats
|
|
688
|
-
if verbose: log.write(" - {} min:".format(column_name), np.nanmin(sumstats
|
|
689
|
-
if verbose: log.write(" - {} sd:".format(column_name), np.nanstd(sumstats
|
|
690
|
-
if verbose: log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats
|
|
691
|
-
if verbose: log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats
|
|
692
|
-
if verbose: log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats
|
|
825
|
+
#sumstats["DAF"]=sumstats["DAF"].astype("float")
|
|
826
|
+
if verbose: log.write(" - {} max:".format(column_name), np.nanmax(sumstats[column_name]))
|
|
827
|
+
if verbose: log.write(" - {} min:".format(column_name), np.nanmin(sumstats[column_name]))
|
|
828
|
+
if verbose: log.write(" - {} sd:".format(column_name), np.nanstd(sumstats[column_name]))
|
|
829
|
+
if verbose: log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats[column_name])))
|
|
830
|
+
if verbose: log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats[column_name])))
|
|
831
|
+
if verbose: log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats[column_name])))
|
|
693
832
|
if verbose: log.write("Finished allele frequency checking!")
|
|
694
833
|
return sumstats
|
|
695
834
|
|
|
@@ -697,11 +836,11 @@ def checkaf(sumstats,ref_infer,ref_alt_freq=None,column_name="DAF",chr="CHR",pos
|
|
|
697
836
|
#vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
|
|
698
837
|
vcf_reader = VariantFile(ref_infer)
|
|
699
838
|
def afapply(x,vcf,alt_freq,chr_dict):
|
|
700
|
-
return check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict)
|
|
839
|
+
return check_daf(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,chr_dict)
|
|
701
840
|
map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
|
|
702
841
|
status_inferred = sumstats.apply(map_func,axis=1)
|
|
703
|
-
sumstats
|
|
704
|
-
sumstats
|
|
842
|
+
sumstats[column_name] = status_inferred.values
|
|
843
|
+
sumstats[column_name]=sumstats[column_name].astype("float")
|
|
705
844
|
return sumstats
|
|
706
845
|
|
|
707
846
|
def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
|
|
@@ -716,25 +855,35 @@ def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
|
|
|
716
855
|
################################################################################################################
|
|
717
856
|
|
|
718
857
|
def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
858
|
+
##start function with col checking##########################################################
|
|
859
|
+
_start_line = "infer EAF using reference VCF ALT frequency"
|
|
860
|
+
_end_line = "inferring EAF using reference VCF ALT frequency"
|
|
861
|
+
_start_cols = [chr,pos,ref,alt,eaf,status]
|
|
862
|
+
_start_function = ".infer_af()"
|
|
863
|
+
_must_args ={"ref_alt_freq":ref_alt_freq}
|
|
864
|
+
|
|
865
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
866
|
+
log=log,
|
|
867
|
+
verbose=verbose,
|
|
868
|
+
start_line=_start_line,
|
|
869
|
+
end_line=_end_line,
|
|
870
|
+
start_cols=_start_cols,
|
|
871
|
+
start_function=_start_function,
|
|
872
|
+
n_cores=n_cores,
|
|
873
|
+
ref_vcf=ref_infer,
|
|
874
|
+
**_must_args)
|
|
875
|
+
if is_enough_info == False: return sumstats
|
|
876
|
+
############################################################################################
|
|
725
877
|
chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
|
|
726
|
-
|
|
727
|
-
# check if the columns are complete
|
|
728
|
-
if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
|
|
729
|
-
raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
|
|
730
878
|
|
|
731
879
|
if eaf not in sumstats.columns:
|
|
732
880
|
sumstats[eaf]=np.nan
|
|
733
881
|
|
|
734
882
|
prenumber = sum(sumstats[eaf].isna())
|
|
883
|
+
|
|
735
884
|
# ref_alt_freq INFO in vcf was provided
|
|
736
885
|
if ref_alt_freq is not None:
|
|
737
|
-
|
|
886
|
+
log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
|
|
738
887
|
if not force:
|
|
739
888
|
good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
|
|
740
889
|
if verbose: log.write(" -Checking variants:", sum(good_chrpos))
|
|
@@ -742,7 +891,8 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
|
|
|
742
891
|
########################
|
|
743
892
|
if sum(sumstats[eaf].isna())<10000:
|
|
744
893
|
n_cores=1
|
|
745
|
-
df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
|
|
894
|
+
#df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
|
|
895
|
+
df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
|
|
746
896
|
pool = Pool(n_cores)
|
|
747
897
|
map_func = partial(inferaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
|
|
748
898
|
sumstats.loc[good_chrpos,[eaf]] = pd.concat(pool.map(map_func,df_split))
|
|
@@ -753,18 +903,19 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
|
|
|
753
903
|
afternumber = sum(sumstats[eaf].isna())
|
|
754
904
|
if verbose: log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber))
|
|
755
905
|
if verbose: log.write(" -EAF is still missing for {} variants.".format(afternumber))
|
|
756
|
-
|
|
906
|
+
|
|
907
|
+
finished(log,verbose,_end_line)
|
|
757
908
|
return sumstats
|
|
758
909
|
|
|
759
910
|
def inferaf(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
|
|
760
911
|
#vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
|
|
761
912
|
vcf_reader = VariantFile(ref_infer)
|
|
762
913
|
def afapply(x,vcf,alt_freq,chr_dict):
|
|
763
|
-
return infer_af(x[0],x[1]-1,x[1],x[2],x[3],vcf_reader,ref_alt_freq,chr_dict)
|
|
914
|
+
return infer_af(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],vcf_reader,ref_alt_freq,chr_dict)
|
|
764
915
|
map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
|
|
765
916
|
status_inferred = sumstats.apply(map_func,axis=1)
|
|
766
|
-
sumstats
|
|
767
|
-
sumstats
|
|
917
|
+
sumstats[eaf] = status_inferred.values
|
|
918
|
+
sumstats[eaf]=sumstats[eaf].astype("float")
|
|
768
919
|
return sumstats
|
|
769
920
|
|
|
770
921
|
def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
|
|
@@ -802,4 +953,6 @@ def check_vcf_chr_prefix(vcf_bcf_path):
|
|
|
802
953
|
if m is not None:
|
|
803
954
|
return m.group(1)
|
|
804
955
|
else:
|
|
805
|
-
return None
|
|
956
|
+
return None
|
|
957
|
+
|
|
958
|
+
|