gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +22 -5
- gwaslab/g_Sumstats.py +110 -163
- gwaslab/g_SumstatsPair.py +76 -25
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +36 -17
- gwaslab/hm_harmonize_sumstats.py +354 -221
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +49 -43
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/io_to_formats.py +428 -295
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +891 -778
- gwaslab/util_ex_calculate_ldmatrix.py +31 -13
- gwaslab/util_ex_gwascatalog.py +25 -25
- gwaslab/util_ex_ldproxyfinder.py +10 -10
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +30 -30
- gwaslab/util_in_filter_value.py +201 -74
- gwaslab/util_in_get_density.py +10 -10
- gwaslab/util_in_get_sig.py +445 -71
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +42 -37
- gwaslab/viz_aux_reposition_text.py +10 -7
- gwaslab/viz_aux_save_figure.py +18 -8
- gwaslab/viz_plot_compare_af.py +32 -33
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +34 -26
- gwaslab/viz_plot_mqqplot.py +126 -75
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +36 -33
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +65 -61
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.37.dist-info/RECORD +0 -72
- /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/hm_harmonize_sumstats.py
CHANGED
|
@@ -12,12 +12,18 @@ from gwaslab.g_Log import Log
|
|
|
12
12
|
from gwaslab.qc_fix_sumstats import fixchr
|
|
13
13
|
from gwaslab.qc_fix_sumstats import fixpos
|
|
14
14
|
from gwaslab.qc_fix_sumstats import sortcolumn
|
|
15
|
+
from gwaslab.qc_fix_sumstats import _df_split
|
|
16
|
+
from gwaslab.qc_fix_sumstats import check_col
|
|
17
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
18
|
+
from gwaslab.qc_fix_sumstats import finished
|
|
19
|
+
from gwaslab.qc_fix_sumstats import skipped
|
|
15
20
|
from gwaslab.qc_check_datatype import check_dataframe_shape
|
|
16
21
|
from gwaslab.bd_common_data import get_number_to_chr
|
|
17
22
|
from gwaslab.bd_common_data import get_chr_list
|
|
18
23
|
from gwaslab.bd_common_data import get_chr_to_number
|
|
19
24
|
from gwaslab.g_vchange_status import vchange_status
|
|
20
25
|
from gwaslab.g_version import _get_version
|
|
26
|
+
|
|
21
27
|
#rsidtochrpos
|
|
22
28
|
#checkref
|
|
23
29
|
#parallelizeassignrsid
|
|
@@ -35,20 +41,35 @@ def rsidtochrpos(sumstats,
|
|
|
35
41
|
'''
|
|
36
42
|
assign chr:pos based on rsID
|
|
37
43
|
'''
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
44
|
+
##start function with col checking##########################################################
|
|
45
|
+
_start_line = "assign CHR and POS using rsIDs"
|
|
46
|
+
_end_line = "assigning CHR and POS using rsIDs"
|
|
47
|
+
_start_cols = [rsid,chrom,pos]
|
|
48
|
+
_start_function = ".rsid_to_chrpos()"
|
|
49
|
+
_must_args ={}
|
|
50
|
+
|
|
51
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
52
|
+
log=log,
|
|
53
|
+
verbose=verbose,
|
|
54
|
+
start_line=_start_line,
|
|
55
|
+
end_line=_end_line,
|
|
56
|
+
start_cols=_start_cols,
|
|
57
|
+
start_function=_start_function,
|
|
58
|
+
**_must_args)
|
|
59
|
+
if is_enough_info == False: return sumstats
|
|
60
|
+
############################################################################################
|
|
61
|
+
|
|
62
|
+
log.write(" -rsID dictionary file: "+ path,verbose=verbose)
|
|
42
63
|
|
|
43
64
|
if ref_rsid_to_chrpos_tsv is not None:
|
|
44
65
|
path = ref_rsid_to_chrpos_tsv
|
|
45
66
|
|
|
46
67
|
if snpid in sumstats.columns and sum(sumstats[rsid].isna())>0:
|
|
47
|
-
|
|
68
|
+
log.write(" -Filling na in rsID columns with SNPID...",verbose=verbose)
|
|
48
69
|
sumstats.loc[sumstats[rsid].isna(),rsid] = sumstats.loc[sumstats[rsid].isna(),snpid]
|
|
49
70
|
|
|
50
71
|
if sum(sumstats[rsid].isna())>0:
|
|
51
|
-
|
|
72
|
+
log.write(" -Filling na in rsID columns with NA_xxx for {} variants...".format(sum(sumstats[rsid].isna())),verbose=verbose)
|
|
52
73
|
sumstats.loc[sumstats[rsid].isna(),rsid] = ["NA_" + str(x+1) for x in range(len(sumstats.loc[sumstats[rsid].isna(),rsid]))]
|
|
53
74
|
|
|
54
75
|
dic_chuncks = pd.read_csv(path,sep="\t",usecols=[ref_rsid,ref_chr,ref_pos],
|
|
@@ -63,8 +84,8 @@ def rsidtochrpos(sumstats,
|
|
|
63
84
|
if pos not in sumstats.columns:
|
|
64
85
|
sumstats[pos] =pd.Series(dtype="Int64")
|
|
65
86
|
|
|
66
|
-
|
|
67
|
-
|
|
87
|
+
log.write(" -Setting block size: ",chunksize,verbose=verbose)
|
|
88
|
+
log.write(" -Loading block: ",end="",verbose=verbose)
|
|
68
89
|
for i,dic in enumerate(dic_chuncks):
|
|
69
90
|
dic_to_update = dic[dic.index.notnull()]
|
|
70
91
|
log.write(i," ",end=" ",show_time=False)
|
|
@@ -74,13 +95,15 @@ def rsidtochrpos(sumstats,
|
|
|
74
95
|
sumstats.update(dic_to_update,overwrite="True")
|
|
75
96
|
gc.collect()
|
|
76
97
|
|
|
77
|
-
|
|
98
|
+
log.write("\n",end="",show_time=False,verbose=verbose)
|
|
78
99
|
sumstats = sumstats.reset_index()
|
|
79
100
|
sumstats = sumstats.rename(columns = {'index':rsid})
|
|
80
|
-
|
|
101
|
+
log.write(" -Updating CHR and POS finished.Start to re-fixing CHR and POS... ",verbose=verbose)
|
|
81
102
|
sumstats = fixchr(sumstats,verbose=verbose)
|
|
82
103
|
sumstats = fixpos(sumstats,verbose=verbose)
|
|
83
104
|
sumstats = sortcolumn(sumstats,verbose=verbose)
|
|
105
|
+
|
|
106
|
+
finished(log,verbose,_end_line)
|
|
84
107
|
return sumstats
|
|
85
108
|
####################################################################################################
|
|
86
109
|
|
|
@@ -104,33 +127,48 @@ def merge_chrpos(sumstats_part,all_groups_max,path,build,status):
|
|
|
104
127
|
|
|
105
128
|
def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None, ref_rsid_to_chrpos_vcf = None, ref_rsid_to_chrpos_hdf5 = None, build="99",status="STATUS",
|
|
106
129
|
n_cores=4,block_size=20000000,verbose=True,log=Log()):
|
|
107
|
-
|
|
130
|
+
|
|
131
|
+
##start function with col checking##########################################################
|
|
132
|
+
_start_line = "assign CHR and POS using rsIDs"
|
|
133
|
+
_end_line = "assigning CHR and POS using rsIDs"
|
|
134
|
+
_start_cols = [rsid,chrom,pos]
|
|
135
|
+
_start_function = ".rsid_to_chrpos2()"
|
|
136
|
+
_must_args ={}
|
|
137
|
+
|
|
138
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
139
|
+
log=log,
|
|
140
|
+
verbose=verbose,
|
|
141
|
+
start_line=_start_line,
|
|
142
|
+
end_line=_end_line,
|
|
143
|
+
start_cols=_start_cols,
|
|
144
|
+
start_function=_start_function,
|
|
145
|
+
**_must_args)
|
|
146
|
+
if is_enough_info == False: return sumstats
|
|
147
|
+
############################################################################################
|
|
148
|
+
|
|
108
149
|
if ref_rsid_to_chrpos_hdf5 is not None:
|
|
109
150
|
path = ref_rsid_to_chrpos_hdf5
|
|
110
151
|
elif ref_rsid_to_chrpos_vcf is not None:
|
|
111
152
|
vcf_file_name = os.path.basename(ref_rsid_to_chrpos_vcf)
|
|
112
153
|
vcf_dir_path = os.path.dirname(ref_rsid_to_chrpos_vcf)
|
|
113
154
|
path = "{}/{}.rsID_CHR_POS_groups_{}.h5".format(vcf_dir_path,vcf_file_name,int(block_size))
|
|
114
|
-
|
|
115
|
-
if verbose: log.write("Start to assign CHR and POS using rsIDs...{}".format(_get_version()))
|
|
116
|
-
check_dataframe_shape(sumstats, log, verbose)
|
|
117
|
-
|
|
155
|
+
|
|
118
156
|
if path is None:
|
|
119
157
|
raise ValueError("Please provide path to hdf5 file.")
|
|
120
158
|
|
|
121
159
|
sumstats["rsn"] = pd.to_numeric(sumstats[rsid].str.strip("rs"),errors="coerce").astype("Int64")
|
|
122
160
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
161
|
+
log.write(" -Source hdf5 file: ",path,verbose=verbose)
|
|
162
|
+
log.write(" -Cores to use : ",n_cores,verbose=verbose)
|
|
163
|
+
log.write(" -Blocksize (make sure it is the same as hdf5 file ): ",block_size,verbose=verbose)
|
|
126
164
|
|
|
127
165
|
input_columns= sumstats.columns
|
|
128
166
|
sumstats_nonrs = sumstats.loc[sumstats["rsn"].isna()|sumstats["rsn"].duplicated(keep='first') ,:].copy()
|
|
129
167
|
sumstats_rs = sumstats.loc[sumstats["rsn"].notnull(),:].copy()
|
|
130
168
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
169
|
+
log.write(" -Non-Valid rsIDs: ",sum(sumstats["rsn"].isna()),verbose=verbose)
|
|
170
|
+
log.write(" -Duplicated rsIDs except for the first occurrence: ",sum(sumstats.loc[~sumstats["rsn"].isna(), "rsn"].duplicated(keep='first')),verbose=verbose)
|
|
171
|
+
log.write(" -Valid rsIDs: ", len(sumstats_rs),verbose=verbose)
|
|
134
172
|
|
|
135
173
|
del sumstats
|
|
136
174
|
gc.collect()
|
|
@@ -147,16 +185,16 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
|
|
|
147
185
|
#
|
|
148
186
|
pool = Pool(n_cores)
|
|
149
187
|
if chrom not in input_columns:
|
|
150
|
-
|
|
188
|
+
log.write(" -Initiating CHR ... ",verbose=verbose)
|
|
151
189
|
sumstats_rs[chrom]=pd.Series(dtype="Int32")
|
|
152
190
|
|
|
153
191
|
if pos not in input_columns:
|
|
154
|
-
|
|
192
|
+
log.write(" -Initiating POS ... ",verbose=verbose)
|
|
155
193
|
sumstats_rs[pos]=pd.Series(dtype="Int64")
|
|
156
194
|
|
|
157
195
|
df_split=[y for x, y in sumstats_rs.groupby('group', as_index=False)]
|
|
158
|
-
|
|
159
|
-
|
|
196
|
+
log.write(" -Divided into groups: ",len(df_split),verbose=verbose)
|
|
197
|
+
log.write(" -",set(sumstats_rs.loc[:,"group"].unique()),verbose=verbose)
|
|
160
198
|
|
|
161
199
|
# check keys
|
|
162
200
|
store = pd.HDFStore(path, 'r')
|
|
@@ -164,21 +202,21 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
|
|
|
164
202
|
all_groups_len = len(all_groups)
|
|
165
203
|
store.close()
|
|
166
204
|
all_groups_max = max(map(lambda x: int(x.split("_")[1]), all_groups))
|
|
167
|
-
|
|
168
|
-
|
|
205
|
+
log.write(" -Number of groups in HDF5: ",all_groups_len,verbose=verbose)
|
|
206
|
+
log.write(" -Max index of groups in HDF5: ",all_groups_max,verbose=verbose)
|
|
169
207
|
|
|
170
208
|
# update CHR and POS using rsID with multiple threads
|
|
171
209
|
sumstats_rs = pd.concat(pool.map(partial(merge_chrpos,all_groups_max=all_groups_max,path=path,build=build,status=status),df_split),ignore_index=True)
|
|
172
210
|
sumstats_rs.loc[:,["CHR","POS"]] = sumstats_rs.loc[:,["CHR","POS"]].astype("Int64")
|
|
173
211
|
del df_split
|
|
174
212
|
gc.collect()
|
|
175
|
-
|
|
213
|
+
log.write(" -Merging group data... ",verbose=verbose)
|
|
176
214
|
# drop group and rsn
|
|
177
215
|
sumstats_rs = sumstats_rs.drop(columns=["group"])
|
|
178
216
|
sumstats_nonrs = sumstats_nonrs.drop(columns=["rsn"])
|
|
179
217
|
|
|
180
218
|
# merge back
|
|
181
|
-
|
|
219
|
+
log.write(" -Append data... ",verbose=verbose)
|
|
182
220
|
sumstats = pd.concat([sumstats_rs,sumstats_nonrs],ignore_index=True)
|
|
183
221
|
|
|
184
222
|
del sumstats_rs
|
|
@@ -192,8 +230,8 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
|
|
|
192
230
|
|
|
193
231
|
pool.close()
|
|
194
232
|
pool.join()
|
|
195
|
-
|
|
196
|
-
|
|
233
|
+
|
|
234
|
+
finished(log, verbose, _end_line)
|
|
197
235
|
return sumstats
|
|
198
236
|
####################################################################################################################
|
|
199
237
|
#20220426 check if non-effect allele is aligned with reference genome
|
|
@@ -211,15 +249,15 @@ def check_status(row,record):
|
|
|
211
249
|
#8 / -----> not on ref genome
|
|
212
250
|
#9 / ------> unchecked
|
|
213
251
|
|
|
214
|
-
status_pre=row[3][:5]
|
|
215
|
-
status_end=row[3][6:]
|
|
252
|
+
status_pre=row.iloc[3][:5]
|
|
253
|
+
status_end=row.iloc[3][6:]
|
|
216
254
|
|
|
217
255
|
## nea == ref
|
|
218
|
-
if row[2] == record[row[0]-1: row[0]+len(row[2])-1].seq.upper():
|
|
256
|
+
if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
|
|
219
257
|
## ea == ref
|
|
220
|
-
if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
|
|
258
|
+
if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
|
|
221
259
|
## len(nea) >len(ea):
|
|
222
|
-
if len(row[2])!=len(row[1]):
|
|
260
|
+
if len(row.iloc[2])!=len(row.iloc[1]):
|
|
223
261
|
# indels both on ref, unable to identify
|
|
224
262
|
return status_pre+"6"+status_end
|
|
225
263
|
else:
|
|
@@ -228,35 +266,50 @@ def check_status(row,record):
|
|
|
228
266
|
## nea!=ref
|
|
229
267
|
else:
|
|
230
268
|
# ea == ref_seq -> need to flip
|
|
231
|
-
if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
|
|
269
|
+
if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
|
|
232
270
|
return status_pre+"3"+status_end
|
|
233
271
|
# ea !=ref
|
|
234
272
|
else:
|
|
235
273
|
#_reverse_complementary
|
|
236
|
-
row[1] = get_reverse_complementary_allele(row[1])
|
|
237
|
-
row[2] = get_reverse_complementary_allele(row[2])
|
|
274
|
+
row.iloc[1] = get_reverse_complementary_allele(row.iloc[1])
|
|
275
|
+
row.iloc[2] = get_reverse_complementary_allele(row.iloc[2])
|
|
238
276
|
## nea == ref
|
|
239
|
-
if row[2] == record[row[0]-1: row[0]+len(row[2])-1].seq.upper():
|
|
277
|
+
if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
|
|
240
278
|
## ea == ref
|
|
241
|
-
if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
|
|
279
|
+
if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
|
|
242
280
|
## len(nea) >len(ea):
|
|
243
|
-
if len(row[2])!=len(row[1]):
|
|
281
|
+
if len(row.iloc[2])!=len(row.iloc[1]):
|
|
244
282
|
return status_pre+"8"+status_end # indel reverse complementary
|
|
245
283
|
else:
|
|
246
284
|
return status_pre+"4"+status_end
|
|
247
285
|
else:
|
|
248
286
|
# ea == ref_seq -> need to flip
|
|
249
|
-
if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
|
|
287
|
+
if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
|
|
250
288
|
return status_pre+"5"+status_end
|
|
251
289
|
# ea !=ref
|
|
252
290
|
return status_pre+"8"+status_end
|
|
253
291
|
|
|
254
292
|
|
|
255
293
|
def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="STATUS",chr_dict=get_chr_to_number(),remove=False,verbose=True,log=Log()):
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
294
|
+
##start function with col checking##########################################################
|
|
295
|
+
_start_line = "check if NEA is aligned with reference sequence"
|
|
296
|
+
_end_line = "checking if NEA is aligned with reference sequence"
|
|
297
|
+
_start_cols = [chrom,pos,ea,nea,status]
|
|
298
|
+
_start_function = ".check_ref()"
|
|
299
|
+
_must_args ={}
|
|
300
|
+
|
|
301
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
302
|
+
log=log,
|
|
303
|
+
verbose=verbose,
|
|
304
|
+
start_line=_start_line,
|
|
305
|
+
end_line=_end_line,
|
|
306
|
+
start_cols=_start_cols,
|
|
307
|
+
start_function=_start_function,
|
|
308
|
+
**_must_args)
|
|
309
|
+
if is_enough_info == False: return sumstats
|
|
310
|
+
############################################################################################
|
|
311
|
+
log.write(" -Reference genome FASTA file: "+ ref_path,verbose=verbose)
|
|
312
|
+
log.write(" -Checking records: ", end="",verbose=verbose)
|
|
260
313
|
chromlist = get_chr_list(add_number=True)
|
|
261
314
|
records = SeqIO.parse(ref_path, "fasta")
|
|
262
315
|
for record in records:
|
|
@@ -268,13 +321,13 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
|
|
|
268
321
|
else:
|
|
269
322
|
i = record_chr
|
|
270
323
|
if i in chromlist:
|
|
271
|
-
|
|
324
|
+
log.write(record_chr," ", end="",show_time=False,verbose=verbose)
|
|
272
325
|
to_check_ref = (sumstats[chrom]==i) & (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna())
|
|
273
326
|
sumstats.loc[to_check_ref,status] = sumstats.loc[to_check_ref,[pos,ea,nea,status]].apply(lambda x:check_status(x,record),axis=1)
|
|
274
327
|
|
|
275
|
-
|
|
328
|
+
log.write("\n",end="",show_time=False,verbose=verbose)
|
|
276
329
|
|
|
277
|
-
sumstats
|
|
330
|
+
sumstats[status] = sumstats[status].astype("string")
|
|
278
331
|
available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
|
|
279
332
|
status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
|
|
280
333
|
status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
|
|
@@ -284,26 +337,27 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
|
|
|
284
337
|
#status_7=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[7]\w", case=False, flags=0, na=False))
|
|
285
338
|
status_8=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w", case=False, flags=0, na=False))
|
|
286
339
|
|
|
287
|
-
|
|
288
|
-
|
|
340
|
+
log.write(" -Variants allele on given reference sequence : ",status_0,verbose=verbose)
|
|
341
|
+
log.write(" -Variants flipped : ",status_3,verbose=verbose)
|
|
289
342
|
raw_matching_rate = (status_3+status_0)/available_to_check
|
|
290
343
|
flip_rate = status_3/available_to_check
|
|
291
|
-
|
|
344
|
+
log.write(" -Raw Matching rate : ","{:.2f}%".format(raw_matching_rate*100),verbose=verbose)
|
|
292
345
|
if raw_matching_rate <0.8:
|
|
293
|
-
|
|
346
|
+
log.warning("Matching rate is low, please check if the right reference genome is used.")
|
|
294
347
|
if flip_rate > 0.85 :
|
|
295
|
-
|
|
348
|
+
log.write(" -Flipping variants rate > 0.85, it is likely that the EA is aligned with REF in the original dataset.",verbose=verbose)
|
|
296
349
|
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
#
|
|
301
|
-
|
|
350
|
+
log.write(" -Variants inferred reverse_complement : ",status_4,verbose=verbose)
|
|
351
|
+
log.write(" -Variants inferred reverse_complement_flipped : ",status_5,verbose=verbose)
|
|
352
|
+
log.write(" -Both allele on genome + unable to distinguish : ",status_6,verbose=verbose)
|
|
353
|
+
#log.write(" -Reverse_complementary + both allele on genome + unable to distinguish: ",status_7)
|
|
354
|
+
log.write(" -Variants not on given reference sequence : ",status_8,verbose=verbose)
|
|
302
355
|
|
|
303
356
|
if remove is True:
|
|
304
357
|
sumstats = sumstats.loc[~sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w"),:]
|
|
305
|
-
|
|
306
|
-
|
|
358
|
+
log.write(" -Variants not on given reference sequence were removed.",verbose=verbose)
|
|
359
|
+
|
|
360
|
+
finished(log, verbose, _end_line)
|
|
307
361
|
return sumstats
|
|
308
362
|
|
|
309
363
|
#######################################################################################################################################
|
|
@@ -333,7 +387,7 @@ def assign_rsid_single(sumstats,path,rsid="rsID",chr="CHR",pos="POS",ref="NEA",a
|
|
|
333
387
|
## single df assignment
|
|
334
388
|
vcf_reader = VariantFile(path)
|
|
335
389
|
def rsid_helper(x,vcf_reader,chr_dict):
|
|
336
|
-
return chrposref_rsid(x[0],x[1],x[2],x[3],vcf_reader,chr_dict)
|
|
390
|
+
return chrposref_rsid(x.iloc[0],x.iloc[1],x.iloc[2],x.iloc[3],vcf_reader,chr_dict)
|
|
337
391
|
map_func=partial(rsid_helper,vcf_reader=vcf_reader,chr_dict=chr_dict)
|
|
338
392
|
rsID = sumstats.apply(map_func,axis=1)
|
|
339
393
|
return rsID
|
|
@@ -346,19 +400,31 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
346
400
|
all , overwrite rsid for all availalbe rsid
|
|
347
401
|
invalid, only assign rsid for variants with invalid rsid
|
|
348
402
|
empty only assign rsid for variants with na rsid
|
|
349
|
-
'''
|
|
403
|
+
'''
|
|
404
|
+
|
|
350
405
|
if ref_mode=="vcf":
|
|
351
406
|
###################################################################################################################
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
407
|
+
##start function with col checking##########################################################
|
|
408
|
+
_start_line = "assign rsID using reference VCF"
|
|
409
|
+
_end_line = "assign rsID using reference file"
|
|
410
|
+
_start_cols = [chr,pos,ref,alt,status]
|
|
411
|
+
_start_function = ".assign_rsid()"
|
|
412
|
+
_must_args ={}
|
|
413
|
+
|
|
414
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
415
|
+
log=log,
|
|
416
|
+
verbose=verbose,
|
|
417
|
+
start_line=_start_line,
|
|
418
|
+
end_line=_end_line,
|
|
419
|
+
start_cols=_start_cols,
|
|
420
|
+
start_function=_start_function,
|
|
421
|
+
n_cores=n_cores,
|
|
422
|
+
ref_vcf=path,
|
|
423
|
+
**_must_args)
|
|
424
|
+
if is_enough_info == False: return sumstats
|
|
425
|
+
############################################################################################
|
|
357
426
|
chr_dict = auto_check_vcf_chr_dict(path, chr_dict, verbose, log)
|
|
358
|
-
|
|
359
|
-
if verbose: log.write(" -Assigning rsID based on chr:pos and ref:alt/alt:ref...")
|
|
360
|
-
|
|
361
|
-
|
|
427
|
+
log.write(" -Assigning rsID based on CHR:POS and REF:ALT/ALT:REF...",verbose=verbose)
|
|
362
428
|
##############################################
|
|
363
429
|
if rsid not in sumstats.columns:
|
|
364
430
|
sumstats[rsid]=pd.Series(dtype="string")
|
|
@@ -380,7 +446,8 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
380
446
|
|
|
381
447
|
if sum(to_assign)>0:
|
|
382
448
|
if sum(to_assign)<10000: n_cores=1
|
|
383
|
-
df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
|
|
449
|
+
#df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
|
|
450
|
+
df_split = _df_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
|
|
384
451
|
pool = Pool(n_cores)
|
|
385
452
|
map_func = partial(assign_rsid_single,path=path,chr=chr,pos=pos,ref=ref,alt=alt,chr_dict=chr_dict)
|
|
386
453
|
assigned_rsid = pd.concat(pool.map(map_func,df_split))
|
|
@@ -391,40 +458,57 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
391
458
|
##################################################################################################################
|
|
392
459
|
|
|
393
460
|
after_number = sum(~sumstats[rsid].isna())
|
|
394
|
-
|
|
395
|
-
|
|
461
|
+
log.write(" -rsID Annotation for "+str(total_number - after_number) +" need to be fixed!",verbose=verbose)
|
|
462
|
+
log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!",verbose=verbose)
|
|
396
463
|
|
|
397
464
|
##################################################################################################################
|
|
398
465
|
elif ref_mode=="tsv":
|
|
399
466
|
'''
|
|
400
467
|
assign rsID based on chr:pos
|
|
401
468
|
'''
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
469
|
+
##start function with col checking##########################################################
|
|
470
|
+
_start_line = "assign rsID by matching SNPID with CHR:POS:REF:ALT in the reference TSV"
|
|
471
|
+
_end_line = "assign rsID using reference file"
|
|
472
|
+
_start_cols = [snpid,status]
|
|
473
|
+
_start_function = ".assign_rsid()"
|
|
474
|
+
_must_args ={}
|
|
475
|
+
|
|
476
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
477
|
+
log=log,
|
|
478
|
+
verbose=verbose,
|
|
479
|
+
start_line=_start_line,
|
|
480
|
+
end_line=_end_line,
|
|
481
|
+
start_cols=_start_cols,
|
|
482
|
+
start_function=_start_function,
|
|
483
|
+
n_cores=n_cores,
|
|
484
|
+
ref_tsv=path,
|
|
485
|
+
**_must_args)
|
|
486
|
+
if is_enough_info == False: return sumstats
|
|
487
|
+
############################################################################################
|
|
405
488
|
|
|
406
|
-
standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]
|
|
489
|
+
standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)
|
|
407
490
|
|
|
408
491
|
if rsid not in sumstats.columns:
|
|
409
492
|
sumstats[rsid]=pd.Series(dtype="string")
|
|
410
493
|
|
|
411
494
|
if overwrite == "empty":
|
|
412
|
-
to_assign = sumstats[rsid].isna()
|
|
495
|
+
to_assign = sumstats[rsid].isna() & standardized_normalized
|
|
413
496
|
if overwrite=="all":
|
|
414
497
|
to_assign = standardized_normalized
|
|
415
498
|
if overwrite=="invalid":
|
|
416
499
|
to_assign = (~sumstats[rsid].str.match(r'rs([0-9]+)', case=False, flags=0, na=False)) & standardized_normalized
|
|
500
|
+
|
|
417
501
|
total_number= len(sumstats)
|
|
418
502
|
pre_number = sum(~sumstats[rsid].isna())
|
|
419
|
-
|
|
503
|
+
log.write(" -"+str(sum(to_assign)) +" rsID could be possibly fixed...",verbose=verbose)
|
|
420
504
|
if sum(to_assign)>0:
|
|
421
505
|
sumstats = sumstats.set_index(snpid)
|
|
422
506
|
dic_chuncks = pd.read_csv(path,sep="\t",usecols=[ref_snpid,ref_rsid],
|
|
423
507
|
chunksize=chunksize,index_col=ref_snpid,
|
|
424
508
|
dtype={ref_snpid:"string",ref_rsid:"string"})
|
|
425
509
|
|
|
426
|
-
|
|
427
|
-
|
|
510
|
+
log.write(" -Setting block size: ",chunksize,verbose=verbose)
|
|
511
|
+
log.write(" -Loading block: ",end="",verbose=verbose)
|
|
428
512
|
for i,dic in enumerate(dic_chuncks):
|
|
429
513
|
gc.collect()
|
|
430
514
|
log.write(i," ",end=" ",show_time=False)
|
|
@@ -433,17 +517,18 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
433
517
|
dic = dic.loc[~dic.index.duplicated(keep=False),:]
|
|
434
518
|
sumstats.update(dic,overwrite=True)
|
|
435
519
|
|
|
436
|
-
|
|
520
|
+
log.write("\n",end="",show_time=False,verbose=verbose)
|
|
437
521
|
sumstats = sumstats.reset_index()
|
|
438
522
|
sumstats = sumstats.rename(columns = {'index':snpid})
|
|
439
523
|
|
|
440
524
|
after_number = sum(~sumstats[rsid].isna())
|
|
441
|
-
|
|
442
|
-
|
|
525
|
+
log.write(" -rsID annotation for "+str(total_number - after_number) +" needed to be fixed!",verbose=verbose)
|
|
526
|
+
log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!",verbose=verbose)
|
|
443
527
|
else:
|
|
444
|
-
|
|
528
|
+
log.write(" -No rsID can be fixed...skipping...",verbose=verbose)
|
|
445
529
|
################################################################################################################
|
|
446
|
-
|
|
530
|
+
|
|
531
|
+
finished(log,verbose,_end_line)
|
|
447
532
|
return sumstats
|
|
448
533
|
#################################################################################################################################################
|
|
449
534
|
#single record assignment
|
|
@@ -522,12 +607,12 @@ def is_palindromic(sumstats,a1="EA",a2="NEA"):
|
|
|
522
607
|
|
|
523
608
|
def check_strand(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS"):
|
|
524
609
|
vcf_reader = VariantFile(ref_infer)
|
|
525
|
-
status_part = sumstats.apply(lambda x:check_strand_status(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,x[5],chr_dict),axis=1)
|
|
610
|
+
status_part = sumstats.apply(lambda x:check_strand_status(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict),axis=1)
|
|
526
611
|
return status_part
|
|
527
612
|
|
|
528
613
|
def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS",daf_tolerance=0.2):
|
|
529
614
|
vcf_reader = VariantFile(ref_infer)
|
|
530
|
-
status_part = sumstats.apply(lambda x:check_unkonwn_indel(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,x[5],chr_dict,daf_tolerance),axis=1)
|
|
615
|
+
status_part = sumstats.apply(lambda x:check_unkonwn_indel(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict,daf_tolerance),axis=1)
|
|
531
616
|
return status_part
|
|
532
617
|
|
|
533
618
|
##################################################################################################################################################
|
|
@@ -535,121 +620,141 @@ def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NE
|
|
|
535
620
|
def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,daf_tolerance=0.20,remove_snp="",mode="pi",n_cores=1,remove_indel="",
|
|
536
621
|
chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
|
|
537
622
|
chr_dict=None,verbose=True,log=Log()):
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
623
|
+
##start function with col checking##########################################################
|
|
624
|
+
_start_line = "infer strand for palindromic SNPs/align indistinguishable indels"
|
|
625
|
+
_end_line = "inferring strand for palindromic SNPs/align indistinguishable indels"
|
|
626
|
+
_start_cols = [chr,pos,ref,alt,eaf,status]
|
|
627
|
+
_start_function = ".infer_strand()"
|
|
628
|
+
_must_args ={"ref_alt_freq":ref_alt_freq}
|
|
629
|
+
|
|
630
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
631
|
+
log=log,
|
|
632
|
+
verbose=verbose,
|
|
633
|
+
start_line=_start_line,
|
|
634
|
+
end_line=_end_line,
|
|
635
|
+
start_cols=_start_cols,
|
|
636
|
+
start_function=_start_function,
|
|
637
|
+
n_cores=n_cores,
|
|
638
|
+
ref_vcf=ref_infer,
|
|
639
|
+
**_must_args)
|
|
640
|
+
if is_enough_info == False: return sumstats
|
|
641
|
+
############################################################################################
|
|
541
642
|
|
|
542
643
|
chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
|
|
644
|
+
|
|
645
|
+
log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
|
|
543
646
|
|
|
544
|
-
# check if the columns are complete
|
|
545
|
-
if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
|
|
546
|
-
raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
|
|
547
647
|
if "p" in mode:
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
|
|
565
|
-
|
|
566
|
-
#palindromic WITH UNKNWON OR UNCHECKED STATUS
|
|
567
|
-
unknow_palindromic = sumstats[status].str.match(r'\w\w\w\w\w[012][89]', case=False, flags=0, na=False)
|
|
648
|
+
## checking \w\w\w\w[0]\w\w -> standardized and normalized snp
|
|
649
|
+
good_chrpos = sumstats[status].str.match(r'\w\w\w[0][0]\w\w', case=False, flags=0, na=False)
|
|
650
|
+
palindromic = good_chrpos & is_palindromic(sumstats[[ref,alt]],a1=ref,a2=alt)
|
|
651
|
+
not_palindromic_snp = good_chrpos & (~palindromic)
|
|
652
|
+
|
|
653
|
+
##not palindromic : change status
|
|
654
|
+
sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
|
|
655
|
+
log.write(" -Identified ", sum(palindromic)," palindromic SNPs...",verbose=verbose)
|
|
656
|
+
|
|
657
|
+
#palindromic but can not infer
|
|
658
|
+
maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
|
|
659
|
+
|
|
660
|
+
sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
|
|
661
|
+
|
|
662
|
+
#palindromic WITH UNKNWON OR UNCHECKED STATUS
|
|
663
|
+
unknow_palindromic = sumstats[status].str.match(r'\w\w\w\w\w[012][89]', case=False, flags=0, na=False)
|
|
568
664
|
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
665
|
+
unknow_palindromic_to_check = palindromic & maf_can_infer & unknow_palindromic
|
|
666
|
+
|
|
667
|
+
log.write(" -After filtering by MAF< {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknow_palindromic_to_check)),verbose=verbose)
|
|
572
668
|
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
669
|
+
#########################################################################################
|
|
670
|
+
if sum(unknow_palindromic_to_check)>0:
|
|
671
|
+
if sum(unknow_palindromic_to_check)<10000:
|
|
672
|
+
n_cores=1
|
|
673
|
+
|
|
674
|
+
#df_split = np.array_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
|
|
675
|
+
df_split = _df_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
|
|
676
|
+
pool = Pool(n_cores)
|
|
677
|
+
map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
|
|
678
|
+
status_inferred = pd.concat(pool.map(map_func,df_split))
|
|
679
|
+
sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
|
|
582
680
|
pool.close()
|
|
583
681
|
pool.join()
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
682
|
+
else:
|
|
683
|
+
log.warning("No palindromic variants available for checking.")
|
|
684
|
+
#########################################################################################
|
|
685
|
+
#0 Not palindromic SNPs
|
|
686
|
+
#1 Palindromic +strand -> no need to flip
|
|
687
|
+
#2 palindromic -strand -> need to flip -> fixed
|
|
688
|
+
#3 Indel no need flip
|
|
689
|
+
#4 Unknown Indel -> fixed
|
|
690
|
+
#5 Palindromic -strand -> need to flip
|
|
691
|
+
#6 Indel need flip
|
|
692
|
+
#7 indistinguishable
|
|
693
|
+
#8 Not matching or No information
|
|
694
|
+
#9 Unchecked
|
|
695
|
+
|
|
696
|
+
status0 = sumstats[status].str.match(r'\w\w\w\w\w\w[0]', case=False, flags=0, na=False)
|
|
697
|
+
status1 = sumstats[status].str.match(r'\w\w\w\w\w\w[1]', case=False, flags=0, na=False)
|
|
698
|
+
status5 = sumstats[status].str.match(r'\w\w\w\w\w\w[5]', case=False, flags=0, na=False)
|
|
699
|
+
status7 = sumstats[status].str.match(r'\w\w\w\w\w\w[7]', case=False, flags=0, na=False)
|
|
700
|
+
status8 = sumstats[status].str.match(r'\w\w\w\w\w[123][8]', case=False, flags=0, na=False)
|
|
701
|
+
|
|
702
|
+
log.write(" -Non-palindromic : ",sum(status0),verbose=verbose)
|
|
703
|
+
log.write(" -Palindromic SNPs on + strand: ",sum(status1),verbose=verbose)
|
|
704
|
+
log.write(" -Palindromic SNPs on - strand and needed to be flipped:",sum(status5),verbose=verbose)
|
|
705
|
+
log.write(" -Palindromic SNPs with MAF not available to infer : ",sum(status7),verbose=verbose)
|
|
706
|
+
log.write(" -Palindromic SNPs with no macthes or no information : ",sum(status8),verbose=verbose)
|
|
707
|
+
|
|
708
|
+
if ("7" in remove_snp) and ("8" in remove_snp) :
|
|
709
|
+
log.write(" -Palindromic SNPs with MAF not available to infer and with no macthes or no information will will be removed",verbose=verbose)
|
|
710
|
+
sumstats = sumstats.loc[~(status7 | status8),:].copy()
|
|
711
|
+
elif "8" in remove_snp:
|
|
712
|
+
log.write(" -Palindromic SNPs with no macthes or no information will be removed",verbose=verbose)
|
|
713
|
+
sumstats = sumstats.loc[~status8,:].copy()
|
|
714
|
+
elif "7" in remove_snp:
|
|
715
|
+
log.write(" -Palindromic SNPs with MAF not available to infer will be removed",verbose=verbose)
|
|
716
|
+
sumstats = sumstats.loc[~status7,:].copy()
|
|
617
717
|
|
|
618
718
|
### unknow_indel
|
|
619
719
|
if "i" in mode:
|
|
620
720
|
unknow_indel = sumstats[status].str.match(r'\w\w\w\w\w[6][89]', case=False, flags=0, na=False)
|
|
621
|
-
|
|
721
|
+
log.write(" -Identified ", sum(unknow_indel)," indistinguishable Indels...",verbose=verbose)
|
|
622
722
|
if sum(unknow_indel)>0:
|
|
623
|
-
|
|
723
|
+
log.write(" -Indistinguishable indels will be inferred from reference vcf REF and ALT...",verbose=verbose)
|
|
624
724
|
#########################################################################################
|
|
625
725
|
#with maf can not infer
|
|
626
|
-
#maf_can_infer = (sumstats
|
|
726
|
+
#maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
|
|
627
727
|
#sumstats.loc[unknow_indel&(~maf_can_infer),status] = vchange_status(sumstats.loc[unknow_indel&(~maf_can_infer),status],7,"9","8")
|
|
628
|
-
|
|
728
|
+
log.write(" -Difference in allele frequency (DAF) tolerance: {}".format(daf_tolerance),verbose=verbose)
|
|
629
729
|
|
|
630
730
|
if sum(unknow_indel)>0:
|
|
631
731
|
if sum(unknow_indel)<10000:
|
|
632
732
|
n_cores=1
|
|
633
|
-
df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
|
|
733
|
+
#df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
|
|
734
|
+
df_split = _df_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
|
|
634
735
|
pool = Pool(n_cores)
|
|
635
736
|
map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
|
|
636
737
|
status_inferred = pd.concat(pool.map(map_func,df_split))
|
|
637
738
|
sumstats.loc[unknow_indel,status] = status_inferred.values
|
|
638
|
-
|
|
639
|
-
|
|
739
|
+
pool.close()
|
|
740
|
+
pool.join()
|
|
741
|
+
|
|
640
742
|
#########################################################################################
|
|
641
743
|
|
|
642
744
|
status3 = sumstats[status].str.match(r'\w\w\w\w\w\w[3]', case=False, flags=0, na=False)
|
|
643
745
|
status6 = sumstats[status].str.match(r'\w\w\w\w\w\w[6]', case=False, flags=0, na=False)
|
|
644
746
|
status8 = sumstats[status].str.match(r'\w\w\w\w\w[6][8]', case=False, flags=0, na=False)
|
|
645
747
|
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
748
|
+
log.write(" -Indels ea/nea match reference : ",sum(status3),verbose=verbose)
|
|
749
|
+
log.write(" -Indels ea/nea need to be flipped : ",sum(status6),verbose=verbose)
|
|
750
|
+
log.write(" -Indels with no macthes or no information : ",sum(status8),verbose=verbose)
|
|
649
751
|
if "8" in remove_indel:
|
|
650
|
-
|
|
651
|
-
sumstats = sumstats.loc[~status8,:].copy()
|
|
652
|
-
|
|
752
|
+
log.write(" -Indels with no macthes or no information will be removed",verbose=verbose)
|
|
753
|
+
sumstats = sumstats.loc[~status8,:].copy()
|
|
754
|
+
else:
|
|
755
|
+
log.warning("No indistinguishable indels available for checking.")
|
|
756
|
+
|
|
757
|
+
finished(log,verbose,_end_line)
|
|
653
758
|
return sumstats
|
|
654
759
|
|
|
655
760
|
|
|
@@ -673,31 +778,45 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
673
778
|
|
|
674
779
|
################################################################################################################
|
|
675
780
|
def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,column_name="DAF",suffix="",n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
781
|
+
##start function with col checking##########################################################
|
|
782
|
+
_start_line = "check the difference between EAF and reference VCF ALT frequency"
|
|
783
|
+
_end_line = "checking the difference between EAF and reference VCF ALT frequency"
|
|
784
|
+
_start_cols = [chr,pos,ref,alt,eaf,status]
|
|
785
|
+
_start_function = ".check_daf()"
|
|
786
|
+
_must_args ={"ref_alt_freq":ref_alt_freq}
|
|
787
|
+
|
|
788
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
789
|
+
log=log,
|
|
790
|
+
verbose=verbose,
|
|
791
|
+
start_line=_start_line,
|
|
792
|
+
end_line=_end_line,
|
|
793
|
+
start_cols=_start_cols,
|
|
794
|
+
start_function=_start_function,
|
|
795
|
+
n_cores=n_cores,
|
|
796
|
+
ref_vcf=ref_infer,
|
|
797
|
+
**_must_args)
|
|
798
|
+
if is_enough_info == False: return sumstats
|
|
799
|
+
############################################################################################
|
|
800
|
+
|
|
682
801
|
chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
|
|
683
802
|
|
|
684
803
|
column_name = column_name + suffix
|
|
685
|
-
# check if the columns are complete
|
|
686
|
-
if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
|
|
687
|
-
raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
|
|
688
804
|
|
|
805
|
+
|
|
806
|
+
|
|
689
807
|
# ref_alt_freq INFO in vcf was provided
|
|
690
808
|
if ref_alt_freq is not None:
|
|
691
|
-
|
|
809
|
+
log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
|
|
692
810
|
if not force:
|
|
693
811
|
good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
|
|
694
|
-
|
|
812
|
+
log.write(" -Checking variants:", sum(good_chrpos),verbose=verbose)
|
|
695
813
|
sumstats[column_name]=np.nan
|
|
696
814
|
|
|
697
815
|
########################
|
|
698
816
|
if sum(~sumstats[eaf].isna())<10000:
|
|
699
817
|
n_cores=1
|
|
700
|
-
df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
|
|
818
|
+
#df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
|
|
819
|
+
df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
|
|
701
820
|
pool = Pool(n_cores)
|
|
702
821
|
if sum(~sumstats[eaf].isna())>0:
|
|
703
822
|
map_func = partial(checkaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,column_name=column_name,chr_dict=chr_dict)
|
|
@@ -708,25 +827,25 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
|
|
|
708
827
|
#status_inferred = sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]].apply(lambda x:check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict),axis=1)
|
|
709
828
|
|
|
710
829
|
#sumstats.loc[good_chrpos,"DAF"] = status_inferred.values
|
|
711
|
-
#sumstats
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
830
|
+
#sumstats["DAF"]=sumstats["DAF"].astype("float")
|
|
831
|
+
log.write(" - {} max:".format(column_name), np.nanmax(sumstats[column_name]),verbose=verbose)
|
|
832
|
+
log.write(" - {} min:".format(column_name), np.nanmin(sumstats[column_name]),verbose=verbose)
|
|
833
|
+
log.write(" - {} sd:".format(column_name), np.nanstd(sumstats[column_name]),verbose=verbose)
|
|
834
|
+
log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats[column_name])),verbose=verbose)
|
|
835
|
+
log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats[column_name])),verbose=verbose)
|
|
836
|
+
log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats[column_name])),verbose=verbose)
|
|
837
|
+
log.write("Finished allele frequency checking!")
|
|
719
838
|
return sumstats
|
|
720
839
|
|
|
721
840
|
def checkaf(sumstats,ref_infer,ref_alt_freq=None,column_name="DAF",chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
|
|
722
841
|
#vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
|
|
723
842
|
vcf_reader = VariantFile(ref_infer)
|
|
724
843
|
def afapply(x,vcf,alt_freq,chr_dict):
|
|
725
|
-
return check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict)
|
|
844
|
+
return check_daf(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,chr_dict)
|
|
726
845
|
map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
|
|
727
846
|
status_inferred = sumstats.apply(map_func,axis=1)
|
|
728
|
-
sumstats
|
|
729
|
-
sumstats
|
|
847
|
+
sumstats[column_name] = status_inferred.values
|
|
848
|
+
sumstats[column_name]=sumstats[column_name].astype("float")
|
|
730
849
|
return sumstats
|
|
731
850
|
|
|
732
851
|
def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
|
|
@@ -741,33 +860,44 @@ def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
|
|
|
741
860
|
################################################################################################################
|
|
742
861
|
|
|
743
862
|
def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
863
|
+
##start function with col checking##########################################################
|
|
864
|
+
_start_line = "infer EAF using reference VCF ALT frequency"
|
|
865
|
+
_end_line = "inferring EAF using reference VCF ALT frequency"
|
|
866
|
+
_start_cols = [chr,pos,ref,alt,eaf,status]
|
|
867
|
+
_start_function = ".infer_af()"
|
|
868
|
+
_must_args ={"ref_alt_freq":ref_alt_freq}
|
|
869
|
+
|
|
870
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
871
|
+
log=log,
|
|
872
|
+
verbose=verbose,
|
|
873
|
+
start_line=_start_line,
|
|
874
|
+
end_line=_end_line,
|
|
875
|
+
start_cols=_start_cols,
|
|
876
|
+
start_function=_start_function,
|
|
877
|
+
n_cores=n_cores,
|
|
878
|
+
ref_vcf=ref_infer,
|
|
879
|
+
**_must_args)
|
|
880
|
+
if is_enough_info == False: return sumstats
|
|
881
|
+
############################################################################################
|
|
750
882
|
chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
|
|
751
|
-
|
|
752
|
-
# check if the columns are complete
|
|
753
|
-
if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
|
|
754
|
-
raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
|
|
755
883
|
|
|
756
884
|
if eaf not in sumstats.columns:
|
|
757
885
|
sumstats[eaf]=np.nan
|
|
758
886
|
|
|
759
887
|
prenumber = sum(sumstats[eaf].isna())
|
|
888
|
+
|
|
760
889
|
# ref_alt_freq INFO in vcf was provided
|
|
761
890
|
if ref_alt_freq is not None:
|
|
762
|
-
|
|
891
|
+
log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
|
|
763
892
|
if not force:
|
|
764
893
|
good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
|
|
765
|
-
|
|
894
|
+
log.write(" -Checking variants:", sum(good_chrpos),verbose=verbose)
|
|
766
895
|
|
|
767
896
|
########################
|
|
768
897
|
if sum(sumstats[eaf].isna())<10000:
|
|
769
898
|
n_cores=1
|
|
770
|
-
df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
|
|
899
|
+
#df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
|
|
900
|
+
df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
|
|
771
901
|
pool = Pool(n_cores)
|
|
772
902
|
map_func = partial(inferaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
|
|
773
903
|
sumstats.loc[good_chrpos,[eaf]] = pd.concat(pool.map(map_func,df_split))
|
|
@@ -776,20 +906,21 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
|
|
|
776
906
|
###########################
|
|
777
907
|
|
|
778
908
|
afternumber = sum(sumstats[eaf].isna())
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
909
|
+
log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber),verbose=verbose)
|
|
910
|
+
log.write(" -EAF is still missing for {} variants.".format(afternumber),verbose=verbose)
|
|
911
|
+
|
|
912
|
+
finished(log,verbose,_end_line)
|
|
782
913
|
return sumstats
|
|
783
914
|
|
|
784
915
|
def inferaf(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
|
|
785
916
|
#vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
|
|
786
917
|
vcf_reader = VariantFile(ref_infer)
|
|
787
918
|
def afapply(x,vcf,alt_freq,chr_dict):
|
|
788
|
-
return infer_af(x[0],x[1]-1,x[1],x[2],x[3],vcf_reader,ref_alt_freq,chr_dict)
|
|
919
|
+
return infer_af(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],vcf_reader,ref_alt_freq,chr_dict)
|
|
789
920
|
map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
|
|
790
921
|
status_inferred = sumstats.apply(map_func,axis=1)
|
|
791
|
-
sumstats
|
|
792
|
-
sumstats
|
|
922
|
+
sumstats[eaf] = status_inferred.values
|
|
923
|
+
sumstats[eaf]=sumstats[eaf].astype("float")
|
|
793
924
|
return sumstats
|
|
794
925
|
|
|
795
926
|
def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
|
|
@@ -810,13 +941,13 @@ def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
|
|
|
810
941
|
def auto_check_vcf_chr_dict(vcf_path, vcf_chr_dict, verbose, log):
|
|
811
942
|
if vcf_path is not None:
|
|
812
943
|
if vcf_chr_dict is None:
|
|
813
|
-
|
|
944
|
+
log.write(" -Checking prefix for chromosomes in vcf files..." ,verbose=verbose)
|
|
814
945
|
prefix = check_vcf_chr_prefix(vcf_path)
|
|
815
946
|
if prefix is not None:
|
|
816
|
-
|
|
947
|
+
log.write(" -Prefix for chromosomes: ",prefix)
|
|
817
948
|
vcf_chr_dict = get_number_to_chr(prefix=prefix)
|
|
818
949
|
else:
|
|
819
|
-
|
|
950
|
+
log.write(" -No prefix for chromosomes in the VCF files." ,verbose=verbose)
|
|
820
951
|
vcf_chr_dict = get_number_to_chr()
|
|
821
952
|
return vcf_chr_dict
|
|
822
953
|
|
|
@@ -827,4 +958,6 @@ def check_vcf_chr_prefix(vcf_bcf_path):
|
|
|
827
958
|
if m is not None:
|
|
828
959
|
return m.group(1)
|
|
829
960
|
else:
|
|
830
|
-
return None
|
|
961
|
+
return None
|
|
962
|
+
|
|
963
|
+
|