gwaslab 3.5.6__py3-none-any.whl → 3.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/__init__.py +2 -0
- gwaslab/bd_common_data.py +1 -0
- gwaslab/bd_get_hapmap3.py +0 -1
- gwaslab/data/formatbook.json +78 -0
- gwaslab/g_Sumstats.py +98 -24
- gwaslab/g_SumstatsMulti.py +287 -0
- gwaslab/g_SumstatsPair.py +101 -16
- gwaslab/g_Sumstats_polars.py +245 -0
- gwaslab/g_headers.py +12 -3
- gwaslab/g_meta.py +123 -47
- gwaslab/g_meta_update.py +48 -0
- gwaslab/g_vchange_status_polars.py +44 -0
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +169 -110
- gwaslab/hm_casting_polars.py +202 -0
- gwaslab/hm_harmonize_sumstats.py +19 -8
- gwaslab/io_load_ld.py +529 -0
- gwaslab/io_preformat_input.py +11 -0
- gwaslab/io_preformat_input_polars.py +632 -0
- gwaslab/io_process_args.py +25 -1
- gwaslab/io_read_ldsc.py +34 -3
- gwaslab/io_read_pipcs.py +62 -6
- gwaslab/prscs_gigrnd.py +122 -0
- gwaslab/prscs_mcmc_gtb.py +136 -0
- gwaslab/prscs_parse_genet.py +98 -0
- gwaslab/qc_build.py +53 -0
- gwaslab/qc_check_datatype.py +10 -8
- gwaslab/qc_check_datatype_polars.py +128 -0
- gwaslab/qc_fix_sumstats.py +25 -23
- gwaslab/qc_fix_sumstats_polars.py +193 -0
- gwaslab/util_ex_calculate_ldmatrix.py +49 -19
- gwaslab/util_ex_gwascatalog.py +71 -28
- gwaslab/util_ex_ldsc.py +67 -21
- gwaslab/util_ex_match_ldmatrix.py +396 -0
- gwaslab/util_ex_run_2samplemr.py +0 -2
- gwaslab/util_ex_run_ccgwas.py +155 -0
- gwaslab/util_ex_run_coloc.py +1 -1
- gwaslab/util_ex_run_hyprcoloc.py +117 -0
- gwaslab/util_ex_run_mesusie.py +155 -0
- gwaslab/util_ex_run_mtag.py +92 -0
- gwaslab/util_ex_run_prscs.py +85 -0
- gwaslab/util_ex_run_susie.py +40 -9
- gwaslab/util_in_estimate_ess.py +18 -0
- gwaslab/util_in_fill_data.py +20 -1
- gwaslab/util_in_filter_value.py +10 -5
- gwaslab/util_in_get_sig.py +71 -13
- gwaslab/util_in_meta.py +168 -4
- gwaslab/util_in_meta_polars.py +174 -0
- gwaslab/viz_plot_compare_effect.py +87 -23
- gwaslab/viz_plot_credible_sets.py +55 -11
- gwaslab/viz_plot_effect.py +22 -12
- gwaslab/viz_plot_miamiplot2.py +3 -2
- gwaslab/viz_plot_mqqplot.py +165 -141
- gwaslab/viz_plot_qqplot.py +6 -6
- gwaslab/viz_plot_regional2.py +5 -13
- gwaslab/viz_plot_rg_heatmap.py +6 -1
- gwaslab/viz_plot_stackedregional.py +21 -6
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/METADATA +9 -7
- gwaslab-3.5.8.dist-info/RECORD +117 -0
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/WHEEL +1 -1
- gwaslab-3.5.6.dist-info/RECORD +0 -96
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE +0 -0
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/top_level.txt +0 -0
gwaslab/hm_casting.py
CHANGED
|
@@ -11,20 +11,38 @@ from gwaslab.util_in_fill_data import filldata
|
|
|
11
11
|
from Bio import SeqIO
|
|
12
12
|
from itertools import combinations
|
|
13
13
|
|
|
14
|
-
def _merge_mold_with_sumstats_by_chrpos(mold, sumstats, ref_path=None,
|
|
14
|
+
def _merge_mold_with_sumstats_by_chrpos(mold, sumstats, ref_path=None,add_raw_index=False, stats_cols1=None, stats_cols2=None,
|
|
15
|
+
windowsizeb=10,
|
|
16
|
+
log=Log(),
|
|
17
|
+
suffixes=("_MOLD",""),
|
|
18
|
+
merge_mode="inner",
|
|
19
|
+
verbose=True,
|
|
20
|
+
return_not_matched_mold =False):
|
|
15
21
|
|
|
16
|
-
|
|
22
|
+
log.write("Start to merge sumstats...", verbose=verbose)
|
|
23
|
+
if merge_mode=="outer":
|
|
24
|
+
sumstats = sumstats.rename(columns={
|
|
25
|
+
"SNPID":"_SNPID_RIGHT",
|
|
26
|
+
"rsID":"_rsID_RIGHT"
|
|
27
|
+
})
|
|
28
|
+
|
|
29
|
+
# drop old ids
|
|
17
30
|
cols_to_drop = []
|
|
18
31
|
for i in sumstats.columns:
|
|
19
32
|
if i in ["SNPID","rsID"]:
|
|
20
|
-
cols_to_drop.append(i)
|
|
21
|
-
|
|
22
|
-
log.write("Start to merge sumstats...", verbose=verbose)
|
|
23
|
-
|
|
33
|
+
cols_to_drop.append(i)
|
|
24
34
|
if len(cols_to_drop)>0:
|
|
25
35
|
log.write(" -Dropping old IDs:{}".format(cols_to_drop), verbose=verbose)
|
|
26
36
|
sumstats = sumstats.drop(columns=cols_to_drop)
|
|
27
|
-
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
if add_raw_index==True:
|
|
40
|
+
index1= "_INDEX" + suffixes[0]
|
|
41
|
+
index2= "_INDEX" + suffixes[1]
|
|
42
|
+
mold[index1] = mold.index
|
|
43
|
+
sumstats[index2] = sumstats.index
|
|
44
|
+
|
|
45
|
+
|
|
28
46
|
if ref_path is not None :
|
|
29
47
|
# index for checking removed variants
|
|
30
48
|
index1= "_INDEX" + suffixes[0]
|
|
@@ -32,11 +50,35 @@ def _merge_mold_with_sumstats_by_chrpos(mold, sumstats, ref_path=None, windowsiz
|
|
|
32
50
|
mold[index1] = range(len(mold))
|
|
33
51
|
sumstats[index2] = range(len(sumstats))
|
|
34
52
|
|
|
35
|
-
if return_not_matched_mold:
|
|
36
|
-
|
|
53
|
+
#if return_not_matched_mold:
|
|
54
|
+
# mold["_IDENTIFIER_FOR_VARIANT"] = range(len(mold))
|
|
55
|
+
# sumstats["_IDENTIFIER_FOR_VARIANT2"] = range(len(sumstats))
|
|
37
56
|
|
|
38
57
|
# mold suffix + mold
|
|
39
|
-
mold_sumstats = pd.merge(mold, sumstats, on=["CHR","POS"], how=
|
|
58
|
+
mold_sumstats = pd.merge(mold, sumstats, on=["CHR","POS"], how=merge_mode,suffixes=suffixes)
|
|
59
|
+
|
|
60
|
+
if merge_mode=="outer":
|
|
61
|
+
is_temp_na = mold_sumstats["EA_1"].isna()
|
|
62
|
+
log.write(" -Detected {} variants not in the template...".format(sum(is_temp_na)), verbose=verbose)
|
|
63
|
+
|
|
64
|
+
mold_sumstats["EA_1"] = mold_sumstats["EA_1"].astype("string")
|
|
65
|
+
mold_sumstats["NEA_1"] = mold_sumstats["NEA_1"].astype("string")
|
|
66
|
+
mold_sumstats["EA"] = mold_sumstats["EA"].astype("string")
|
|
67
|
+
mold_sumstats["NEA"] = mold_sumstats["NEA"].astype("string")
|
|
68
|
+
|
|
69
|
+
# for variants not in template, copy snp info
|
|
70
|
+
mold_sumstats.loc[is_temp_na, ["SNPID","EA_1","NEA_1","STATUS_1"]] = mold_sumstats.loc[is_temp_na, ["_SNPID_RIGHT","EA","NEA","STATUS"]].values
|
|
71
|
+
|
|
72
|
+
#
|
|
73
|
+
if "_rsID_RIGHT" in mold_sumstats.columns:
|
|
74
|
+
mold_sumstats.loc[is_temp_na, "rsID"] = mold_sumstats.loc[is_temp_na, "_rsID_RIGHT"].values
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# for variants not in right sumstats, copy snp info
|
|
78
|
+
is_temp_na_2 = mold_sumstats["EA"].isna()
|
|
79
|
+
mold_sumstats.loc[is_temp_na_2, ["EA","NEA"]] = mold_sumstats.loc[is_temp_na_2, ["EA_1","NEA_1"]].values
|
|
80
|
+
mold_sumstats = mold_sumstats.drop(columns=["_SNPID_RIGHT"])
|
|
81
|
+
|
|
40
82
|
log.write(" -After merging by CHR and POS:{}".format(len(mold_sumstats)), verbose=verbose)
|
|
41
83
|
|
|
42
84
|
mold_sumstats = _keep_variants_with_same_allele_set(mold_sumstats,suffixes=suffixes)
|
|
@@ -51,10 +93,17 @@ def _merge_mold_with_sumstats_by_chrpos(mold, sumstats, ref_path=None, windowsiz
|
|
|
51
93
|
# mold_sumstats.drop(columns=["_INDEX",""])
|
|
52
94
|
|
|
53
95
|
if return_not_matched_mold == True:
|
|
54
|
-
|
|
55
|
-
sumstats1=
|
|
56
|
-
|
|
57
|
-
|
|
96
|
+
|
|
97
|
+
sumstats1 = mold.loc[~mold["_RAW_INDEX_1"].isin(mold_sumstats["_RAW_INDEX_1"]),:]
|
|
98
|
+
sumstats1 = sumstats1.drop(columns=["_RAW_INDEX_1"])
|
|
99
|
+
sumstats1 = _renaming_cols_r(sumstats1, stats_cols1 +["EA","NEA"],suffix="_1", verbose=False)
|
|
100
|
+
|
|
101
|
+
sumstats2 = sumstats.loc[~sumstats["_RAW_INDEX_2"].isin(mold_sumstats["_RAW_INDEX_2"]),:]
|
|
102
|
+
sumstats2 = sumstats2.drop(columns=["_RAW_INDEX_2"])
|
|
103
|
+
|
|
104
|
+
mold_sumstats= mold_sumstats.drop(columns=["_RAW_INDEX_1","_RAW_INDEX_2"])
|
|
105
|
+
|
|
106
|
+
return mold_sumstats, sumstats1, sumstats2
|
|
58
107
|
|
|
59
108
|
return mold_sumstats
|
|
60
109
|
|
|
@@ -117,6 +166,16 @@ def _renaming_cols(sumstats, columns, log=Log(),verbose=True, suffixes=("_1","_2
|
|
|
117
166
|
log.write(" -Renaming sumstats2 columns by adding suffix {}".format(suffixes[1]),verbose=verbose)
|
|
118
167
|
return sumstats
|
|
119
168
|
|
|
169
|
+
def _renaming_cols_r(sumstats, columns, log=Log(),verbose=True, suffix=""):
    """Strip ``suffix`` from the listed columns of a pandas sumstats table.

    ``columns`` holds the base names (without the suffix). A base name is
    renamed only when its suffixed form ``name + suffix`` is present in
    ``sumstats.columns``; everything else is left untouched. The log line is
    written whether or not any column was actually renamed.

    Returns the DataFrame produced by ``sumstats.rename``.
    """
    available = sumstats.columns
    # map "<name><suffix>" -> "<name>" for every suffixed column that exists
    mapping = {
        base + suffix: base
        for base in columns
        if base + suffix in available
    }
    renamed = sumstats.rename(columns=mapping)
    log.write(" -Renaming sumstats columns by removing suffix {}".format(suffix),verbose=verbose)
    return renamed
|
|
178
|
+
|
|
120
179
|
def _sort_pair_cols(molded_sumstats, verbose=True, log=Log(), order=None, stats_order=None,suffixes=("_1","_2")):
|
|
121
180
|
if stats_order is None:
|
|
122
181
|
order = ["SNPID","rsID", "CHR", "POS", "EA", "NEA"]
|
|
@@ -158,99 +217,99 @@ def _assign_warning_code(sumstats, threshold=0.2, log=Log(),verbose=True):
|
|
|
158
217
|
return sumstats
|
|
159
218
|
|
|
160
219
|
|
|
161
|
-
def _match_two_sumstats(mold,sumstats,ref_path,windowsizeb=25,verbose=True,log=Log()):
|
|
162
|
-
|
|
163
|
-
records = SeqIO.parse(ref_path, "fasta")
|
|
164
|
-
|
|
165
|
-
chromlist = list(set(mold["CHR"].values) & set(sumstats["CHR"].values))
|
|
166
|
-
|
|
167
|
-
for record in records:
|
|
168
|
-
if len(chromlist) ==0:
|
|
169
|
-
break
|
|
170
|
-
|
|
171
|
-
if record is not None:
|
|
172
|
-
##############################################################################
|
|
173
|
-
record_chr = int(str(record.id).strip("chrCHR").upper())
|
|
174
|
-
|
|
175
|
-
if record_chr in chromlist:
|
|
176
|
-
log.write(record_chr," ", end="",show_time=False,verbose=verbose)
|
|
177
|
-
chromlist.remove(record_chr)
|
|
178
|
-
else:
|
|
179
|
-
continue
|
|
180
|
-
###############################################################################
|
|
181
|
-
mold_chr = mold.loc[mold["CHR"]==record_chr,:]
|
|
182
|
-
sumstats_chr = sumstats.loc[sumstats["CHR"]==record_chr,:]
|
|
183
|
-
|
|
184
|
-
for index, row in sumstats_chr.iterrows():
|
|
185
|
-
if len(row["EA"])>1 or len(row["NEA"])>1:
|
|
186
|
-
is_in_variants_lista = (mold_chr["POS"] > row["POS"] - windowsizeb) & (mold_chr["POS"]< row["POS"] + windowsizeb)
|
|
187
|
-
|
|
188
|
-
is_in_variants_listb = (sumstats_chr["POS"] > row["POS"] - windowsizeb) & (sumstats_chr["POS"]< row["POS"] + windowsizeb)
|
|
189
|
-
|
|
190
|
-
if sum(is_in_variants_lista)>0 and sum(is_in_variants_listb)>0 and (sum(is_in_variants_lista) + sum(is_in_variants_listb) >2):
|
|
191
|
-
variants_lista = mold.loc[is_in_variants_lista,:]
|
|
192
|
-
variants_listb = sumstats.loc[is_in_variants_listb,:]
|
|
193
|
-
|
|
194
|
-
refseq = record[row["POS"]-1 - windowsizeb: row["POS"] + windowsizeb].seq.upper()
|
|
195
|
-
_match_single_variant(refseq, variants_lista, variants_listb, left_offset=row["POS"] - windowsizeb, windowsizeb=windowsizeb)
|
|
196
|
-
|
|
197
|
-
def _match_single_variant(refseq, variants_lista, variants_listb, left_offset,windowsizeb):
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
seta=set()
|
|
201
|
-
setb=set()
|
|
202
|
-
|
|
203
|
-
seta_pumutations=[]
|
|
204
|
-
for i in range(1, len(variants_lista)+1):
|
|
205
|
-
seta_pumutations+=combinations(variants_lista.index, i)
|
|
206
|
-
|
|
207
|
-
for i in seta_pumutations:
|
|
208
|
-
if _is_ref_overlap(variants_lista.loc[i,:],suffix="_MOLD"):
|
|
209
|
-
continue
|
|
210
|
-
else:
|
|
211
|
-
seta = _form_haplotype(refseq, variants_lista.loc[i,:], seta, left_offset,suffix="_MOLD")
|
|
212
|
-
|
|
213
|
-
setb_pumutations=[]
|
|
214
|
-
for i in range(1,len(variants_listb)+1):
|
|
215
|
-
setb_pumutations+=combinations(variants_listb.index, i)
|
|
216
|
-
for i in setb_pumutations:
|
|
217
|
-
if _is_ref_overlap(variants_listb.loc[i,:],suffix=""):
|
|
218
|
-
continue
|
|
219
|
-
else:
|
|
220
|
-
setb = _form_haplotype(refseq, variants_listb.loc[i,:], setb, left_offset,suffix="")
|
|
221
|
-
|
|
222
|
-
if len(seta & setb)>0:
|
|
223
|
-
print("-Topmed--------------------------------")
|
|
224
|
-
print(variants_lista[["CHR","POS","NEA_MOLD","EA_MOLD","EAF_MOLD"]])
|
|
225
|
-
print("-Finngen--------------------------------")
|
|
226
|
-
print(variants_listb[["CHR","POS","NEA","EA","EAF"]])
|
|
227
|
-
print(refseq,left_offset)
|
|
228
|
-
print("-set a--------------------------------")
|
|
229
|
-
print(seta)
|
|
230
|
-
print("-set b---------------------------------")
|
|
231
|
-
print(setb)
|
|
232
|
-
print("------------------------------------")
|
|
233
|
-
print("maybe equivalent ########################################################################")
|
|
234
|
-
a = seta & setb
|
|
235
|
-
for i in a:
|
|
236
|
-
print(i)
|
|
237
|
-
|
|
238
|
-
def _is_ref_overlap(variants_list,suffix="_MOLD"):
|
|
239
|
-
previous_end = 0
|
|
240
|
-
for index, row in variants_list.iterrows():
|
|
241
|
-
if row["POS"] <= previous_end:
|
|
242
|
-
return True
|
|
243
|
-
if row["POS"] + len(row["NEA"+suffix]) -1 > previous_end:
|
|
244
|
-
previous_end = row["POS"] + len(row["NEA"+suffix]) -1
|
|
245
|
-
return False
|
|
246
|
-
|
|
247
|
-
def _form_haplotype(refseq, variants_list, haplotype_set, left_offset,suffix="_MOLD"):
|
|
248
|
-
new_haplotype = ""
|
|
249
|
-
lastpos = 0
|
|
250
|
-
for index, row in variants_list.iterrows():
|
|
251
|
-
new_haplotype += refseq[lastpos:row["POS"] - left_offset]
|
|
252
|
-
new_haplotype += row["EA"+suffix]
|
|
253
|
-
lastpos = row["POS"] + len(row["NEA"+suffix])- left_offset
|
|
254
|
-
new_haplotype += refseq[lastpos:]
|
|
255
|
-
haplotype_set.add(new_haplotype)
|
|
256
|
-
return haplotype_set
|
|
220
|
+
#def _match_two_sumstats(mold,sumstats,ref_path,windowsizeb=25,verbose=True,log=Log()):
|
|
221
|
+
#
|
|
222
|
+
# records = SeqIO.parse(ref_path, "fasta")
|
|
223
|
+
#
|
|
224
|
+
# chromlist = list(set(mold["CHR"].values) & set(sumstats["CHR"].values))
|
|
225
|
+
#
|
|
226
|
+
# for record in records:
|
|
227
|
+
# if len(chromlist) ==0:
|
|
228
|
+
# break
|
|
229
|
+
#
|
|
230
|
+
# if record is not None:
|
|
231
|
+
# ##############################################################################
|
|
232
|
+
# record_chr = int(str(record.id).strip("chrCHR").upper())
|
|
233
|
+
#
|
|
234
|
+
# if record_chr in chromlist:
|
|
235
|
+
# log.write(record_chr," ", end="",show_time=False,verbose=verbose)
|
|
236
|
+
# chromlist.remove(record_chr)
|
|
237
|
+
# else:
|
|
238
|
+
# continue
|
|
239
|
+
# ###############################################################################
|
|
240
|
+
# mold_chr = mold.loc[mold["CHR"]==record_chr,:]
|
|
241
|
+
# sumstats_chr = sumstats.loc[sumstats["CHR"]==record_chr,:]
|
|
242
|
+
#
|
|
243
|
+
# for index, row in sumstats_chr.iterrows():
|
|
244
|
+
# if len(row["EA"])>1 or len(row["NEA"])>1:
|
|
245
|
+
# is_in_variants_lista = (mold_chr["POS"] > row["POS"] - windowsizeb) & (mold_chr["POS"]< row["POS"] + windowsizeb)
|
|
246
|
+
#
|
|
247
|
+
# is_in_variants_listb = (sumstats_chr["POS"] > row["POS"] - windowsizeb) & (sumstats_chr["POS"]< row["POS"] + windowsizeb)
|
|
248
|
+
#
|
|
249
|
+
# if sum(is_in_variants_lista)>0 and sum(is_in_variants_listb)>0 and (sum(is_in_variants_lista) + sum(is_in_variants_listb) >2):
|
|
250
|
+
# variants_lista = mold.loc[is_in_variants_lista,:]
|
|
251
|
+
# variants_listb = sumstats.loc[is_in_variants_listb,:]
|
|
252
|
+
#
|
|
253
|
+
# refseq = record[row["POS"]-1 - windowsizeb: row["POS"] + windowsizeb].seq.upper()
|
|
254
|
+
# _match_single_variant(refseq, variants_lista, variants_listb, left_offset=row["POS"] - windowsizeb, windowsizeb=windowsizeb)
|
|
255
|
+
#
|
|
256
|
+
#def _match_single_variant(refseq, variants_lista, variants_listb, left_offset,windowsizeb):
|
|
257
|
+
#
|
|
258
|
+
#
|
|
259
|
+
# seta=set()
|
|
260
|
+
# setb=set()
|
|
261
|
+
#
|
|
262
|
+
# seta_pumutations=[]
|
|
263
|
+
# for i in range(1, len(variants_lista)+1):
|
|
264
|
+
# seta_pumutations+=combinations(variants_lista.index, i)
|
|
265
|
+
#
|
|
266
|
+
# for i in seta_pumutations:
|
|
267
|
+
# if _is_ref_overlap(variants_lista.loc[i,:],suffix="_MOLD"):
|
|
268
|
+
# continue
|
|
269
|
+
# else:
|
|
270
|
+
# seta = _form_haplotype(refseq, variants_lista.loc[i,:], seta, left_offset,suffix="_MOLD")
|
|
271
|
+
#
|
|
272
|
+
# setb_pumutations=[]
|
|
273
|
+
# for i in range(1,len(variants_listb)+1):
|
|
274
|
+
# setb_pumutations+=combinations(variants_listb.index, i)
|
|
275
|
+
# for i in setb_pumutations:
|
|
276
|
+
# if _is_ref_overlap(variants_listb.loc[i,:],suffix=""):
|
|
277
|
+
# continue
|
|
278
|
+
# else:
|
|
279
|
+
# setb = _form_haplotype(refseq, variants_listb.loc[i,:], setb, left_offset,suffix="")
|
|
280
|
+
#
|
|
281
|
+
# if len(seta & setb)>0:
|
|
282
|
+
# print("-Topmed--------------------------------")
|
|
283
|
+
# print(variants_lista[["CHR","POS","NEA_MOLD","EA_MOLD","EAF_MOLD"]])
|
|
284
|
+
# print("-Finngen--------------------------------")
|
|
285
|
+
# print(variants_listb[["CHR","POS","NEA","EA","EAF"]])
|
|
286
|
+
# print(refseq,left_offset)
|
|
287
|
+
# print("-set a--------------------------------")
|
|
288
|
+
# print(seta)
|
|
289
|
+
# print("-set b---------------------------------")
|
|
290
|
+
# print(setb)
|
|
291
|
+
# print("------------------------------------")
|
|
292
|
+
# print("maybe equivalent ########################################################################")
|
|
293
|
+
# a = seta & setb
|
|
294
|
+
# for i in a:
|
|
295
|
+
# print(i)
|
|
296
|
+
#
|
|
297
|
+
#def _is_ref_overlap(variants_list,suffix="_MOLD"):
|
|
298
|
+
# previous_end = 0
|
|
299
|
+
# for index, row in variants_list.iterrows():
|
|
300
|
+
# if row["POS"] <= previous_end:
|
|
301
|
+
# return True
|
|
302
|
+
# if row["POS"] + len(row["NEA"+suffix]) -1 > previous_end:
|
|
303
|
+
# previous_end = row["POS"] + len(row["NEA"+suffix]) -1
|
|
304
|
+
# return False
|
|
305
|
+
#
|
|
306
|
+
#def _form_haplotype(refseq, variants_list, haplotype_set, left_offset,suffix="_MOLD"):
|
|
307
|
+
# new_haplotype = ""
|
|
308
|
+
# lastpos = 0
|
|
309
|
+
# for index, row in variants_list.iterrows():
|
|
310
|
+
# new_haplotype += refseq[lastpos:row["POS"] - left_offset]
|
|
311
|
+
# new_haplotype += row["EA"+suffix]
|
|
312
|
+
# lastpos = row["POS"] + len(row["NEA"+suffix])- left_offset
|
|
313
|
+
# new_haplotype += refseq[lastpos:]
|
|
314
|
+
# haplotype_set.add(new_haplotype)
|
|
315
|
+
# return haplotype_set
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from gwaslab.g_Log import Log
|
|
4
|
+
from pandas.api.types import CategoricalDtype
|
|
5
|
+
from gwaslab.g_vchange_status import copy_status
|
|
6
|
+
from gwaslab.g_vchange_status_polars import vchange_statusp
|
|
7
|
+
from gwaslab.g_vchange_status_polars import copy_statusp
|
|
8
|
+
from gwaslab.qc_fix_sumstats import flipallelestats
|
|
9
|
+
from gwaslab.qc_check_datatype import check_datatype
|
|
10
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
11
|
+
from gwaslab.util_in_fill_data import filldata
|
|
12
|
+
from Bio import SeqIO
|
|
13
|
+
from itertools import combinations
|
|
14
|
+
import polars as pl
|
|
15
|
+
|
|
16
|
+
def _merge_mold_with_sumstats_by_chrposp(mold, sumstats, ref_path=None,add_raw_index=False, stats_cols1=None, stats_cols2=None,
                                         windowsizeb=10,
                                         log=Log(),
                                         suffixes=("_MOLD",""),
                                         merge_mode="full",
                                         verbose=True,
                                         return_not_matched_mold =False):
    """Join a template ("mold") table with a sumstats table on CHR and POS (polars).

    Steps performed:

    * ``merge_mode == "full"``: the ``SNPID`` / ``rsID`` columns of
      ``sumstats`` are first renamed to ``_SNPID_RIGHT`` / ``_rsID_RIGHT``
      (missing columns are tolerated via ``strict=False``).
    * Any ``SNPID`` / ``rsID`` column still present in ``sumstats`` is dropped.
    * ``mold.join(sumstats, on=["CHR","POS"], how=merge_mode, suffix="_",
      coalesce=True)`` is executed.
    * ``merge_mode == "full"``: rows whose ``EA_1`` is null are treated as
      "not in the template" and receive ``SNPID``, ``EA_1``, ``NEA_1``,
      ``STATUS_1`` (and ``rsID`` when ``_rsID_RIGHT`` exists) from the
      right-hand columns; rows whose ``EA`` is null receive ``EA`` / ``NEA``
      from ``EA_1`` / ``NEA_1``; ``_SNPID_RIGHT`` is then dropped.
    * The result is filtered with ``_keep_variants_with_same_allele_setp``
      using ``suffixes``.

    ``ref_path``, ``add_raw_index``, ``stats_cols1``, ``stats_cols2``,
    ``windowsizeb`` and ``return_not_matched_mold`` are accepted but not used
    anywhere in this body.

    NOTE(review): the "full" branch hard-codes the ``EA_1`` / ``NEA_1`` /
    ``STATUS_1`` column names instead of deriving them from ``suffixes`` -
    presumably callers always pass a template carrying the ``_1`` suffix;
    confirm against the call sites.

    Returns the merged and allele-filtered polars DataFrame.
    """
    log.write("Start to merge sumstats...", verbose=verbose)
    if merge_mode=="full":
        # keep the right-hand IDs under temporary names so they can be copied
        # onto rows that only exist in ``sumstats``
        sumstats = sumstats.rename({
            "SNPID":"_SNPID_RIGHT",
            "rsID":"_rsID_RIGHT"
        }, strict=False)

    # drop old ids
    cols_to_drop = [i for i in sumstats.columns if i in ["SNPID","rsID"]]
    if len(cols_to_drop)>0:
        log.write(" -Dropping old IDs:{}".format(cols_to_drop), verbose=verbose)
        # polars takes the column names positionally (``drop(*columns)``);
        # the pandas-style ``columns=`` keyword is not part of that signature
        sumstats = sumstats.drop(cols_to_drop)

    ##################################################################################################################

    # right-hand columns that clash with a template column get the "_" suffix
    mold_sumstats = mold.join(sumstats, on=["CHR","POS"], how=merge_mode, suffix="_", coalesce=True)

    # NOTE(review): the nesting of this branch was reconstructed from a
    # flattened listing; everything down to the drop of "_SNPID_RIGHT" is
    # assumed to belong to the "full" case because that column only exists then.
    if merge_mode=="full":
        is_temp_na = mold_sumstats["EA_1"].is_null()
        # Series.sum() counts the True values without a Python-level loop
        log.write(" -Detected {} variants not in the template...".format(is_temp_na.sum()), verbose=verbose)

        mold_sumstats = mold_sumstats.with_columns(
            [pl.col(i).cast(pl.String).alias(i) for i in ["EA_1","NEA_1","EA","NEA"]]
        )

        # for variants not in template, copy snp info
        # (each expression reads only its own target and an untouched source
        # column, so evaluating them in one batch is order-independent)
        mold_sumstats = mold_sumstats.with_columns(
            pl.when( is_temp_na )
            .then( pl.col("_SNPID_RIGHT") )
            .otherwise( pl.col("SNPID") )
            .alias("SNPID"),
            pl.when( is_temp_na )
            .then( pl.col("EA") )
            .otherwise( pl.col("EA_1") )
            .alias("EA_1"),
            pl.when( is_temp_na )
            .then( pl.col("NEA") )
            .otherwise( pl.col("NEA_1") )
            .alias("NEA_1"),
            pl.when( is_temp_na )
            .then( pl.col("STATUS") )
            .otherwise( pl.col("STATUS_1") )
            .alias("STATUS_1"),
        )

        if "_rsID_RIGHT" in mold_sumstats.columns:
            mold_sumstats = mold_sumstats.with_columns(
                pl.when( is_temp_na )
                .then( pl.col("_rsID_RIGHT") )
                .otherwise( pl.col("rsID") )
                .alias("rsID")
            )

        # for variants not in right sumstats, copy snp info
        # (must run after the block above: it reads the already-filled EA_1/NEA_1)
        is_temp_na_2 = mold_sumstats["EA"].is_null()

        mold_sumstats = mold_sumstats.with_columns(
            pl.when( is_temp_na_2 )
            .then( pl.col("EA_1") )
            .otherwise( pl.col("EA") )
            .alias("EA"),
            pl.when( is_temp_na_2 )
            .then( pl.col("NEA_1") )
            .otherwise( pl.col("NEA") )
            .alias("NEA"),
        )

        mold_sumstats = mold_sumstats.drop(["_SNPID_RIGHT"])

    log.write(" -After merging by CHR and POS:{}".format(len(mold_sumstats)), verbose=verbose)

    # forward log/verbose so the match statistics land in the caller's log
    # and honour ``verbose`` like every other message of this function
    mold_sumstats = _keep_variants_with_same_allele_setp(mold_sumstats, log=log, verbose=verbose, suffixes=suffixes)

    log.write(" -Matched variants:{}".format(len(mold_sumstats)), verbose=verbose)

    return mold_sumstats
|
|
109
|
+
|
|
110
|
+
def _keep_variants_with_same_allele_setp(sumstats, log=Log(),verbose=True,suffixes=("_MOLD","")):
    """Keep only the rows whose two allele pairs describe the same allele set.

    The first pair is read from ``"EA" + suffixes[0]`` / ``"NEA" + suffixes[0]``
    and the second from ``"EA" + suffixes[1]`` / ``"NEA" + suffixes[1]``.
    A row is kept when the pairs are identical (perfect match) or swapped
    (flipped match). The number of rows in each category is written to the log.

    Returns ``sumstats.filter(...)`` applied with that mask.
    """
    first, second = suffixes[0], suffixes[1]

    ea_second = sumstats["EA" + second]
    ea_first = sumstats["EA" + first]
    nea_second = sumstats["NEA" + second]
    nea_first = sumstats["NEA" + first]

    same_orientation = (ea_second == ea_first) & (nea_second == nea_first)
    swapped = (ea_second == nea_first) & (nea_second == ea_first)
    keep = swapped | same_orientation

    log.write(" -Matching alleles and keeping only variants with same allele set: ", verbose=verbose)
    for template, mask in (
        (" -Perfect match: {}", same_orientation),
        (" -Flipped match: {}", swapped),
        (" -Unmatched : {}", ~keep),
    ):
        log.write(template.format(sum(mask)), verbose=verbose)

    return sumstats.filter(keep)
|
|
127
|
+
|
|
128
|
+
def _align_with_moldp(sumstats, log=Log(),verbose=True, suffixes=("_MOLD","")):
    """Update the STATUS column according to how each row's alleles line up.

    Allele and status columns are looked up as ``EA`` / ``NEA`` / ``STATUS``
    plus ``suffixes[0]`` (first set) and plus ``suffixes[1]`` (second set).

    * Rows where both allele pairs are identical are handed to
      ``copy_statusp`` together with both status column names and ``6``.
    * Rows where the pairs are swapped are handed to ``vchange_statusp`` with
      the second status column, ``6``, ``"456789"`` and ``"333333"``.

    The exact STATUS rewriting is done by those helpers; the log messages
    describe it as copying STATUS from the reference and converting
    ``xxxxx[456789]x`` to ``xxxxx3x`` respectively.

    Returns the table returned by the last helper call.
    """
    first, second = suffixes[0], suffixes[1]
    status_first = "STATUS" + first
    status_second = "STATUS" + second

    ea_second = sumstats["EA" + second]
    ea_first = sumstats["EA" + first]
    nea_second = sumstats["NEA" + second]
    nea_first = sumstats["NEA" + first]

    same_orientation = (ea_second == ea_first) & (nea_second == nea_first)
    swapped = (ea_second == nea_first) & (nea_second == ea_first)

    log.write(" -Aligning alleles with reference: ", verbose=verbose)
    log.write(" -Perfect match: {}".format(sum(same_orientation)), verbose=verbose)
    log.write(" -Flipped match: {}".format(sum(swapped)), verbose=verbose)

    log.write(" -For perfect match: copy STATUS from reference...", verbose=verbose)
    aligned = copy_statusp(sumstats, same_orientation, status_first, status_second, 6)

    log.write(" -For Flipped match: convert STATUS xxxxx[456789]x to xxxxx3x...", verbose=verbose)
    aligned = vchange_statusp(aligned, swapped, status_second, 6,"456789","333333")

    return aligned
|
|
153
|
+
|
|
154
|
+
def _fill_missing_columnsp(sumstats, columns, log=Log(),verbose=True):
    """Delegate to ``filldata`` to fill the requested ``columns``.

    ``log`` and ``verbose`` are accepted for signature parity with the other
    helpers in this module but are not forwarded to ``filldata``.

    Returns whatever ``filldata(sumstats, to_fill=columns)`` returns.
    """
    return filldata(sumstats, to_fill=columns)
|
|
157
|
+
|
|
158
|
+
def _renaming_colsp(sumstats, columns, log=Log(),verbose=True, suffixes=("_1","_2")):
    """Append ``suffixes[1]`` to ``STATUS`` and to the listed columns that exist.

    ``STATUS`` is always part of the rename mapping; every entry of
    ``columns`` is added only when it is present in ``sumstats.columns``.

    Returns the table produced by ``sumstats.rename``.
    """
    tail = suffixes[1]
    existing = sumstats.columns
    # "STATUS" is renamed unconditionally, the rest only when present
    targets = ["STATUS"] + [name for name in columns if name in existing]
    mapping = {name: name + tail for name in targets}
    renamed = sumstats.rename(mapping)
    log.write(" -Renaming sumstats2 columns by adding suffix {}".format(suffixes[1]),verbose=verbose)
    return renamed
|
|
166
|
+
|
|
167
|
+
def _renaming_cols_rp(sumstats, columns, log=Log(),verbose=True, suffix=""):
    """Strip ``suffix`` from the listed columns of a sumstats table.

    ``columns`` holds the base names (without the suffix). A base name is
    renamed only when ``name + suffix`` is present in ``sumstats.columns``.
    The log line is written whether or not any column was renamed.

    Returns the table produced by ``sumstats.rename``.
    """
    existing = sumstats.columns
    # map "<name><suffix>" -> "<name>" for every suffixed column that exists
    mapping = {
        base + suffix: base
        for base in columns
        if base + suffix in existing
    }
    renamed = sumstats.rename(mapping)
    log.write(" -Renaming sumstats columns by removing suffix {}".format(suffix),verbose=verbose)
    return renamed
|
|
176
|
+
|
|
177
|
+
def _sort_pair_colsp(molded_sumstats, verbose=True, log=Log(), order=None, stats_order=None,suffixes=("_1","_2")):
    """Reorder the columns of a paired sumstats table.

    The target order is ``order`` (variant-level columns) followed by every
    name in ``stats_order`` with each entry of ``suffixes`` appended, one
    suffix after the other. Columns of that list that exist in
    ``molded_sumstats`` come first, in that order; all remaining columns
    follow in their current order.

    Parameters
    ----------
    order : list of str, optional
        Leading columns. Defaults to SNPID, rsID, CHR, POS, EA, NEA.
        The list passed by the caller is not modified.
    stats_order : list of str, optional
        Un-suffixed statistic column names. Defaults to the built-in list below.
    suffixes : sequence of str
        Suffixes appended to every entry of ``stats_order``.

    Returns ``molded_sumstats[output_columns]``.
    """
    # The two defaults are resolved independently: supplying only one of
    # ``order`` / ``stats_order`` must neither crash nor discard the other.
    if order is None:
        order = ["SNPID","rsID", "CHR", "POS", "EA", "NEA"]
    if stats_order is None:
        stats_order = ["EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z",
                       "CHISQ", "P", "MLOG10P", "OR", "OR_95L", "OR_95U","HR", "HR_95L", "HR_95U","INFO", "N","N_CASE","N_CONTROL","DIRECTION","I2","P_HET","DOF","SNPR2","STATUS"]

    # build on a copy so a caller-supplied ``order`` list is not extended in place
    full_order = list(order)
    full_order.extend(stat + suffix for suffix in suffixes for stat in stats_order)

    log.write("Start to reorder the columns...",verbose=verbose)

    current_columns = molded_sumstats.columns
    known = set(full_order)
    # requested columns that exist, then everything that was not requested
    output_columns = [name for name in full_order if name in current_columns]
    output_columns.extend(name for name in current_columns if name not in known)

    log.write(" -Reordering columns to :", ",".join(output_columns), verbose=verbose)
    molded_sumstats = molded_sumstats[output_columns]
    log.write("Finished sorting columns successfully!", verbose=verbose)

    return molded_sumstats
|
gwaslab/hm_harmonize_sumstats.py
CHANGED
|
@@ -17,6 +17,7 @@ from gwaslab.qc_fix_sumstats import check_col
|
|
|
17
17
|
from gwaslab.qc_fix_sumstats import start_to
|
|
18
18
|
from gwaslab.qc_fix_sumstats import finished
|
|
19
19
|
from gwaslab.qc_fix_sumstats import skipped
|
|
20
|
+
from gwaslab.qc_fix_sumstats import sortcoordinate
|
|
20
21
|
from gwaslab.qc_check_datatype import check_dataframe_shape
|
|
21
22
|
from gwaslab.bd_common_data import get_number_to_chr
|
|
22
23
|
from gwaslab.bd_common_data import get_chr_list
|
|
@@ -397,7 +398,6 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
|
|
|
397
398
|
def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array, records_len: np.array):
|
|
398
399
|
# starting_positions and records_len must be 1D arrays containing data only for the chromosomes contained in x,
|
|
399
400
|
# and these arrays must be ordered in the same way as the chromosomes in np.unique(x['CHR'].values).
|
|
400
|
-
|
|
401
401
|
# status
|
|
402
402
|
#0 / -----> match
|
|
403
403
|
#1 / -----> Flipped Fixed
|
|
@@ -435,12 +435,13 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
|
|
|
435
435
|
_chrom = _chrom.values
|
|
436
436
|
unique_values, _ = np.unique(_chrom, return_inverse=True) # Get the sorted unique values and their indices
|
|
437
437
|
chrom = np.searchsorted(unique_values, _chrom) # Replace each value in '_chrom' with its corresponding index in the sorted unique values
|
|
438
|
-
|
|
438
|
+
|
|
439
439
|
max_len_nea = _nea.str.len().max()
|
|
440
440
|
max_len_ea = _ea.str.len().max()
|
|
441
441
|
|
|
442
442
|
########################################## mask for variants with out of range POS
|
|
443
443
|
mask_outlier = pos > records_len[chrom]
|
|
444
|
+
|
|
444
445
|
#########################################
|
|
445
446
|
|
|
446
447
|
# Let's apply the same magic used for the fasta records (check build_fasta_records() for details) to convert the NEA and EA to
|
|
@@ -538,6 +539,7 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
|
|
|
538
539
|
# -> nea == output_nea: [[True, True], [True, False]], mask: [[True, True], [True, False]]
|
|
539
540
|
# -> nea == output_nea + ~mask: [[True, True], [True, True]]
|
|
540
541
|
# -> np.all(nea == output_nea + ~mask, 1): [True, True]
|
|
542
|
+
|
|
541
543
|
nea_eq_ref = np.all((nea == output_nea) + ~mask_nea, 1)
|
|
542
544
|
rev_nea_eq_ref = np.all((rev_nea == output_nea) + ~mask_nea, 1)
|
|
543
545
|
|
|
@@ -550,6 +552,7 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
|
|
|
550
552
|
output_ea[mask_outlier] = PADDING_VALUE
|
|
551
553
|
##################################################################
|
|
552
554
|
|
|
555
|
+
|
|
553
556
|
ea_eq_ref = np.all((ea == output_ea) + ~mask_ea, 1)
|
|
554
557
|
rev_ea_eq_ref = np.all((rev_ea == output_ea) + ~mask_ea, 1)
|
|
555
558
|
|
|
@@ -617,6 +620,7 @@ def check_status(sumstats: pd.DataFrame, fasta_records_dict, log=Log(), verbose=
|
|
|
617
620
|
unique_chrom_cond = sumstats_cond[chrom].unique()
|
|
618
621
|
starting_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_cond])
|
|
619
622
|
records_len_cond = np.array([records_len_dict[k] for k in unique_chrom_cond])
|
|
623
|
+
|
|
620
624
|
sumstats.loc[condition, status] = _fast_check_status(sumstats_cond, record=record, starting_positions=starting_pos_cond, records_len=records_len_cond)
|
|
621
625
|
|
|
622
626
|
log.write(f" -Checking records for ( len(NEA) > {max_len} or len(EA) > {max_len} )", verbose=verbose)
|
|
@@ -651,6 +655,8 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
|
|
|
651
655
|
log.write(" -Loading fasta records:",end="", verbose=verbose)
|
|
652
656
|
chromlist = get_chr_list(add_number=True)
|
|
653
657
|
records = SeqIO.parse(ref_seq, "fasta")
|
|
658
|
+
|
|
659
|
+
sumstats = sortcoordinate(sumstats,verbose=False)
|
|
654
660
|
|
|
655
661
|
all_records_dict = {}
|
|
656
662
|
chroms_in_sumstats = sumstats[chrom].unique() # load records from Fasta file only for the chromosomes present in the sumstats
|
|
@@ -729,17 +735,21 @@ def build_fasta_records(fasta_records_dict, pos_as_dict=True, log=Log(), verbose
|
|
|
729
735
|
r = r.seq._data.translate(TRANSLATE_TABLE)
|
|
730
736
|
r = np.array([r], dtype=f'<U{len(r)}').view('<u4').astype(np.uint8)
|
|
731
737
|
all_r.append(r)
|
|
732
|
-
|
|
738
|
+
|
|
733
739
|
# We've just created a list of numpy arrays, so we can concatenate them to obtain a single numpy array
|
|
734
740
|
# Then we keep track of the starting position of each record in the concatenated array. This will be useful later
|
|
735
741
|
# to index the record array depending on the position of the variant and the chromosome
|
|
736
742
|
records_len = np.array([len(r) for r in all_r])
|
|
743
|
+
|
|
737
744
|
starting_positions = np.cumsum(records_len) - records_len
|
|
745
|
+
|
|
746
|
+
|
|
738
747
|
if pos_as_dict:
|
|
739
748
|
starting_positions = {k: v for k, v in zip(fasta_records_dict.keys(), starting_positions)}
|
|
740
749
|
records_len_dict = {k: v for k, v in zip(fasta_records_dict.keys(), records_len)}
|
|
741
750
|
record = np.concatenate(all_r)
|
|
742
751
|
del all_r # free memory
|
|
752
|
+
|
|
743
753
|
|
|
744
754
|
return record, starting_positions,records_len_dict
|
|
745
755
|
|
|
@@ -1335,8 +1345,8 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
1335
1345
|
################################################################################################################
|
|
1336
1346
|
def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,column_name="DAF",suffix="",n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
|
|
1337
1347
|
##start function with col checking##########################################################
|
|
1338
|
-
_start_line = "check the difference between EAF and reference VCF
|
|
1339
|
-
_end_line = "checking the difference between EAF and reference VCF
|
|
1348
|
+
_start_line = "check the difference between EAF (sumstats) and ALT frequency (reference VCF)"
|
|
1349
|
+
_end_line = "checking the difference between EAF (sumstats) and ALT frequency (reference VCF)"
|
|
1340
1350
|
_start_cols = [chr,pos,ref,alt,eaf,status]
|
|
1341
1351
|
_start_function = ".check_daf()"
|
|
1342
1352
|
_must_args ={"ref_alt_freq":ref_alt_freq}
|
|
@@ -1381,7 +1391,8 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
|
|
|
1381
1391
|
pool.join()
|
|
1382
1392
|
###########################
|
|
1383
1393
|
#status_inferred = sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]].apply(lambda x:check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict),axis=1)
|
|
1384
|
-
|
|
1394
|
+
log.write(" -Difference in allele frequency (DAF) = EAF (sumstats) - ALT_AF (reference VCF)", verbose=verbose)
|
|
1395
|
+
log.write(" -Note: this DAF is not the derived allele frequency.", verbose=verbose)
|
|
1385
1396
|
#sumstats.loc[good_chrpos,"DAF"] = status_inferred.values
|
|
1386
1397
|
#sumstats["DAF"]=sumstats["DAF"].astype("float")
|
|
1387
1398
|
log.write(" - {} max:".format(column_name), np.nanmax(sumstats[column_name]),verbose=verbose)
|
|
@@ -1417,8 +1428,8 @@ def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
|
|
|
1417
1428
|
|
|
1418
1429
|
def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
|
|
1419
1430
|
##start function with col checking##########################################################
|
|
1420
|
-
_start_line = "infer EAF using reference VCF ALT frequency"
|
|
1421
|
-
_end_line = "inferring EAF using reference VCF ALT frequency"
|
|
1431
|
+
_start_line = "infer sumstats EAF using reference VCF ALT frequency"
|
|
1432
|
+
_end_line = "inferring sumstats EAF using reference VCF ALT frequency"
|
|
1422
1433
|
_start_cols = [chr,pos,ref,alt,status]
|
|
1423
1434
|
_start_function = ".infer_af()"
|
|
1424
1435
|
_must_args ={"ref_alt_freq":ref_alt_freq}
|