gwaslab 3.5.7__py3-none-any.whl → 3.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/__init__.py +2 -0
- gwaslab/bd_common_data.py +1 -0
- gwaslab/bd_get_hapmap3.py +0 -1
- gwaslab/data/formatbook.json +78 -0
- gwaslab/g_Sumstats.py +98 -24
- gwaslab/g_SumstatsMulti.py +287 -0
- gwaslab/g_SumstatsPair.py +101 -16
- gwaslab/g_Sumstats_polars.py +245 -0
- gwaslab/g_headers.py +12 -3
- gwaslab/g_meta.py +123 -47
- gwaslab/g_meta_update.py +48 -0
- gwaslab/g_vchange_status_polars.py +44 -0
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +169 -110
- gwaslab/hm_casting_polars.py +202 -0
- gwaslab/hm_harmonize_sumstats.py +19 -8
- gwaslab/io_load_ld.py +529 -0
- gwaslab/io_preformat_input.py +11 -0
- gwaslab/io_preformat_input_polars.py +632 -0
- gwaslab/io_process_args.py +25 -1
- gwaslab/io_read_ldsc.py +34 -3
- gwaslab/io_read_pipcs.py +62 -6
- gwaslab/prscs_gigrnd.py +122 -0
- gwaslab/prscs_mcmc_gtb.py +136 -0
- gwaslab/prscs_parse_genet.py +98 -0
- gwaslab/qc_build.py +53 -0
- gwaslab/qc_check_datatype.py +10 -8
- gwaslab/qc_check_datatype_polars.py +128 -0
- gwaslab/qc_fix_sumstats.py +25 -23
- gwaslab/qc_fix_sumstats_polars.py +193 -0
- gwaslab/util_ex_calculate_ldmatrix.py +49 -19
- gwaslab/util_ex_gwascatalog.py +71 -28
- gwaslab/util_ex_ldsc.py +67 -21
- gwaslab/util_ex_match_ldmatrix.py +396 -0
- gwaslab/util_ex_run_2samplemr.py +0 -2
- gwaslab/util_ex_run_ccgwas.py +155 -0
- gwaslab/util_ex_run_coloc.py +1 -1
- gwaslab/util_ex_run_hyprcoloc.py +117 -0
- gwaslab/util_ex_run_mesusie.py +155 -0
- gwaslab/util_ex_run_mtag.py +92 -0
- gwaslab/util_ex_run_prscs.py +85 -0
- gwaslab/util_ex_run_susie.py +40 -9
- gwaslab/util_in_estimate_ess.py +18 -0
- gwaslab/util_in_fill_data.py +20 -1
- gwaslab/util_in_filter_value.py +10 -5
- gwaslab/util_in_get_sig.py +71 -13
- gwaslab/util_in_meta.py +168 -4
- gwaslab/util_in_meta_polars.py +174 -0
- gwaslab/viz_plot_compare_effect.py +87 -23
- gwaslab/viz_plot_credible_sets.py +55 -11
- gwaslab/viz_plot_effect.py +22 -12
- gwaslab/viz_plot_miamiplot2.py +3 -2
- gwaslab/viz_plot_mqqplot.py +84 -81
- gwaslab/viz_plot_qqplot.py +6 -6
- gwaslab/viz_plot_regional2.py +2 -1
- gwaslab/viz_plot_stackedregional.py +4 -1
- {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info}/METADATA +8 -6
- gwaslab-3.5.8.dist-info/RECORD +117 -0
- {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info}/WHEEL +1 -1
- gwaslab-3.5.7.dist-info/RECORD +0 -96
- {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE +0 -0
- {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info}/top_level.txt +0 -0
gwaslab/io_load_ld.py
ADDED
|
@@ -0,0 +1,529 @@
|
|
|
1
|
+
|
|
2
|
+
import scipy.sparse as sparse
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
|
|
6
|
+
import subprocess
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
import gc
|
|
10
|
+
import pandas as pd
|
|
11
|
+
import numpy as np
|
|
12
|
+
from gwaslab.g_Log import Log
|
|
13
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
14
|
+
from gwaslab.qc_fix_sumstats import finished
|
|
15
|
+
from gwaslab.util_in_get_sig import getsig
|
|
16
|
+
from gwaslab.util_ex_process_ref import _process_plink_input_files
|
|
17
|
+
from gwaslab.g_version import _checking_plink_version
|
|
18
|
+
from gwaslab.util_in_filter_value import _exclude_hla
|
|
19
|
+
from gwaslab.util_ex_calculate_ldmatrix import _extract_variants_in_locus
|
|
20
|
+
from gwaslab.util_ex_calculate_ldmatrix import _export_snplist_and_locus_sumstats
|
|
21
|
+
from gwaslab.viz_plot_regional2 import _get_lead_id
|
|
22
|
+
from gwaslab.util_ex_calculate_ldmatrix import _extract_variants_in_locus
|
|
23
|
+
|
|
24
|
+
def tofinemapping_using_ld(sumstats,
                           study=None,
                           ld_map_path=None,
                           ld_path=None,
                           ld_fmt = "npz",
                           ld_if_square = False,
                           ld_if_add_T = False,
                           ld_map_rename_dic = None,
                           ld_map_kwargs = None,
                           loci=None,
                           out="./",
                           windowsizekb=1000,
                           n_cores=1,
                           mode="r",
                           exclude_hla=False,
                           getlead_args=None,
                           memory=None,
                           overwrite=False,
                           log=Log(),
                           suffixes=None,
                           verbose=True,
                           **kwargs):
    """Prepare per-locus fine-mapping inputs from a pre-computed LD matrix.

    For each lead variant (supplied via ``loci`` as SNPIDs, or auto-detected
    with ``getsig`` when ``loci`` is None), extract the window of sumstats
    around it, align the variants with the external LD map, and export:
    the matched SNP list, the locus sumstats, and the matched LD sub-matrix.

    Returns
    -------
    tuple
        ``(output_file_list_path, output_file_list, plink_log)``;
        ``output_file_list_path`` is None when no locus could be processed.

    Raises
    ------
    ValueError
        If the sumstats lack the required columns (SNPID/CHR/POS/EA/NEA).
    """
    ##start function with col checking##########################################################
    _start_line = "calculate LD matrix"
    _end_line = "calculating LD matrix"
    _start_cols =["SNPID","CHR","POS","EA","NEA"]
    _start_function = ".calculate_ld_matrix()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: raise ValueError("Not enough columns for calculating LD matrix")
    ############################################################################################
    if suffixes is None:
        suffixes=[""]
    if getlead_args is None:
        getlead_args={"windowsizekb":1000}
    if ld_map_kwargs is None:
        ld_map_kwargs={}

    if loci is None:
        log.write(" -Loci were not provided. All significant loci will be automatically extracted...",verbose=verbose)
        sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
    else:
        sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]

    # Drop duplicate!!!!
    log.write(" -Dropping duplicated SNPIDs...",verbose=verbose)
    sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()

    # init Filelist DataFrame
    output_file_list = pd.DataFrame(columns=["SNPID","SNPID_LIST","LD_R_MATRIX","LOCUS_SUMSTATS"])

    plink_log=""

    if exclude_hla==True:
        sig_df = _exclude_hla(sig_df, log=log, verbose=verbose)

    sig_df = sig_df.reset_index()

    # FIX: load the LD map once — it does not change between loci
    # (previously the whole map file was re-read inside the per-locus loop).
    ld_map = _load_ld_map(ld_map_path, ld_map_rename_dic = ld_map_rename_dic, **ld_map_kwargs )

    ## for each lead variant
    for index, row in sig_df.iterrows():
        # extract snplist in each locus
        gc.collect()
        log.write(" -Locus #{}---------------------------------------------------------------".format(index+1))
        log.write(" -Processing locus with lead variant {} at CHR {} POS {} ...".format(row["SNPID"],row["CHR"],row["POS"]))
        locus_sumstats = _extract_variants_in_locus(sumstats, windowsizekb, locus = (row["CHR"],row["POS"]))

        ## check available snps with reference file
        matched_sumstats = _merge_ld_map_with_sumstats(row=row,
                                                       locus_sumstats=locus_sumstats,
                                                       ld_map=ld_map,
                                                       log=log,suffixes=suffixes)
        if len(matched_sumstats)==0:
            log.write(" -No matching LD information... Skipping...")
            continue

        #########################################################################################################
        # create matched snp list
        matched_snp_list_path, matched_sumstats_path=_export_snplist_and_locus_sumstats(matched_sumstats=matched_sumstats,
                                                                                        out=out,
                                                                                        study=study,
                                                                                        row=row,
                                                                                        windowsizekb=windowsizekb,
                                                                                        log=log,
                                                                                        suffixes=suffixes)
        #########################################################################################################

        ## Load the LD matrix and extract the sub-matrix for the matched variants
        r_matrix = _load_ld_matrix(ld_path, fmt=ld_fmt, if_square=ld_if_square, if_add_T=ld_if_add_T, log=log, verbose=verbose)

        matched_ld_matrix_path = _extract_variants(matched_sumstats, r_matrix, out, study, row, windowsizekb, log=log, verbose=verbose)

        # record the exported file paths for this locus
        row_dict={}
        row_dict["SNPID"]=row["SNPID"]
        row_dict["SNPID_LIST"] = matched_snp_list_path
        row_dict["LD_R_MATRIX"] = matched_ld_matrix_path
        row_dict["LOCUS_SUMSTATS"] = matched_sumstats_path
        file_row = pd.Series(row_dict).to_frame().T
        output_file_list = pd.concat([output_file_list, file_row],ignore_index=True)

    if len(output_file_list)>0:
        output_file_list["STUDY"] = study
        nloci = len(output_file_list)
        output_file_list_path = "{}/{}_{}loci_{}kb.filelist".format(out.rstrip("/"), study,nloci, windowsizekb)
        output_file_list.to_csv(output_file_list_path,index=None,sep="\t")
        log.write(" -File list is saved to: {}".format(output_file_list_path),verbose=verbose)
        log.write(" -Finished LD matrix calculation.",verbose=verbose)
    else:
        output_file_list_path=None
        # FIX: corrected typo "avaialable" -> "available" in log message
        log.write(" -No available lead variants.",verbose=verbose)
        log.write(" -Stopped LD matrix calculation.",verbose=verbose)
    finished(log=log, verbose=verbose, end_line=_end_line)
    return output_file_list_path, output_file_list, plink_log
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def process_ld(sumstats,
               ld_path,
               ld_map_path,
               region,
               region_ref,
               log,
               verbose,
               pos,
               nea,
               ea,
               region_ld_threshold,
               ld_fmt = "npz",
               ld_if_square = False,
               ld_if_add_T = False,
               ld_map_rename_dic = None,
               ld_map_kwargs = None):
    """Annotate regional sumstats with LD (RSQ) relative to one or more lead variants.

    For each reference in ``region_ref``, the lead variant is either looked up
    (via ``_get_lead_id``) or taken as the row with max ``scaled_P``; the r
    values against all matched variants are read from the LD matrix at
    ``ld_path`` and binned into discrete LD levels using
    ``region_ld_threshold``. Final ``LD``/``RSQ``/``SHAPE`` columns are built
    by taking, per variant, the reference with the highest RSQ.

    Returns the annotated sumstats DataFrame (rows with missing pos/nea/ea
    are dropped).

    NOTE(review): if ``ld_if_square`` is True the matrix holds r^2 values, so
    the per-reference "RSQ" columns are then r^2 directly; otherwise they hold
    signed r — confirm against callers before relying on the sign.
    """
    log.write("Start to load reference genotype...", verbose=verbose)
    log.write(" -reference ld matrix path : "+ ld_path, verbose=verbose)

    # load genotype data of the targeted region

    log.write(" -Retrieving index...", verbose=verbose)

    # match sumstats pos and ref pos:
    # get ref index for its first appearance of sumstats pos
    #######################################################################################
    if ld_map_kwargs is None:
        ld_map_kwargs={}

    # load the variant map that describes the row/column order of the LD matrix
    ld_map = _load_ld_map(ld_map_path,
                          ld_map_rename_dic = ld_map_rename_dic,
                          **ld_map_kwargs )

    log.write(" -Ref variants: {}".format( len(ld_map) ), verbose=verbose)

    ## check available snps with reference file
    # left-merge keeps unmatched sumstats rows; their _INDEX_BIM is NaN
    sumstats = _merge_ld_map_with_sumstats_for_regional(
        locus_sumstats=sumstats,
        ld_map=ld_map,
        log=log,
        suffixes=None,verbose=verbose)
    # REFINDEX: row/column index of each variant in the LD matrix (NaN if unmatched)
    sumstats["REFINDEX"] = sumstats["_INDEX_BIM"]

    #############################################################################################

    r_matrix = _load_ld_matrix(ld_path,
                               fmt=ld_fmt,
                               if_square=ld_if_square,
                               if_add_T=ld_if_add_T,
                               log=log,
                               verbose=verbose)

    #for loop to add LD information
    #############################################################################################
    for ref_n, region_ref_single in enumerate(region_ref):

        # per-reference column names
        rsq = "RSQ_{}".format(ref_n)
        ld_single = "LD_{}".format(ref_n)
        lead = "LEAD_{}".format(ref_n)
        sumstats[lead]= 0

        # get lead variant id and pos
        if region_ref_single is None:
            # if not specified, use lead variant
            lead_id = sumstats["scaled_P"].idxmax()
        else:
            # figure out lead variant
            lead_id = _get_lead_id(sumstats, region_ref_single, log, verbose)

        lead_series = None
        if lead_id is None:
            # lead not present in sumstats: try to parse a chr:pos:ref:alt style id
            # NOTE(review): this assumes region_ref_single is a string here
            # (re.match would raise TypeError on None) — presumably lead_id is
            # always found when region_ref_single is None; confirm.
            matched_snpid = re.match("(chr)?[0-9]+:[0-9]+:[ATCG]+:[ATCG]+",region_ref_single, re.IGNORECASE)

            if matched_snpid is None:
                # unparsable: no LD info for this reference
                sumstats[rsq] = None
                sumstats[rsq] = sumstats[rsq].astype("float")
                sumstats[ld_single] = 0
                continue
            else:
                # extract CHR (as int), POS, alleles from the matched id
                lead_snpid = matched_snpid.group(0).split(":")[1:]
                lead_snpid[0]= int(lead_snpid[0])
                lead_series = pd.Series(lead_snpid)

        # if lead pos is available:
        if sumstats.loc[lead_id, "REFINDEX"] is not None:
            lead_snp_ref_index = sumstats.loc[lead_id, "REFINDEX"]

            is_matched = ~sumstats["REFINDEX"].isna()

            ref_index = sumstats.loc[is_matched,"REFINDEX"].astype("Int64")

            # read the lead variant's row of the LD matrix for all matched variants
            sumstats.loc[is_matched, rsq] = r_matrix[int(lead_snp_ref_index), list(ref_index.values)]

        else:
            log.write(" -Lead SNP not found in reference...", verbose=verbose)
            sumstats[rsq]=None
        # the lead variant is in perfect LD with itself
        try:
            sumstats.loc[lead_id,rsq]=1
        except KeyError:
            pass

        sumstats[rsq] = sumstats[rsq].astype("float")
        sumstats[ld_single] = 0

        for index,ld_threshold in enumerate(region_ld_threshold):
            # No data,LD = 0
            # 0, 0.2 LD = 1
            # 1, 0.4 LD = 2
            # 2, 0.6 LD = 3
            # 3, 0.8 LD = 4
            # 4, 1.0 LD = 5
            # lead LD = 6

            if index==0:
                # any variant with RSQ data gets at least level 1
                to_change_color = sumstats[rsq]>-1
                sumstats.loc[to_change_color,ld_single] = 1
            to_change_color = sumstats[rsq]>ld_threshold
            sumstats.loc[to_change_color,ld_single] = index+2

        if lead_series is None:
            # lead variant gets the top LD level and the lead flag
            sumstats.loc[lead_id,ld_single] = len(region_ld_threshold)+2
            sumstats.loc[lead_id,lead] = 1

    ####################################################################################################
    # combine the per-reference columns into single LD/RSQ/SHAPE columns
    final_shape_col = "SHAPE"
    final_ld_col = "LD"
    final_rsq_col = "RSQ"

    sumstats[final_ld_col] = 0
    sumstats[final_shape_col] = 1
    sumstats[final_rsq_col] = 0.0

    if len(region_ref)==1:
        # single reference: mark the lead variant with a distinct shape
        # (lead_id is the value left over from the loop above)
        if lead_id is not None:
            sumstats.loc[lead_id, final_shape_col] +=1

    for i in range(len(region_ref)):
        ld_single = "LD_{}".format(i)
        current_rsq = "RSQ_{}".format(i)
        # keep, per variant, the reference with the highest RSQ
        a_ngt_b = sumstats[final_rsq_col] < sumstats[current_rsq]
        #set levels with interval=100
        sumstats.loc[a_ngt_b, final_ld_col] = 100 * (i+1) + sumstats.loc[a_ngt_b, ld_single]
        sumstats.loc[a_ngt_b, final_rsq_col] = sumstats.loc[a_ngt_b, current_rsq]
        sumstats.loc[a_ngt_b, final_shape_col] = i + 1

    sumstats = sumstats.dropna(subset=[pos,nea,ea])

    ####################################################################################################
    log.write("Finished loading reference genotype successfully!", verbose=verbose)
    return sumstats
    ####################################################################################################
|
|
305
|
+
####################################################################################################
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
####################################################################################################
|
|
339
|
+
def _load_ld_matrix(path,
                    fmt="npz",
                    if_square=False,
                    if_add_T=False,
                    log=Log(),
                    verbose=True):
    """Load an LD (r) matrix from disk as a dense numpy array.

    Parameters
    ----------
    path : str
        Path to the matrix file.
    fmt : str
        "npz" (scipy sparse archive) or "txt" (whitespace-separated text).
    if_square : bool
        If True, square every element (r -> r^2).
    if_add_T : bool
        If True, add the transpose (to complete a triangular matrix).

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    ValueError
        For an unsupported ``fmt`` (previously an unknown fmt fell through
        to an unbound-variable NameError at the return statement).
    """
    if fmt == "npz":
        log.write(" -Loading LD matrix from npz file...",verbose=verbose)
        r_matrix = sparse.load_npz(path).toarray()
    elif fmt == "txt":
        log.write(" -Loading LD matrix from text file...",verbose=verbose)
        r_matrix = np.loadtxt(path)
    else:
        raise ValueError("Unsupported LD matrix format: {}".format(fmt))

    if if_add_T==True:
        log.write(" -Transforming LD matrix by adding its transpose...",verbose=verbose)
        r_matrix += r_matrix.T
    if if_square==True:
        log.write(" -Transforming LD matrix by squaring all elements...",verbose=verbose)
        r_matrix = np.power(r_matrix,2)
    return r_matrix
|
|
360
|
+
|
|
361
|
+
def _load_ld_map(path,
|
|
362
|
+
snpid="rsid",
|
|
363
|
+
chrom="chromosome",
|
|
364
|
+
pos="position",
|
|
365
|
+
ref="allele1",
|
|
366
|
+
alt="allele2",
|
|
367
|
+
ld_map_rename_dic = None,
|
|
368
|
+
**ld_map_kwargs):
|
|
369
|
+
|
|
370
|
+
if ld_map_rename_dic is not None:
|
|
371
|
+
if type(ld_map_rename_dic) is dict:
|
|
372
|
+
ld_map_rename_dic_to_use={ld_map_rename_dic["EA"]:'EA_bim',
|
|
373
|
+
ld_map_rename_dic["NEA"]:'NEA_bim',
|
|
374
|
+
ld_map_rename_dic["POS"]:'POS',
|
|
375
|
+
ld_map_rename_dic["CHR"]:'CHR',
|
|
376
|
+
ld_map_rename_dic["SNPID"]:'SNPID_bim'
|
|
377
|
+
}
|
|
378
|
+
ld_map_kwargs["usecols"]=list(ld_map_rename_dic.values())
|
|
379
|
+
else:
|
|
380
|
+
ld_map_rename_dic_to_use={ld_map_rename_dic[4]:'EA_bim',
|
|
381
|
+
ld_map_rename_dic[3]:'NEA_bim',
|
|
382
|
+
ld_map_rename_dic[2]:'POS',
|
|
383
|
+
ld_map_rename_dic[1]:'CHR',
|
|
384
|
+
ld_map_rename_dic[0]:'SNPID_bim'
|
|
385
|
+
}
|
|
386
|
+
ld_map_kwargs["usecols"]=ld_map_rename_dic
|
|
387
|
+
else:
|
|
388
|
+
ld_map_rename_dic_to_use={alt:'EA_bim',
|
|
389
|
+
ref:'NEA_bim',
|
|
390
|
+
pos:'POS',
|
|
391
|
+
chrom:'CHR',
|
|
392
|
+
snpid:"SNPID_bim"
|
|
393
|
+
}
|
|
394
|
+
ld_map_kwargs["usecols"]=[chrom, pos, ref, alt, snpid]
|
|
395
|
+
#rsid chromosome position allele1 allele2
|
|
396
|
+
if "sep" not in ld_map_kwargs:
|
|
397
|
+
ld_map_kwargs["sep"] = "\s+"
|
|
398
|
+
|
|
399
|
+
ld_map = pd.read_csv(path,**ld_map_kwargs)
|
|
400
|
+
ld_map = ld_map.rename(columns=ld_map_rename_dic_to_use, errors='ignore')
|
|
401
|
+
# "SNPID",0:"CHR_bim",3:"POS_bim",4:"EA_bim",5:"NEA_bim"
|
|
402
|
+
return ld_map
|
|
403
|
+
|
|
404
|
+
def _extract_variants(merged_sumstats, r_matrix, out, study, row, windowsizekb, log, verbose):
|
|
405
|
+
|
|
406
|
+
avaiable_index = merged_sumstats["_INDEX_BIM"].values
|
|
407
|
+
|
|
408
|
+
flipped = merged_sumstats["_FLIPPED"].values
|
|
409
|
+
|
|
410
|
+
reduced_r_matrix = r_matrix[np.ix_(avaiable_index, avaiable_index)]
|
|
411
|
+
|
|
412
|
+
log.write(" -Flipping LD matrix for {} variants...".format(sum(flipped)),verbose=verbose)
|
|
413
|
+
reduced_r_matrix[flipped,:] = -1 * reduced_r_matrix[flipped,:]
|
|
414
|
+
reduced_r_matrix[:,flipped] = -1 * reduced_r_matrix[:,flipped]
|
|
415
|
+
|
|
416
|
+
snplist_path = "{}/{}_{}_{}.snplist.raw".format(out.rstrip("/"),study,row["SNPID"],windowsizekb)
|
|
417
|
+
output_prefix = "{}/{}_{}_{}".format(out.rstrip("/"),study,row["SNPID"],windowsizekb)
|
|
418
|
+
output_path = "{}.ld.gz".format(output_prefix)
|
|
419
|
+
|
|
420
|
+
pd.DataFrame(reduced_r_matrix).to_csv(output_path,sep="\t",index=None,header=None)
|
|
421
|
+
#reduced_r_matrix.to_csv("{}.ld.gz".format(output_prefix),se="\t")
|
|
422
|
+
return output_path
|
|
423
|
+
|
|
424
|
+
def _merge_ld_map_with_sumstats(row,
                                locus_sumstats,
                                ld_map,
                                log=Log(),
                                suffixes=None):
    '''
    Align locus sumstats with the LD map (bim-like reference).

    Inner-joins on CHR/POS, keeps only variants whose alleles match the
    reference perfectly or flipped, and records the flip in "_FLIPPED"
    (statistics themselves are NOT flipped here; downstream code uses the
    flag). Returns the matched rows with SNPID/CHR/POS/EA/NEA/_INDEX_BIM/
    _FLIPPED plus any available BETA/SE/Z/EAF/N columns per suffix.

    Note: mutates locus_sumstats and ld_map in place (adds index columns).
    '''

    index1= "_INDEX_SUMSTATS"
    index2= "_INDEX_BIM"
    locus_sumstats[index1] = locus_sumstats.index
    # _INDEX_BIM is the variant's row/column index in the LD matrix
    ld_map[index2] = ld_map.index
    locus_sumstats["_FLIPPED"] = False

    if suffixes is None:
        suffixes=[""]

    log.write(" -Variants in locus ({}): {}".format(row["SNPID"],len(locus_sumstats)))
    # convert category to string
    locus_sumstats["EA"] = locus_sumstats["EA"].astype("string")
    locus_sumstats["NEA"] = locus_sumstats["NEA"].astype("string")

    # matching by SNPID
    # preserve bim keys (use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys.)
    combined_df = pd.merge(ld_map, locus_sumstats, on=["CHR","POS"],how="inner")

    # match allele
    perfect_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) )
    log.write(" -Variants with perfect matched alleles:{}".format(sum(perfect_match)))

    # flipped allele
    flipped_match = ((combined_df["EA"] == combined_df["NEA_bim"])& (combined_df["NEA"] == combined_df["EA_bim"]))
    log.write(" -Variants with flipped alleles:{}".format(sum(flipped_match)))

    allele_match = perfect_match | flipped_match
    log.write(" -Total Variants matched:{}".format(sum(allele_match)))

    if row["SNPID"] not in combined_df.loc[allele_match,"SNPID"].values:
        log.warning("Lead variant was not available in reference!")

    # collect available statistics columns (per suffix)
    output_columns=["SNPID","CHR","POS","EA","NEA","_INDEX_BIM","_FLIPPED"]
    for suffix in suffixes:
        if ("BETA"+suffix in locus_sumstats.columns) and ("SE"+suffix in locus_sumstats.columns):
            output_columns.append("BETA"+suffix)
            output_columns.append("SE"+suffix)
        # FIX: previously tested the bare "Z"/"EAF"/"N" column names while
        # appending the suffixed names, which raised KeyError (or silently
        # dropped the columns) whenever a non-empty suffix was used.
        if "Z"+suffix in locus_sumstats.columns:
            output_columns.append("Z"+suffix)
        if "EAF"+suffix in locus_sumstats.columns:
            output_columns.append("EAF"+suffix)
        if "N"+suffix in locus_sumstats.columns:
            output_columns.append("N"+suffix)
    combined_df.loc[flipped_match,"_FLIPPED"] = True
    return combined_df.loc[allele_match,output_columns]
|
|
486
|
+
|
|
487
|
+
def _merge_ld_map_with_sumstats_for_regional(
        locus_sumstats,
        ld_map,
        log=Log(),
        suffixes=None,
        verbose=True):
    '''
    Align regional sumstats with the LD map (bim-like reference).

    Left-joins on CHR/POS so that sumstats variants absent from the
    reference are kept (their "_INDEX_BIM" stays NaN). Returns the rows
    whose alleles match the reference (perfectly or flipped) together with
    the rows that have no reference entry at all.

    Note: mutates locus_sumstats and ld_map in place (adds index columns).
    '''
    sumstats_index_col = "_INDEX_SUMSTATS"
    bim_index_col = "_INDEX_BIM"

    # remember each frame's original row position before merging
    locus_sumstats[sumstats_index_col] = locus_sumstats.index
    ld_map[bim_index_col] = ld_map.index

    suffixes = [""] if suffixes is None else suffixes

    # allele columns may be categorical; compare as strings
    for allele_col in ("EA", "NEA"):
        locus_sumstats[allele_col] = locus_sumstats[allele_col].astype("string")

    # left join keeps every sumstats variant, matched or not
    merged = pd.merge(locus_sumstats, ld_map, on=["CHR","POS"],how="left")
    # placeholder allele "N" for unmatched rows so the comparisons below are valid
    merged[["EA_bim","NEA_bim"]] = merged[["EA_bim","NEA_bim"]].fillna("N")

    # alleles agree with the reference as-is
    is_perfect = (merged["EA"] == merged["EA_bim"]) & (merged["NEA"] == merged["NEA_bim"])
    # alleles agree with the reference after swapping EA/NEA
    is_flipped = (merged["EA"] == merged["NEA_bim"]) & (merged["NEA"] == merged["EA_bim"])
    # variant has no reference entry at all
    missing_in_ref = merged[bim_index_col].isna()

    is_allele_match = is_perfect | is_flipped

    log.write(" -Total Variants matched:{}".format( sum(is_allele_match) ),verbose=verbose)
    log.write(" -Total Variants not in reference:{}".format(sum(missing_in_ref)),verbose=verbose)

    return merged.loc[is_allele_match | missing_in_ref,:]
|
|
528
|
+
|
|
529
|
+
############################################################################################################################################################################################################################################################
|
gwaslab/io_preformat_input.py
CHANGED
|
@@ -34,6 +34,7 @@ def preformat(sumstats,
|
|
|
34
34
|
f=None,
|
|
35
35
|
t=None,
|
|
36
36
|
p=None,
|
|
37
|
+
q=None,
|
|
37
38
|
mlog10p=None,
|
|
38
39
|
test=None,
|
|
39
40
|
info=None,
|
|
@@ -51,6 +52,7 @@ def preformat(sumstats,
|
|
|
51
52
|
dof=None,
|
|
52
53
|
ncase=None,
|
|
53
54
|
ncontrol=None,
|
|
55
|
+
neff=None,
|
|
54
56
|
direction=None,
|
|
55
57
|
status=None,
|
|
56
58
|
study=None,
|
|
@@ -107,6 +109,9 @@ def preformat(sumstats,
|
|
|
107
109
|
if "format_comment" in meta_data.keys():
|
|
108
110
|
readargs["comment"] = meta_data["format_comment"]
|
|
109
111
|
|
|
112
|
+
if "format_other_cols" in meta_data.keys():
|
|
113
|
+
other += meta_data["format_other_cols"]
|
|
114
|
+
|
|
110
115
|
if "sep" not in readargs.keys():
|
|
111
116
|
readargs["sep"] = "\t"
|
|
112
117
|
|
|
@@ -215,6 +220,9 @@ def preformat(sumstats,
|
|
|
215
220
|
if ncontrol and (type(ncontrol) is str):
|
|
216
221
|
usecols.append(ncontrol)
|
|
217
222
|
rename_dictionary[ncontrol]= "N_CONTROL"
|
|
223
|
+
if neff and (type(neff) is str):
|
|
224
|
+
usecols.append(neff)
|
|
225
|
+
rename_dictionary[neff]= "N_EFF"
|
|
218
226
|
if beta:
|
|
219
227
|
usecols.append(beta)
|
|
220
228
|
rename_dictionary[beta]= "BETA"
|
|
@@ -233,6 +241,9 @@ def preformat(sumstats,
|
|
|
233
241
|
if z:
|
|
234
242
|
usecols.append(z)
|
|
235
243
|
rename_dictionary[z]= "Z"
|
|
244
|
+
if q:
|
|
245
|
+
usecols.append(q)
|
|
246
|
+
rename_dictionary[q]= "Q"
|
|
236
247
|
if p:
|
|
237
248
|
usecols.append(p)
|
|
238
249
|
rename_dictionary[p]= "P"
|