gwaslab 3.5.4__py3-none-any.whl → 3.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of gwaslab might be problematic.
- gwaslab/__init__.py +3 -1
- gwaslab/g_Sumstats.py +56 -9
- gwaslab/g_SumstatsPair.py +16 -12
- gwaslab/g_SumstatsSet.py +663 -0
- gwaslab/g_headers.py +131 -0
- gwaslab/g_meta.py +2 -1
- gwaslab/g_version.py +3 -3
- gwaslab/hm_harmonize_sumstats.py +91 -1
- gwaslab/io_preformat_input.py +29 -7
- gwaslab/io_read_pipcs.py +23 -0
- gwaslab/io_to_formats.py +45 -44
- gwaslab/qc_check_datatype.py +65 -42
- gwaslab/qc_fix_sumstats.py +1 -1
- gwaslab/util_ex_ldproxyfinder.py +162 -3
- gwaslab/util_ex_ldsc.py +9 -0
- gwaslab/util_ex_run_2samplemr.py +34 -0
- gwaslab/util_ex_run_clumping.py +4 -2
- gwaslab/util_in_fill_data.py +28 -3
- gwaslab/util_in_filter_value.py +66 -1
- gwaslab/util_in_merge.py +51 -0
- gwaslab/viz_aux_save_figure.py +2 -1
- gwaslab/viz_plot_credible_sets.py +99 -0
- gwaslab/viz_plot_effect.py +283 -0
- gwaslab/viz_plot_miamiplot2.py +1 -1
- gwaslab/viz_plot_mqqplot.py +31 -11
- gwaslab/viz_plot_regional2.py +133 -32
- gwaslab/viz_plot_stackedregional.py +64 -34
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/METADATA +4 -4
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/RECORD +33 -28
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/WHEEL +1 -1
- gwaslab/vis_plot_credible sets.py +0 -0
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/LICENSE +0 -0
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/top_level.txt +0 -0
gwaslab/util_ex_ldproxyfinder.py
CHANGED
@@ -37,7 +37,7 @@ from gwaslab.hm_harmonize_sumstats import auto_check_vcf_chr_dict
 #check if in outcome and exposure snp list
 #replace
 
-def _extract_with_ld_proxy(
+def _extract_with_ld_proxy( snplist=None,
                             common_sumstats=None,
                             sumstats1=None,
                             vcf_path=None,
@@ -58,6 +58,7 @@ def _extract_with_ld_proxy( snplist=None,
     is_needed=[]
     no_need =[]
 
+    print(common_sumstats.head())
     for i in snplist:
         if i in common_sumstats["SNPID"].values:
             no_need.append(i)
@@ -72,7 +73,7 @@ def _extract_with_ld_proxy( snplist=None,
     if len(in_sumstats)==0:
         log.write(" -No available variants for LD proxy checking...Skipping... ", verbose=verbose)
     else:
-        log.write(" -{}
+        log.write(" -{} available variants for LD proxy checking... ".format(len(in_sumstats)), verbose=verbose)
 
     for index,row in in_sumstats.iterrows():
         # determine SNP and select region
@@ -93,6 +94,16 @@ def _extract_with_ld_proxy( snplist=None,
         if len(flanking_sumstats)==0:
             log.write(" -No availble variants in the region...Skipping!", verbose=verbose)
             continue
+
+        _get_rsq_single(in_sumstats.loc[index,["POS","NEA_1","EA_1"]],
+                        row_pos=row["POS"],
+                        vcf_path=vcf_path,
+                        region=region,
+                        log=log,
+                        verbose=verbose,
+                        vcf_chr_dict=vcf_chr_dict,
+                        tabix=tabix)
+
 
         flanking_sumstats = _get_rsq(row =in_sumstats.loc[index,["POS","NEA_1","EA_1"]],
                                      sumstats = flanking_sumstats,
@@ -126,6 +137,81 @@ def _extract_with_ld_proxy( snplist=None,
     return extracted_sumstats
 
 
+def _extract_ld_proxy( snplist=None,
+                       common_sumstats=None,
+                       vcf_path=None,
+                       vcf_chr_dict=None,
+                       tabix=None,
+                       log=Log(),
+                       verbose=True,
+                       windowsizekb=100,
+                       ld_threshold=0.8
+                       ):
+    ### Load vcf#######################################################################################
+    log.write("Start to load reference genotype...", verbose=verbose)
+    log.write(" -reference vcf path : "+ vcf_path, verbose=verbose)
+    if tabix is None:
+        tabix = which("tabix")
+    vcf_chr_dict = auto_check_vcf_chr_dict(vcf_path=vcf_path, vcf_chr_dict=vcf_chr_dict, verbose=verbose, log=log)
+
+    ld_proxies = pd.DataFrame()
+    in_sumstats = common_sumstats.loc[common_sumstats["SNPID"].isin(snplist),:]
+
+    if len(in_sumstats)==0:
+        log.write(" -No available variants for LD proxy checking...Skipping... ", verbose=verbose)
+    else:
+        log.write(" -{} available variants for LD proxy checking... ".format(len(in_sumstats)), verbose=verbose)
+
+    for index,row in in_sumstats.iterrows():
+        # determine SNP and select region
+        snpid = row["SNPID"]
+        chrom= int(row["CHR"])
+        start= int(row["POS"]-windowsizekb*1000)
+        end= int(row["POS"]+windowsizekb*1000)
+
+        region = (chrom, start, end)
+
+        ### #######################################################################################
+        #is_flanking = common_sumstats["CHR"] == chrom & common_sumstats["CHR"]>start & common_sumstats["CHR"]<end
+        #flanking_sumstats = common_sumstats.loc[is_flanking,:]
+        flanking_sumstats = common_sumstats.query('CHR == @chrom and @start < POS < @end',engine='python').copy()
+
+        log.write(" -Extract {} variants in flanking region of {} for checking: {}:{}-{}".format(len(flanking_sumstats), snpid, chrom, start, end), verbose=verbose)
+
+        if len(flanking_sumstats)==0:
+            log.write(" -No availble variants in the region...Skipping!", verbose=verbose)
+            continue
+
+        flanking_sumstats = _get_rsq(row =in_sumstats.loc[index,["POS","NEA","EA"]],
+                                     sumstats = flanking_sumstats,
+                                     row_pos=row["POS"],
+                                     vcf_path=vcf_path,
+                                     region=region,
+                                     log=log,
+                                     verbose=verbose,
+                                     vcf_chr_dict=vcf_chr_dict,
+                                     tabix=tabix)
+        if flanking_sumstats is None:
+            log.write(" -{} is not found in the vcf...Skipping!".format(snpid))
+            continue
+        flanking_sumstats = flanking_sumstats.loc[flanking_sumstats["RSQ"]>ld_threshold,:]
+
+        log.write(" -Variants in LD with {} (RSQ > {}): {}".format(snpid, ld_threshold,len(flanking_sumstats)), verbose=verbose)
+
+        if len(flanking_sumstats)>0:
+            flanking_sumstats["LD_REF_VARIANT"]= snpid
+            for i,row_with_rsq in flanking_sumstats.iterrows():
+                if row_with_rsq["SNPID"] in common_sumstats["SNPID"].values:
+                    log.write(" -Top Proxy for {} is found: {} (LD RSQ= {})".format(snpid, row_with_rsq["SNPID"], row_with_rsq["RSQ"]))
+                    break
+            #row_with_rsq = pd.DataFrame(row_with_rsq)
+            ld_proxies = pd.concat([ld_proxies, flanking_sumstats], ignore_index=True)
+
+
+    log.write("Finished loading reference genotype successfully!", verbose=verbose)
+    return ld_proxies.sort_values(by="RSQ",ascending=False)
+
+
 def _get_rsq( row,
               sumstats,
               row_pos,
@@ -205,4 +291,77 @@ def _get_rsq( row,
     return sumstats
 
 def _check_if_in_sumstats2(row, sumstast):
-    pass
+    pass
+
+def _get_rsq_single( row,
+                     row_pos,
+                     vcf_path,
+                     region,
+                     log,
+                     verbose,
+                     vcf_chr_dict,
+                     tabix):
+    #load genotype data of the targeted region
+    ref_genotype = read_vcf(vcf_path,region=vcf_chr_dict[region[0]]+":"+str(region[1])+"-"+str(region[2]),tabix=tabix)
+
+    if ref_genotype is None:
+        log.warning("No data was retrieved. Skipping ...", verbose=verbose)
+        ref_genotype=dict()
+        ref_genotype["variants/POS"]=np.array([],dtype="int64")
+        return None
+
+    log.write(" -Retrieving index...", verbose=verbose)
+    log.write(" -Ref variants in the region: {}".format(len(ref_genotype["variants/POS"])), verbose=verbose)
+    # match sumstats pos and ref pos:
+    # get ref index for its first appearance of sumstats pos
+    #######################################################################################
+    def match_varaint(x):
+        # x: "POS,NEA,EA"
+        if np.any(ref_genotype["variants/POS"] == x.iloc[0]):
+            if len(np.where(ref_genotype["variants/POS"] == x.iloc[0] )[0])>1:
+                # multiple position matches
+                for j in np.where(ref_genotype["variants/POS"] == x.iloc[0])[0]:
+                    # for each possible match, compare ref and alt
+                    if x.iloc[1] == ref_genotype["variants/REF"][j]:
+                        if x.iloc[2] in ref_genotype["variants/ALT"][j]:
+                            return j
+                    elif x.iloc[1] in ref_genotype["variants/ALT"][j]:
+                        if x.iloc[2] == ref_genotype["variants/REF"][j]:
+                            return j
+                    else:
+                        return None
+            else:
+                # single match
+                return np.where(ref_genotype["variants/POS"] == x.iloc[0] )[0][0]
+        else:
+            # no position match
+            return None
+
+    #############################################################################################
+    lead_pos = row_pos
+
+    # if lead pos is available:
+    if lead_pos in ref_genotype["variants/POS"]:
+
+        # get ref index for lead snp
+        lead_snp_ref_index = match_varaint(row)
+        #lead_snp_ref_index = np.where(ref_genotype["variants/POS"] == lead_pos)[0][0]
+
+        # non-na other snp index
+        other_snps_ref_index = list(range(len(ref_genotype["calldata/GT"])))
+        other_snps_ref_index.remove(lead_snp_ref_index)
+
+        # get genotype
+        lead_snp_genotype = GenotypeArray([ref_genotype["calldata/GT"][lead_snp_ref_index]]).to_n_alt()
+        other_snp_genotype = GenotypeArray(ref_genotype["calldata/GT"][other_snps_ref_index]).to_n_alt()
+
+        log.write(" -Calculating Rsq...", verbose=verbose)
+
+        if len(other_snp_genotype)>1:
+            valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype)[0],2)
+        else:
+            valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype),2)
+
+        ld_proxy = pd.DataFrame( {"SNPID":ref_genotype["variants/ID"][other_snps_ref_index],"RSQ":valid_r2 })
+
+        return ld_proxy.sort_values(by="RSQ",ascending=False)
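Note on the new _extract_ld_proxy helper above: it windows plus/minus windowsizekb kb around each requested variant, loads reference genotypes from the VCF, and keeps flanking variants whose Rogers-Huff r-squared with the lead variant exceeds ld_threshold. Below is a minimal, self-contained sketch of just that r-squared step using scikit-allel, whose GenotypeArray and rogers_huff_r_between calls appear in the diff; the toy genotypes, variable names, and the printed comments are illustrative assumptions, not gwaslab's API.

import numpy as np
from allel import GenotypeArray, rogers_huff_r_between  # scikit-allel

# toy data: one lead variant and three candidate proxies, five samples each
lead_genotype = GenotypeArray([[[0, 1], [1, 1], [0, 0], [0, 1], [1, 1]]]).to_n_alt()
proxy_genotypes = GenotypeArray([
    [[0, 1], [1, 1], [0, 0], [0, 1], [1, 1]],   # identical dosages  -> r2 should be 1
    [[1, 0], [0, 0], [1, 1], [1, 0], [0, 0]],   # mirrored dosages   -> r2 should be 1
    [[0, 0], [0, 1], [0, 0], [1, 1], [0, 1]],   # weakly correlated
]).to_n_alt()

# same call pattern as _get_rsq / _get_rsq_single: r between allele dosages, then squared
rsq = np.power(rogers_huff_r_between(lead_genotype, proxy_genotypes)[0], 2)
print(rsq)  # proxies with rsq above ld_threshold (default 0.8) would be kept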
gwaslab/util_ex_ldsc.py
CHANGED
@@ -260,6 +260,9 @@ class ARGS():
 def _estimate_h2_by_ldsc(insumstats, log, verbose=True, munge=False, munge_args=None, **kwargs):
     sumstats = insumstats.copy()
 
+    if "N" in sumstats.columns:
+        sumstats["N"] = sumstats["N"].astype("int64")
+
     if munge:
         if munge_args is None:
             munge_args={}
@@ -320,6 +323,8 @@ def _estimate_h2_by_ldsc(insumstats, log, verbose=True, munge=False, munge_args=
 
 def _estimate_partitioned_h2_by_ldsc(insumstats, log, verbose=True, **kwargs):
     sumstats = insumstats.copy()
+    if "N" in sumstats.columns:
+        sumstats["N"] = sumstats["N"].astype("int64")
     ##start function with col checking##########################################################
     _start_line = "run LD score regression"
     _end_line = "running LD score regression"
@@ -366,6 +371,8 @@ def _estimate_partitioned_h2_by_ldsc(insumstats, log, verbose=True, **kwargs):
 
 def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **kwargs):
     sumstats = insumstats.copy()
+    if "N" in sumstats.columns:
+        sumstats["N"] = sumstats["N"].astype("int64")
     ##start function with col checking##########################################################
     _start_line = "run LD score regression for genetic correlation"
     _end_line = "running LD score regression for genetic correlation"
@@ -426,6 +433,8 @@ def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **kwargs):
 
 def _estimate_h2_cts_by_ldsc(insumstats, log, verbose=True, **kwargs):
     sumstats = insumstats.copy()
+    if "N" in sumstats.columns:
+        sumstats["N"] = sumstats["N"].astype("int64")
     ##start function with col checking##########################################################
     _start_line = "run LD score regression"
     _end_line = "running LD score regression"
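The only change in util_ex_ldsc.py is that each LDSC wrapper now casts an existing N column to int64 before running LD score regression. A small illustrative sketch of that cast; the toy frame is made up, and whether N ends up as float64 in practice depends on how the sumstats were built upstream (for example merges or NA handling).

import pandas as pd

# hypothetical sumstats whose per-variant sample size became float64 upstream
sumstats = pd.DataFrame({"SNPID": ["rs1", "rs2"], "Z": [1.2, -0.4], "N": [10000.0, 9876.0]})
print(sumstats["N"].dtype)   # float64

# the cast each _estimate_*_by_ldsc wrapper now applies in 3.5.6
if "N" in sumstats.columns:
    sumstats["N"] = sumstats["N"].astype("int64")
print(sumstats["N"].dtype)   # int64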
gwaslab/util_ex_run_2samplemr.py
CHANGED
@@ -21,6 +21,8 @@ def _run_two_sample_mr(sumstatspair_object,
                        n1=None,
                        n2=None,
                        binary1=False,
+                       cck1=None,
+                       cck2=None,
                        ncase1=None,
                        ncontrol1=None,
                        prevalence1=None,
@@ -35,6 +37,22 @@ def _run_two_sample_mr(sumstatspair_object,
     if methods is None:
         methods = ["mr_ivw","mr_simple_mode","mr_weighted_median","mr_egger_regression","mr_ivw_mre", "mr_weighted_mode"]
     methods_string = '"{}"'.format('","'.join(methods))
+
+    if cck1 is not None:
+        log.write(" - ncase1, ncontrol1, prevalence1:{}".format(cck1))
+        binary1 = True
+        ncase1 = cck1[0]
+        ncontrol1 = cck1[1]
+        prevalence1 = cck1[2]
+        n1 = ncase1 + ncontrol1
+    if cck2 is not None:
+        log.write(" - ncase2, ncontrol2, prevalence2:{}".format(cck2))
+        binary2 = True
+        ncase2 = cck2[0]
+        ncontrol2 = cck2[1]
+        prevalence2 = cck2[2]
+        n2 = ncase2 + ncontrol2
+
     if clump==True:
         sumstatspair = sumstatspair_object.clumps["clumps"]
     else:
@@ -64,10 +82,16 @@ def _run_two_sample_mr(sumstatspair_object,
 
     ###
     calculate_r_script = ""
+
     if binary1==True:
         calculate_r_script+= _make_script_for_calculating_r("exposure", ncase1, ncontrol1, prevalence1)
+    else:
+        calculate_r_script+= _make_script_for_calculating_r_quant("exposure")
+
     if binary2==True:
         calculate_r_script+= _make_script_for_calculating_r("outcome", ncase2, ncontrol2, prevalence2)
+    else:
+        calculate_r_script+= _make_script_for_calculating_r_quant("outcome")
 
     # create scripts
     directionality_test_script='''
@@ -218,6 +242,16 @@ def _make_script_for_calculating_r(exposure_or_outcome, ncase, ncontrol, prevale
     return script
 
 
+def _make_script_for_calculating_r_quant(exposure_or_outcome):
+    script = """
+    harmonized_data$"r.{exposure_or_outcome}" <- get_r_from_bsen( harmonized_data$"beta.{exposure_or_outcome}",
+                                                                  harmonized_data$"se.{exposure_or_outcome}",
+                                                                  harmonized_data$"samplesize.{exposure_or_outcome}"
+                                                                  )
+    """.format(
+        exposure_or_outcome = exposure_or_outcome
+    )
+    return script
 
 
 def _filter_by_f(sumstatspair, f_check, n1, binary1=None, ncase1=None, ncontrol1=None, prevalence1=None, log=Log() ):
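Two additions here: cck1/cck2 let callers pass (ncase, ncontrol, prevalence) as a single tuple per trait, and quantitative traits now get an r column via TwoSampleMR's get_r_from_bsen instead of being left without one. A minimal sketch of how a cck tuple is unpacked; the numbers are made up, and this mirrors the logic in the diff rather than calling gwaslab itself.

# hypothetical case/control/prevalence tuple for the exposure trait
cck1 = (20000, 180000, 0.05)   # (ncase, ncontrol, prevalence)

binary1, ncase1, ncontrol1, prevalence1, n1 = False, None, None, None, None
if cck1 is not None:
    binary1 = True
    ncase1, ncontrol1, prevalence1 = cck1
    n1 = ncase1 + ncontrol1      # total N forwarded to the TwoSampleMR script
print(binary1, n1)               # True 200000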
gwaslab/util_ex_run_clumping.py
CHANGED
@@ -162,7 +162,7 @@ def _clump(insumstats, vcf=None, scaled=False, out="clumping_plink2",
             log.write(e.output)
         #os.system(script)
 
-        clumped = pd.read_csv("{}.clumps".format(out_single_chr),
+        clumped = pd.read_csv("{}.clumps".format(out_single_chr),sep="\s+")
         results = pd.concat([results,clumped],ignore_index=True)
 
     # remove temp SNPIDP file
@@ -172,7 +172,9 @@ def _clump(insumstats, vcf=None, scaled=False, out="clumping_plink2",
     log.write("Finished clumping.",verbose=verbose)
     results_sumstats = insumstats.loc[insumstats["SNPID"].isin(results["SNPID"]),:].copy()
     finished(log=log, verbose=verbose, end_line=_end_line)
-
+
+    return results_sumstats, results, plink_log
+
 
 
 
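The clumping fix reads the .clumps output with sep="\s+" and adds an explicit return of the clumped sumstats, the clump table, and the plink log. An illustrative pandas sketch of what the whitespace separator does; the file content below is a made-up stand-in for a .clumps table, and the column names are assumptions for illustration only.

import io
import pandas as pd

# made-up stand-in for a whitespace-aligned .clumps table
clumps_text = """#CHROM POS ID P
1 12345 rs1 1e-9
2 67890 rs2 3e-8
"""

clumped = pd.read_csv(io.StringIO(clumps_text), sep=r"\s+")  # same idea as the sep="\s+" fix
print(clumped.columns.tolist())   # ['#CHROM', 'POS', 'ID', 'P']
print(clumped["ID"].tolist())     # ['rs1', 'rs2']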
gwaslab/util_in_fill_data.py
CHANGED
@@ -1,6 +1,7 @@
 import pandas as pd
 import numpy as np
 import scipy.stats as ss
+from scipy.stats import norm
 from scipy import stats
 from gwaslab.g_Log import Log
 import gc
@@ -8,6 +9,7 @@ import gc
 from gwaslab.g_version import _get_version
 from gwaslab.qc_check_datatype import check_datatype
 
+
 def filldata(
     insumstats,
     to_fill=None,
@@ -38,7 +40,7 @@ def filldata(
         for i in skip_cols:
             to_fill.remove(i)
         log.write(" -Skipping columns: ",skip_cols, verbose=verbose)
-    if len(set(to_fill) & set(["OR","OR_95L","OR_95U","BETA","SE","P","Z","CHISQ","MLOG10P","MAF"]))==0:
+    if len(set(to_fill) & set(["OR","OR_95L","OR_95U","BETA","SE","P","Z","CHISQ","MLOG10P","MAF","SIG"]))==0:
         log.write(" -No available columns to fill. Skipping.", verbose=verbose)
         log.write("Finished filling data using existing columns.", verbose=verbose)
         return sumstats
@@ -217,6 +219,20 @@ def fill_maf(sumstats,log,verbose=True,filled_count=0):
         return 0,filled_count
     return 1,filled_count
 
+def fill_sig(sumstats,log,sig_level=5e-8, verbose=True,filled_count=0):
+    if "P" in sumstats.columns or "MLOG10P" in sumstats.columns:
+        log.write(" - Determining significant using P and MLOG10P with threshold:{}".format(sig_level), verbose=verbose)
+        if "P" in sumstats.columns:
+            is_sig = sumstats["P"]<sig_level
+        elif "MLOG10P" in sumstats.columns:
+            is_sig = sumstats["MLOG10P"]>np.log10(sig_level)
+        sumstats["SIGNIFICANT"] = False
+        sumstats.loc[is_sig, "SIGNIFICANT"] = True
+        filled_count +=1
+    else:
+        return 0,filled_count
+    return 1,filled_count
+
 ####################################################################################################################
 def fill_extreme_mlog10(sumstats, z):
     log_pvalue = np.log(2) + ss.norm.logsf(np.abs(sumstats[z])) #two-sided
@@ -287,7 +303,10 @@ def fill_iteratively(sumstats,raw_to_fill,log,only_sig,df,extreme,verbose,sig_le
             else:
                 status,filled_count = fill_mlog10p(sumstats,log,verbose=verbose)
                 if status == 1 : to_fill.remove("MLOG10P")
-
+
+        if "SIG" in to_fill:
+            status,filled_count = fill_sig(sumstats,sig_level=sig_level ,log=log,verbose=verbose,filled_count=filled_count)
+            if status == 1 : to_fill.remove("SIG")
         if filled_count == 0:
             break
 
@@ -330,4 +349,10 @@ def _convert_or_to_beta(OR):
     return np.log(OR)
 
 def _convert_beta_to_or(beta):
-    return np.exp(beta)
+    return np.exp(beta)
+
+def rank_based_int(series, c=3/8):
+    #https://onlinelibrary.wiley.com/doi/10.1111/biom.13214
+    n=sum(~series.isna())
+    normalized_value = norm.ppf((series.rank()-c)/(n+1-2*c))
+    return normalized_value
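Besides the new SIG/SIGNIFICANT filler, this file gains rank_based_int, a rank-based inverse-normal transform with offset c = 3/8 (the reference linked in the source comment). A self-contained sketch of the same formula on a toy series; the input values are invented for illustration.

import numpy as np
import pandas as pd
from scipy.stats import norm

def rank_based_int(series, c=3/8):
    # rank-based inverse normal transform, same formula as the new gwaslab helper
    n = sum(~series.isna())
    return norm.ppf((series.rank() - c) / (n + 1 - 2 * c))

x = pd.Series([0.1, 3.2, 1.5, 7.8, 2.0])
print(rank_based_int(x).round(3).tolist())  # roughly symmetric standard-normal quantiles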
gwaslab/util_in_filter_value.py
CHANGED
@@ -513,4 +513,69 @@ def _exclude(sumstats, exclude=None, id_use="SNPID", log=Log(), verbose=True ):
     log.write(" -Excluding {} variants from sumstats...".format(len(exclude)),verbose=verbose)
     sumstats = sumstats.loc[~sumstats[id_use].isin(exclude),:]
     log.write(" -Excluded {} variants from sumstats...".format(len(sumstats)),verbose=verbose)
-    return sumstats
+    return sumstats
+
+def _filter_region(sumstats, region, chrom="CHR",pos="POS",log=Log(),verbose=True):
+    if region is not None:
+        region_chr = region[0]
+        region_start = region[1]
+        region_end = region[2]
+
+        log.write(" -Extract SNPs in region : chr{}:{}-{}...".format(region_chr, region[1], region[2]),verbose=verbose)
+
+        in_region_snp = (sumstats[chrom]==region_chr) & (sumstats[pos]<region_end) & (sumstats[pos]>region_start)
+
+        log.write(" -Extract SNPs in specified regions: "+str(sum(in_region_snp)),verbose=verbose)
+        sumstats = sumstats.loc[in_region_snp,:]
+    return sumstats.copy()
+
+def _search_variants( sumstats, snplist=None,
+                      snpid="SNPID" ,rsid="rsID",
+                      chrom="CHR",pos="POS",ea="EA",nea="NEA",
+                      log=Log(),verbose=True):
+    log.write("Start to search for variants...", verbose=verbose)
+    # create a boolean col with FALSE
+    if snpid in sumstats.columns:
+        is_extract = sumstats[snpid]!=sumstats[snpid]
+    else:
+        is_extract = sumstats[rsid]!=sumstats[rsid]
+
+    # search each variant
+    for variant in snplist:
+
+        if pd.api.types.is_list_like(variant):
+            # (1:1234)
+            single_chrom=variant[0]
+            single_pos=variant[1]
+            is_extract = is_extract | ((sumstats[pos] == single_pos ) &(sumstats[chrom] == single_chrom))
+
+        elif pd.api.types.is_string_dtype(type(variant)):
+            # rs123
+            if "rsID" in sumstats.columns:
+                is_extract = is_extract | (sumstats["rsID"] == variant)
+
+            # 1:123:A:D
+            if "SNPID" in sumstats.columns:
+                is_extract = is_extract | (sumstats["SNPID"] == variant)
+
+            # 1:123:A:D -> (1:1234)
+            a= re.match(r'^(chr|Chr|CHR)?(\d+)[:_-](\d+)([:_-]([ATCG]+)[:_-]([ATCG]+))?$', variant, flags=0)
+
+            if a is not None:
+                if a[4] is None:
+                    single_chrom=int(a[2])
+                    single_pos=int(a[3])
+                    is_extract = is_extract | ((sumstats[pos] == single_pos ) &(sumstats[chrom] == single_chrom))
+                else:
+                    single_chrom = int(a[2])
+                    single_pos = int(a[3])
+                    single_ea = a[5]
+                    single_nea = a[6]
+                    a_match = ((sumstats[nea] == single_nea) & (sumstats[ea] == single_ea)) | ((sumstats[nea] == single_ea) & (sumstats[ea] == single_nea))
+                    is_extract = is_extract | ((sumstats[pos] == single_pos ) &(sumstats[chrom] == single_chrom) & a_match)
+
+    to_search = sumstats.loc[is_extract,:].copy()
+    log.write(" -Found {} variants...".format(len(to_search)),verbose=verbose)
+
+    log.write("Finished searching variants.", verbose=verbose)
+    return to_search
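The new _search_variants accepts a mixed snplist: (chrom, pos) tuples, rsIDs, SNPIDs, and strings such as chromosome:position with or without alleles. The regex it uses for the string forms can be exercised on its own; a short sketch follows, with made-up query strings.

import re

# the pattern _search_variants uses for "chr:pos" and "chr:pos:allele:allele" strings
pattern = r'^(chr|Chr|CHR)?(\d+)[:_-](\d+)([:_-]([ATCG]+)[:_-]([ATCG]+))?$'

for query in ["1:725932", "chr7_92383888_C_T", "rs12345"]:
    m = re.match(pattern, query)
    if m is None:
        print(query, "-> handled only by the rsID/SNPID equality lookup")
    elif m[4] is None:
        print(query, "-> chr", int(m[2]), "pos", int(m[3]))
    else:
        print(query, "-> chr", int(m[2]), "pos", int(m[3]), "alleles", m[5], "/", m[6])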
gwaslab/util_in_merge.py
ADDED
@@ -0,0 +1,51 @@
+import pandas as pd
+from gwaslab.g_Log import Log
+import re
+
+def _extract_variant(variant_set, sumstats_dic, log=Log(), verbose=True):
+
+    combined = pd.DataFrame()
+    log.write("Start to initialize gl.SumstatsSet...", verbose=verbose)
+    for key, sumstats_gls in sumstats_dic.items():
+        log.write(" -{} : {}".format(key, sumstats_gls), verbose=verbose)
+
+    for key, sumstats_gls in sumstats_dic.items():
+
+        sumstats_single = sumstats_gls.data
+
+        # create a boolean col with FALSE
+        is_extract = sumstats_single["SNPID"]!=sumstats_single["SNPID"]
+
+        for variant in variant_set:
+
+            if pd.api.types.is_list_like(variant):
+
+                chrom=variant[0]
+                pos=variant[1]
+
+                is_extract = is_extract | ((sumstats_single["POS"] == pos ) &(sumstats_single["CHR"] == chrom))
+            elif pd.api.types.is_string_dtype(type(variant)):
+
+                is_extract = is_extract | (sumstats_single["SNPID"] == variant)
+
+                a= re.search(r'^(chr|Chr|CHR)?(\d+)[:_-](\d+)[:_-][ATCG]+[:_-][ATCG]+$', variant, flags=0)
+                if a is not None:
+                    chrom=int(a[2])
+                    pos=int(a[3])
+                    is_extract = is_extract | ((sumstats_single["POS"] == pos ) &(sumstats_single["CHR"] == chrom))
+
+        to_extract = sumstats_single.loc[is_extract,:].copy()
+        log.write(" -Extracted {} variants from {}".format(len(to_extract), key),verbose=verbose)
+        to_extract["STUDY"] = key
+
+        to_extract_cols=["STUDY"]
+
+        default_cols=["SNPID","EA","NEA","CHR","POS","BETA","SE","P","MLOG10P","EAF","MAF","STATUS"]
+
+        for i in default_cols:
+            if i in sumstats_single.columns:
+                to_extract_cols.append(i)
+
+        combined = pd.concat([combined, to_extract[to_extract_cols]], ignore_index=True)
+    log.write("Finished initializing gl.SumstatsSet.", verbose=verbose)
+    return combined
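util_in_merge.py is new: _extract_variant walks a dict of study name to Sumstats object, builds a boolean mask per study for the requested variants, tags the hits with a STUDY column, and stacks them for gl.SumstatsSet. A minimal sketch of the mask-building idea on a plain DataFrame; this is not a gwaslab Sumstats object, and the table and variant set are made up.

import pandas as pd

# stand-in for one study's .data table
sumstats_single = pd.DataFrame({
    "SNPID": ["1:100:A:G", "1:200:C:T", "2:300:G:A"],
    "CHR":   [1, 1, 2],
    "POS":   [100, 200, 300],
})
variant_set = ["1:100:A:G", (2, 300)]

# start from an all-False mask, exactly like the SNPID != SNPID trick in the helper
is_extract = sumstats_single["SNPID"] != sumstats_single["SNPID"]
for variant in variant_set:
    if pd.api.types.is_list_like(variant):   # (CHR, POS) tuple
        is_extract |= (sumstats_single["CHR"] == variant[0]) & (sumstats_single["POS"] == variant[1])
    else:                                     # SNPID string
        is_extract |= sumstats_single["SNPID"] == variant

print(sumstats_single.loc[is_extract, "SNPID"].tolist())   # ['1:100:A:G', '2:300:G:A']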
gwaslab/viz_aux_save_figure.py
CHANGED
@@ -52,7 +52,8 @@ def get_default_path(keyword,fmt="png"):
                        "esc":"effect_size_comparision",
                        "afc":"allele_frequency_comparision",
                        "gwheatmap":"genome_wide_heatmap",
-                       "scatter":"scatter"
+                       "scatter":"scatter",
+                       "forest":"forest"
                        }
     prefix = path_dictionary[keyword]
     count = 1
gwaslab/viz_plot_credible_sets.py
ADDED
@@ -0,0 +1,99 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+from gwaslab.g_Log import Log
+from gwaslab.viz_aux_quickfix import _quick_assign_i_with_rank
+from gwaslab.viz_plot_mqqplot import _process_xtick
+from gwaslab.viz_plot_mqqplot import _process_xlabel
+from gwaslab.bd_common_data import get_number_to_chr
+from gwaslab.util_in_filter_value import _filter_region
+from gwaslab.io_process_args import _extract_kwargs
+
+def _plot_cs(pipcs,
+             region,
+             figax=None,
+             _posdiccul=None,
+             xtick_chr_dict=None,
+             pip="PIP",
+             onlycs=False,
+             cs="CREDIBLE_SET_INDEX",
+             marker_size=(45,85),
+             fontsize = 12,
+             font_family = "Arial",
+             legend_title="Credible sets",
+             log=Log(),
+             verbose=True,
+             **kwargs):
+    '''
+    pipcs : a DataFrame of finemapping results
+    '''
+    ## parameters #############################
+    if xtick_chr_dict is None:
+        xtick_chr_dict = get_number_to_chr()
+
+    scatter_kwargs = _extract_kwargs("scatter", dict(), locals())
+
+    region_marker_shapes = ['o', '^','s','D','*','P','X','h','8']
+    region_ld_colors_m = ["grey","#E51819","green","#F07818","#AD5691","yellow","purple"]
+
+
+    ## filter data #############################
+    pipcs = _filter_region(pipcs, region)
+    if onlycs ==True:
+        pipcs = pipcs.loc[pipcs[cs]>0,:]
+
+    pipcs[cs] = pipcs[cs].astype("string")
+
+    ## figure and ax #############################
+    if figax is not None:
+        ax=figax[1]
+        fig=figax[0]
+    else:
+        fig, ax = plt.subplots()
+
+    # assign i
+    pipcs,chrom_df=_quick_assign_i_with_rank(pipcs, chrpad=0.00,
+                                             use_rank=False,
+                                             chrom="CHR",pos="POS",
+                                             drop_chr_start=False,
+                                             _posdiccul=_posdiccul)
+    pipcs = pipcs.sort_values(by=cs,ascending=True)
+
+    ## plot ##########################################
+    scatter_kwargs["markers"]= {m:region_marker_shapes[i] for i,m in enumerate(pipcs[cs].unique())}
+    palette = sns.color_palette(region_ld_colors_m,n_colors=pipcs[cs].nunique())
+    edgecolor="none"
+
+    plot = sns.scatterplot(data=pipcs,
+                           x="i",
+                           y=pip,
+                           hue=cs,
+                           edgecolor=edgecolor,
+                           palette=palette,
+                           style=cs,
+                           s=marker_size[1],
+                           ax=ax,
+                           **scatter_kwargs)
+
+    # process legend
+    handles, labels = ax.get_legend_handles_labels()
+    new_labels = []
+    new_handles = []
+    ncol = len(labels)
+
+    for i,label in enumerate(labels):
+        if label in [str(j) for j in range(1,10)]:
+            new_labels.append(labels[i])
+            new_handles.append(handles[i])
+
+    ax.legend(labels =new_labels,
+              handles=new_handles,
+              loc="upper right",
+              bbox_to_anchor=(0.995, 0.995),
+              ncol=1,
+              scatterpoints=2,
+              title=legend_title,
+              frameon=True)
+
+    return fig, log
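The new _plot_cs draws PIP against position, with colour and marker style keyed to the credible-set index and a legend restricted to set labels 1-9. A standalone seaborn sketch of the same plotting pattern; the fine-mapping values are invented, and the column names simply follow the function's defaults pip="PIP" and cs="CREDIBLE_SET_INDEX".

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# toy fine-mapping table; values are invented for illustration
pipcs = pd.DataFrame({
    "POS": [101_000, 102_500, 104_000, 105_500, 107_000],
    "PIP": [0.85, 0.10, 0.62, 0.05, 0.30],
    "CREDIBLE_SET_INDEX": ["1", "1", "2", "0", "2"],   # "0" = not in any credible set
})

fig, ax = plt.subplots()
sns.scatterplot(data=pipcs, x="POS", y="PIP",
                hue="CREDIBLE_SET_INDEX", style="CREDIBLE_SET_INDEX",
                s=85, edgecolor="none", ax=ax)
ax.legend(title="Credible sets", loc="upper right")
fig.savefig("credible_sets_sketch.png")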