gwaslab-3.5.7-py3-none-any.whl → gwaslab-3.6.0-py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- gwaslab/__init__.py +2 -0
- gwaslab/bd_common_data.py +1 -0
- gwaslab/bd_get_hapmap3.py +0 -1
- gwaslab/data/formatbook.json +78 -0
- gwaslab/data/reference.json +3 -1
- gwaslab/g_Sumstats.py +110 -25
- gwaslab/g_SumstatsMulti.py +287 -0
- gwaslab/g_SumstatsPair.py +101 -16
- gwaslab/g_Sumstats_polars.py +245 -0
- gwaslab/g_headers.py +12 -3
- gwaslab/g_meta.py +124 -47
- gwaslab/g_meta_update.py +48 -0
- gwaslab/g_vchange_status_polars.py +44 -0
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +169 -110
- gwaslab/hm_casting_polars.py +202 -0
- gwaslab/hm_harmonize_sumstats.py +19 -8
- gwaslab/io_load_ld.py +529 -0
- gwaslab/io_preformat_input.py +11 -0
- gwaslab/io_preformat_input_polars.py +632 -0
- gwaslab/io_process_args.py +25 -1
- gwaslab/io_read_ldsc.py +34 -3
- gwaslab/io_read_pipcs.py +62 -6
- gwaslab/prscs_gigrnd.py +122 -0
- gwaslab/prscs_mcmc_gtb.py +136 -0
- gwaslab/prscs_parse_genet.py +98 -0
- gwaslab/qc_build.py +53 -0
- gwaslab/qc_check_datatype.py +10 -8
- gwaslab/qc_check_datatype_polars.py +128 -0
- gwaslab/qc_fix_sumstats.py +25 -23
- gwaslab/qc_fix_sumstats_polars.py +193 -0
- gwaslab/util_ex_calculate_ldmatrix.py +49 -19
- gwaslab/util_ex_gwascatalog.py +71 -28
- gwaslab/util_ex_infer_ancestry.py +65 -0
- gwaslab/util_ex_ldsc.py +67 -21
- gwaslab/util_ex_match_ldmatrix.py +396 -0
- gwaslab/util_ex_run_2samplemr.py +0 -2
- gwaslab/util_ex_run_ccgwas.py +155 -0
- gwaslab/util_ex_run_coloc.py +1 -1
- gwaslab/util_ex_run_hyprcoloc.py +117 -0
- gwaslab/util_ex_run_magma.py +74 -0
- gwaslab/util_ex_run_mesusie.py +155 -0
- gwaslab/util_ex_run_mtag.py +92 -0
- gwaslab/util_ex_run_prscs.py +85 -0
- gwaslab/util_ex_run_susie.py +40 -9
- gwaslab/util_in_estimate_ess.py +18 -0
- gwaslab/util_in_fill_data.py +20 -1
- gwaslab/util_in_filter_value.py +10 -5
- gwaslab/util_in_get_sig.py +71 -13
- gwaslab/util_in_meta.py +168 -4
- gwaslab/util_in_meta_polars.py +174 -0
- gwaslab/viz_aux_annotate_plot.py +13 -2
- gwaslab/viz_plot_compare_effect.py +87 -23
- gwaslab/viz_plot_credible_sets.py +55 -11
- gwaslab/viz_plot_effect.py +22 -12
- gwaslab/viz_plot_miamiplot2.py +3 -2
- gwaslab/viz_plot_mqqplot.py +94 -84
- gwaslab/viz_plot_qqplot.py +9 -7
- gwaslab/viz_plot_regional2.py +2 -1
- gwaslab/viz_plot_stackedregional.py +4 -1
- {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info}/METADATA +46 -68
- gwaslab-3.6.0.dist-info/RECORD +119 -0
- {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info}/WHEEL +1 -1
- gwaslab-3.5.7.dist-info/RECORD +0 -96
- {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info/licenses}/LICENSE +0 -0
- {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info}/top_level.txt +0 -0
gwaslab/util_ex_run_susie.py
CHANGED
@@ -9,7 +9,20 @@ from gwaslab.g_version import _check_susie_version
 from gwaslab.qc_fix_sumstats import start_to
 from gwaslab.qc_fix_sumstats import finished
 
-def _run_susie_rss(filepath,
+def _run_susie_rss(filepath,
+                   r="Rscript",
+                   mode="bs",
+                   max_iter=100000,
+                   min_abs_corr=0.1,
+                   refine="TRUE",
+                   L=10,
+                   fillldna=True,
+                   n=None,
+                   delete=False, #if delete output file
+                   susie_args="",
+                   log=Log(),
+                   main_sumstats=None,
+                   verbose=True):
     ##start function with col checking##########################################################
     _start_line = "run finemapping using SuSieR from command line"
     _end_line = "running finemapping using SuSieR from command line"
@@ -43,8 +56,8 @@ def _run_susie_rss(filepath, r="Rscript", mode="bs",max_iter=100000,min_abs_corr
     for index, row in filelist.iterrows():
         gc.collect()
         study = row["STUDY"]
-        ld_r_matrix = row["LD_R_MATRIX"]
-        sumstats = row["LOCUS_SUMSTATS"]
+        ld_r_matrix = row["LD_R_MATRIX"] #ld matrix path
+        sumstats = row["LOCUS_SUMSTATS"] #sumsttas path
         output_prefix = sumstats.replace(".sumstats.gz","")
         log.write(" -Running for: {} - {}".format(row["SNPID"],row["STUDY"] ))
         log.write(" -Locus sumstats:{}".format(sumstats))
@@ -54,7 +67,7 @@ def _run_susie_rss(filepath, r="Rscript", mode="bs",max_iter=100000,min_abs_corr
         rscript='''
library(susieR)
 
-sumstats <- read.csv("{}")
+sumstats <- read.csv("{}",sep="\t")
 
R <- as.matrix(read.csv("{}",sep="\t",header=FALSE))
{}
@@ -67,6 +80,8 @@
 
output <- susie_fitted_summary$vars
output$SNPID <- sumstats$SNPID[susie_fitted_summary$vars$variable]
+output$LOCUS <- "{}"
+output$STUDY <- "{}"
 
write.csv(output, "{}.pipcs", row.names = FALSE)
'''.format(sumstats,
@@ -79,6 +94,8 @@
                    refine,
                    L,
                    susie_args,
+                   row["SNPID"],
+                   row["STUDY"],
                    output_prefix)
         susier_line = "susie_rss({}, n = {}, R = R, max_iter = {}, min_abs_corr={}, refine = {}, L = {}{})".format("z= sumstats$Z," if mode=="z" else "bhat = sumstats$BETA,shat = sumstats$SE,",
                    n if n is not None else "n",
@@ -88,34 +105,48 @@
                    L,
                    susie_args)
         log.write(" -SuSieR script: {}".format(susier_line))
-
+
+        temp_r_path = "_{}_{}_{}_gwaslab_susie_temp.R".format(study,row["SNPID"],id(sumstats))
+        log.write(" -Createing temp R script: {}".format(temp_r_path))
+        with open(temp_r_path,"w") as file:
             file.write(rscript)
 
-        script_run_r = "{}
+        script_run_r = "{} {}".format(r, temp_r_path)
 
         try:
+            log.write(" -Running SuSieR from command line...")
             output = subprocess.check_output(script_run_r, stderr=subprocess.STDOUT, shell=True,text=True)
             #plink_process = subprocess.Popen("exec "+script_run_r, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True,text=True)
             #output1,output2 = plink_process.communicate()
             #output= output1 + output2+ "\n"
             #plink_process.kill()
-
+
             r_log+= output + "\n"
             pip_cs = pd.read_csv("{}.pipcs".format(output_prefix))
             pip_cs["LOCUS"] = row["SNPID"]
             pip_cs["STUDY"] = row["STUDY"]
             locus_pip_cs = pd.concat([locus_pip_cs,pip_cs],ignore_index=True)
 
-            os.remove(
+            os.remove(temp_r_path)
+            log.write(" -Removing temp R script: {}".format(temp_r_path))
+
             if delete == True:
                 os.remove("{}.pipcs".format(output_prefix))
+                log.write(" -Removing output file: {}".format(temp_r_path))
             else:
                 log.write(" -SuSieR result summary to: {}".format("{}.pipcs".format(output_prefix)))
         except subprocess.CalledProcessError as e:
             log.write(e.output)
-            os.remove(
+            os.remove(temp_r_path)
+            log.write(" -Removing temp R script: {}".format(temp_r_path))
 
     locus_pip_cs = locus_pip_cs.rename(columns={"variable":"N_SNP","variable_prob":"PIP","cs":"CREDIBLE_SET_INDEX"})
+    locus_pip_cs = pd.merge(locus_pip_cs, main_sumstats, on="SNPID",how="left")
+
     finished(log=log, verbose=verbose, end_line=_end_line)
     return locus_pip_cs
 
+def _get_cs_lead(pipcs):
+    leads = pipcs.loc[pipcs["CREDIBLE_SET_INDEX"]>0,:]
+    leads = leads.sort_values(by="PIP",ascending=False).drop_duplicates(subset=["STUDY","LOCUS","CREDIBLE_SET_INDEX"])
+    return leads
gwaslab/util_in_estimate_ess.py
ADDED
@@ -0,0 +1,18 @@
+import numpy as np
+from scipy.stats import norm
+from gwaslab.g_Log import Log
+
+
+def _get_ess(sumstats, method="metal",log=Log(),verbose=True):
+    log.write("Start to estimate effective sample size (N_EFF)...", verbose=verbose)
+    if type(method) is str:
+        if method =="metal":
+            log.write(" - Method: {} ".format(method), verbose=verbose)
+            log.write(" - Referencec: {} ".format("Willer, C. J., Li, Y., & Abecasis, G. R. (2010)"), verbose=verbose)
+            log.write(" - Equation: {} ".format(" N_EFF = 4 * N_CASE * N_CONTROL / (N_CASE + N_CONTROL)"), verbose=verbose)
+            # Willer, C. J., Li, Y., & Abecasis, G. R. (2010). METAL: fast and efficient meta-analysis of genomewide association scans. Bioinformatics, 26(17), 2190-2191.
+            sumstats["N_EFF"] = 4 / (1/sumstats["N_CASE"] + 1/sumstats["N_CONTROL"])
+    else:
+        sumstats["N_EFF"] = method
+    log.write("Finished estimating effective sample size (N_EFF)...", verbose=verbose)
+    return sumstats
gwaslab/util_in_fill_data.py
CHANGED
@@ -355,4 +355,23 @@ def rank_based_int(series, c=3/8):
     #https://onlinelibrary.wiley.com/doi/10.1111/biom.13214
     n=sum(~series.isna())
     normalized_value = norm.ppf((series.rank()-c)/(n+1-2*c))
-    return normalized_value
+    return normalized_value
+
+
+################################################################################################################################################################################
+
+def _get_multi_min(sumstats_multi, col, nstudy,log=Log(), verbose=True):
+    cols =[]
+    for i in range(nstudy):
+        single_header = "{}_{}".format(col, i + 1)
+        if single_header in sumstats_multi.columns:
+            cols.append(single_header)
+
+    combined_header = "{}_{}".format(col, "MIN")
+    log.write(" -Filling {} using {}".format(combined_header,",".join(cols)), verbose=verbose)
+    sumstats_multi[combined_header] = sumstats_multi[cols].min(axis=1)
+
+    combined_header_index = "{}_{}_COL".format(col, "MIN")
+    sumstats_multi[combined_header_index] = sumstats_multi[cols].idxmin(axis=1)
+    return sumstats_multi
+
gwaslab/util_in_filter_value.py
CHANGED
@@ -217,7 +217,10 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
     gc.collect()
     return sumstats
 
-def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS",
+def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS",
+               ea="EA", nea="NEA",build="19",
+               change_status=True,
+               verbose=True,log=Log()):
     ##start function with col checking##########################################################
     _start_line = "infer genome build version using hapmap3 SNPs"
     _end_line = "inferring genome build version using hapmap3 SNPs"
@@ -261,13 +264,15 @@ def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NE
 
     if match_count_for_19 > match_count_for_38:
         log.write(" -Since num_hg19 >> num_hg38, assigning genome build hg19...", verbose=verbose)
-        sumstats[status] = vchange_status(sumstats[status],1,"9","1")
-        sumstats[status] = vchange_status(sumstats[status],2,"9","9")
+        if change_status==True:
+            sumstats[status] = vchange_status(sumstats[status],1,"9","1")
+            sumstats[status] = vchange_status(sumstats[status],2,"9","9")
         inferred_build="19"
     elif match_count_for_19 < match_count_for_38:
         log.write(" -Since num_hg19 << num_hg38, assigning genome build hg38...", verbose=verbose)
-        sumstats[status] = vchange_status(sumstats[status],1,"9","3")
-        sumstats[status] = vchange_status(sumstats[status],2,"9","8")
+        if change_status==True:
+            sumstats[status] = vchange_status(sumstats[status],1,"9","3")
+            sumstats[status] = vchange_status(sumstats[status],2,"9","8")
         inferred_build="38"
     else:
         log.write(" -Since num_hg19 = num_hg38, unable to infer...", verbose=verbose)
gwaslab/util_in_get_sig.py
CHANGED
@@ -17,6 +17,7 @@ from gwaslab.util_ex_gwascatalog import gwascatalog_trait
 from gwaslab.qc_fix_sumstats import check_dataframe_shape
 from gwaslab.qc_fix_sumstats import start_to
 from gwaslab.qc_fix_sumstats import finished
+from gwaslab.qc_build import _check_build
 from gwaslab.util_in_correct_winnerscurse import wc_correct
 # getsig
 # closest_gene
@@ -372,6 +373,8 @@ def getnovel(insumstats,
             xymt=["X","Y","MT"],
             anno=False,
             wc_correction=False,
+            use_cache=True,
+            cache_dir="./",
             build="19",
             source="ensembl",
             gwascatalog_source="NCBI",
@@ -405,15 +408,26 @@
     ############################################################################################
     knownsig = pd.DataFrame()
     if efo != False:
+        # For GWAS catalog, checking if sumstats build is hg38
+        _check_build(target_build="38" ,build=build ,log=log,verbose=verbose)
+
         if type(efo) is not list:
             log.write("Start to retrieve data using EFO: {}...".format(efo), verbose=verbose)
-            known_Sumstats = gwascatalog_trait(efo,source=gwascatalog_source,
+            known_Sumstats = gwascatalog_trait(efo,source=gwascatalog_source,
+                                               sig_level=sig_level,
+                                               use_cache=use_cache,
+                                               cache_dir=cache_dir,
+                                               verbose=verbose,log=log)
             knownsig = known_Sumstats.data.copy()
-        else:
+        else:
             knownsig=pd.DataFrame()
             log.write("Start to retrieve data using {} EFOs: {}...".format(len(efo),efo), verbose=verbose)
+
             for single_efo in efo:
-                known_Sumstats = gwascatalog_trait(single_efo,source=gwascatalog_source,
+                known_Sumstats = gwascatalog_trait(single_efo,source=gwascatalog_source,
+                                                   use_cache=use_cache,
+                                                   cache_dir=cache_dir,
+                                                   sig_level=sig_level,verbose=verbose,log=log)
                 known_Sumstats.data["EFOID"] = single_efo
                 knownsig = pd.concat([known_Sumstats.data, knownsig],ignore_index=True)
         knownsig["CHR"] = knownsig["CHR"].astype("Int64")
@@ -832,44 +846,88 @@ def _check_novel_set(insumstats,
         else:
             reference_dict[row[group_key]] = {row[snpset]:set([row[snpid]])}
     ############################################################################################
-
+    #match group/trait
     try:
         no_reference_avaialble = allsig.loc[~allsig[group_key].isin(reference_dict.keys()),group_key]
         if len(no_reference_avaialble)>0:
             log.write(" -Groups not in reference: {}".format( ",".join(no_reference_avaialble)), verbose=verbose)
     except:
         pass
+    ############################################################################################
 
     log.write(" -Checking if variants are in reference variant sets...", verbose=verbose)
-    known_list = allsig.apply(lambda x: check_overlap(x,snpid, group_key,reference_dict), axis=1)
+    #known_list = allsig.apply(lambda x: check_overlap(x,snpid, group_key,reference_dict), axis=1)
+    new_row_list = []
+    for index, row in allsig.iterrows():
+
+        row = check_overlap(row, snpset, snpid, group_key,reference_dict)
+        new_row_list = new_row_list+row
+    known_df = pd.DataFrame(new_row_list,
+                            columns=[snpid,group_key, snpset,"KNOWN_SET","OVERLAP_VARIANT","KNOWN_SET_VARIANT"])
 
-    allsig
-
+    allsig = pd.merge(allsig,known_df, on=[snpid, group_key, snpset],how="left")
+
+    #allsig["KNOWN_SET"] = known_list.str[0]
+    #allsig["OVERLAP_VARIANT"] = known_list.str[1]
+    #allsig["KNOWN_SET_VARIANT"] = known_list.str[2]
 
+    ##
+    is_overlapped = ~allsig["KNOWN_SET"].isna()
+    allsig["KNOWN_SET_SIZE"] = 0
+    allsig.loc[is_overlapped, "KNOWN_SET_SIZE"] = allsig.loc[is_overlapped, "KNOWN_SET_VARIANT"].str.len()
+
+    # sumstats set dic
     back_dict={}
    for i in allsig[group_key].unique():
+        # for each trait in sumstats
         back_dict[i] ={}
         for j in allsig.loc[allsig[group_key]==i,snpset].unique():
+            #for each locus in each trait
             back_dict[i][j] =set()
-            for index, row in allsig.loc[(allsig[group_key]==i) & (allsig[snpset]==j)
-
+            for index, row in allsig.loc[(allsig[group_key]==i) & (allsig[snpset]==j),:].iterrows():
+                #for each variant in each locus
+                back_dict[i][j].add("{}".format(row["SNPID"]))
 
-    allsig["
+    allsig["SUMSTATS_SET_VARIANT"] = allsig.apply(lambda x: assign_set_variant(x,group_key,snpset,back_dict), axis=1)
+    allsig["SUMSTATS_SET_SIZE"] = 0
+    allsig["SUMSTATS_SET_SIZE"] = allsig[ "SUMSTATS_SET_VARIANT"].str.len()
 
     finished(log,verbose,_end_line)
 
     return allsig
 
-def check_overlap(x,snpid, group_key,reference_dict):
+def check_overlap(x,snpset, snpid, group_key,reference_dict):
+
+    matched=[]
     if x[group_key] in reference_dict.keys():
+        # if trait match
         for key, value in reference_dict[x[group_key]].items():
+            # locus and snplist
             if x[snpid] in value:
-                return key, x[snpid], value
-    return pd.NA, pd.NA, pd.NA
+                # if sumstats snp in reference snplist for locus
+                # return locus and snsumstats snppid
+                matched.append( (x[snpid], x[group_key], x[snpset], key, x[snpid], value))
+    if len(matched)==0:
+        matched = [(x[snpid], x[group_key], x[snpset], pd.NA, pd.NA, pd.NA)]
+    return matched
+
+#def check_overlap(x,snpid, group_key,reference_dict):
+#    if x[group_key] in reference_dict.keys():
+#        # if trait match
+#        for key, value in reference_dict[x[group_key]].items():
+#            # locus and snplist
#            if x[snpid] in value:
+#                # if sumstats snp in reference snplist for locus
+#                # return locus and snsumstats snppid
+#                return key, x[snpid], value
+#    return pd.NA, pd.NA, pd.NA
 
 def assign_set_variant(x,group_key,snpset,back_dict):
     if x[group_key] in back_dict.keys():
+        # if trait match
         if x[snpset] in back_dict[x[group_key]].keys():
+            #if locus match
             if len(back_dict[x[group_key]][x[snpset]]) >0:
+                # return sumstats snplist for locus
                 return back_dict[x[group_key]][x[snpset]]
     return pd.NA
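Note: the reworked check_overlap returns one row per matched known set instead of a single tuple, which is what lets _check_novel_set merge the results back onto allsig by (SNPID, group, set). A minimal sketch of the lookup against the reference_dict layout, {trait -> {known set -> set of variant IDs}} (all identifiers below are toy values):

import pandas as pd

reference_dict = {"EFO_0001360": {"known_locus_1": {"1:123456_A_G", "1:123500_T_C"}}}
row = {"SNPID": "1:123456_A_G", "GROUP": "EFO_0001360", "LOCUS": "sumstats_locus_7"}

matched = []
for key, value in reference_dict.get(row["GROUP"], {}).items():
    if row["SNPID"] in value:  # variant present in a known set for this trait
        matched.append((row["SNPID"], row["GROUP"], row["LOCUS"], key, row["SNPID"], value))
if not matched:                # no overlap -> single row of NAs, so the merge keeps the variant
    matched = [(row["SNPID"], row["GROUP"], row["LOCUS"], pd.NA, pd.NA, pd.NA)]
print(matched)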
gwaslab/util_in_meta.py
CHANGED
@@ -7,8 +7,12 @@ from gwaslab.g_Log import Log
 from gwaslab.io_to_pickle import load_data_from_pickle
 from gwaslab.g_Sumstats import Sumstats
 import gc
+import statsmodels.api as sm
 
-def meta_analyze(sumstats_list,
+def meta_analyze(sumstats_list,
+                 random_effects=False,
+                 match_allele=True,
+                 log=Log()):
 
     ###########################################################################
     columns=["SNPID","CHR","POS","EA","NEA"]
@@ -16,6 +20,7 @@ def meta_analyze(sumstats_list,random_effects=False, match_allele=True, log=Log(
 
     log.write("Start to perform meta-analysis...")
     log.write(" -Datasets:")
+
     for index,sumstats_path in enumerate(sumstats_list):
         if isinstance(sumstats_path, pd.DataFrame):
             log.write(" -Sumstats #{}: {} ".format(index, sumstats_path))
@@ -42,8 +47,6 @@ def meta_analyze(sumstats_list,random_effects=False, match_allele=True, log=Log(
     del new_rows
     gc.collect()
 
-
-
     ###########################################################################
     log.write(" -Initiating result DataFrame...")
     columns=["SNPID","CHR","POS","EA","NEA","_BETAW_SUM","_EA_N","_NEA_N","_BETA2W_SUM","_W_SUM","EAF","N","DIRECTION","BETA","SE","DOF"]
@@ -231,4 +234,165 @@ def get_sumstats(input_path,usekeys=None):
         sumstats = sumstats[usekeys]
     else:
         sumstats = Sumstats(path,fmt="auto",verbose=False,usekeys=usekeys,**path_args).data
-    return sumstats
+    return sumstats
+
+
+############################################################################################################################################################################
+
+def meta_analyze_multi(sumstats_multi,
+                       random_effects=False,
+                       nstudy=1,
+                       match_allele=True,
+                       log=Log()):
+    log.write("Start to perform meta-analysis...")
+    ###########################################################################
+    log.write(" -Initiating result DataFrame...")
+    sumstats_multi["_INDEX"] = range(len(sumstats_multi))
+    results_df = _init_result_df(sumstats_multi)
+    ##########################################################################
+
+
+    log.write(" -Iterating through {} datasets to compute statistics for fixed-effect model...".format(nstudy))
+    for i in range(nstudy):
+        n="N_{}".format(i+1)
+        beta="BETA_{}".format(i+1)
+        se="SE_{}".format(i+1)
+        eaf="EAF_{}".format(i+1)
+        single_study_cols=[n,beta,se,eaf,"SNPID","_INDEX"]
+        to_use_sumstats = sumstats_multi.loc[~sumstats_multi["BETA_{}".format(i+1)].isna(),single_study_cols].drop_duplicates(subset="_INDEX").set_index("_INDEX")
+
+
+        sumstats_index = to_use_sumstats.index
+
+        results_df_not_in_sumstat_index = results_df.index[~results_df.index.isin(to_use_sumstats.index)]
+
+        # N and DOF
+        results_df.loc[sumstats_index, "N"] += to_use_sumstats[n].fillna(0)
+        results_df.loc[sumstats_index, "DOF"] += 1
+
+        # BEAT and SE
+        results_df.loc[sumstats_index,"_BETA2W_SUM"] += to_use_sumstats[beta]**2 *(1/(to_use_sumstats[se]**2))
+        results_df.loc[sumstats_index,"_BETAW_SUM"] += to_use_sumstats[beta]*(1/(to_use_sumstats[se]**2))
+        results_df.loc[sumstats_index,"_W_SUM"] += 1/(to_use_sumstats[se]**2)
+        results_df.loc[sumstats_index,"_W2_SUM"] += results_df.loc[sumstats_index,"_W_SUM"]**2
+
+        # EAF
+        results_df.loc[sumstats_index,"_EA_N"] += to_use_sumstats[n]*to_use_sumstats[eaf]
+        results_df.loc[sumstats_index,"_NEA_N"] += to_use_sumstats[n]*(1 - to_use_sumstats[eaf])
+
+        # DIRECTION
+        beta_index = to_use_sumstats[to_use_sumstats[beta]>0].index
+        results_df.loc[beta_index, "DIRECTION"] += "+"
+        beta_index = to_use_sumstats[to_use_sumstats[beta]==0].index
+        results_df.loc[beta_index, "DIRECTION"] += "0"
+        beta_index = to_use_sumstats[to_use_sumstats[beta]<0].index
+        results_df.loc[beta_index, "DIRECTION"] += "-"
+        results_df.loc[results_df_not_in_sumstat_index, "DIRECTION"] += "?"
+
+        del to_use_sumstats
+        gc.collect()
+
+    ##############################################################################
+    # fixed - effect statistics
+    results_df["BETA"] = results_df["_BETAW_SUM"] / results_df["_W_SUM"]
+    results_df["EAF"] = results_df["_EA_N"] / (results_df["_EA_N"] + results_df["_NEA_N"])
+    results_df["SE"] = np.sqrt(1/results_df["_W_SUM"])
+    results_df["Z"] = results_df["BETA"] / results_df["SE"]
+    results_df["P"] = norm.sf(abs(results_df["Z"]))*2
+    results_df["Q"] = results_df["_BETA2W_SUM"] - (results_df["_BETAW_SUM"]**2 / results_df["_W_SUM"])
+
+    for dof in results_df["DOF"].unique():
+        results_df_dof_index = results_df["DOF"] == dof
+        results_df.loc[results_df_dof_index,"P_HET"] = chi2.sf(results_df.loc[results_df_dof_index, "Q"].values,dof)
+    gc.collect()
+
+    results_df["I2"] = (results_df["Q"] - results_df["DOF"])/results_df["Q"]
+    results_df.loc[results_df["I2"]<0, "I2"] = 0
+
+    results_df=results_df.drop(columns=["_EA_N","_NEA_N"])
+    gc.collect()
+
+    ###########################################################################
+    if random_effects==True:
+        log.write(" -Iterating through {} datasets to compute statistics for random-effects model...".format(nstudy))
+        results_df["_R2"] = (results_df["Q"] - results_df["DOF"])/(results_df["_W_SUM"] - (results_df["_W2_SUM"]/results_df["_W_SUM"]))
+        results_df.loc[results_df["_R2"]<0, "_R2"] = 0
+        variant_index_random = results_df[results_df["_R2"]>0].index
+
+        results_df["_BETAW_SUM_R"] = 0.0
+        results_df["_W_SUM_R"] = 0.0
+        results_df["BETA_RANDOM"] = results_df["BETA"]
+        results_df["SE_RANDOM"] = results_df["SE"]
+
+        for i in range(nstudy):
+            n="N_{}".format(i+1)
+            beta="BETA_{}".format(i+1)
+            se="SE_{}".format(i+1)
+            eaf="EAF_{}".format(i+1)
+            single_study_cols=[n,beta,se,eaf,"SNPID","_INDEX"]
+            to_use_sumstats = sumstats_multi.loc[~sumstats_multi["BETA_{}".format(i+1)].isna(),single_study_cols].drop_duplicates(subset="_INDEX").set_index("_INDEX")
+            sumstats_index = to_use_sumstats.index
+
+            # BEAT and SE
+            results_df.loc[sumstats_index,"_BETAW_SUM_R"] += to_use_sumstats[beta]*(1/(to_use_sumstats[se]**2 + results_df.loc[sumstats_index,"_R2"]))
+            results_df.loc[sumstats_index,"_W_SUM_R"] += 1/(to_use_sumstats[se]**2 + results_df.loc[sumstats_index,"_R2"])
+
+            del to_use_sumstats
+            del sumstats_index
+            gc.collect()
+
+        results_df.loc[variant_index_random,"BETA_RANDOM"] = results_df.loc[variant_index_random,"_BETAW_SUM_R"] / results_df.loc[variant_index_random,"_W_SUM_R"]
+        results_df.loc[variant_index_random,"SE_RANDOM"] = np.sqrt(1/results_df.loc[variant_index_random,"_W_SUM_R"])
+        results_df["Z_RANDOM"] = results_df["BETA_RANDOM"] / results_df["SE_RANDOM"]
+        results_df["P_RANDOM"] = norm.sf(abs(results_df["Z_RANDOM"]))*2
+        results_df = results_df.drop(columns=["_BETAW_SUM_R","_W_SUM_R"])
+
+        gc.collect()
+    ###########################################################################
+    results_df = results_df.drop(columns=["_BETAW_SUM","_BETA2W_SUM","_W_SUM","_R2","_W2_SUM"]).sort_values(by=["CHR","POS"]).reset_index()
+    gc.collect()
+    log.write("Finished meta-analysis successfully!")
+
+    if random_effects==True:
+        other_cols = ["BETA_RANDOM","SE_RANDOM","Z_RANDOM","P_RANDOM"]
+    else:
+        other_cols = []
+
+    results_df = results_df.drop(columns=["_INDEX"])
+
+    results_df = Sumstats(results_df, fmt="gwaslab", other = other_cols)
+
+    return results_df
+
+def _init_result_df(sumstats):
+
+    results_df = sumstats[["_INDEX","SNPID","CHR","POS","EA","NEA"]]
+    results_df = results_df.drop_duplicates(subset="_INDEX").set_index("_INDEX")
+
+    results_df["N"] = 0
+    results_df["_BETAW_SUM"] = 0.0
+    results_df["_BETA2W_SUM"] = 0.0
+    results_df["_W_SUM"] = 0.0
+    results_df["_W2_SUM"] = 0.0
+    results_df["_EA_N"] = 0.0
+    results_df["_NEA_N"] = 0.0
+    results_df["N"] = 0
+    results_df["DIRECTION"] = ""
+    results_df["BETA"] = 0.0
+    results_df["SE"] = 0.0
+    results_df["DOF"] = -1
+    results_df["_R2"] = 0
+
+    dtype_dict ={
+        "_BETAW_SUM":"float64",
+        "_EA_N":"float64",
+        "_NEA_N":"float64",
+        "_BETA2W_SUM":"float64",
+        "_W_SUM":"float64",
+        "BETA":"float64",
+        "SE":"float64",
+        "N":"Int64",
+        "DOF":"Int64"
+    }
+    results_df=results_df.astype(dtype_dict)
+    return results_df