gwaslab 3.5.6__py3-none-any.whl → 3.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/__init__.py +2 -0
- gwaslab/bd_common_data.py +1 -0
- gwaslab/bd_get_hapmap3.py +0 -1
- gwaslab/data/formatbook.json +78 -0
- gwaslab/g_Sumstats.py +98 -24
- gwaslab/g_SumstatsMulti.py +287 -0
- gwaslab/g_SumstatsPair.py +101 -16
- gwaslab/g_Sumstats_polars.py +245 -0
- gwaslab/g_headers.py +12 -3
- gwaslab/g_meta.py +123 -47
- gwaslab/g_meta_update.py +48 -0
- gwaslab/g_vchange_status_polars.py +44 -0
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +169 -110
- gwaslab/hm_casting_polars.py +202 -0
- gwaslab/hm_harmonize_sumstats.py +19 -8
- gwaslab/io_load_ld.py +529 -0
- gwaslab/io_preformat_input.py +11 -0
- gwaslab/io_preformat_input_polars.py +632 -0
- gwaslab/io_process_args.py +25 -1
- gwaslab/io_read_ldsc.py +34 -3
- gwaslab/io_read_pipcs.py +62 -6
- gwaslab/prscs_gigrnd.py +122 -0
- gwaslab/prscs_mcmc_gtb.py +136 -0
- gwaslab/prscs_parse_genet.py +98 -0
- gwaslab/qc_build.py +53 -0
- gwaslab/qc_check_datatype.py +10 -8
- gwaslab/qc_check_datatype_polars.py +128 -0
- gwaslab/qc_fix_sumstats.py +25 -23
- gwaslab/qc_fix_sumstats_polars.py +193 -0
- gwaslab/util_ex_calculate_ldmatrix.py +49 -19
- gwaslab/util_ex_gwascatalog.py +71 -28
- gwaslab/util_ex_ldsc.py +67 -21
- gwaslab/util_ex_match_ldmatrix.py +396 -0
- gwaslab/util_ex_run_2samplemr.py +0 -2
- gwaslab/util_ex_run_ccgwas.py +155 -0
- gwaslab/util_ex_run_coloc.py +1 -1
- gwaslab/util_ex_run_hyprcoloc.py +117 -0
- gwaslab/util_ex_run_mesusie.py +155 -0
- gwaslab/util_ex_run_mtag.py +92 -0
- gwaslab/util_ex_run_prscs.py +85 -0
- gwaslab/util_ex_run_susie.py +40 -9
- gwaslab/util_in_estimate_ess.py +18 -0
- gwaslab/util_in_fill_data.py +20 -1
- gwaslab/util_in_filter_value.py +10 -5
- gwaslab/util_in_get_sig.py +71 -13
- gwaslab/util_in_meta.py +168 -4
- gwaslab/util_in_meta_polars.py +174 -0
- gwaslab/viz_plot_compare_effect.py +87 -23
- gwaslab/viz_plot_credible_sets.py +55 -11
- gwaslab/viz_plot_effect.py +22 -12
- gwaslab/viz_plot_miamiplot2.py +3 -2
- gwaslab/viz_plot_mqqplot.py +165 -141
- gwaslab/viz_plot_qqplot.py +6 -6
- gwaslab/viz_plot_regional2.py +5 -13
- gwaslab/viz_plot_rg_heatmap.py +6 -1
- gwaslab/viz_plot_stackedregional.py +21 -6
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/METADATA +9 -7
- gwaslab-3.5.8.dist-info/RECORD +117 -0
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/WHEEL +1 -1
- gwaslab-3.5.6.dist-info/RECORD +0 -96
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE +0 -0
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import os
|
|
3
|
+
import gc
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import numpy as np
|
|
6
|
+
from gwaslab.g_Log import Log
|
|
7
|
+
from gwaslab.g_version import _checking_r_version
|
|
8
|
+
from gwaslab.g_version import _check_susie_version
|
|
9
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
10
|
+
from gwaslab.qc_fix_sumstats import finished
|
|
11
|
+
from gwaslab.util_ex_calculate_ldmatrix import _extract_variants_in_locus
|
|
12
|
+
from gwaslab.util_in_get_sig import getsig
|
|
13
|
+
|
|
14
|
+
def _run_ccgwas(sumstats_pair,
                r="Rscript",
                group="Group1",
                studies=None,
                traits=None,
                meta=None,
                ldsc=None,
                ldsc_rg=None,
                nstudy=2,
                K_A1A0=None,
                K_A1A0_high=None,
                K_A1A0_low=None,
                K_B1B0=None,
                K_B1B0_high=None,
                K_B1B0_low=None,
                h2l_A1A0=None,
                h2l_B1B0=None,
                rg_A1A0_B1B0=None,
                intercept_A1A0_B1B0=None,
                m=1e4,
                N_A1=None,
                N_B1=None,
                N_A0=None,
                N_B0=None,
                N_overlap_A0B0=0,
                log=Log(),
                verbose=True):
    """Write CC-GWAS input files and an R script, then run the script with ``r``.

    Method reference: Peyrot & Price (2021), Nature Genetics 53(4), 445-454.

    Parameters
    ----------
    sumstats_pair : table-like
        Must provide the columns SNPID, CHR, POS, EA, NEA and, for each study
        ``i`` (1-based), EAF_i, OR_i, SE_i, P_i and N_EFF_i.
    r : str
        Command used to launch the R script.
    group : str
        Label used in every file name written by this function.
    studies : sequence of str
        Study names; ``studies[0]`` / ``studies[1]`` are study A / study B.
    traits : unused.
    meta : dict or None
        When given, ``meta["gwaslab"]["objects"][i]["gwaslab"]
        ["population_prevalence"]`` is read for i = 0 and 1. A value other
        than ``"Unknown"`` overrides K_*, and sets K_*_high / K_*_low to
        K * 1.1 and K / 1.1.
    ldsc : sequence
        ``ldsc[i].loc[0, "h2_liab"]`` is used when the matching ``h2l_*``
        argument is None.
    ldsc_rg : table-like
        Row with ``p1 == studies[0]`` and ``p2 == studies[1]`` supplies ``rg``
        and ``gcov_int`` when the matching argument is None.
    nstudy : int
        Number of per-study input files to write.
    K_*, h2l_*, rg_A1A0_B1B0, intercept_A1A0_B1B0, m, N_*, N_overlap_A0B0
        Passed through to the R ``CCGWAS()`` call under the same names.
    log : Log
        Logger; rebound to the return value of ``_checking_r_version``.
    verbose : bool
        Forwarded to ``log.write``.

    Returns
    -------
    str
        ``"<group>_ccgwas"``, the value passed to R as ``outcome_file``.
    """
    log.write(" Start to run CCGWAS from command line:", verbose=verbose)
    log.write(" -Methods: : {}...".format("Peyrot, W. J., & Price, A. L. (2021). Identifying loci with different allele frequencies among cases of eight psychiatric disorders using CC-GWAS. Nature genetics, 53(4), 445-454."), verbose=verbose)
    # CCGWAS input columns: SNP, CHR, BP, EA, NEA, FRQ, OR, SE, P, Neff
    log.write(" -Running CCGWAS for: {}...".format(group), verbose=verbose)

    snp_info_cols = ["SNPID", "CHR", "POS", "EA", "NEA"]
    stats_cols = ["EAF", "OR", "SE", "P", "N_EFF"]

    # `meta` defaults to None; only consult it when it was actually supplied.
    if meta is not None:
        objects = meta["gwaslab"]["objects"]

        prevalence_a = objects[0]["gwaslab"]["population_prevalence"]
        if prevalence_a != "Unknown":
            K_A1A0 = float(prevalence_a)
            K_A1A0_high = 1.1 * K_A1A0
            K_A1A0_low = K_A1A0 / 1.1

        prevalence_b = objects[1]["gwaslab"]["population_prevalence"]
        if prevalence_b != "Unknown":
            K_B1B0 = float(prevalence_b)
            K_B1B0_high = 1.1 * K_B1B0
            K_B1B0_low = K_B1B0 / 1.1

    if h2l_A1A0 is None:
        h2l_A1A0 = ldsc[0].loc[0, "h2_liab"]
    if h2l_B1B0 is None:
        h2l_B1B0 = ldsc[1].loc[0, "h2_liab"]

    # Select the A-vs-B row once; it serves both rg and the intercept.
    if rg_A1A0_B1B0 is None or intercept_A1A0_B1B0 is None:
        pair_rows = ldsc_rg.loc[(ldsc_rg["p1"] == studies[0]) & (ldsc_rg["p2"] == studies[1]), :]
        if rg_A1A0_B1B0 is None:
            rg_A1A0_B1B0 = pair_rows.iloc[0, ldsc_rg.columns.get_loc("rg")]
        if intercept_A1A0_B1B0 is None:
            intercept_A1A0_B1B0 = pair_rows.iloc[0, ldsc_rg.columns.get_loc("gcov_int")]

    # Write one input file per study, renamed to the CCGWAS column names.
    for i in range(nstudy):
        suffix = "_{}".format(i + 1)
        output_cols = snp_info_cols + [col + suffix for col in stats_cols]
        rename_map = {"SNPID": "SNP",
                      "POS": "BP",
                      "EAF" + suffix: "FRQ",
                      "OR" + suffix: "OR",
                      "SE" + suffix: "SE",
                      "P" + suffix: "P",
                      "N_EFF" + suffix: "Neff"}
        sumstats_pair[output_cols].rename(columns=rename_map).to_csv(
            "./{}_{}.txt.gz".format(group, studies[i]), index=None, sep="\t")

    output_prefix = "{group}_ccgwas".format(group=group)
    log = _checking_r_version(r, log)

    r_arguments = {"K_A1A0": K_A1A0,
                   "K_A1A0_high": K_A1A0_high,
                   "K_A1A0_low": K_A1A0_low,
                   "K_B1B0": K_B1B0,
                   "K_B1B0_high": K_B1B0_high,
                   "K_B1B0_low": K_B1B0_low,
                   "h2l_A1A0": h2l_A1A0,
                   "h2l_B1B0": h2l_B1B0,
                   "rg_A1A0_B1B0": rg_A1A0_B1B0,
                   "intercept_A1A0_B1B0": intercept_A1A0_B1B0,
                   "m": m,
                   "N_A1": N_A1,
                   "N_B1": N_B1,
                   "N_A0": N_A0,
                   "N_B0": N_B0,
                   "N_overlap_A0B0": N_overlap_A0B0}

    # An unset value is formatted into the script as the text "None";
    # surface it in the log instead of leaving it to an R error.
    unset = [name for name, value in r_arguments.items() if value is None]
    if unset:
        log.write(" -Warning: CCGWAS arguments not set (written to the R script as None): {}".format(", ".join(unset)), verbose=verbose)

    rscript = '''
library(data.table)
library(R.utils)
library(CCGWAS)

CCGWAS( outcome_file = "{output_prefix}" ,
        A_name = "{study1}" ,
        B_name = "{study2}" ,
        sumstats_fileA1A0 = "./{group}_{study1}.txt.gz" ,
        sumstats_fileB1B0 = "./{group}_{study2}.txt.gz" ,
        K_A1A0 = {K_A1A0} ,
        K_A1A0_high = {K_A1A0_high} ,
        K_A1A0_low = {K_A1A0_low},

        K_B1B0 ={K_B1B0} ,
        K_B1B0_high ={K_B1B0_high} ,
        K_B1B0_low = {K_B1B0_low} ,

        h2l_A1A0 ={h2l_A1A0},
        h2l_B1B0 = {h2l_B1B0} ,
        rg_A1A0_B1B0 = {rg_A1A0_B1B0} ,

        intercept_A1A0_B1B0 = {intercept_A1A0_B1B0} ,
        m = {m} ,
        N_A1 = {N_A1} ,
        N_B1 = {N_B1} ,
        N_A0 = {N_A0} ,
        N_B0 = {N_B0} ,
        N_overlap_A0B0 = {N_overlap_A0B0} )
'''.format(output_prefix=output_prefix,
           study1=studies[0],
           study2=studies[1],
           group=group,
           **r_arguments)

    script_path = "_{}_gwaslab_ccgwas_temp.R".format(group)
    with open(script_path, "w") as file:
        file.write(rscript)

    script_run_r = "{} {}".format(r, script_path)

    try:
        log.write(" -Running CCGWAS from command line...", verbose=verbose)
        subprocess.check_output(script_run_r, stderr=subprocess.STDOUT, shell=True, text=True)
    except subprocess.CalledProcessError as e:
        # A failing R run is logged, not raised: the function still returns.
        log.write(e.output)

    log.write(" -Finishing CCGWAS for {}...".format(group), verbose=verbose)
    log.write("Finished Case-case GWAS using CCGWAS.", verbose=verbose)
    return output_prefix
|
gwaslab/util_ex_run_coloc.py
CHANGED
|
@@ -58,7 +58,7 @@ def _run_coloc_susie(filepath, r="Rscript",
|
|
|
58
58
|
rscript='''
|
|
59
59
|
library(coloc)
|
|
60
60
|
|
|
61
|
-
df = read.csv("{sumstats_path}",header=TRUE)
|
|
61
|
+
df = read.csv("{sumstats_path}",sep="\t",header=TRUE)
|
|
62
62
|
|
|
63
63
|
R <- as.matrix(read.csv("{ld_r_matrix_path}",sep="\t",header=FALSE))
|
|
64
64
|
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import os
|
|
3
|
+
import gc
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import numpy as np
|
|
6
|
+
from gwaslab.g_Log import Log
|
|
7
|
+
from gwaslab.g_version import _checking_r_version
|
|
8
|
+
from gwaslab.g_version import _check_susie_version
|
|
9
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
10
|
+
from gwaslab.qc_fix_sumstats import finished
|
|
11
|
+
from gwaslab.util_ex_calculate_ldmatrix import _extract_variants_in_locus
|
|
12
|
+
from gwaslab.util_in_get_sig import getsig
|
|
13
|
+
|
|
14
|
+
def _run_hyprcoloc(sumstats_multi,
                   r="Rscript",
                   study="Group1",
                   traits=None,
                   types=None,
                   loci=None,
                   nstudy=2,
                   windowsizekb=1000,
                   build="99",
                   log=Log(),
                   verbose=True):
    """Run the R package hyprcoloc once per locus and collect the results.

    For each lead variant, the BETA_i / SE_i columns (i = 1..nstudy) of the
    variants returned by ``_extract_variants_in_locus`` are written to two
    gzipped TSV files. An R script is then generated and run with ``r``, and
    the ``.res`` file it writes is read back.

    Parameters
    ----------
    sumstats_multi : pandas.DataFrame
        Must contain SNPID, CHR, POS and BETA_i / SE_i for every study;
        P_MIN is also needed when ``loci`` is None.
    r : str
        Command used to launch the R script.
    study : str
        Label used as the prefix of every file written.
    traits : sequence of str or None
        Trait names passed to hyprcoloc; defaults to trait_1..trait_nstudy.
    types : unused.
    loci : collection of str or None
        SNPIDs of the lead variants to analyse. When None, lead variants are
        obtained from ``getsig`` on the P_MIN column.
    nstudy : int
        Number of studies in ``sumstats_multi``.
    windowsizekb : int
        Forwarded to ``_extract_variants_in_locus``.
    build : str
        Forwarded to ``getsig``.
    log : Log
        Logger; rebound to the return value of ``_checking_r_version``.
    verbose : bool
        Forwarded to ``log.write``.

    Returns
    -------
    pandas.DataFrame
        The per-locus result tables stacked together, with an added PREFIX
        column. Empty when no locus produced a result.
    """
    log.write(" Start to run hyprcoloc from command line:", verbose=verbose)

    if traits is None:
        quoted_traits = ['"trait_{}"'.format(i + 1) for i in range(nstudy)]
    else:
        quoted_traits = ['"{}"'.format(trait) for trait in traits]
    traits_string = ",".join(quoted_traits)

    if loci is None:
        log.write(" -Loci were not provided. All significant loci will be automatically extracted...", verbose=verbose)
        sig_df = getsig(sumstats_multi, id="SNPID", chrom="CHR", pos="POS", p="P_MIN", build=build)
    else:
        sig_df = sumstats_multi.loc[sumstats_multi["SNPID"].isin(loci), :]

    # NOTE(review): defensive guard in case getsig yields None when nothing
    # is significant - confirm against getsig's contract.
    if sig_df is None:
        sig_df = pd.DataFrame()

    # Column lists depend only on nstudy, so build them once.
    output_beta_cols = ["BETA_{}".format(i + 1) for i in range(nstudy)]
    output_se_cols = ["SE_{}".format(i + 1) for i in range(nstudy)]

    # Collect per-locus tables and concatenate once at the end instead of
    # repeatedly concatenating onto a growing (initially empty) frame.
    per_locus_results = []

    for index, row in sig_df.iterrows():
        locus = row["SNPID"]
        log.write(" -Running hyprcoloc for locus : {}...".format(locus), verbose=verbose)

        matched_sumstats = _extract_variants_in_locus(sumstats_multi, windowsizekb, locus=(row["CHR"], row["POS"]))

        # Only variants with complete BETA/SE in every study are exported.
        to_export = matched_sumstats[["SNPID"] + output_se_cols + output_beta_cols].dropna()

        if len(to_export) == 0:
            log.write(" -No shared variants in locus {}...skipping".format(locus), verbose=verbose)
            continue

        log.write(" -Number of shared variants in locus {} : {}...".format(locus, len(to_export)), verbose=verbose)
        to_export[["SNPID"] + output_beta_cols].to_csv("{}_{}_beta_cols.tsv.gz".format(study, locus), index=None, sep="\t")
        to_export[["SNPID"] + output_se_cols].to_csv("{}_{}_se_cols.tsv.gz".format(study, locus), index=None, sep="\t")

        log = _checking_r_version(r, log)

        rscript = '''
library(hyprcoloc)

betas<-read.csv("{study}_{locus}_beta_cols.tsv.gz",row.names = 1,sep="\\t")
ses <-read.csv("{study}_{locus}_se_cols.tsv.gz",row.names = 1,sep="\\t")

betas <- as.matrix(betas)
ses <- as.matrix(ses)

traits <- c({traits_string})
snpid <- rownames(betas)

res <- hyprcoloc(betas,
                 ses,
                 trait.names = traits,
                 snp.id = snpid,
                 snpscore=TRUE)

write.csv(res[[1]], "{study}_{locus}_{nstudy}studies.res",row.names = FALSE)
'''.format(study=study,
           locus=locus,
           nstudy=nstudy,
           traits_string=traits_string)

        output_prefix = "{study}_{locus}_{nstudy}studies".format(study=study, locus=locus, nstudy=nstudy)
        output_path = output_prefix + ".res"

        script_path = "_{}_{}_gwaslab_hyprcoloc_temp.R".format(study, locus)
        with open(script_path, "w") as file:
            file.write(rscript)

        script_run_r = "{} {}".format(r, script_path)

        try:
            log.write(" Running hyprcoloc from command line...", verbose=verbose)
            subprocess.check_output(script_run_r, stderr=subprocess.STDOUT, shell=True, text=True)
            hyprcoloc_res = pd.read_csv(output_path)
            hyprcoloc_res["PREFIX"] = output_prefix
            per_locus_results.append(hyprcoloc_res)
        except subprocess.CalledProcessError as e:
            # A failing R run is logged and the locus is skipped.
            log.write(e.output)

        log.write(" -Finishing hyprcoloc for locus : {}...".format(locus), verbose=verbose)

    if per_locus_results:
        hyprcoloc_res_combined = pd.concat(per_locus_results, ignore_index=True)
    else:
        hyprcoloc_res_combined = pd.DataFrame()

    log.write("Finished colocalization using hyprcoloc.", verbose=verbose)
    return hyprcoloc_res_combined
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import os
|
|
3
|
+
import gc
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import numpy as np
|
|
6
|
+
from gwaslab.g_Log import Log
|
|
7
|
+
from gwaslab.g_version import _checking_r_version
|
|
8
|
+
from gwaslab.g_version import _check_susie_version
|
|
9
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
10
|
+
from gwaslab.qc_fix_sumstats import finished
|
|
11
|
+
from gwaslab.viz_plot_stackedregional import _sort_args
|
|
12
|
+
|
|
13
|
+
def _run_mesusie(filepath,
                 r="Rscript",
                 types=None, ns=None,
                 fillldna=True, delete=False,
                 coloc_args="",
                 susie_args="",
                 ncols=None,
                 d1_args="",
                 d2_args="",
                 log=Log(),
                 verbose=True):
    """Generate and run an R script that fine-maps one locus with MESuSiE.

    ``filepath`` points to a tab-separated file list with one row per study
    and the columns STUDY, GROUP, LD_R_MATRIX, LOCUS_SUMSTATS, LOCUS and
    SNPID. Each row contributes a loading section to a single R script.
    ``meSuSie_core`` is then run once on the combined lists, and the results
    are written to ``<group>_<locus>.*`` files.

    Parameters
    ----------
    filepath : str or None
        Path to the file list. None short-circuits with an empty DataFrame.
    r : str
        Command used to launch the R script.
    ns : sequence or None
        Sample size per study, indexed by the row index of the file list.
        Falls back to ``ncols`` when None.
    ncols : sequence or None
        Fallback for ``ns``.
    types, fillldna, delete, coloc_args, susie_args, d1_args, d2_args
        Accepted but not used by this function.
    log : Log
        Logger; rebound to the return value of ``_checking_r_version``.
    verbose : bool
        Forwarded to ``log.write``.

    Returns
    -------
    str or pandas.DataFrame
        ``"./<group>_@.pipcs"`` (using the group of the last row) after a
        run. An empty DataFrame when ``filepath`` is None or the file list
        has no rows.

    Raises
    ------
    ValueError
        If neither ``ns`` nor ``ncols`` is provided.
    """
    log.write(" Start to run mesusie from command line:", verbose=verbose)

    if ns is None and ncols is not None:
        ns = ncols
    if ns is not None:
        # Identical to "{} and {}" for two studies, but safe for any count.
        log.write(" -Ns: {}".format(" and ".join(str(n) for n in ns)), verbose=verbose)

    if filepath is None:
        log.write(" -File path is None.", verbose=verbose)
        log.write("Finished finemapping using MESuSie.", verbose=verbose)
        return pd.DataFrame()

    filelist = pd.read_csv(filepath, sep="\t")

    if len(filelist) == 0:
        # Without rows there is no group/locus to build a script for.
        log.write(" -File list is empty.", verbose=verbose)
        log.write("Finished finemapping using MESuSie.", verbose=verbose)
        return pd.DataFrame()

    if ns is None:
        raise ValueError("Sample sizes are required for MESuSiE: please provide ns (or ncols).")

    log = _checking_r_version(r, log)

    r_script_init = '''
library(MESuSiE)
ld_list <- list()
summ_stat_list <- list()
'''
    r_scripts_for_loading = [r_script_init]

    # The first study supplies SNP / CHR / POS for the output table.
    study0 = filelist["STUDY"].iloc[0]

    for index, row in filelist.iterrows():
        study = row["STUDY"]
        group = row["GROUP"]
        ld_r_matrix = row["LD_R_MATRIX"]
        sumstats = row["LOCUS_SUMSTATS"]
        locus = row["LOCUS"]

        log.write(" -Running for: {} - {}".format(row["SNPID"], row["STUDY"]), verbose=verbose)
        log.write(" -Locus sumstats:{}".format(sumstats), verbose=verbose)
        log.write(" -LD r matrix:{}".format(ld_r_matrix), verbose=verbose)

        rscript = '''
sum{index} <- read.csv("{sumstats}",sep="\\t")
sum{index}$Z <- sum{index}$Beta/sum{index}$Se
sum{index}$N <- {n}
ld{index} <- read.csv("{ld_r_matrix}",sep="\\t",header=FALSE)
ld{index}[is.na(ld{index})] <- 0
names(ld{index}) <- sum{index}$SNP
ld_list${study} <- as.matrix(ld{index})
summ_stat_list${study} <- sum{index}

png(filename="./diagnostic_{group}_{locus}_{index}.png")
diagnostic <- kriging_rss(summ_stat_list${study}$Z, ld_list${study})
diagnostic$plot
dev.off()
'''.format(index=index,
           study=study,
           group=group,
           locus=locus,
           n=ns[index],
           sumstats=sumstats,
           ld_r_matrix=ld_r_matrix)
        r_scripts_for_loading.append(rscript)

    rscript_loading = "".join(r_scripts_for_loading)

    rscript_computing = '''
MESuSiE_res<-meSuSie_core(ld_list, summ_stat_list, L=10)'''

    # seq_along() is used instead of 1:length(): with zero credible sets,
    # 1:0 would iterate over c(1, 0) and index a non-existent element.
    rscript_output = '''
saveRDS(MESuSiE_res, file = "{group}_{locus}.rds")
pips <- cbind(summ_stat_list${study0}$SNP, summ_stat_list${study0}$CHR, summ_stat_list${study0}$POS, MESuSiE_res$pip_config)
colnames(pips)[1] <-"SNPID"
colnames(pips)[2] <-"CHR"
colnames(pips)[3] <-"POS"
pips <- data.frame(pips)
pips[c("CREDIBLE_SET_INDEX")] <- 0
pips[c("CS_CATEGORY")] <- NA
for (i in seq_along(MESuSiE_res$cs$cs)) {{
    pips[MESuSiE_res$cs$cs[[i]],c("CREDIBLE_SET_INDEX")]<-i
    pips[MESuSiE_res$cs$cs[[i]],c("CS_CATEGORY")] <- MESuSiE_res$cs$cs_category[[i]]
}}
write.csv(pips, "{group}_{locus}.pipcs", row.names = FALSE)

write.csv(MESuSiE_res$cs$cs_index, "{group}_{locus}.cscs_index", row.names = FALSE)
write.csv(MESuSiE_res$cs$purity, "{group}_{locus}.cspurity", row.names = FALSE)
write.csv(MESuSiE_res$cs$cs_category, "{group}_{locus}.cscs_category", row.names = FALSE)


for (p in MESuSiE_res$cs$cs) {{
    write(p,"{group}_{locus}.cscs_i", append=TRUE, sep="\t", ncolumns=10000000)
    write(summ_stat_list${study0}$SNP[p],"{group}_{locus}.cscs_snpid", append=TRUE, sep="\t", ncolumns=10000000)
}}

'''.format(group=group, locus=locus, study0=study0)

    rscript_plotting = '''
png(filename="./{group}_{locus}_stacked_regions.png")
MESuSiE_Plot(MESuSiE_res, ld_list ,summ_stat_list)
dev.off()
'''.format(group=group, locus=locus)

    rscript = rscript_loading + rscript_computing + rscript_output + rscript_plotting

    log.write(" -MESuSie script: {}".format(rscript_computing), verbose=verbose)

    # The script file is named after the group and SNPID of the last row.
    script_path = "_{}_{}_gwaslab_mesusie_temp.R".format(group, row["SNPID"])
    with open(script_path, "w") as file:
        file.write(rscript)

    script_run_r = "{} {}".format(r, script_path)

    try:
        # Announce the run before it starts, not after it has finished.
        log.write(" Running MESuSie from command line...", verbose=verbose)
        subprocess.check_output(script_run_r, stderr=subprocess.STDOUT, shell=True, text=True)
    except subprocess.CalledProcessError as e:
        # A failing R run is logged, not raised: the function still returns.
        log.write(e.output)

    log.write("Finished cross ancestry finemapping using MESuSie.", verbose=verbose)
    return "./{}_@.pipcs".format(group)
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import os
|
|
3
|
+
import gc
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import numpy as np
|
|
6
|
+
from gwaslab.g_Log import Log
|
|
7
|
+
|
|
8
|
+
def _run_mtag(sumstats_multi,
              python="Rscript",
              mtag="",
              study="Group1",
              traits=None,
              out_prefix=None,
              types=None,
              n_min=0,
              loci=None,
              nstudy=2,
              windowsizekb=1000,
              build="99",
              log=Log(),
              verbose=True):
    """Export per-trait summary statistics and run MTAG through a shell script.

    For each study ``i`` (1-based) the columns rsID, CHR, POS, EA, NEA, Z_i,
    P_i, EAF_i and N_i are written to ``<study>_<trait>.tsv.gz`` under the
    names snpid, chr, bpos, a1, a2, z, pval, freq and n. A one-line shell
    script calling ``<python> <mtag>`` on those files is then written, made
    executable and run.

    Parameters
    ----------
    sumstats_multi : pandas.DataFrame
        Table holding the columns listed above.
    python : str
        Interpreter command placed in front of ``mtag``.
        NOTE(review): the default is "Rscript", which looks copied from the
        R runners - callers presumably need to pass a Python interpreter.
    mtag : str
        Path to the MTAG script.
    study : str
        Label used in the names of all files written.
    traits : sequence of str or None
        Trait names used in the file names; defaults to
        trait_1..trait_nstudy.
    out_prefix : str or None
        Value for ``--out``; defaults to ``./<study>_<nstudy>studies``.
    n_min : number
        Value for ``--n_min``.
    nstudy : int
        Number of studies in ``sumstats_multi``.
    types, loci, windowsizekb, build
        Accepted but not used by this function.
    log : Log
        Logger.
    verbose : bool
        Forwarded to ``log.write``.

    Returns
    -------
    str
        The ``--out`` prefix handed to MTAG.
    """
    log.write("Start to run MTAG from command line:", verbose=verbose)

    if traits is None:
        trait_names = ["trait_{}".format(i + 1) for i in range(nstudy)]
    else:
        trait_names = ["{}".format(trait) for trait in traits]

    # MTAG input columns: snpid chr bpos a1 a2 freq z pval n
    output_snp_info_cols = ["rsID", "CHR", "POS", "EA", "NEA"]
    sumstats_paths = []

    for i in range(nstudy):
        suffix = "_{}".format(i + 1)
        output_stats_cols = [col + suffix for col in ["Z", "P", "EAF", "N"]]

        rename_dict = {"rsID": "snpid",
                       "CHR": "chr",
                       "POS": "bpos",
                       "EA": "a1",
                       "NEA": "a2",
                       "EAF" + suffix: "freq",
                       "Z" + suffix: "z",
                       "P" + suffix: "pval",
                       "N" + suffix: "n"}

        sumstats_path = "{}_{}.tsv.gz".format(study, trait_names[i])
        sumstats_multi[output_snp_info_cols + output_stats_cols].rename(columns=rename_dict).to_csv(
            sumstats_path, index=None, sep="\t")
        sumstats_paths.append(sumstats_path)

    if out_prefix is None:
        out_prefix = "./{study}_{nstudy}studies".format(study=study, nstudy=nstudy)

    # The command runs in the foreground: a trailing "&" would make the
    # wrapper script exit with status 0 whatever MTAG does, so failures
    # would never reach the CalledProcessError handler below.
    script = "{python} {mtag} --sumstats {sumstats_paths_string} --out {out_prefix} --n_min {n_min} --stream_stdout\n".format(
        python=python,
        mtag=mtag,
        sumstats_paths_string=",".join(sumstats_paths),
        out_prefix=out_prefix,
        n_min=n_min)
    log.write(" MTAG script: {} ".format(script), verbose=verbose)

    script_path = "_{}_gwaslab_mtag_temp.sh".format(study)
    with open(script_path, "w") as file:
        file.write(script)

    # Owner-only read/write/execute so the script can be invoked directly.
    os.chmod(script_path, 0o700)

    script_run = "./" + script_path

    try:
        log.write(" Running MTAG from command line...", verbose=verbose)
        output = subprocess.check_output(script_run, stderr=subprocess.STDOUT, shell=True, text=True)
        log.write(output)
    except subprocess.CalledProcessError as e:
        # A failing MTAG run is logged, not raised.
        log.write(e.output)

    log.write("Finished MTAG.", verbose=verbose)
    return out_prefix
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
PRS-CS: a polygenic prediction method that infers posterior SNP effect sizes under continuous shrinkage (CS) priors
|
|
5
|
+
using GWAS summary statistics and an external LD reference panel.
|
|
6
|
+
|
|
7
|
+
Reference: T Ge, CY Chen, Y Ni, YCA Feng, JW Smoller. Polygenic Prediction via Bayesian Regression and Continuous Shrinkage Priors.
|
|
8
|
+
Nature Communications, 10:1776, 2019.
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
python PRScs.py --ref_dir=PATH_TO_REFERENCE --bim_prefix=VALIDATION_BIM_PREFIX --sst_file=SUM_STATS_FILE --n_gwas=GWAS_SAMPLE_SIZE --out_dir=OUTPUT_DIR
|
|
13
|
+
[--a=PARAM_A --b=PARAM_B --phi=PARAM_PHI --n_iter=MCMC_ITERATIONS --n_burnin=MCMC_BURNIN --thin=MCMC_THINNING_FACTOR
|
|
14
|
+
--chrom=CHROM --write_psi=WRITE_PSI --write_pst=WRITE_POSTERIOR_SAMPLES --seed=SEED]
|
|
15
|
+
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
import os
|
|
20
|
+
import sys
|
|
21
|
+
import getopt
|
|
22
|
+
|
|
23
|
+
import gwaslab.prscs_parse_genet as parse_genet
|
|
24
|
+
import gwaslab.prscs_mcmc_gtb as mcmc_gtb
|
|
25
|
+
import gwaslab.prscs_gigrnd as gigrnd
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _run_prscs(
        ref_dir=None,
        bim_prefix=None,
        sst_file=None,
        a=1,
        b=0.5,
        phi=None,
        n_gwas=None,
        n_iter=1000,
        n_burnin=500,
        thin=5,
        out_dir="./",
        chrom=range(1, 23),
        beta_std='FALSE',
        write_psi='FALSE',
        write_pst='FALSE',
        seed=None,
        log=None,
        **kwargs):
    """Run PRS-CS chromosome by chromosome.

    Reference: T Ge, CY Chen, Y Ni, YCA Feng, JW Smoller. Polygenic Prediction
    via Bayesian Regression and Continuous Shrinkage Priors. Nature
    Communications, 10:1776, 2019.

    Parameters
    ----------
    ref_dir : str
        LD reference directory. Its final path component must contain
        ``1kg`` or ``ukbb``; this selects which ``snpinfo_*_hm3`` file is
        read.
    bim_prefix : str
        Passed to ``parse_genet.parse_bim``.
    sst_file : pandas.DataFrame
        Summary statistics; columns rsID and POS are renamed to SNP and BP
        before being passed to ``parse_genet.parse_sumstats``.
    a, b, phi, n_gwas, n_iter, n_burnin, thin, out_dir, beta_std,
    write_psi, write_pst, seed
        Forwarded unchanged to ``mcmc_gtb.mcmc``; ``n_gwas`` is also passed
        to ``parse_genet.parse_sumstats``.
    chrom : iterable
        Chromosomes to process; each item is converted with ``int``.
    log : Log or None
        Logger; a new ``Log`` is created when None.
    **kwargs
        Accepted and ignored.

    Raises
    ------
    ValueError
        If the name of ``ref_dir`` contains neither ``1kg`` nor ``ukbb``.
    """
    if log is None:
        # Imported here because the default is created lazily.
        from gwaslab.g_Log import Log
        log = Log()

    sst_file = sst_file.rename(columns={"rsID": "SNP", "POS": "BP"})
    log.write("Start to run PRScs...")

    # normpath drops a trailing separator, which would otherwise make
    # basename return "" and hide the panel name.
    ref_name = os.path.basename(os.path.normpath(ref_dir))
    if '1kg' in ref_name:
        snpinfo_path = ref_dir + '/snpinfo_1kg_hm3'
    elif 'ukbb' in ref_name:
        snpinfo_path = ref_dir + '/snpinfo_ukbb_hm3'
    else:
        raise ValueError(
            "Cannot determine the LD reference panel from ref_dir {!r}: "
            "the directory name must contain '1kg' or 'ukbb'.".format(ref_dir))

    for chrom_item in chrom:
        chrom_number = int(chrom_item)
        log.write('##### process chromosome %d #####' % chrom_number)

        ref_dict = parse_genet.parse_ref(snpinfo_path, chrom_number, log)

        vld_dict = parse_genet.parse_bim(bim_prefix, chrom_number)

        sst_dict = parse_genet.parse_sumstats(ref_dict, vld_dict, sst_file, n_gwas, log)

        ld_blk, blk_size = parse_genet.parse_ldblk(ref_dir, sst_dict, chrom_number, log)

        mcmc_gtb.mcmc(a, b, phi, sst_dict, n_gwas, ld_blk, blk_size,
                      n_iter, n_burnin, thin, chrom_number, out_dir, beta_std,
                      write_psi, write_pst, seed, log)

    log.write("Finished!")
|
|
85
|
+
|