gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/g_Log.py +14 -5
- gwaslab/g_Sumstats.py +86 -18
- gwaslab/g_SumstatsPair.py +70 -23
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +9 -4
- gwaslab/hm_harmonize_sumstats.py +88 -83
- gwaslab/io_preformat_input.py +14 -14
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +1 -1
- gwaslab/qc_fix_sumstats.py +163 -161
- gwaslab/util_ex_calculate_ldmatrix.py +2 -2
- gwaslab/util_ex_gwascatalog.py +24 -24
- gwaslab/util_ex_ldproxyfinder.py +9 -9
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +28 -28
- gwaslab/util_in_filter_value.py +91 -52
- gwaslab/util_in_get_density.py +8 -8
- gwaslab/util_in_get_sig.py +407 -65
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +18 -18
- gwaslab/viz_aux_reposition_text.py +3 -3
- gwaslab/viz_aux_save_figure.py +14 -5
- gwaslab/viz_plot_compare_af.py +29 -30
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +6 -6
- gwaslab/viz_plot_mqqplot.py +17 -3
- gwaslab/viz_plot_qqplot.py +1 -1
- gwaslab/viz_plot_regionalplot.py +33 -32
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +50 -55
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.38.dist-info/RECORD +0 -72
- /gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
|
@@ -199,7 +199,7 @@ def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=No
|
|
|
199
199
|
log.write(" -#Total Variants matched:{}".format(sum(allele_match)))
|
|
200
200
|
|
|
201
201
|
if row["SNPID"] not in combined_df.loc[perfect_match,"SNPID"].values:
|
|
202
|
-
log.
|
|
202
|
+
log.warning("Lead variant was not available in reference!")
|
|
203
203
|
|
|
204
204
|
# adjust statistics
|
|
205
205
|
output_columns=["SNPID","CHR","POS","EA_bim","NEA_bim"]
|
|
@@ -256,4 +256,4 @@ def _check_snpid_order(snplist_path, matched_sumstats_snpid,log):
|
|
|
256
256
|
if list(matched_sumstats_snpid) == list(snpid_list):
|
|
257
257
|
log.write(" -Sumstats SNPID order and LD matrix SNPID order are matched.")
|
|
258
258
|
else:
|
|
259
|
-
log.
|
|
259
|
+
log.warning("Sumstats SNPID order and LD matrix SNPID order are not matched!")
|
gwaslab/util_ex_gwascatalog.py
CHANGED
|
@@ -9,26 +9,26 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
|
|
|
9
9
|
#https://www.ebi.ac.uk/gwas/rest/docs/api
|
|
10
10
|
|
|
11
11
|
base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
12
|
+
log.write("Start to retrieve data from GWASCatalog...", verbose=verbose)
|
|
13
|
+
log.write(" -Please make sure your sumstats is based on GRCh38...", verbose=verbose)
|
|
14
|
+
log.write(" -Requesting (GET) trait information through the GWASCatalog API...", verbose=verbose)
|
|
15
|
+
log.write(" -EFO trait api: "+ base_url, verbose=verbose)
|
|
16
16
|
text = requests.get(base_url)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
17
|
+
|
|
18
|
+
log.write(" -Status code: {}".format(text.status_code), verbose=verbose)
|
|
19
|
+
if text.status_code!=200:
|
|
20
|
+
log.write(" -Status code is not 200. Access failed. Please check your internet or the GWAS Catalog sever status.", verbose=verbose)
|
|
21
|
+
log.write(" -Message:{}".format(text.text), verbose=verbose)
|
|
22
|
+
return 0
|
|
23
23
|
|
|
24
24
|
api_response = json.loads(text.text)
|
|
25
|
-
|
|
26
|
-
|
|
25
|
+
log.write(" -Trait Name:",api_response["trait"], verbose=verbose)
|
|
26
|
+
log.write(" -Trait URL:",api_response["uri"], verbose=verbose)
|
|
27
27
|
|
|
28
28
|
base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo+"/associations?projection=associationByEfoTrait"
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
29
|
+
log.write(" -Requesting (GET) GWAS associations through the GWASCatalog API...", verbose=verbose)
|
|
30
|
+
log.write(" -associationsByTraitSummary API: "+ base_url, verbose=verbose)
|
|
31
|
+
log.write(" -Note: this step might take a while...", verbose=verbose)
|
|
32
32
|
|
|
33
33
|
# get request and check status code of response
|
|
34
34
|
raw_data = requests.get(base_url)
|
|
@@ -37,13 +37,13 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
|
|
|
37
37
|
is_proceed = check_request_status_code(raw_data.status_code,verbose=verbose,log=log)
|
|
38
38
|
if is_proceed is False: return False
|
|
39
39
|
|
|
40
|
-
|
|
40
|
+
log.write(" -Loading json ...", verbose=verbose)
|
|
41
41
|
# Transform API response from JSON into Python dictionary
|
|
42
42
|
api_response = json.loads(raw_data.text)
|
|
43
|
-
|
|
43
|
+
log.write(" -Parsing json ...", verbose=verbose)
|
|
44
44
|
# An
|
|
45
45
|
records=list()
|
|
46
|
-
|
|
46
|
+
log.write(" -Number of reported associations for "+ efo +" in GWASCatalog:",len( api_response["_embedded"]["associations"]), verbose=verbose)
|
|
47
47
|
|
|
48
48
|
for association in api_response["_embedded"]["associations"]:
|
|
49
49
|
#association statistics:
|
|
@@ -126,12 +126,12 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
|
|
|
126
126
|
records.append(row)
|
|
127
127
|
#rsid locations
|
|
128
128
|
gwascatalog_lead_snps = pd.DataFrame(records,columns=["SNPID","CHR","POS","REPORT_GENENAME","CLOSEST_GENENAMES","FUNCTION_CLASS","OR","BETA","SE","P","TRAIT","STUDY","PUBMEDID","AUTHOR"])
|
|
129
|
-
|
|
129
|
+
log.write(" -Loading retrieved data into gwaslab Sumstats object ...", verbose=verbose)
|
|
130
130
|
sigs = gl.Sumstats(gwascatalog_lead_snps.copy(),fmt="gwaslab",other=['REPORT_GENENAME', 'CLOSEST_GENENAMES','TRAIT', 'STUDY', 'PUBMEDID','AUTHOR'],verbose=False)
|
|
131
131
|
sigs.fix_pos(verbose=False)
|
|
132
132
|
sigs.fix_chr(verbose=False)
|
|
133
133
|
sigs.sort_coordinate(verbose=False)
|
|
134
|
-
|
|
134
|
+
log.write("Finished retrieving data from GWASCatalog...", verbose=verbose)
|
|
135
135
|
#return gwaslab Sumstats object
|
|
136
136
|
return sigs
|
|
137
137
|
|
|
@@ -142,14 +142,14 @@ def check_request_status_code(request_code,verbose=True,log=Log()):
|
|
|
142
142
|
is_proceed=False
|
|
143
143
|
|
|
144
144
|
if request_code == 200:
|
|
145
|
-
|
|
145
|
+
log.write(" -Status code 200 OK: Retrieved data from GWASCatalog successffully ...", verbose=verbose)
|
|
146
146
|
is_proceed=True
|
|
147
147
|
elif request_code == 404:
|
|
148
|
-
|
|
148
|
+
log.write(" -Status code 404 Not Found: The requested resource did not exist ...", verbose=verbose)
|
|
149
149
|
elif request_code == 301:
|
|
150
|
-
|
|
150
|
+
log.write(" -Status code 301 Moved Permanently: The requested resource did not exist ...", verbose=verbose)
|
|
151
151
|
elif request_code == 400:
|
|
152
|
-
|
|
152
|
+
log.write(" -Status code 400 Bad Request: The requested resource did not exist ...", verbose=verbose)
|
|
153
153
|
|
|
154
154
|
return is_proceed
|
|
155
155
|
|
gwaslab/util_ex_ldproxyfinder.py
CHANGED
|
@@ -49,8 +49,8 @@ def _extract_with_ld_proxy( snplist=None,
|
|
|
49
49
|
ld_threshold=0.8
|
|
50
50
|
):
|
|
51
51
|
### Load vcf#######################################################################################
|
|
52
|
-
|
|
53
|
-
|
|
52
|
+
log.write("Start to load reference genotype...", verbose=verbose)
|
|
53
|
+
log.write(" -reference vcf path : "+ vcf_path, verbose=verbose)
|
|
54
54
|
if tabix is None:
|
|
55
55
|
tabix = which("tabix")
|
|
56
56
|
vcf_chr_dict = auto_check_vcf_chr_dict(vcf_path=vcf_path, vcf_chr_dict=vcf_chr_dict, verbose=verbose, log=log)
|
|
@@ -122,7 +122,7 @@ def _extract_with_ld_proxy( snplist=None,
|
|
|
122
122
|
|
|
123
123
|
extracted_sumstats = pd.concat([extracted_sumstats, ld_proxies],ignore_index=True)
|
|
124
124
|
|
|
125
|
-
|
|
125
|
+
log.write("Finished loading reference genotype successfully!", verbose=verbose)
|
|
126
126
|
return extracted_sumstats
|
|
127
127
|
|
|
128
128
|
|
|
@@ -139,13 +139,13 @@ def _get_rsq( row,
|
|
|
139
139
|
ref_genotype = read_vcf(vcf_path,region=vcf_chr_dict[region[0]]+":"+str(region[1])+"-"+str(region[2]),tabix=tabix)
|
|
140
140
|
|
|
141
141
|
if ref_genotype is None:
|
|
142
|
-
|
|
142
|
+
log.warning("No data was retrieved. Skipping ...", verbose=verbose)
|
|
143
143
|
ref_genotype=dict()
|
|
144
144
|
ref_genotype["variants/POS"]=np.array([],dtype="int64")
|
|
145
145
|
return None
|
|
146
146
|
|
|
147
|
-
|
|
148
|
-
|
|
147
|
+
log.write(" -Retrieving index...", verbose=verbose)
|
|
148
|
+
log.write(" -Ref variants in the region: {}".format(len(ref_genotype["variants/POS"])), verbose=verbose)
|
|
149
149
|
# match sumstats pos and ref pos:
|
|
150
150
|
# get ref index for its first appearance of sumstats pos
|
|
151
151
|
#######################################################################################
|
|
@@ -170,7 +170,7 @@ def _get_rsq( row,
|
|
|
170
170
|
else:
|
|
171
171
|
# no position match
|
|
172
172
|
return None
|
|
173
|
-
|
|
173
|
+
log.write(" -Matching variants using POS, NEA, EA ...", verbose=verbose)
|
|
174
174
|
|
|
175
175
|
sumstats["REFINDEX"] = sumstats.loc[:,["POS","NEA","EA"]].apply(lambda x: match_varaint(x), axis=1)
|
|
176
176
|
log.write(" -Matched variants in sumstats and vcf:{} ".format(sum(~sumstats["REFINDEX"].isna())))
|
|
@@ -190,7 +190,7 @@ def _get_rsq( row,
|
|
|
190
190
|
lead_snp_genotype = GenotypeArray([ref_genotype["calldata/GT"][lead_snp_ref_index]]).to_n_alt()
|
|
191
191
|
other_snp_genotype = GenotypeArray(ref_genotype["calldata/GT"][other_snps_ref_index]).to_n_alt()
|
|
192
192
|
|
|
193
|
-
|
|
193
|
+
log.write(" -Calculating Rsq...", verbose=verbose)
|
|
194
194
|
|
|
195
195
|
if len(other_snp_genotype)>1:
|
|
196
196
|
valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype)[0],2)
|
|
@@ -198,7 +198,7 @@ def _get_rsq( row,
|
|
|
198
198
|
valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype),2)
|
|
199
199
|
sumstats.loc[~sumstats["REFINDEX"].isna(),"RSQ"] = valid_r2
|
|
200
200
|
else:
|
|
201
|
-
|
|
201
|
+
log.write(" -Lead SNP not found in reference...", verbose=verbose)
|
|
202
202
|
sumstats["RSQ"]=None
|
|
203
203
|
|
|
204
204
|
sumstats["RSQ"] = sumstats["RSQ"].astype("float")
|
gwaslab/util_ex_ldsc.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
from gwaslab.ldsc_sumstats import estimate_h2
|
|
2
|
+
from gwaslab.ldsc_sumstats import estimate_rg
|
|
3
|
+
from gwaslab.g_Log import Log
|
|
4
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
5
|
+
from gwaslab.qc_fix_sumstats import finished
|
|
6
|
+
from gwaslab.qc_fix_sumstats import skipped
|
|
7
|
+
from gwaslab.io_read_ldsc import parse_ldsc_summary
|
|
8
|
+
|
|
9
|
+
class ARGS():
    """Plain attribute container handed to the LDSC estimation routines.

    Every option is first set to a fixed default. Only the keyword
    arguments ``h2``, ``rg``, ``ref_ld``, ``ref_ld_chr``, ``w_ld``,
    ``w_ld_chr``, ``samp_prev`` and ``pop_prev`` are taken from the caller;
    each of them falls back to ``None`` when absent. Any other keyword is
    silently ignored.

    NOTE(review): the attribute names appear to mirror the options of the
    LDSC command-line interface -- confirm against the LDSC sources.
    """

    def __init__(self, **args):
        # Fixed defaults (not overridable through **args).
        self.out = "ldsc"
        self.bfile = None
        self.l2 = False
        self.extract = None
        self.keep = None
        self.ld_wind_snps = None
        self.ld_wind_kb = None
        self.ld_wind_cm = None
        self.print_snps = None
        self.annot = None
        self.thin_annot = False
        self.cts_bin = None
        self.cts_breaks = None
        self.cts_names = None
        self.per_allele = False
        self.pq_exp = None
        self.no_print_annot = False

        # Caller-supplied options; dict.get() yields None when the key is missing.
        self.h2 = args.get("h2")

        self.h2_cts = None

        self.rg = args.get("rg")
        self.ref_ld = args.get("ref_ld")
        self.ref_ld_chr = args.get("ref_ld_chr")
        self.w_ld = args.get("w_ld")
        self.w_ld_chr = args.get("w_ld_chr")

        # Fixed defaults (not overridable through **args).
        self.overlap_annot = False
        self.print_coefficients = "ldsc"
        self.frqfile = None
        self.frqfile_chr = None
        self.no_intercept = None
        self.intercept_h2 = None
        self.intercept_gencov = None
        self.M = None
        self.two_step = None
        self.chisq_max = None
        self.ref_ld_chr_cts = None
        self.print_cov = None
        self.print_delete_vals = False
        self.chunk_size = 50
        self.pickle = False
        self.yes_really = False
        self.invert_anyway = False
        self.n_blocks = 200
        self.not_M_5_50 = False
        self.no_check_alleles = False
        self.return_silly_things = False

        # Caller-supplied prevalence options (None when not given).
        self.samp_prev = args.get("samp_prev")
        self.pop_prev = args.get("pop_prev")
|
|
93
|
+
|
|
94
|
+
def _estimate_h2_by_ldsc(insumstats, log, verbose=True, **args):
    """Run single-trait LD score regression on a copy of ``insumstats``.

    Parameters
    ----------
    insumstats : object providing ``.copy()``, ``.columns``, column
        indexing and ``.rename(columns=...)`` (presumably a pandas
        DataFrame -- confirm against the caller). The input is not modified.
    log : logger exposing ``write(..., verbose=...)``.
    verbose : forwarded to every log call.
    **args : options forwarded to ``ARGS``; each pair is also logged.

    Returns
    -------
    ``None`` when ``start_to`` reports ``False``; otherwise the result of
    ``parse_ldsc_summary`` applied to the ``estimate_h2`` output.
    """
    working = insumstats.copy()

    # ---- standard start banner / column check ----------------------------
    closing_line = "running LD score regression"
    has_required_info = start_to(
        sumstats=working,
        log=log,
        verbose=verbose,
        start_line="run LD score regression",
        end_line=closing_line,
        start_cols=[],
        start_function=".estimate_h2_by_ldsc()",
    )
    if has_required_info == False:
        return None

    # ---- provenance and argument echo ------------------------------------
    for banner in (
        " -Run single variate LD score regression:",
        " -Adopted from LDSC source code: https://github.com/bulik/ldsc",
        " -Please cite LDSC: Bulik-Sullivan, et al. LD Score Regression Distinguishes Confounding from Polygenicity in Genome-Wide Association Studies. Nature Genetics, 2015.",
        " -Arguments:",
    ):
        log.write(banner, verbose=verbose)

    for option, setting in args.items():
        log.write(" -{}:{}".format(option, setting), verbose=verbose)

    ldsc_options = ARGS(**args)

    # Derive Z from BETA and SE only when no Z column is present.
    if "Z" not in working.columns:
        working["Z"] = working["BETA"] / working["SE"]

    # Switch to the column names used by the LDSC routines.
    ldsc_names = {"EA": "A1", "NEA": "A2", "rsID": "SNP"}
    working = working.rename(columns=ldsc_names)

    log.write(" -LDSC log:", verbose=verbose)
    raw_summary = estimate_h2(working, ldsc_options, log)

    log.write(" -Results have been stored in .ldsc_h2", verbose=verbose)
    finished(log=log, verbose=verbose, end_line=closing_line)
    return parse_ldsc_summary(raw_summary)
|
|
134
|
+
|
|
135
|
+
def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **args):
    """Run cross-trait LD score regression between ``insumstats`` and other traits.

    Parameters
    ----------
    insumstats : object providing ``.copy()``, ``.columns``, column
        indexing and ``.rename(columns=...)`` (presumably a pandas
        DataFrame -- confirm against the caller). The input is not modified.
    other_traits : iterable of objects exposing ``.data`` (a table with
        ``.columns`` and ``.rename``), ``.meta["gwaslab"]["study_name"]``
        and ``.filter_hapmap3(verbose=...)``.
    log : logger exposing ``write(..., verbose=...)``.
    verbose : forwarded to every log call.
    **args : options forwarded to ``ARGS``; each pair is also logged.
        ``rg`` must be a comma-separated string: its first entry is
        dropped and the remaining entries are used, in order, as the
        aliases of ``other_traits``.

    Returns
    -------
    ``None`` when ``start_to`` reports ``False``; otherwise element ``[1]``
    of the value returned by ``estimate_rg``.
    """
    sumstats = insumstats.copy()
    ##start function with col checking##########################################################
    _start_line = "run LD score regression for genetic correlation"
    _end_line = "running LD score regression for genetic correlation"
    _start_cols =[]
    _start_function = ".estimate_rg_by_ldsc()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return None
    ############################################################################################
    log.write(" -Run cross-trait LD score regression:", verbose=verbose)
    log.write(" -Adopted from LDSC source code: https://github.com/bulik/ldsc", verbose=verbose)
    log.write(" -Please cite LDSC: Bulik-Sullivan, B., et al. An Atlas of Genetic Correlations across Human Diseases and Traits. Nature Genetics, 2015.", verbose=verbose)
    log.write(" -Arguments:", verbose=verbose)

    for key, value in args.items():
        log.write(" -{}:{}".format(key, value), verbose=verbose)

    default_args = ARGS(**args)

    # Derive Z from BETA and SE only when no Z column is present.
    if "Z" not in sumstats.columns:
        sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]

    # Column names used by the LDSC routines; shared by the main and the other traits.
    ldsc_columns = {"EA":"A1","NEA":"A2","rsID":"SNP"}
    required_columns = ["SNP","A1","A2","Z","N"]

    sumstats = sumstats.rename(columns=ldsc_columns)

    other_traits_to_use = []
    # The first entry of `rg` is skipped; the rest label `other_traits` by position.
    alias = default_args.rg.split(",")[1:]

    for index, each_other_sumstats in enumerate(other_traits):
        # Fix: honour `verbose` here as every other log call in this function does.
        log.write(" -Processing sumstats with alias {} ({})".format(alias[index], each_other_sumstats.meta["gwaslab"]["study_name"]), verbose=verbose)
        if "rsID" not in each_other_sumstats.data.columns:
            # NOTE(review): filter_hapmap3 is presumably relied on to supply rsID -- confirm.
            to_append = each_other_sumstats.filter_hapmap3(verbose=False).data.rename(columns=ldsc_columns)
        else:
            to_append = each_other_sumstats.data.rename(columns=ldsc_columns)

        if "Z" not in to_append.columns:
            to_append["Z"] = to_append["BETA"]/to_append["SE"]

        other_traits_to_use.append(to_append[required_columns])

    log.write(" -LDSC log:", verbose=verbose)
    summary = estimate_rg(sumstats[required_columns], other_traits_to_use, default_args, log)[1]

    log.write(" -Results have been stored in .ldsc_rg", verbose=verbose)
    finished(log=log,verbose=verbose,end_line=_end_line)
    return summary
|
gwaslab/util_in_calculate_gc.py
CHANGED
|
@@ -12,34 +12,34 @@ def lambdaGC(insumstats,include_chrXYMT=True, x=23 ,y=24, mt=25, mode="P",level=
|
|
|
12
12
|
sumstats=insumstats.loc[:,["CHR",mode]]
|
|
13
13
|
|
|
14
14
|
if include_chrXYMT is False:
|
|
15
|
-
|
|
15
|
+
log.write(" -Excluding chrX, chrY, chrMT from lambda GC calculation.", verbose=verbose)
|
|
16
16
|
xymt= [x,y,mt,"chrx","chry","chrmt","chrX","chrY","chrMT","chrM","M","x","y","mt","X","Y","MT"]
|
|
17
17
|
sumstats = sumstats.loc[~sumstats["CHR"].isin(xymt),:]
|
|
18
18
|
|
|
19
19
|
indata = sumstats[mode].values
|
|
20
20
|
if len(indata) == 0:
|
|
21
|
-
|
|
21
|
+
log.write(" -No available variants to use for calculation.", verbose=verbose)
|
|
22
22
|
return np.nan
|
|
23
23
|
if mode=="p" or mode=="P":
|
|
24
24
|
observedMedianChi2 = sp.stats.chi2.isf(np.nanmedian(indata),1)
|
|
25
25
|
expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
|
|
26
26
|
lambdagc=observedMedianChi2/expectedMedianChi2
|
|
27
|
-
|
|
27
|
+
log.write(" -Lambda GC (P mode) at "+ str(1 - level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
|
|
28
28
|
elif mode=="mlog10p" or mode=="MLOG10P":
|
|
29
29
|
observedMedianChi2 = sp.stats.chi2.isf( np.nanmedian(np.power(10,-indata)) ,1)
|
|
30
30
|
expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
|
|
31
31
|
lambdagc=observedMedianChi2/expectedMedianChi2
|
|
32
|
-
|
|
32
|
+
log.write(" -Lambda GC (MLOG10P mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
|
|
33
33
|
elif mode=="z" or mode=="Z":
|
|
34
34
|
observedMedianChi2 = np.median((indata)**2)
|
|
35
35
|
expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
|
|
36
36
|
lambdagc=observedMedianChi2/expectedMedianChi2
|
|
37
|
-
if verbose:log.write(" -Lambda GC (Z mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc))
|
|
37
|
+
if verbose:log.write(" -Lambda GC (Z mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
|
|
38
38
|
elif mode=="chi2" or mode=="CHISQ":
|
|
39
39
|
observedMedianChi2 = np.median(indata)
|
|
40
40
|
expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
|
|
41
41
|
lambdagc=observedMedianChi2/expectedMedianChi2
|
|
42
|
-
|
|
42
|
+
log.write(" -Lambda GC (CHISQ mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
|
|
43
43
|
else:
|
|
44
44
|
return np.nan
|
|
45
45
|
return lambdagc
|
|
@@ -21,24 +21,23 @@ def get_power(
|
|
|
21
21
|
log=Log(),
|
|
22
22
|
verbose=True
|
|
23
23
|
):
|
|
24
|
-
|
|
24
|
+
log.write(" Start to calculate statistical power...", verbose=verbose)
|
|
25
25
|
if mode=="b":
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
log.write(" -Significance level: {:.3e}".format(sig_level))
|
|
26
|
+
log.write(" -Input settings (b mode):", verbose=verbose)
|
|
27
|
+
log.write(" -Number of cases:{}".format(ncase), verbose=verbose)
|
|
28
|
+
log.write(" -Number of controls:{}".format(ncontrol), verbose=verbose)
|
|
29
|
+
if genotype_rr is not None:
|
|
30
|
+
log.write(" -Risk allele RR:{:.3f}".format(genotype_rr), verbose=verbose)
|
|
31
|
+
elif genotype_or is not None:
|
|
32
|
+
log.write(" -Risk allele OR:{:.3f}".format(genotype_or), verbose=verbose)
|
|
33
|
+
elif beta is not None:
|
|
34
|
+
log.write(" -Risk allele beta:{:.3f}".format(beta), verbose=verbose)
|
|
35
|
+
else:
|
|
36
|
+
genotype_rr = 0.1
|
|
37
|
+
log.write(" -Risk allele RR:{:.3f}".format(genotype_rr), verbose=verbose)
|
|
38
|
+
log.write(" -Disease prevalence:{:.3f}".format(prevalence), verbose=verbose)
|
|
39
|
+
log.write(" -Risk allele frequency: {:.3f}".format(daf), verbose=verbose)
|
|
40
|
+
log.write(" -Significance level: {:.3e}".format(sig_level), verbose=verbose)
|
|
42
41
|
# Skol, A. D., Scott, L. J., Abecasis, G. R., & Boehnke, M. (2006). Joint analysis is more efficient than replication-based analysis for two-stage genome-wide association studies. Nature genetics, 38(2), 209-213.
|
|
43
42
|
aaf = daf**2
|
|
44
43
|
abf = 2 * (daf) * (1 - daf)
|
|
@@ -56,11 +55,11 @@ def get_power(
|
|
|
56
55
|
# https://jamanetwork.com/journals/jama/fullarticle/188182
|
|
57
56
|
|
|
58
57
|
if or_to_rr ==False:
|
|
59
|
-
|
|
60
|
-
|
|
58
|
+
log.write(" -Alogorithm: Skol, Andrew D., et al. Nature genetics 38.2 (2006): 209-213....", verbose=verbose)
|
|
59
|
+
log.write(" -GRR is approximated using OR. For prevalence < 10%, GRR is very similar to OR....", verbose=verbose)
|
|
61
60
|
else:
|
|
62
|
-
|
|
63
|
-
|
|
61
|
+
log.write(" -OR is converted to GRR using base prevalence: {}".format(prevalence), verbose=verbose)
|
|
62
|
+
log.write(" -Alogorithm: Zhang, J., & Kai, F. Y. (1998). What's the relative risk?: A method of correcting the odds ratio in cohort studies of common outcomes. Jama, 280(19), 1690-1691.....", verbose=verbose)
|
|
64
63
|
|
|
65
64
|
# additive
|
|
66
65
|
x = [ 2*genotype_rr-1, genotype_rr, 1 ]
|
|
@@ -68,19 +67,19 @@ def get_power(
|
|
|
68
67
|
aap= x[0] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
|
|
69
68
|
abp= x[1] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
|
|
70
69
|
bbp= x[2] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
70
|
+
log.write("Probability of disease :", verbose=verbose)
|
|
71
|
+
log.write(" - Individuals with AA genotype: {:.3f}".format(aap), verbose=verbose)
|
|
72
|
+
log.write(" - Individuals with AB genotype: {:.3f}".format(abp), verbose=verbose)
|
|
73
|
+
log.write(" - Individuals with BB genotype: {:.3f}".format(bbp), verbose=verbose)
|
|
75
74
|
|
|
76
75
|
pcase= (aap * aaf + abp * abf*0.5) / prevalence
|
|
77
76
|
pcontrol=((1-aap )* aaf + (1-abp )* abf*0.5) / (1 - prevalence)
|
|
78
77
|
|
|
79
78
|
vcase = pcase *(1-pcase)
|
|
80
79
|
vcontrol =pcontrol *(1-pcontrol)
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
80
|
+
log.write("Expected risk allele frequency:", verbose=verbose)
|
|
81
|
+
log.write(" - In cases: {:.3f}".format(pcase), verbose=verbose)
|
|
82
|
+
log.write(" - In controls: {:.3f}".format(pcontrol), verbose=verbose)
|
|
84
83
|
|
|
85
84
|
num= (pcase - pcontrol)
|
|
86
85
|
den= np.sqrt( (vcase/ncase + vcontrol/ncontrol)*0.5 )
|
|
@@ -88,22 +87,22 @@ def get_power(
|
|
|
88
87
|
|
|
89
88
|
c = ss.norm.isf(sig_level/2)
|
|
90
89
|
power = 1 - ss.norm.cdf(c-u) + ss.norm.cdf(-c-u)
|
|
91
|
-
|
|
90
|
+
log.write("Expected power: {:.3f}".format(power), verbose=verbose)
|
|
92
91
|
|
|
93
92
|
elif mode=="q":
|
|
94
93
|
if beta is None:
|
|
95
94
|
beta = 0.1
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
95
|
+
|
|
96
|
+
log.write(" -Input settings (q mode):", verbose=verbose)
|
|
97
|
+
log.write(" -Significance level: {}".format(sig_level), verbose=verbose)
|
|
98
|
+
log.write(" -EAF: {}".format(eaf), verbose=verbose)
|
|
99
|
+
log.write(" -BETA: {}".format(beta), verbose=verbose)
|
|
100
|
+
log.write(" -N: {}".format(n), verbose=verbose)
|
|
101
|
+
log.write(" -SNPR2: {}".format(2*eaf*(1-eaf)*(beta**2)), verbose=verbose)
|
|
103
102
|
c = ss.chi2.isf(sig_level,df=1)
|
|
104
103
|
NCP = n * 2*eaf*(1-eaf)*(beta**2)/vary
|
|
105
104
|
power = 1 - ss.ncx2.cdf(c, df=1, nc=NCP)
|
|
106
|
-
|
|
105
|
+
log.write("Finished calculating statistical power.", verbose=verbose)
|
|
107
106
|
return power
|
|
108
107
|
|
|
109
108
|
def get_beta(
|
|
@@ -137,11 +136,11 @@ def get_beta(
|
|
|
137
136
|
eafs = np.linspace(eaf_range[1],eaf_range[0],n_matrix)
|
|
138
137
|
betas = np.linspace(beta_range[0],beta_range[1],n_matrix)
|
|
139
138
|
|
|
140
|
-
|
|
139
|
+
log.write(" -Updating eaf-beta matrix...", verbose=verbose)
|
|
141
140
|
for i in range(n_matrix):
|
|
142
141
|
eaf_beta_matrix[i,] = calculate_power_single(beta=betas,eaf=eafs[i],n=n,sig_level=sig_level,vary=vary)
|
|
143
142
|
|
|
144
|
-
|
|
143
|
+
log.write(" -Extracting eaf-beta combinations with power = {}...".format(t), verbose=verbose)
|
|
145
144
|
i,j=1,1
|
|
146
145
|
eaf_beta = []
|
|
147
146
|
while i<n_matrix-1 and j<n_matrix-1:
|
|
@@ -207,11 +206,11 @@ def get_beta_binary(
|
|
|
207
206
|
eafs = np.linspace(eaf_range[1],eaf_range[0],n_matrix)
|
|
208
207
|
betas = np.linspace(beta_range[0],beta_range[1],n_matrix)
|
|
209
208
|
|
|
210
|
-
|
|
209
|
+
log.write(" -Updating eaf-beta matrix...", verbose=verbose)
|
|
211
210
|
if or_to_rr ==False:
|
|
212
|
-
|
|
211
|
+
log.write(" -GRR is approximated using OR. For prevalence < 10%, GRR is very similar to OR....", verbose=verbose)
|
|
213
212
|
else:
|
|
214
|
-
|
|
213
|
+
log.write(" -OR is converted to GRR using base prevalence: {}".format(prevalence), verbose=verbose)
|
|
215
214
|
|
|
216
215
|
for i in range(n_matrix):
|
|
217
216
|
eaf_beta_matrix[i,] = calculate_power_single(beta=betas,
|
|
@@ -222,7 +221,7 @@ def get_beta_binary(
|
|
|
222
221
|
sig_level=sig_level,
|
|
223
222
|
or_to_rr=or_to_rr)
|
|
224
223
|
|
|
225
|
-
|
|
224
|
+
log.write(" -Extracting eaf-beta combinations with power = {}...".format(t), verbose=verbose)
|
|
226
225
|
i,j=1,1
|
|
227
226
|
eaf_beta = []
|
|
228
227
|
while i<n_matrix-1 and j<n_matrix-1:
|
gwaslab/util_in_convert_h2.py
CHANGED
|
@@ -65,7 +65,7 @@ def _get_per_snp_r2(sumstats,
|
|
|
65
65
|
adjuested=False,
|
|
66
66
|
verbose=True):
|
|
67
67
|
# Pierce, B. L., Ahsan, H., & VanderWeele, T. J. (2011). Power and instrument strength requirements for Mendelian randomization studies using multiple genetic variants. International journal of epidemiology, 40(3), 740-752.
|
|
68
|
-
|
|
68
|
+
log.write("Start to calculate per-SNP heritibility...", verbose=verbose)
|
|
69
69
|
if type(k) is int or type(k) is float:
|
|
70
70
|
pass
|
|
71
71
|
elif k =="all":
|
|
@@ -81,18 +81,18 @@ def _get_per_snp_r2(sumstats,
|
|
|
81
81
|
# Var(e) = betase**2 * 2 * N * MAF * (1-MAF)
|
|
82
82
|
# r2 = Var(beta * X) / Var(y)
|
|
83
83
|
|
|
84
|
-
|
|
84
|
+
log.write(" -Calculating per-SNP rsq by 2 * (BETA**2) * AF * (1-AF) / Var(y)...", verbose=verbose)
|
|
85
85
|
sumstats["_VAR(BETAX)"] = 2*(sumstats[beta]**2)*sumstats[af]*(1-sumstats[af])
|
|
86
86
|
|
|
87
87
|
if type(vary) is int or type(vary) is float:
|
|
88
|
-
|
|
88
|
+
log.write(" -Var(y) is provided: {}...".format(vary), verbose=verbose)
|
|
89
89
|
sumstats["SNPR2"] = sumstats["_VAR(BETAX)"] / vary
|
|
90
90
|
elif vary=="se":
|
|
91
|
-
|
|
91
|
+
log.write(" -Var(y) is estimated from VAR(BETA * X), N, MAF, SE: {}...".format(vary), verbose=verbose)
|
|
92
92
|
sumstats["_SIGMA2"] = sumstats[se]**2 * 2*(sumstats[n])*sumstats[af]*(1-sumstats[af])
|
|
93
93
|
sumstats["SNPR2"] = sumstats["_VAR(BETAX)"] / (sumstats["_SIGMA2"] + sumstats["_VAR(BETAX)"])
|
|
94
94
|
else:
|
|
95
|
-
|
|
95
|
+
log.warning("Not enough information for calculation.")
|
|
96
96
|
|
|
97
97
|
if mode=="b":
|
|
98
98
|
if ncase not in sumstats.columns:
|
|
@@ -117,11 +117,11 @@ def _get_per_snp_r2(sumstats,
|
|
|
117
117
|
else:
|
|
118
118
|
snpr2 = "SNPR2"
|
|
119
119
|
if n in sumstats.columns:
|
|
120
|
-
|
|
121
|
-
|
|
120
|
+
log.write(" -Calculating F-statistic: F = [(N-k-1)/k] * (r2/1-r2)... where k = {}".format(k), verbose=verbose)
|
|
121
|
+
log.write(" -For r2, {} is used.".format(snpr2), verbose=verbose)
|
|
122
122
|
sumstats["F"] = sumstats[snpr2]*(sumstats[n]-1 -k)/((1-sumstats[snpr2]) * k)
|
|
123
123
|
|
|
124
|
-
|
|
124
|
+
log.write("Finished calculating per-SNP heritability!", verbose=verbose)
|
|
125
125
|
return sumstats
|
|
126
126
|
#
|
|
127
127
|
def get_population_allele_frequency(af, prop, odds_ratio, prevalence,eps=1e-15):
|