gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of gwaslab might be problematic.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +22 -5
- gwaslab/g_Sumstats.py +110 -163
- gwaslab/g_SumstatsPair.py +76 -25
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +36 -17
- gwaslab/hm_harmonize_sumstats.py +354 -221
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +49 -43
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/io_to_formats.py +428 -295
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +891 -778
- gwaslab/util_ex_calculate_ldmatrix.py +31 -13
- gwaslab/util_ex_gwascatalog.py +25 -25
- gwaslab/util_ex_ldproxyfinder.py +10 -10
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +30 -30
- gwaslab/util_in_filter_value.py +201 -74
- gwaslab/util_in_get_density.py +10 -10
- gwaslab/util_in_get_sig.py +445 -71
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +42 -37
- gwaslab/viz_aux_reposition_text.py +10 -7
- gwaslab/viz_aux_save_figure.py +18 -8
- gwaslab/viz_plot_compare_af.py +32 -33
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +34 -26
- gwaslab/viz_plot_mqqplot.py +126 -75
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +36 -33
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +65 -61
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.37.dist-info/RECORD +0 -72
- /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
@@ -12,6 +12,7 @@ def tofinemapping(sumstats,
                   study=None,
                   bfile=None,
                   vcf=None,
+                  loci=None,
                   out="./",
                   windowsizekb=1000,
                   n_cores=1,
@@ -27,8 +28,13 @@ def tofinemapping(sumstats,
         suffixes=[""]
     if getlead_args is None:
         getlead_args={"windowsizekb":1000}
-
-
+
+    if loci is None:
+        log.write(" -Loci were not provided. All significant loci will be automatically extracted...")
+        sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
+    else:
+        sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
+
     # Drop duplicate!!!!
     log.write(" -Dropping duplicated SNPIDs...")
     sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()
@@ -170,6 +176,7 @@ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, w
 def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=None):
     if suffixes is None:
         suffixes=[""]
+
     log.write(" -#variants in locus ({}): {}".format(row["SNPID"],len(locus_sumstats)))
     # convert category to string
     locus_sumstats["EA"] = locus_sumstats["EA"].astype("string")
@@ -180,28 +187,35 @@ def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=No
     combined_df = pd.merge(ref_bim, locus_sumstats, on="SNPID",how="inner")

     # match allele
-
-    log.write(" -#Variants with matched alleles:{}".format(sum(
+    perfect_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) )
+    log.write(" -#Variants with perfect matched alleles:{}".format(sum(perfect_match)))

     # fliipped allele
-    ea_mis_match = combined_df["EA"] != combined_df["EA_bim"]
-
+    #ea_mis_match = combined_df["EA"] != combined_df["EA_bim"]
+    flipped_match = ((combined_df["EA"] == combined_df["NEA_bim"])& (combined_df["NEA"] == combined_df["EA_bim"]))
+    log.write(" -#Variants with flipped alleles:{}".format(sum(flipped_match)))

-
-
+    allele_match = perfect_match | flipped_match
+    log.write(" -#Total Variants matched:{}".format(sum(allele_match)))
+
+    if row["SNPID"] not in combined_df.loc[perfect_match,"SNPID"].values:
+        log.warning("Lead variant was not available in reference!")

     # adjust statistics
     output_columns=["SNPID","CHR","POS","EA_bim","NEA_bim"]
     for suffix in suffixes:
         if ("BETA"+suffix in locus_sumstats.columns) and ("SE"+suffix in locus_sumstats.columns):
-
+            log.write(" -Flipping BETA{} for variants with flipped alleles...".format(suffix))
+            combined_df.loc[flipped_match,"BETA"+suffix] = - combined_df.loc[flipped_match,"BETA"+suffix]
             output_columns.append("BETA"+suffix)
             output_columns.append("SE"+suffix)
         if "Z" in locus_sumstats.columns:
-
+            log.write(" -Flipping Z{} for variants with flipped alleles...".format(suffix))
+            combined_df.loc[flipped_match,"Z"+suffix] = - combined_df.loc[flipped_match,"Z"+suffix]
             output_columns.append("Z"+suffix)
         if "EAF" in locus_sumstats.columns:
-
+            log.write(" -Flipping EAF{} for variants with flipped alleles...".format(suffix))
+            combined_df.loc[flipped_match,"EAF"+suffix] = 1 - combined_df.loc[flipped_match,"EAF"+suffix]
             output_columns.append("EAF"+suffix)
         if "N" in locus_sumstats.columns:
             output_columns.append("N"+suffix)
@@ -215,6 +229,7 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
     matched_snp_list_path = "{}/{}_{}_{}.snplist.raw".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)

     matched_sumstats["SNPID"].to_csv(matched_snp_list_path, index=None, header=None)
+    log.write(" -Exporting SNP list of {} to: {}...".format(len(matched_sumstats) ,matched_snp_list_path))

     # create locus-sumstats EA, NEA, (BETA, SE), Z
     matched_sumstats_path = "{}/{}_{}_{}.sumstats.gz".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)
@@ -230,7 +245,10 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
             to_export_columns.append("EAF"+suffix)
         if "N"+suffix in matched_sumstats.columns:
             to_export_columns.append("N"+suffix)
-
+
+    log.write(" -Exporting locus sumstats to: {}...".format(matched_sumstats_path))
+    log.write(" -Exported columns: {}...".format(["SNPID"]+to_export_columns))
+    matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, index=None)
     return matched_snp_list_path, matched_sumstats_path

 def _check_snpid_order(snplist_path, matched_sumstats_snpid,log):
@@ -238,4 +256,4 @@ def _check_snpid_order(snplist_path, matched_sumstats_snpid,log):
     if list(matched_sumstats_snpid) == list(snpid_list):
         log.write(" -Sumstats SNPID order and LD matrix SNPID order are matched.")
     else:
-        log.
+        log.warning("Sumstats SNPID order and LD matrix SNPID order are not matched!")
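The hunks above (from gwaslab/util_ex_calculate_ldmatrix.py, per the file list) add an optional loci argument to tofinemapping: with loci=None the significant lead variants are extracted automatically via getsig, otherwise only the listed SNPIDs are processed. A minimal usage sketch, assuming the module path inferred from the file list and placeholder sumstats/reference paths:

import pandas as pd
from gwaslab.util_ex_calculate_ldmatrix import tofinemapping  # module path inferred from the file list

# harmonized sumstats with SNPID/CHR/POS/EA/NEA/BETA/SE/P columns (placeholder path)
sumstats = pd.read_csv("mystudy.harmonized.tsv", sep="\t")

# 1) loci=None (default): significant loci are picked automatically via getsig
tofinemapping(sumstats, study="mystudy", bfile="ref/plink_prefix", out="./finemap", windowsizekb=1000, n_cores=2)

# 2) loci given: only the listed lead SNPIDs are processed (placeholder SNPID)
tofinemapping(sumstats, study="mystudy", bfile="ref/plink_prefix", loci=["1:12345_A_G"], out="./finemap")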
gwaslab/util_ex_gwascatalog.py
CHANGED
@@ -9,26 +9,26 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
     #https://www.ebi.ac.uk/gwas/rest/docs/api

     base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo
-
-
-
-
+    log.write("Start to retrieve data from GWASCatalog...", verbose=verbose)
+    log.write(" -Please make sure your sumstats is based on GRCh38...", verbose=verbose)
+    log.write(" -Requesting (GET) trait information through the GWASCatalog API...", verbose=verbose)
+    log.write(" -EFO trait api: "+ base_url, verbose=verbose)
     text = requests.get(base_url)
-
-
-
-
-
-
+
+    log.write(" -Status code: {}".format(text.status_code), verbose=verbose)
+    if text.status_code!=200:
+        log.write(" -Status code is not 200. Access failed. Please check your internet or the GWAS Catalog sever status.", verbose=verbose)
+        log.write(" -Message:{}".format(text.text), verbose=verbose)
+        return 0

     api_response = json.loads(text.text)
-
-
+    log.write(" -Trait Name:",api_response["trait"], verbose=verbose)
+    log.write(" -Trait URL:",api_response["uri"], verbose=verbose)

     base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo+"/associations?projection=associationByEfoTrait"
-
-
-
+    log.write(" -Requesting (GET) GWAS associations through the GWASCatalog API...", verbose=verbose)
+    log.write(" -associationsByTraitSummary API: "+ base_url, verbose=verbose)
+    log.write(" -Note: this step might take a while...", verbose=verbose)

     # get request and check status code of response
     raw_data = requests.get(base_url)
@@ -37,13 +37,13 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
     is_proceed = check_request_status_code(raw_data.status_code,verbose=verbose,log=log)
     if is_proceed is False: return False

-
+    log.write(" -Loading json ...", verbose=verbose)
     # Transform API response from JSON into Python dictionary
     api_response = json.loads(raw_data.text)
-
+    log.write(" -Parsing json ...", verbose=verbose)
     # An
     records=list()
-
+    log.write(" -Number of reported associations for "+ efo +" in GWASCatalog:",len( api_response["_embedded"]["associations"]), verbose=verbose)

     for association in api_response["_embedded"]["associations"]:
         #association statistics:
@@ -126,12 +126,12 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
             records.append(row)
     #rsid locations
     gwascatalog_lead_snps = pd.DataFrame(records,columns=["SNPID","CHR","POS","REPORT_GENENAME","CLOSEST_GENENAMES","FUNCTION_CLASS","OR","BETA","SE","P","TRAIT","STUDY","PUBMEDID","AUTHOR"])
-
-    sigs = gl.Sumstats(gwascatalog_lead_snps,fmt="gwaslab",other=['REPORT_GENENAME', 'CLOSEST_GENENAMES','TRAIT', 'STUDY', 'PUBMEDID','AUTHOR'],verbose=False)
+    log.write(" -Loading retrieved data into gwaslab Sumstats object ...", verbose=verbose)
+    sigs = gl.Sumstats(gwascatalog_lead_snps.copy(),fmt="gwaslab",other=['REPORT_GENENAME', 'CLOSEST_GENENAMES','TRAIT', 'STUDY', 'PUBMEDID','AUTHOR'],verbose=False)
     sigs.fix_pos(verbose=False)
     sigs.fix_chr(verbose=False)
     sigs.sort_coordinate(verbose=False)
-
+    log.write("Finished retrieving data from GWASCatalog...", verbose=verbose)
     #return gwaslab Sumstats object
     return sigs

@@ -142,14 +142,14 @@ def check_request_status_code(request_code,verbose=True,log=Log()):
     is_proceed=False

     if request_code == 200:
-
+        log.write(" -Status code 200 OK: Retrieved data from GWASCatalog successffully ...", verbose=verbose)
         is_proceed=True
     elif request_code == 404:
-
+        log.write(" -Status code 404 Not Found: The requested resource did not exist ...", verbose=verbose)
     elif request_code == 301:
-
+        log.write(" -Status code 301 Moved Permanently: The requested resource did not exist ...", verbose=verbose)
     elif request_code == 400:
-
+        log.write(" -Status code 400 Bad Request: The requested resource did not exist ...", verbose=verbose)

     return is_proceed

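The new logging in gwascatalog_trait also documents its flow: query the EFO trait endpoint, return 0 on a non-200 status, fetch the associationByEfoTrait projection, and load the records into a gl.Sumstats on GRCh38. A minimal sketch calling the module-level function directly, with a placeholder EFO id:

from gwaslab.util_ex_gwascatalog import gwascatalog_trait

sigs = gwascatalog_trait("EFO_0000000", sig_level=5e-8, verbose=True)   # placeholder EFO id
if sigs:                                  # 0 or False indicates a failed API request, per the code above
    print(sigs.data[["SNPID", "CHR", "POS", "P", "TRAIT", "PUBMEDID"]].head())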
gwaslab/util_ex_ldproxyfinder.py
CHANGED
@@ -46,11 +46,11 @@ def _extract_with_ld_proxy( snplist=None,
                             log=Log(),
                             verbose=True,
                             windowsizekb=100,
-                            ld_threshold=0.8
+                            ld_threshold=0.8
                             ):
     ### Load vcf#######################################################################################
-
-
+    log.write("Start to load reference genotype...", verbose=verbose)
+    log.write(" -reference vcf path : "+ vcf_path, verbose=verbose)
     if tabix is None:
         tabix = which("tabix")
     vcf_chr_dict = auto_check_vcf_chr_dict(vcf_path=vcf_path, vcf_chr_dict=vcf_chr_dict, verbose=verbose, log=log)
@@ -122,7 +122,7 @@ def _extract_with_ld_proxy( snplist=None,

         extracted_sumstats = pd.concat([extracted_sumstats, ld_proxies],ignore_index=True)

-
+    log.write("Finished loading reference genotype successfully!", verbose=verbose)
     return extracted_sumstats


@@ -139,13 +139,13 @@ def _get_rsq( row,
     ref_genotype = read_vcf(vcf_path,region=vcf_chr_dict[region[0]]+":"+str(region[1])+"-"+str(region[2]),tabix=tabix)

     if ref_genotype is None:
-
+        log.warning("No data was retrieved. Skipping ...", verbose=verbose)
         ref_genotype=dict()
         ref_genotype["variants/POS"]=np.array([],dtype="int64")
         return None

-
-
+    log.write(" -Retrieving index...", verbose=verbose)
+    log.write(" -Ref variants in the region: {}".format(len(ref_genotype["variants/POS"])), verbose=verbose)
     # match sumstats pos and ref pos:
     # get ref index for its first appearance of sumstats pos
     #######################################################################################
@@ -170,7 +170,7 @@ def _get_rsq( row,
         else:
             # no position match
             return None
-
+    log.write(" -Matching variants using POS, NEA, EA ...", verbose=verbose)

     sumstats["REFINDEX"] = sumstats.loc[:,["POS","NEA","EA"]].apply(lambda x: match_varaint(x), axis=1)
     log.write(" -Matched variants in sumstats and vcf:{} ".format(sum(~sumstats["REFINDEX"].isna())))
@@ -190,7 +190,7 @@ def _get_rsq( row,
     lead_snp_genotype = GenotypeArray([ref_genotype["calldata/GT"][lead_snp_ref_index]]).to_n_alt()
     other_snp_genotype = GenotypeArray(ref_genotype["calldata/GT"][other_snps_ref_index]).to_n_alt()

-
+    log.write(" -Calculating Rsq...", verbose=verbose)

     if len(other_snp_genotype)>1:
         valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype)[0],2)
@@ -198,7 +198,7 @@ def _get_rsq( row,
         valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype),2)
         sumstats.loc[~sumstats["REFINDEX"].isna(),"RSQ"] = valid_r2
     else:
-
+        log.write(" -Lead SNP not found in reference...", verbose=verbose)
         sumstats["RSQ"]=None

     sumstats["RSQ"] = sumstats["RSQ"].astype("float")
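The proxy finder's core is unchanged: _get_rsq reads the reference VCF around each locus, matches variants by POS/NEA/EA, and squares the Rogers-Huff r between the lead SNP genotype and the candidates. A toy sketch of that r² step with scikit-allel, using made-up genotypes and mirroring the GenotypeArray/to_n_alt/rogers_huff_r_between calls in the diff:

import numpy as np
import allel

# 3 variants x 4 samples, diploid genotypes (made up)
gt = allel.GenotypeArray([
    [[0, 0], [0, 1], [1, 1], [0, 1]],   # lead SNP
    [[0, 0], [0, 1], [1, 1], [0, 1]],   # perfectly correlated proxy
    [[1, 1], [0, 0], [0, 1], [0, 0]],   # weakly correlated variant
])
n_alt = gt.to_n_alt()                                    # allele-count coding, as in the diff
lead, others = n_alt[:1], n_alt[1:]
rsq = np.power(allel.rogers_huff_r_between(lead, others)[0], 2)   # r^2 of each candidate against the lead SNP
print(rsq)                                               # proxies with r^2 >= ld_threshold (default 0.8) are kept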
gwaslab/util_ex_ldsc.py
ADDED
@@ -0,0 +1,189 @@
+from gwaslab.ldsc_sumstats import estimate_h2
+from gwaslab.ldsc_sumstats import estimate_rg
+from gwaslab.g_Log import Log
+from gwaslab.qc_fix_sumstats import start_to
+from gwaslab.qc_fix_sumstats import finished
+from gwaslab.qc_fix_sumstats import skipped
+from gwaslab.io_read_ldsc import parse_ldsc_summary
+
+class ARGS():
+    def __init__(self, **args):
+
+        self.out = "ldsc"
+        self.bfile = None
+        self.l2 = False
+        self.extract = None
+        self.keep = None
+        self.ld_wind_snps = None
+        self.ld_wind_kb = None
+        self.ld_wind_cm = None
+        self.print_snps = None
+        self.annot =None
+        self.thin_annot = False
+        self.cts_bin = None
+        self.cts_breaks = None
+        self.cts_names = None
+        self.per_allele = False
+        self.pq_exp =None
+        self.no_print_annot = False
+
+        if "h2" in args.keys():
+            self.h2 = args["h2"]
+        else:
+            self.h2 = None
+
+        self.h2_cts = None
+
+        if "rg" in args.keys():
+            self.rg = args["rg"]
+        else:
+            self.rg = None
+
+        if "ref_ld" in args.keys():
+            self.ref_ld = args["ref_ld"]
+        else:
+            self.ref_ld = None
+
+        if "ref_ld_chr" in args.keys():
+            self.ref_ld_chr = args["ref_ld_chr"]
+        else:
+            self.ref_ld_chr = None
+
+        if "w_ld" in args.keys():
+            self.w_ld = args["w_ld"]
+        else:
+            self.w_ld = None
+
+        if "w_ld_chr" in args.keys():
+            self.w_ld_chr = args["w_ld_chr"]
+        else:
+            self.w_ld_chr = None
+
+        self.overlap_annot = False
+        self.print_coefficients = "ldsc"
+        self.frqfile = None
+        self.frqfile_chr = None
+        self.no_intercept = None
+        self.intercept_h2 = None
+        self.intercept_gencov = None
+        self.M = None
+        self.two_step = None
+        self.chisq_max = None
+        self.ref_ld_chr_cts = None
+        self.print_cov = None
+        self.print_delete_vals = False
+        self.chunk_size = 50
+        self.pickle = False
+        self.yes_really = False
+        self.invert_anyway = False
+        self.n_blocks = 200
+        self.not_M_5_50 = False
+        self.no_check_alleles = False
+        self.return_silly_things = False
+
+        if "samp_prev" in args.keys():
+            self.samp_prev = args["samp_prev"]
+        else:
+            self.samp_prev = None
+
+        if "pop_prev" in args.keys():
+            self.pop_prev = args["pop_prev"]
+        else:
+            self.pop_prev = None
+
+def _estimate_h2_by_ldsc(insumstats, log, verbose=True, **args):
+    sumstats = insumstats.copy()
+    ##start function with col checking##########################################################
+    _start_line = "run LD score regression"
+    _end_line = "running LD score regression"
+    _start_cols =[]
+    _start_function = ".estimate_h2_by_ldsc()"
+    _must_args ={}
+
+    is_enough_info = start_to(sumstats=sumstats,
+                              log=log,
+                              verbose=verbose,
+                              start_line=_start_line,
+                              end_line=_end_line,
+                              start_cols=_start_cols,
+                              start_function=_start_function,
+                              **_must_args)
+    if is_enough_info == False: return None
+    ############################################################################################
+    log.write(" -Run single variate LD score regression:", verbose=verbose)
+    log.write(" -Adopted from LDSC source code: https://github.com/bulik/ldsc", verbose=verbose)
+    log.write(" -Please cite LDSC: Bulik-Sullivan, et al. LD Score Regression Distinguishes Confounding from Polygenicity in Genome-Wide Association Studies. Nature Genetics, 2015.", verbose=verbose)
+    log.write(" -Arguments:", verbose=verbose)
+
+    for key, value in args.items():
+        log.write(" -{}:{}".format(key, value), verbose=verbose)
+
+    default_args = ARGS(**args)
+
+    if "Z" not in sumstats.columns:
+        sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
+
+    sumstats = sumstats.rename(columns={"EA":"A1","NEA":"A2","rsID":"SNP"})
+
+    log.write(" -LDSC log:", verbose=verbose)
+    summary = estimate_h2(sumstats, default_args, log)
+
+    log.write(" -Results have been stored in .ldsc_h2", verbose=verbose)
+    finished(log=log,verbose=verbose,end_line=_end_line)
+    return parse_ldsc_summary(summary)
+
+def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **args):
+    sumstats = insumstats.copy()
+    ##start function with col checking##########################################################
+    _start_line = "run LD score regression for genetic correlation"
+    _end_line = "running LD score regression for genetic correlation"
+    _start_cols =[]
+    _start_function = ".estimate_rg_by_ldsc()"
+    _must_args ={}
+
+    is_enough_info = start_to(sumstats=sumstats,
+                              log=log,
+                              verbose=verbose,
+                              start_line=_start_line,
+                              end_line=_end_line,
+                              start_cols=_start_cols,
+                              start_function=_start_function,
+                              **_must_args)
+    if is_enough_info == False: return None
+    ############################################################################################
+    log.write(" -Run cross-trait LD score regression:", verbose=verbose)
+    log.write(" -Adopted from LDSC source code: https://github.com/bulik/ldsc", verbose=verbose)
+    log.write(" -Please cite LDSC: Bulik-Sullivan, B., et al. An Atlas of Genetic Correlations across Human Diseases and Traits. Nature Genetics, 2015.", verbose=verbose)
+    log.write(" -Arguments:", verbose=verbose)
+
+    for key, value in args.items():
+        log.write(" -{}:{}".format(key, value), verbose=verbose)
+
+    default_args = ARGS(**args)
+
+    if "Z" not in sumstats.columns:
+        sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
+
+    sumstats = sumstats.rename(columns={"EA":"A1","NEA":"A2","rsID":"SNP"})
+
+    other_traits_to_use = []
+    alias = default_args.rg.split(",")[1:]
+
+    for index, each_other_sumstats in enumerate(other_traits):
+        log.write(" -Processing sumstats with alias {} ({})".format(alias[index], each_other_sumstats.meta["gwaslab"]["study_name"]))
+        if "rsID" not in each_other_sumstats.data.columns:
+            to_append = each_other_sumstats.filter_hapmap3(verbose=False).data.rename(columns={"EA":"A1","NEA":"A2","rsID":"SNP"})
+        else:
+            to_append = each_other_sumstats.data.rename(columns={"EA":"A1","NEA":"A2","rsID":"SNP"})
+
+        if "Z" not in to_append.columns:
+            to_append["Z"] = to_append["BETA"]/to_append["SE"]
+
+        other_traits_to_use.append(to_append[["SNP","A1","A2","Z","N"]])
+
+    log.write(" -LDSC log:", verbose=verbose)
+    summary = estimate_rg(sumstats[["SNP","A1","A2","Z","N"]], other_traits_to_use, default_args, log)[1]
+
+    log.write(" -Results have been stored in .ldsc_rg", verbose=verbose)
+    finished(log=log,verbose=verbose,end_line=_end_line)
+    return summary
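util_ex_ldsc.py wraps the vendored LDSC code (the new ldsc_*.py modules above): it builds an LDSC-style args object, derives Z from BETA/SE when needed, renames EA/NEA/rsID to A1/A2/SNP, and calls estimate_h2 or estimate_rg. A rough sketch of calling the h2 helper directly, assuming a HapMap3-filtered sumstats table with rsID/EA/NEA/BETA/SE/N columns and placeholder LD-score directories; whether further keywords are required is not shown in the diff:

import pandas as pd
from gwaslab.g_Log import Log
from gwaslab.util_ex_ldsc import _estimate_h2_by_ldsc

sumstats = pd.read_csv("mystudy.hapmap3.tsv", sep="\t")   # placeholder path
h2_summary = _estimate_h2_by_ldsc(
    sumstats,
    Log(),
    ref_ld_chr="eur_w_ld_chr/",   # per-chromosome reference LD scores (placeholder path)
    w_ld_chr="eur_w_ld_chr/",     # regression weight LD scores (placeholder path)
)
print(h2_summary)                 # parsed LDSC summary returned by parse_ldsc_summary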
gwaslab/util_ex_process_ref.py
CHANGED
@@ -89,7 +89,7 @@ def _load_single_bim_to_ref_bims(bpfile_prefix, ref_bims, log):
                              sep="\s+",
                              usecols=[0,1,3,4,5],
                              header=None,
-                             dtype={1:"string",0:"category", 3:"int", 4:"string", 5:"string"}).rename(columns={1:"SNPID",0:"CHR_bim",3:"POS_bim",4:"
+                             dtype={1:"string",0:"category", 3:"int", 4:"string", 5:"string"}).rename(columns={1:"SNPID",0:"CHR_bim",3:"POS_bim",4:"EA_bim",5:"NEA_bim"})
     log.write(" -#variants in ref file: {}".format(len(single_bim)))
     ref_bims.append(single_bim)
     return ref_bims
@@ -104,7 +104,7 @@ def _load_single_pvar_to_ref_bims(bpfile_prefix, ref_bims, log):
                              usecols=[0,1,2,3,4],
                              header=None,
                              comment="#",
-                             dtype={2:"string",0:"category", 1:"int", 3:"string", 4:"string"}).rename(columns={2:"SNPID",0:"CHR_bim",1:"POS_bim",3:"
+                             dtype={2:"string",0:"category", 1:"int", 3:"string", 4:"string"}).rename(columns={2:"SNPID",0:"CHR_bim",1:"POS_bim",3:"EA_bim",4:"NEA_bim"})
     log.write(" -#variants in ref file: {}".format(len(single_bim)))
     ref_bims.append(single_bim)
     return ref_bims
@@ -265,7 +265,7 @@ def _process_vcf(ref_file_prefix,
         except subprocess.CalledProcessError as e:
             log.write(e.output)
         else:
-            log.write(" -Plink {} for CHR {} exists. Skipping...".format(convert ,i))
+            log.write(" -Plink {} for CHR {} exists: {}. Skipping...".format(convert ,i, bpfile_prefix))

     if load_bim == True:
         if convert == "bfile":
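For reference, the .bim-loading pattern shown in the updated lines of _load_single_bim_to_ref_bims, written out as standalone pandas with a placeholder path (PLINK .bim columns are CHR, ID, cM, POS, A1, A2):

import pandas as pd

single_bim = pd.read_csv(
    "ref/mystudy.bim", sep=r"\s+", usecols=[0, 1, 3, 4, 5], header=None,
    dtype={1: "string", 0: "category", 3: "int", 4: "string", 5: "string"},
).rename(columns={1: "SNPID", 0: "CHR_bim", 3: "POS_bim", 4: "EA_bim", 5: "NEA_bim"})

print(single_bim.columns.tolist())   # ['CHR_bim', 'SNPID', 'POS_bim', 'EA_bim', 'NEA_bim']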
gwaslab/util_ex_run_coloc.py
CHANGED
@@ -68,12 +68,16 @@ def _run_coloc_susie(filepath, r="Rscript",
    D1 <- list( "LD"=R, "beta"=df[,"BETA_1"],"varbeta"=df[,"SE_1"]**2,"snp"=df[,"SNPID"],"position"=df[,"POS"],"type"="{type1}","N"={n1}{d1_args})
    D2 <- list( "LD"=R, "beta"=df[,"BETA_2"],"varbeta"=df[,"SE_2"]**2,"snp"=df[,"SNPID"],"position"=df[,"POS"],"type"="{type2}","N"={n2}{d2_args})

+   abf <- coloc.abf(dataset1=D1,dataset2=D2)
+   write.csv(t(data.frame(abf$summary)) , "{output_prefix}.coloc.abf", row.names = FALSE)
+
    S1=runsusie(D1{susie_args})
    S2=runsusie(D2{susie_args})

    susie.res=coloc.susie(S1,S2{coloc_args})

    write.csv(susie.res$summary, "{output_prefix}.coloc.susie", row.names = FALSE)
+
    '''.format(sumstats_path = sumstats,
               ld_r_matrix_path = ld_r_matrix,
               fillna_script = "R[is.na(R)] <- 0" if fillldna==True else "",
@@ -87,7 +91,9 @@ def _run_coloc_susie(filepath, r="Rscript",
               coloc_args = coloc_args,
               output_prefix = output_prefix)

-    log.write(" -coloc script: {}".format("coloc.
+    log.write(" -coloc abf script: {}".format("coloc.abf(dataset1=D1,dataset2=D2)"), verbose=verbose)
+    log.write(" -coloc susie script: {}".format("coloc.susie(S1,S2)"), verbose=verbose)
+
    with open("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]),"w") as file:
        file.write(rscript)

@@ -101,21 +107,37 @@ def _run_coloc_susie(filepath, r="Rscript",
        #plink_process.kill()
        log.write(" Running coloc.SuSieR from command line...", verbose=verbose)
        r_log+= output + "\n"
+
+        pip_cs = pd.read_csv("{}.coloc.abf".format(output_prefix))
+        if len(pip_cs)==0:
+            log.write(" -SuSieR result for {} is empty. Please check parameters.".format(output_prefix), verbose=verbose)
+        else:
+            pip_cs["Locus"] = row["SNPID"]
+            pip_cs["STUDY"] = row["study"]
+            pip_cs["hit1"] = row["SNPID"]
+            pip_cs["METHOD"] = "abf"
+            locus_pip_cs = pd.concat([locus_pip_cs,pip_cs],ignore_index=True)
+
        pip_cs = pd.read_csv("{}.coloc.susie".format(output_prefix))
        if len(pip_cs)==0:
            log.write(" -SuSieR result for {} is empty. Please check parameters.".format(output_prefix), verbose=verbose)
        else:
            pip_cs["Locus"] = row["SNPID"]
            pip_cs["STUDY"] = row["study"]
+            pip_cs["METHOD"] = "susie"
            locus_pip_cs = pd.concat([locus_pip_cs,pip_cs],ignore_index=True)
+
        os.remove("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]))
+
        if delete == True:
-            os.remove("{}.
+            os.remove("{}.coloc.susie".format(output_prefix))
+            os.remove("{}.coloc.abf".format(output_prefix))
        else:
-            log.write(" -
+            log.write(" -coloc-abf result summary to: {}".format("{}.coloc.abf".format(output_prefix)), verbose=verbose)
+            log.write(" -coloc-susie result summary to: {}".format("{}.coloc.susie".format(output_prefix)), verbose=verbose)

    except subprocess.CalledProcessError as e:
        log.write(e.output)
        os.remove("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]))
-    log.write("Finished
+    log.write("Finished clocalization using coloc and SuSiE.", verbose=verbose)
    return locus_pip_cs
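The coloc wrapper now writes two result files per locus, {prefix}.coloc.abf from coloc.abf and {prefix}.coloc.susie from coloc.susie, and tags the collected rows with a METHOD column. A small sketch of that collection step, with placeholder file names and lead SNPID:

import pandas as pd

locus_pip_cs = pd.DataFrame()
for method, path in [("abf", "mystudy_rs123_1000.coloc.abf"),
                     ("susie", "mystudy_rs123_1000.coloc.susie")]:
    res = pd.read_csv(path)
    if len(res) == 0:
        continue                      # empty result: nothing to collect for this method
    res["Locus"] = "rs123"            # lead SNPID of the locus (placeholder)
    res["METHOD"] = method            # new in 3.4.39: rows are tagged by colocalization method
    locus_pip_cs = pd.concat([locus_pip_cs, res], ignore_index=True)

print(locus_pip_cs.head())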
gwaslab/util_in_calculate_gc.py
CHANGED
@@ -12,34 +12,34 @@ def lambdaGC(insumstats,include_chrXYMT=True, x=23 ,y=24, mt=25, mode="P",level=
     sumstats=insumstats.loc[:,["CHR",mode]]

     if include_chrXYMT is False:
-
+        log.write(" -Excluding chrX, chrY, chrMT from lambda GC calculation.", verbose=verbose)
         xymt= [x,y,mt,"chrx","chry","chrmt","chrX","chrY","chrMT","chrM","M","x","y","mt","X","Y","MT"]
         sumstats = sumstats.loc[~sumstats["CHR"].isin(xymt),:]

     indata = sumstats[mode].values
     if len(indata) == 0:
-
+        log.write(" -No available variants to use for calculation.", verbose=verbose)
         return np.nan
     if mode=="p" or mode=="P":
         observedMedianChi2 = sp.stats.chi2.isf(np.nanmedian(indata),1)
         expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
         lambdagc=observedMedianChi2/expectedMedianChi2
-
+        log.write(" -Lambda GC (P mode) at "+ str(1 - level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
     elif mode=="mlog10p" or mode=="MLOG10P":
         observedMedianChi2 = sp.stats.chi2.isf( np.nanmedian(np.power(10,-indata)) ,1)
         expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
         lambdagc=observedMedianChi2/expectedMedianChi2
-
+        log.write(" -Lambda GC (MLOG10P mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
     elif mode=="z" or mode=="Z":
         observedMedianChi2 = np.median((indata)**2)
         expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
         lambdagc=observedMedianChi2/expectedMedianChi2
-        if verbose:log.write(" -Lambda GC (Z mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc))
+        if verbose:log.write(" -Lambda GC (Z mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
     elif mode=="chi2" or mode=="CHISQ":
         observedMedianChi2 = np.median(indata)
         expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
         lambdagc=observedMedianChi2/expectedMedianChi2
-
+        log.write(" -Lambda GC (CHISQ mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
     else:
         return np.nan
     return lambdagc
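All four modes compute the same statistic: lambda GC is the 1-df chi-square quantile of the observed median test statistic divided by the expected median of a chi-square with 1 degree of freedom, chi2.ppf(0.5, 1) ≈ 0.4549. A quick sketch on simulated null P values, where lambda should come out close to 1:

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
p = rng.uniform(size=100_000)                              # null P values
observedMedianChi2 = stats.chi2.isf(np.nanmedian(p), 1)    # chi2(1) quantile of the observed median P
expectedMedianChi2 = stats.chi2.ppf(0.5, 1)                # ~0.4549, median of chi2(1)
print(observedMedianChi2 / expectedMedianChi2)             # lambda GC, ~1.0 under the null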
|