gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +22 -5
- gwaslab/g_Sumstats.py +110 -163
- gwaslab/g_SumstatsPair.py +76 -25
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +36 -17
- gwaslab/hm_harmonize_sumstats.py +354 -221
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +49 -43
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/io_to_formats.py +428 -295
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +891 -778
- gwaslab/util_ex_calculate_ldmatrix.py +31 -13
- gwaslab/util_ex_gwascatalog.py +25 -25
- gwaslab/util_ex_ldproxyfinder.py +10 -10
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +30 -30
- gwaslab/util_in_filter_value.py +201 -74
- gwaslab/util_in_get_density.py +10 -10
- gwaslab/util_in_get_sig.py +445 -71
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +42 -37
- gwaslab/viz_aux_reposition_text.py +10 -7
- gwaslab/viz_aux_save_figure.py +18 -8
- gwaslab/viz_plot_compare_af.py +32 -33
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +34 -26
- gwaslab/viz_plot_mqqplot.py +126 -75
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +36 -33
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +65 -61
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.37.dist-info/RECORD +0 -72
- /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/ldsc_sumstats.py
ADDED
|
@@ -0,0 +1,629 @@
|
|
|
1
|
+
'''
|
|
2
|
+
(c) 2014 Brendan Bulik-Sullivan and Hilary Finucane
|
|
3
|
+
|
|
4
|
+
This module deals with getting all the data needed for LD Score regression from files
|
|
5
|
+
into memory and checking that the input makes sense. There is no math here. LD Score
|
|
6
|
+
regression is implemented in the regressions module.
|
|
7
|
+
'''
|
|
8
|
+
from __future__ import division
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from scipy import stats
|
|
12
|
+
import itertools as it
|
|
13
|
+
import gwaslab.ldsc_parse as ps
|
|
14
|
+
import gwaslab.ldsc_regressions as reg
|
|
15
|
+
import sys
|
|
16
|
+
import traceback
|
|
17
|
+
import copy
|
|
18
|
+
import os
|
|
19
|
+
import glob
|
|
20
|
+
|
|
21
|
+
log_prefix = ' -'
|
|
22
|
+
log_prefix_short = ' -'
|
|
23
|
+
def xrange(*args):
    '''Python 2 compatibility shim: forward every argument to the built-in range.'''
    bounds = args
    return range(*bounds)
|
|
25
|
+
|
|
26
|
+
_N_CHR = 22
|
|
27
|
+
# complementary bases
|
|
28
|
+
COMPLEMENT = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
|
|
29
|
+
# bases
|
|
30
|
+
BASES = COMPLEMENT.keys()
|
|
31
|
+
# true iff strand ambiguous
|
|
32
|
+
STRAND_AMBIGUOUS = {''.join(x): x[0] == COMPLEMENT[x[1]]
|
|
33
|
+
for x in it.product(BASES, BASES)
|
|
34
|
+
if x[0] != x[1]}
|
|
35
|
+
# SNPS we want to keep (pairs of alleles)
|
|
36
|
+
VALID_SNPS = {x for x in map(lambda y: ''.join(y), it.product(BASES, BASES))
|
|
37
|
+
if x[0] != x[1] and not STRAND_AMBIGUOUS[x]}
|
|
38
|
+
# T iff SNP 1 has the same alleles as SNP 2 (allowing for strand or ref allele flip).
|
|
39
|
+
MATCH_ALLELES = {x for x in map(lambda y: ''.join(y), it.product(VALID_SNPS, VALID_SNPS))
|
|
40
|
+
# strand and ref match
|
|
41
|
+
if ((x[0] == x[2]) and (x[1] == x[3])) or
|
|
42
|
+
# ref match, strand flip
|
|
43
|
+
((x[0] == COMPLEMENT[x[2]]) and (x[1] == COMPLEMENT[x[3]])) or
|
|
44
|
+
# ref flip, strand match
|
|
45
|
+
((x[0] == x[3]) and (x[1] == x[2])) or
|
|
46
|
+
((x[0] == COMPLEMENT[x[3]]) and (x[1] == COMPLEMENT[x[2]]))} # strand and ref flip
|
|
47
|
+
# T iff SNP 1 has the same alleles as SNP 2 w/ ref allele flip.
|
|
48
|
+
FLIP_ALLELES = {''.join(x):
|
|
49
|
+
((x[0] == x[3]) and (x[1] == x[2])) or # strand match
|
|
50
|
+
# strand flip
|
|
51
|
+
((x[0] == COMPLEMENT[x[3]]) and (x[1] == COMPLEMENT[x[2]]))
|
|
52
|
+
for x in MATCH_ALLELES}
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _splitp(fstr):
|
|
56
|
+
flist = fstr.split(',')
|
|
57
|
+
flist = [os.path.expanduser(os.path.expandvars(x)) for x in flist]
|
|
58
|
+
return flist
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _select_and_log(x, ii, log, msg):
|
|
62
|
+
'''Fiter down to rows that are True in ii. Log # of SNPs removed.'''
|
|
63
|
+
new_len = ii.sum()
|
|
64
|
+
if new_len == 0:
|
|
65
|
+
raise ValueError(msg.format(N=0))
|
|
66
|
+
else:
|
|
67
|
+
x = x[ii]
|
|
68
|
+
log.log(" -" + msg.format(N=new_len))
|
|
69
|
+
return x
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def smart_merge(x, y):
    '''Merge two SNP tables on their SNP column.

    When both frames have the same length, index and SNP column, the columns
    of y (minus its SNP column) are simply concatenated to x, which is cheaper
    than a merge. Otherwise an inner merge on SNP is performed.
    '''
    if len(x) == len(y) and (x.index == y.index).all() and (x.SNP == y.SNP).all():
        x = x.reset_index(drop=True)
        # keyword form: passing the axis positionally (drop('SNP', 1)) is
        # rejected by pandas >= 2.0
        y = y.reset_index(drop=True).drop(columns='SNP')
        out = pd.concat([x, y], axis=1)
    else:
        out = pd.merge(x, y, how='inner', on='SNP')
    return out
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _read_ref_ld(args, log):
    '''Load the reference panel LD Scores named by args.ref_ld / args.ref_ld_chr.'''
    scores = _read_chr_split_files(args.ref_ld_chr, args.ref_ld, log,
                                   'reference panel LD Score', ps.ldscore_fromlist)
    n_read = len(scores)
    log.log(' -Read reference panel LD Scores for {N} SNPs.'.format(N=n_read))
    return scores
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _read_annot(args, log):
|
|
93
|
+
'''Read annot matrix.'''
|
|
94
|
+
try:
|
|
95
|
+
if args.ref_ld is not None:
|
|
96
|
+
overlap_matrix, M_tot = _read_chr_split_files(args.ref_ld_chr, args.ref_ld, log,
|
|
97
|
+
'annot matrix', ps.annot, frqfile=args.frqfile)
|
|
98
|
+
elif args.ref_ld_chr is not None:
|
|
99
|
+
overlap_matrix, M_tot = _read_chr_split_files(args.ref_ld_chr, args.ref_ld, log,
|
|
100
|
+
'annot matrix', ps.annot, frqfile=args.frqfile_chr)
|
|
101
|
+
except Exception:
|
|
102
|
+
log.log(' -Error parsing .annot file.')
|
|
103
|
+
raise
|
|
104
|
+
|
|
105
|
+
return overlap_matrix, M_tot
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _read_M(args, log, n_annot):
|
|
109
|
+
'''Read M (--M, --M-file, etc).'''
|
|
110
|
+
if args.M:
|
|
111
|
+
try:
|
|
112
|
+
M_annot = [float(x) for x in _splitp(args.M)]
|
|
113
|
+
except ValueError as e:
|
|
114
|
+
raise ValueError('Could not cast --M to float: ' + str(e.args))
|
|
115
|
+
else:
|
|
116
|
+
if args.ref_ld:
|
|
117
|
+
M_annot = ps.M_fromlist(
|
|
118
|
+
_splitp(args.ref_ld), common=(not args.not_M_5_50))
|
|
119
|
+
elif args.ref_ld_chr:
|
|
120
|
+
M_annot = ps.M_fromlist(
|
|
121
|
+
_splitp(args.ref_ld_chr), _N_CHR, common=(not args.not_M_5_50))
|
|
122
|
+
|
|
123
|
+
try:
|
|
124
|
+
M_annot = np.array(M_annot).reshape((1, n_annot))
|
|
125
|
+
except ValueError as e:
|
|
126
|
+
raise ValueError(
|
|
127
|
+
'# terms in --M must match # of LD Scores in --ref-ld.\n' + str(e.args))
|
|
128
|
+
|
|
129
|
+
return M_annot
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _read_w_ld(args, log):
|
|
133
|
+
'''Read regression SNP LD.'''
|
|
134
|
+
if (args.w_ld and ',' in args.w_ld) or (args.w_ld_chr and ',' in args.w_ld_chr):
|
|
135
|
+
raise ValueError(
|
|
136
|
+
'--w-ld must point to a single fileset (no commas allowed).')
|
|
137
|
+
w_ld = _read_chr_split_files(args.w_ld_chr, args.w_ld, log,
|
|
138
|
+
'regression weight LD Score', ps.ldscore_fromlist)
|
|
139
|
+
if len(w_ld.columns) != 2:
|
|
140
|
+
raise ValueError('--w-ld may only have one LD Score column.')
|
|
141
|
+
w_ld.columns = ['SNP', 'LD_weights'] # prevent colname conflicts w/ ref ld
|
|
142
|
+
log.log(
|
|
143
|
+
' -Read regression weight LD Scores for {N} SNPs.'.format(N=len(w_ld)))
|
|
144
|
+
return w_ld
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _read_chr_split_files(chr_arg, not_chr_arg, log, noun, parsefunc, **kwargs):
|
|
148
|
+
'''Read files split across 22 chromosomes (annot, ref_ld, w_ld).'''
|
|
149
|
+
try:
|
|
150
|
+
if not_chr_arg:
|
|
151
|
+
log.log(' -Reading {N} from {F} ... ({p})'.format(N=noun, F=not_chr_arg, p=parsefunc.__name__))
|
|
152
|
+
out = parsefunc(_splitp(not_chr_arg), **kwargs)
|
|
153
|
+
elif chr_arg:
|
|
154
|
+
f = ps.sub_chr(chr_arg, '[1-22]')
|
|
155
|
+
log.log(' -Reading {N} from {F} ... ({p})'.format(N=noun, F=f, p=parsefunc.__name__))
|
|
156
|
+
out = parsefunc(_splitp(chr_arg), _N_CHR, **kwargs)
|
|
157
|
+
except ValueError as e:
|
|
158
|
+
log.log(' -Error parsing {N}.'.format(N=noun))
|
|
159
|
+
raise e
|
|
160
|
+
|
|
161
|
+
return out
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _read_sumstats(args, log, fh, alleles=False, dropna=False):
|
|
165
|
+
'''Parse summary statistics.'''
|
|
166
|
+
#log.log(' -Reading summary statistics from {S} ...'.format(S=fh))
|
|
167
|
+
#sumstats = ps.sumstats(fh, alleles=alleles, dropna=dropna)
|
|
168
|
+
log_msg = ' -Read summary statistics for {N} SNPs.'
|
|
169
|
+
sumstats = fh.dropna()
|
|
170
|
+
log.log(log_msg.format(N=len(sumstats)))
|
|
171
|
+
m = len(sumstats)
|
|
172
|
+
sumstats = sumstats.drop_duplicates(subset='SNP')
|
|
173
|
+
if m > len(sumstats):
|
|
174
|
+
log.log(
|
|
175
|
+
' -Dropped {M} SNPs with duplicated rs numbers.'.format(M=m - len(sumstats)))
|
|
176
|
+
|
|
177
|
+
return sumstats
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _check_ld_condnum(args, log, ref_ld):
|
|
181
|
+
'''Check condition number of LD Score matrix.'''
|
|
182
|
+
if len(ref_ld.shape) >= 2:
|
|
183
|
+
cond_num = int(np.linalg.cond(ref_ld))
|
|
184
|
+
if cond_num > 100000:
|
|
185
|
+
if args.invert_anyway:
|
|
186
|
+
warn = "WARNING: LD Score matrix condition number is {C}. "
|
|
187
|
+
warn += "Inverting anyway because the --invert-anyway flag is set."
|
|
188
|
+
log.log(warn.format(C=cond_num))
|
|
189
|
+
else:
|
|
190
|
+
warn = "WARNING: LD Score matrix condition number is {C}. "
|
|
191
|
+
warn += "Remove collinear LD Scores. "
|
|
192
|
+
raise ValueError(warn.format(C=cond_num))
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _check_variance(log, M_annot, ref_ld):
|
|
196
|
+
'''Remove zero-variance LD Scores.'''
|
|
197
|
+
ii = ref_ld.iloc[:, 1:].var() == 0 # NB there is a SNP column here
|
|
198
|
+
if ii.all():
|
|
199
|
+
raise ValueError('All LD Scores have zero variance.')
|
|
200
|
+
else:
|
|
201
|
+
log.log(' -Removing partitioned LD Scores with zero variance.')
|
|
202
|
+
ii_snp = np.array([True] + list(~ii))
|
|
203
|
+
ii_m = np.array(~ii)
|
|
204
|
+
ref_ld = ref_ld.iloc[:, ii_snp]
|
|
205
|
+
M_annot = M_annot[:, ii_m]
|
|
206
|
+
|
|
207
|
+
return M_annot, ref_ld, ii
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _warn_length(log, sumstats):
|
|
211
|
+
if len(sumstats) < 200000:
|
|
212
|
+
log.warning(
|
|
213
|
+
'number of SNPs less than 200k; this is almost always bad.')
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _print_cov(ldscore_reg, ofh, log):
|
|
217
|
+
'''Prints covariance matrix of slopes.'''
|
|
218
|
+
log.log(
|
|
219
|
+
' -Printing covariance matrix of the estimates to {F}.'.format(F=ofh))
|
|
220
|
+
np.savetxt(ofh, ldscore_reg.coef_cov)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _print_delete_values(ldscore_reg, ofh, log):
|
|
224
|
+
'''Prints block jackknife delete-k values'''
|
|
225
|
+
log.log(' -Printing block jackknife delete values to {F}.'.format(F=ofh))
|
|
226
|
+
np.savetxt(ofh, ldscore_reg.tot_delete_values)
|
|
227
|
+
|
|
228
|
+
def _print_part_delete_values(ldscore_reg, ofh, log):
|
|
229
|
+
'''Prints partitioned block jackknife delete-k values'''
|
|
230
|
+
log.log(' -Printing partitioned block jackknife delete values to {F}.'.format(F=ofh))
|
|
231
|
+
np.savetxt(ofh, ldscore_reg.part_delete_values)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _merge_and_log(ld, sumstats, noun, log):
    '''Merge ld with sumstats via smart_merge and report how many SNPs are left.

    Raises ValueError (with the same text that would have been logged) when
    the merge leaves no SNPs.
    '''
    merged = smart_merge(ld, sumstats)
    n_left = len(merged)
    report = ' -After merging with {F}, {N} SNPs remain.'.format(N=n_left, F=noun)
    if n_left == 0:
        raise ValueError(report)
    log.log(report)
    return merged
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _read_ld_sumstats(sumstats, args, log, fh, alleles=False, dropna=True):
    '''Combine summary statistics with reference and regression-weight LD Scores.

    sumstats is an already loaded table; fh, alleles and dropna are accepted
    but not used by this implementation. Returns
    (M_annot, w_ld_cname, ref_ld_cnames, merged, novar_cols) where merged is
    the sumstats table joined with both LD Score tables, w_ld_cname is its
    last column and ref_ld_cnames are the reference LD Score column names.
    '''
    complete = sumstats.dropna()

    ref_ld = _read_ref_ld(args, log)
    n_annot = len(ref_ld.columns) - 1  # first column is the SNP identifier
    M_annot = _read_M(args, log, n_annot)
    M_annot, ref_ld, novar_cols = _check_variance(log, M_annot, ref_ld)
    w_ld = _read_w_ld(args, log)

    merged = _merge_and_log(ref_ld, complete, 'reference panel LD', log)
    merged = _merge_and_log(merged, w_ld, 'regression SNP LD', log)

    w_ld_cname = merged.columns[-1]
    ref_ld_cnames = ref_ld.columns[1:len(ref_ld.columns)]
    return M_annot, w_ld_cname, ref_ld_cnames, merged, novar_cols
|
|
259
|
+
|
|
260
|
+
def cell_type_specific(args, log, sumstats=None):
    '''Cell type specific analysis.

    For every line "<name> <ld_prefix>" of the file args.ref_ld_chr_cts, the
    cell-type LD Scores are added in front of the baseline LD Scores, a
    heritability regression is run, and the first coefficient (plus the other
    cell-type coefficients when args.print_all_cts is set) is recorded. The
    results, sorted by P value, are written to args.out + '.cell_type_results.txt'.

    sumstats is the already loaded summary statistics table. It is optional
    only for signature compatibility: the previous body called
    _read_ld_sumstats without it and therefore always failed with a TypeError.

    Raises ValueError if sumstats is not supplied or if LD Scores are missing
    for some SNPs in a cell-type file set.
    '''
    if sumstats is None:
        raise ValueError(
            'cell_type_specific needs summary statistics; pass them as `sumstats`.')

    args = copy.deepcopy(args)
    if args.intercept_h2 is not None:
        args.intercept_h2 = float(args.intercept_h2)
    if args.no_intercept:
        args.intercept_h2 = 1

    M_annot_all_regr, w_ld_cname, ref_ld_cnames_all_regr, sumstats, novar_cols = \
        _read_ld_sumstats(sumstats, args, log, args.h2_cts)
    _check_ld_condnum(args, log, ref_ld_cnames_all_regr)
    _warn_length(log, sumstats)
    n_snp = len(sumstats)
    n_blocks = min(n_snp, args.n_blocks)
    if args.chisq_max is None:
        chisq_max = max(0.001*sumstats.N.max(), 80)
    else:
        chisq_max = args.chisq_max

    ii = np.ravel(sumstats.Z**2 < chisq_max)
    sumstats = sumstats.iloc[ii, :]
    log.log(' -Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
        C=chisq_max, N=np.sum(ii), M=n_snp-np.sum(ii)))
    n_snp = np.sum(ii)
    ref_ld_all_regr = np.array(sumstats[ref_ld_cnames_all_regr]).reshape((len(sumstats), -1))
    chisq = np.array(sumstats.Z**2)
    keep_snps = sumstats[['SNP']]

    def s(x):
        # column vector with one row per retained SNP
        return np.array(x).reshape((n_snp, 1))

    results_columns = ['Name', 'Coefficient', 'Coefficient_std_error', 'Coefficient_P_value']
    results_data = []

    # read the list up front so the handle is closed before the long loop;
    # blank lines are skipped instead of breaking the two-field unpacking
    with open(args.ref_ld_chr_cts) as cts_fh:
        cts_entries = [line.split() for line in cts_fh if line.strip()]

    for (name, ct_ld_chr) in cts_entries:
        ref_ld_cts_allsnps = _read_chr_split_files(ct_ld_chr, None, log,
                                                   'cts reference panel LD Score', ps.ldscore_fromlist)
        log.log(' -Performing regression.')
        ref_ld_cts = np.array(pd.merge(keep_snps, ref_ld_cts_allsnps, on='SNP', how='left').iloc[:, 1:])
        if np.any(np.isnan(ref_ld_cts)):
            raise ValueError ('Missing some LD scores from cts files. Are you sure all SNPs in ref-ld-chr are also in ref-ld-chr-cts')

        ref_ld = np.hstack([ref_ld_cts, ref_ld_all_regr])
        M_cts = ps.M_fromlist(
            _splitp(ct_ld_chr), _N_CHR, common=(not args.not_M_5_50))
        M_annot = np.hstack([M_cts, M_annot_all_regr])
        hsqhat = reg.Hsq(s(chisq), ref_ld, s(sumstats[w_ld_cname]), s(sumstats.N),
                         M_annot, n_blocks=n_blocks, intercept=args.intercept_h2,
                         twostep=None, old_weights=True)
        coef, coef_se = hsqhat.coef[0], hsqhat.coef_se[0]
        results_data.append((name, coef, coef_se, stats.norm.sf(coef/coef_se)))
        if args.print_all_cts:
            for i in range(1, len(ct_ld_chr.split(','))):
                coef, coef_se = hsqhat.coef[i], hsqhat.coef_se[i]
                results_data.append((name+'_'+str(i), coef, coef_se, stats.norm.sf(coef/coef_se)))

    df_results = pd.DataFrame(data = results_data, columns = results_columns)
    df_results.sort_values(by = 'Coefficient_P_value', inplace=True)
    df_results.to_csv(args.out+'.cell_type_results.txt', sep='\t', index=False)
    log.log(' -Results printed to '+args.out+'.cell_type_results.txt')
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def estimate_h2(sumstats, args, log):
    '''Run the heritability (optionally partitioned) regression.

    Works on a deep copy of args. Merges sumstats with the LD Scores, filters
    SNPs by chi^2 when a cutoff applies, fits reg.Hsq, optionally writes the
    covariance / jackknife / overlap output files under the args.out prefix,
    and returns hsqhat.summary(...).
    '''
    args = copy.deepcopy(args)
    have_prevalences = args.samp_prev is not None and args.pop_prev is not None
    if have_prevalences:
        args.samp_prev, args.pop_prev = float(args.samp_prev), float(args.pop_prev)
    if args.intercept_h2 is not None:
        args.intercept_h2 = float(args.intercept_h2)
    if args.no_intercept:
        args.intercept_h2 = 1

    M_annot, w_ld_cname, ref_ld_cnames, sumstats, novar_cols = _read_ld_sumstats(
        sumstats, args, log, args.h2)
    ref_ld = np.array(sumstats[ref_ld_cnames])
    # NOTE(review): this passes the column names, not the ref_ld matrix built
    # just above -- looks unintended; confirm before changing behavior.
    _check_ld_condnum(args, log, ref_ld_cnames)
    _warn_length(log, sumstats)

    n_snp = len(sumstats)
    n_blocks = min(n_snp, args.n_blocks)
    single_annot = len(ref_ld_cnames) == 1
    old_weights = not single_annot
    chisq_max = args.chisq_max
    if single_annot:
        if args.two_step is None and args.intercept_h2 is None:
            args.two_step = 30
    elif chisq_max is None:
        chisq_max = max(0.001*sumstats.N.max(), 80)

    def as_column(values):
        # n_snp is looked up at call time, so the filter below is honoured
        return np.array(values).reshape((n_snp, 1))

    chisq = as_column(sumstats.Z**2)
    if chisq_max is not None:
        keep = np.ravel(chisq < chisq_max)
        n_kept = np.sum(keep)
        sumstats = sumstats.iloc[keep, :]
        log.log(' -Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
            C=chisq_max, N=n_kept, M=n_snp-n_kept))
        n_snp = n_kept
        ref_ld = np.array(sumstats[ref_ld_cnames])
        chisq = chisq[keep].reshape((n_snp, 1))

    if args.two_step is not None:
        log.log(' -Using two-step estimator with cutoff at {M}.'.format(M=args.two_step))

    weights = as_column(sumstats[w_ld_cname])
    sample_sizes = as_column(sumstats.N)
    hsqhat = reg.Hsq(chisq, ref_ld, weights, sample_sizes,
                     M_annot, n_blocks=n_blocks, intercept=args.intercept_h2,
                     twostep=args.two_step, old_weights=old_weights)

    out_prefix = args.out
    if args.print_cov:
        _print_cov(hsqhat, out_prefix + '.cov', log)
    if args.print_delete_vals:
        _print_delete_values(hsqhat, out_prefix + '.delete', log)
        _print_part_delete_values(hsqhat, out_prefix + '.part_delete', log)

    if args.overlap_annot:
        overlap_matrix, M_tot = _read_annot(args, log)
        df_results = hsqhat._overlap_output(ref_ld_cnames, overlap_matrix, M_annot, M_tot, args.print_coefficients)
        df_results.to_csv(args.out+'.results', sep="\t", index=False)
        log.log(' -Results printed to '+args.out+'.results')

    return hsqhat.summary(ref_ld_cnames, P=args.samp_prev, K=args.pop_prev, overlap = args.overlap_annot)
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def estimate_rg(sumstats, other_sumstats, args, log):
    '''Estimate rg between trait 1 and a list of other traits.

    sumstats is the table for trait 1 and other_sumstats an iterable of tables
    for the remaining traits; args.rg holds the matching comma-separated
    names. Works on a deep copy of args. A failure for one trait is logged
    with its traceback and recorded as None so the remaining traits still run.

    Returns (RG, table): the list of per-trait results and the summary
    DataFrame built by _get_rg_table.

    Raises ValueError if --intercept-h2, --intercept-gencov, --samp-prev or
    --pop-prev do not have one entry per phenotype.
    '''
    args = copy.deepcopy(args)

    rg_paths, rg_files = _parse_rg(args.rg)
    n_pheno = len(rg_paths)

    # list(...) because _split_or_none may hand back a lazy iterator
    args.intercept_h2 = list(_split_or_none(args.intercept_h2, n_pheno))
    args.intercept_gencov = list(_split_or_none(args.intercept_gencov, n_pheno))
    args.samp_prev = list(_split_or_none(args.samp_prev, n_pheno))
    args.pop_prev = list(_split_or_none(args.pop_prev, n_pheno))

    # An explicit loop: the previous map(...) call was never consumed, so on
    # Python 3 these length checks silently did not run.
    for values_and_flag in ((args.intercept_h2, '--intercept-h2'),
                            (args.intercept_gencov, '--intercept-gencov'),
                            (args.samp_prev, '--samp-prev'),
                            (args.pop_prev, '--pop-prev')):
        _check_arg_len(values_and_flag, n_pheno)

    if args.no_intercept:
        args.intercept_h2 = [1] * n_pheno
        args.intercept_gencov = [0] * n_pheno

    p1 = rg_paths[0]
    out_prefix = args.out + rg_files[0]

    M_annot, w_ld_cname, ref_ld_cnames, sumstats, _ = _read_ld_sumstats(sumstats, args, log, p1,
                                                                        alleles=True, dropna=True)
    RG = []
    n_annot = M_annot.shape[1]

    # NOTE(review): args.intercept_h2 is a list at this point, so the
    # `is None` test cannot succeed and two_step is never defaulted here.
    # Left unchanged because enabling it would alter estimates -- confirm intent.
    if n_annot == 1 and args.two_step is None and args.intercept_h2 is None:
        args.two_step = 30
    if args.two_step is not None:
        log.log(' -Using two-step estimator with cutoff at {M}.'.format(M=args.two_step))

    for i, p2 in enumerate(other_sumstats):
        log.log(
            ' -Computing rg for phenotype {I}/{N}'.format(I=i + 2, N=len(rg_paths)))
        try:
            loop = _read_other_sumstats(args, log, p2, sumstats, ref_ld_cnames)
            rghat = _rg(loop, args, log, M_annot, ref_ld_cnames, w_ld_cname, i)
            RG.append(rghat)
            _print_gencor(args, log, rghat, ref_ld_cnames, i, rg_paths, i == 0)
            out_prefix_loop = out_prefix + '_' + rg_files[i + 1]
            if args.print_cov:
                _print_rg_cov(rghat, out_prefix_loop, log)
            if args.print_delete_vals:
                _print_rg_delete_values(rghat, out_prefix_loop, log)

        except Exception:  # keep going if phenotype 50/100 causes an error
            msg = 'ERROR computing rg for phenotype {I}/{N}, from file {F}.'
            log.log(msg.format(I=i + 2, N=len(rg_paths), F=rg_paths[i + 1]))
            # format_exc() takes a frame limit, not the exception object;
            # passing the exception made this handler itself raise TypeError
            log.log(traceback.format_exc() + '\n')
            if len(RG) <= i:  # if exception raised before appending to RG
                RG.append(None)

    # build the summary once and reuse it for both the log and the return value
    table_text, table = _get_rg_table(rg_paths, RG, args)
    log.log('Summary of Genetic Correlation Results\n' + table_text)
    return RG, table
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
def _read_other_sumstats(args, log, p2, sumstats, ref_ld_cnames):
    '''Prepare the second trait (p2) for an rg regression against sumstats.

    p2 is cleaned, merged with sumstats, and rows with missing values are
    dropped. Unless args.no_check_alleles is set, SNPs whose allele
    combination is not in MATCH_ALLELES are removed and Z2 is aligned to the
    reference allele of trait 1. The allele columns are dropped before the
    merged table is returned.
    '''
    allele_cols = ['A1', 'A1x', 'A2', 'A2x']

    other = _read_sumstats(args, log, p2, alleles=True, dropna=False)
    merged = _merge_sumstats_sumstats(args, sumstats, other, log)
    merged = merged.dropna(how='any')
    merged[allele_cols] = merged[allele_cols].astype("string")
    # four-letter code: trait 1 alleles followed by trait 2 alleles
    alleles = merged.A1 + merged.A2 + merged.A1x + merged.A2x

    if not args.no_check_alleles:
        valid = _filter_alleles(alleles)
        merged = _select_and_log(merged, valid, log,
                                 '{N} SNPs with valid alleles.')
        merged['Z2'] = _align_alleles(merged.Z2, alleles)

    merged = merged.drop(allele_cols, axis=1)
    _check_ld_condnum(args, log, merged[ref_ld_cnames])
    _warn_length(log, merged)
    return merged
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def _get_rg_table(rg_paths, RG, args):
|
|
473
|
+
'''Print a table of genetic correlations.'''
|
|
474
|
+
# fix error caused by behaviour change for map
|
|
475
|
+
t = lambda attr: lambda obj: getattr(obj, attr, 'NA')
|
|
476
|
+
x = pd.DataFrame()
|
|
477
|
+
x['p1'] = [rg_paths[0] for i in xrange(1, len(rg_paths))]
|
|
478
|
+
x['p2'] = rg_paths[1:len(rg_paths)]
|
|
479
|
+
|
|
480
|
+
#x['rg'] = map(t('rg_ratio'), RG)
|
|
481
|
+
#x['se'] = map(t('rg_se'), RG)
|
|
482
|
+
#x['z'] = map(t('z'), RG)
|
|
483
|
+
#x['p'] = map(t('p'), RG)
|
|
484
|
+
|
|
485
|
+
x['rg'] = [getattr(i, 'rg_ratio', 'NA') for i in RG]
|
|
486
|
+
x['se'] = [getattr(i, 'rg_se', 'NA') for i in RG]
|
|
487
|
+
x['z'] = [getattr(i, 'z', 'NA') for i in RG]
|
|
488
|
+
x['p'] = [getattr(i, 'p', 'NA') for i in RG]
|
|
489
|
+
## i -> it
|
|
490
|
+
if args.samp_prev is not None and \
|
|
491
|
+
args.pop_prev is not None and \
|
|
492
|
+
all((i is not None for i in args.samp_prev)) and \
|
|
493
|
+
all((it is not None for it in args.pop_prev)):
|
|
494
|
+
|
|
495
|
+
#c = map(lambda x, y: reg.h2_obs_to_liab(1, x, y), args.samp_prev[1:], args.pop_prev[1:])
|
|
496
|
+
c = list(map(lambda x, y: reg.h2_obs_to_liab(1, x, y), args.samp_prev[1:], args.pop_prev[1:]))
|
|
497
|
+
|
|
498
|
+
#x['h2_liab'] = map(lambda x, y: x * y, c, map(t('tot'), map(t('hsq2'), RG)))
|
|
499
|
+
#x['h2_liab_se'] = map(lambda x, y: x * y, c, map(t('tot_se'), map(t('hsq2'), RG)))
|
|
500
|
+
|
|
501
|
+
x['h2_liab'] = [getattr(getattr(i, 'hsq2', 'NA'), 'tot', 'NA') * c[index] for index,i in enumerate(RG)]
|
|
502
|
+
x['h2_liab_se'] =[getattr(getattr(i, 'hsq2', 'NA'), 'tot_se', 'NA') * c[index] for index,i in enumerate(RG)]
|
|
503
|
+
else:
|
|
504
|
+
#x['h2_obs'] = map(t('tot'), map(t('hsq2'), RG))
|
|
505
|
+
#x['h2_obs_se'] = map(t('tot_se'), map(t('hsq2'), RG))
|
|
506
|
+
|
|
507
|
+
x['h2_obs'] = [getattr(getattr(i, 'hsq2', 'NA'), 'tot', 'NA') for i in RG]
|
|
508
|
+
x['h2_obs_se'] = [getattr(getattr(i, 'hsq2', 'NA'), 'tot_se', 'NA') for i in RG]
|
|
509
|
+
|
|
510
|
+
#x['h2_int'] = map(t('intercept'), map(t('hsq2'), RG))
|
|
511
|
+
#x['h2_int_se'] = map(t('intercept_se'), map(t('hsq2'), RG))
|
|
512
|
+
#x['gcov_int'] = map(t('intercept'), map(t('gencov'), RG))
|
|
513
|
+
#x['gcov_int_se'] = map(t('intercept_se'), map(t('gencov'), RG))
|
|
514
|
+
|
|
515
|
+
x['h2_int'] = [getattr(getattr(i, 'hsq2', 'NA'), 'intercept', 'NA') for i in RG]
|
|
516
|
+
x['h2_int_se'] = [getattr(getattr(i, 'hsq2', 'NA'), 'intercept_se', 'NA') for i in RG]
|
|
517
|
+
x['gcov_int'] = [getattr(getattr(i, 'gencov', 'NA'), 'intercept', 'NA') for i in RG]
|
|
518
|
+
x['gcov_int_se'] = [getattr(getattr(i, 'gencov', 'NA'), 'intercept_se', 'NA') for i in RG]
|
|
519
|
+
return x.to_string(header=True, index=False) + '\n', x
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
def _print_gencor(args, log, rghat, ref_ld_cnames, i, rg_paths, print_hsq1):
    '''Log the heritability, genetic covariance and genetic correlation summaries.

    i is the zero-based index of the second phenotype among the "other"
    traits; its prevalences are read from position i + 1 of args.samp_prev /
    args.pop_prev. The phenotype 1 heritability block is logged only when
    print_hsq1 is true.
    '''
    # Resolve the prevalences before indexing: the previous code indexed
    # args.samp_prev first and only afterwards tested it for None.
    if args.samp_prev is None:
        P = [None, None]
    else:
        P = [args.samp_prev[0], args.samp_prev[i + 1]]
    if args.pop_prev is None:
        K = [None, None]
    else:
        K = [args.pop_prev[0], args.pop_prev[i + 1]]

    if print_hsq1:
        log.log(log_prefix_short+'Heritability of phenotype 1')
        log.log(log_prefix_short+rghat.hsq1.summary(ref_ld_cnames, P=P[0], K=K[0]))

    log.log(
        log_prefix_short+'Heritability of phenotype {I}/{N}'.format(I=i + 2, N=len(rg_paths)))
    log.log(log_prefix_short+rghat.hsq2.summary(ref_ld_cnames, P=P[1], K=K[1]))
    log.log(log_prefix_short+'Genetic Covariance')
    log.log(log_prefix_short+rghat.gencov.summary(ref_ld_cnames, P=P, K=K))
    log.log(log_prefix_short+'Genetic Correlation')
    log.log(log_prefix_short+rghat.summary() + '\n')
|
|
541
|
+
|
|
542
|
+
|
|
543
|
+
def _merge_sumstats_sumstats(args, sumstats1, sumstats2, log):
    '''Join the summary statistics of two traits on SNP.

    Both inputs are renamed in place first: N/Z of the first table become
    N1/Z1, and A1/A2/N/Z of the second become A1x/A2x/N2/Z2.
    '''
    first_names = {'N': 'N1', 'Z': 'Z1'}
    second_names = {'A1': 'A1x', 'A2': 'A2x', 'N': 'N2', 'Z': 'Z2'}
    sumstats1.rename(columns=first_names, inplace=True)
    sumstats2.rename(columns=second_names, inplace=True)
    return _merge_and_log(sumstats1, sumstats2, 'summary statistics', log)
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def _filter_alleles(alleles):
    '''Flag the allele codes that appear in MATCH_ALLELES.

    Returns the element-wise result of that membership test, so codes outside
    the set can be dropped by the caller.
    '''
    is_match = MATCH_ALLELES.__contains__
    return alleles.apply(is_match)
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
def _align_alleles(z, alleles):
    '''Flip the sign of z wherever FLIP_ALLELES marks a reference allele flip.

    z is multiplied in place and also returned. Raises KeyError with an
    explanatory message when an allele code is not a key of FLIP_ALLELES.
    '''
    try:
        needs_flip = alleles.apply(lambda code: FLIP_ALLELES[code])
        z *= (-1) ** needs_flip
    except KeyError as e:
        msg = 'Incompatible alleles in .sumstats files: %s. ' % e.args
        msg += 'Did you forget to use --merge-alleles with munge_sumstats.py?'
        raise KeyError(msg)
    return z
|
|
567
|
+
|
|
568
|
+
|
|
569
|
+
def _rg(sumstats, args, log, M_annot, ref_ld_cnames, w_ld_cname, i):
    '''Fit the genetic correlation regression for trait 1 against trait i + 1.

    When args.chisq_max is set, SNPs with Z1^2 * Z2^2 at or above
    chisq_max^2 are removed first. Returns the reg.RG result.
    '''
    n_snp = len(sumstats)
    if args.chisq_max is not None:
        keep = sumstats.Z1**2*sumstats.Z2**2 < args.chisq_max**2
        n_snp = np.sum(keep)
        sumstats = sumstats[keep]

    def as_column(values):
        # column vector with one row per retained SNP
        return np.array(values).reshape((n_snp, 1))

    n_blocks = min(args.n_blocks, n_snp)
    ref_ld = sumstats[ref_ld_cnames].values
    intercept_hsq1 = args.intercept_h2[0]
    intercept_hsq2 = args.intercept_h2[i + 1]
    intercept_gencov = args.intercept_gencov[i + 1]

    z1 = as_column(sumstats.Z1)
    z2 = as_column(sumstats.Z2)
    weights = as_column(sumstats[w_ld_cname])
    n1 = as_column(sumstats.N1)
    n2 = as_column(sumstats.N2)

    return reg.RG(z1, z2, ref_ld, weights, n1, n2, M_annot,
                  intercept_hsq1=intercept_hsq1, intercept_hsq2=intercept_hsq2,
                  intercept_gencov=intercept_gencov, n_blocks=n_blocks, twostep=args.two_step)
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
def _parse_rg(rg):
    '''Split the comma-separated rg argument into paths and their last path components.

    Returns (rg_paths, rg_files). Raises ValueError when fewer than two
    phenotypes are named.
    '''
    paths = _splitp(rg)
    file_names = [path.split('/')[-1] for path in paths]
    if len(paths) >= 2:
        return paths, file_names
    raise ValueError(
        'Must specify at least two phenotypes for rg estimation.')
|
|
601
|
+
|
|
602
|
+
|
|
603
|
+
def _print_rg_delete_values(rg, fh, log):
    '''Write the jackknife delete values of hsq1, hsq2 and gencov to fh + '.<name>.delete'.'''
    for component in ('hsq1', 'hsq2', 'gencov'):
        estimate = getattr(rg, component)
        _print_delete_values(estimate, fh + '.' + component + '.delete', log)
|
|
608
|
+
|
|
609
|
+
|
|
610
|
+
def _print_rg_cov(rghat, fh, log):
    '''Write the covariance matrices of hsq1, hsq2 and gencov to fh + '.<name>.cov'.'''
    for component in ('hsq1', 'hsq2', 'gencov'):
        estimate = getattr(rghat, component)
        _print_cov(estimate, fh + '.' + component + '.cov', log)
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
def _split_or_none(x, n):
|
|
618
|
+
if x is not None:
|
|
619
|
+
y = map(float, x.replace('N', '-').split(','))
|
|
620
|
+
else:
|
|
621
|
+
y = [None for _ in xrange(n)]
|
|
622
|
+
return y
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
def _check_arg_len(x, n):
|
|
626
|
+
x, m = x
|
|
627
|
+
if len(x) != n:
|
|
628
|
+
raise ValueError(
|
|
629
|
+
'{M} must have the same number of arguments as --rg/--h2.'.format(M=m))
|
gwaslab/qc_check_datatype.py
CHANGED
|
@@ -33,7 +33,7 @@ def check_datatype(sumstats, verbose=True, log=Log()):
|
|
|
33
33
|
log.write(" -Verified:", " ".join(verified), verbose=verbose)
|
|
34
34
|
|
|
35
35
|
if len(raw_verified)>0:
|
|
36
|
-
log.
|
|
36
|
+
log.warning("Columns with possibly incompatible dtypes: {}".format(",".join(raw_verified)), verbose=verbose)
|
|
37
37
|
except:
|
|
38
38
|
pass
|
|
39
39
|
|
|
@@ -93,11 +93,11 @@ def check_dataframe_shape(sumstats, log, verbose):
|
|
|
93
93
|
try:
|
|
94
94
|
log.write(" -Current Dataframe shape : {} x {} ; Memory usage: {:.2f} MB".format(len(sumstats),len(sumstats.columns),memory_in_mb), verbose=verbose)
|
|
95
95
|
except:
|
|
96
|
-
log.
|
|
96
|
+
log.warning("Error: cannot get Dataframe shape...")
|
|
97
97
|
|
|
98
98
|
def check_dataframe_memory_usage(sumstats, log, verbose):
|
|
99
99
|
memory_in_mb = sumstats.memory_usage().sum()/1024/1024
|
|
100
100
|
try:
|
|
101
101
|
log.write(" -Current Dataframe memory usage: {:.2f} MB".format(memory_in_mb), verbose=verbose)
|
|
102
102
|
except:
|
|
103
|
-
log.
|
|
103
|
+
log.warning("Error: cannot get Memory usage...")
|