gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (51) hide show
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/g_Log.py +14 -5
  5. gwaslab/g_Sumstats.py +86 -18
  6. gwaslab/g_SumstatsPair.py +70 -23
  7. gwaslab/g_SumstatsT.py +2 -2
  8. gwaslab/g_version.py +10 -10
  9. gwaslab/hm_casting.py +9 -4
  10. gwaslab/hm_harmonize_sumstats.py +88 -83
  11. gwaslab/io_preformat_input.py +14 -14
  12. gwaslab/io_read_ldsc.py +49 -1
  13. gwaslab/ldsc_irwls.py +198 -0
  14. gwaslab/ldsc_jackknife.py +514 -0
  15. gwaslab/ldsc_ldscore.py +417 -0
  16. gwaslab/ldsc_parse.py +294 -0
  17. gwaslab/ldsc_regressions.py +747 -0
  18. gwaslab/ldsc_sumstats.py +629 -0
  19. gwaslab/qc_check_datatype.py +1 -1
  20. gwaslab/qc_fix_sumstats.py +163 -161
  21. gwaslab/util_ex_calculate_ldmatrix.py +2 -2
  22. gwaslab/util_ex_gwascatalog.py +24 -24
  23. gwaslab/util_ex_ldproxyfinder.py +9 -9
  24. gwaslab/util_ex_ldsc.py +189 -0
  25. gwaslab/util_in_calculate_gc.py +6 -6
  26. gwaslab/util_in_calculate_power.py +42 -43
  27. gwaslab/util_in_convert_h2.py +8 -8
  28. gwaslab/util_in_fill_data.py +28 -28
  29. gwaslab/util_in_filter_value.py +91 -52
  30. gwaslab/util_in_get_density.py +8 -8
  31. gwaslab/util_in_get_sig.py +407 -65
  32. gwaslab/viz_aux_annotate_plot.py +12 -12
  33. gwaslab/viz_aux_quickfix.py +18 -18
  34. gwaslab/viz_aux_reposition_text.py +3 -3
  35. gwaslab/viz_aux_save_figure.py +14 -5
  36. gwaslab/viz_plot_compare_af.py +29 -30
  37. gwaslab/viz_plot_compare_effect.py +63 -71
  38. gwaslab/viz_plot_miamiplot2.py +6 -6
  39. gwaslab/viz_plot_mqqplot.py +17 -3
  40. gwaslab/viz_plot_qqplot.py +1 -1
  41. gwaslab/viz_plot_regionalplot.py +33 -32
  42. gwaslab/viz_plot_rg_heatmap.py +28 -26
  43. gwaslab/viz_plot_stackedregional.py +40 -21
  44. gwaslab/viz_plot_trumpetplot.py +50 -55
  45. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  46. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
  47. gwaslab-3.4.39.dist-info/RECORD +80 -0
  48. gwaslab-3.4.38.dist-info/RECORD +0 -72
  49. /gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  50. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  51. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,629 @@
1
+ '''
2
+ (c) 2014 Brendan Bulik-Sullivan and Hilary Finucane
3
+
4
+ This module deals with getting all the data needed for LD Score regression from files
5
+ into memory and checking that the input makes sense. There is no math here. LD Score
6
+ regression is implemented in the regressions module.
7
+ '''
8
+ from __future__ import division
9
+ import numpy as np
10
+ import pandas as pd
11
+ from scipy import stats
12
+ import itertools as it
13
+ import gwaslab.ldsc_parse as ps
14
+ import gwaslab.ldsc_regressions as reg
15
+ import sys
16
+ import traceback
17
+ import copy
18
+ import os
19
+ import glob
20
+
21
# Prefixes prepended to log messages emitted by this module.
log_prefix = ' -'
# Variant used by the rg summary printer (_print_gencor).
log_prefix_short = ' -'
23
def xrange(*args):
    '''Stand-in for the Python 2 ``xrange`` builtin: forwards all arguments to ``range``.'''
    lazy_sequence = range(*args)
    return lazy_sequence
25
+
26
+ _N_CHR = 22
27
+ # complementary bases
28
+ COMPLEMENT = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
29
+ # bases
30
+ BASES = COMPLEMENT.keys()
31
+ # true iff strand ambiguous
32
+ STRAND_AMBIGUOUS = {''.join(x): x[0] == COMPLEMENT[x[1]]
33
+ for x in it.product(BASES, BASES)
34
+ if x[0] != x[1]}
35
+ # SNPS we want to keep (pairs of alleles)
36
+ VALID_SNPS = {x for x in map(lambda y: ''.join(y), it.product(BASES, BASES))
37
+ if x[0] != x[1] and not STRAND_AMBIGUOUS[x]}
38
+ # T iff SNP 1 has the same alleles as SNP 2 (allowing for strand or ref allele flip).
39
+ MATCH_ALLELES = {x for x in map(lambda y: ''.join(y), it.product(VALID_SNPS, VALID_SNPS))
40
+ # strand and ref match
41
+ if ((x[0] == x[2]) and (x[1] == x[3])) or
42
+ # ref match, strand flip
43
+ ((x[0] == COMPLEMENT[x[2]]) and (x[1] == COMPLEMENT[x[3]])) or
44
+ # ref flip, strand match
45
+ ((x[0] == x[3]) and (x[1] == x[2])) or
46
+ ((x[0] == COMPLEMENT[x[3]]) and (x[1] == COMPLEMENT[x[2]]))} # strand and ref flip
47
+ # T iff SNP 1 has the same alleles as SNP 2 w/ ref allele flip.
48
+ FLIP_ALLELES = {''.join(x):
49
+ ((x[0] == x[3]) and (x[1] == x[2])) or # strand match
50
+ # strand flip
51
+ ((x[0] == COMPLEMENT[x[3]]) and (x[1] == COMPLEMENT[x[2]]))
52
+ for x in MATCH_ALLELES}
53
+
54
+
55
+ def _splitp(fstr):
56
+ flist = fstr.split(',')
57
+ flist = [os.path.expanduser(os.path.expandvars(x)) for x in flist]
58
+ return flist
59
+
60
+
61
+ def _select_and_log(x, ii, log, msg):
62
+ '''Fiter down to rows that are True in ii. Log # of SNPs removed.'''
63
+ new_len = ii.sum()
64
+ if new_len == 0:
65
+ raise ValueError(msg.format(N=0))
66
+ else:
67
+ x = x[ii]
68
+ log.log(" -" + msg.format(N=new_len))
69
+ return x
70
+
71
+
72
def smart_merge(x, y):
    '''Combine two frames on their SNP column.

    When both frames have the same length, the same index and identical SNP
    values row by row, the (cheaper) column-wise concat is used; otherwise an
    inner merge on 'SNP' is performed.
    '''
    rows_aligned = (
        len(x) == len(y)
        and (x.index == y.index).all()
        and (x.SNP == y.SNP).all()
    )
    if rows_aligned:
        x = x.reset_index(drop=True)
        # Use the keyword form: the positional ``axis`` argument of
        # DataFrame.drop was removed in pandas 2.0.
        y = y.reset_index(drop=True).drop(columns='SNP')
        out = pd.concat([x, y], axis=1)
    else:
        out = pd.merge(x, y, how='inner', on='SNP')
    return out
81
+
82
+
83
def _read_ref_ld(args, log):
    '''Load the reference panel LD Scores named by args.ref_ld_chr / args.ref_ld and log the row count.'''
    scores = _read_chr_split_files(
        args.ref_ld_chr,
        args.ref_ld,
        log,
        'reference panel LD Score',
        ps.ldscore_fromlist,
    )
    n_rows = len(scores)
    log.log(' -Read reference panel LD Scores for {N} SNPs.'.format(N=n_rows))
    return scores
90
+
91
+
92
+ def _read_annot(args, log):
93
+ '''Read annot matrix.'''
94
+ try:
95
+ if args.ref_ld is not None:
96
+ overlap_matrix, M_tot = _read_chr_split_files(args.ref_ld_chr, args.ref_ld, log,
97
+ 'annot matrix', ps.annot, frqfile=args.frqfile)
98
+ elif args.ref_ld_chr is not None:
99
+ overlap_matrix, M_tot = _read_chr_split_files(args.ref_ld_chr, args.ref_ld, log,
100
+ 'annot matrix', ps.annot, frqfile=args.frqfile_chr)
101
+ except Exception:
102
+ log.log(' -Error parsing .annot file.')
103
+ raise
104
+
105
+ return overlap_matrix, M_tot
106
+
107
+
108
+ def _read_M(args, log, n_annot):
109
+ '''Read M (--M, --M-file, etc).'''
110
+ if args.M:
111
+ try:
112
+ M_annot = [float(x) for x in _splitp(args.M)]
113
+ except ValueError as e:
114
+ raise ValueError('Could not cast --M to float: ' + str(e.args))
115
+ else:
116
+ if args.ref_ld:
117
+ M_annot = ps.M_fromlist(
118
+ _splitp(args.ref_ld), common=(not args.not_M_5_50))
119
+ elif args.ref_ld_chr:
120
+ M_annot = ps.M_fromlist(
121
+ _splitp(args.ref_ld_chr), _N_CHR, common=(not args.not_M_5_50))
122
+
123
+ try:
124
+ M_annot = np.array(M_annot).reshape((1, n_annot))
125
+ except ValueError as e:
126
+ raise ValueError(
127
+ '# terms in --M must match # of LD Scores in --ref-ld.\n' + str(e.args))
128
+
129
+ return M_annot
130
+
131
+
132
+ def _read_w_ld(args, log):
133
+ '''Read regression SNP LD.'''
134
+ if (args.w_ld and ',' in args.w_ld) or (args.w_ld_chr and ',' in args.w_ld_chr):
135
+ raise ValueError(
136
+ '--w-ld must point to a single fileset (no commas allowed).')
137
+ w_ld = _read_chr_split_files(args.w_ld_chr, args.w_ld, log,
138
+ 'regression weight LD Score', ps.ldscore_fromlist)
139
+ if len(w_ld.columns) != 2:
140
+ raise ValueError('--w-ld may only have one LD Score column.')
141
+ w_ld.columns = ['SNP', 'LD_weights'] # prevent colname conflicts w/ ref ld
142
+ log.log(
143
+ ' -Read regression weight LD Scores for {N} SNPs.'.format(N=len(w_ld)))
144
+ return w_ld
145
+
146
+
147
+ def _read_chr_split_files(chr_arg, not_chr_arg, log, noun, parsefunc, **kwargs):
148
+ '''Read files split across 22 chromosomes (annot, ref_ld, w_ld).'''
149
+ try:
150
+ if not_chr_arg:
151
+ log.log(' -Reading {N} from {F} ... ({p})'.format(N=noun, F=not_chr_arg, p=parsefunc.__name__))
152
+ out = parsefunc(_splitp(not_chr_arg), **kwargs)
153
+ elif chr_arg:
154
+ f = ps.sub_chr(chr_arg, '[1-22]')
155
+ log.log(' -Reading {N} from {F} ... ({p})'.format(N=noun, F=f, p=parsefunc.__name__))
156
+ out = parsefunc(_splitp(chr_arg), _N_CHR, **kwargs)
157
+ except ValueError as e:
158
+ log.log(' -Error parsing {N}.'.format(N=noun))
159
+ raise e
160
+
161
+ return out
162
+
163
+
164
+ def _read_sumstats(args, log, fh, alleles=False, dropna=False):
165
+ '''Parse summary statistics.'''
166
+ #log.log(' -Reading summary statistics from {S} ...'.format(S=fh))
167
+ #sumstats = ps.sumstats(fh, alleles=alleles, dropna=dropna)
168
+ log_msg = ' -Read summary statistics for {N} SNPs.'
169
+ sumstats = fh.dropna()
170
+ log.log(log_msg.format(N=len(sumstats)))
171
+ m = len(sumstats)
172
+ sumstats = sumstats.drop_duplicates(subset='SNP')
173
+ if m > len(sumstats):
174
+ log.log(
175
+ ' -Dropped {M} SNPs with duplicated rs numbers.'.format(M=m - len(sumstats)))
176
+
177
+ return sumstats
178
+
179
+
180
+ def _check_ld_condnum(args, log, ref_ld):
181
+ '''Check condition number of LD Score matrix.'''
182
+ if len(ref_ld.shape) >= 2:
183
+ cond_num = int(np.linalg.cond(ref_ld))
184
+ if cond_num > 100000:
185
+ if args.invert_anyway:
186
+ warn = "WARNING: LD Score matrix condition number is {C}. "
187
+ warn += "Inverting anyway because the --invert-anyway flag is set."
188
+ log.log(warn.format(C=cond_num))
189
+ else:
190
+ warn = "WARNING: LD Score matrix condition number is {C}. "
191
+ warn += "Remove collinear LD Scores. "
192
+ raise ValueError(warn.format(C=cond_num))
193
+
194
+
195
+ def _check_variance(log, M_annot, ref_ld):
196
+ '''Remove zero-variance LD Scores.'''
197
+ ii = ref_ld.iloc[:, 1:].var() == 0 # NB there is a SNP column here
198
+ if ii.all():
199
+ raise ValueError('All LD Scores have zero variance.')
200
+ else:
201
+ log.log(' -Removing partitioned LD Scores with zero variance.')
202
+ ii_snp = np.array([True] + list(~ii))
203
+ ii_m = np.array(~ii)
204
+ ref_ld = ref_ld.iloc[:, ii_snp]
205
+ M_annot = M_annot[:, ii_m]
206
+
207
+ return M_annot, ref_ld, ii
208
+
209
+
210
+ def _warn_length(log, sumstats):
211
+ if len(sumstats) < 200000:
212
+ log.warning(
213
+ 'number of SNPs less than 200k; this is almost always bad.')
214
+
215
+
216
+ def _print_cov(ldscore_reg, ofh, log):
217
+ '''Prints covariance matrix of slopes.'''
218
+ log.log(
219
+ ' -Printing covariance matrix of the estimates to {F}.'.format(F=ofh))
220
+ np.savetxt(ofh, ldscore_reg.coef_cov)
221
+
222
+
223
+ def _print_delete_values(ldscore_reg, ofh, log):
224
+ '''Prints block jackknife delete-k values'''
225
+ log.log(' -Printing block jackknife delete values to {F}.'.format(F=ofh))
226
+ np.savetxt(ofh, ldscore_reg.tot_delete_values)
227
+
228
+ def _print_part_delete_values(ldscore_reg, ofh, log):
229
+ '''Prints partitioned block jackknife delete-k values'''
230
+ log.log(' -Printing partitioned block jackknife delete values to {F}.'.format(F=ofh))
231
+ np.savetxt(ofh, ldscore_reg.part_delete_values)
232
+
233
+
234
def _merge_and_log(ld, sumstats, noun, log):
    '''Merge ``ld`` with ``sumstats`` via smart_merge and report the surviving row count.

    ``noun`` names the merged-in data in the message. Raises ValueError
    (carrying the same message) when the merge leaves no rows.
    '''
    merged = smart_merge(ld, sumstats)
    n_left = len(merged)
    report = ' -After merging with {F}, {N} SNPs remain.'.format(N=n_left, F=noun)
    if n_left == 0:
        raise ValueError(report)
    log.log(report)
    return merged
244
+
245
+
246
def _read_ld_sumstats(sumstats, args, log, fh, alleles=False, dropna=True):
    '''Join summary statistics with reference and regression-weight LD Scores.

    ``sumstats`` is an in-memory table; ``fh``, ``alleles`` and ``dropna``
    are accepted but not used in this body.

    Returns ``(M_annot, w_ld_cname, ref_ld_cnames, sumstats, novar_cols)``.
    '''
    complete = sumstats.dropna()

    ref_ld = _read_ref_ld(args, log)
    n_annot = len(ref_ld.columns) - 1  # every column except SNP
    M_annot = _read_M(args, log, n_annot)
    M_annot, ref_ld, novar_cols = _check_variance(log, M_annot, ref_ld)

    w_ld = _read_w_ld(args, log)

    merged = _merge_and_log(ref_ld, complete, 'reference panel LD', log)
    merged = _merge_and_log(merged, w_ld, 'regression SNP LD', log)

    # the weight column was merged in last, so it is the final column
    w_ld_cname = merged.columns[-1]
    ref_ld_cnames = ref_ld.columns[1:]
    return M_annot, w_ld_cname, ref_ld_cnames, merged, novar_cols
259
+
260
def cell_type_specific(args, log, sumstats=None):
    '''Cell type specific analysis.

    For every ``name ld_prefix`` line in the file args.ref_ld_chr_cts, the
    cell-type LD Scores are stacked in front of the baseline LD Scores, a
    heritability regression is run, and the leading coefficient with its
    standard error and one-sided normal P value is recorded. Results are
    sorted by P value and written to ``args.out + '.cell_type_results.txt'``.

    ``sumstats`` is the in-memory summary-statistics table. It is an optional
    trailing parameter so existing two-argument calls still bind, but it is
    required in practice: the previous body called ``_read_ld_sumstats``
    without it and therefore always failed with a TypeError.
    NOTE(review): confirm callers pass the same table shape that estimate_h2
    receives (columns SNP, N, Z are read below).

    Raises ValueError when ``sumstats`` is missing or when a cell-type file
    lacks LD Scores for some regression SNP.
    '''
    if sumstats is None:
        raise ValueError('cell_type_specific requires the summary statistics table (sumstats).')

    args = copy.deepcopy(args)
    if args.intercept_h2 is not None:
        args.intercept_h2 = float(args.intercept_h2)
    if args.no_intercept:
        args.intercept_h2 = 1

    M_annot_all_regr, w_ld_cname, ref_ld_cnames_all_regr, sumstats, novar_cols = \
        _read_ld_sumstats(sumstats, args, log, args.h2_cts)
    M_tot = np.sum(M_annot_all_regr)
    # NOTE(review): this passes the column names, not the LD Score matrix.
    _check_ld_condnum(args, log, ref_ld_cnames_all_regr)
    _warn_length(log, sumstats)
    n_snp = len(sumstats)
    n_blocks = min(n_snp, args.n_blocks)
    if args.chisq_max is None:
        chisq_max = max(0.001*sumstats.N.max(), 80)
    else:
        chisq_max = args.chisq_max

    ii = np.ravel(sumstats.Z**2 < chisq_max)
    sumstats = sumstats.iloc[ii, :]
    log.log(' -Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
        C=chisq_max, N=np.sum(ii), M=n_snp-np.sum(ii)))
    n_snp = np.sum(ii)  # lambdas are late-binding, so this works
    ref_ld_all_regr = np.array(sumstats[ref_ld_cnames_all_regr]).reshape((len(sumstats), -1))
    chisq = np.array(sumstats.Z**2)
    keep_snps = sumstats[['SNP']]

    s = lambda x: np.array(x).reshape((n_snp, 1))
    results_columns = ['Name', 'Coefficient', 'Coefficient_std_error', 'Coefficient_P_value']
    results_data = []

    # Read the whole list up front and close the handle deterministically.
    with open(args.ref_ld_chr_cts) as cts_file:
        cts_entries = [line.split() for line in cts_file]

    for (name, ct_ld_chr) in cts_entries:
        ref_ld_cts_allsnps = _read_chr_split_files(ct_ld_chr, None, log,
                                                   'cts reference panel LD Score', ps.ldscore_fromlist)
        log.log(' -Performing regression.')
        # left merge keeps the regression SNP order; the SNP column is dropped
        ref_ld_cts = np.array(pd.merge(keep_snps, ref_ld_cts_allsnps, on='SNP', how='left').iloc[:, 1:])
        if np.any(np.isnan(ref_ld_cts)):
            raise ValueError('Missing some LD scores from cts files. Are you sure all SNPs in ref-ld-chr are also in ref-ld-chr-cts')

        ref_ld = np.hstack([ref_ld_cts, ref_ld_all_regr])
        M_cts = ps.M_fromlist(
            _splitp(ct_ld_chr), _N_CHR, common=(not args.not_M_5_50))
        M_annot = np.hstack([M_cts, M_annot_all_regr])
        hsqhat = reg.Hsq(s(chisq), ref_ld, s(sumstats[w_ld_cname]), s(sumstats.N),
                         M_annot, n_blocks=n_blocks, intercept=args.intercept_h2,
                         twostep=None, old_weights=True)
        coef, coef_se = hsqhat.coef[0], hsqhat.coef_se[0]
        results_data.append((name, coef, coef_se, stats.norm.sf(coef/coef_se)))
        if args.print_all_cts:
            # one extra row per additional comma-separated cell-type fileset
            for i in range(1, len(ct_ld_chr.split(','))):
                coef, coef_se = hsqhat.coef[i], hsqhat.coef_se[i]
                results_data.append((name+'_'+str(i), coef, coef_se, stats.norm.sf(coef/coef_se)))

    df_results = pd.DataFrame(data=results_data, columns=results_columns)
    df_results.sort_values(by='Coefficient_P_value', inplace=True)
    df_results.to_csv(args.out+'.cell_type_results.txt', sep='\t', index=False)
    log.log(' -Results printed to '+args.out+'.cell_type_results.txt')
319
+
320
+
321
def estimate_h2(sumstats, args, log):
    '''Estimate h2 and partitioned h2.

    Works on a deep copy of ``args``. Merges ``sumstats`` with the LD Scores,
    optionally removes SNPs with large chi^2, runs the heritability
    regression and returns ``hsqhat.summary(...)``. Depending on the flags in
    ``args`` it also writes the covariance, delete-value and overlap result
    files under the ``args.out`` prefix.
    '''
    args = copy.deepcopy(args)
    if args.samp_prev is not None and args.pop_prev is not None:
        args.samp_prev, args.pop_prev = float(args.samp_prev), float(args.pop_prev)
    if args.intercept_h2 is not None:
        args.intercept_h2 = float(args.intercept_h2)
    if args.no_intercept:
        args.intercept_h2 = 1

    M_annot, w_ld_cname, ref_ld_cnames, sumstats, novar_cols = _read_ld_sumstats(
        sumstats, args, log, args.h2)
    ref_ld = np.array(sumstats[ref_ld_cnames])
    # NOTE(review): the column names (1-D) are passed here rather than the
    # LD Score matrix, so _check_ld_condnum returns without checking.
    _check_ld_condnum(args, log, ref_ld_cnames)
    _warn_length(log, sumstats)

    n_snp = len(sumstats)
    n_blocks = min(n_snp, args.n_blocks)
    n_annot = len(ref_ld_cnames)
    chisq_max = args.chisq_max
    old_weights = n_annot != 1
    if not old_weights:
        # single LD Score: default to the two-step estimator
        if args.two_step is None and args.intercept_h2 is None:
            args.two_step = 30
    elif args.chisq_max is None:
        chisq_max = max(0.001*sumstats.N.max(), 80)

    def as_column(values):
        # n_snp is resolved at call time, so calls made after the chi^2
        # filter below use the reduced SNP count
        return np.array(values).reshape((n_snp, 1))

    chisq = as_column(sumstats.Z**2)
    if chisq_max is not None:
        keep = np.ravel(chisq < chisq_max)
        n_kept = np.sum(keep)
        sumstats = sumstats.iloc[keep, :]
        log.log(' -Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
            C=chisq_max, N=n_kept, M=n_snp-n_kept))
        n_snp = n_kept
        ref_ld = np.array(sumstats[ref_ld_cnames])
        chisq = chisq[keep].reshape((n_snp, 1))

    if args.two_step is not None:
        log.log(' -Using two-step estimator with cutoff at {M}.'.format(M=args.two_step))

    weights = as_column(sumstats[w_ld_cname])
    sample_sizes = as_column(sumstats.N)
    hsqhat = reg.Hsq(chisq, ref_ld, weights, sample_sizes,
                     M_annot, n_blocks=n_blocks, intercept=args.intercept_h2,
                     twostep=args.two_step, old_weights=old_weights)

    if args.print_cov:
        _print_cov(hsqhat, args.out + '.cov', log)
    if args.print_delete_vals:
        _print_delete_values(hsqhat, args.out + '.delete', log)
        _print_part_delete_values(hsqhat, args.out + '.part_delete', log)

    if args.overlap_annot:
        overlap_matrix, M_tot = _read_annot(args, log)
        results_path = args.out+'.results'
        df_results = hsqhat._overlap_output(ref_ld_cnames, overlap_matrix, M_annot, M_tot, args.print_coefficients)
        df_results.to_csv(results_path, sep="\t", index=False)
        log.log(' -Results printed to '+results_path)

    return hsqhat.summary(ref_ld_cnames, P=args.samp_prev, K=args.pop_prev, overlap=args.overlap_annot)
383
+
384
+
385
def estimate_rg(sumstats, other_sumstats, args, log):
    '''Estimate rg between trait 1 and a list of other traits.

    ``sumstats`` is the table for trait 1 and ``other_sumstats`` an iterable
    of tables for the remaining traits; ``args.rg`` supplies the
    comma-separated trait names/paths used for labels and output prefixes.
    Works on a deep copy of ``args``.

    A failure for one trait is logged with its traceback and recorded as
    ``None`` so the remaining traits are still processed.

    Returns ``(RG, table)``: the list of per-trait results and the summary
    DataFrame built by ``_get_rg_table``.
    Raises ValueError if --intercept-h2, --intercept-gencov, --samp-prev or
    --pop-prev does not have one entry per phenotype.
    '''
    args = copy.deepcopy(args)

    rg_paths, rg_files = _parse_rg(args.rg)
    n_pheno = len(rg_paths)

    # list() keeps this correct whether _split_or_none yields a list or a
    # lazy map object.
    args.intercept_h2 = list(_split_or_none(args.intercept_h2, n_pheno))
    args.intercept_gencov = list(_split_or_none(args.intercept_gencov, n_pheno))
    args.samp_prev = list(_split_or_none(args.samp_prev, n_pheno))
    args.pop_prev = list(_split_or_none(args.pop_prev, n_pheno))

    # An explicit loop is required: a bare map() is lazy in Python 3, so the
    # length validation was never executed.
    for values_and_flag in ((args.intercept_h2, '--intercept-h2'),
                            (args.intercept_gencov, '--intercept-gencov'),
                            (args.samp_prev, '--samp-prev'),
                            (args.pop_prev, '--pop-prev')):
        _check_arg_len(values_and_flag, n_pheno)

    if args.no_intercept:
        args.intercept_h2 = [1 for _ in range(n_pheno)]
        args.intercept_gencov = [0 for _ in range(n_pheno)]

    p1 = rg_paths[0]
    out_prefix = args.out + rg_files[0]

    M_annot, w_ld_cname, ref_ld_cnames, sumstats, _ = _read_ld_sumstats(sumstats, args, log, p1,
                                                                        alleles=True, dropna=True)
    RG = []
    n_annot = M_annot.shape[1]

    # NOTE(review): args.intercept_h2 is a list at this point, so the
    # ``is None`` test below is never true and two_step is never defaulted
    # here. Left unchanged because altering it would change the estimates.
    if n_annot == 1 and args.two_step is None and args.intercept_h2 is None:
        args.two_step = 30
    if args.two_step is not None:
        log.log(' -Using two-step estimator with cutoff at {M}.'.format(M=args.two_step))

    for i, p2 in enumerate(other_sumstats):
        log.log(
            ' -Computing rg for phenotype {I}/{N}'.format(I=i + 2, N=len(rg_paths)))
        try:
            loop = _read_other_sumstats(args, log, p2, sumstats, ref_ld_cnames)
            rghat = _rg(loop, args, log, M_annot, ref_ld_cnames, w_ld_cname, i)
            RG.append(rghat)
            _print_gencor(args, log, rghat, ref_ld_cnames, i, rg_paths, i == 0)
            out_prefix_loop = out_prefix + '_' + rg_files[i + 1]
            if args.print_cov:
                _print_rg_cov(rghat, out_prefix_loop, log)
            if args.print_delete_vals:
                _print_rg_delete_values(rghat, out_prefix_loop, log)

        except Exception:  # keep going if phenotype 50/100 causes an error
            msg = 'ERROR computing rg for phenotype {I}/{N}, from file {F}.'
            log.log(msg.format(I=i + 2, N=len(rg_paths), F=rg_paths[i + 1]))
            # format_exc() takes no exception argument in Python 3; passing
            # one made this handler itself raise.
            log.log(traceback.format_exc() + '\n')
            if len(RG) <= i:  # if exception raised before appending to RG
                RG.append(None)

    # Build the table once and reuse it for both the log and the return value.
    table_text, table = _get_rg_table(rg_paths, RG, args)
    log.log('Summary of Genetic Correlation Results\n' + table_text)
    return RG, table
453
+
454
+
455
def _read_other_sumstats(args, log, p2, sumstats, ref_ld_cnames):
    '''Prepare the second trait's summary statistics for an rg regression.

    Cleans ``p2``, merges it with the (already LD-merged) ``sumstats`` of
    trait 1, and, unless args.no_check_alleles is set, keeps only SNPs with
    compatible alleles and aligns Z2 to trait 1's reference allele. The
    allele columns are dropped from the result.
    '''
    loop = _read_sumstats(args, log, p2, alleles=True, dropna=False)
    loop = _merge_sumstats_sumstats(args, sumstats, loop, log)
    loop = loop.dropna(how='any')
    allele_cols = ['A1', 'A1x', 'A2', 'A2x']
    loop[allele_cols] = loop[allele_cols].astype("string")
    # four-letter key: trait 1 alleles followed by trait 2 alleles
    alleles = loop.A1 + loop.A2 + loop.A1x + loop.A2x
    if not args.no_check_alleles:
        valid = _filter_alleles(alleles)
        loop = _select_and_log(loop, valid, log,
                               '{N} SNPs with valid alleles.')
        # Filter the allele keys together with the rows; otherwise the
        # combinations that were just removed still reach _align_alleles
        # and trigger its KeyError.
        alleles = alleles[valid]
        loop['Z2'] = _align_alleles(loop.Z2, alleles)

    loop = loop.drop(allele_cols, axis=1)
    _check_ld_condnum(args, log, loop[ref_ld_cnames])
    _warn_length(log, loop)
    return loop
470
+
471
+
472
+ def _get_rg_table(rg_paths, RG, args):
473
+ '''Print a table of genetic correlations.'''
474
+ # fix error caused by behaviour change for map
475
+ t = lambda attr: lambda obj: getattr(obj, attr, 'NA')
476
+ x = pd.DataFrame()
477
+ x['p1'] = [rg_paths[0] for i in xrange(1, len(rg_paths))]
478
+ x['p2'] = rg_paths[1:len(rg_paths)]
479
+
480
+ #x['rg'] = map(t('rg_ratio'), RG)
481
+ #x['se'] = map(t('rg_se'), RG)
482
+ #x['z'] = map(t('z'), RG)
483
+ #x['p'] = map(t('p'), RG)
484
+
485
+ x['rg'] = [getattr(i, 'rg_ratio', 'NA') for i in RG]
486
+ x['se'] = [getattr(i, 'rg_se', 'NA') for i in RG]
487
+ x['z'] = [getattr(i, 'z', 'NA') for i in RG]
488
+ x['p'] = [getattr(i, 'p', 'NA') for i in RG]
489
+ ## i -> it
490
+ if args.samp_prev is not None and \
491
+ args.pop_prev is not None and \
492
+ all((i is not None for i in args.samp_prev)) and \
493
+ all((it is not None for it in args.pop_prev)):
494
+
495
+ #c = map(lambda x, y: reg.h2_obs_to_liab(1, x, y), args.samp_prev[1:], args.pop_prev[1:])
496
+ c = list(map(lambda x, y: reg.h2_obs_to_liab(1, x, y), args.samp_prev[1:], args.pop_prev[1:]))
497
+
498
+ #x['h2_liab'] = map(lambda x, y: x * y, c, map(t('tot'), map(t('hsq2'), RG)))
499
+ #x['h2_liab_se'] = map(lambda x, y: x * y, c, map(t('tot_se'), map(t('hsq2'), RG)))
500
+
501
+ x['h2_liab'] = [getattr(getattr(i, 'hsq2', 'NA'), 'tot', 'NA') * c[index] for index,i in enumerate(RG)]
502
+ x['h2_liab_se'] =[getattr(getattr(i, 'hsq2', 'NA'), 'tot_se', 'NA') * c[index] for index,i in enumerate(RG)]
503
+ else:
504
+ #x['h2_obs'] = map(t('tot'), map(t('hsq2'), RG))
505
+ #x['h2_obs_se'] = map(t('tot_se'), map(t('hsq2'), RG))
506
+
507
+ x['h2_obs'] = [getattr(getattr(i, 'hsq2', 'NA'), 'tot', 'NA') for i in RG]
508
+ x['h2_obs_se'] = [getattr(getattr(i, 'hsq2', 'NA'), 'tot_se', 'NA') for i in RG]
509
+
510
+ #x['h2_int'] = map(t('intercept'), map(t('hsq2'), RG))
511
+ #x['h2_int_se'] = map(t('intercept_se'), map(t('hsq2'), RG))
512
+ #x['gcov_int'] = map(t('intercept'), map(t('gencov'), RG))
513
+ #x['gcov_int_se'] = map(t('intercept_se'), map(t('gencov'), RG))
514
+
515
+ x['h2_int'] = [getattr(getattr(i, 'hsq2', 'NA'), 'intercept', 'NA') for i in RG]
516
+ x['h2_int_se'] = [getattr(getattr(i, 'hsq2', 'NA'), 'intercept_se', 'NA') for i in RG]
517
+ x['gcov_int'] = [getattr(getattr(i, 'gencov', 'NA'), 'intercept', 'NA') for i in RG]
518
+ x['gcov_int_se'] = [getattr(getattr(i, 'gencov', 'NA'), 'intercept_se', 'NA') for i in RG]
519
+ return x.to_string(header=True, index=False) + '\n', x
520
+
521
+
522
def _print_gencor(args, log, rghat, ref_ld_cnames, i, rg_paths, print_hsq1):
    '''Log the heritability, genetic covariance and genetic correlation summaries.

    ``i`` is the zero-based index of the secondary trait. The summary for
    phenotype 1 is included only when ``print_hsq1`` is true. Sample and
    population prevalences are taken from args.samp_prev / args.pop_prev at
    positions 0 and ``i + 1``; a missing list is treated as "no prevalence".
    '''
    # Resolve the None case before indexing; the previous order indexed
    # first, which made its own None check unreachable.
    if args.samp_prev is None:
        P = [None, None]
    else:
        P = [args.samp_prev[0], args.samp_prev[i + 1]]
    if args.pop_prev is None:
        K = [None, None]
    else:
        K = [args.pop_prev[0], args.pop_prev[i + 1]]

    if print_hsq1:
        log.log(log_prefix_short+'Heritability of phenotype 1')
        log.log(log_prefix_short+rghat.hsq1.summary(ref_ld_cnames, P=P[0], K=K[0]))

    log.log(
        log_prefix_short+'Heritability of phenotype {I}/{N}'.format(I=i + 2, N=len(rg_paths)))
    log.log(log_prefix_short+rghat.hsq2.summary(ref_ld_cnames, P=P[1], K=K[1]))
    log.log(log_prefix_short+'Genetic Covariance')
    log.log(log_prefix_short+rghat.gencov.summary(ref_ld_cnames, P=P, K=K))
    log.log(log_prefix_short+'Genetic Correlation')
    log.log(log_prefix_short+rghat.summary() + '\n')
541
+
542
+
543
def _merge_sumstats_sumstats(args, sumstats1, sumstats2, log):
    '''Merge the summary statistics of two traits on SNP.

    Both inputs are renamed IN PLACE first: N/Z become N1/Z1 in
    ``sumstats1``; A1/A2/N/Z become A1x/A2x/N2/Z2 in ``sumstats2``.
    ``args`` is not used.
    '''
    trait1_names = {'N': 'N1', 'Z': 'Z1'}
    trait2_names = {'A1': 'A1x', 'A2': 'A2x', 'N': 'N2', 'Z': 'Z2'}
    sumstats1.rename(columns=trait1_names, inplace=True)
    sumstats2.rename(columns=trait2_names, inplace=True)
    return _merge_and_log(sumstats1, sumstats2, 'summary statistics', log)
550
+
551
+
552
def _filter_alleles(alleles):
    '''Flag the allele strings that appear in MATCH_ALLELES (element-wise membership test).'''
    def is_match(combo):
        return combo in MATCH_ALLELES

    return alleles.apply(is_match)
556
+
557
+
558
def _align_alleles(z, alleles):
    '''Flip the sign of ``z`` wherever FLIP_ALLELES marks the allele string as ref-flipped.

    ``z`` is updated with an in-place multiply and returned. Raises KeyError
    with an explanatory message when an allele string is not a key of
    FLIP_ALLELES.
    '''
    def needs_flip(combo):
        return FLIP_ALLELES[combo]

    try:
        # (-1) ** True == -1 and (-1) ** False == 1
        z *= (-1) ** alleles.apply(needs_flip)
    except KeyError as e:
        msg = 'Incompatible alleles in .sumstats files: %s. ' % e.args
        msg += 'Did you forget to use --merge-alleles with munge_sumstats.py?'
        raise KeyError(msg)
    return z
567
+
568
+
569
def _rg(sumstats, args, log, M_annot, ref_ld_cnames, w_ld_cname, i):
    '''Run the genetic-correlation regression for secondary trait ``i``.

    SNPs with Z1^2 * Z2^2 >= args.chisq_max^2 are removed first when
    args.chisq_max is set. Intercept constraints come from position 0
    (trait 1) and position ``i + 1`` (this trait) of the per-trait lists on
    ``args``. Returns the ``reg.RG`` result.
    '''
    n_snp = len(sumstats)

    def as_column(values):
        # n_snp is looked up when called, i.e. after any filtering below
        return np.array(values).reshape((n_snp, 1))

    if args.chisq_max is not None:
        keep = sumstats.Z1**2*sumstats.Z2**2 < args.chisq_max**2
        n_snp = np.sum(keep)
        sumstats = sumstats[keep]

    n_blocks = min(args.n_blocks, n_snp)
    ref_ld = sumstats[ref_ld_cnames].values

    hsq1_intercept = args.intercept_h2[0]
    hsq2_intercept = args.intercept_h2[i + 1]
    gencov_intercept = args.intercept_gencov[i + 1]

    z1 = as_column(sumstats.Z1)
    z2 = as_column(sumstats.Z2)
    weights = as_column(sumstats[w_ld_cname])
    n1 = as_column(sumstats.N1)
    n2 = as_column(sumstats.N2)

    return reg.RG(z1, z2, ref_ld, weights, n1, n2, M_annot,
                  intercept_hsq1=hsq1_intercept, intercept_hsq2=hsq2_intercept,
                  intercept_gencov=gencov_intercept, n_blocks=n_blocks,
                  twostep=args.two_step)
590
+
591
+
592
def _parse_rg(rg):
    '''Split args.rg into its paths and their final '/'-separated components.

    Returns ``(rg_paths, rg_files)``. Raises ValueError when fewer than two
    phenotypes are listed.
    '''
    rg_paths = _splitp(rg)
    rg_files = []
    for path in rg_paths:
        rg_files.append(path.split('/')[-1])

    if len(rg_paths) < 2:
        raise ValueError(
            'Must specify at least two phenotypes for rg estimation.')
    return rg_paths, rg_files
601
+
602
+
603
def _print_rg_delete_values(rg, fh, log):
    '''Write the jackknife delete values of hsq1, hsq2 and gencov to ``fh`` + '.<part>.delete'.'''
    for part in ('hsq1', 'hsq2', 'gencov'):
        target = fh + '.' + part + '.delete'
        _print_delete_values(getattr(rg, part), target, log)
608
+
609
+
610
def _print_rg_cov(rghat, fh, log):
    '''Write the coefficient covariance of hsq1, hsq2 and gencov to ``fh`` + '.<part>.cov'.'''
    for part in ('hsq1', 'hsq2', 'gencov'):
        target = fh + '.' + part + '.cov'
        _print_cov(getattr(rghat, part), target, log)
615
+
616
+
617
+ def _split_or_none(x, n):
618
+ if x is not None:
619
+ y = map(float, x.replace('N', '-').split(','))
620
+ else:
621
+ y = [None for _ in xrange(n)]
622
+ return y
623
+
624
+
625
+ def _check_arg_len(x, n):
626
+ x, m = x
627
+ if len(x) != n:
628
+ raise ValueError(
629
+ '{M} must have the same number of arguments as --rg/--h2.'.format(M=m))
@@ -33,7 +33,7 @@ def check_datatype(sumstats, verbose=True, log=Log()):
33
33
  log.write(" -Verified:", " ".join(verified), verbose=verbose)
34
34
 
35
35
  if len(raw_verified)>0:
36
- log.warning("Columns with possibly incompatable dtypes: {}".format(",".join(raw_verified)), verbose=verbose)
36
+ log.warning("Columns with possibly incompatible dtypes: {}".format(",".join(raw_verified)), verbose=verbose)
37
37
  except:
38
38
  pass
39
39