gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (57) hide show
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/data/formatbook.json +722 -721
  5. gwaslab/g_Log.py +22 -5
  6. gwaslab/g_Sumstats.py +110 -163
  7. gwaslab/g_SumstatsPair.py +76 -25
  8. gwaslab/g_SumstatsT.py +2 -2
  9. gwaslab/g_Sumstats_summary.py +3 -3
  10. gwaslab/g_version.py +10 -10
  11. gwaslab/hm_casting.py +36 -17
  12. gwaslab/hm_harmonize_sumstats.py +354 -221
  13. gwaslab/hm_rsid_to_chrpos.py +1 -1
  14. gwaslab/io_preformat_input.py +49 -43
  15. gwaslab/io_read_ldsc.py +49 -1
  16. gwaslab/io_to_formats.py +428 -295
  17. gwaslab/ldsc_irwls.py +198 -0
  18. gwaslab/ldsc_jackknife.py +514 -0
  19. gwaslab/ldsc_ldscore.py +417 -0
  20. gwaslab/ldsc_parse.py +294 -0
  21. gwaslab/ldsc_regressions.py +747 -0
  22. gwaslab/ldsc_sumstats.py +629 -0
  23. gwaslab/qc_check_datatype.py +3 -3
  24. gwaslab/qc_fix_sumstats.py +891 -778
  25. gwaslab/util_ex_calculate_ldmatrix.py +31 -13
  26. gwaslab/util_ex_gwascatalog.py +25 -25
  27. gwaslab/util_ex_ldproxyfinder.py +10 -10
  28. gwaslab/util_ex_ldsc.py +189 -0
  29. gwaslab/util_ex_process_ref.py +3 -3
  30. gwaslab/util_ex_run_coloc.py +26 -4
  31. gwaslab/util_in_calculate_gc.py +6 -6
  32. gwaslab/util_in_calculate_power.py +42 -43
  33. gwaslab/util_in_convert_h2.py +8 -8
  34. gwaslab/util_in_fill_data.py +30 -30
  35. gwaslab/util_in_filter_value.py +201 -74
  36. gwaslab/util_in_get_density.py +10 -10
  37. gwaslab/util_in_get_sig.py +445 -71
  38. gwaslab/viz_aux_annotate_plot.py +12 -12
  39. gwaslab/viz_aux_quickfix.py +42 -37
  40. gwaslab/viz_aux_reposition_text.py +10 -7
  41. gwaslab/viz_aux_save_figure.py +18 -8
  42. gwaslab/viz_plot_compare_af.py +32 -33
  43. gwaslab/viz_plot_compare_effect.py +63 -71
  44. gwaslab/viz_plot_miamiplot2.py +34 -26
  45. gwaslab/viz_plot_mqqplot.py +126 -75
  46. gwaslab/viz_plot_qqplot.py +11 -8
  47. gwaslab/viz_plot_regionalplot.py +36 -33
  48. gwaslab/viz_plot_rg_heatmap.py +28 -26
  49. gwaslab/viz_plot_stackedregional.py +40 -21
  50. gwaslab/viz_plot_trumpetplot.py +65 -61
  51. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  52. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
  53. gwaslab-3.4.39.dist-info/RECORD +80 -0
  54. gwaslab-3.4.37.dist-info/RECORD +0 -72
  55. /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  56. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  57. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,747 @@
1
+ '''
2
+ (c) 2014 Brendan Bulik-Sullivan and Hilary Finucane
3
+
4
+ Estimators of heritability and genetic correlation.
5
+
6
+ Shape convention is (n_snp, n_annot) for all classes.
7
+ Last column = intercept.
8
+
9
+ '''
10
+ from __future__ import division
11
+ import numpy as np
12
+ import pandas as pd
13
+ from scipy.stats import norm, chi2
14
+ import gwaslab.ldsc_jackknife as jk
15
+ from gwaslab.ldsc_irwls import IRWLS
16
+ from scipy.stats import t as tdist
17
+ from collections import namedtuple
18
# Make NumPy raise FloatingPointError for divide-by-zero and invalid
# floating-point operations; code in this module catches that exception.
np.seterr(divide='raise', invalid='raise')

# Prefixes prepended to the lines of the text summaries built in this module.
log_prefix = ' -'
log_prefix_short = ' -'


def s(x):
    '''Render x through np.matrix as a string with the square brackets removed.'''
    return remove_brackets(str(np.matrix(x)))
23
+
24
def xrange(*args):
    '''Python 2 style name: forward all positional arguments to range.'''
    return range(*args)
26
+
27
def update_separators(s, ii):
    '''
    Map block separators computed on a masked data set back to unmasked indices.

    Parameters
    ----------
    s : sequence of int
        Separators computed on the rows selected by ii (first and last entries
        are the outer boundaries and are replaced, not mapped).
    ii : boolean array
        Mask that selected the rows; np.squeeze(ii) must be one-dimensional.

    Returns
    -------
    t : np.ndarray
        0, then the interior separators translated to unmasked row indices,
        then len(ii).

    '''
    # Position in the unmasked data of every row kept by the mask.
    unmasked_positions = np.arange(len(ii))[np.squeeze(ii)]
    interior = np.apply_along_axis(
        lambda idx: unmasked_positions[idx], 0, s[1:-1])
    return np.hstack((0, interior, len(ii)))
34
+
35
+
36
def p_z_norm(est, se):
    '''
    Convert an estimate and its standard error to a P-value and a Z-score.

    Parameters
    ----------
    est : float
        Point estimate.
    se : float
        Standard error of the estimate.

    Returns
    -------
    P : float
        chi2.sf(Z ** 2, 1), i.e. the upper tail of a 1-d.f. chi-square at Z**2.
    Z : float
        est / se. If the division fails (zero se), Z is infinite with the sign
        of est (+inf when est is not negative), and P is 0.

    '''
    try:
        Z = est / se
    except (FloatingPointError, ZeroDivisionError):
        # Keep the direction of the effect instead of always reporting +inf.
        Z = -float('inf') if est < 0 else float('inf')

    P = chi2.sf(Z ** 2, 1, loc=0, scale=1)  # 0 if Z is +/-inf
    return P, Z
45
+
46
+
47
def remove_brackets(x):
    '''Delete every '[' and ']' from x, then strip leading and trailing whitespace.'''
    without_brackets = x.translate(str.maketrans('', '', '[]'))
    return without_brackets.strip()
50
+
51
+
52
def append_intercept(x):
    '''
    Add a trailing column of ones (the intercept) to a regression design matrix.

    Parameters
    ----------
    x : np.matrix with shape (n_row, n_col)
        Design matrix. Columns are predictors; rows are observations.

    Returns
    -------
    x_new : np.matrix with shape (n_row, n_col+1)
        x with a column of ones concatenated on the right.

    '''
    ones_column = np.ones((x.shape[0], 1))
    return np.concatenate((x, ones_column), axis=1)
71
+
72
+
73
def remove_intercept(x):
    '''Return x without its last column (the intercept column by convention).'''
    return x[:, :-1]
77
+
78
+
79
def gencov_obs_to_liab(gencov_obs, P1, P2, K1, K2):
    '''
    Rescale an observed-scale genetic covariance (ascertained sample) to the
    liability scale in the population.

    Parameters
    ----------
    gencov_obs : float
        Genetic covariance on the observed scale in an ascertained sample.
    P1, P2 : float in (0,1)
        Prevalences of phenotypes 1,2 in the sample.
    K1, K2 : float in (0,1)
        Prevalences of phenotypes 1,2 in the population.

    Returns
    -------
    gencov_liab : float
        Genetic covariance between liabilities in the population.

    Note: if a trait is a QT, set P = K = None; that trait then contributes a
    factor of 1.

    '''
    gencov_liab = gencov_obs
    for sample_prev, pop_prev in ((P1, K1), (P2, K2)):
        if sample_prev is None or pop_prev is None:
            factor = 1
        else:
            # Square root of the per-trait h2 conversion factor.
            factor = np.sqrt(h2_obs_to_liab(1, sample_prev, pop_prev))
        gencov_liab = gencov_liab * factor

    return gencov_liab
109
+
110
+
111
def h2_obs_to_liab(h2_obs, P, K):
    '''
    Converts heritability on the observed scale in an ascertained sample to heritability
    on the liability scale in the population.

    Parameters
    ----------
    h2_obs : float
        Heritability on the observed scale in an ascertained sample.
    P : float in (0,1), NaN or None
        Prevalence of the phenotype in the sample.
    K : float in (0,1), NaN or None
        Prevalence of the phenotype in the population.

    Returns
    -------
    h2_liab : float
        Heritability of liability in the population. When both P and K are
        missing (NaN or None), h2_obs is returned unchanged.

    Raises
    ------
    ValueError
        If K or P is <= 0 or >= 1.

    '''
    # None is accepted as a synonym for NaN ("no prevalence given"); np.isnan
    # itself rejects None with a TypeError.
    P = np.nan if P is None else P
    K = np.nan if K is None else K

    if np.isnan(P) and np.isnan(K):
        return h2_obs
    if K <= 0 or K >= 1:
        raise ValueError('K must be in the range (0,1)')
    if P <= 0 or P >= 1:
        raise ValueError('P must be in the range (0,1)')

    # Standard-normal threshold with upper-tail probability K.
    thresh = norm.isf(K)
    conversion_factor = K ** 2 * \
        (1 - K) ** 2 / (P * (1 - P) * norm.pdf(thresh) ** 2)
    return h2_obs * conversion_factor
142
+
143
+
144
class LD_Score_Regression(object):
    '''
    Shared machinery for LD Score regression estimators.

    Subclasses are expected to provide the class attribute __null_intercept__
    (read by aggregate), and the methods _update_weights (called once for the
    initial weights) and _update_func (called by IRWLS to refresh weights).

    Attributes set by __init__
    --------------------------
    n_annot, n_blocks, constrain_intercept, intercept, intercept_se,
    twostep_filtered, coef / coef_cov / coef_se, cat / cat_cov / cat_se,
    tot / tot_cov / tot_se, prop / prop_cov / prop_se, enrichment, M_prop,
    jknife, tot_delete_values, part_delete_values, M and, when the intercept is
    free, intercept_delete_values.

    '''

    def __init__(self, y, x, w, N, M, n_blocks, intercept=None, slow=False, step1_ii=None, old_weights=False):
        '''
        Fit the regression and store estimates with block-jackknife errors.

        Parameters
        ----------
        y : 2D array with shape (n_snp, 1)
            Response (z1z2 or chisq).
        x : 2D array with shape (n_snp, n_annot)
            LD Scores.
        w : 2D array with shape (n_snp, 1)
            LD Scores passed to the weight functions.
        N : 2D array with shape (n_snp, 1)
            Sample sizes.
        M : 2D array with shape (1, n_annot)
            Number of SNPs per annotation.
        n_blocks : int
            Number of jackknife blocks.
        intercept : float or None
            None fits a free intercept; any other value constrains it.
        slow : bool
            Forwarded to IRWLS.
        step1_ii : boolean array or None
            Mask of the SNPs used in step 1 of the two-step estimator.
        old_weights : bool
            Use a single weighted least-squares fit instead of IRWLS.

        Raises
        ------
        TypeError
            If an array argument has no shape or is not two-dimensional.
        ValueError
            On inconsistent shapes, or when two-step is combined with a
            constrained intercept or with more than one annotation.

        '''
        for arr in [y, x, w, M, N]:
            try:
                n_dim = len(arr.shape)
            except AttributeError:
                raise TypeError('Arguments must be arrays.') from None
            if n_dim != 2:
                raise TypeError('Arguments must be 2D arrays.')

        n_snp, self.n_annot = x.shape
        if any(i.shape != (n_snp, 1) for i in [y, w, N]):
            raise ValueError(
                'N, weights and response (z1z2 or chisq) must have shape (n_snp, 1).')
        if M.shape != (1, self.n_annot):
            raise ValueError('M must have shape (1, n_annot).')

        M_tot = float(np.sum(M))
        x_tot = np.sum(x, axis=1).reshape((n_snp, 1))
        self.constrain_intercept = intercept is not None
        self.intercept = intercept
        self.n_blocks = n_blocks
        # Crude aggregate estimate, used only to build the initial weights.
        tot_agg = self.aggregate(y, x_tot, N, M_tot, intercept)
        initial_w = self._update_weights(
            x_tot, w, N, M_tot, tot_agg, intercept)
        Nbar = np.mean(N)  # keep condition number low
        x = np.multiply(N, x) / Nbar
        if not self.constrain_intercept:
            x, x_tot = append_intercept(x), append_intercept(x_tot)
            yp = y
        else:
            yp = y - intercept
            self.intercept_se = 'NA'
        del y
        self.twostep_filtered = None
        if step1_ii is not None and self.constrain_intercept:
            raise ValueError(
                'twostep is not compatible with constrain_intercept.')
        elif step1_ii is not None and self.n_annot > 1:
            raise ValueError(
                'twostep not compatible with partitioned LD Score yet.')
        elif step1_ii is not None:
            # Step 1: free-intercept fit restricted to the SNPs selected by step1_ii.
            n1 = np.sum(step1_ii)
            self.twostep_filtered = n_snp - n1
            x1 = x[np.squeeze(step1_ii), :]
            yp1, w1, N1, initial_w1 = map(
                lambda a: a[step1_ii].reshape((n1, 1)), (yp, w, N, initial_w))
            update_func1 = lambda a: self._update_func(
                a, x1, w1, N1, M_tot, Nbar, ii=step1_ii)
            step1_jknife = IRWLS(
                x1, yp1, update_func1, n_blocks, slow=slow, w=initial_w1)
            step1_int, _ = self._intercept(step1_jknife)
            # Step 2: all SNPs, with the step-1 intercept subtracted from the response.
            yp = yp - step1_int
            x = remove_intercept(x)
            x_tot = remove_intercept(x_tot)
            update_func2 = lambda a: self._update_func(
                a, x_tot, w, N, M_tot, Nbar, step1_int)
            separators = update_separators(step1_jknife.separators, step1_ii)
            step2_jknife = IRWLS(
                x, yp, update_func2, n_blocks, slow=slow, w=initial_w, separators=separators)
            c = np.sum(np.multiply(initial_w, x)) / \
                np.sum(np.multiply(initial_w, np.square(x)))
            jknife = self._combine_twostep_jknives(
                step1_jknife, step2_jknife, M_tot, c, Nbar)
        elif old_weights:
            initial_w = np.sqrt(initial_w)
            x = IRWLS._weight(x, initial_w)
            y = IRWLS._weight(yp, initial_w)
            jknife = jk.LstsqJackknifeFast(x, y, n_blocks)
        else:
            update_func = lambda a: self._update_func(
                a, x_tot, w, N, M_tot, Nbar, intercept)
            jknife = IRWLS(
                x, yp, update_func, n_blocks, slow=slow, w=initial_w)

        self.coef, self.coef_cov, self.coef_se = self._coef(jknife, Nbar)
        self.cat, self.cat_cov, self.cat_se =\
            self._cat(jknife, M, Nbar, self.coef, self.coef_cov)

        self.tot, self.tot_cov, self.tot_se = self._tot(self.cat, self.cat_cov)
        self.prop, self.prop_cov, self.prop_se =\
            self._prop(jknife, M, Nbar, self.cat, self.tot)

        self.enrichment, self.M_prop = self._enrichment(
            M, M_tot, self.cat, self.tot)
        if not self.constrain_intercept:
            self.intercept, self.intercept_se = self._intercept(jknife)

        self.jknife = jknife
        self.tot_delete_values = self._delete_vals_tot(jknife, Nbar, M)
        self.part_delete_values = self._delete_vals_part(jknife, Nbar, M)
        if not self.constrain_intercept:
            # The intercept is the column after the n_annot coefficient columns.
            self.intercept_delete_values = jknife.delete_values[
                :, self.n_annot]

        self.M = M

    @classmethod
    def aggregate(cls, y, x, N, M, intercept=None):
        '''
        Return M * (mean(y) - intercept) / mean(x * N).

        When intercept is None, cls.__null_intercept__ is used, so the class
        must define that attribute in that case.
        '''
        if intercept is None:
            intercept = cls.__null_intercept__

        num = M * (np.mean(y) - intercept)
        denom = np.mean(np.multiply(x, N))
        return num / denom

    def _update_func(self, x, ref_ld_tot, w_ld, N, M, Nbar, intercept=None, ii=None):
        '''Weight update hook for IRWLS; must be implemented by subclasses.'''
        raise NotImplementedError

    def _delete_vals_tot(self, jknife, Nbar, M):
        '''Get delete values for total h2 or gencov.'''
        n_annot = self.n_annot
        tot_delete_vals = jknife.delete_values[
            :, 0:n_annot]  # shape (n_blocks, n_annot)
        # shape (n_blocks, 1)
        tot_delete_vals = np.dot(tot_delete_vals, M.T) / Nbar
        return tot_delete_vals

    def _delete_vals_part(self, jknife, Nbar, M):
        '''Get delete values for partitioned h2 or gencov.'''
        n_annot = self.n_annot
        return jknife.delete_values[:, 0:n_annot] / Nbar

    def _coef(self, jknife, Nbar):
        '''Get coefficient estimates + cov from the jackknife (undoing the Nbar scaling).'''
        n_annot = self.n_annot
        coef = jknife.est[0, 0:n_annot] / Nbar
        coef_cov = jknife.jknife_cov[0:n_annot, 0:n_annot] / Nbar ** 2
        coef_se = np.sqrt(np.diag(coef_cov))
        return coef, coef_cov, coef_se

    def _cat(self, jknife, M, Nbar, coef, coef_cov):
        '''Convert coefficients to per-category h2 or gencov.'''
        cat = np.multiply(M, coef)
        cat_cov = np.multiply(np.dot(M.T, M), coef_cov)
        cat_se = np.sqrt(np.diag(cat_cov))
        return cat, cat_cov, cat_se

    def _tot(self, cat, cat_cov):
        '''Convert per-category h2 to total h2 or gencov.'''
        tot = np.sum(cat)
        tot_cov = np.sum(cat_cov)
        tot_se = np.sqrt(tot_cov)
        return tot, tot_cov, tot_se

    def _prop(self, jknife, M, Nbar, cat, tot):
        '''Convert total h2 and per-category h2 to per-category proportion h2 or gencov.'''
        n_annot = self.n_annot
        n_blocks = jknife.delete_values.shape[0]
        numer_delete_vals = np.multiply(
            M, jknife.delete_values[:, 0:n_annot]) / Nbar  # (n_blocks, n_annot)
        denom_delete_vals = np.sum(
            numer_delete_vals, axis=1).reshape((n_blocks, 1))
        # Repeat the per-block total across all annotation columns.
        denom_delete_vals = np.dot(denom_delete_vals, np.ones((1, n_annot)))
        prop = jk.RatioJackknife(
            cat / tot, numer_delete_vals, denom_delete_vals)
        return prop.est, prop.jknife_cov, prop.jknife_se

    def _enrichment(self, M, M_tot, cat, tot):
        '''Compute proportion of SNPs per-category enrichment for h2 or gencov.'''
        M_prop = M / M_tot
        enrichment = np.divide(cat, M) / (tot / M_tot)
        return enrichment, M_prop

    def _intercept(self, jknife):
        '''Extract intercept and intercept SE from block jackknife.'''
        n_annot = self.n_annot
        intercept = jknife.est[0, n_annot]
        intercept_se = jknife.jknife_se[0, n_annot]
        return intercept, intercept_se

    def _combine_twostep_jknives(self, step1_jknife, step2_jknife, M_tot, c, Nbar=1):
        '''
        Combine free intercept and constrained intercept jackknives for --two-step.

        Returns a namedtuple with fields est, jknife_se, jknife_est,
        jknife_var, jknife_cov and delete_values.

        Raises
        ------
        ValueError
            If step 1 has more than one non-intercept column.
        '''
        n_blocks, n_annot = step1_jknife.delete_values.shape
        n_annot -= 1  # last column of step 1 is the intercept
        # The reshape to (n_blocks, n_annot) below is only valid for a single
        # annotation, so anything larger is rejected up front.
        if n_annot > 1:
            raise ValueError(
                'twostep not yet implemented for partitioned LD Score.')

        step1_int, _ = self._intercept(step1_jknife)
        est = np.hstack(
            (step2_jknife.est, np.array(step1_int).reshape((1, 1))))
        delete_values = np.zeros((n_blocks, n_annot + 1))
        delete_values[:, n_annot] = step1_jknife.delete_values[:, n_annot]
        delete_values[:, 0:n_annot] = step2_jknife.delete_values -\
            c * (step1_jknife.delete_values[:, n_annot] -
                 step1_int).reshape((n_blocks, n_annot))  # check this
        pseudovalues = jk.Jackknife.delete_values_to_pseudovalues(
            delete_values, est)
        jknife_est, jknife_var, jknife_se, jknife_cov = jk.Jackknife.jknife(
            pseudovalues)
        jknife = namedtuple('jknife',
                            ['est', 'jknife_se', 'jknife_est', 'jknife_var', 'jknife_cov', 'delete_values'])
        return jknife(est, jknife_se, jknife_est, jknife_var, jknife_cov, delete_values)
338
+
339
+
340
class Hsq(LD_Score_Regression):
    '''
    LD Score regression of chi-square statistics (heritability estimator).

    In addition to the attributes set by LD_Score_Regression.__init__, stores
    mean_chisq, lambda_gc, ratio and ratio_se. ratio and ratio_se are the
    string 'NA' when the intercept is constrained or mean chi^2 <= 1.
    '''

    # Intercept assumed when none is supplied.
    __null_intercept__ = 1

    def __init__(self, y, x, w, N, M, n_blocks=200, intercept=None, slow=False, twostep=None, old_weights=False):
        '''
        Parameters are those of LD_Score_Regression.__init__, except:

        y : 2D array with shape (n_snp, 1)
            Chi-square statistics.
        twostep : float or None
            If given, step 1 of the two-step estimator uses the SNPs with
            y < twostep.
        '''
        step1_ii = None
        if twostep is not None:
            step1_ii = y < twostep

        LD_Score_Regression.__init__(self, y, x, w, N, M, n_blocks, intercept=intercept,
                                     slow=slow, step1_ii=step1_ii, old_weights=old_weights)
        self.mean_chisq, self.lambda_gc = self._summarize_chisq(y)
        if not self.constrain_intercept:
            self.ratio, self.ratio_se = self._ratio(
                self.intercept, self.intercept_se, self.mean_chisq)
        else:
            # No ratio exists for a constrained intercept; define the
            # attributes anyway (same 'NA' marker as intercept_se) so that
            # reading them does not raise AttributeError.
            self.ratio = self.ratio_se = 'NA'

    def _update_func(self, x, ref_ld_tot, w_ld, N, M, Nbar, intercept=None, ii=None):
        '''
        Update function for IRWLS

        x is the output of np.linalg.lstsq.
        x[0] is the regression coefficients
        x[0].shape is (# of dimensions, 1)
        the last element of x[0] is the intercept.

        intercept is None --> free intercept
        intercept is not None --> constrained intercept

        Raises ValueError if an intercept is given but ref_ld_tot has more
        than one column.
        '''
        hsq = M * x[0][0] / Nbar
        if intercept is None:
            intercept = max(x[0][1])  # divide by zero error if intercept < 0
        else:
            if ref_ld_tot.shape[1] > 1:
                raise ValueError(
                    'Design matrix has intercept column for constrained intercept regression!')

        ld = ref_ld_tot[:, 0].reshape(w_ld.shape)  # remove intercept
        w = self.weights(ld, w_ld, N, M, hsq, intercept, ii)
        return w

    def _summarize_chisq(self, chisq):
        '''Compute mean chi^2 and lambda_GC (median chi^2 divided by 0.4549).'''
        mean_chisq = np.mean(chisq)
        # median and matrix don't play nice
        lambda_gc = np.median(np.asarray(chisq)) / 0.4549
        return mean_chisq, lambda_gc

    def _ratio(self, intercept, intercept_se, mean_chisq):
        '''
        Compute ratio (intercept - 1) / (mean chi^2 -1 ) and its SE.

        Both are the string 'NA' when mean chi^2 is not greater than 1.
        '''
        if mean_chisq > 1:
            ratio_se = intercept_se / (mean_chisq - 1)
            ratio = (intercept - 1) / (mean_chisq - 1)
        else:
            ratio = 'NA'
            ratio_se = 'NA'

        return ratio, ratio_se

    def _overlap_output(self, category_names, overlap_matrix, M_annot, M_tot, print_coefficients):
        '''
        LD Score regression summary for overlapping categories.

        Returns a pandas DataFrame with one row per category; the coefficient
        columns are included only when print_coefficients is true.
        '''
        overlap_matrix_prop = np.zeros([self.n_annot, self.n_annot])
        for i in range(self.n_annot):
            overlap_matrix_prop[i, :] = overlap_matrix[i, :] / M_annot

        prop_hsq_overlap = np.dot(
            overlap_matrix_prop, self.prop.T).reshape((1, self.n_annot))
        prop_hsq_overlap_var = np.diag(
            np.dot(np.dot(overlap_matrix_prop, self.prop_cov), overlap_matrix_prop.T))
        # Clip negative variances at 0 before taking the square root.
        prop_hsq_overlap_se = np.sqrt(
            np.maximum(0, prop_hsq_overlap_var)).reshape((1, self.n_annot))
        one_d_convert = lambda x: np.array(x).reshape(np.prod(x.shape))
        prop_M_overlap = M_annot / M_tot
        enrichment = prop_hsq_overlap / prop_M_overlap
        enrichment_se = prop_hsq_overlap_se / prop_M_overlap
        overlap_matrix_diff = np.zeros([self.n_annot, self.n_annot])
        for i in range(self.n_annot):
            # Rows for a category containing every SNP stay zero (avoids 0 division).
            if not M_tot == M_annot[0, i]:
                overlap_matrix_diff[i, :] = overlap_matrix[i, :] / M_annot[0, i] - \
                    (M_annot - overlap_matrix[i, :]) / (M_tot - M_annot[0, i])

        diff_est = np.dot(overlap_matrix_diff, self.coef)
        diff_cov = np.dot(np.dot(overlap_matrix_diff, self.coef_cov), overlap_matrix_diff.T)
        diff_se = np.sqrt(np.diag(diff_cov))
        # Two-sided t-test p-value with n_blocks degrees of freedom.
        diff_p = ['NA' if diff_se[i] == 0 else 2 * tdist.sf(abs(diff_est[i] / diff_se[i]), self.n_blocks)
                  for i in range(self.n_annot)]

        df = pd.DataFrame({
            'Category': category_names,
            'Prop._SNPs': one_d_convert(prop_M_overlap),
            'Prop._h2': one_d_convert(prop_hsq_overlap),
            'Prop._h2_std_error': one_d_convert(prop_hsq_overlap_se),
            'Enrichment': one_d_convert(enrichment),
            'Enrichment_std_error': one_d_convert(enrichment_se),
            'Enrichment_p': diff_p,
            'Coefficient': one_d_convert(self.coef),
            'Coefficient_std_error': self.coef_se,
            'Coefficient_z-score': one_d_convert(self.coef) / one_d_convert(self.coef_se)
        })
        if print_coefficients:
            df = df[['Category', 'Prop._SNPs', 'Prop._h2', 'Prop._h2_std_error',
                     'Enrichment', 'Enrichment_std_error', 'Enrichment_p',
                     'Coefficient', 'Coefficient_std_error', 'Coefficient_z-score']]
        else:
            df = df[['Category', 'Prop._SNPs', 'Prop._h2', 'Prop._h2_std_error',
                     'Enrichment', 'Enrichment_std_error', 'Enrichment_p']]
        return df

    def summary(self, ref_ld_colnames=None, P=None, K=None, overlap=False):
        '''
        Return a text summary of the LD Score Regression.

        When both P and K are given, totals and per-category values are
        rescaled with h2_obs_to_liab(1, P, K) and labelled 'Liability';
        otherwise they are labelled 'Observed'.
        '''
        if P is not None and K is not None:
            T = 'Liability'
            c = h2_obs_to_liab(1, P, K)
        else:
            T = 'Observed'
            c = 1

        out = ['Total ' + T + ' scale h2: ' +
               s(c * self.tot) + ' (' + s(c * self.tot_se) + ')']
        if self.n_annot > 1:
            if ref_ld_colnames is None:
                ref_ld_colnames = ['CAT_' + str(i)
                                   for i in range(self.n_annot)]

            out.append(log_prefix_short+'Categories: ' + ' '.join(ref_ld_colnames))

            if not overlap:
                out.append(log_prefix+T + ' scale h2: ' + s(c * self.cat))
                out.append(log_prefix+T + ' scale h2 SE: ' + s(c * self.cat_se))
                out.append(log_prefix+'Proportion of SNPs: ' + s(self.M_prop))
                out.append(log_prefix+'Proportion of h2g: ' + s(self.prop))
                out.append(log_prefix+'Enrichment: ' + s(self.enrichment))
                out.append(log_prefix+'Coefficients: ' + s(self.coef))
                out.append(log_prefix+'Coefficient SE: ' + s(self.coef_se))

        out.append(log_prefix+'Lambda GC: ' + s(self.lambda_gc))
        out.append(log_prefix+'Mean Chi^2: ' + s(self.mean_chisq))
        if self.constrain_intercept:
            out.append(
                log_prefix+'Intercept: constrained to {C}'.format(C=s(self.intercept)))
        else:
            out.append(
                log_prefix+ 'Intercept: ' + s(self.intercept) + ' (' + s(self.intercept_se) + ')')
            if self.mean_chisq > 1:
                if self.ratio < 0:
                    out.append(
                        log_prefix+'Ratio < 0 (usually indicates GC correction).')
                else:
                    out.append(
                        log_prefix+'Ratio: ' + s(self.ratio) + ' (' + s(self.ratio_se) + ')')
            else:
                out.append(log_prefix+'Ratio: NA (mean chi^2 < 1)')

        return remove_brackets('\n'.join(out))

    def _update_weights(self, ld, w_ld, N, M, hsq, intercept, ii=None):
        '''Initial regression weights; a None intercept is replaced by __null_intercept__.'''
        if intercept is None:
            intercept = self.__null_intercept__

        return self.weights(ld, w_ld, N, M, hsq, intercept, ii)

    @classmethod
    def weights(cls, ld, w_ld, N, M, hsq, intercept=None, ii=None):
        '''
        Regression weights.

        Parameters
        ----------
        ld : np.matrix with shape (n_snp, 1)
            LD Scores (non-partitioned).
        w_ld : np.matrix with shape (n_snp, 1)
            LD Scores (non-partitioned) computed with sum r^2 taken over only those SNPs included
            in the regression.
        N : np.matrix of ints > 0 with shape (n_snp, 1)
            Number of individuals sampled for each SNP.
        M : float > 0
            Number of SNPs used for estimating LD Score (need not equal number of SNPs included in
            the regression).
        hsq : float in [0,1]
            Heritability estimate; values outside [0,1] are clipped.
        intercept : float or None
            Regression intercept; None means 1.
        ii : unused
            Accepted so that the signature lines up with the callers.

        Returns
        -------
        w : np.matrix with shape (n_snp, 1)
            Regression weights. Approx equal to reciprocal of conditional variance function.

        '''
        M = float(M)
        if intercept is None:
            intercept = 1

        hsq = max(hsq, 0.0)
        hsq = min(hsq, 1.0)
        # LD Scores below 1 are raised to 1.
        ld = np.fmax(ld, 1.0)
        w_ld = np.fmax(w_ld, 1.0)
        c = hsq * N / M
        het_w = 1.0 / (2 * np.square(intercept + np.multiply(c, ld)))
        oc_w = 1.0 / w_ld
        w = np.multiply(het_w, oc_w)
        return w
540
+
541
+
542
class Gencov(LD_Score_Regression):
    '''
    LD Score regression of z1*z2 products (genetic covariance estimator).

    In addition to the attributes set by LD_Score_Regression.__init__, stores
    the inputs hsq1, hsq2, intercept_hsq1, intercept_hsq2, N1, N2 and the
    results p, z (from p_z_norm on tot / tot_se) and mean_z1z2.
    '''

    # Intercept assumed when none is supplied.
    __null_intercept__ = 0

    def __init__(self, z1, z2, x, w, N1, N2, M, hsq1, hsq2, intercept_hsq1, intercept_hsq2,
                 n_blocks=200, intercept_gencov=None, slow=False, twostep=None):
        '''
        Parameters
        ----------
        z1, z2 : 2D arrays with shape (n_snp, 1)
            Z-scores of the two studies.
        x, w, M, n_blocks, slow :
            Forwarded to LD_Score_Regression.__init__.
        N1, N2 : 2D arrays with shape (n_snp, 1)
            Sample sizes of the two studies.
        hsq1, hsq2 : float
            Heritability estimates used in the weights.
        intercept_hsq1, intercept_hsq2 : float or None
            Heritability regression intercepts used in the weights.
        intercept_gencov : float or None
            None fits a free intercept; any other value constrains it.
        twostep : float or None
            If given, step 1 uses the SNPs with z1^2 < twostep and z2^2 < twostep.
        '''
        self.intercept_hsq1 = intercept_hsq1
        self.intercept_hsq2 = intercept_hsq2
        self.hsq1 = hsq1
        self.hsq2 = hsq2
        self.N1 = N1
        self.N2 = N2
        # Elementwise operations throughout: for np.matrix inputs `*` and `**`
        # would mean matrix product / matrix power.
        y = np.multiply(z1, z2)
        step1_ii = None
        if twostep is not None:
            step1_ii = np.logical_and(
                np.square(z1) < twostep, np.square(z2) < twostep)

        LD_Score_Regression.__init__(self, y, x, w, np.sqrt(np.multiply(N1, N2)), M, n_blocks,
                                     intercept=intercept_gencov, slow=slow, step1_ii=step1_ii)
        self.p, self.z = p_z_norm(self.tot, self.tot_se)
        self.mean_z1z2 = np.mean(np.multiply(z1, z2))

    def summary(self, ref_ld_colnames, P=None, K=None):
        '''
        Return a text summary of the LD Score regression.

        P and K are pairs of sample / population prevalences. When all four
        entries are given, values are rescaled with gencov_obs_to_liab and
        labelled 'Liability'; otherwise they are labelled 'Observed'.
        '''
        out = []
        if P is not None and K is not None and\
                all((i is not None for i in P)) and all((i is not None for i in K)):
            T = 'Liability'
            c = gencov_obs_to_liab(1, P[0], P[1], K[0], K[1])
        else:
            T = 'Observed'
            c = 1

        out.append('Total ' + T + ' scale gencov: ' +
                   s(c * self.tot) + ' (' + s(c * self.tot_se) + ')')
        if self.n_annot > 1:
            out.append(log_prefix+'Categories: ' + str(' '.join(ref_ld_colnames)))
            out.append(log_prefix+T + ' scale gencov: ' + s(c * self.cat))
            out.append(log_prefix+T + ' scale gencov SE: ' + s(c * self.cat_se))
            out.append(log_prefix+'Proportion of SNPs: ' + s(self.M_prop))
            out.append(log_prefix+'Proportion of gencov: ' + s(self.prop))
            out.append(log_prefix+'Enrichment: ' + s(self.enrichment))

        out.append(log_prefix+'Mean z1*z2: ' + s(self.mean_z1z2))
        if self.constrain_intercept:
            out.append(
                log_prefix+'Intercept: constrained to {C}'.format(C=s(self.intercept)))
        else:
            out.append(
                log_prefix+'Intercept: ' + s(self.intercept) + ' (' + s(self.intercept_se) + ')')

        return remove_brackets('\n'.join(out))

    def _update_func(self, x, ref_ld_tot, w_ld, N, M, Nbar, intercept=None, ii=None):
        '''
        Update function for IRWLS
        x is the output of np.linalg.lstsq.
        x[0] is the regression coefficients
        x[0].shape is (# of dimensions, 1)
        the last element of x[0] is the intercept.

        When ii is given, self.N1 and self.N2 are subset with it before the
        weights are computed. The N argument itself is not used.

        '''
        rho_g = M * x[0][0] / Nbar
        if intercept is None:  # if the regression includes an intercept
            intercept = x[0][1]

        # remove intercept if we have one
        ld = ref_ld_tot[:, 0].reshape(w_ld.shape)
        if ii is not None:
            N1 = self.N1[ii].reshape((w_ld.shape))
            N2 = self.N2[ii].reshape((w_ld.shape))
        else:
            N1 = self.N1
            N2 = self.N2

        return self.weights(ld, w_ld, N1, N2, np.sum(M), self.hsq1, self.hsq2, rho_g,
                            intercept, self.intercept_hsq1, self.intercept_hsq2, ii)

    def _update_weights(self, ld, w_ld, sqrt_n1n2, M, rho_g, intercept, ii=None):
        '''
        Weight function with the same signature for Hsq and Gencov.

        sqrt_n1n2 and ii are accepted for signature compatibility only; the
        weights are computed from self.N1 and self.N2.
        '''
        w = self.weights(ld, w_ld, self.N1, self.N2, M, self.hsq1, self.hsq2, rho_g,
                         intercept, self.intercept_hsq1, self.intercept_hsq2)
        return w

    @classmethod
    def weights(cls, ld, w_ld, N1, N2, M, h1, h2, rho_g, intercept_gencov=None,
                intercept_hsq1=None, intercept_hsq2=None, ii=None):
        '''
        Regression weights.

        Parameters
        ----------
        ld : np.matrix with shape (n_snp, 1)
            LD Scores (non-partitioned)
        w_ld : np.matrix with shape (n_snp, 1)
            LD Scores (non-partitioned) computed with sum r^2 taken over only those SNPs included
            in the regression.
        M : float > 0
            Number of SNPs used for estimating LD Score (need not equal number of SNPs included in
            the regression).
        N1, N2 : np.matrix of ints > 0 with shape (n_snp, 1)
            Number of individuals sampled for each SNP for each study.
        h1, h2 : float in [0,1]
            Heritability estimates for each study; clipped to [0,1].
        rho_g : float in [-1,1]
            Genetic covariance estimate; clipped to [-1,1].
        intercept_gencov : float or None
            Genetic covariance intercept, on the z1*z2 scale (so should be Ns*rho/sqrt(N1*N2)).
            None means 0.
        intercept_hsq1, intercept_hsq2 : float or None
            Heritability intercepts for each study. None means 1.
        ii : unused
            Accepted so that the signature lines up with the callers.

        Returns
        -------
        w : np.matrix with shape (n_snp, 1)
            Regression weights. Approx equal to reciprocal of conditional variance function.

        Raises
        ------
        FloatingPointError
            If computing the reciprocal raises a floating-point error.

        '''
        M = float(M)
        if intercept_gencov is None:
            intercept_gencov = 0
        if intercept_hsq1 is None:
            intercept_hsq1 = 1
        if intercept_hsq2 is None:
            intercept_hsq2 = 1

        h1, h2 = max(h1, 0.0), max(h2, 0.0)
        h1, h2 = min(h1, 1.0), min(h2, 1.0)
        rho_g = min(rho_g, 1.0)
        rho_g = max(rho_g, -1.0)
        # LD Scores below 1 are raised to 1.
        ld = np.fmax(ld, 1.0)
        w_ld = np.fmax(w_ld, 1.0)
        a = np.multiply(N1, h1 * ld) / M + intercept_hsq1
        b = np.multiply(N2, h2 * ld) / M + intercept_hsq2
        sqrt_n1n2 = np.sqrt(np.multiply(N1, N2))
        c = np.multiply(sqrt_n1n2, rho_g * ld) / M + intercept_gencov
        try:
            het_w = 1.0 / (np.multiply(a, b) + np.square(c))
        except FloatingPointError:  # bizarre error; should never happen
            raise FloatingPointError('Why did you set hsq intercept <= 0?')

        oc_w = 1.0 / w_ld
        w = np.multiply(het_w, oc_w)
        return w
682
+
683
+
684
class RG(object):
    '''
    Genetic correlation from two Hsq fits and one Gencov fit.

    Attributes
    ----------
    hsq1, hsq2 : Hsq
    gencov : Gencov (its N1 / N2 are reset to None after fitting)
    rg_ratio, rg_jknife, rg_se, p, z :
        Floats, or the string 'NA' for all of them when either total h2
        estimate is <= 0.
    intercept_gencov : the value passed to the constructor.
    '''

    def __init__(self, z1, z2, x, w, N1, N2, M, intercept_hsq1=None, intercept_hsq2=None,
                 intercept_gencov=None, n_blocks=200, slow=False, twostep=None):
        '''
        Fit Hsq on z1^2 and z2^2 and Gencov on (z1, z2), then form
        gencov.tot / sqrt(hsq1.tot * hsq2.tot) with a ratio jackknife.

        Arguments are forwarded to Hsq and Gencov.
        '''
        self.intercept_gencov = intercept_gencov
        self._negative_hsq = None
        n_snp, n_annot = x.shape  # values unused; kept as an early 2-D check on x
        hsq1 = Hsq(np.square(z1), x, w, N1, M, n_blocks=n_blocks, intercept=intercept_hsq1,
                   slow=slow, twostep=twostep)
        hsq2 = Hsq(np.square(z2), x, w, N2, M, n_blocks=n_blocks, intercept=intercept_hsq2,
                   slow=slow, twostep=twostep)
        gencov = Gencov(z1, z2, x, w, N1, N2, M, hsq1.tot, hsq2.tot, hsq1.intercept,
                        hsq2.intercept, n_blocks, intercept_gencov=intercept_gencov, slow=slow,
                        twostep=twostep)
        gencov.N1 = None  # save memory
        gencov.N2 = None
        self.hsq1, self.hsq2, self.gencov = hsq1, hsq2, gencov
        if (hsq1.tot <= 0 or hsq2.tot <= 0):
            self._negative_hsq = True
            self.rg_ratio = self.rg = self.rg_se = 'NA'
            # Defined here as well so every result attribute exists in both branches.
            self.rg_jknife = 'NA'
            self.p = self.z = 'NA'
        else:
            rg_ratio = np.array(
                gencov.tot / np.sqrt(hsq1.tot * hsq2.tot)).reshape((1, 1))
            denom_delete_values = np.sqrt(
                np.multiply(hsq1.tot_delete_values, hsq2.tot_delete_values))
            rg = jk.RatioJackknife(
                rg_ratio, gencov.tot_delete_values, denom_delete_values)
            # Squeeze to 0-d before float(): converting arrays with ndim > 0
            # to a Python scalar is deprecated in recent NumPy.
            self.rg_jknife = float(np.squeeze(rg.jknife_est))
            self.rg_se = float(np.squeeze(rg.jknife_se))
            self.rg_ratio = float(np.squeeze(rg_ratio))
            self.p, self.z = p_z_norm(self.rg_ratio, self.rg_se)

    def summary(self, silly=False):
        '''
        Return a text summary of the genetic correlation.

        Reports 'nan' lines with a warning when an h2 estimate was <= 0, or
        when |rg| > 1.2 and silly is false; otherwise reports rg (SE),
        Z-score and P.
        '''
        out = []
        if self._negative_hsq:
            out.append(log_prefix_short+'Genetic Correlation: nan (nan) (h2 out of bounds) ')
            out.append(log_prefix+'Z-score: nan (nan) (h2 out of bounds)')
            out.append(log_prefix+'P: nan (nan) (h2 out of bounds)')
            out.append(log_prefix+'WARNING: One of the h2\'s was out of bounds.')
            out.append(
                log_prefix+'This usually indicates a data-munging error ' +
                'or that h2 or N is low.')
        elif (self.rg_ratio > 1.2 or self.rg_ratio < -1.2) and not silly:
            out.append(log_prefix_short+'Genetic Correlation: nan (nan) (rg out of bounds) ')
            out.append(log_prefix+'Z-score: nan (nan) (rg out of bounds)')
            out.append(log_prefix+'P: nan (nan) (rg out of bounds)')
            out.append(log_prefix+'WARNING: rg was out of bounds.')
            if self.intercept_gencov is None:
                out.append(
                    log_prefix+'This often means that h2 is not significantly ' +
                    'different from zero.')
            else:
                out.append(
                    log_prefix+'This often means that you have constrained' +
                    ' the intercepts to the wrong values.')
        else:
            out.append(
                'Genetic Correlation: ' + s(self.rg_ratio) +
                ' (' + s(self.rg_se) + ')')
            out.append(log_prefix+'Z-score: ' + s(self.z))
            out.append(log_prefix+'P: ' + s(self.p))
        return remove_brackets('\n'.join(out))