gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (51) hide show
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/g_Log.py +14 -5
  5. gwaslab/g_Sumstats.py +86 -18
  6. gwaslab/g_SumstatsPair.py +70 -23
  7. gwaslab/g_SumstatsT.py +2 -2
  8. gwaslab/g_version.py +10 -10
  9. gwaslab/hm_casting.py +9 -4
  10. gwaslab/hm_harmonize_sumstats.py +88 -83
  11. gwaslab/io_preformat_input.py +14 -14
  12. gwaslab/io_read_ldsc.py +49 -1
  13. gwaslab/ldsc_irwls.py +198 -0
  14. gwaslab/ldsc_jackknife.py +514 -0
  15. gwaslab/ldsc_ldscore.py +417 -0
  16. gwaslab/ldsc_parse.py +294 -0
  17. gwaslab/ldsc_regressions.py +747 -0
  18. gwaslab/ldsc_sumstats.py +629 -0
  19. gwaslab/qc_check_datatype.py +1 -1
  20. gwaslab/qc_fix_sumstats.py +163 -161
  21. gwaslab/util_ex_calculate_ldmatrix.py +2 -2
  22. gwaslab/util_ex_gwascatalog.py +24 -24
  23. gwaslab/util_ex_ldproxyfinder.py +9 -9
  24. gwaslab/util_ex_ldsc.py +189 -0
  25. gwaslab/util_in_calculate_gc.py +6 -6
  26. gwaslab/util_in_calculate_power.py +42 -43
  27. gwaslab/util_in_convert_h2.py +8 -8
  28. gwaslab/util_in_fill_data.py +28 -28
  29. gwaslab/util_in_filter_value.py +91 -52
  30. gwaslab/util_in_get_density.py +8 -8
  31. gwaslab/util_in_get_sig.py +407 -65
  32. gwaslab/viz_aux_annotate_plot.py +12 -12
  33. gwaslab/viz_aux_quickfix.py +18 -18
  34. gwaslab/viz_aux_reposition_text.py +3 -3
  35. gwaslab/viz_aux_save_figure.py +14 -5
  36. gwaslab/viz_plot_compare_af.py +29 -30
  37. gwaslab/viz_plot_compare_effect.py +63 -71
  38. gwaslab/viz_plot_miamiplot2.py +6 -6
  39. gwaslab/viz_plot_mqqplot.py +17 -3
  40. gwaslab/viz_plot_qqplot.py +1 -1
  41. gwaslab/viz_plot_regionalplot.py +33 -32
  42. gwaslab/viz_plot_rg_heatmap.py +28 -26
  43. gwaslab/viz_plot_stackedregional.py +40 -21
  44. gwaslab/viz_plot_trumpetplot.py +50 -55
  45. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  46. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
  47. gwaslab-3.4.39.dist-info/RECORD +80 -0
  48. gwaslab-3.4.38.dist-info/RECORD +0 -72
  49. /gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  50. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  51. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,417 @@
1
+ from __future__ import division
2
+ import numpy as np
3
+ import bitarray as ba
4
+
5
def xrange(*args):
    '''
    Python 2 compatibility shim: forward every argument to the built-in range.
    '''
    return range(*args)
7
+
8
def getBlockLefts(coords, max_dist):
    '''
    Converts coordinates + max block length to an array holding, for every SNP, the
    index of the leftmost SNP to be included in its block.

    Parameters
    ----------
    coords : array
        Array of coordinates. Must be sorted.
    max_dist : float
        Maximum distance between SNPs included in the same window.

    Returns
    -------
    block_left : 1D np.ndarray of floats with same length as coords
        block_left[i] := min{k | dist(i, k) <= max_dist}.

    '''
    n_snps = len(coords)
    block_left = np.zeros(n_snps)
    left = 0
    for idx in range(n_snps):
        here = coords[idx]
        # the left pointer is shared across iterations and only ever moves rightwards
        while left < n_snps and abs(coords[left] - here) > max_dist:
            left += 1

        block_left[idx] = left

    return block_left
36
+
37
+
38
def block_left_to_right(block_left):
    '''
    Converts block lefts to block rights.

    Parameters
    ----------
    block_left : array
        Array of block lefts.

    Returns
    -------
    block_right : 1D np.ndarray of floats with same length as block_left
        block_right[i] := first index k reached by a left-to-right scan with
        block_left[k] > i, or len(block_left) if there is none. This is an exclusive
        right bound: one past the last k with block_left[k] <= i.

    '''
    n_snps = len(block_left)
    block_right = np.zeros(n_snps)
    right = 0
    for idx in range(n_snps):
        # the right pointer is shared across iterations and never moves back
        while right < n_snps and block_left[right] <= idx:
            right += 1

        block_right[idx] = right

    return block_right
63
+
64
+
65
class __GenotypeArrayInMemory__(object):
    '''
    Parent class for various classes containing interfaces for files with genotype
    matrices, e.g., plink .bed files, etc

    Subclasses are expected to provide __read__, __filter_indivs__,
    __filter_snps_maf__ and nextSNPs; the versions defined here only raise
    NotImplementedError (nextSNPs is not defined here at all).
    '''
    def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
        '''
        Read the genotype file, then filter individuals and SNPs.

        Parameters
        ----------
        fname : str
            File name handed to __read__.
        n : int
            Number of individuals in the file.
        snp_list : object
            Must expose IDList (its length is the number of SNPs) and df, which is
            indexed with the columns CHR, SNP, BP and CM.
        keep_snps : sequence of int, optional
            Zero-based indices of the SNPs to keep. None keeps every SNP that passes
            the MAF filter.
        keep_indivs : sequence of int, optional
            Zero-based indices of the individuals to keep. None keeps everyone.
        mafMin : float, optional
            Minor allele frequency threshold passed to __filter_snps_maf__.
            None is treated as 0.

        Raises
        ------
        ValueError
            If an index in keep_indivs / keep_snps is negative or not smaller than the
            number of individuals / SNPs, or if nothing remains after filtering.

        '''
        self.m = len(snp_list.IDList)
        self.n = n
        self.keep_snps = keep_snps
        self.keep_indivs = keep_indivs
        self.df = np.array(snp_list.df[['CHR', 'SNP', 'BP', 'CM']])
        self.colnames = ['CHR', 'SNP', 'BP', 'CM']
        self.mafMin = mafMin if mafMin is not None else 0
        self._currentSNP = 0
        (self.nru, self.geno) = self.__read__(fname, self.m, n)
        # filter individuals
        if keep_indivs is not None:
            keep_indivs = np.array(keep_indivs, dtype='int')
            # valid indices are 0..n-1, so an index equal to n is already out of bounds
            if np.any(keep_indivs < 0) or np.any(keep_indivs >= self.n):
                raise ValueError('keep_indivs indices out of bounds')

            (self.geno, self.m, self.n) = self.__filter_indivs__(self.geno, keep_indivs, self.m,
                                                                 self.n)

            if self.n > 0:
                print('After filtering, {n} individuals remain'.format(n=self.n))
            else:
                raise ValueError('After filtering, no individuals remain')

        # filter SNPs
        if keep_snps is not None:
            keep_snps = np.array(keep_snps, dtype='int')
            # valid indices are 0..m-1, so an index equal to m is already out of bounds
            if np.any(keep_snps < 0) or np.any(keep_snps >= self.m):
                raise ValueError('keep_snps indices out of bounds')

        # the MAF filter always runs, even when keep_snps is None
        (self.geno, self.m, self.n, self.kept_snps, self.freq) = self.__filter_snps_maf__(
            self.geno, self.m, self.n, self.mafMin, keep_snps)

        if self.m > 0:
            print('After filtering, {m} SNPs remain'.format(m=self.m))
        else:
            raise ValueError('After filtering, no SNPs remain')

        self.df = self.df[self.kept_snps, :]
        self.maf = np.minimum(self.freq, np.ones(self.m)-self.freq)
        self.sqrtpq = np.sqrt(self.freq*(np.ones(self.m)-self.freq))
        self.df = np.c_[self.df, self.maf]
        self.colnames.append('MAF')

    def __read__(self, fname, m, n):
        '''Return (nru, geno) for the file fname. Must be implemented by subclasses.'''
        raise NotImplementedError

    def __filter_indivs__(self, geno, keep_indivs, m, n):
        '''Return (geno, m, n) restricted to keep_indivs. Must be implemented by subclasses.'''
        raise NotImplementedError

    def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps):
        '''
        Return (geno, m, n, kept_snps, freq) after SNP / MAF filtering. Must be
        implemented by subclasses; this is the hook that __init__ calls.
        '''
        raise NotImplementedError

    def __filter_maf_(self, geno, m, n, maf):
        '''Legacy placeholder kept for compatibility; nothing in this class calls it.'''
        raise NotImplementedError

    def ldScoreVarBlocks(self, block_left, c, annot=None):
        '''Computes an unbiased estimate of L2(j) for j=1,..,M.'''
        def unbiased_l2(x):
            return self.__l2_unbiased__(x, self.n)

        snp_getter = self.nextSNPs
        return self.__corSumVarBlocks__(block_left, c, unbiased_l2, snp_getter, annot)

    def ldScoreBlockJackknife(self, block_left, c, annot=None, jN=10):
        '''
        Delegates to self.__corSumBlockJackknife__ with np.square as the function.

        NOTE(review): __corSumBlockJackknife__ is not defined in this class; unless a
        subclass provides it, calling this method raises AttributeError.
        '''
        func = np.square
        snp_getter = self.nextSNPs
        return self.__corSumBlockJackknife__(block_left, c, func, snp_getter, annot, jN)

    def __l2_unbiased__(self, x, n):
        '''Return x^2 - (1 - x^2) / denom, with denom = n - 2 when n > 2 and n otherwise.'''
        denom = n-2 if n > 2 else n  # allow n<2 for testing purposes
        sq = np.square(x)
        return sq - (1-sq) / denom

    # general methods for calculating sums of Pearson correlation coefficients
    def __corSumVarBlocks__(self, block_left, c, func, snp_getter, annot=None):
        '''
        Parameters
        ----------
        block_left : np.ndarray with shape (M, )
            block_left[i] = index of leftmost SNP included in LD Score of SNP i.
            if c > 1, then only entries that are multiples of c are examined, and it is
            assumed that block_left[a*c+i] = block_left[a*c], except at
            the beginning of the chromosome where the 0th SNP is included in the window.

        c : int
            Chunk size.
        func : function
            Function to be applied to the genotype correlation matrix. Before dotting with
            annot. Examples: for biased L2, np.square. For biased L4,
            lambda x: np.square(np.square(x)). For L1, lambda x: x.
        snp_getter : function(int)
            The method to be used to get the next SNPs (normalized genotypes? Normalized
            genotypes with the minor allele as reference allele? etc)
        annot: numpy array with shape (m,n_a)
            SNP annotations. Defaults to a single column of ones.

        Returns
        -------
        cor_sum : np.ndarray with shape (M, num_annots)
            Estimates.

        Raises
        ------
        ValueError
            If annot does not have one row per SNP.

        '''
        m, n = self.m, self.n
        block_sizes = np.array(np.arange(m) - block_left)
        block_sizes = np.ceil(block_sizes / c)*c
        if annot is None:
            annot = np.ones((m, 1))
        else:
            annot_m = annot.shape[0]
            if annot_m != self.m:
                raise ValueError('Incorrect number of SNPs in annot')

        n_a = annot.shape[1]  # number of annotations
        cor_sum = np.zeros((m, n_a))
        # b = index of first SNP for which SNP 0 is not included in LD Score
        # (test whether such an index exists, not whether the index values are nonzero)
        outside = np.nonzero(block_left > 0)[0]
        if outside.size > 0:
            b = outside[0]
        else:
            b = m
        b = int(np.ceil(b/c)*c)  # round up to a multiple of c
        if b > m:
            c = 1
            b = m
        l_A = 0  # l_A := index of leftmost SNP in matrix A
        A = snp_getter(b)
        rfuncAB = np.zeros((b, c))
        rfuncBB = np.zeros((c, c))
        # chunk inside of block
        for l_B in range(0, b, c):  # l_B := index of leftmost SNP in matrix B
            B = A[:, l_B:l_B+c]
            np.dot(A.T, B / n, out=rfuncAB)
            rfuncAB = func(rfuncAB)
            cor_sum[l_A:l_A+b, :] += np.dot(rfuncAB, annot[l_B:l_B+c, :])
        # chunk to right of block
        b0 = b
        md = int(c*np.floor(m/c))
        # when m is not a multiple of c, one extra iteration handles the short last chunk
        end = md + 1 if md != m else md
        for l_B in range(b0, end, c):
            # check if the annot matrix is all zeros for this block + chunk
            # this happens w/ sparse categories (i.e., pathways)
            # update the block
            old_b = b
            b = int(block_sizes[l_B])
            if l_B > b0 and b > 0:
                # block_size can't increase more than c
                # block_size can't be less than c unless it is zero
                # both of these things make sense
                A = np.hstack((A[:, old_b-b+c:old_b], B))
                l_A += old_b-b+c
            elif l_B == b0 and b > 0:
                A = A[:, b0-b:b0]
                l_A = b0-b
            elif b == 0:  # no SNPs to left in window, e.g., after a sequence gap
                A = np.array(()).reshape((n, 0))
                l_A = l_B
            if l_B == md:
                # last, shorter chunk: shrink c and the work buffers to match
                c = m - md
                rfuncAB = np.zeros((b, c))
                rfuncBB = np.zeros((c, c))
            if b != old_b:
                rfuncAB = np.zeros((b, c))

            # the SNPs are always fetched so the getter stays in step, even if skipped below
            B = snp_getter(c)
            p1 = np.all(annot[l_A:l_A+b, :] == 0)
            p2 = np.all(annot[l_B:l_B+c, :] == 0)
            if p1 and p2:
                continue

            np.dot(A.T, B / n, out=rfuncAB)
            rfuncAB = func(rfuncAB)
            cor_sum[l_A:l_A+b, :] += np.dot(rfuncAB, annot[l_B:l_B+c, :])
            cor_sum[l_B:l_B+c, :] += np.dot(annot[l_A:l_A+b, :].T, rfuncAB).T
            np.dot(B.T, B / n, out=rfuncBB)
            rfuncBB = func(rfuncBB)
            cor_sum[l_B:l_B+c, :] += np.dot(rfuncBB, annot[l_B:l_B+c, :])

        return cor_sum
244
+
245
+
246
class PlinkBEDFile(__GenotypeArrayInMemory__):
    '''
    Interface for Plink .bed format
    '''
    def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
        '''
        Set up the genotype decoding table, then defer to the parent constructor, which
        reads the file and filters individuals and SNPs. All arguments are forwarded
        unchanged to the parent constructor.
        '''
        # decoded value -> 2-bit pattern; nextSNPs treats the value 9 as a missing genotype
        self._bedcode = {
            2: ba.bitarray('11'),
            9: ba.bitarray('10'),
            1: ba.bitarray('01'),
            0: ba.bitarray('00')
        }

        __GenotypeArrayInMemory__.__init__(self, fname, n, snp_list, keep_snps=keep_snps,
                                           keep_indivs=keep_indivs, mafMin=mafMin)

    def __read__(self, fname, m, n):
        '''
        Read a Plink .bed file into a little-endian bitarray.

        Parameters
        ----------
        fname : str
            File name; must end in .bed.
        m : int
            Number of SNPs (the length check uses self.m).
        n : int
            Number of individuals.

        Returns
        -------
        (nru, geno) : (int, bitarray)
            nru is n rounded up to a multiple of 4; geno holds the genotype bits that
            follow the three header bytes.

        Raises
        ------
        ValueError
            If fname does not end in .bed.
        IOError
            If the magic number or mode byte is not the expected one, or the number of
            genotype bits differs from 2*m*nru.

        '''
        if not fname.endswith('.bed'):
            raise ValueError('.bed filename must end in .bed')

        # the context manager closes the handle on every path, including the raises below
        with open(fname, 'rb') as fh:
            magicNumber = ba.bitarray(endian="little")
            magicNumber.fromfile(fh, 2)
            bedMode = ba.bitarray(endian="little")
            bedMode.fromfile(fh, 1)
            # 2 bits per genotype, so rounding n up to a multiple of 4 gives whole bytes per SNP
            e = (4 - n % 4) if n % 4 != 0 else 0
            nru = n + e
            self.nru = nru
            # check magic number
            if magicNumber != ba.bitarray('0011011011011000'):
                raise IOError("Magic number from Plink .bed file not recognized")

            if bedMode != ba.bitarray('10000000'):
                raise IOError("Plink .bed file must be in default SNP-major mode")

            self.geno = ba.bitarray(endian="little")
            self.geno.fromfile(fh)

        # check file length
        self.__test_length__(self.geno, self.m, self.nru)
        return (self.nru, self.geno)

    def __test_length__(self, geno, m, nru):
        '''Raise IOError unless geno holds exactly 2*m*nru bits.'''
        exp_len = 2*m*nru
        real_len = len(geno)
        if real_len != exp_len:
            s = "Plink .bed file has {n1} bits, expected {n2}"
            raise IOError(s.format(n1=real_len, n2=exp_len))

    def __filter_indivs__(self, geno, keep_indivs, m, n):
        '''
        Build a new genotype bitarray containing only the individuals in keep_indivs.

        Returns (geno, m, n_new) where n_new = len(keep_indivs), and updates self.nru to
        n_new rounded up to a multiple of 4. Padding positions are left as zero bits.
        '''
        n_new = len(keep_indivs)
        pad = (4 - n_new % 4) if n_new % 4 != 0 else 0
        nru_new = n_new + pad
        nru = self.nru
        z = ba.bitarray(m*2*nru_new, endian="little")
        z.setall(0)
        for new_idx, old_idx in enumerate(keep_indivs):
            # strided copies move both bits of this individual for all m SNPs at once
            z[2*new_idx::2*nru_new] = geno[2*old_idx::2*nru]
            z[2*new_idx+1::2*nru_new] = geno[2*old_idx+1::2*nru]

        self.nru = nru_new
        return (z, m, n_new)

    def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps):
        '''
        Credit to Chris Chang and the Plink2 developers for this algorithm
        Modified from plink_filter.c
        https://github.com/chrchang/plink-ng/blob/master/plink_filter.c

        Genotypes are read forwards (since we are cheating and using endian="little")

        A := (genotype) & 1010...
        B := (genotype) & 0101...
        C := (A >> 1) & B

        Then

        a := A.count() = missing ct + hom major ct
        b := B.count() = het ct + hom major ct
        c := C.count() = hom major ct

        Which implies that

        missing ct = a - c
        # of indivs with nonmissing genotype = n - a + c
        major allele ct = b + c
        major allele frequency = (b+c)/(2*(n-a+c))
        het ct + missing ct = a + b - 2*c

        Why does bitarray not have >> ????

        Returns
        -------
        (geno, m_poly, n, kept_snps, freq)
            geno holds only the kept SNPs, m_poly is how many were kept, kept_snps are
            their indices into the input, and freq their allele frequencies as
            computed above. A SNP is kept when min(f, 1-f) > mafMin and it is not made
            up solely of het / missing genotypes.

        '''
        nru = self.nru
        m_poly = 0
        y = ba.bitarray()
        if keep_snps is None:
            keep_snps = range(m)
        kept_snps = []
        freq = []
        for j in keep_snps:
            z = geno[2*nru*j:2*nru*(j+1)]
            A = z[0::2]
            a = A.count()
            B = z[1::2]
            b = B.count()
            c = (A & B).count()
            major_ct = b + c  # number of copies of the major allele
            n_nomiss = n - a + c  # number of individuals with nonmissing genotypes
            f = major_ct / (2*n_nomiss) if n_nomiss > 0 else 0
            het_miss_ct = a+b-2*c  # remove SNPs that are only either het or missing
            if np.minimum(f, 1-f) > mafMin and het_miss_ct < n:
                freq.append(f)
                y += z
                m_poly += 1
                kept_snps.append(j)

        return (y, m_poly, n, kept_snps, freq)

    def nextSNPs(self, b, minorRef=None):
        '''
        Unpacks the binary array of genotypes and returns an n x b matrix of floats of
        normalized genotypes for the next b SNPs, where n := number of samples.

        Parameters
        ----------
        b : int
            Number of SNPs to return.
        minorRef: bool, default None
            Should we flip reference alleles so that the minor allele is the reference?
            (This is useful for computing l1 w.r.t. minor allele).

        Returns
        -------
        X : np.array with dtype float64 with shape (n, b), where n := number of samples
            Matrix of genotypes normalized to mean zero and variance one. If minorRef is
            not None, then the minor allele will be the positive allele (i.e., two copies
            of the minor allele --> a positive number).

        Raises
        ------
        ValueError
            If b <= 0 or more SNPs are requested than remain.
        TypeError
            If int(b) raises TypeError.

        '''

        try:
            b = int(b)
            if b <= 0:
                raise ValueError("b must be > 0")
        except TypeError:
            raise TypeError("b must be an integer")

        if self._currentSNP + b > self.m:
            s = '{b} SNPs requested, {k} SNPs remain'
            raise ValueError(s.format(b=b, k=(self.m-self._currentSNP)))

        c = self._currentSNP
        n = self.n
        nru = self.nru
        packed = self.geno[2*c*nru:2*(c+b)*nru]
        # list() keeps this working whether decode() returns a list or an iterator
        decoded = list(packed.decode(self._bedcode))
        X = np.array(decoded, dtype="float64").reshape((b, nru)).T
        X = X[0:n, :]  # drop the padding rows beyond the n real individuals
        Y = np.zeros(X.shape)
        for j in range(0, b):
            newsnp = X[:, j]
            ii = newsnp != 9
            avg = np.mean(newsnp[ii])
            # impute missing genotypes (coded 9) with the mean of the observed ones
            newsnp[np.logical_not(ii)] = avg
            denom = np.std(newsnp)
            if denom == 0:
                # constant column: avoid dividing by zero, the column becomes all zeros
                denom = 1

            if minorRef is not None and self.freq[self._currentSNP + j] > 0.5:
                denom = denom*-1

            Y[:, j] = (newsnp - avg) / denom

        self._currentSNP += b
        return Y