gsMap 1.71-py3-none-any.whl → 1.71.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN/__init__.py +0 -0
- gsMap/GNN/adjacency_matrix.py +75 -75
- gsMap/GNN/model.py +90 -89
- gsMap/GNN/train.py +0 -0
- gsMap/__init__.py +5 -5
- gsMap/__main__.py +2 -2
- gsMap/cauchy_combination_test.py +141 -141
- gsMap/config.py +805 -805
- gsMap/diagnosis.py +273 -273
- gsMap/find_latent_representation.py +133 -133
- gsMap/format_sumstats.py +407 -407
- gsMap/generate_ldscore.py +618 -618
- gsMap/latent_to_gene.py +234 -234
- gsMap/main.py +31 -31
- gsMap/report.py +160 -160
- gsMap/run_all_mode.py +194 -194
- gsMap/setup.py +0 -0
- gsMap/spatial_ldsc_multiple_sumstats.py +380 -380
- gsMap/templates/report_template.html +198 -198
- gsMap/utils/__init__.py +0 -0
- gsMap/utils/generate_r2_matrix.py +735 -735
- gsMap/utils/jackknife.py +514 -514
- gsMap/utils/make_annotations.py +518 -518
- gsMap/utils/manhattan_plot.py +639 -639
- gsMap/utils/regression_read.py +294 -294
- gsMap/visualize.py +198 -198
- {gsmap-1.71.dist-info → gsmap-1.71.1.dist-info}/LICENSE +21 -21
- {gsmap-1.71.dist-info → gsmap-1.71.1.dist-info}/METADATA +2 -2
- gsmap-1.71.1.dist-info/RECORD +31 -0
- gsmap-1.71.dist-info/RECORD +0 -31
- {gsmap-1.71.dist-info → gsmap-1.71.1.dist-info}/WHEEL +0 -0
- {gsmap-1.71.dist-info → gsmap-1.71.1.dist-info}/entry_points.txt +0 -0
gsMap/utils/generate_r2_matrix.py (@@ -1,735 +1,735 @@)

The diff marks all 735 lines of this module as removed and re-added, but the extracted old and new contents are character-for-character identical, so the change sits below the level this view preserves (most likely whitespace or line endings). The module is therefore reproduced once below, with its indentation restored; it continues after the note that follows the first block.

```python
from pathlib import Path
import bitarray as ba
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse import save_npz, load_npz
from tqdm import trange, tqdm


# Define the log class
class Logger(object):
    # -
    def __init__(self, fh):
        self.log_fh = open(fh, 'w')

    # -
    def log(self, msg):
        '''
        Print to log file and stdout.
        '''
        print(msg, file=self.log_fh)
        print(msg)

    # -
    def close(self):
        self.log_fh.close()


# Compute ld-score using cellular annotations
def get_compression(fh):
    '''Which sort of compression should we use with read_csv?'''
    if fh.endswith('gz'):
        compression = 'gzip'
    elif fh.endswith('bz2'):
        compression = 'bz2'
    else:
        compression = None
    # -
    return compression


# Define the reading functions
def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
    # -
    class IDContainer(object):
        """
        A class to read data from a file, store it as a DataFrame, and provide a method for a left outer join operation.
        """

        def __init__(self, fname):
            """
            Initialize the IDContainer with the given filename and reading options.
            """
            self.usecols = usecols
            self.colnames = colnames
            self.keepcol = keepcol
            self.fname_end = fname_end
            self.header = header
            self.read(fname)
            self.n = len(self.df)

        # -
        def read(self, fname):
            """
            Read data from the given file and store it as a DataFrame.
            """
            end = self.fname_end
            if end and not fname.endswith(end):
                raise ValueError('{f} filename must end in {f}'.format(f=end))
            comp = get_compression(fname)
            self.df = pd.read_csv(fname, header=self.header, usecols=self.usecols,
                                  sep='\s+', compression=comp)
            if self.colnames:
                self.df.columns = self.colnames
            if self.keepcol is not None:
                self.IDList = self.df.iloc[:, [self.keepcol]].astype('object')

        # -
        def loj(self, externalDf):
            """
            Perform a left outer join operation with the given external DataFrame.
            """
            r = externalDf.columns[0]
            l = self.IDList.columns[0]
            merge_df = externalDf.iloc[:, [0]]
            merge_df['keep'] = True
            z = pd.merge(self.IDList, merge_df, how='left', left_on=l, right_on=r,
                         sort=False)
            ii = z['keep'] == True
            return np.nonzero(ii)[0]

    # -
    return IDContainer


def getBlockLefts(coords, max_dist):
    '''
    Converts coordinates + max block length to the a list of coordinates of the leftmost
    SNPs to be included in blocks.
    Parameters
    ----------
    coords : array
        Array of coordinates. Must be sorted.
    max_dist : float
        Maximum distance between SNPs included in the same window.
    Returns
    -------
    block_left : 1D np.ndarray with same length as block_left
        block_left[j] := min{k | dist(j, k) < max_dist}.
    '''
    M = len(coords)
    j = 0
    block_left = np.zeros(M)
    for i in range(M):
        while j < M and abs(coords[j] - coords[i]) > max_dist:
            j += 1

        block_left[i] = j
    return block_left


def block_left_to_right(block_left):
    '''
    Converts block lefts to block rights.
    Parameters
    ----------
    block_left : array
        Array of block lefts.
    Returns
    -------
    block_right : 1D np.ndarray with same length as block_left
        block_right[j] := max {k | block_left[k] <= j}
    '''
    M = len(block_left)
    j = 0
    block_right = np.zeros(M)
    for i in range(M):
        while j < M and block_left[j] <= i:
            j += 1
        block_right[i] = j

    return block_right


class GenotypeArrayInMemory(object):
    '''
    Parent class for various classes containing interfaces for files with genotype
    matrices, e.g., plink .bed files, etc
    '''

    def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
        self.m = len(snp_list.IDList)
        self.n = n
        self.keep_snps = keep_snps
        self.keep_indivs = keep_indivs
        self.df = np.array(snp_list.df[['CHR', 'SNP', 'BP', 'CM']])
        self.colnames = ['CHR', 'SNP', 'BP', 'CM']
        self.mafMin = mafMin if mafMin is not None else 0
        self._currentSNP = 0
        (self.nru, self.geno) = self.__read__(fname, self.m, n)
        # filter individuals
        if keep_indivs is not None:
            keep_indivs = np.array(keep_indivs, dtype='int')
            if np.any(keep_indivs > self.n):
                raise ValueError('keep_indivs indices out of bounds')
            # -
            (self.geno, self.m, self.n) = self.__filter_indivs__(self.geno, keep_indivs, self.m, self.n)
            # -
            if self.n > 0:
                print('After filtering, {n} individuals remain'.format(n=self.n))
            else:
                raise ValueError('After filtering, no individuals remain')
        # -
        # filter SNPs
        if keep_snps is not None:
            keep_snps = np.array(keep_snps, dtype='int')
            if np.any(keep_snps > self.m):  # if keep_snps is None, this returns False
                raise ValueError('keep_snps indices out of bounds')
        # -
        (self.geno, self.m, self.n, self.kept_snps, self.freq) = self.__filter_snps_maf__(
            self.geno, self.m, self.n, self.mafMin, keep_snps)
        # -
        if self.m > 0:
            print('After filtering, {m} SNPs remain'.format(m=self.m))
        else:
            raise ValueError('After filtering, no SNPs remain')
        # -
        self.df = self.df[self.kept_snps, :]
        self.maf = np.minimum(self.freq, np.ones(self.m) - self.freq)
        self.sqrtpq = np.sqrt(self.freq * (np.ones(self.m) - self.freq))
        self.df = np.c_[self.df, self.maf]
        self.colnames.append('MAF')

    # -
    def __read__(self, fname, m, n):
        raise NotImplementedError

    def __restart__(self):
        self._currentSNP = 0

    # -
    def __filter_indivs__(geno, keep_indivs, m, n):
        raise NotImplementedError

    # -
    def __filter_maf_(geno, m, n, maf):
        raise NotImplementedError

    # -
    def ldScoreVarBlocks(self, block_left, c, annot=None):
        '''Computes an unbiased estimate of L2(j) for j=1,..,M.'''
        func = lambda x: self.__l2_unbiased__(x, self.n)
        snp_getter = self.nextSNPs
        return self.__corSumVarBlocks__(block_left, c, func, snp_getter, annot)

    # -
    # In small samples, the observed r^2 tends to be higher than the true r^2 due to sampling variability.
    # The bias correction term (1-sq) / denom adjusts for this bias by subtracting a small value that depends on the sample size and the observed r^2.
    def __l2_unbiased__(self, x, n):
        denom = n - 2 if n > 2 else n  # allow n<2 for testing purposes
        sq = np.square(x)
        return sq - (1 - sq) / denom

    # -
    # Methods for calculating sums of Pearson correlation coefficients (i.e.,ld-score)
    # c stands for the chunk size (default = 50)
    def __corSumVarBlocks__(self, block_left, c, func, snp_getter, annot=None):
        '''
        Parameters
        ----------
        block_left : np.ndarray with shape (M, )
            block_left[i] = index of leftmost SNP included in LD Score of SNP i.
            if c > 1, then only entries that are multiples of c are examined, and it is
            assumed that block_left[a*c+i] = block_left[a*c], except at
            the beginning of the chromosome where the 0th SNP is included in the window.
        c : int
            Chunk size.
        func : function
            Function to be applied to the genotype correlation matrix. Before dotting with
            annot. Examples: for biased L2, np.square. For biased L4,
            lambda x: np.square(np.square(x)). For L1, lambda x: x.
        snp_getter : function(int)
            The method to be used to get the next SNPs
        annot: numpy array with shape (m,n_a)
            SNP annotations.
        Returns
        -------
        cor_sum : np.ndarray with shape (M, num_annots)
            Estimates.
        '''
        m, n = self.m, self.n
        block_sizes = np.array(np.arange(m) - block_left)
        block_sizes = np.ceil(block_sizes / c) * c
        if annot is None:
            annot = np.ones((m, 1))
        else:
            annot_m = annot.shape[0]
            if annot_m != self.m:
                raise ValueError('Incorrect number of SNPs in annot')
        # -
        n_a = annot.shape[1]  # number of annotations
        cor_sum = np.zeros((m, n_a))
        # b = index of first SNP for which SNP 0 is not included in LD Score
        b = np.nonzero(block_left > 0)
        if np.any(b):
            b = b[0][0]
        else:
            b = m
        b = int(np.ceil(b / c) * c)  # round up to a multiple of c
        if b > m:
            c = 1
            b = m

        l_A = 0  # l_A := index of leftmost SNP in matrix A
        A = snp_getter(b)
        rfuncAB = np.zeros((b, c))
        rfuncBB = np.zeros((c, c))
        # chunk inside of block
        for l_B in np.arange(0, b, c):  # l_B := index of leftmost SNP in matrix B
            B = A[:, l_B:l_B + c]
            # ld matrix
            np.dot(A.T, B / n, out=rfuncAB)
            # ld matrix square
            rfuncAB = func(rfuncAB)
            cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])

        # chunk to right of block
        b0 = b
        md = int(c * np.floor(m / c))
        end = md + 1 if md != m else md
        for l_B in tqdm(np.arange(b0, end, c), desc=f'Compute SNP Gene Weight'):
            # check if the annot matrix is all zeros for this block + chunk
            # this happens w/ sparse categories (i.e., pathways)
            # update the block
            old_b = b
            b = int(block_sizes[l_B])
            if l_B > b0 and b > 0:
                # block_size can't increase more than c
                # block_size can't be less than c unless it is zero
                # both of these things make sense
                A = np.hstack((A[:, old_b - b + c:old_b], B))
                l_A += old_b - b + c
            elif l_B == b0 and b > 0:
                A = A[:, b0 - b:b0]
                l_A = b0 - b
            elif b == 0:  # no SNPs to left in window, e.g., after a sequence gap
                A = np.array(()).reshape((n, 0))
                l_A = l_B
            if l_B == md:
                c = m - md
                rfuncAB = np.zeros((b, c))
                rfuncBB = np.zeros((c, c))
            if b != old_b:
                rfuncAB = np.zeros((b, c))
            # -
            B = snp_getter(c)
            p1 = np.all(annot[l_A:l_A + b, :] == 0)
            p2 = np.all(annot[l_B:l_B + c, :] == 0)
            if p1 and p2:
                continue
            # -
            np.dot(A.T, B / n, out=rfuncAB)
            rfuncAB = func(rfuncAB)
            cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
            cor_sum[l_B:l_B + c, :] += np.dot(annot[l_A:l_A + b, :].T, rfuncAB).T
            np.dot(B.T, B / n, out=rfuncBB)
            rfuncBB = func(rfuncBB)
            cor_sum[l_B:l_B + c, :] += np.dot(rfuncBB, annot[l_B:l_B + c, :])
        # -
        return cor_sum
```
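For reference (this derivation is not part of the diff): the correction applied by `__l2_unbiased__` is the standard LDSC-style adjusted r². With \(\hat{r}_{jk}\) the sample correlation between SNPs \(j\) and \(k\) and \(n\) the sample size, the quantity `ldScoreVarBlocks` accumulates per SNP is:

```latex
% Adjusted-r^2 LD score of SNP j, summed over SNPs k in its window (n > 2):
\hat{\ell}_2(j) = \sum_{k} \left( \hat{r}_{jk}^{2} - \frac{1 - \hat{r}_{jk}^{2}}{n - 2} \right)
```

The module continues with the PLINK `.bed` reader and the sparse r² cache: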
```python
class PlinkBEDFile(GenotypeArrayInMemory):
    '''
    Interface for Plink .bed format
    '''

    def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
        self._bedcode = {
            2: ba.bitarray('11'),
            9: ba.bitarray('10'),
            1: ba.bitarray('01'),
            0: ba.bitarray('00')
        }
        # -
        GenotypeArrayInMemory.__init__(self, fname, n, snp_list, keep_snps=keep_snps, keep_indivs=keep_indivs,
                                       mafMin=mafMin)

    # -
    def __read__(self, fname, m, n):
        if not fname.endswith('.bed'):
            raise ValueError('.bed filename must end in .bed')
        # -
        fh = open(fname, 'rb')
        magicNumber = ba.bitarray(endian="little")
        magicNumber.fromfile(fh, 2)
        bedMode = ba.bitarray(endian="little")
        bedMode.fromfile(fh, 1)
        e = (4 - n % 4) if n % 4 != 0 else 0
        nru = n + e
        self.nru = nru
        # check magic number
        if magicNumber != ba.bitarray('0011011011011000'):
            raise IOError("Magic number from Plink .bed file not recognized")
        # -
        if bedMode != ba.bitarray('10000000'):
            raise IOError("Plink .bed file must be in default SNP-major mode")
        # check file length
        self.geno = ba.bitarray(endian="little")
        self.geno.fromfile(fh)
        self.__test_length__(self.geno, self.m, self.nru)
        return (self.nru, self.geno)

    # -
    def __test_length__(self, geno, m, nru):
        exp_len = 2 * m * nru
        real_len = len(geno)
        if real_len != exp_len:
            s = "Plink .bed file has {n1} bits, expected {n2}"
            raise IOError(s.format(n1=real_len, n2=exp_len))

    # -
    def __filter_indivs__(self, geno, keep_indivs, m, n):
        n_new = len(keep_indivs)
        e = (4 - n_new % 4) if n_new % 4 != 0 else 0
        nru_new = n_new + e
        nru = self.nru
        z = ba.bitarray(m * 2 * nru_new, endian="little")
        z.setall(0)
        for e, i in enumerate(keep_indivs):
            z[2 * e::2 * nru_new] = geno[2 * i::2 * nru]
            z[2 * e + 1::2 * nru_new] = geno[2 * i + 1::2 * nru]
        self.nru = nru_new
        return (z, m, n_new)

    # -
    def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps):
        '''
        Credit to Chris Chang and the Plink2 developers for this algorithm
        Modified from plink_filter.c
        https://github.com/chrchang/plink-ng/blob/master/plink_filter.c
        Genotypes are read forwards (since we are cheating and using endian="little")
        A := (genotype) & 1010...
        B := (genotype) & 0101...
        C := (A >> 1) & B
        Then
        a := A.count() = missing ct + hom major ct
        b := B.count() = het ct + hom major ct
        c := C.count() = hom major ct
        Which implies that
        missing ct = a - c
        # of indivs with nonmissing genotype = n - a + c
        major allele ct = b + c
        major allele frequency = (b+c)/(2*(n-a+c))
        het ct + missing ct = a + b - 2*c
        Why does bitarray not have >> ????
        '''
        nru = self.nru
        m_poly = 0
        y = ba.bitarray()
        if keep_snps is None:
            keep_snps = range(m)
        kept_snps = []
        freq = []
        for e, j in enumerate(keep_snps):
            z = geno[2 * nru * j:2 * nru * (j + 1)]
            A = z[0::2]
            a = A.count()
            B = z[1::2]
            b = B.count()
            c = (A & B).count()
            major_ct = b + c  # number of copies of the major allele
            n_nomiss = n - a + c  # number of individuals with nonmissing genotypes
            f = major_ct / (2 * n_nomiss) if n_nomiss > 0 else 0
            het_miss_ct = a + b - 2 * c  # remove SNPs that are only either het or missing
            if np.minimum(f, 1 - f) > mafMin and het_miss_ct < n:
                freq.append(f)
                y += z
                m_poly += 1
                kept_snps.append(j)
        # -
        return (y, m_poly, n, kept_snps, freq)

    # -
    def nextSNPs(self, b, minorRef=None):
        '''
        Unpacks the binary array of genotypes and returns an n x b matrix of floats of
        normalized genotypes for the next b SNPs, where n := number of samples.
        Parameters
        ----------
        b : int
            Number of SNPs to return.
        minorRef: bool, default None
            Should we flip reference alleles so that the minor allele is the reference?
            (This is useful for computing l1 w.r.t. minor allele).
        Returns
        -------
        X : np.array with dtype float64 with shape (n, b), where n := number of samples
            Matrix of genotypes normalized to mean zero and variance one. If minorRef is
            not None, then the minor allele will be the positive allele (i.e., two copies
            of the minor allele --> a positive number).
        '''
        # -
        try:
            b = int(b)
            if b <= 0:
                raise ValueError("b must be > 0")
        except TypeError:
            raise TypeError("b must be an integer")
        # -
        if self._currentSNP + b > self.m:
            s = '{b} SNPs requested, {k} SNPs remain'
            raise ValueError(s.format(b=b, k=(self.m - self._currentSNP)))
        # -
        c = self._currentSNP
        n = self.n
        nru = self.nru
        slice = self.geno[2 * c * nru:2 * (c + b) * nru]
        X = np.array(slice.decode(self._bedcode), dtype="float64").reshape((b, nru)).T
        X = X[0:n, :]
        Y = np.zeros(X.shape)
        # normalize the SNPs and impute the missing one with the mean
        for j in range(0, b):
            newsnp = X[:, j]
            ii = newsnp != 9
            avg = np.mean(newsnp[ii])
            newsnp[np.logical_not(ii)] = avg
            denom = np.std(newsnp)
            if denom == 0:
                denom = 1
            # -
            if minorRef is not None and self.freq[self._currentSNP + j] > 0.5:
                denom = denom * -1
            # -
            Y[:, j] = (newsnp - avg) / denom
        # -
        self._currentSNP += b
        return Y


class PlinkBEDFileWithR2Cache(PlinkBEDFile):
    def compute_r2_cache(self,
                         block_left,
                         output_cache_file_dir: Path,
                         chunk_size=500_000_000,
                         c=500,
                         r2_threshold=1e-4,
                         annot=None):

        func = np.square
        snp_getter = self.nextSNPs
        data, rows, cols = [], [], []

        def add_rfuncAB(rfuncAB, l_A, l_B):
            non_zero_indices = np.nonzero(rfuncAB > r2_threshold)
            data.extend(rfuncAB[non_zero_indices])
            rows.extend(l_A + non_zero_indices[0])
            cols.extend(l_B + non_zero_indices[1])

        # def add_rfuncAB(rfuncAB, l_A, l_B):
        #     # not need select non zero indices
        #     data.extend(rfuncAB.flatten())
        #     rows.extend(l_A + np.repeat(np.arange(rfuncAB.shape[0]), rfuncAB.shape[1]))
        #     cols.extend(l_B + np.tile(np.arange(rfuncAB.shape[1]), rfuncAB.shape[0]))

        # def add_rfuncBB(rfuncBB, l_B):
        #     non_zero_indices = np.nonzero(rfuncBB)
        #     data.extend(rfuncBB[non_zero_indices])
        #     rows.extend(l_B + non_zero_indices[0])
        #     cols.extend(l_B + non_zero_indices[1])

        def add_rfuncBB(rfuncBB, l_B):
            non_zero_indices = np.nonzero(rfuncBB > r2_threshold)
            data.extend(rfuncBB[non_zero_indices])
            rows.extend(l_B + non_zero_indices[0])
            cols.extend(l_B + non_zero_indices[1])
            if len(data) > chunk_size:
                # save the cache
                print(f'Start saving the cache file: {output_cache_file_dir / f"{l_B}.npz"}')
                r2_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(self.m, self.m), dtype='float16')
                save_npz(output_cache_file_dir / f'{l_B}.npz', r2_sparse_matrix)
                # reset the data
                data.clear()
                rows.clear()
                cols.clear()

        m, n = self.m, self.n
        block_sizes = np.array(np.arange(m) - block_left)
        block_sizes = np.ceil(block_sizes / c) * c
        if annot is None:
            annot = np.ones((m, 1))
        else:
            annot_m = annot.shape[0]
            if annot_m != self.m:
                raise ValueError('Incorrect number of SNPs in annot')
        # -
        n_a = annot.shape[1]  # number of annotations
        # cor_sum = np.zeros((m, n_a))
        # b = index of first SNP for which SNP 0 is not included in LD Score
        b = np.nonzero(block_left > 0)
        if np.any(b):
            b = b[0][0]
        else:
            b = m
        b = int(np.ceil(b / c) * c)  # round up to a multiple of c
        if b > m:
            c = 1
            b = m

        l_A = 0  # l_A := index of leftmost SNP in matrix A
        A = snp_getter(b)
        rfuncAB = np.zeros((b, c))
        rfuncBB = np.zeros((c, c))
        # chunk inside of block
        for l_B in np.arange(0, b, c):  # l_B := index of leftmost SNP in matrix B
            B = A[:, l_B:l_B + c]
            # ld matrix
            np.dot(A.T, B / n, out=rfuncAB)
            # ld matrix square
            rfuncAB = func(rfuncAB)
            add_rfuncAB(rfuncAB, l_A, l_B)
            # cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])

        # chunk to right of block
        b0 = b
        md = int(c * np.floor(m / c))
        end = md + 1 if md != m else md
        for l_B in trange(b0, end, c, desc=f'Compute r2 cache for {output_cache_file_dir.name}'):
            # check if the annot matrix is all zeros for this block + chunk
            # this happens w/ sparse categories (i.e., pathways)
            # update the block
            old_b = b
            b = int(block_sizes[l_B])
            if l_B > b0 and b > 0:
                # block_size can't increase more than c
                # block_size can't be less than c unless it is zero
                # both of these things make sense
                A = np.hstack((A[:, old_b - b + c:old_b], B))
                l_A += old_b - b + c
            elif l_B == b0 and b > 0:
                A = A[:, b0 - b:b0]
                l_A = b0 - b
            elif b == 0:  # no SNPs to left in window, e.g., after a sequence gap
                A = np.array(()).reshape((n, 0))
                l_A = l_B
            if l_B == md:
                c = m - md
                rfuncAB = np.zeros((b, c))
                rfuncBB = np.zeros((c, c))
            if b != old_b:
                rfuncAB = np.zeros((b, c))
            # -
            B = snp_getter(c)
            p1 = np.all(annot[l_A:l_A + b, :] == 0)
            p2 = np.all(annot[l_B:l_B + c, :] == 0)
            if p1 and p2:
                continue
            # -
            np.dot(A.T, B / n, out=rfuncAB)
            rfuncAB = func(rfuncAB)
            # cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
            # cor_sum[l_B:l_B + c, :] += np.dot(annot[l_A:l_A + b, :].T, rfuncAB).T
            add_rfuncAB(rfuncAB, l_A, l_B)
            add_rfuncAB(rfuncAB.T, l_B, l_A)
            np.dot(B.T, B / n, out=rfuncBB)
            rfuncBB = func(rfuncBB)
            # cor_sum[l_B:l_B + c, :] += np.dot(rfuncBB, annot[l_B:l_B + c, :])
            add_rfuncBB(rfuncBB, l_B)
        if len(data) > 0:
            # save remaining data
            # save the cache
            print(f'Start saving the cache file: {output_cache_file_dir / f"{l_B}.npz"}')
            r2_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(m, m), dtype='float16')
            save_npz(output_cache_file_dir / f'{l_B}.npz', r2_sparse_matrix)
        # combine the cache files
        print(f'Start combining the cache files in {output_cache_file_dir}')
        cached_r2_matrix_files = list(output_cache_file_dir.glob('*.npz'))
        combined_r2_matrix_files = self.load_r2_matrix_from_cache_files(output_cache_file_dir)
        # remove the cache files
        for cached_r2_matrix_file in cached_r2_matrix_files:
            cached_r2_matrix_file.unlink()
        # save the combined r2 matrix
        print(f'Start saving the combined r2 matrix in {output_cache_file_dir}')
        combined_r2_matrix_file = output_cache_file_dir / 'combined_r2_matrix.npz'
        save_npz(combined_r2_matrix_file, combined_r2_matrix_files)

    def get_ldscore_using_r2_cache(self, annot_matrix, cached_r2_matrix_dir):
        """
        Compute the r2 matrix multiplication with annot_matrix
        """
        # Compute the r2 matrix multiplication with annot_matrix
        cached_r2_matrix_dir = Path(cached_r2_matrix_dir)
        # iter the cached r2 matrix files
        result_matrix = np.zeros((self.m, annot_matrix.shape[1]))
        cached_r2_matrix_files = list(cached_r2_matrix_dir.glob('*.npz'))
        assert len(cached_r2_matrix_files) > 0, (f'No cached r2 matrix files in {cached_r2_matrix_dir}'
                                                 f'Please run the function compute_r2_cache first!')
        for r2_matrix_file in tqdm(cached_r2_matrix_files, desc=f'Compute ld score for {cached_r2_matrix_dir.name}'):
            print(f'Compute r2 matrix multiplication for {r2_matrix_file}')
            r2_matrix = load_npz(r2_matrix_file)
            result_matrix += r2_matrix.dot(annot_matrix)
        return result_matrix

    def load_r2_matrix_from_cache_files(self, cached_r2_matrix_dir):
        """
        Load the r2 matrix from cache
        """
        cached_r2_matrix_dir = Path(cached_r2_matrix_dir)
        # iter the cached r2 matrix files
        cached_r2_matrix_files = list(cached_r2_matrix_dir.glob('*.npz'))
        assert len(cached_r2_matrix_files) > 0, (f'No cached r2 matrix files in {cached_r2_matrix_dir}'
                                                 f'Please run the function compute_r2_cache first!')
        # load the r2 matrix
        r2_matrix = load_npz(cached_r2_matrix_files[0])
        for r2_matrix_file in tqdm(cached_r2_matrix_files[1:], desc=f'Load r2 matrix from {cached_r2_matrix_dir.name}'):
            print(f'Load r2 matrix from {r2_matrix_file}')
            r2_matrix += load_npz(r2_matrix_file)
        # to float16
        r2_matrix = r2_matrix.astype('float16')
        return r2_matrix

    def load_combined_r2_matrix(self, cached_r2_matrix_dir):
        """
        Load the combined r2 matrix
        """
        combined_r2_matrix_file = Path(cached_r2_matrix_dir) / 'combined_r2_matrix.npz'
        assert combined_r2_matrix_file.exists(), (f'No combined r2 matrix file in {cached_r2_matrix_dir}'
                                                  f'Should delete the cache files and run the function compute_r2_cache first!')
        # load the r2 matrix
        r2_matrix = load_npz(combined_r2_matrix_file)
        # to float16
        r2_matrix = r2_matrix.astype('float16')
        return r2_matrix


def load_bfile(bfile_chr_prefix):
    PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
    PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])

    snp_file, snp_obj = bfile_chr_prefix + '.bim', PlinkBIMFile
    array_snps = snp_obj(snp_file)
    m = len(array_snps.IDList)
    print(f'Read list of {m} SNPs from {snp_file}')
    #
    # Load fam
    ind_file, ind_obj = bfile_chr_prefix + '.fam', PlinkFAMFile
    array_indivs = ind_obj(ind_file)
    n = len(array_indivs.IDList)
    print(f'Read list of {n} individuals from {ind_file}')

    # Load genotype array
    array_file, array_obj = bfile_chr_prefix + '.bed', PlinkBEDFileWithR2Cache
    geno_array = array_obj(array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None)

    return array_snps, array_indivs, geno_array


def generate_r2_matrix_chr_cache(bfile_chr_prefix, ld_wind_cm, output_cache_file_dir):
    # Load genotype array
    array_snps, array_indivs, geno_array = load_bfile(bfile_chr_prefix)
    # Compute block lefts
    block_left = getBlockLefts(geno_array.df[:, 3], ld_wind_cm)
    # Compute LD score
    r2_matrix = geno_array.load_r2_matrix_from_cache(output_cache_file_dir)


def generate_r2_matrix_cache(bfile_prefix, chromosome_list, r2_cache_dir, ld_wind_cm=1):
    r2_cache_dir = Path(r2_cache_dir)

    for chr in chromosome_list:
        output_cache_file_prefix = r2_cache_dir / f'chr{chr}'
        output_cache_file_prefix.mkdir(parents=True, exist_ok=True)
        bfile_chr_prefix = bfile_prefix + '.' + str(chr)
        generate_r2_matrix_chr_cache(bfile_chr_prefix,
                                     ld_wind_cm=ld_wind_cm,
                                     output_cache_file_dir=output_cache_file_prefix)
        print(f'Compute r2 matrix for chr{chr} done!')
```
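Nothing in the diff exercises these helpers, and note that `generate_r2_matrix_chr_cache` as shipped calls `geno_array.load_r2_matrix_from_cache`, a method that does not exist on `PlinkBEDFileWithR2Cache` (the closest match is `load_r2_matrix_from_cache_files`). A minimal sketch of the intended flow, driving the class methods directly; the PLINK prefix, window size, and cache directory here are assumptions, not values taken from the package:

```python
# Hypothetical usage sketch: file names and paths below are assumptions.
from pathlib import Path

import numpy as np

from gsMap.utils.generate_r2_matrix import getBlockLefts, load_bfile

cache_dir = Path('r2_cache/chr22')            # assumed output directory
cache_dir.mkdir(parents=True, exist_ok=True)

# Read the .bim/.fam/.bed triple for one chromosome (assumed prefix).
array_snps, array_indivs, geno_array = load_bfile('1000G.EUR.QC.22')

# Column 3 of geno_array.df holds the genetic distance (CM); 1 cM window.
block_left = getBlockLefts(geno_array.df[:, 3], 1.0)

# Build the sparse r2 cache on disk; the method also combines the chunk
# files into a single combined_r2_matrix.npz before returning.
geno_array.compute_r2_cache(block_left, cache_dir)

# LD scores are the r2 matrix times an annotation matrix. An all-ones
# column yields plain LD scores; in gsMap the columns are gene-level
# scores (per the 'Compute SNP Gene Weight' progress label above).
annot = np.ones((geno_array.m, 1))
ld_scores = geno_array.load_combined_r2_matrix(cache_dir).dot(annot)
print(ld_scores.shape)  # (number of SNPs kept after MAF filtering, 1)
```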