gsMap 1.71.2__py3-none-any.whl → 1.73.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN/adjacency_matrix.py +25 -27
- gsMap/GNN/model.py +9 -7
- gsMap/GNN/train.py +8 -11
- gsMap/__init__.py +3 -3
- gsMap/__main__.py +3 -2
- gsMap/cauchy_combination_test.py +78 -75
- gsMap/config.py +948 -322
- gsMap/create_slice_mean.py +168 -0
- gsMap/diagnosis.py +179 -101
- gsMap/find_latent_representation.py +29 -27
- gsMap/format_sumstats.py +239 -201
- gsMap/generate_ldscore.py +334 -222
- gsMap/latent_to_gene.py +128 -68
- gsMap/main.py +23 -14
- gsMap/report.py +39 -25
- gsMap/run_all_mode.py +87 -46
- gsMap/setup.py +1 -1
- gsMap/spatial_ldsc_multiple_sumstats.py +154 -80
- gsMap/utils/generate_r2_matrix.py +100 -346
- gsMap/utils/jackknife.py +84 -80
- gsMap/utils/manhattan_plot.py +180 -207
- gsMap/utils/regression_read.py +83 -176
- gsMap/visualize.py +82 -64
- gsmap-1.73.0.dist-info/METADATA +169 -0
- gsmap-1.73.0.dist-info/RECORD +31 -0
- {gsmap-1.71.2.dist-info → gsmap-1.73.0.dist-info}/WHEEL +1 -1
- {gsmap-1.71.2.dist-info → gsmap-1.73.0.dist-info/licenses}/LICENSE +6 -6
- gsMap/utils/make_annotations.py +0 -518
- gsmap-1.71.2.dist-info/METADATA +0 -105
- gsmap-1.71.2.dist-info/RECORD +0 -31
- {gsmap-1.71.2.dist-info → gsmap-1.73.0.dist-info}/entry_points.txt +0 -0
@@ -1,48 +1,13 @@
|
|
1
|
-
from pathlib import Path
|
2
1
|
import bitarray as ba
|
3
2
|
import numpy as np
|
4
3
|
import pandas as pd
|
5
|
-
from
|
6
|
-
from scipy.sparse import save_npz, load_npz
|
7
|
-
from tqdm import trange, tqdm
|
8
|
-
|
9
|
-
|
10
|
-
# Define the log class
|
11
|
-
class Logger(object):
|
12
|
-
# -
|
13
|
-
def __init__(self, fh):
|
14
|
-
self.log_fh = open(fh, 'w')
|
15
|
-
|
16
|
-
# -
|
17
|
-
def log(self, msg):
|
18
|
-
'''
|
19
|
-
Print to log file and stdout.
|
20
|
-
'''
|
21
|
-
print(msg, file=self.log_fh)
|
22
|
-
print(msg)
|
23
|
-
|
24
|
-
# -
|
25
|
-
def close(self):
|
26
|
-
self.log_fh.close()
|
27
|
-
|
28
|
-
|
29
|
-
# Compute ld-score using cellular annotations
|
30
|
-
def get_compression(fh):
|
31
|
-
'''Which sort of compression should we use with read_csv?'''
|
32
|
-
if fh.endswith('gz'):
|
33
|
-
compression = 'gzip'
|
34
|
-
elif fh.endswith('bz2'):
|
35
|
-
compression = 'bz2'
|
36
|
-
else:
|
37
|
-
compression = None
|
38
|
-
# -
|
39
|
-
return compression
|
4
|
+
from tqdm import tqdm
|
40
5
|
|
41
6
|
|
42
7
|
# Define the reading functions
|
43
8
|
def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
|
44
9
|
# -
|
45
|
-
class IDContainer
|
10
|
+
class IDContainer:
|
46
11
|
"""
|
47
12
|
A class to read data from a file, store it as a DataFrame, and provide a method for a left outer join operation.
|
48
13
|
"""
|
@@ -66,48 +31,38 @@ def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
|
|
66
31
|
"""
|
67
32
|
end = self.fname_end
|
68
33
|
if end and not fname.endswith(end):
|
69
|
-
raise ValueError(
|
70
|
-
|
71
|
-
|
72
|
-
|
34
|
+
raise ValueError(f"{end} filename must end in {end}")
|
35
|
+
self.df = pd.read_csv(
|
36
|
+
fname,
|
37
|
+
header=self.header,
|
38
|
+
usecols=self.usecols,
|
39
|
+
sep=r"\s+",
|
40
|
+
)
|
73
41
|
if self.colnames:
|
74
42
|
self.df.columns = self.colnames
|
75
43
|
if self.keepcol is not None:
|
76
|
-
self.IDList = self.df.iloc[:, [self.keepcol]].astype(
|
44
|
+
self.IDList = self.df.iloc[:, [self.keepcol]].astype("object")
|
77
45
|
|
78
|
-
# -
|
79
|
-
def loj(self, externalDf):
|
80
|
-
"""
|
81
|
-
Perform a left outer join operation with the given external DataFrame.
|
82
|
-
"""
|
83
|
-
r = externalDf.columns[0]
|
84
|
-
l = self.IDList.columns[0]
|
85
|
-
merge_df = externalDf.iloc[:, [0]]
|
86
|
-
merge_df['keep'] = True
|
87
|
-
z = pd.merge(self.IDList, merge_df, how='left', left_on=l, right_on=r,
|
88
|
-
sort=False)
|
89
|
-
ii = z['keep'] == True
|
90
|
-
return np.nonzero(ii)[0]
|
91
|
-
|
92
|
-
# -
|
93
46
|
return IDContainer
|
94
47
|
|
95
48
|
|
96
49
|
def getBlockLefts(coords, max_dist):
|
97
|
-
|
50
|
+
"""
|
98
51
|
Converts coordinates + max block length to the a list of coordinates of the leftmost
|
99
52
|
SNPs to be included in blocks.
|
53
|
+
|
100
54
|
Parameters
|
101
55
|
----------
|
102
56
|
coords : array
|
103
57
|
Array of coordinates. Must be sorted.
|
104
58
|
max_dist : float
|
105
59
|
Maximum distance between SNPs included in the same window.
|
60
|
+
|
106
61
|
Returns
|
107
62
|
-------
|
108
63
|
block_left : 1D np.ndarray with same length as block_left
|
109
64
|
block_left[j] := min{k | dist(j, k) < max_dist}.
|
110
|
-
|
65
|
+
"""
|
111
66
|
M = len(coords)
|
112
67
|
j = 0
|
113
68
|
block_left = np.zeros(M)
|
@@ -120,17 +75,19 @@ def getBlockLefts(coords, max_dist):
|
|
120
75
|
|
121
76
|
|
122
77
|
def block_left_to_right(block_left):
|
123
|
-
|
78
|
+
"""
|
124
79
|
Converts block lefts to block rights.
|
80
|
+
|
125
81
|
Parameters
|
126
82
|
----------
|
127
83
|
block_left : array
|
128
84
|
Array of block lefts.
|
85
|
+
|
129
86
|
Returns
|
130
87
|
-------
|
131
88
|
block_right : 1D np.ndarray with same length as block_left
|
132
89
|
block_right[j] := max {k | block_left[k] <= j}
|
133
|
-
|
90
|
+
"""
|
134
91
|
M = len(block_left)
|
135
92
|
j = 0
|
136
93
|
block_right = np.zeros(M)
|
@@ -142,54 +99,57 @@ def block_left_to_right(block_left):
|
|
142
99
|
return block_right
|
143
100
|
|
144
101
|
|
145
|
-
class GenotypeArrayInMemory
|
146
|
-
|
102
|
+
class GenotypeArrayInMemory:
|
103
|
+
"""
|
147
104
|
Parent class for various classes containing interfaces for files with genotype
|
148
105
|
matrices, e.g., plink .bed files, etc
|
149
|
-
|
106
|
+
"""
|
150
107
|
|
151
108
|
def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
|
152
109
|
self.m = len(snp_list.IDList)
|
153
110
|
self.n = n
|
154
111
|
self.keep_snps = keep_snps
|
155
112
|
self.keep_indivs = keep_indivs
|
156
|
-
self.df = np.array(snp_list.df[[
|
157
|
-
self.colnames = [
|
113
|
+
self.df = np.array(snp_list.df[["CHR", "SNP", "BP", "CM"]])
|
114
|
+
self.colnames = ["CHR", "SNP", "BP", "CM"]
|
158
115
|
self.mafMin = mafMin if mafMin is not None else 0
|
159
116
|
self._currentSNP = 0
|
160
117
|
(self.nru, self.geno) = self.__read__(fname, self.m, n)
|
161
118
|
# filter individuals
|
162
119
|
if keep_indivs is not None:
|
163
|
-
keep_indivs = np.array(keep_indivs, dtype=
|
120
|
+
keep_indivs = np.array(keep_indivs, dtype="int")
|
164
121
|
if np.any(keep_indivs > self.n):
|
165
|
-
raise ValueError(
|
122
|
+
raise ValueError("keep_indivs indices out of bounds")
|
166
123
|
# -
|
167
|
-
(self.geno, self.m, self.n) = self.__filter_indivs__(
|
124
|
+
(self.geno, self.m, self.n) = self.__filter_indivs__(
|
125
|
+
self.geno, keep_indivs, self.m, self.n
|
126
|
+
)
|
168
127
|
# -
|
169
128
|
if self.n > 0:
|
170
|
-
print(
|
129
|
+
print(f"After filtering, {self.n} individuals remain")
|
171
130
|
else:
|
172
|
-
raise ValueError(
|
131
|
+
raise ValueError("After filtering, no individuals remain")
|
173
132
|
# -
|
174
133
|
# filter SNPs
|
175
134
|
if keep_snps is not None:
|
176
|
-
keep_snps = np.array(keep_snps, dtype=
|
135
|
+
keep_snps = np.array(keep_snps, dtype="int")
|
177
136
|
if np.any(keep_snps > self.m): # if keep_snps is None, this returns False
|
178
|
-
raise ValueError(
|
137
|
+
raise ValueError("keep_snps indices out of bounds")
|
179
138
|
# -
|
180
139
|
(self.geno, self.m, self.n, self.kept_snps, self.freq) = self.__filter_snps_maf__(
|
181
|
-
self.geno, self.m, self.n, self.mafMin, keep_snps
|
140
|
+
self.geno, self.m, self.n, self.mafMin, keep_snps
|
141
|
+
)
|
182
142
|
# -
|
183
143
|
if self.m > 0:
|
184
|
-
print(
|
144
|
+
print(f"After filtering, {self.m} SNPs remain")
|
185
145
|
else:
|
186
|
-
raise ValueError(
|
146
|
+
raise ValueError("After filtering, no SNPs remain")
|
187
147
|
# -
|
188
148
|
self.df = self.df[self.kept_snps, :]
|
189
149
|
self.maf = np.minimum(self.freq, np.ones(self.m) - self.freq)
|
190
150
|
self.sqrtpq = np.sqrt(self.freq * (np.ones(self.m) - self.freq))
|
191
151
|
self.df = np.c_[self.df, self.maf]
|
192
|
-
self.colnames.append(
|
152
|
+
self.colnames.append("MAF")
|
193
153
|
|
194
154
|
# -
|
195
155
|
def __read__(self, fname, m, n):
|
@@ -208,8 +168,11 @@ class GenotypeArrayInMemory(object):
|
|
208
168
|
|
209
169
|
# -
|
210
170
|
def ldScoreVarBlocks(self, block_left, c, annot=None):
|
211
|
-
|
212
|
-
|
171
|
+
"""Computes an unbiased estimate of L2(j) for j=1,..,M."""
|
172
|
+
|
173
|
+
def func(x):
|
174
|
+
return self.__l2_unbiased__(x, self.n)
|
175
|
+
|
213
176
|
snp_getter = self.nextSNPs
|
214
177
|
return self.__corSumVarBlocks__(block_left, c, func, snp_getter, annot)
|
215
178
|
|
@@ -225,7 +188,7 @@ class GenotypeArrayInMemory(object):
|
|
225
188
|
# Methods for calculating sums of Pearson correlation coefficients (i.e.,ld-score)
|
226
189
|
# c stands for the chunk size (default = 50)
|
227
190
|
def __corSumVarBlocks__(self, block_left, c, func, snp_getter, annot=None):
|
228
|
-
|
191
|
+
"""
|
229
192
|
Parameters
|
230
193
|
----------
|
231
194
|
block_left : np.ndarray with shape (M, )
|
@@ -243,11 +206,12 @@ class GenotypeArrayInMemory(object):
|
|
243
206
|
The method to be used to get the next SNPs
|
244
207
|
annot: numpy array with shape (m,n_a)
|
245
208
|
SNP annotations.
|
209
|
+
|
246
210
|
Returns
|
247
211
|
-------
|
248
212
|
cor_sum : np.ndarray with shape (M, num_annots)
|
249
213
|
Estimates.
|
250
|
-
|
214
|
+
"""
|
251
215
|
m, n = self.m, self.n
|
252
216
|
block_sizes = np.array(np.arange(m) - block_left)
|
253
217
|
block_sizes = np.ceil(block_sizes / c) * c
|
@@ -256,7 +220,7 @@ class GenotypeArrayInMemory(object):
|
|
256
220
|
else:
|
257
221
|
annot_m = annot.shape[0]
|
258
222
|
if annot_m != self.m:
|
259
|
-
raise ValueError(
|
223
|
+
raise ValueError("Incorrect number of SNPs in annot")
|
260
224
|
# -
|
261
225
|
n_a = annot.shape[1] # number of annotations
|
262
226
|
cor_sum = np.zeros((m, n_a))
|
@@ -277,18 +241,18 @@ class GenotypeArrayInMemory(object):
|
|
277
241
|
rfuncBB = np.zeros((c, c))
|
278
242
|
# chunk inside of block
|
279
243
|
for l_B in np.arange(0, b, c): # l_B := index of leftmost SNP in matrix B
|
280
|
-
B = A[:, l_B:l_B + c]
|
244
|
+
B = A[:, l_B : l_B + c]
|
281
245
|
# ld matrix
|
282
246
|
np.dot(A.T, B / n, out=rfuncAB)
|
283
247
|
# ld matrix square
|
284
248
|
rfuncAB = func(rfuncAB)
|
285
|
-
cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
|
249
|
+
cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
|
286
250
|
|
287
251
|
# chunk to right of block
|
288
252
|
b0 = b
|
289
253
|
md = int(c * np.floor(m / c))
|
290
254
|
end = md + 1 if md != m else md
|
291
|
-
for l_B in tqdm(np.arange(b0, end, c), desc=
|
255
|
+
for l_B in tqdm(np.arange(b0, end, c), desc="Compute SNP Gene Weight"):
|
292
256
|
# check if the annot matrix is all zeros for this block + chunk
|
293
257
|
# this happens w/ sparse categories (i.e., pathways)
|
294
258
|
# update the block
|
@@ -298,10 +262,10 @@ class GenotypeArrayInMemory(object):
|
|
298
262
|
# block_size can't increase more than c
|
299
263
|
# block_size can't be less than c unless it is zero
|
300
264
|
# both of these things make sense
|
301
|
-
A = np.hstack((A[:, old_b - b + c:old_b], B))
|
265
|
+
A = np.hstack((A[:, old_b - b + c : old_b], B))
|
302
266
|
l_A += old_b - b + c
|
303
267
|
elif l_B == b0 and b > 0:
|
304
|
-
A = A[:, b0 - b:b0]
|
268
|
+
A = A[:, b0 - b : b0]
|
305
269
|
l_A = b0 - b
|
306
270
|
elif b == 0: # no SNPs to left in window, e.g., after a sequence gap
|
307
271
|
A = np.array(()).reshape((n, 0))
|
@@ -314,44 +278,45 @@ class GenotypeArrayInMemory(object):
|
|
314
278
|
rfuncAB = np.zeros((b, c))
|
315
279
|
# -
|
316
280
|
B = snp_getter(c)
|
317
|
-
p1 = np.all(annot[l_A:l_A + b, :] == 0)
|
318
|
-
p2 = np.all(annot[l_B:l_B + c, :] == 0)
|
281
|
+
p1 = np.all(annot[l_A : l_A + b, :] == 0)
|
282
|
+
p2 = np.all(annot[l_B : l_B + c, :] == 0)
|
319
283
|
if p1 and p2:
|
320
284
|
continue
|
321
285
|
# -
|
322
286
|
np.dot(A.T, B / n, out=rfuncAB)
|
323
287
|
rfuncAB = func(rfuncAB)
|
324
|
-
cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
|
325
|
-
cor_sum[l_B:l_B + c, :] += np.dot(annot[l_A:l_A + b, :].T, rfuncAB).T
|
288
|
+
cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
|
289
|
+
cor_sum[l_B : l_B + c, :] += np.dot(annot[l_A : l_A + b, :].T, rfuncAB).T
|
326
290
|
np.dot(B.T, B / n, out=rfuncBB)
|
327
291
|
rfuncBB = func(rfuncBB)
|
328
|
-
cor_sum[l_B:l_B + c, :] += np.dot(rfuncBB, annot[l_B:l_B + c, :])
|
292
|
+
cor_sum[l_B : l_B + c, :] += np.dot(rfuncBB, annot[l_B : l_B + c, :])
|
329
293
|
# -
|
330
294
|
return cor_sum
|
331
295
|
|
332
296
|
|
333
297
|
class PlinkBEDFile(GenotypeArrayInMemory):
|
334
|
-
|
298
|
+
"""
|
335
299
|
Interface for Plink .bed format
|
336
|
-
|
300
|
+
"""
|
337
301
|
|
338
302
|
def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
|
339
303
|
self._bedcode = {
|
340
|
-
2: ba.bitarray(
|
341
|
-
9: ba.bitarray(
|
342
|
-
1: ba.bitarray(
|
343
|
-
0: ba.bitarray(
|
304
|
+
2: ba.bitarray("11"),
|
305
|
+
9: ba.bitarray("10"),
|
306
|
+
1: ba.bitarray("01"),
|
307
|
+
0: ba.bitarray("00"),
|
344
308
|
}
|
345
309
|
# -
|
346
|
-
GenotypeArrayInMemory.__init__(
|
347
|
-
|
310
|
+
GenotypeArrayInMemory.__init__(
|
311
|
+
self, fname, n, snp_list, keep_snps=keep_snps, keep_indivs=keep_indivs, mafMin=mafMin
|
312
|
+
)
|
348
313
|
|
349
314
|
# -
|
350
315
|
def __read__(self, fname, m, n):
|
351
|
-
if not fname.endswith(
|
352
|
-
raise ValueError(
|
316
|
+
if not fname.endswith(".bed"):
|
317
|
+
raise ValueError(".bed filename must end in .bed")
|
353
318
|
# -
|
354
|
-
fh = open(fname,
|
319
|
+
fh = open(fname, "rb")
|
355
320
|
magicNumber = ba.bitarray(endian="little")
|
356
321
|
magicNumber.fromfile(fh, 2)
|
357
322
|
bedMode = ba.bitarray(endian="little")
|
@@ -360,11 +325,11 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
360
325
|
nru = n + e
|
361
326
|
self.nru = nru
|
362
327
|
# check magic number
|
363
|
-
if magicNumber != ba.bitarray(
|
364
|
-
raise
|
328
|
+
if magicNumber != ba.bitarray("0011011011011000"):
|
329
|
+
raise OSError("Magic number from Plink .bed file not recognized")
|
365
330
|
# -
|
366
|
-
if bedMode != ba.bitarray(
|
367
|
-
raise
|
331
|
+
if bedMode != ba.bitarray("10000000"):
|
332
|
+
raise OSError("Plink .bed file must be in default SNP-major mode")
|
368
333
|
# check file length
|
369
334
|
self.geno = ba.bitarray(endian="little")
|
370
335
|
self.geno.fromfile(fh)
|
@@ -377,7 +342,7 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
377
342
|
real_len = len(geno)
|
378
343
|
if real_len != exp_len:
|
379
344
|
s = "Plink .bed file has {n1} bits, expected {n2}"
|
380
|
-
raise
|
345
|
+
raise OSError(s.format(n1=real_len, n2=exp_len))
|
381
346
|
|
382
347
|
# -
|
383
348
|
def __filter_indivs__(self, geno, keep_indivs, m, n):
|
@@ -388,14 +353,14 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
388
353
|
z = ba.bitarray(m * 2 * nru_new, endian="little")
|
389
354
|
z.setall(0)
|
390
355
|
for e, i in enumerate(keep_indivs):
|
391
|
-
z[2 * e::2 * nru_new] = geno[2 * i::2 * nru]
|
392
|
-
z[2 * e + 1::2 * nru_new] = geno[2 * i + 1::2 * nru]
|
356
|
+
z[2 * e :: 2 * nru_new] = geno[2 * i :: 2 * nru]
|
357
|
+
z[2 * e + 1 :: 2 * nru_new] = geno[2 * i + 1 :: 2 * nru]
|
393
358
|
self.nru = nru_new
|
394
359
|
return (z, m, n_new)
|
395
360
|
|
396
361
|
# -
|
397
362
|
def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps):
|
398
|
-
|
363
|
+
"""
|
399
364
|
Credit to Chris Chang and the Plink2 developers for this algorithm
|
400
365
|
Modified from plink_filter.c
|
401
366
|
https://github.com/chrchang/plink-ng/blob/master/plink_filter.c
|
@@ -414,7 +379,7 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
414
379
|
major allele frequency = (b+c)/(2*(n-a+c))
|
415
380
|
het ct + missing ct = a + b - 2*c
|
416
381
|
Why does bitarray not have >> ????
|
417
|
-
|
382
|
+
"""
|
418
383
|
nru = self.nru
|
419
384
|
m_poly = 0
|
420
385
|
y = ba.bitarray()
|
@@ -423,7 +388,7 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
423
388
|
kept_snps = []
|
424
389
|
freq = []
|
425
390
|
for e, j in enumerate(keep_snps):
|
426
|
-
z = geno[2 * nru * j:2 * nru * (j + 1)]
|
391
|
+
z = geno[2 * nru * j : 2 * nru * (j + 1)]
|
427
392
|
A = z[0::2]
|
428
393
|
a = A.count()
|
429
394
|
B = z[1::2]
|
@@ -443,9 +408,10 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
443
408
|
|
444
409
|
# -
|
445
410
|
def nextSNPs(self, b, minorRef=None):
|
446
|
-
|
411
|
+
"""
|
447
412
|
Unpacks the binary array of genotypes and returns an n x b matrix of floats of
|
448
413
|
normalized genotypes for the next b SNPs, where n := number of samples.
|
414
|
+
|
449
415
|
Parameters
|
450
416
|
----------
|
451
417
|
b : int
|
@@ -453,29 +419,30 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
453
419
|
minorRef: bool, default None
|
454
420
|
Should we flip reference alleles so that the minor allele is the reference?
|
455
421
|
(This is useful for computing l1 w.r.t. minor allele).
|
422
|
+
|
456
423
|
Returns
|
457
424
|
-------
|
458
425
|
X : np.array with dtype float64 with shape (n, b), where n := number of samples
|
459
426
|
Matrix of genotypes normalized to mean zero and variance one. If minorRef is
|
460
427
|
not None, then the minor allele will be the positive allele (i.e., two copies
|
461
428
|
of the minor allele --> a positive number).
|
462
|
-
|
429
|
+
"""
|
463
430
|
# -
|
464
431
|
try:
|
465
432
|
b = int(b)
|
466
433
|
if b <= 0:
|
467
434
|
raise ValueError("b must be > 0")
|
468
|
-
except TypeError:
|
469
|
-
raise TypeError("b must be an integer")
|
435
|
+
except TypeError as e:
|
436
|
+
raise TypeError("b must be an integer") from e
|
470
437
|
# -
|
471
438
|
if self._currentSNP + b > self.m:
|
472
|
-
s =
|
439
|
+
s = "{b} SNPs requested, {k} SNPs remain"
|
473
440
|
raise ValueError(s.format(b=b, k=(self.m - self._currentSNP)))
|
474
441
|
# -
|
475
442
|
c = self._currentSNP
|
476
443
|
n = self.n
|
477
444
|
nru = self.nru
|
478
|
-
slice = self.geno[2 * c * nru:2 * (c + b) * nru]
|
445
|
+
slice = self.geno[2 * c * nru : 2 * (c + b) * nru]
|
479
446
|
X = np.array(slice.decode(self._bedcode), dtype="float64").reshape((b, nru)).T
|
480
447
|
X = X[0:n, :]
|
481
448
|
Y = np.zeros(X.shape)
|
@@ -498,238 +465,25 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
498
465
|
return Y
|
499
466
|
|
500
467
|
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
c=500,
|
507
|
-
r2_threshold=1e-4,
|
508
|
-
annot=None):
|
509
|
-
|
510
|
-
func = np.square
|
511
|
-
snp_getter = self.nextSNPs
|
512
|
-
data, rows, cols = [], [], []
|
513
|
-
|
514
|
-
def add_rfuncAB(rfuncAB, l_A, l_B):
|
515
|
-
non_zero_indices = np.nonzero(rfuncAB > r2_threshold)
|
516
|
-
data.extend(rfuncAB[non_zero_indices])
|
517
|
-
rows.extend(l_A + non_zero_indices[0])
|
518
|
-
cols.extend(l_B + non_zero_indices[1])
|
519
|
-
|
520
|
-
# def add_rfuncAB(rfuncAB, l_A, l_B):
|
521
|
-
# # not need select non zero indices
|
522
|
-
# data.extend(rfuncAB.flatten())
|
523
|
-
# rows.extend(l_A + np.repeat(np.arange(rfuncAB.shape[0]), rfuncAB.shape[1]))
|
524
|
-
# cols.extend(l_B + np.tile(np.arange(rfuncAB.shape[1]), rfuncAB.shape[0]))
|
525
|
-
|
526
|
-
# def add_rfuncBB(rfuncBB, l_B):
|
527
|
-
# non_zero_indices = np.nonzero(rfuncBB)
|
528
|
-
# data.extend(rfuncBB[non_zero_indices])
|
529
|
-
# rows.extend(l_B + non_zero_indices[0])
|
530
|
-
# cols.extend(l_B + non_zero_indices[1])
|
531
|
-
|
532
|
-
def add_rfuncBB(rfuncBB, l_B):
|
533
|
-
non_zero_indices = np.nonzero(rfuncBB > r2_threshold)
|
534
|
-
data.extend(rfuncBB[non_zero_indices])
|
535
|
-
rows.extend(l_B + non_zero_indices[0])
|
536
|
-
cols.extend(l_B + non_zero_indices[1])
|
537
|
-
if len(data) > chunk_size:
|
538
|
-
# save the cache
|
539
|
-
print(f'Start saving the cache file: {output_cache_file_dir / f"{l_B}.npz"}')
|
540
|
-
r2_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(self.m, self.m), dtype='float16')
|
541
|
-
save_npz(output_cache_file_dir / f'{l_B}.npz', r2_sparse_matrix)
|
542
|
-
# reset the data
|
543
|
-
data.clear()
|
544
|
-
rows.clear()
|
545
|
-
cols.clear()
|
546
|
-
|
547
|
-
m, n = self.m, self.n
|
548
|
-
block_sizes = np.array(np.arange(m) - block_left)
|
549
|
-
block_sizes = np.ceil(block_sizes / c) * c
|
550
|
-
if annot is None:
|
551
|
-
annot = np.ones((m, 1))
|
552
|
-
else:
|
553
|
-
annot_m = annot.shape[0]
|
554
|
-
if annot_m != self.m:
|
555
|
-
raise ValueError('Incorrect number of SNPs in annot')
|
556
|
-
# -
|
557
|
-
n_a = annot.shape[1] # number of annotations
|
558
|
-
# cor_sum = np.zeros((m, n_a))
|
559
|
-
# b = index of first SNP for which SNP 0 is not included in LD Score
|
560
|
-
b = np.nonzero(block_left > 0)
|
561
|
-
if np.any(b):
|
562
|
-
b = b[0][0]
|
563
|
-
else:
|
564
|
-
b = m
|
565
|
-
b = int(np.ceil(b / c) * c) # round up to a multiple of c
|
566
|
-
if b > m:
|
567
|
-
c = 1
|
568
|
-
b = m
|
569
|
-
|
570
|
-
l_A = 0 # l_A := index of leftmost SNP in matrix A
|
571
|
-
A = snp_getter(b)
|
572
|
-
rfuncAB = np.zeros((b, c))
|
573
|
-
rfuncBB = np.zeros((c, c))
|
574
|
-
# chunk inside of block
|
575
|
-
for l_B in np.arange(0, b, c): # l_B := index of leftmost SNP in matrix B
|
576
|
-
B = A[:, l_B:l_B + c]
|
577
|
-
# ld matrix
|
578
|
-
np.dot(A.T, B / n, out=rfuncAB)
|
579
|
-
# ld matrix square
|
580
|
-
rfuncAB = func(rfuncAB)
|
581
|
-
add_rfuncAB(rfuncAB, l_A, l_B)
|
582
|
-
# cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
|
468
|
+
def load_bfile(bfile_chr_prefix):
|
469
|
+
PlinkBIMFile = ID_List_Factory(
|
470
|
+
["CHR", "SNP", "CM", "BP", "A1", "A2"], 1, ".bim", usecols=[0, 1, 2, 3, 4, 5]
|
471
|
+
)
|
472
|
+
PlinkFAMFile = ID_List_Factory(["IID"], 0, ".fam", usecols=[1])
|
583
473
|
|
584
|
-
|
585
|
-
|
586
|
-
md = int(c * np.floor(m / c))
|
587
|
-
end = md + 1 if md != m else md
|
588
|
-
for l_B in trange(b0, end, c, desc=f'Compute r2 cache for {output_cache_file_dir.name}'):
|
589
|
-
# check if the annot matrix is all zeros for this block + chunk
|
590
|
-
# this happens w/ sparse categories (i.e., pathways)
|
591
|
-
# update the block
|
592
|
-
old_b = b
|
593
|
-
b = int(block_sizes[l_B])
|
594
|
-
if l_B > b0 and b > 0:
|
595
|
-
# block_size can't increase more than c
|
596
|
-
# block_size can't be less than c unless it is zero
|
597
|
-
# both of these things make sense
|
598
|
-
A = np.hstack((A[:, old_b - b + c:old_b], B))
|
599
|
-
l_A += old_b - b + c
|
600
|
-
elif l_B == b0 and b > 0:
|
601
|
-
A = A[:, b0 - b:b0]
|
602
|
-
l_A = b0 - b
|
603
|
-
elif b == 0: # no SNPs to left in window, e.g., after a sequence gap
|
604
|
-
A = np.array(()).reshape((n, 0))
|
605
|
-
l_A = l_B
|
606
|
-
if l_B == md:
|
607
|
-
c = m - md
|
608
|
-
rfuncAB = np.zeros((b, c))
|
609
|
-
rfuncBB = np.zeros((c, c))
|
610
|
-
if b != old_b:
|
611
|
-
rfuncAB = np.zeros((b, c))
|
612
|
-
# -
|
613
|
-
B = snp_getter(c)
|
614
|
-
p1 = np.all(annot[l_A:l_A + b, :] == 0)
|
615
|
-
p2 = np.all(annot[l_B:l_B + c, :] == 0)
|
616
|
-
if p1 and p2:
|
617
|
-
continue
|
618
|
-
# -
|
619
|
-
np.dot(A.T, B / n, out=rfuncAB)
|
620
|
-
rfuncAB = func(rfuncAB)
|
621
|
-
# cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
|
622
|
-
# cor_sum[l_B:l_B + c, :] += np.dot(annot[l_A:l_A + b, :].T, rfuncAB).T
|
623
|
-
add_rfuncAB(rfuncAB, l_A, l_B)
|
624
|
-
add_rfuncAB(rfuncAB.T, l_B, l_A)
|
625
|
-
np.dot(B.T, B / n, out=rfuncBB)
|
626
|
-
rfuncBB = func(rfuncBB)
|
627
|
-
# cor_sum[l_B:l_B + c, :] += np.dot(rfuncBB, annot[l_B:l_B + c, :])
|
628
|
-
add_rfuncBB(rfuncBB, l_B)
|
629
|
-
if len(data) > 0:
|
630
|
-
# save remaining data
|
631
|
-
# save the cache
|
632
|
-
print(f'Start saving the cache file: {output_cache_file_dir / f"{l_B}.npz"}')
|
633
|
-
r2_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(m, m), dtype='float16')
|
634
|
-
save_npz(output_cache_file_dir / f'{l_B}.npz', r2_sparse_matrix)
|
635
|
-
# combine the cache files
|
636
|
-
print(f'Start combining the cache files in {output_cache_file_dir}')
|
637
|
-
cached_r2_matrix_files = list(output_cache_file_dir.glob('*.npz'))
|
638
|
-
combined_r2_matrix_files = self.load_r2_matrix_from_cache_files(output_cache_file_dir)
|
639
|
-
# remove the cache files
|
640
|
-
for cached_r2_matrix_file in cached_r2_matrix_files:
|
641
|
-
cached_r2_matrix_file.unlink()
|
642
|
-
# save the combined r2 matrix
|
643
|
-
print(f'Start saving the combined r2 matrix in {output_cache_file_dir}')
|
644
|
-
combined_r2_matrix_file = output_cache_file_dir / 'combined_r2_matrix.npz'
|
645
|
-
save_npz(combined_r2_matrix_file, combined_r2_matrix_files)
|
646
|
-
|
647
|
-
def get_ldscore_using_r2_cache(self, annot_matrix, cached_r2_matrix_dir):
|
648
|
-
"""
|
649
|
-
Compute the r2 matrix multiplication with annot_matrix
|
650
|
-
"""
|
651
|
-
# Compute the r2 matrix multiplication with annot_matrix
|
652
|
-
cached_r2_matrix_dir = Path(cached_r2_matrix_dir)
|
653
|
-
# iter the cached r2 matrix files
|
654
|
-
result_matrix = np.zeros((self.m, annot_matrix.shape[1]))
|
655
|
-
cached_r2_matrix_files = list(cached_r2_matrix_dir.glob('*.npz'))
|
656
|
-
assert len(cached_r2_matrix_files) > 0, (f'No cached r2 matrix files in {cached_r2_matrix_dir}'
|
657
|
-
f'Please run the function compute_r2_cache first!')
|
658
|
-
for r2_matrix_file in tqdm(cached_r2_matrix_files, desc=f'Compute ld score for {cached_r2_matrix_dir.name}'):
|
659
|
-
print(f'Compute r2 matrix multiplication for {r2_matrix_file}')
|
660
|
-
r2_matrix = load_npz(r2_matrix_file)
|
661
|
-
result_matrix += r2_matrix.dot(annot_matrix)
|
662
|
-
return result_matrix
|
663
|
-
|
664
|
-
def load_r2_matrix_from_cache_files(self, cached_r2_matrix_dir):
|
665
|
-
"""
|
666
|
-
Load the r2 matrix from cache
|
667
|
-
"""
|
668
|
-
cached_r2_matrix_dir = Path(cached_r2_matrix_dir)
|
669
|
-
# iter the cached r2 matrix files
|
670
|
-
cached_r2_matrix_files = list(cached_r2_matrix_dir.glob('*.npz'))
|
671
|
-
assert len(cached_r2_matrix_files) > 0, (f'No cached r2 matrix files in {cached_r2_matrix_dir}'
|
672
|
-
f'Please run the function compute_r2_cache first!')
|
673
|
-
# load the r2 matrix
|
674
|
-
r2_matrix = load_npz(cached_r2_matrix_files[0])
|
675
|
-
for r2_matrix_file in tqdm(cached_r2_matrix_files[1:], desc=f'Load r2 matrix from {cached_r2_matrix_dir.name}'):
|
676
|
-
print(f'Load r2 matrix from {r2_matrix_file}')
|
677
|
-
r2_matrix += load_npz(r2_matrix_file)
|
678
|
-
# to float16
|
679
|
-
r2_matrix = r2_matrix.astype('float16')
|
680
|
-
return r2_matrix
|
681
|
-
def load_combined_r2_matrix(self, cached_r2_matrix_dir):
|
682
|
-
"""
|
683
|
-
Load the combined r2 matrix
|
684
|
-
"""
|
685
|
-
combined_r2_matrix_file = Path(cached_r2_matrix_dir) / 'combined_r2_matrix.npz'
|
686
|
-
assert combined_r2_matrix_file.exists(), (f'No combined r2 matrix file in {cached_r2_matrix_dir}'
|
687
|
-
f'Should delete the cache files and run the function compute_r2_cache first!')
|
688
|
-
# load the r2 matrix
|
689
|
-
r2_matrix = load_npz(combined_r2_matrix_file)
|
690
|
-
# to float16
|
691
|
-
r2_matrix = r2_matrix.astype('float16')
|
692
|
-
return r2_matrix
|
474
|
+
snp_file = bfile_chr_prefix + ".bim"
|
475
|
+
array_snps = PlinkBIMFile(snp_file)
|
693
476
|
|
694
|
-
def load_bfile(bfile_chr_prefix):
|
695
|
-
PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
|
696
|
-
PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])
|
697
|
-
|
698
|
-
snp_file, snp_obj = bfile_chr_prefix + '.bim', PlinkBIMFile
|
699
|
-
array_snps = snp_obj(snp_file)
|
700
|
-
m = len(array_snps.IDList)
|
701
|
-
print(f'Read list of {m} SNPs from {snp_file}')
|
702
|
-
#
|
703
477
|
# Load fam
|
704
|
-
ind_file
|
705
|
-
array_indivs =
|
478
|
+
ind_file = bfile_chr_prefix + ".fam"
|
479
|
+
array_indivs = PlinkFAMFile(ind_file)
|
480
|
+
|
706
481
|
n = len(array_indivs.IDList)
|
707
|
-
print(f'Read list of {n} individuals from {ind_file}')
|
708
482
|
|
709
483
|
# Load genotype array
|
710
|
-
array_file
|
711
|
-
geno_array =
|
484
|
+
array_file = bfile_chr_prefix + ".bed"
|
485
|
+
geno_array = PlinkBEDFile(
|
486
|
+
array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None
|
487
|
+
)
|
712
488
|
|
713
489
|
return array_snps, array_indivs, geno_array
|
714
|
-
|
715
|
-
|
716
|
-
def generate_r2_matrix_chr_cache(bfile_chr_prefix, ld_wind_cm, output_cache_file_dir):
|
717
|
-
# Load genotype array
|
718
|
-
array_snps, array_indivs, geno_array = load_bfile(bfile_chr_prefix)
|
719
|
-
# Compute block lefts
|
720
|
-
block_left = getBlockLefts(geno_array.df[:, 3], ld_wind_cm)
|
721
|
-
# Compute LD score
|
722
|
-
r2_matrix = geno_array.load_r2_matrix_from_cache(output_cache_file_dir)
|
723
|
-
|
724
|
-
|
725
|
-
def generate_r2_matrix_cache(bfile_prefix, chromosome_list, r2_cache_dir, ld_wind_cm=1):
|
726
|
-
r2_cache_dir = Path(r2_cache_dir)
|
727
|
-
|
728
|
-
for chr in chromosome_list:
|
729
|
-
output_cache_file_prefix = r2_cache_dir / f'chr{chr}'
|
730
|
-
output_cache_file_prefix.mkdir(parents=True, exist_ok=True)
|
731
|
-
bfile_chr_prefix = bfile_prefix + '.' + str(chr)
|
732
|
-
generate_r2_matrix_chr_cache(bfile_chr_prefix,
|
733
|
-
ld_wind_cm=ld_wind_cm,
|
734
|
-
output_cache_file_dir=output_cache_file_prefix)
|
735
|
-
print(f'Compute r2 matrix for chr{chr} done!')
|