gsMap 1.71.2__py3-none-any.whl → 1.72.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN/adjacency_matrix.py +25 -27
- gsMap/GNN/model.py +9 -7
- gsMap/GNN/train.py +8 -11
- gsMap/__init__.py +3 -3
- gsMap/__main__.py +3 -2
- gsMap/cauchy_combination_test.py +75 -72
- gsMap/config.py +822 -316
- gsMap/create_slice_mean.py +154 -0
- gsMap/diagnosis.py +179 -101
- gsMap/find_latent_representation.py +28 -26
- gsMap/format_sumstats.py +233 -201
- gsMap/generate_ldscore.py +353 -209
- gsMap/latent_to_gene.py +92 -60
- gsMap/main.py +23 -14
- gsMap/report.py +39 -25
- gsMap/run_all_mode.py +86 -46
- gsMap/setup.py +1 -1
- gsMap/spatial_ldsc_multiple_sumstats.py +154 -80
- gsMap/utils/generate_r2_matrix.py +173 -140
- gsMap/utils/jackknife.py +84 -80
- gsMap/utils/manhattan_plot.py +180 -207
- gsMap/utils/regression_read.py +105 -122
- gsMap/visualize.py +82 -64
- {gsmap-1.71.2.dist-info → gsmap-1.72.3.dist-info}/METADATA +21 -6
- gsmap-1.72.3.dist-info/RECORD +31 -0
- {gsmap-1.71.2.dist-info → gsmap-1.72.3.dist-info}/WHEEL +1 -1
- gsMap/utils/make_annotations.py +0 -518
- gsmap-1.71.2.dist-info/RECORD +0 -31
- {gsmap-1.71.2.dist-info → gsmap-1.72.3.dist-info}/LICENSE +0 -0
- {gsmap-1.71.2.dist-info → gsmap-1.72.3.dist-info}/entry_points.txt +0 -0
@@ -1,23 +1,23 @@
|
|
1
1
|
from pathlib import Path
|
2
|
+
|
2
3
|
import bitarray as ba
|
3
4
|
import numpy as np
|
4
5
|
import pandas as pd
|
5
|
-
from scipy.sparse import csr_matrix
|
6
|
-
from
|
7
|
-
from tqdm import trange, tqdm
|
6
|
+
from scipy.sparse import csr_matrix, load_npz, save_npz
|
7
|
+
from tqdm import tqdm, trange
|
8
8
|
|
9
9
|
|
10
10
|
# Define the log class
|
11
|
-
class Logger
|
11
|
+
class Logger:
|
12
12
|
# -
|
13
13
|
def __init__(self, fh):
|
14
|
-
self.log_fh = open(fh,
|
14
|
+
self.log_fh = open(fh, "w")
|
15
15
|
|
16
16
|
# -
|
17
17
|
def log(self, msg):
|
18
|
-
|
18
|
+
"""
|
19
19
|
Print to log file and stdout.
|
20
|
-
|
20
|
+
"""
|
21
21
|
print(msg, file=self.log_fh)
|
22
22
|
print(msg)
|
23
23
|
|
@@ -28,11 +28,11 @@ class Logger(object):
|
|
28
28
|
|
29
29
|
# Compute ld-score using cellular annotations
|
30
30
|
def get_compression(fh):
|
31
|
-
|
32
|
-
if fh.endswith(
|
33
|
-
compression =
|
34
|
-
elif fh.endswith(
|
35
|
-
compression =
|
31
|
+
"""Which sort of compression should we use with read_csv?"""
|
32
|
+
if fh.endswith("gz"):
|
33
|
+
compression = "gzip"
|
34
|
+
elif fh.endswith("bz2"):
|
35
|
+
compression = "bz2"
|
36
36
|
else:
|
37
37
|
compression = None
|
38
38
|
# -
|
@@ -42,7 +42,7 @@ def get_compression(fh):
|
|
42
42
|
# Define the reading functions
|
43
43
|
def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
|
44
44
|
# -
|
45
|
-
class IDContainer
|
45
|
+
class IDContainer:
|
46
46
|
"""
|
47
47
|
A class to read data from a file, store it as a DataFrame, and provide a method for a left outer join operation.
|
48
48
|
"""
|
@@ -66,14 +66,15 @@ def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
|
|
66
66
|
"""
|
67
67
|
end = self.fname_end
|
68
68
|
if end and not fname.endswith(end):
|
69
|
-
raise ValueError(
|
69
|
+
raise ValueError(f"{end} filename must end in {end}")
|
70
70
|
comp = get_compression(fname)
|
71
|
-
self.df = pd.read_csv(
|
72
|
-
|
71
|
+
self.df = pd.read_csv(
|
72
|
+
fname, header=self.header, usecols=self.usecols, sep=r"\s+", compression=comp
|
73
|
+
)
|
73
74
|
if self.colnames:
|
74
75
|
self.df.columns = self.colnames
|
75
76
|
if self.keepcol is not None:
|
76
|
-
self.IDList = self.df.iloc[:, [self.keepcol]].astype(
|
77
|
+
self.IDList = self.df.iloc[:, [self.keepcol]].astype("object")
|
77
78
|
|
78
79
|
# -
|
79
80
|
def loj(self, externalDf):
|
@@ -83,10 +84,9 @@ def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
|
|
83
84
|
r = externalDf.columns[0]
|
84
85
|
l = self.IDList.columns[0]
|
85
86
|
merge_df = externalDf.iloc[:, [0]]
|
86
|
-
merge_df[
|
87
|
-
z = pd.merge(self.IDList, merge_df, how=
|
88
|
-
|
89
|
-
ii = z['keep'] == True
|
87
|
+
merge_df["keep"] = True
|
88
|
+
z = pd.merge(self.IDList, merge_df, how="left", left_on=l, right_on=r, sort=False)
|
89
|
+
ii = z["keep"]
|
90
90
|
return np.nonzero(ii)[0]
|
91
91
|
|
92
92
|
# -
|
@@ -94,20 +94,22 @@ def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
|
|
94
94
|
|
95
95
|
|
96
96
|
def getBlockLefts(coords, max_dist):
|
97
|
-
|
97
|
+
"""
|
98
98
|
Converts coordinates + max block length to the a list of coordinates of the leftmost
|
99
99
|
SNPs to be included in blocks.
|
100
|
+
|
100
101
|
Parameters
|
101
102
|
----------
|
102
103
|
coords : array
|
103
104
|
Array of coordinates. Must be sorted.
|
104
105
|
max_dist : float
|
105
106
|
Maximum distance between SNPs included in the same window.
|
107
|
+
|
106
108
|
Returns
|
107
109
|
-------
|
108
110
|
block_left : 1D np.ndarray with same length as block_left
|
109
111
|
block_left[j] := min{k | dist(j, k) < max_dist}.
|
110
|
-
|
112
|
+
"""
|
111
113
|
M = len(coords)
|
112
114
|
j = 0
|
113
115
|
block_left = np.zeros(M)
|
@@ -120,17 +122,19 @@ def getBlockLefts(coords, max_dist):
|
|
120
122
|
|
121
123
|
|
122
124
|
def block_left_to_right(block_left):
|
123
|
-
|
125
|
+
"""
|
124
126
|
Converts block lefts to block rights.
|
127
|
+
|
125
128
|
Parameters
|
126
129
|
----------
|
127
130
|
block_left : array
|
128
131
|
Array of block lefts.
|
132
|
+
|
129
133
|
Returns
|
130
134
|
-------
|
131
135
|
block_right : 1D np.ndarray with same length as block_left
|
132
136
|
block_right[j] := max {k | block_left[k] <= j}
|
133
|
-
|
137
|
+
"""
|
134
138
|
M = len(block_left)
|
135
139
|
j = 0
|
136
140
|
block_right = np.zeros(M)
|
@@ -142,54 +146,57 @@ def block_left_to_right(block_left):
|
|
142
146
|
return block_right
|
143
147
|
|
144
148
|
|
145
|
-
class GenotypeArrayInMemory
|
146
|
-
|
149
|
+
class GenotypeArrayInMemory:
|
150
|
+
"""
|
147
151
|
Parent class for various classes containing interfaces for files with genotype
|
148
152
|
matrices, e.g., plink .bed files, etc
|
149
|
-
|
153
|
+
"""
|
150
154
|
|
151
155
|
def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
|
152
156
|
self.m = len(snp_list.IDList)
|
153
157
|
self.n = n
|
154
158
|
self.keep_snps = keep_snps
|
155
159
|
self.keep_indivs = keep_indivs
|
156
|
-
self.df = np.array(snp_list.df[[
|
157
|
-
self.colnames = [
|
160
|
+
self.df = np.array(snp_list.df[["CHR", "SNP", "BP", "CM"]])
|
161
|
+
self.colnames = ["CHR", "SNP", "BP", "CM"]
|
158
162
|
self.mafMin = mafMin if mafMin is not None else 0
|
159
163
|
self._currentSNP = 0
|
160
164
|
(self.nru, self.geno) = self.__read__(fname, self.m, n)
|
161
165
|
# filter individuals
|
162
166
|
if keep_indivs is not None:
|
163
|
-
keep_indivs = np.array(keep_indivs, dtype=
|
167
|
+
keep_indivs = np.array(keep_indivs, dtype="int")
|
164
168
|
if np.any(keep_indivs > self.n):
|
165
|
-
raise ValueError(
|
169
|
+
raise ValueError("keep_indivs indices out of bounds")
|
166
170
|
# -
|
167
|
-
(self.geno, self.m, self.n) = self.__filter_indivs__(
|
171
|
+
(self.geno, self.m, self.n) = self.__filter_indivs__(
|
172
|
+
self.geno, keep_indivs, self.m, self.n
|
173
|
+
)
|
168
174
|
# -
|
169
175
|
if self.n > 0:
|
170
|
-
print(
|
176
|
+
print(f"After filtering, {self.n} individuals remain")
|
171
177
|
else:
|
172
|
-
raise ValueError(
|
178
|
+
raise ValueError("After filtering, no individuals remain")
|
173
179
|
# -
|
174
180
|
# filter SNPs
|
175
181
|
if keep_snps is not None:
|
176
|
-
keep_snps = np.array(keep_snps, dtype=
|
182
|
+
keep_snps = np.array(keep_snps, dtype="int")
|
177
183
|
if np.any(keep_snps > self.m): # if keep_snps is None, this returns False
|
178
|
-
raise ValueError(
|
184
|
+
raise ValueError("keep_snps indices out of bounds")
|
179
185
|
# -
|
180
186
|
(self.geno, self.m, self.n, self.kept_snps, self.freq) = self.__filter_snps_maf__(
|
181
|
-
self.geno, self.m, self.n, self.mafMin, keep_snps
|
187
|
+
self.geno, self.m, self.n, self.mafMin, keep_snps
|
188
|
+
)
|
182
189
|
# -
|
183
190
|
if self.m > 0:
|
184
|
-
print(
|
191
|
+
print(f"After filtering, {self.m} SNPs remain")
|
185
192
|
else:
|
186
|
-
raise ValueError(
|
193
|
+
raise ValueError("After filtering, no SNPs remain")
|
187
194
|
# -
|
188
195
|
self.df = self.df[self.kept_snps, :]
|
189
196
|
self.maf = np.minimum(self.freq, np.ones(self.m) - self.freq)
|
190
197
|
self.sqrtpq = np.sqrt(self.freq * (np.ones(self.m) - self.freq))
|
191
198
|
self.df = np.c_[self.df, self.maf]
|
192
|
-
self.colnames.append(
|
199
|
+
self.colnames.append("MAF")
|
193
200
|
|
194
201
|
# -
|
195
202
|
def __read__(self, fname, m, n):
|
@@ -208,8 +215,11 @@ class GenotypeArrayInMemory(object):
|
|
208
215
|
|
209
216
|
# -
|
210
217
|
def ldScoreVarBlocks(self, block_left, c, annot=None):
|
211
|
-
|
212
|
-
|
218
|
+
"""Computes an unbiased estimate of L2(j) for j=1,..,M."""
|
219
|
+
|
220
|
+
def func(x):
|
221
|
+
return self.__l2_unbiased__(x, self.n)
|
222
|
+
|
213
223
|
snp_getter = self.nextSNPs
|
214
224
|
return self.__corSumVarBlocks__(block_left, c, func, snp_getter, annot)
|
215
225
|
|
@@ -225,7 +235,7 @@ class GenotypeArrayInMemory(object):
|
|
225
235
|
# Methods for calculating sums of Pearson correlation coefficients (i.e.,ld-score)
|
226
236
|
# c stands for the chunk size (default = 50)
|
227
237
|
def __corSumVarBlocks__(self, block_left, c, func, snp_getter, annot=None):
|
228
|
-
|
238
|
+
"""
|
229
239
|
Parameters
|
230
240
|
----------
|
231
241
|
block_left : np.ndarray with shape (M, )
|
@@ -243,11 +253,12 @@ class GenotypeArrayInMemory(object):
|
|
243
253
|
The method to be used to get the next SNPs
|
244
254
|
annot: numpy array with shape (m,n_a)
|
245
255
|
SNP annotations.
|
256
|
+
|
246
257
|
Returns
|
247
258
|
-------
|
248
259
|
cor_sum : np.ndarray with shape (M, num_annots)
|
249
260
|
Estimates.
|
250
|
-
|
261
|
+
"""
|
251
262
|
m, n = self.m, self.n
|
252
263
|
block_sizes = np.array(np.arange(m) - block_left)
|
253
264
|
block_sizes = np.ceil(block_sizes / c) * c
|
@@ -256,7 +267,7 @@ class GenotypeArrayInMemory(object):
|
|
256
267
|
else:
|
257
268
|
annot_m = annot.shape[0]
|
258
269
|
if annot_m != self.m:
|
259
|
-
raise ValueError(
|
270
|
+
raise ValueError("Incorrect number of SNPs in annot")
|
260
271
|
# -
|
261
272
|
n_a = annot.shape[1] # number of annotations
|
262
273
|
cor_sum = np.zeros((m, n_a))
|
@@ -277,18 +288,18 @@ class GenotypeArrayInMemory(object):
|
|
277
288
|
rfuncBB = np.zeros((c, c))
|
278
289
|
# chunk inside of block
|
279
290
|
for l_B in np.arange(0, b, c): # l_B := index of leftmost SNP in matrix B
|
280
|
-
B = A[:, l_B:l_B + c]
|
291
|
+
B = A[:, l_B : l_B + c]
|
281
292
|
# ld matrix
|
282
293
|
np.dot(A.T, B / n, out=rfuncAB)
|
283
294
|
# ld matrix square
|
284
295
|
rfuncAB = func(rfuncAB)
|
285
|
-
cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
|
296
|
+
cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
|
286
297
|
|
287
298
|
# chunk to right of block
|
288
299
|
b0 = b
|
289
300
|
md = int(c * np.floor(m / c))
|
290
301
|
end = md + 1 if md != m else md
|
291
|
-
for l_B in tqdm(np.arange(b0, end, c), desc=
|
302
|
+
for l_B in tqdm(np.arange(b0, end, c), desc="Compute SNP Gene Weight"):
|
292
303
|
# check if the annot matrix is all zeros for this block + chunk
|
293
304
|
# this happens w/ sparse categories (i.e., pathways)
|
294
305
|
# update the block
|
@@ -298,10 +309,10 @@ class GenotypeArrayInMemory(object):
|
|
298
309
|
# block_size can't increase more than c
|
299
310
|
# block_size can't be less than c unless it is zero
|
300
311
|
# both of these things make sense
|
301
|
-
A = np.hstack((A[:, old_b - b + c:old_b], B))
|
312
|
+
A = np.hstack((A[:, old_b - b + c : old_b], B))
|
302
313
|
l_A += old_b - b + c
|
303
314
|
elif l_B == b0 and b > 0:
|
304
|
-
A = A[:, b0 - b:b0]
|
315
|
+
A = A[:, b0 - b : b0]
|
305
316
|
l_A = b0 - b
|
306
317
|
elif b == 0: # no SNPs to left in window, e.g., after a sequence gap
|
307
318
|
A = np.array(()).reshape((n, 0))
|
@@ -314,44 +325,45 @@ class GenotypeArrayInMemory(object):
|
|
314
325
|
rfuncAB = np.zeros((b, c))
|
315
326
|
# -
|
316
327
|
B = snp_getter(c)
|
317
|
-
p1 = np.all(annot[l_A:l_A + b, :] == 0)
|
318
|
-
p2 = np.all(annot[l_B:l_B + c, :] == 0)
|
328
|
+
p1 = np.all(annot[l_A : l_A + b, :] == 0)
|
329
|
+
p2 = np.all(annot[l_B : l_B + c, :] == 0)
|
319
330
|
if p1 and p2:
|
320
331
|
continue
|
321
332
|
# -
|
322
333
|
np.dot(A.T, B / n, out=rfuncAB)
|
323
334
|
rfuncAB = func(rfuncAB)
|
324
|
-
cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
|
325
|
-
cor_sum[l_B:l_B + c, :] += np.dot(annot[l_A:l_A + b, :].T, rfuncAB).T
|
335
|
+
cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
|
336
|
+
cor_sum[l_B : l_B + c, :] += np.dot(annot[l_A : l_A + b, :].T, rfuncAB).T
|
326
337
|
np.dot(B.T, B / n, out=rfuncBB)
|
327
338
|
rfuncBB = func(rfuncBB)
|
328
|
-
cor_sum[l_B:l_B + c, :] += np.dot(rfuncBB, annot[l_B:l_B + c, :])
|
339
|
+
cor_sum[l_B : l_B + c, :] += np.dot(rfuncBB, annot[l_B : l_B + c, :])
|
329
340
|
# -
|
330
341
|
return cor_sum
|
331
342
|
|
332
343
|
|
333
344
|
class PlinkBEDFile(GenotypeArrayInMemory):
|
334
|
-
|
345
|
+
"""
|
335
346
|
Interface for Plink .bed format
|
336
|
-
|
347
|
+
"""
|
337
348
|
|
338
349
|
def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
|
339
350
|
self._bedcode = {
|
340
|
-
2: ba.bitarray(
|
341
|
-
9: ba.bitarray(
|
342
|
-
1: ba.bitarray(
|
343
|
-
0: ba.bitarray(
|
351
|
+
2: ba.bitarray("11"),
|
352
|
+
9: ba.bitarray("10"),
|
353
|
+
1: ba.bitarray("01"),
|
354
|
+
0: ba.bitarray("00"),
|
344
355
|
}
|
345
356
|
# -
|
346
|
-
GenotypeArrayInMemory.__init__(
|
347
|
-
|
357
|
+
GenotypeArrayInMemory.__init__(
|
358
|
+
self, fname, n, snp_list, keep_snps=keep_snps, keep_indivs=keep_indivs, mafMin=mafMin
|
359
|
+
)
|
348
360
|
|
349
361
|
# -
|
350
362
|
def __read__(self, fname, m, n):
|
351
|
-
if not fname.endswith(
|
352
|
-
raise ValueError(
|
363
|
+
if not fname.endswith(".bed"):
|
364
|
+
raise ValueError(".bed filename must end in .bed")
|
353
365
|
# -
|
354
|
-
fh = open(fname,
|
366
|
+
fh = open(fname, "rb")
|
355
367
|
magicNumber = ba.bitarray(endian="little")
|
356
368
|
magicNumber.fromfile(fh, 2)
|
357
369
|
bedMode = ba.bitarray(endian="little")
|
@@ -360,11 +372,11 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
360
372
|
nru = n + e
|
361
373
|
self.nru = nru
|
362
374
|
# check magic number
|
363
|
-
if magicNumber != ba.bitarray(
|
364
|
-
raise
|
375
|
+
if magicNumber != ba.bitarray("0011011011011000"):
|
376
|
+
raise OSError("Magic number from Plink .bed file not recognized")
|
365
377
|
# -
|
366
|
-
if bedMode != ba.bitarray(
|
367
|
-
raise
|
378
|
+
if bedMode != ba.bitarray("10000000"):
|
379
|
+
raise OSError("Plink .bed file must be in default SNP-major mode")
|
368
380
|
# check file length
|
369
381
|
self.geno = ba.bitarray(endian="little")
|
370
382
|
self.geno.fromfile(fh)
|
@@ -377,7 +389,7 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
377
389
|
real_len = len(geno)
|
378
390
|
if real_len != exp_len:
|
379
391
|
s = "Plink .bed file has {n1} bits, expected {n2}"
|
380
|
-
raise
|
392
|
+
raise OSError(s.format(n1=real_len, n2=exp_len))
|
381
393
|
|
382
394
|
# -
|
383
395
|
def __filter_indivs__(self, geno, keep_indivs, m, n):
|
@@ -388,14 +400,14 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
388
400
|
z = ba.bitarray(m * 2 * nru_new, endian="little")
|
389
401
|
z.setall(0)
|
390
402
|
for e, i in enumerate(keep_indivs):
|
391
|
-
z[2 * e::2 * nru_new] = geno[2 * i::2 * nru]
|
392
|
-
z[2 * e + 1::2 * nru_new] = geno[2 * i + 1::2 * nru]
|
403
|
+
z[2 * e :: 2 * nru_new] = geno[2 * i :: 2 * nru]
|
404
|
+
z[2 * e + 1 :: 2 * nru_new] = geno[2 * i + 1 :: 2 * nru]
|
393
405
|
self.nru = nru_new
|
394
406
|
return (z, m, n_new)
|
395
407
|
|
396
408
|
# -
|
397
409
|
def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps):
|
398
|
-
|
410
|
+
"""
|
399
411
|
Credit to Chris Chang and the Plink2 developers for this algorithm
|
400
412
|
Modified from plink_filter.c
|
401
413
|
https://github.com/chrchang/plink-ng/blob/master/plink_filter.c
|
@@ -414,7 +426,7 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
414
426
|
major allele frequency = (b+c)/(2*(n-a+c))
|
415
427
|
het ct + missing ct = a + b - 2*c
|
416
428
|
Why does bitarray not have >> ????
|
417
|
-
|
429
|
+
"""
|
418
430
|
nru = self.nru
|
419
431
|
m_poly = 0
|
420
432
|
y = ba.bitarray()
|
@@ -423,7 +435,7 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
423
435
|
kept_snps = []
|
424
436
|
freq = []
|
425
437
|
for e, j in enumerate(keep_snps):
|
426
|
-
z = geno[2 * nru * j:2 * nru * (j + 1)]
|
438
|
+
z = geno[2 * nru * j : 2 * nru * (j + 1)]
|
427
439
|
A = z[0::2]
|
428
440
|
a = A.count()
|
429
441
|
B = z[1::2]
|
@@ -443,9 +455,10 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
443
455
|
|
444
456
|
# -
|
445
457
|
def nextSNPs(self, b, minorRef=None):
|
446
|
-
|
458
|
+
"""
|
447
459
|
Unpacks the binary array of genotypes and returns an n x b matrix of floats of
|
448
460
|
normalized genotypes for the next b SNPs, where n := number of samples.
|
461
|
+
|
449
462
|
Parameters
|
450
463
|
----------
|
451
464
|
b : int
|
@@ -453,29 +466,30 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
453
466
|
minorRef: bool, default None
|
454
467
|
Should we flip reference alleles so that the minor allele is the reference?
|
455
468
|
(This is useful for computing l1 w.r.t. minor allele).
|
469
|
+
|
456
470
|
Returns
|
457
471
|
-------
|
458
472
|
X : np.array with dtype float64 with shape (n, b), where n := number of samples
|
459
473
|
Matrix of genotypes normalized to mean zero and variance one. If minorRef is
|
460
474
|
not None, then the minor allele will be the positive allele (i.e., two copies
|
461
475
|
of the minor allele --> a positive number).
|
462
|
-
|
476
|
+
"""
|
463
477
|
# -
|
464
478
|
try:
|
465
479
|
b = int(b)
|
466
480
|
if b <= 0:
|
467
481
|
raise ValueError("b must be > 0")
|
468
|
-
except TypeError:
|
469
|
-
raise TypeError("b must be an integer")
|
482
|
+
except TypeError as e:
|
483
|
+
raise TypeError("b must be an integer") from e
|
470
484
|
# -
|
471
485
|
if self._currentSNP + b > self.m:
|
472
|
-
s =
|
486
|
+
s = "{b} SNPs requested, {k} SNPs remain"
|
473
487
|
raise ValueError(s.format(b=b, k=(self.m - self._currentSNP)))
|
474
488
|
# -
|
475
489
|
c = self._currentSNP
|
476
490
|
n = self.n
|
477
491
|
nru = self.nru
|
478
|
-
slice = self.geno[2 * c * nru:2 * (c + b) * nru]
|
492
|
+
slice = self.geno[2 * c * nru : 2 * (c + b) * nru]
|
479
493
|
X = np.array(slice.decode(self._bedcode), dtype="float64").reshape((b, nru)).T
|
480
494
|
X = X[0:n, :]
|
481
495
|
Y = np.zeros(X.shape)
|
@@ -499,14 +513,15 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
499
513
|
|
500
514
|
|
501
515
|
class PlinkBEDFileWithR2Cache(PlinkBEDFile):
|
502
|
-
def compute_r2_cache(
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
516
|
+
def compute_r2_cache(
|
517
|
+
self,
|
518
|
+
block_left,
|
519
|
+
output_cache_file_dir: Path,
|
520
|
+
chunk_size=500_000_000,
|
521
|
+
c=500,
|
522
|
+
r2_threshold=1e-4,
|
523
|
+
annot=None,
|
524
|
+
):
|
510
525
|
func = np.square
|
511
526
|
snp_getter = self.nextSNPs
|
512
527
|
data, rows, cols = [], [], []
|
@@ -536,9 +551,11 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
|
|
536
551
|
cols.extend(l_B + non_zero_indices[1])
|
537
552
|
if len(data) > chunk_size:
|
538
553
|
# save the cache
|
539
|
-
print(f
|
540
|
-
r2_sparse_matrix = csr_matrix(
|
541
|
-
|
554
|
+
print(f"Start saving the cache file: {output_cache_file_dir / f'{l_B}.npz'}")
|
555
|
+
r2_sparse_matrix = csr_matrix(
|
556
|
+
(data, (rows, cols)), shape=(self.m, self.m), dtype="float16"
|
557
|
+
)
|
558
|
+
save_npz(output_cache_file_dir / f"{l_B}.npz", r2_sparse_matrix)
|
542
559
|
# reset the data
|
543
560
|
data.clear()
|
544
561
|
rows.clear()
|
@@ -552,9 +569,9 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
|
|
552
569
|
else:
|
553
570
|
annot_m = annot.shape[0]
|
554
571
|
if annot_m != self.m:
|
555
|
-
raise ValueError(
|
572
|
+
raise ValueError("Incorrect number of SNPs in annot")
|
556
573
|
# -
|
557
|
-
n_a = annot.shape[1] # number of annotations
|
574
|
+
# n_a = annot.shape[1] # number of annotations
|
558
575
|
# cor_sum = np.zeros((m, n_a))
|
559
576
|
# b = index of first SNP for which SNP 0 is not included in LD Score
|
560
577
|
b = np.nonzero(block_left > 0)
|
@@ -573,7 +590,7 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
|
|
573
590
|
rfuncBB = np.zeros((c, c))
|
574
591
|
# chunk inside of block
|
575
592
|
for l_B in np.arange(0, b, c): # l_B := index of leftmost SNP in matrix B
|
576
|
-
B = A[:, l_B:l_B + c]
|
593
|
+
B = A[:, l_B : l_B + c]
|
577
594
|
# ld matrix
|
578
595
|
np.dot(A.T, B / n, out=rfuncAB)
|
579
596
|
# ld matrix square
|
@@ -585,7 +602,7 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
|
|
585
602
|
b0 = b
|
586
603
|
md = int(c * np.floor(m / c))
|
587
604
|
end = md + 1 if md != m else md
|
588
|
-
for l_B in trange(b0, end, c, desc=f
|
605
|
+
for l_B in trange(b0, end, c, desc=f"Compute r2 cache for {output_cache_file_dir.name}"):
|
589
606
|
# check if the annot matrix is all zeros for this block + chunk
|
590
607
|
# this happens w/ sparse categories (i.e., pathways)
|
591
608
|
# update the block
|
@@ -595,10 +612,10 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
|
|
595
612
|
# block_size can't increase more than c
|
596
613
|
# block_size can't be less than c unless it is zero
|
597
614
|
# both of these things make sense
|
598
|
-
A = np.hstack((A[:, old_b - b + c:old_b], B))
|
615
|
+
A = np.hstack((A[:, old_b - b + c : old_b], B))
|
599
616
|
l_A += old_b - b + c
|
600
617
|
elif l_B == b0 and b > 0:
|
601
|
-
A = A[:, b0 - b:b0]
|
618
|
+
A = A[:, b0 - b : b0]
|
602
619
|
l_A = b0 - b
|
603
620
|
elif b == 0: # no SNPs to left in window, e.g., after a sequence gap
|
604
621
|
A = np.array(()).reshape((n, 0))
|
@@ -611,8 +628,8 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
|
|
611
628
|
rfuncAB = np.zeros((b, c))
|
612
629
|
# -
|
613
630
|
B = snp_getter(c)
|
614
|
-
p1 = np.all(annot[l_A:l_A + b, :] == 0)
|
615
|
-
p2 = np.all(annot[l_B:l_B + c, :] == 0)
|
631
|
+
p1 = np.all(annot[l_A : l_A + b, :] == 0)
|
632
|
+
p2 = np.all(annot[l_B : l_B + c, :] == 0)
|
616
633
|
if p1 and p2:
|
617
634
|
continue
|
618
635
|
# -
|
@@ -629,19 +646,19 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
|
|
629
646
|
if len(data) > 0:
|
630
647
|
# save remaining data
|
631
648
|
# save the cache
|
632
|
-
print(f
|
633
|
-
r2_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(m, m), dtype=
|
634
|
-
save_npz(output_cache_file_dir / f
|
649
|
+
print(f"Start saving the cache file: {output_cache_file_dir / f'{l_B}.npz'}")
|
650
|
+
r2_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(m, m), dtype="float16")
|
651
|
+
save_npz(output_cache_file_dir / f"{l_B}.npz", r2_sparse_matrix)
|
635
652
|
# combine the cache files
|
636
|
-
print(f
|
637
|
-
cached_r2_matrix_files = list(output_cache_file_dir.glob(
|
653
|
+
print(f"Start combining the cache files in {output_cache_file_dir}")
|
654
|
+
cached_r2_matrix_files = list(output_cache_file_dir.glob("*.npz"))
|
638
655
|
combined_r2_matrix_files = self.load_r2_matrix_from_cache_files(output_cache_file_dir)
|
639
656
|
# remove the cache files
|
640
657
|
for cached_r2_matrix_file in cached_r2_matrix_files:
|
641
658
|
cached_r2_matrix_file.unlink()
|
642
659
|
# save the combined r2 matrix
|
643
|
-
print(f
|
644
|
-
combined_r2_matrix_file = output_cache_file_dir /
|
660
|
+
print(f"Start saving the combined r2 matrix in {output_cache_file_dir}")
|
661
|
+
combined_r2_matrix_file = output_cache_file_dir / "combined_r2_matrix.npz"
|
645
662
|
save_npz(combined_r2_matrix_file, combined_r2_matrix_files)
|
646
663
|
|
647
664
|
def get_ldscore_using_r2_cache(self, annot_matrix, cached_r2_matrix_dir):
|
@@ -652,11 +669,15 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
|
|
652
669
|
cached_r2_matrix_dir = Path(cached_r2_matrix_dir)
|
653
670
|
# iter the cached r2 matrix files
|
654
671
|
result_matrix = np.zeros((self.m, annot_matrix.shape[1]))
|
655
|
-
cached_r2_matrix_files = list(cached_r2_matrix_dir.glob(
|
656
|
-
assert len(cached_r2_matrix_files) > 0, (
|
657
|
-
|
658
|
-
|
659
|
-
|
672
|
+
cached_r2_matrix_files = list(cached_r2_matrix_dir.glob("*.npz"))
|
673
|
+
assert len(cached_r2_matrix_files) > 0, (
|
674
|
+
f"No cached r2 matrix files in {cached_r2_matrix_dir}"
|
675
|
+
f"Please run the function compute_r2_cache first!"
|
676
|
+
)
|
677
|
+
for r2_matrix_file in tqdm(
|
678
|
+
cached_r2_matrix_files, desc=f"Compute ld score for {cached_r2_matrix_dir.name}"
|
679
|
+
):
|
680
|
+
print(f"Compute r2 matrix multiplication for {r2_matrix_file}")
|
660
681
|
r2_matrix = load_npz(r2_matrix_file)
|
661
682
|
result_matrix += r2_matrix.dot(annot_matrix)
|
662
683
|
return result_matrix
|
@@ -667,48 +688,60 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
|
|
667
688
|
"""
|
668
689
|
cached_r2_matrix_dir = Path(cached_r2_matrix_dir)
|
669
690
|
# iter the cached r2 matrix files
|
670
|
-
cached_r2_matrix_files = list(cached_r2_matrix_dir.glob(
|
671
|
-
assert len(cached_r2_matrix_files) > 0, (
|
672
|
-
|
691
|
+
cached_r2_matrix_files = list(cached_r2_matrix_dir.glob("*.npz"))
|
692
|
+
assert len(cached_r2_matrix_files) > 0, (
|
693
|
+
f"No cached r2 matrix files in {cached_r2_matrix_dir}"
|
694
|
+
f"Please run the function compute_r2_cache first!"
|
695
|
+
)
|
673
696
|
# load the r2 matrix
|
674
697
|
r2_matrix = load_npz(cached_r2_matrix_files[0])
|
675
|
-
for r2_matrix_file in tqdm(
|
676
|
-
|
698
|
+
for r2_matrix_file in tqdm(
|
699
|
+
cached_r2_matrix_files[1:], desc=f"Load r2 matrix from {cached_r2_matrix_dir.name}"
|
700
|
+
):
|
701
|
+
print(f"Load r2 matrix from {r2_matrix_file}")
|
677
702
|
r2_matrix += load_npz(r2_matrix_file)
|
678
703
|
# to float16
|
679
|
-
r2_matrix = r2_matrix.astype(
|
704
|
+
r2_matrix = r2_matrix.astype("float16")
|
680
705
|
return r2_matrix
|
706
|
+
|
681
707
|
def load_combined_r2_matrix(self, cached_r2_matrix_dir):
|
682
708
|
"""
|
683
709
|
Load the combined r2 matrix
|
684
710
|
"""
|
685
|
-
combined_r2_matrix_file = Path(cached_r2_matrix_dir) /
|
686
|
-
assert combined_r2_matrix_file.exists(), (
|
687
|
-
|
711
|
+
combined_r2_matrix_file = Path(cached_r2_matrix_dir) / "combined_r2_matrix.npz"
|
712
|
+
assert combined_r2_matrix_file.exists(), (
|
713
|
+
f"No combined r2 matrix file in {cached_r2_matrix_dir}"
|
714
|
+
f"Should delete the cache files and run the function compute_r2_cache first!"
|
715
|
+
)
|
688
716
|
# load the r2 matrix
|
689
717
|
r2_matrix = load_npz(combined_r2_matrix_file)
|
690
718
|
# to float16
|
691
|
-
r2_matrix = r2_matrix.astype(
|
719
|
+
r2_matrix = r2_matrix.astype("float16")
|
692
720
|
return r2_matrix
|
693
721
|
|
722
|
+
|
694
723
|
def load_bfile(bfile_chr_prefix):
|
695
|
-
PlinkBIMFile = ID_List_Factory(
|
696
|
-
|
724
|
+
PlinkBIMFile = ID_List_Factory(
|
725
|
+
["CHR", "SNP", "CM", "BP", "A1", "A2"], 1, ".bim", usecols=[0, 1, 2, 3, 4, 5]
|
726
|
+
)
|
727
|
+
PlinkFAMFile = ID_List_Factory(["IID"], 0, ".fam", usecols=[1])
|
697
728
|
|
698
|
-
snp_file, snp_obj = bfile_chr_prefix +
|
729
|
+
snp_file, snp_obj = bfile_chr_prefix + ".bim", PlinkBIMFile
|
699
730
|
array_snps = snp_obj(snp_file)
|
700
731
|
m = len(array_snps.IDList)
|
701
|
-
print(f
|
732
|
+
print(f"Read list of {m} SNPs from {snp_file}")
|
702
733
|
#
|
703
734
|
# Load fam
|
704
|
-
ind_file, ind_obj = bfile_chr_prefix +
|
735
|
+
ind_file, ind_obj = bfile_chr_prefix + ".fam", PlinkFAMFile
|
705
736
|
array_indivs = ind_obj(ind_file)
|
706
737
|
n = len(array_indivs.IDList)
|
707
|
-
print(f
|
738
|
+
print(f"Read list of {n} individuals from {ind_file}")
|
708
739
|
|
709
740
|
# Load genotype array
|
710
|
-
array_file, array_obj = bfile_chr_prefix +
|
711
|
-
geno_array = array_obj(
|
741
|
+
array_file, array_obj = bfile_chr_prefix + ".bed", PlinkBEDFileWithR2Cache
|
742
|
+
geno_array = array_obj(
|
743
|
+
array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None
|
744
|
+
)
|
712
745
|
|
713
746
|
return array_snps, array_indivs, geno_array
|
714
747
|
|
@@ -717,19 +750,19 @@ def generate_r2_matrix_chr_cache(bfile_chr_prefix, ld_wind_cm, output_cache_file
|
|
717
750
|
# Load genotype array
|
718
751
|
array_snps, array_indivs, geno_array = load_bfile(bfile_chr_prefix)
|
719
752
|
# Compute block lefts
|
720
|
-
block_left = getBlockLefts(geno_array.df[:, 3], ld_wind_cm)
|
753
|
+
# block_left = getBlockLefts(geno_array.df[:, 3], ld_wind_cm)
|
721
754
|
# Compute LD score
|
722
|
-
r2_matrix = geno_array.load_r2_matrix_from_cache(output_cache_file_dir)
|
755
|
+
# r2_matrix = geno_array.load_r2_matrix_from_cache(output_cache_file_dir)
|
723
756
|
|
724
757
|
|
725
758
|
def generate_r2_matrix_cache(bfile_prefix, chromosome_list, r2_cache_dir, ld_wind_cm=1):
|
726
759
|
r2_cache_dir = Path(r2_cache_dir)
|
727
760
|
|
728
761
|
for chr in chromosome_list:
|
729
|
-
output_cache_file_prefix = r2_cache_dir / f
|
762
|
+
output_cache_file_prefix = r2_cache_dir / f"chr{chr}"
|
730
763
|
output_cache_file_prefix.mkdir(parents=True, exist_ok=True)
|
731
|
-
bfile_chr_prefix = bfile_prefix +
|
732
|
-
generate_r2_matrix_chr_cache(
|
733
|
-
|
734
|
-
|
735
|
-
print(f
|
764
|
+
bfile_chr_prefix = bfile_prefix + "." + str(chr)
|
765
|
+
generate_r2_matrix_chr_cache(
|
766
|
+
bfile_chr_prefix, ld_wind_cm=ld_wind_cm, output_cache_file_dir=output_cache_file_prefix
|
767
|
+
)
|
768
|
+
print(f"Compute r2 matrix for chr{chr} done!")
|