gsMap 1.73.2__py3-none-any.whl → 1.73.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/__init__.py +1 -1
- gsMap/config.py +2 -9
- gsMap/diagnosis.py +4 -3
- gsMap/generate_ldscore.py +115 -453
- gsMap/utils/generate_r2_matrix.py +455 -352
- gsMap/utils/regression_read.py +131 -157
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/METADATA +1 -1
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/RECORD +11 -11
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/WHEEL +0 -0
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/entry_points.txt +0 -0
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/licenses/LICENSE +0 -0
@@ -1,67 +1,27 @@
|
|
1
|
+
"""
|
2
|
+
Module for reading and processing PLINK genotype data and calculating LD scores.
|
3
|
+
|
4
|
+
Note:
|
5
|
+
This code is adapted and modified from:
|
6
|
+
https://github.com/bulik/ldsc/blob/master/ldsc/ldscore.py
|
7
|
+
"""
|
8
|
+
|
9
|
+
import logging
|
10
|
+
|
1
11
|
import bitarray as ba
|
2
12
|
import numpy as np
|
3
13
|
import pandas as pd
|
14
|
+
import pyranges as pr
|
4
15
|
from tqdm import tqdm
|
5
16
|
|
6
|
-
|
7
|
-
|
8
|
-
def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
|
9
|
-
# -
|
10
|
-
class IDContainer:
|
11
|
-
"""
|
12
|
-
A class to read data from a file, store it as a DataFrame, and provide a method for a left outer join operation.
|
13
|
-
"""
|
14
|
-
|
15
|
-
def __init__(self, fname):
|
16
|
-
"""
|
17
|
-
Initialize the IDContainer with the given filename and reading options.
|
18
|
-
"""
|
19
|
-
self.usecols = usecols
|
20
|
-
self.colnames = colnames
|
21
|
-
self.keepcol = keepcol
|
22
|
-
self.fname_end = fname_end
|
23
|
-
self.header = header
|
24
|
-
self.read(fname)
|
25
|
-
self.n = len(self.df)
|
26
|
-
|
27
|
-
# -
|
28
|
-
def read(self, fname):
|
29
|
-
"""
|
30
|
-
Read data from the given file and store it as a DataFrame.
|
31
|
-
"""
|
32
|
-
end = self.fname_end
|
33
|
-
if end and not fname.endswith(end):
|
34
|
-
raise ValueError(f"{end} filename must end in {end}")
|
35
|
-
self.df = pd.read_csv(
|
36
|
-
fname,
|
37
|
-
header=self.header,
|
38
|
-
usecols=self.usecols,
|
39
|
-
sep=r"\s+",
|
40
|
-
)
|
41
|
-
if self.colnames:
|
42
|
-
self.df.columns = self.colnames
|
43
|
-
if self.keepcol is not None:
|
44
|
-
self.IDList = self.df.iloc[:, [self.keepcol]].astype("object")
|
45
|
-
|
46
|
-
return IDContainer
|
17
|
+
# Configure logger
|
18
|
+
logger = logging.getLogger("gsMap.utils.plink_ldscore_tool")
|
47
19
|
|
48
20
|
|
49
21
|
def getBlockLefts(coords, max_dist):
|
50
22
|
"""
|
51
|
-
Converts coordinates + max block length to
|
23
|
+
Converts coordinates + max block length to a list of coordinates of the leftmost
|
52
24
|
SNPs to be included in blocks.
|
53
|
-
|
54
|
-
Parameters
|
55
|
-
----------
|
56
|
-
coords : array
|
57
|
-
Array of coordinates. Must be sorted.
|
58
|
-
max_dist : float
|
59
|
-
Maximum distance between SNPs included in the same window.
|
60
|
-
|
61
|
-
Returns
|
62
|
-
-------
|
63
|
-
block_left : 1D np.ndarray with same length as block_left
|
64
|
-
block_left[j] := min{k | dist(j, k) < max_dist}.
|
65
25
|
"""
|
66
26
|
M = len(coords)
|
67
27
|
j = 0
|
@@ -77,16 +37,6 @@ def getBlockLefts(coords, max_dist):
|
|
77
37
|
def block_left_to_right(block_left):
|
78
38
|
"""
|
79
39
|
Converts block lefts to block rights.
|
80
|
-
|
81
|
-
Parameters
|
82
|
-
----------
|
83
|
-
block_left : array
|
84
|
-
Array of block lefts.
|
85
|
-
|
86
|
-
Returns
|
87
|
-
-------
|
88
|
-
block_right : 1D np.ndarray with same length as block_left
|
89
|
-
block_right[j] := max {k | block_left[k] <= j}
|
90
40
|
"""
|
91
41
|
M = len(block_left)
|
92
42
|
j = 0
|
@@ -99,223 +49,149 @@ def block_left_to_right(block_left):
|
|
99
49
|
return block_right
|
100
50
|
|
101
51
|
|
102
|
-
class
|
52
|
+
class PlinkBEDFile:
|
103
53
|
"""
|
104
|
-
|
105
|
-
matrices, e.g., plink .bed files, etc
|
54
|
+
Interface for Plink .bed format for reading and processing genotype data.
|
106
55
|
"""
|
107
56
|
|
108
|
-
def __init__(self,
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
(self.geno, self.m, self.n, self.kept_snps, self.freq) = self.__filter_snps_maf__(
|
140
|
-
self.geno, self.m, self.n, self.mafMin, keep_snps
|
57
|
+
def __init__(self, bfile_prefix):
|
58
|
+
"""
|
59
|
+
Initialize the PlinkBEDFile from a PLINK file prefix.
|
60
|
+
|
61
|
+
Parameters
|
62
|
+
----------
|
63
|
+
bfile_prefix : str
|
64
|
+
PLINK file prefix (without .bed/.bim/.fam extension)
|
65
|
+
"""
|
66
|
+
# Initialize bitarray for bed code mapping
|
67
|
+
self._bedcode = {
|
68
|
+
2: ba.bitarray("11"),
|
69
|
+
9: ba.bitarray("10"),
|
70
|
+
1: ba.bitarray("01"),
|
71
|
+
0: ba.bitarray("00"),
|
72
|
+
}
|
73
|
+
|
74
|
+
# Load BIM file
|
75
|
+
self.bim_df = self.load_bim(f"{bfile_prefix}.bim")
|
76
|
+
|
77
|
+
# Load FAM file
|
78
|
+
self.fam_df = self.load_fam(f"{bfile_prefix}.fam")
|
79
|
+
|
80
|
+
# Set up initial parameters
|
81
|
+
self.m_original = len(self.bim_df)
|
82
|
+
self.n_original = len(self.fam_df)
|
83
|
+
|
84
|
+
# Read the bed file
|
85
|
+
logger.info(f"Loading Plink genotype data from {bfile_prefix}.bed")
|
86
|
+
(self.nru_original, self.geno_original) = self._read(
|
87
|
+
f"{bfile_prefix}.bed", self.m_original, self.n_original
|
141
88
|
)
|
142
|
-
|
143
|
-
|
144
|
-
|
89
|
+
|
90
|
+
# Pre-calculate MAF for all SNPs
|
91
|
+
logger.info("Calculating MAF and QC for all SNPs")
|
92
|
+
self.all_snp_info = self._calculate_all_snp_info()
|
93
|
+
|
94
|
+
# Filter out invalid SNPs
|
95
|
+
valid_mask = self.all_snp_info["valid_snp"]
|
96
|
+
if num_invalid := np.sum(~valid_mask):
|
97
|
+
logger.warning(f"Filtering out {num_invalid} bad quality SNPs")
|
145
98
|
else:
|
146
|
-
|
147
|
-
# -
|
148
|
-
self.df = self.df[self.kept_snps, :]
|
149
|
-
self.maf = np.minimum(self.freq, np.ones(self.m) - self.freq)
|
150
|
-
self.sqrtpq = np.sqrt(self.freq * (np.ones(self.m) - self.freq))
|
151
|
-
self.df = np.c_[self.df, self.maf]
|
152
|
-
self.colnames.append("MAF")
|
153
|
-
|
154
|
-
# -
|
155
|
-
def __read__(self, fname, m, n):
|
156
|
-
raise NotImplementedError
|
157
|
-
|
158
|
-
def __restart__(self):
|
159
|
-
self._currentSNP = 0
|
99
|
+
logger.info("All SNPs passed the basic quality check")
|
160
100
|
|
161
|
-
|
162
|
-
|
163
|
-
raise NotImplementedError
|
101
|
+
# Only keep valid SNPs
|
102
|
+
self.kept_snps = np.arange(self.m_original)[valid_mask]
|
164
103
|
|
165
|
-
|
166
|
-
|
167
|
-
raise NotImplementedError
|
104
|
+
# Update bim_df to only include valid SNPs and reset index
|
105
|
+
self.bim_df = self.bim_df.loc[valid_mask].reset_index(drop=True)
|
168
106
|
|
169
|
-
|
170
|
-
|
171
|
-
|
107
|
+
# Create new genotype data with only the valid SNPs
|
108
|
+
new_geno = ba.bitarray()
|
109
|
+
for j in self.kept_snps:
|
110
|
+
new_geno += self.geno_original[
|
111
|
+
2 * self.nru_original * j : 2 * self.nru_original * (j + 1)
|
112
|
+
]
|
172
113
|
|
173
|
-
|
174
|
-
|
114
|
+
# Update original data to only include valid SNPs
|
115
|
+
self.geno_original = new_geno
|
116
|
+
self.m_original = len(self.kept_snps)
|
175
117
|
|
176
|
-
|
177
|
-
|
118
|
+
# Initialize current state variables
|
119
|
+
self._currentSNP = 0
|
120
|
+
self.m = self.m_original
|
121
|
+
self.n = self.n_original
|
122
|
+
self.nru = self.nru_original
|
123
|
+
self.geno = self.geno_original.copy()
|
178
124
|
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
denom = n - 2 if n > 2 else n # allow n<2 for testing purposes
|
184
|
-
sq = np.square(x)
|
185
|
-
return sq - (1 - sq) / denom
|
125
|
+
# Update frequency info based on valid SNPs
|
126
|
+
self.freq = self.all_snp_info["freq"][valid_mask]
|
127
|
+
self.maf = np.minimum(self.freq, 1 - self.freq)
|
128
|
+
self.sqrtpq = np.sqrt(self.freq * (1 - self.freq))
|
186
129
|
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
130
|
+
# Add MAF to the BIM dataframe
|
131
|
+
self.bim_df["MAF"] = self.maf
|
132
|
+
|
133
|
+
logger.info(f"Loaded genotype data with {self.m} SNPs and {self.n} individuals")
|
134
|
+
|
135
|
+
@staticmethod
|
136
|
+
def load_bim(bim_file):
|
191
137
|
"""
|
138
|
+
Load a BIM file into a pandas DataFrame.
|
139
|
+
|
192
140
|
Parameters
|
193
141
|
----------
|
194
|
-
|
195
|
-
|
196
|
-
if c > 1, then only entries that are multiples of c are examined, and it is
|
197
|
-
assumed that block_left[a*c+i] = block_left[a*c], except at
|
198
|
-
the beginning of the chromosome where the 0th SNP is included in the window.
|
199
|
-
c : int
|
200
|
-
Chunk size.
|
201
|
-
func : function
|
202
|
-
Function to be applied to the genotype correlation matrix. Before dotting with
|
203
|
-
annot. Examples: for biased L2, np.square. For biased L4,
|
204
|
-
lambda x: np.square(np.square(x)). For L1, lambda x: x.
|
205
|
-
snp_getter : function(int)
|
206
|
-
The method to be used to get the next SNPs
|
207
|
-
annot: numpy array with shape (m,n_a)
|
208
|
-
SNP annotations.
|
142
|
+
bim_file : str
|
143
|
+
Path to the BIM file
|
209
144
|
|
210
145
|
Returns
|
211
146
|
-------
|
212
|
-
|
213
|
-
|
147
|
+
pd.DataFrame
|
148
|
+
DataFrame containing BIM data
|
214
149
|
"""
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
annot = np.ones((m, 1))
|
220
|
-
else:
|
221
|
-
annot_m = annot.shape[0]
|
222
|
-
if annot_m != self.m:
|
223
|
-
raise ValueError("Incorrect number of SNPs in annot")
|
224
|
-
# -
|
225
|
-
n_a = annot.shape[1] # number of annotations
|
226
|
-
cor_sum = np.zeros((m, n_a))
|
227
|
-
# b = index of first SNP for which SNP 0 is not included in LD Score
|
228
|
-
b = np.nonzero(block_left > 0)
|
229
|
-
if np.any(b):
|
230
|
-
b = b[0][0]
|
231
|
-
else:
|
232
|
-
b = m
|
233
|
-
b = int(np.ceil(b / c) * c) # round up to a multiple of c
|
234
|
-
if b > m:
|
235
|
-
c = 1
|
236
|
-
b = m
|
150
|
+
df = pd.read_csv(
|
151
|
+
bim_file, sep="\t", header=None, names=["CHR", "SNP", "CM", "BP", "A1", "A2"]
|
152
|
+
)
|
153
|
+
return df
|
237
154
|
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
B = A[:, l_B : l_B + c]
|
245
|
-
# ld matrix
|
246
|
-
np.dot(A.T, B / n, out=rfuncAB)
|
247
|
-
# ld matrix square
|
248
|
-
rfuncAB = func(rfuncAB)
|
249
|
-
cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
|
155
|
+
@staticmethod
|
156
|
+
def convert_bim_to_pyrange(bim_df) -> pr.PyRanges:
|
157
|
+
bim_pr = bim_df.copy()
|
158
|
+
bim_pr.drop(columns=["MAF"], inplace=True)
|
159
|
+
bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
|
160
|
+
bim_pr.Chromosome = "chr" + bim_pr["Chromosome"].astype(str)
|
250
161
|
|
251
|
-
#
|
252
|
-
|
253
|
-
|
254
|
-
end = md + 1 if md != m else md
|
255
|
-
for l_B in tqdm(np.arange(b0, end, c), desc="Compute SNP Gene Weight"):
|
256
|
-
# check if the annot matrix is all zeros for this block + chunk
|
257
|
-
# this happens w/ sparse categories (i.e., pathways)
|
258
|
-
# update the block
|
259
|
-
old_b = b
|
260
|
-
b = int(block_sizes[l_B])
|
261
|
-
if l_B > b0 and b > 0:
|
262
|
-
# block_size can't increase more than c
|
263
|
-
# block_size can't be less than c unless it is zero
|
264
|
-
# both of these things make sense
|
265
|
-
A = np.hstack((A[:, old_b - b + c : old_b], B))
|
266
|
-
l_A += old_b - b + c
|
267
|
-
elif l_B == b0 and b > 0:
|
268
|
-
A = A[:, b0 - b : b0]
|
269
|
-
l_A = b0 - b
|
270
|
-
elif b == 0: # no SNPs to left in window, e.g., after a sequence gap
|
271
|
-
A = np.array(()).reshape((n, 0))
|
272
|
-
l_A = l_B
|
273
|
-
if l_B == md:
|
274
|
-
c = m - md
|
275
|
-
rfuncAB = np.zeros((b, c))
|
276
|
-
rfuncBB = np.zeros((c, c))
|
277
|
-
if b != old_b:
|
278
|
-
rfuncAB = np.zeros((b, c))
|
279
|
-
# -
|
280
|
-
B = snp_getter(c)
|
281
|
-
p1 = np.all(annot[l_A : l_A + b, :] == 0)
|
282
|
-
p2 = np.all(annot[l_B : l_B + c, :] == 0)
|
283
|
-
if p1 and p2:
|
284
|
-
continue
|
285
|
-
# -
|
286
|
-
np.dot(A.T, B / n, out=rfuncAB)
|
287
|
-
rfuncAB = func(rfuncAB)
|
288
|
-
cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
|
289
|
-
cor_sum[l_B : l_B + c, :] += np.dot(annot[l_A : l_A + b, :].T, rfuncAB).T
|
290
|
-
np.dot(B.T, B / n, out=rfuncBB)
|
291
|
-
rfuncBB = func(rfuncBB)
|
292
|
-
cor_sum[l_B : l_B + c, :] += np.dot(rfuncBB, annot[l_B : l_B + c, :])
|
293
|
-
# -
|
294
|
-
return cor_sum
|
162
|
+
# Adjust coordinates (BIM is 1-based, PyRanges uses 0-based)
|
163
|
+
bim_pr["End"] = bim_pr["Start"].copy()
|
164
|
+
bim_pr["Start"] = bim_pr["Start"] - 1
|
295
165
|
|
166
|
+
bim_pr = pr.PyRanges(bim_pr)
|
296
167
|
|
297
|
-
|
298
|
-
"""
|
299
|
-
Interface for Plink .bed format
|
300
|
-
"""
|
168
|
+
return bim_pr
|
301
169
|
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
170
|
+
@staticmethod
|
171
|
+
def load_fam(fam_file):
|
172
|
+
"""
|
173
|
+
Load a FAM file into a pandas DataFrame.
|
174
|
+
|
175
|
+
Parameters
|
176
|
+
----------
|
177
|
+
fam_file : str
|
178
|
+
Path to the FAM file
|
179
|
+
|
180
|
+
Returns
|
181
|
+
-------
|
182
|
+
pd.DataFrame
|
183
|
+
DataFrame containing FAM data
|
184
|
+
"""
|
185
|
+
df = pd.read_csv(fam_file, sep=r"\s+", header=None, usecols=[1], names=["IID"])
|
186
|
+
return df
|
313
187
|
|
314
|
-
|
315
|
-
|
188
|
+
def _read(self, fname, m, n):
|
189
|
+
"""
|
190
|
+
Read the bed file and return the genotype data.
|
191
|
+
"""
|
316
192
|
if not fname.endswith(".bed"):
|
317
193
|
raise ValueError(".bed filename must end in .bed")
|
318
|
-
|
194
|
+
|
319
195
|
fh = open(fname, "rb")
|
320
196
|
magicNumber = ba.bitarray(endian="little")
|
321
197
|
magicNumber.fromfile(fh, 2)
|
@@ -323,29 +199,150 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
323
199
|
bedMode.fromfile(fh, 1)
|
324
200
|
e = (4 - n % 4) if n % 4 != 0 else 0
|
325
201
|
nru = n + e
|
326
|
-
|
327
|
-
#
|
202
|
+
|
203
|
+
# Check magic number
|
328
204
|
if magicNumber != ba.bitarray("0011011011011000"):
|
329
205
|
raise OSError("Magic number from Plink .bed file not recognized")
|
330
|
-
|
206
|
+
|
331
207
|
if bedMode != ba.bitarray("10000000"):
|
332
208
|
raise OSError("Plink .bed file must be in default SNP-major mode")
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
def
|
209
|
+
|
210
|
+
# Check file length
|
211
|
+
geno = ba.bitarray(endian="little")
|
212
|
+
geno.fromfile(fh)
|
213
|
+
self._test_length(geno, m, nru)
|
214
|
+
return (nru, geno)
|
215
|
+
|
216
|
+
def _test_length(self, geno, m, nru):
|
217
|
+
"""
|
218
|
+
Test if the genotype data has the expected length.
|
219
|
+
"""
|
341
220
|
exp_len = 2 * m * nru
|
342
221
|
real_len = len(geno)
|
343
222
|
if real_len != exp_len:
|
344
223
|
s = "Plink .bed file has {n1} bits, expected {n2}"
|
345
224
|
raise OSError(s.format(n1=real_len, n2=exp_len))
|
346
225
|
|
347
|
-
|
348
|
-
|
226
|
+
def _calculate_all_snp_info(self):
|
227
|
+
"""
|
228
|
+
Pre-calculate MAF and other information for all SNPs.
|
229
|
+
|
230
|
+
Returns
|
231
|
+
-------
|
232
|
+
dict
|
233
|
+
Dictionary containing information for all SNPs
|
234
|
+
"""
|
235
|
+
nru = self.nru_original
|
236
|
+
n = self.n_original
|
237
|
+
m = self.m_original
|
238
|
+
geno = self.geno_original
|
239
|
+
|
240
|
+
snp_info = {
|
241
|
+
"freq": np.zeros(m), # Allele frequencies
|
242
|
+
"het_miss_count": np.zeros(m), # Count of het or missing genotypes
|
243
|
+
"valid_snp": np.zeros(m, dtype=bool), # Whether SNP passes basic criteria
|
244
|
+
}
|
245
|
+
|
246
|
+
# For each SNP, calculate statistics
|
247
|
+
for j in range(m):
|
248
|
+
z = geno[2 * nru * j : 2 * nru * (j + 1)]
|
249
|
+
A = z[0::2]
|
250
|
+
a = A.count()
|
251
|
+
B = z[1::2]
|
252
|
+
b = B.count()
|
253
|
+
c = (A & B).count()
|
254
|
+
major_ct = b + c # number of copies of the major allele
|
255
|
+
n_nomiss = n - a + c # number of individuals with nonmissing genotypes
|
256
|
+
f = major_ct / (2 * n_nomiss) if n_nomiss > 0 else 0
|
257
|
+
het_miss_ct = a + b - 2 * c # count of SNPs that are het or missing
|
258
|
+
|
259
|
+
snp_info["freq"][j] = f
|
260
|
+
snp_info["het_miss_count"][j] = het_miss_ct
|
261
|
+
snp_info["valid_snp"][j] = het_miss_ct < n # Basic validity check
|
262
|
+
|
263
|
+
return snp_info
|
264
|
+
|
265
|
+
def apply_filters(self, keep_snps=None, keep_indivs=None, mafMin=None):
|
266
|
+
"""
|
267
|
+
Apply filters to the genotype data without reloading the bed file.
|
268
|
+
|
269
|
+
Parameters
|
270
|
+
----------
|
271
|
+
keep_snps : array-like, optional
|
272
|
+
Indices of SNPs to keep.
|
273
|
+
keep_indivs : array-like, optional
|
274
|
+
Indices of individuals to keep.
|
275
|
+
mafMin : float, optional
|
276
|
+
Minimum minor allele frequency.
|
277
|
+
|
278
|
+
Returns
|
279
|
+
-------
|
280
|
+
self
|
281
|
+
Returns self for method chaining.
|
282
|
+
"""
|
283
|
+
# Reset to original state first
|
284
|
+
self.geno = self.geno_original.copy()
|
285
|
+
self.m = self.m_original
|
286
|
+
self.n = self.n_original
|
287
|
+
self.nru = self.nru_original
|
288
|
+
self._currentSNP = 0
|
289
|
+
|
290
|
+
# Initialize with all SNPs
|
291
|
+
kept_snps = np.arange(self.m_original)
|
292
|
+
|
293
|
+
# Apply MAF filter using pre-calculated values
|
294
|
+
if mafMin is not None and mafMin > 0:
|
295
|
+
maf_values = np.minimum(self.all_snp_info["freq"], 1 - self.all_snp_info["freq"])
|
296
|
+
maf_mask = (maf_values > mafMin) & self.all_snp_info["valid_snp"]
|
297
|
+
kept_snps = kept_snps[maf_mask]
|
298
|
+
logger.info(f"After MAF filtering (>{mafMin}), {len(kept_snps)} SNPs remain")
|
299
|
+
|
300
|
+
# Apply SNP filter if specified
|
301
|
+
if keep_snps is not None:
|
302
|
+
keep_snps = np.array(keep_snps, dtype="int")
|
303
|
+
if np.any(keep_snps > self.m_original):
|
304
|
+
raise ValueError("keep_snps indices out of bounds")
|
305
|
+
|
306
|
+
# Intersect with current kept_snps
|
307
|
+
kept_snps = np.intersect1d(kept_snps, keep_snps)
|
308
|
+
logger.info(f"After keep_snps filtering, {len(kept_snps)} SNPs remain")
|
309
|
+
|
310
|
+
# Filter SNPs in the genotype data
|
311
|
+
if len(kept_snps) < self.m_original:
|
312
|
+
# Create new genotype data with only the kept SNPs
|
313
|
+
new_geno = ba.bitarray()
|
314
|
+
for j in kept_snps:
|
315
|
+
new_geno += self.geno_original[2 * self.nru * j : 2 * self.nru * (j + 1)]
|
316
|
+
self.geno = new_geno
|
317
|
+
self.m = len(kept_snps)
|
318
|
+
|
319
|
+
# Filter individuals if specified
|
320
|
+
if keep_indivs is not None:
|
321
|
+
keep_indivs = np.array(keep_indivs, dtype="int")
|
322
|
+
if np.any(keep_indivs > self.n):
|
323
|
+
raise ValueError("keep_indivs indices out of bounds")
|
324
|
+
|
325
|
+
(self.geno, self.m, self.n) = self._filter_indivs(
|
326
|
+
self.geno, keep_indivs, self.m, self.n
|
327
|
+
)
|
328
|
+
|
329
|
+
if self.n > 0:
|
330
|
+
logger.info(f"After filtering, {self.n} individuals remain")
|
331
|
+
else:
|
332
|
+
raise ValueError("After filtering, no individuals remain")
|
333
|
+
|
334
|
+
# Update kept_snps and other attributes
|
335
|
+
self.kept_snps = kept_snps
|
336
|
+
self.freq = self.all_snp_info["freq"][kept_snps]
|
337
|
+
self.maf = np.minimum(self.freq, 1 - self.freq)
|
338
|
+
self.sqrtpq = np.sqrt(self.freq * (1 - self.freq))
|
339
|
+
|
340
|
+
return self
|
341
|
+
|
342
|
+
def _filter_indivs(self, geno, keep_indivs, m, n):
|
343
|
+
"""
|
344
|
+
Filter individuals based on the keep_indivs parameter.
|
345
|
+
"""
|
349
346
|
n_new = len(keep_indivs)
|
350
347
|
e = (4 - n_new % 4) if n_new % 4 != 0 else 0
|
351
348
|
nru_new = n_new + e
|
@@ -358,95 +355,120 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
358
355
|
self.nru = nru_new
|
359
356
|
return (z, m, n_new)
|
360
357
|
|
361
|
-
|
362
|
-
def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps):
|
358
|
+
def get_snps_by_maf(self, mafMin):
|
363
359
|
"""
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
Which implies that
|
376
|
-
missing ct = a - c
|
377
|
-
# of indivs with nonmissing genotype = n - a + c
|
378
|
-
major allele ct = b + c
|
379
|
-
major allele frequency = (b+c)/(2*(n-a+c))
|
380
|
-
het ct + missing ct = a + b - 2*c
|
381
|
-
Why does bitarray not have >> ????
|
360
|
+
Get the list of SNPs that pass the MAF threshold.
|
361
|
+
|
362
|
+
Parameters
|
363
|
+
----------
|
364
|
+
mafMin : float
|
365
|
+
Minimum MAF threshold
|
366
|
+
|
367
|
+
Returns
|
368
|
+
-------
|
369
|
+
list
|
370
|
+
List of SNP IDs that pass the MAF threshold
|
382
371
|
"""
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
b = B.count()
|
396
|
-
c = (A & B).count()
|
397
|
-
major_ct = b + c # number of copies of the major allele
|
398
|
-
n_nomiss = n - a + c # number of individuals with nonmissing genotypes
|
399
|
-
f = major_ct / (2 * n_nomiss) if n_nomiss > 0 else 0
|
400
|
-
het_miss_ct = a + b - 2 * c # remove SNPs that are only either het or missing
|
401
|
-
if np.minimum(f, 1 - f) > mafMin and het_miss_ct < n:
|
402
|
-
freq.append(f)
|
403
|
-
y += z
|
404
|
-
m_poly += 1
|
405
|
-
kept_snps.append(j)
|
406
|
-
# -
|
407
|
-
return (y, m_poly, n, kept_snps, freq)
|
408
|
-
|
409
|
-
# -
|
410
|
-
def nextSNPs(self, b, minorRef=None):
|
372
|
+
# Use the pre-calculated MAF values
|
373
|
+
maf_values = np.minimum(self.all_snp_info["freq"], 1 - self.all_snp_info["freq"])
|
374
|
+
maf_mask = (maf_values > mafMin) & self.all_snp_info["valid_snp"]
|
375
|
+
|
376
|
+
# Get SNP names from the BIM dataframe
|
377
|
+
snp_pass_maf = self.bim_df.loc[maf_mask, "SNP"].tolist()
|
378
|
+
|
379
|
+
logger.info(f"{len(snp_pass_maf)} SNPs with MAF > f{mafMin}")
|
380
|
+
|
381
|
+
return snp_pass_maf
|
382
|
+
|
383
|
+
def get_ldscore(self, annot_matrix=None, ld_wind=1.0, ld_unit="CM", keep_snps_index=None):
|
411
384
|
"""
|
412
|
-
|
413
|
-
normalized genotypes for the next b SNPs, where n := number of samples.
|
385
|
+
Calculate LD scores using an annotation matrix.
|
414
386
|
|
415
387
|
Parameters
|
416
388
|
----------
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
389
|
+
annot_matrix : np.ndarray, optional
|
390
|
+
Annotation matrix. If None, uses a matrix of all ones.
|
391
|
+
ld_wind : float, optional
|
392
|
+
LD window size, by default 1.0
|
393
|
+
ld_unit : str, optional
|
394
|
+
Unit for the LD window, by default "CM"
|
395
|
+
keep_snps_index : list[int], optional
|
396
|
+
Indices of SNPs to keep, by default None
|
422
397
|
|
423
398
|
Returns
|
424
399
|
-------
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
400
|
+
np.ndarray
|
401
|
+
Array with calculated LD scores
|
402
|
+
"""
|
403
|
+
# Apply filters if needed
|
404
|
+
if keep_snps_index is not None:
|
405
|
+
original_kept_snps = self.kept_snps.copy()
|
406
|
+
self.apply_filters(keep_snps=keep_snps_index)
|
407
|
+
|
408
|
+
# Configure LD window based on specified unit
|
409
|
+
if ld_unit == "SNP":
|
410
|
+
max_dist = ld_wind
|
411
|
+
coords = np.array(range(self.m))
|
412
|
+
elif ld_unit == "KB":
|
413
|
+
max_dist = ld_wind * 1000
|
414
|
+
coords = np.array(self.bim_df.loc[self.kept_snps, "BP"])
|
415
|
+
elif ld_unit == "CM":
|
416
|
+
max_dist = ld_wind
|
417
|
+
coords = np.array(self.bim_df.loc[self.kept_snps, "CM"])
|
418
|
+
# Check if the CM is all 0
|
419
|
+
if np.all(coords == 0):
|
420
|
+
logger.warning(
|
421
|
+
"All CM values are 0. Using 1MB window size for LD score calculation."
|
422
|
+
)
|
423
|
+
max_dist = 1_000_000
|
424
|
+
coords = np.array(self.bim_df.loc[self.kept_snps, "BP"])
|
425
|
+
else:
|
426
|
+
raise ValueError(f"Invalid ld_wind_unit: {ld_unit}. Must be one of: SNP, KB, CM")
|
427
|
+
|
428
|
+
# Calculate blocks for LD computation
|
429
|
+
block_left = getBlockLefts(coords, max_dist)
|
430
|
+
assert block_left.sum() > 0, "Invalid window size, please check the ld_wind parameter."
|
431
|
+
|
432
|
+
# Calculate LD scores
|
433
|
+
ld_scores = self.ldScoreVarBlocks(block_left, 100, annot=annot_matrix)
|
434
|
+
|
435
|
+
# Restore original state if filters were applied
|
436
|
+
if keep_snps_index is not None:
|
437
|
+
self.apply_filters(keep_snps=original_kept_snps)
|
438
|
+
|
439
|
+
return ld_scores
|
440
|
+
|
441
|
+
def restart(self):
|
442
|
+
"""
|
443
|
+
Reset the current SNP index to 0.
|
444
|
+
"""
|
445
|
+
self._currentSNP = 0
|
446
|
+
|
447
|
+
def nextSNPs(self, b, minorRef=None):
|
448
|
+
"""
|
449
|
+
Unpacks the binary array of genotypes and returns an n x b matrix of floats of
|
450
|
+
normalized genotypes for the next b SNPs.
|
429
451
|
"""
|
430
|
-
# -
|
431
452
|
try:
|
432
453
|
b = int(b)
|
433
454
|
if b <= 0:
|
434
455
|
raise ValueError("b must be > 0")
|
435
456
|
except TypeError as e:
|
436
457
|
raise TypeError("b must be an integer") from e
|
437
|
-
|
458
|
+
|
438
459
|
if self._currentSNP + b > self.m:
|
439
460
|
s = "{b} SNPs requested, {k} SNPs remain"
|
440
461
|
raise ValueError(s.format(b=b, k=(self.m - self._currentSNP)))
|
441
|
-
|
462
|
+
|
442
463
|
c = self._currentSNP
|
443
464
|
n = self.n
|
444
465
|
nru = self.nru
|
445
466
|
slice = self.geno[2 * c * nru : 2 * (c + b) * nru]
|
446
|
-
X = np.array(slice.decode(self._bedcode), dtype="
|
467
|
+
X = np.array(slice.decode(self._bedcode), dtype="float32").reshape((b, nru)).T
|
447
468
|
X = X[0:n, :]
|
448
|
-
Y = np.zeros(X.shape)
|
449
|
-
|
469
|
+
Y = np.zeros(X.shape, dtype="float32")
|
470
|
+
|
471
|
+
# Normalize the SNPs and impute the missing ones with the mean
|
450
472
|
for j in range(0, b):
|
451
473
|
newsnp = X[:, j]
|
452
474
|
ii = newsnp != 9
|
@@ -455,35 +477,116 @@ class PlinkBEDFile(GenotypeArrayInMemory):
|
|
455
477
|
denom = np.std(newsnp)
|
456
478
|
if denom == 0:
|
457
479
|
denom = 1
|
458
|
-
|
480
|
+
|
459
481
|
if minorRef is not None and self.freq[self._currentSNP + j] > 0.5:
|
460
482
|
denom = denom * -1
|
461
|
-
|
483
|
+
|
462
484
|
Y[:, j] = (newsnp - avg) / denom
|
463
|
-
|
485
|
+
|
464
486
|
self._currentSNP += b
|
465
487
|
return Y
|
466
488
|
|
489
|
+
def _l2_unbiased(self, x, n):
|
490
|
+
"""
|
491
|
+
Calculate the unbiased estimate of L2.
|
492
|
+
"""
|
493
|
+
denom = n - 2 if n > 2 else n # allow n<2 for testing purposes
|
494
|
+
sq = np.square(x)
|
495
|
+
return sq - (1 - sq) / denom
|
496
|
+
|
497
|
+
def ldScoreVarBlocks(self, block_left, c, annot=None):
|
498
|
+
"""
|
499
|
+
Computes an unbiased estimate of L2(j) for j=1,..,M.
|
500
|
+
"""
|
467
501
|
|
468
|
-
def
|
469
|
-
|
470
|
-
["CHR", "SNP", "CM", "BP", "A1", "A2"], 1, ".bim", usecols=[0, 1, 2, 3, 4, 5]
|
471
|
-
)
|
472
|
-
PlinkFAMFile = ID_List_Factory(["IID"], 0, ".fam", usecols=[1])
|
502
|
+
def func(x):
|
503
|
+
return self._l2_unbiased(x, self.n)
|
473
504
|
|
474
|
-
|
475
|
-
|
505
|
+
snp_getter = self.nextSNPs
|
506
|
+
return self._corSumVarBlocks(block_left, c, func, snp_getter, annot)
|
476
507
|
|
477
|
-
|
478
|
-
|
479
|
-
|
508
|
+
def _corSumVarBlocks(self, block_left, c, func, snp_getter, annot=None):
|
509
|
+
"""
|
510
|
+
Calculate the sum of correlation coefficients.
|
511
|
+
"""
|
512
|
+
m, n = self.m, self.n
|
513
|
+
block_sizes = np.array(np.arange(m) - block_left)
|
514
|
+
block_sizes = np.ceil(block_sizes / c) * c
|
515
|
+
if annot is None:
|
516
|
+
annot = np.ones((m, 1), dtype="float32")
|
517
|
+
else:
|
518
|
+
# annot = annot.astype("float32") # Ensure annot is float32
|
519
|
+
annot_m = annot.shape[0]
|
520
|
+
if annot_m != self.m:
|
521
|
+
raise ValueError("Incorrect number of SNPs in annot")
|
480
522
|
|
481
|
-
|
523
|
+
n_a = annot.shape[1] # number of annotations
|
524
|
+
cor_sum = np.zeros((m, n_a), dtype="float32")
|
525
|
+
# b = index of first SNP for which SNP 0 is not included in LD Score
|
526
|
+
b = np.nonzero(block_left > 0)
|
527
|
+
if np.any(b):
|
528
|
+
b = b[0][0]
|
529
|
+
else:
|
530
|
+
b = m
|
531
|
+
b = int(np.ceil(b / c) * c) # round up to a multiple of c
|
532
|
+
if b > m:
|
533
|
+
c = 1
|
534
|
+
b = m
|
482
535
|
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
536
|
+
l_A = 0 # l_A := index of leftmost SNP in matrix A
|
537
|
+
A = snp_getter(b) # This now returns float32 data
|
538
|
+
rfuncAB = np.zeros((b, c), dtype="float32")
|
539
|
+
rfuncBB = np.zeros((c, c), dtype="float32")
|
540
|
+
# chunk inside of block
|
541
|
+
for l_B in np.arange(0, b, c): # l_B := index of leftmost SNP in matrix B
|
542
|
+
B = A[:, l_B : l_B + c]
|
543
|
+
# ld matrix
|
544
|
+
np.dot(A.T, B / n, out=rfuncAB)
|
545
|
+
# ld matrix square
|
546
|
+
rfuncAB = func(rfuncAB)
|
547
|
+
cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
|
488
548
|
|
489
|
-
|
549
|
+
# chunk to right of block
|
550
|
+
b0 = b
|
551
|
+
md = int(c * np.floor(m / c))
|
552
|
+
end = md + 1 if md != m else md
|
553
|
+
for l_B in tqdm(np.arange(b0, end, c), desc="Compute SNP Gene Weight"):
|
554
|
+
# check if the annot matrix is all zeros for this block + chunk
|
555
|
+
# this happens w/ sparse categories (i.e., pathways)
|
556
|
+
# update the block
|
557
|
+
old_b = b
|
558
|
+
b = int(block_sizes[l_B])
|
559
|
+
if l_B > b0 and b > 0:
|
560
|
+
# block_size can't increase more than c
|
561
|
+
# block_size can't be less than c unless it is zero
|
562
|
+
# both of these things make sense
|
563
|
+
A = np.hstack((A[:, old_b - b + c : old_b], B))
|
564
|
+
l_A += old_b - b + c
|
565
|
+
elif l_B == b0 and b > 0:
|
566
|
+
A = A[:, b0 - b : b0]
|
567
|
+
l_A = b0 - b
|
568
|
+
elif b == 0: # no SNPs to left in window, e.g., after a sequence gap
|
569
|
+
A = np.array((), dtype="float32").reshape((n, 0))
|
570
|
+
l_A = l_B
|
571
|
+
if l_B == md:
|
572
|
+
c = m - md
|
573
|
+
rfuncAB = np.zeros((b, c), dtype="float32")
|
574
|
+
rfuncBB = np.zeros((c, c), dtype="float32")
|
575
|
+
if b != old_b:
|
576
|
+
rfuncAB = np.zeros((b, c), dtype="float32")
|
577
|
+
|
578
|
+
B = snp_getter(c) # This now returns float32 data
|
579
|
+
p1 = np.all(annot[l_A : l_A + b, :] == 0)
|
580
|
+
p2 = np.all(annot[l_B : l_B + c, :] == 0)
|
581
|
+
if p1 and p2:
|
582
|
+
continue
|
583
|
+
|
584
|
+
np.dot(A.T, B / n, out=rfuncAB)
|
585
|
+
rfuncAB = func(rfuncAB)
|
586
|
+
cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
|
587
|
+
cor_sum[l_B : l_B + c, :] += np.dot(annot[l_A : l_A + b, :].T, rfuncAB).T
|
588
|
+
np.dot(B.T, B / n, out=rfuncBB)
|
589
|
+
rfuncBB = func(rfuncBB)
|
590
|
+
cor_sum[l_B : l_B + c, :] += np.dot(rfuncBB, annot[l_B : l_B + c, :])
|
591
|
+
|
592
|
+
return cor_sum
|