gsMap 1.73.3__py3-none-any.whl → 1.73.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,67 +1,27 @@
1
+ """
2
+ Module for reading and processing PLINK genotype data and calculating LD scores.
3
+
4
+ Note:
5
+ This code is adapted and modified from:
6
+ https://github.com/bulik/ldsc/blob/master/ldsc/ldscore.py
7
+ """
8
+
9
+ import logging
10
+
1
11
  import bitarray as ba
2
12
  import numpy as np
3
13
  import pandas as pd
14
+ import pyranges as pr
4
15
  from tqdm import tqdm
5
16
 
6
-
7
- # Define the reading functions
8
- def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
9
- # -
10
- class IDContainer:
11
- """
12
- A class to read data from a file, store it as a DataFrame, and provide a method for a left outer join operation.
13
- """
14
-
15
- def __init__(self, fname):
16
- """
17
- Initialize the IDContainer with the given filename and reading options.
18
- """
19
- self.usecols = usecols
20
- self.colnames = colnames
21
- self.keepcol = keepcol
22
- self.fname_end = fname_end
23
- self.header = header
24
- self.read(fname)
25
- self.n = len(self.df)
26
-
27
- # -
28
- def read(self, fname):
29
- """
30
- Read data from the given file and store it as a DataFrame.
31
- """
32
- end = self.fname_end
33
- if end and not fname.endswith(end):
34
- raise ValueError(f"{end} filename must end in {end}")
35
- self.df = pd.read_csv(
36
- fname,
37
- header=self.header,
38
- usecols=self.usecols,
39
- sep=r"\s+",
40
- )
41
- if self.colnames:
42
- self.df.columns = self.colnames
43
- if self.keepcol is not None:
44
- self.IDList = self.df.iloc[:, [self.keepcol]].astype("object")
45
-
46
- return IDContainer
17
+ # Configure logger
18
+ logger = logging.getLogger("gsMap.utils.plink_ldscore_tool")
47
19
 
48
20
 
49
21
  def getBlockLefts(coords, max_dist):
50
22
  """
51
- Converts coordinates + max block length to the a list of coordinates of the leftmost
23
+ Converts coordinates + max block length to a list of coordinates of the leftmost
52
24
  SNPs to be included in blocks.
53
-
54
- Parameters
55
- ----------
56
- coords : array
57
- Array of coordinates. Must be sorted.
58
- max_dist : float
59
- Maximum distance between SNPs included in the same window.
60
-
61
- Returns
62
- -------
63
- block_left : 1D np.ndarray with same length as block_left
64
- block_left[j] := min{k | dist(j, k) < max_dist}.
65
25
  """
66
26
  M = len(coords)
67
27
  j = 0
@@ -77,16 +37,6 @@ def getBlockLefts(coords, max_dist):
77
37
  def block_left_to_right(block_left):
78
38
  """
79
39
  Converts block lefts to block rights.
80
-
81
- Parameters
82
- ----------
83
- block_left : array
84
- Array of block lefts.
85
-
86
- Returns
87
- -------
88
- block_right : 1D np.ndarray with same length as block_left
89
- block_right[j] := max {k | block_left[k] <= j}
90
40
  """
91
41
  M = len(block_left)
92
42
  j = 0
@@ -99,223 +49,149 @@ def block_left_to_right(block_left):
99
49
  return block_right
100
50
 
101
51
 
102
- class GenotypeArrayInMemory:
52
+ class PlinkBEDFile:
103
53
  """
104
- Parent class for various classes containing interfaces for files with genotype
105
- matrices, e.g., plink .bed files, etc
54
+ Interface for Plink .bed format for reading and processing genotype data.
106
55
  """
107
56
 
108
- def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
109
- self.m = len(snp_list.IDList)
110
- self.n = n
111
- self.keep_snps = keep_snps
112
- self.keep_indivs = keep_indivs
113
- self.df = np.array(snp_list.df[["CHR", "SNP", "BP", "CM"]])
114
- self.colnames = ["CHR", "SNP", "BP", "CM"]
115
- self.mafMin = mafMin if mafMin is not None else 0
116
- self._currentSNP = 0
117
- (self.nru, self.geno) = self.__read__(fname, self.m, n)
118
- # filter individuals
119
- if keep_indivs is not None:
120
- keep_indivs = np.array(keep_indivs, dtype="int")
121
- if np.any(keep_indivs > self.n):
122
- raise ValueError("keep_indivs indices out of bounds")
123
- # -
124
- (self.geno, self.m, self.n) = self.__filter_indivs__(
125
- self.geno, keep_indivs, self.m, self.n
126
- )
127
- # -
128
- if self.n > 0:
129
- print(f"After filtering, {self.n} individuals remain")
130
- else:
131
- raise ValueError("After filtering, no individuals remain")
132
- # -
133
- # filter SNPs
134
- if keep_snps is not None:
135
- keep_snps = np.array(keep_snps, dtype="int")
136
- if np.any(keep_snps > self.m): # if keep_snps is None, this returns False
137
- raise ValueError("keep_snps indices out of bounds")
138
- # -
139
- (self.geno, self.m, self.n, self.kept_snps, self.freq) = self.__filter_snps_maf__(
140
- self.geno, self.m, self.n, self.mafMin, keep_snps
57
def __init__(self, bfile_prefix):
    """
    Initialize the PlinkBEDFile from a PLINK file prefix.

    Loads the .bim and .fam tables, reads the packed genotypes from the
    .bed file, pre-computes per-SNP allele frequencies and removes SNPs
    that fail the basic quality check performed by
    ``_calculate_all_snp_info``.

    Parameters
    ----------
    bfile_prefix : str
        PLINK file prefix (without .bed/.bim/.fam extension)
    """
    # Two-bit codes used to decode the packed genotype stream
    # (9 is the value treated as "missing" when genotypes are normalized).
    self._bedcode = {
        2: ba.bitarray("11"),
        9: ba.bitarray("10"),
        1: ba.bitarray("01"),
        0: ba.bitarray("00"),
    }

    # Load the SNP (BIM) and individual (FAM) tables
    self.bim_df = self.load_bim(f"{bfile_prefix}.bim")
    self.fam_df = self.load_fam(f"{bfile_prefix}.fam")

    # Dimensions before any quality control
    self.m_original = len(self.bim_df)
    self.n_original = len(self.fam_df)

    # Read the bed file
    logger.info(f"Loading Plink genotype data from {bfile_prefix}.bed")
    (self.nru_original, self.geno_original) = self._read(
        f"{bfile_prefix}.bed", self.m_original, self.n_original
    )

    # Pre-calculate MAF for all SNPs
    logger.info("Calculating MAF and QC for all SNPs")
    self.all_snp_info = self._calculate_all_snp_info()

    # Report SNPs that fail the basic quality check
    valid_mask = self.all_snp_info["valid_snp"]
    if num_invalid := np.sum(~valid_mask):
        logger.warning(
            f"Filtering out {num_invalid} bad quality SNPs: {self.bim_df.loc[~valid_mask, 'SNP'].tolist()}"
        )
    else:
        logger.info("All SNPs passed the basic quality check")

    # Rebuild the genotype stream so it only holds the valid SNPs
    bits_per_snp = 2 * self.nru_original
    new_geno = ba.bitarray()
    for j in np.arange(self.m_original)[valid_mask]:
        new_geno += self.geno_original[bits_per_snp * j : bits_per_snp * (j + 1)]
    self.geno_original = new_geno

    # Only keep valid SNPs in the BIM table
    self.bim_df = self.bim_df.loc[valid_mask].reset_index(drop=True)
    self.m_original = len(self.bim_df)
    self.kept_snps = np.arange(self.m_original)

    # Keep the pre-computed per-SNP statistics aligned with the filtered
    # BIM table and genotype stream. Every later lookup (apply_filters)
    # indexes these arrays with positions in the *filtered* SNP set, so
    # leaving them at their pre-QC length would return the statistics of
    # the wrong SNPs whenever an invalid SNP has been dropped.
    self.all_snp_info = {
        key: values[valid_mask] for key, values in self.all_snp_info.items()
    }

    # Initialize current state variables
    self._currentSNP = 0
    self.m = self.m_original
    self.n = self.n_original
    self.nru = self.nru_original
    self.geno = self.geno_original.copy()

    # Frequency information for the valid SNPs
    self.freq = self.all_snp_info["freq"].copy()
    self.maf = np.minimum(self.freq, 1 - self.freq)
    self.sqrtpq = np.sqrt(self.freq * (1 - self.freq))

    # Add MAF to the BIM dataframe
    self.bim_df["MAF"] = self.maf

    logger.info(f"Loaded genotype data with {self.m} SNPs and {self.n} individuals")
186
134
 
187
- # -
188
- # Methods for calculating sums of Pearson correlation coefficients (i.e.,ld-score)
189
- # c stands for the chunk size (default = 50)
190
- def __corSumVarBlocks__(self, block_left, c, func, snp_getter, annot=None):
135
+ @staticmethod
136
+ def load_bim(bim_file):
191
137
  """
138
+ Load a BIM file into a pandas DataFrame.
139
+
192
140
  Parameters
193
141
  ----------
194
- block_left : np.ndarray with shape (M, )
195
- block_left[i] = index of leftmost SNP included in LD Score of SNP i.
196
- if c > 1, then only entries that are multiples of c are examined, and it is
197
- assumed that block_left[a*c+i] = block_left[a*c], except at
198
- the beginning of the chromosome where the 0th SNP is included in the window.
199
- c : int
200
- Chunk size.
201
- func : function
202
- Function to be applied to the genotype correlation matrix. Before dotting with
203
- annot. Examples: for biased L2, np.square. For biased L4,
204
- lambda x: np.square(np.square(x)). For L1, lambda x: x.
205
- snp_getter : function(int)
206
- The method to be used to get the next SNPs
207
- annot: numpy array with shape (m,n_a)
208
- SNP annotations.
142
+ bim_file : str
143
+ Path to the BIM file
209
144
 
210
145
  Returns
211
146
  -------
212
- cor_sum : np.ndarray with shape (M, num_annots)
213
- Estimates.
147
+ pd.DataFrame
148
+ DataFrame containing BIM data
214
149
  """
215
- m, n = self.m, self.n
216
- block_sizes = np.array(np.arange(m) - block_left)
217
- block_sizes = np.ceil(block_sizes / c) * c
218
- if annot is None:
219
- annot = np.ones((m, 1))
220
- else:
221
- annot_m = annot.shape[0]
222
- if annot_m != self.m:
223
- raise ValueError("Incorrect number of SNPs in annot")
224
- # -
225
- n_a = annot.shape[1] # number of annotations
226
- cor_sum = np.zeros((m, n_a))
227
- # b = index of first SNP for which SNP 0 is not included in LD Score
228
- b = np.nonzero(block_left > 0)
229
- if np.any(b):
230
- b = b[0][0]
231
- else:
232
- b = m
233
- b = int(np.ceil(b / c) * c) # round up to a multiple of c
234
- if b > m:
235
- c = 1
236
- b = m
150
+ df = pd.read_csv(
151
+ bim_file, sep="\t", header=None, names=["CHR", "SNP", "CM", "BP", "A1", "A2"]
152
+ )
153
+ return df
237
154
 
238
- l_A = 0 # l_A := index of leftmost SNP in matrix A
239
- A = snp_getter(b)
240
- rfuncAB = np.zeros((b, c))
241
- rfuncBB = np.zeros((c, c))
242
- # chunk inside of block
243
- for l_B in np.arange(0, b, c): # l_B := index of leftmost SNP in matrix B
244
- B = A[:, l_B : l_B + c]
245
- # ld matrix
246
- np.dot(A.T, B / n, out=rfuncAB)
247
- # ld matrix square
248
- rfuncAB = func(rfuncAB)
249
- cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
155
@staticmethod
def convert_bim_to_pyrange(bim_df) -> pr.PyRanges:
    """
    Convert a BIM DataFrame into a PyRanges object with one interval per SNP.

    Parameters
    ----------
    bim_df : pd.DataFrame
        BIM table with the columns CHR, SNP, CM, BP, A1, A2 (in that
        order), optionally followed by a MAF column. The input is not
        modified.

    Returns
    -------
    pr.PyRanges
        Ranges with Chromosome prefixed by "chr", Start = BP - 1 and
        End = BP.
    """
    # drop() returns a new frame, so the caller's table is left untouched.
    # errors="ignore" also accepts a raw BIM table that has no MAF column.
    bim_pr = bim_df.drop(columns=["MAF"], errors="ignore")
    bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
    bim_pr["Chromosome"] = "chr" + bim_pr["Chromosome"].astype(str)

    # Adjust coordinates (BIM is 1-based, PyRanges uses 0-based)
    bim_pr["End"] = bim_pr["Start"].copy()
    bim_pr["Start"] = bim_pr["Start"] - 1

    return pr.PyRanges(bim_pr)
301
169
 
302
- def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
303
- self._bedcode = {
304
- 2: ba.bitarray("11"),
305
- 9: ba.bitarray("10"),
306
- 1: ba.bitarray("01"),
307
- 0: ba.bitarray("00"),
308
- }
309
- # -
310
- GenotypeArrayInMemory.__init__(
311
- self, fname, n, snp_list, keep_snps=keep_snps, keep_indivs=keep_indivs, mafMin=mafMin
312
- )
170
+ @staticmethod
171
+ def load_fam(fam_file):
172
+ """
173
+ Load a FAM file into a pandas DataFrame.
174
+
175
+ Parameters
176
+ ----------
177
+ fam_file : str
178
+ Path to the FAM file
179
+
180
+ Returns
181
+ -------
182
+ pd.DataFrame
183
+ DataFrame containing FAM data
184
+ """
185
+ df = pd.read_csv(fam_file, sep=r"\s+", header=None, usecols=[1], names=["IID"])
186
+ return df
313
187
 
314
- # -
315
- def __read__(self, fname, m, n):
188
+ def _read(self, fname, m, n):
189
+ """
190
+ Read the bed file and return the genotype data.
191
+ """
316
192
  if not fname.endswith(".bed"):
317
193
  raise ValueError(".bed filename must end in .bed")
318
- # -
194
+
319
195
  fh = open(fname, "rb")
320
196
  magicNumber = ba.bitarray(endian="little")
321
197
  magicNumber.fromfile(fh, 2)
@@ -323,29 +199,150 @@ class PlinkBEDFile(GenotypeArrayInMemory):
323
199
  bedMode.fromfile(fh, 1)
324
200
  e = (4 - n % 4) if n % 4 != 0 else 0
325
201
  nru = n + e
326
- self.nru = nru
327
- # check magic number
202
+
203
+ # Check magic number
328
204
  if magicNumber != ba.bitarray("0011011011011000"):
329
205
  raise OSError("Magic number from Plink .bed file not recognized")
330
- # -
206
+
331
207
  if bedMode != ba.bitarray("10000000"):
332
208
  raise OSError("Plink .bed file must be in default SNP-major mode")
333
- # check file length
334
- self.geno = ba.bitarray(endian="little")
335
- self.geno.fromfile(fh)
336
- self.__test_length__(self.geno, self.m, self.nru)
337
- return (self.nru, self.geno)
338
-
339
- # -
340
- def __test_length__(self, geno, m, nru):
209
+
210
+ # Check file length
211
+ geno = ba.bitarray(endian="little")
212
+ geno.fromfile(fh)
213
+ self._test_length(geno, m, nru)
214
+ return (nru, geno)
215
+
216
+ def _test_length(self, geno, m, nru):
217
+ """
218
+ Test if the genotype data has the expected length.
219
+ """
341
220
  exp_len = 2 * m * nru
342
221
  real_len = len(geno)
343
222
  if real_len != exp_len:
344
223
  s = "Plink .bed file has {n1} bits, expected {n2}"
345
224
  raise OSError(s.format(n1=real_len, n2=exp_len))
346
225
 
347
- # -
348
- def __filter_indivs__(self, geno, keep_indivs, m, n):
226
+ def _calculate_all_snp_info(self):
227
+ """
228
+ Pre-calculate MAF and other information for all SNPs.
229
+
230
+ Returns
231
+ -------
232
+ dict
233
+ Dictionary containing information for all SNPs
234
+ """
235
+ nru = self.nru_original
236
+ n = self.n_original
237
+ m = self.m_original
238
+ geno = self.geno_original
239
+
240
+ snp_info = {
241
+ "freq": np.zeros(m), # Allele frequencies
242
+ "het_miss_count": np.zeros(m), # Count of het or missing genotypes
243
+ "valid_snp": np.zeros(m, dtype=bool), # Whether SNP passes basic criteria
244
+ }
245
+
246
+ # For each SNP, calculate statistics
247
+ for j in range(m):
248
+ z = geno[2 * nru * j : 2 * nru * (j + 1)]
249
+ A = z[0::2]
250
+ a = A.count()
251
+ B = z[1::2]
252
+ b = B.count()
253
+ c = (A & B).count()
254
+ major_ct = b + c # number of copies of the major allele
255
+ n_nomiss = n - a + c # number of individuals with nonmissing genotypes
256
+ f = major_ct / (2 * n_nomiss) if n_nomiss > 0 else 0
257
+ het_miss_ct = a + b - 2 * c # count of SNPs that are het or missing
258
+
259
+ snp_info["freq"][j] = f
260
+ snp_info["het_miss_count"][j] = het_miss_ct
261
+ snp_info["valid_snp"][j] = het_miss_ct < n # Basic validity check
262
+
263
+ return snp_info
264
+
265
def apply_filters(self, keep_snps=None, keep_indivs=None, mafMin=None):
    """
    Apply filters to the genotype data without reloading the bed file.

    Every call starts again from the full (post-QC) data set, so filters
    from earlier calls do not accumulate.

    Parameters
    ----------
    keep_snps : array-like, optional
        Indices of SNPs to keep (positions in the BIM table).
    keep_indivs : array-like, optional
        Indices of individuals to keep.
    mafMin : float, optional
        Minimum minor allele frequency; SNPs must be strictly above it.

    Returns
    -------
    self
        Returns self for method chaining.

    Raises
    ------
    ValueError
        If an index in ``keep_snps`` or ``keep_indivs`` is out of bounds,
        or if no individuals remain after filtering.
    """
    # Reset to original state first
    self.geno = self.geno_original.copy()
    self.m = self.m_original
    self.n = self.n_original
    self.nru = self.nru_original
    self._currentSNP = 0

    # Frequencies of the full SNP set, aligned with the BIM table.
    # Selecting with the stored validity mask keeps this correct whether or
    # not the pre-computed arrays still contain SNPs that failed QC.
    # self.maf / self.freq are deliberately not used here: a previous call
    # may have shortened them to a filtered subset.
    snp_info = self.all_snp_info
    freq_all = snp_info["freq"][snp_info["valid_snp"]]
    maf_all = np.minimum(freq_all, 1 - freq_all)

    # Initialize with all SNPs
    kept_snps = np.arange(self.m_original)

    # Apply MAF filter using pre-calculated values
    if mafMin is not None and mafMin > 0:
        kept_snps = kept_snps[maf_all > mafMin]
        logger.info(f"After MAF filtering (>{mafMin}), {len(kept_snps)} SNPs remain")

    # Apply SNP filter if specified
    if keep_snps is not None:
        keep_snps = np.array(keep_snps, dtype="int")
        # Valid indices are 0 .. m_original - 1
        if np.any((keep_snps < 0) | (keep_snps >= self.m_original)):
            raise ValueError("keep_snps indices out of bounds")

        # Intersect with current kept_snps
        kept_snps = np.intersect1d(kept_snps, keep_snps)
        logger.info(f"After keep_snps filtering, {len(kept_snps)} SNPs remain")

    # Filter SNPs in the genotype data
    if len(kept_snps) < self.m_original:
        # Create new genotype data with only the kept SNPs
        bits_per_snp = 2 * self.nru
        new_geno = ba.bitarray()
        for j in kept_snps:
            new_geno += self.geno_original[bits_per_snp * j : bits_per_snp * (j + 1)]
        self.geno = new_geno
        self.m = len(kept_snps)

    # Filter individuals if specified
    if keep_indivs is not None:
        keep_indivs = np.array(keep_indivs, dtype="int")
        # Valid indices are 0 .. n - 1
        if np.any((keep_indivs < 0) | (keep_indivs >= self.n)):
            raise ValueError("keep_indivs indices out of bounds")

        (self.geno, self.m, self.n) = self._filter_indivs(
            self.geno, keep_indivs, self.m, self.n
        )

        if self.n > 0:
            logger.info(f"After filtering, {self.n} individuals remain")
        else:
            raise ValueError("After filtering, no individuals remain")

    # Update kept_snps and the frequency attributes. The frequencies are the
    # values pre-computed on all individuals; they are not recomputed for a
    # subset of individuals.
    self.kept_snps = kept_snps
    self.freq = freq_all[kept_snps]
    self.maf = np.minimum(self.freq, 1 - self.freq)
    self.sqrtpq = np.sqrt(self.freq * (1 - self.freq))

    return self
341
+
342
+ def _filter_indivs(self, geno, keep_indivs, m, n):
343
+ """
344
+ Filter individuals based on the keep_indivs parameter.
345
+ """
349
346
  n_new = len(keep_indivs)
350
347
  e = (4 - n_new % 4) if n_new % 4 != 0 else 0
351
348
  nru_new = n_new + e
@@ -358,95 +355,118 @@ class PlinkBEDFile(GenotypeArrayInMemory):
358
355
  self.nru = nru_new
359
356
  return (z, m, n_new)
360
357
 
361
- # -
362
- def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps):
358
def get_snps_by_maf(self, mafMin):
    """
    Get the list of SNPs that pass the MAF threshold.

    The threshold is applied to the ``MAF`` column of the BIM table, which
    has one value per SNP of that table. (``self.maf`` can be shorter after
    ``apply_filters`` has been called and would then no longer line up with
    the table rows.)

    Parameters
    ----------
    mafMin : float
        Minimum MAF threshold; SNPs must be strictly above it

    Returns
    -------
    list
        List of SNP IDs that pass the MAF threshold
    """
    maf_mask = self.bim_df["MAF"] > mafMin

    # Get SNP names from the BIM dataframe
    snp_pass_maf = self.bim_df.loc[maf_mask, "SNP"].tolist()

    logger.info(f"{len(snp_pass_maf)} SNPs with MAF > {mafMin}")

    return snp_pass_maf
380
+
381
def get_ldscore(self, annot_matrix=None, ld_wind=1.0, ld_unit="CM", keep_snps_index=None):
    """
    Calculate LD scores using an annotation matrix.

    Parameters
    ----------
    annot_matrix : np.ndarray, optional
        Annotation matrix. If None, uses a matrix of all ones.
    ld_wind : float, optional
        LD window size, by default 1.0
    ld_unit : str, optional
        Unit for the LD window ("SNP", "KB" or "CM"), by default "CM".
        With "CM", if every CM value is 0 a 1 Mb window on the BP
        coordinates is used instead.
    keep_snps_index : list[int], optional
        Indices of SNPs to keep, by default None. The filter is temporary:
        the SNP selection that was active before the call is re-applied
        afterwards, also when the computation fails.

    Returns
    -------
    np.ndarray
        Array with calculated LD scores

    Raises
    ------
    ValueError
        If ``ld_unit`` is not one of SNP, KB, CM.
    """
    # Apply filters if needed, remembering the selection to restore
    original_kept_snps = None
    if keep_snps_index is not None:
        original_kept_snps = self.kept_snps.copy()
        self.apply_filters(keep_snps=keep_snps_index)

    try:
        # Configure LD window based on specified unit
        if ld_unit == "SNP":
            max_dist = ld_wind
            coords = np.arange(self.m)
        elif ld_unit == "KB":
            max_dist = ld_wind * 1000
            coords = np.array(self.bim_df.loc[self.kept_snps, "BP"])
        elif ld_unit == "CM":
            max_dist = ld_wind
            coords = np.array(self.bim_df.loc[self.kept_snps, "CM"])
            # Check if the CM is all 0
            if np.all(coords == 0):
                logger.warning(
                    "All CM values are 0. Using 1MB window size for LD score calculation."
                )
                max_dist = 1_000_000
                coords = np.array(self.bim_df.loc[self.kept_snps, "BP"])
        else:
            raise ValueError(f"Invalid ld_wind_unit: {ld_unit}. Must be one of: SNP, KB, CM")

        # Calculate blocks for LD computation
        block_left = getBlockLefts(coords, max_dist)
        # NOTE(review): assert is skipped under `python -O`; kept so callers
        # still see AssertionError for this condition.
        assert block_left.sum() > 0, "Invalid window size, please check the ld_wind parameter."

        # Calculate LD scores (chunk size 100)
        return self.ldScoreVarBlocks(block_left, 100, annot=annot_matrix)
    finally:
        # Restore original state if filters were applied, even on error,
        # so a failed call does not leave the object in the filtered state.
        if keep_snps_index is not None:
            self.apply_filters(keep_snps=original_kept_snps)
438
+
439
def restart(self):
    """
    Rewind the SNP cursor so that reading starts again at the first SNP.
    """
    first_snp = 0
    self._currentSNP = first_snp
444
+
445
+ def nextSNPs(self, b, minorRef=None):
446
+ """
447
+ Unpacks the binary array of genotypes and returns an n x b matrix of floats of
448
+ normalized genotypes for the next b SNPs.
429
449
  """
430
- # -
431
450
  try:
432
451
  b = int(b)
433
452
  if b <= 0:
434
453
  raise ValueError("b must be > 0")
435
454
  except TypeError as e:
436
455
  raise TypeError("b must be an integer") from e
437
- # -
456
+
438
457
  if self._currentSNP + b > self.m:
439
458
  s = "{b} SNPs requested, {k} SNPs remain"
440
459
  raise ValueError(s.format(b=b, k=(self.m - self._currentSNP)))
441
- # -
460
+
442
461
  c = self._currentSNP
443
462
  n = self.n
444
463
  nru = self.nru
445
464
  slice = self.geno[2 * c * nru : 2 * (c + b) * nru]
446
- X = np.array(slice.decode(self._bedcode), dtype="float64").reshape((b, nru)).T
465
+ X = np.array(slice.decode(self._bedcode), dtype="float32").reshape((b, nru)).T
447
466
  X = X[0:n, :]
448
- Y = np.zeros(X.shape)
449
- # normalize the SNPs and impute the missing one with the mean
467
+ Y = np.zeros(X.shape, dtype="float32")
468
+
469
+ # Normalize the SNPs and impute the missing ones with the mean
450
470
  for j in range(0, b):
451
471
  newsnp = X[:, j]
452
472
  ii = newsnp != 9
@@ -455,35 +475,116 @@ class PlinkBEDFile(GenotypeArrayInMemory):
455
475
  denom = np.std(newsnp)
456
476
  if denom == 0:
457
477
  denom = 1
458
- # -
478
+
459
479
  if minorRef is not None and self.freq[self._currentSNP + j] > 0.5:
460
480
  denom = denom * -1
461
- # -
481
+
462
482
  Y[:, j] = (newsnp - avg) / denom
463
- # -
483
+
464
484
  self._currentSNP += b
465
485
  return Y
466
486
 
487
+ def _l2_unbiased(self, x, n):
488
+ """
489
+ Calculate the unbiased estimate of L2.
490
+ """
491
+ denom = n - 2 if n > 2 else n # allow n<2 for testing purposes
492
+ sq = np.square(x)
493
+ return sq - (1 - sq) / denom
494
+
495
def ldScoreVarBlocks(self, block_left, c, annot=None):
    """
    Compute an unbiased estimate of L2(j) for j=1,..,M.

    Delegates to ``_corSumVarBlocks`` with ``_l2_unbiased`` (evaluated with
    the current number of individuals) as the correlation transform and
    ``nextSNPs`` as the genotype reader.

    Parameters
    ----------
    block_left : np.ndarray
        Index of the leftmost SNP in the window of each SNP.
    c : int
        Chunk size.
    annot : np.ndarray, optional
        SNP annotations, passed through unchanged.
    """
    return self._corSumVarBlocks(
        block_left,
        c,
        lambda r: self._l2_unbiased(r, self.n),
        self.nextSNPs,
        annot,
    )
476
505
 
477
- # Load fam
478
- ind_file = bfile_chr_prefix + ".fam"
479
- array_indivs = PlinkFAMFile(ind_file)
506
def _corSumVarBlocks(self, block_left, c, func, snp_getter, annot=None):
    """
    Calculate windowed sums of transformed genotype correlations (LD scores).

    SNPs are read in chunks of ``c`` through ``snp_getter``; for each chunk
    the correlations with the SNPs of the current window are computed,
    transformed by ``func`` and accumulated, weighted by ``annot``.

    Parameters
    ----------
    block_left : np.ndarray with shape (M, )
        block_left[i] = index of leftmost SNP included in LD Score of SNP i.
        If c > 1, only entries that are multiples of c are examined, and it
        is assumed that block_left[a*c+i] = block_left[a*c], except at the
        beginning of the chromosome where the 0th SNP is included in the
        window.
    c : int
        Chunk size.
    func : function
        Function applied to the genotype correlation matrix before it is
        dotted with annot (e.g. np.square for a biased L2).
    snp_getter : function(int)
        Method used to get the next SNPs (normalized genotype columns).
    annot : numpy array with shape (m, n_a), optional
        SNP annotations. Defaults to a single column of ones.

    Returns
    -------
    cor_sum : np.ndarray with shape (M, num_annots)
        Estimates.

    Raises
    ------
    ValueError
        If annot does not have one row per SNP.
    """
    m, n = self.m, self.n
    # Window size to the left of each SNP, rounded up to a multiple of c
    block_sizes = np.array(np.arange(m) - block_left)
    block_sizes = np.ceil(block_sizes / c) * c
    if annot is None:
        annot = np.ones((m, 1), dtype="float32")
    else:
        # annot = annot.astype("float32")  # Ensure annot is float32
        annot_m = annot.shape[0]
        if annot_m != self.m:
            raise ValueError("Incorrect number of SNPs in annot")

    n_a = annot.shape[1]  # number of annotations
    cor_sum = np.zeros((m, n_a), dtype="float32")
    # b = index of first SNP for which SNP 0 is not included in LD Score
    b = np.nonzero(block_left > 0)
    if np.any(b):
        b = b[0][0]
    else:
        b = m
    b = int(np.ceil(b / c) * c)  # round up to a multiple of c
    if b > m:
        # Fewer SNPs than one rounded-up block: process SNP by SNP
        c = 1
        b = m

    l_A = 0  # l_A := index of leftmost SNP in matrix A
    A = snp_getter(b)  # This now returns float32 data
    # Reusable output buffers for the A-vs-B and B-vs-B correlation matrices
    rfuncAB = np.zeros((b, c), dtype="float32")
    rfuncBB = np.zeros((c, c), dtype="float32")
    # chunk inside of block
    for l_B in np.arange(0, b, c):  # l_B := index of leftmost SNP in matrix B
        B = A[:, l_B : l_B + c]
        # ld matrix
        np.dot(A.T, B / n, out=rfuncAB)
        # ld matrix square
        rfuncAB = func(rfuncAB)
        cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])

    # chunk to right of block
    b0 = b
    md = int(c * np.floor(m / c))  # largest multiple of c not exceeding m
    end = md + 1 if md != m else md  # include the final partial chunk, if any
    for l_B in tqdm(np.arange(b0, end, c), desc="Compute SNP Gene Weight"):
        # check if the annot matrix is all zeros for this block + chunk
        # this happens w/ sparse categories (i.e., pathways)
        # update the block
        old_b = b
        b = int(block_sizes[l_B])
        if l_B > b0 and b > 0:
            # block_size can't increase more than c
            # block_size can't be less than c unless it is zero
            # both of these things make sense
            # Slide the window: drop the leftmost columns of A and append
            # the chunk B read in the previous iteration
            A = np.hstack((A[:, old_b - b + c : old_b], B))
            l_A += old_b - b + c
        elif l_B == b0 and b > 0:
            A = A[:, b0 - b : b0]
            l_A = b0 - b
        elif b == 0:  # no SNPs to left in window, e.g., after a sequence gap
            A = np.array((), dtype="float32").reshape((n, 0))
            l_A = l_B
        if l_B == md:
            # Last, partial chunk: shrink the chunk size and the buffers
            c = m - md
            rfuncAB = np.zeros((b, c), dtype="float32")
            rfuncBB = np.zeros((c, c), dtype="float32")
        if b != old_b:
            rfuncAB = np.zeros((b, c), dtype="float32")

        # Read the next chunk before the all-zero check so that the SNP
        # cursor of snp_getter advances even when the chunk is skipped
        B = snp_getter(c)  # This now returns float32 data
        p1 = np.all(annot[l_A : l_A + b, :] == 0)
        p2 = np.all(annot[l_B : l_B + c, :] == 0)
        if p1 and p2:
            continue

        # Window (A) vs chunk (B): contributes to both sides
        np.dot(A.T, B / n, out=rfuncAB)
        rfuncAB = func(rfuncAB)
        cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
        cor_sum[l_B : l_B + c, :] += np.dot(annot[l_A : l_A + b, :].T, rfuncAB).T
        # Chunk vs itself
        np.dot(B.T, B / n, out=rfuncBB)
        rfuncBB = func(rfuncBB)
        cor_sum[l_B : l_B + c, :] += np.dot(rfuncBB, annot[l_B : l_B + c, :])

    return cor_sum