gsMap 1.73.2__py3-none-any.whl → 1.73.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,67 +1,27 @@
1
+ """
2
+ Module for reading and processing PLINK genotype data and calculating LD scores.
3
+
4
+ Note:
5
+ This code is adapted and modified from:
6
+ https://github.com/bulik/ldsc/blob/master/ldsc/ldscore.py
7
+ """
8
+
9
+ import logging
10
+
1
11
  import bitarray as ba
2
12
  import numpy as np
3
13
  import pandas as pd
14
+ import pyranges as pr
4
15
  from tqdm import tqdm
5
16
 
6
-
7
- # Define the reading functions
8
- def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
9
- # -
10
- class IDContainer:
11
- """
12
- A class to read data from a file, store it as a DataFrame, and provide a method for a left outer join operation.
13
- """
14
-
15
- def __init__(self, fname):
16
- """
17
- Initialize the IDContainer with the given filename and reading options.
18
- """
19
- self.usecols = usecols
20
- self.colnames = colnames
21
- self.keepcol = keepcol
22
- self.fname_end = fname_end
23
- self.header = header
24
- self.read(fname)
25
- self.n = len(self.df)
26
-
27
- # -
28
- def read(self, fname):
29
- """
30
- Read data from the given file and store it as a DataFrame.
31
- """
32
- end = self.fname_end
33
- if end and not fname.endswith(end):
34
- raise ValueError(f"{end} filename must end in {end}")
35
- self.df = pd.read_csv(
36
- fname,
37
- header=self.header,
38
- usecols=self.usecols,
39
- sep=r"\s+",
40
- )
41
- if self.colnames:
42
- self.df.columns = self.colnames
43
- if self.keepcol is not None:
44
- self.IDList = self.df.iloc[:, [self.keepcol]].astype("object")
45
-
46
- return IDContainer
17
+ # Configure logger
18
+ logger = logging.getLogger("gsMap.utils.plink_ldscore_tool")
47
19
 
48
20
 
49
21
  def getBlockLefts(coords, max_dist):
50
22
  """
51
- Converts coordinates + max block length to the a list of coordinates of the leftmost
23
+ Converts coordinates + max block length to a list of coordinates of the leftmost
52
24
  SNPs to be included in blocks.
53
-
54
- Parameters
55
- ----------
56
- coords : array
57
- Array of coordinates. Must be sorted.
58
- max_dist : float
59
- Maximum distance between SNPs included in the same window.
60
-
61
- Returns
62
- -------
63
- block_left : 1D np.ndarray with same length as block_left
64
- block_left[j] := min{k | dist(j, k) < max_dist}.
65
25
  """
66
26
  M = len(coords)
67
27
  j = 0
@@ -77,16 +37,6 @@ def getBlockLefts(coords, max_dist):
77
37
  def block_left_to_right(block_left):
78
38
  """
79
39
  Converts block lefts to block rights.
80
-
81
- Parameters
82
- ----------
83
- block_left : array
84
- Array of block lefts.
85
-
86
- Returns
87
- -------
88
- block_right : 1D np.ndarray with same length as block_left
89
- block_right[j] := max {k | block_left[k] <= j}
90
40
  """
91
41
  M = len(block_left)
92
42
  j = 0
@@ -99,223 +49,149 @@ def block_left_to_right(block_left):
99
49
  return block_right
100
50
 
101
51
 
102
- class GenotypeArrayInMemory:
52
+ class PlinkBEDFile:
103
53
  """
104
- Parent class for various classes containing interfaces for files with genotype
105
- matrices, e.g., plink .bed files, etc
54
+ Interface for Plink .bed format for reading and processing genotype data.
106
55
  """
107
56
 
108
- def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
109
- self.m = len(snp_list.IDList)
110
- self.n = n
111
- self.keep_snps = keep_snps
112
- self.keep_indivs = keep_indivs
113
- self.df = np.array(snp_list.df[["CHR", "SNP", "BP", "CM"]])
114
- self.colnames = ["CHR", "SNP", "BP", "CM"]
115
- self.mafMin = mafMin if mafMin is not None else 0
116
- self._currentSNP = 0
117
- (self.nru, self.geno) = self.__read__(fname, self.m, n)
118
- # filter individuals
119
- if keep_indivs is not None:
120
- keep_indivs = np.array(keep_indivs, dtype="int")
121
- if np.any(keep_indivs > self.n):
122
- raise ValueError("keep_indivs indices out of bounds")
123
- # -
124
- (self.geno, self.m, self.n) = self.__filter_indivs__(
125
- self.geno, keep_indivs, self.m, self.n
126
- )
127
- # -
128
- if self.n > 0:
129
- print(f"After filtering, {self.n} individuals remain")
130
- else:
131
- raise ValueError("After filtering, no individuals remain")
132
- # -
133
- # filter SNPs
134
- if keep_snps is not None:
135
- keep_snps = np.array(keep_snps, dtype="int")
136
- if np.any(keep_snps > self.m): # if keep_snps is None, this returns False
137
- raise ValueError("keep_snps indices out of bounds")
138
- # -
139
- (self.geno, self.m, self.n, self.kept_snps, self.freq) = self.__filter_snps_maf__(
140
- self.geno, self.m, self.n, self.mafMin, keep_snps
57
def __init__(self, bfile_prefix):
    """
    Initialize the PlinkBEDFile from a PLINK file prefix.

    Loads the .bim and .fam tables, reads the packed .bed genotypes,
    pre-computes per-SNP statistics, and drops every SNP whose
    ``valid_snp`` flag is False. After construction all per-SNP state
    (``bim_df``, ``geno_original``, ``all_snp_info``, ``kept_snps``,
    ``freq``/``maf``/``sqrtpq``) refers to the same filtered SNP set and
    is indexed 0..m-1.

    Parameters
    ----------
    bfile_prefix : str
        PLINK file prefix (without .bed/.bim/.fam extension)
    """
    # Two-bit genotype codes used to decode the packed .bed data
    # (nextSNPs treats the value 9 as a missing call).
    self._bedcode = {
        2: ba.bitarray("11"),
        9: ba.bitarray("10"),
        1: ba.bitarray("01"),
        0: ba.bitarray("00"),
    }

    # Load BIM (variants) and FAM (individuals) tables
    self.bim_df = self.load_bim(f"{bfile_prefix}.bim")
    self.fam_df = self.load_fam(f"{bfile_prefix}.fam")

    # Raw dimensions, before any SNP is dropped
    self.m_original = len(self.bim_df)
    self.n_original = len(self.fam_df)

    # Read the bed file
    logger.info(f"Loading Plink genotype data from {bfile_prefix}.bed")
    (self.nru_original, self.geno_original) = self._read(
        f"{bfile_prefix}.bed", self.m_original, self.n_original
    )

    # Pre-calculate MAF for all SNPs
    logger.info("Calculating MAF and QC for all SNPs")
    self.all_snp_info = self._calculate_all_snp_info()

    # Filter out invalid SNPs
    valid_mask = self.all_snp_info["valid_snp"]
    if num_invalid := np.sum(~valid_mask):
        logger.warning(f"Filtering out {num_invalid} bad quality SNPs")
    else:
        logger.info("All SNPs passed the basic quality check")

    # Positions of the valid SNPs in the raw (on-disk) SNP order; only
    # needed to slice the raw genotype bit array below.
    raw_valid_idx = np.arange(self.m_original)[valid_mask]

    # Update bim_df to only include valid SNPs and reset index
    self.bim_df = self.bim_df.loc[valid_mask].reset_index(drop=True)

    # Create new genotype data with only the valid SNPs
    stride = 2 * self.nru_original  # bits stored per SNP
    new_geno = ba.bitarray()
    for j in raw_valid_idx:
        new_geno += self.geno_original[stride * j : stride * (j + 1)]

    # From here on "original" means "all valid SNPs". Re-align the
    # per-SNP statistics with that set: apply_filters/get_snps_by_maf
    # index them with positions 0..m_original-1, so leaving them at raw
    # length would mis-index (or fail) whenever a SNP was dropped.
    self.all_snp_info = {key: values[valid_mask] for key, values in self.all_snp_info.items()}
    self.geno_original = new_geno
    self.m_original = len(raw_valid_idx)

    # kept_snps is used as a label index into the re-indexed bim_df
    # (see get_ldscore), so it must live in the filtered index space.
    self.kept_snps = np.arange(self.m_original)

    # Initialize current state variables
    self._currentSNP = 0
    self.m = self.m_original
    self.n = self.n_original
    self.nru = self.nru_original
    self.geno = self.geno_original.copy()

    # Frequency info for the valid SNPs (copied so later updates to
    # self.freq never alias the cached statistics)
    self.freq = self.all_snp_info["freq"].copy()
    self.maf = np.minimum(self.freq, 1 - self.freq)
    self.sqrtpq = np.sqrt(self.freq * (1 - self.freq))

    # Add MAF to the BIM dataframe
    self.bim_df["MAF"] = self.maf

    logger.info(f"Loaded genotype data with {self.m} SNPs and {self.n} individuals")
134
+
135
+ @staticmethod
136
def load_bim(bim_file):
    """
    Load a BIM file into a pandas DataFrame.

    Fields are split on any run of whitespace, so both tab- and
    space-delimited files are accepted (the same rule load_fam uses).

    Parameters
    ----------
    bim_file : str
        Path to the BIM file

    Returns
    -------
    pd.DataFrame
        DataFrame with columns CHR, SNP, CM, BP, A1, A2 (one row per variant)
    """
    return pd.read_csv(
        bim_file,
        sep=r"\s+",
        header=None,
        names=["CHR", "SNP", "CM", "BP", "A1", "A2"],
    )
237
154
 
238
- l_A = 0 # l_A := index of leftmost SNP in matrix A
239
- A = snp_getter(b)
240
- rfuncAB = np.zeros((b, c))
241
- rfuncBB = np.zeros((c, c))
242
- # chunk inside of block
243
- for l_B in np.arange(0, b, c): # l_B := index of leftmost SNP in matrix B
244
- B = A[:, l_B : l_B + c]
245
- # ld matrix
246
- np.dot(A.T, B / n, out=rfuncAB)
247
- # ld matrix square
248
- rfuncAB = func(rfuncAB)
249
- cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
155
+ @staticmethod
156
def convert_bim_to_pyrange(bim_df) -> pr.PyRanges:
    """
    Convert a BIM DataFrame into a PyRanges object.

    Parameters
    ----------
    bim_df : pd.DataFrame
        BIM table with the six columns produced by load_bim, optionally
        followed by the "MAF" column that __init__ appends. The input
        frame is not modified.

    Returns
    -------
    pr.PyRanges
        One interval per SNP with columns Chromosome ("chr"-prefixed),
        SNP, CM, Start, A1, A2 and End.
    """
    # drop() returns a new frame, so the caller's data stays untouched;
    # errors="ignore" lets a plain load_bim() result (no MAF column) through.
    bim_pr = bim_df.drop(columns=["MAF"], errors="ignore")
    bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
    bim_pr["Chromosome"] = "chr" + bim_pr["Chromosome"].astype(str)

    # Adjust coordinates (BIM is 1-based, PyRanges uses 0-based):
    # the SNP at position p becomes the interval [p - 1, p).
    bim_pr["End"] = bim_pr["Start"].copy()
    bim_pr["Start"] = bim_pr["Start"] - 1

    return pr.PyRanges(bim_pr)
301
169
 
302
- def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
303
- self._bedcode = {
304
- 2: ba.bitarray("11"),
305
- 9: ba.bitarray("10"),
306
- 1: ba.bitarray("01"),
307
- 0: ba.bitarray("00"),
308
- }
309
- # -
310
- GenotypeArrayInMemory.__init__(
311
- self, fname, n, snp_list, keep_snps=keep_snps, keep_indivs=keep_indivs, mafMin=mafMin
312
- )
170
+ @staticmethod
171
def load_fam(fam_file):
    """
    Read the individual IDs from a FAM file.

    Parameters
    ----------
    fam_file : str
        Path to the FAM file

    Returns
    -------
    pd.DataFrame
        Single-column DataFrame ("IID") holding the second
        whitespace-separated field of every line
    """
    read_options = {
        "sep": r"\s+",
        "header": None,
        "usecols": [1],  # only the second field is kept
        "names": ["IID"],
    }
    fam_table = pd.read_csv(fam_file, **read_options)
    return fam_table
313
187
 
314
- # -
315
- def __read__(self, fname, m, n):
188
+ def _read(self, fname, m, n):
189
+ """
190
+ Read the bed file and return the genotype data.
191
+ """
316
192
  if not fname.endswith(".bed"):
317
193
  raise ValueError(".bed filename must end in .bed")
318
- # -
194
+
319
195
  fh = open(fname, "rb")
320
196
  magicNumber = ba.bitarray(endian="little")
321
197
  magicNumber.fromfile(fh, 2)
@@ -323,29 +199,150 @@ class PlinkBEDFile(GenotypeArrayInMemory):
323
199
  bedMode.fromfile(fh, 1)
324
200
  e = (4 - n % 4) if n % 4 != 0 else 0
325
201
  nru = n + e
326
- self.nru = nru
327
- # check magic number
202
+
203
+ # Check magic number
328
204
  if magicNumber != ba.bitarray("0011011011011000"):
329
205
  raise OSError("Magic number from Plink .bed file not recognized")
330
- # -
206
+
331
207
  if bedMode != ba.bitarray("10000000"):
332
208
  raise OSError("Plink .bed file must be in default SNP-major mode")
333
- # check file length
334
- self.geno = ba.bitarray(endian="little")
335
- self.geno.fromfile(fh)
336
- self.__test_length__(self.geno, self.m, self.nru)
337
- return (self.nru, self.geno)
338
-
339
- # -
340
- def __test_length__(self, geno, m, nru):
209
+
210
+ # Check file length
211
+ geno = ba.bitarray(endian="little")
212
+ geno.fromfile(fh)
213
+ self._test_length(geno, m, nru)
214
+ return (nru, geno)
215
+
216
+ def _test_length(self, geno, m, nru):
217
+ """
218
+ Test if the genotype data has the expected length.
219
+ """
341
220
  exp_len = 2 * m * nru
342
221
  real_len = len(geno)
343
222
  if real_len != exp_len:
344
223
  s = "Plink .bed file has {n1} bits, expected {n2}"
345
224
  raise OSError(s.format(n1=real_len, n2=exp_len))
346
225
 
347
- # -
348
- def __filter_indivs__(self, geno, keep_indivs, m, n):
226
def _calculate_all_snp_info(self):
    """
    Compute per-SNP allele frequency and a basic validity flag.

    Works on the *_original attributes (raw genotype bits and dimensions).
    For each SNP the two bit planes are counted separately:
      a = set bits in the first plane  (missing + hom major)
      b = set bits in the second plane (het + hom major)
      c = set bits in both planes      (hom major)
    from which the major allele count is b + c, the number of non-missing
    individuals is n - a + c, and het + missing is a + b - 2c.

    Returns
    -------
    dict
        "freq" (float array), "het_miss_count" (float array) and
        "valid_snp" (bool array), each with one entry per SNP. A SNP is
        valid when fewer than n individuals are het-or-missing.
    """
    bits_per_snp = 2 * self.nru_original
    n_indiv = self.n_original
    n_snps = self.m_original
    packed = self.geno_original

    freq = np.zeros(n_snps)  # Allele frequencies
    het_miss_count = np.zeros(n_snps)  # Count of het or missing genotypes
    valid_snp = np.zeros(n_snps, dtype=bool)  # Whether SNP passes basic criteria

    for idx in range(n_snps):
        snp_bits = packed[bits_per_snp * idx : bits_per_snp * (idx + 1)]
        plane_a = snp_bits[0::2]
        plane_b = snp_bits[1::2]
        a = plane_a.count()
        b = plane_b.count()
        c = (plane_a & plane_b).count()

        n_nomiss = n_indiv - a + c  # individuals with nonmissing genotypes
        het_miss_ct = a + b - 2 * c  # genotypes that are het or missing

        # b + c is the number of copies of the major allele
        freq[idx] = (b + c) / (2 * n_nomiss) if n_nomiss > 0 else 0
        het_miss_count[idx] = het_miss_ct
        valid_snp[idx] = het_miss_ct < n_indiv  # Basic validity check

    return {
        "freq": freq,
        "het_miss_count": het_miss_count,
        "valid_snp": valid_snp,
    }
264
+
265
def apply_filters(self, keep_snps=None, keep_indivs=None, mafMin=None):
    """
    Apply filters to the genotype data without reloading the bed file.

    Every call starts again from the "original" state (all SNPs and
    individuals kept at load time), so filters from earlier calls do not
    accumulate.

    Parameters
    ----------
    keep_snps : array-like, optional
        Indices of SNPs to keep, in the range [0, m_original).
    keep_indivs : array-like, optional
        Indices of individuals to keep, in the range [0, n_original).
    mafMin : float, optional
        Minimum minor allele frequency (strict: MAF must exceed it).

    Returns
    -------
    self
        Returns self for method chaining.

    Raises
    ------
    ValueError
        If an index is out of bounds, or no individuals remain.
    """
    # Reset to original state first
    self.geno = self.geno_original.copy()
    self.m = self.m_original
    self.n = self.n_original
    self.nru = self.nru_original
    self._currentSNP = 0

    # Initialize with all SNPs
    kept_snps = np.arange(self.m_original)

    # Per-SNP statistics aligned with positions 0..m_original-1.
    # NOTE(review): if the cached arrays still cover SNPs that were
    # dropped at load time (length != m_original), restrict them to the
    # valid SNPs so that masks and index lookups line up — assumes
    # m_original equals the number of valid SNPs in that case.
    freq_all = self.all_snp_info["freq"]
    valid_all = self.all_snp_info["valid_snp"]
    if len(freq_all) != self.m_original:
        freq_all = freq_all[valid_all]
        valid_all = valid_all[valid_all]

    # Apply MAF filter using pre-calculated values
    if mafMin is not None and mafMin > 0:
        maf_values = np.minimum(freq_all, 1 - freq_all)
        maf_mask = (maf_values > mafMin) & valid_all
        kept_snps = kept_snps[maf_mask]
        logger.info(f"After MAF filtering (>{mafMin}), {len(kept_snps)} SNPs remain")

    # Apply SNP filter if specified
    if keep_snps is not None:
        keep_snps = np.array(keep_snps, dtype="int")
        # Valid indices are 0..m_original-1; m_original itself is already out of range
        if np.any((keep_snps < 0) | (keep_snps >= self.m_original)):
            raise ValueError("keep_snps indices out of bounds")

        # Intersect with current kept_snps (result is sorted and unique)
        kept_snps = np.intersect1d(kept_snps, keep_snps)
        logger.info(f"After keep_snps filtering, {len(kept_snps)} SNPs remain")

    # Filter SNPs in the genotype data
    if len(kept_snps) < self.m_original:
        # Create new genotype data with only the kept SNPs
        stride = 2 * self.nru_original  # bits stored per SNP
        new_geno = ba.bitarray()
        for j in kept_snps:
            new_geno += self.geno_original[stride * j : stride * (j + 1)]
        self.geno = new_geno
        self.m = len(kept_snps)

    # Filter individuals if specified
    if keep_indivs is not None:
        keep_indivs = np.array(keep_indivs, dtype="int")
        # Valid indices are 0..n-1; anything else would address padding
        # bits or another SNP's genotypes in the packed array.
        if np.any((keep_indivs < 0) | (keep_indivs >= self.n)):
            raise ValueError("keep_indivs indices out of bounds")

        (self.geno, self.m, self.n) = self._filter_indivs(
            self.geno, keep_indivs, self.m, self.n
        )

        if self.n > 0:
            logger.info(f"After filtering, {self.n} individuals remain")
        else:
            raise ValueError("After filtering, no individuals remain")

    # Update kept_snps and other attributes (frequencies come from the
    # pre-calculated values; they are not recomputed after filtering
    # individuals)
    self.kept_snps = kept_snps
    self.freq = freq_all[kept_snps]
    self.maf = np.minimum(self.freq, 1 - self.freq)
    self.sqrtpq = np.sqrt(self.freq * (1 - self.freq))

    return self
341
+
342
+ def _filter_indivs(self, geno, keep_indivs, m, n):
343
+ """
344
+ Filter individuals based on the keep_indivs parameter.
345
+ """
349
346
  n_new = len(keep_indivs)
350
347
  e = (4 - n_new % 4) if n_new % 4 != 0 else 0
351
348
  nru_new = n_new + e
@@ -358,95 +355,120 @@ class PlinkBEDFile(GenotypeArrayInMemory):
358
355
  self.nru = nru_new
359
356
  return (z, m, n_new)
360
357
 
361
- # -
362
- def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps):
358
def get_snps_by_maf(self, mafMin):
    """
    Get the list of SNPs that pass the MAF threshold.

    Parameters
    ----------
    mafMin : float
        Minimum MAF threshold (strict: MAF must exceed it)

    Returns
    -------
    list
        List of SNP IDs (the "SNP" column of bim_df) that pass the MAF
        threshold and are flagged valid
    """
    # Use the pre-calculated MAF values
    freq = self.all_snp_info["freq"]
    valid = self.all_snp_info["valid_snp"]
    # NOTE(review): bim_df only holds the SNPs kept at load time. If the
    # cached statistics still include dropped SNPs, restrict them to the
    # valid ones so the boolean mask has one entry per bim_df row.
    if len(freq) != len(self.bim_df):
        freq = freq[valid]
        valid = valid[valid]

    maf_values = np.minimum(freq, 1 - freq)
    maf_mask = (maf_values > mafMin) & valid

    # Get SNP names from the BIM dataframe
    snp_pass_maf = self.bim_df.loc[maf_mask, "SNP"].tolist()

    logger.info(f"{len(snp_pass_maf)} SNPs with MAF > {mafMin}")

    return snp_pass_maf
382
+
383
def get_ldscore(self, annot_matrix=None, ld_wind=1.0, ld_unit="CM", keep_snps_index=None):
    """
    Calculate LD scores using an annotation matrix.

    Parameters
    ----------
    annot_matrix : np.ndarray, optional
        Annotation matrix. If None, uses a matrix of all ones.
    ld_wind : float, optional
        LD window size, by default 1.0
    ld_unit : str, optional
        Unit for the LD window ("SNP", "KB" or "CM"), by default "CM".
        With "CM", if every CM value is 0 a 1MB base-pair window is used
        instead.
    keep_snps_index : list[int], optional
        Indices of SNPs to keep, by default None. The filter is only
        active for the duration of this call.

    Returns
    -------
    np.ndarray
        Array with calculated LD scores

    Raises
    ------
    ValueError
        If ld_unit is not one of SNP, KB, CM.
    AssertionError
        If the computed window is invalid (block_left sums to 0).
    """
    restore_snps = None
    try:
        # Apply filters if needed
        if keep_snps_index is not None:
            restore_snps = self.kept_snps.copy()
            self.apply_filters(keep_snps=keep_snps_index)

        # Configure LD window based on specified unit
        if ld_unit == "SNP":
            max_dist = ld_wind
            coords = np.array(range(self.m))
        elif ld_unit == "KB":
            max_dist = ld_wind * 1000
            coords = np.array(self.bim_df.loc[self.kept_snps, "BP"])
        elif ld_unit == "CM":
            max_dist = ld_wind
            coords = np.array(self.bim_df.loc[self.kept_snps, "CM"])
            # Check if the CM is all 0
            if np.all(coords == 0):
                logger.warning(
                    "All CM values are 0. Using 1MB window size for LD score calculation."
                )
                max_dist = 1_000_000
                coords = np.array(self.bim_df.loc[self.kept_snps, "BP"])
        else:
            raise ValueError(f"Invalid ld_wind_unit: {ld_unit}. Must be one of: SNP, KB, CM")

        # Calculate blocks for LD computation
        block_left = getBlockLefts(coords, max_dist)
        # Raised explicitly (rather than via `assert`) so the check is
        # not stripped under `python -O`; the exception type is kept.
        if not block_left.sum() > 0:
            raise AssertionError("Invalid window size, please check the ld_wind parameter.")

        # Calculate LD scores
        return self.ldScoreVarBlocks(block_left, 100, annot=annot_matrix)
    finally:
        # Restore the previous SNP selection even when something above
        # raised, so a failed call never leaves the object filtered.
        if restore_snps is not None:
            self.apply_filters(keep_snps=restore_snps)
440
+
441
def restart(self):
    """
    Rewind the SNP cursor so the next nextSNPs() call starts at SNP 0.
    """
    # _currentSNP is the read position advanced by nextSNPs()
    self._currentSNP = 0
446
+
447
+ def nextSNPs(self, b, minorRef=None):
448
+ """
449
+ Unpacks the binary array of genotypes and returns an n x b matrix of floats of
450
+ normalized genotypes for the next b SNPs.
429
451
  """
430
- # -
431
452
  try:
432
453
  b = int(b)
433
454
  if b <= 0:
434
455
  raise ValueError("b must be > 0")
435
456
  except TypeError as e:
436
457
  raise TypeError("b must be an integer") from e
437
- # -
458
+
438
459
  if self._currentSNP + b > self.m:
439
460
  s = "{b} SNPs requested, {k} SNPs remain"
440
461
  raise ValueError(s.format(b=b, k=(self.m - self._currentSNP)))
441
- # -
462
+
442
463
  c = self._currentSNP
443
464
  n = self.n
444
465
  nru = self.nru
445
466
  slice = self.geno[2 * c * nru : 2 * (c + b) * nru]
446
- X = np.array(slice.decode(self._bedcode), dtype="float64").reshape((b, nru)).T
467
+ X = np.array(slice.decode(self._bedcode), dtype="float32").reshape((b, nru)).T
447
468
  X = X[0:n, :]
448
- Y = np.zeros(X.shape)
449
- # normalize the SNPs and impute the missing one with the mean
469
+ Y = np.zeros(X.shape, dtype="float32")
470
+
471
+ # Normalize the SNPs and impute the missing ones with the mean
450
472
  for j in range(0, b):
451
473
  newsnp = X[:, j]
452
474
  ii = newsnp != 9
@@ -455,35 +477,116 @@ class PlinkBEDFile(GenotypeArrayInMemory):
455
477
  denom = np.std(newsnp)
456
478
  if denom == 0:
457
479
  denom = 1
458
- # -
480
+
459
481
  if minorRef is not None and self.freq[self._currentSNP + j] > 0.5:
460
482
  denom = denom * -1
461
- # -
483
+
462
484
  Y[:, j] = (newsnp - avg) / denom
463
- # -
485
+
464
486
  self._currentSNP += b
465
487
  return Y
466
488
 
489
+ def _l2_unbiased(self, x, n):
490
+ """
491
+ Calculate the unbiased estimate of L2.
492
+ """
493
+ denom = n - 2 if n > 2 else n # allow n<2 for testing purposes
494
+ sq = np.square(x)
495
+ return sq - (1 - sq) / denom
496
+
497
def ldScoreVarBlocks(self, block_left, c, annot=None):
    """
    Computes an unbiased estimate of L2(j) for j=1,..,M.

    Delegates to _corSumVarBlocks, using _l2_unbiased (with the current
    sample size self.n) as the transform and nextSNPs as the genotype
    source.
    """
    # self.n is read when the transform is called, not when it is created
    return self._corSumVarBlocks(
        block_left,
        c,
        lambda corr: self._l2_unbiased(corr, self.n),
        self.nextSNPs,
        annot,
    )
476
507
 
477
- # Load fam
478
- ind_file = bfile_chr_prefix + ".fam"
479
- array_indivs = PlinkFAMFile(ind_file)
508
def _corSumVarBlocks(self, block_left, c, func, snp_getter, annot=None):
    """
    Calculate the sum of (transformed) correlation coefficients.

    Parameters
    ----------
    block_left : np.ndarray with shape (M, )
        block_left[i] = index of leftmost SNP included in LD Score of SNP i.
    c : int
        Chunk size.
    func : function
        Applied to each block of the genotype correlation matrix before
        it is multiplied with annot.
    snp_getter : function(int)
        Returns the genotype matrix for the next requested number of SNPs.
    annot : np.ndarray with shape (M, n_annot), optional
        SNP annotations; defaults to a single column of ones.

    Returns
    -------
    cor_sum : np.ndarray with shape (M, n_annot)
        Estimates.

    Raises
    ------
    ValueError
        If annot does not have one row per SNP.
    """
    m, n = self.m, self.n
    # Distance from each SNP to its block's left edge, rounded up to a
    # multiple of the chunk size
    window_sizes = np.ceil(np.array(np.arange(m) - block_left) / c) * c

    if annot is None:
        annot = np.ones((m, 1), dtype="float32")
    elif annot.shape[0] != self.m:
        raise ValueError("Incorrect number of SNPs in annot")

    n_annot = annot.shape[1]  # number of annotations
    cor_sum = np.zeros((m, n_annot), dtype="float32")

    # b = index of first SNP for which SNP 0 is not included in LD Score
    past_first = np.nonzero(block_left > 0)
    b = past_first[0][0] if np.any(past_first) else m
    b = int(np.ceil(b / c) * c)  # round up to a multiple of c
    if b > m:
        c = 1
        b = m

    l_A = 0  # l_A := index of leftmost SNP in matrix A
    A = snp_getter(b)
    rfuncAB = np.zeros((b, c), dtype="float32")
    rfuncBB = np.zeros((c, c), dtype="float32")

    # chunk inside of block
    for l_B in np.arange(0, b, c):  # l_B := index of leftmost SNP in matrix B
        B = A[:, l_B : l_B + c]
        # correlation block, then its transform
        np.dot(A.T, B / n, out=rfuncAB)
        rfuncAB = func(rfuncAB)
        cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])

    # chunk to right of block
    first_b = b
    last_chunk_start = int(c * np.floor(m / c))
    stop = last_chunk_start if last_chunk_start == m else last_chunk_start + 1
    for l_B in tqdm(np.arange(first_b, stop, c), desc="Compute SNP Gene Weight"):
        # update the block
        prev_b = b
        b = int(window_sizes[l_B])
        if b == 0:  # no SNPs to left in window, e.g., after a sequence gap
            A = np.array((), dtype="float32").reshape((n, 0))
            l_A = l_B
        elif l_B == first_b and b > 0:
            A = A[:, first_b - b : first_b]
            l_A = first_b - b
        elif l_B > first_b and b > 0:
            # block_size can't increase more than c
            # block_size can't be less than c unless it is zero
            A = np.hstack((A[:, prev_b - b + c : prev_b], B))
            l_A += prev_b - b + c

        if l_B == last_chunk_start:
            # final, shorter chunk: shrink the work buffers to match
            c = m - last_chunk_start
            rfuncAB = np.zeros((b, c), dtype="float32")
            rfuncBB = np.zeros((c, c), dtype="float32")
        if b != prev_b:
            rfuncAB = np.zeros((b, c), dtype="float32")

        B = snp_getter(c)
        # skip the arithmetic when the annot matrix is all zeros for this
        # block + chunk (this happens w/ sparse categories, i.e., pathways)
        left_annot_empty = np.all(annot[l_A : l_A + b, :] == 0)
        right_annot_empty = np.all(annot[l_B : l_B + c, :] == 0)
        if left_annot_empty and right_annot_empty:
            continue

        np.dot(A.T, B / n, out=rfuncAB)
        rfuncAB = func(rfuncAB)
        cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
        cor_sum[l_B : l_B + c, :] += np.dot(annot[l_A : l_A + b, :].T, rfuncAB).T
        np.dot(B.T, B / n, out=rfuncBB)
        rfuncBB = func(rfuncBB)
        cor_sum[l_B : l_B + c, :] += np.dot(rfuncBB, annot[l_B : l_B + c, :])

    return cor_sum