gsMap 1.71.2__py3-none-any.whl → 1.73.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,48 +1,13 @@
1
- from pathlib import Path
2
1
  import bitarray as ba
3
2
  import numpy as np
4
3
  import pandas as pd
5
- from scipy.sparse import csr_matrix
6
- from scipy.sparse import save_npz, load_npz
7
- from tqdm import trange, tqdm
8
-
9
-
10
- # Define the log class
11
- class Logger(object):
12
- # -
13
- def __init__(self, fh):
14
- self.log_fh = open(fh, 'w')
15
-
16
- # -
17
- def log(self, msg):
18
- '''
19
- Print to log file and stdout.
20
- '''
21
- print(msg, file=self.log_fh)
22
- print(msg)
23
-
24
- # -
25
- def close(self):
26
- self.log_fh.close()
27
-
28
-
29
- # Compute ld-score using cellular annotations
30
- def get_compression(fh):
31
- '''Which sort of compression should we use with read_csv?'''
32
- if fh.endswith('gz'):
33
- compression = 'gzip'
34
- elif fh.endswith('bz2'):
35
- compression = 'bz2'
36
- else:
37
- compression = None
38
- # -
39
- return compression
4
+ from tqdm import tqdm
40
5
 
41
6
 
42
7
  # Define the reading functions
43
8
  def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
44
9
  # -
45
- class IDContainer(object):
10
+ class IDContainer:
46
11
  """
47
12
  A class to read data from a file, store it as a DataFrame, and provide a method for a left outer join operation.
48
13
  """
@@ -66,48 +31,38 @@ def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
66
31
  """
67
32
  end = self.fname_end
68
33
  if end and not fname.endswith(end):
69
- raise ValueError('{f} filename must end in {f}'.format(f=end))
70
- comp = get_compression(fname)
71
- self.df = pd.read_csv(fname, header=self.header, usecols=self.usecols,
72
- sep='\s+', compression=comp)
34
+ raise ValueError(f"{end} filename must end in {end}")
35
+ self.df = pd.read_csv(
36
+ fname,
37
+ header=self.header,
38
+ usecols=self.usecols,
39
+ sep=r"\s+",
40
+ )
73
41
  if self.colnames:
74
42
  self.df.columns = self.colnames
75
43
  if self.keepcol is not None:
76
- self.IDList = self.df.iloc[:, [self.keepcol]].astype('object')
44
+ self.IDList = self.df.iloc[:, [self.keepcol]].astype("object")
77
45
 
78
- # -
79
- def loj(self, externalDf):
80
- """
81
- Perform a left outer join operation with the given external DataFrame.
82
- """
83
- r = externalDf.columns[0]
84
- l = self.IDList.columns[0]
85
- merge_df = externalDf.iloc[:, [0]]
86
- merge_df['keep'] = True
87
- z = pd.merge(self.IDList, merge_df, how='left', left_on=l, right_on=r,
88
- sort=False)
89
- ii = z['keep'] == True
90
- return np.nonzero(ii)[0]
91
-
92
- # -
93
46
  return IDContainer
94
47
 
95
48
 
96
49
  def getBlockLefts(coords, max_dist):
97
- '''
50
+ """
98
51
  Converts coordinates + max block length to the a list of coordinates of the leftmost
99
52
  SNPs to be included in blocks.
53
+
100
54
  Parameters
101
55
  ----------
102
56
  coords : array
103
57
  Array of coordinates. Must be sorted.
104
58
  max_dist : float
105
59
  Maximum distance between SNPs included in the same window.
60
+
106
61
  Returns
107
62
  -------
108
63
  block_left : 1D np.ndarray with same length as block_left
109
64
  block_left[j] := min{k | dist(j, k) < max_dist}.
110
- '''
65
+ """
111
66
  M = len(coords)
112
67
  j = 0
113
68
  block_left = np.zeros(M)
@@ -120,17 +75,19 @@ def getBlockLefts(coords, max_dist):
120
75
 
121
76
 
122
77
  def block_left_to_right(block_left):
123
- '''
78
+ """
124
79
  Converts block lefts to block rights.
80
+
125
81
  Parameters
126
82
  ----------
127
83
  block_left : array
128
84
  Array of block lefts.
85
+
129
86
  Returns
130
87
  -------
131
88
  block_right : 1D np.ndarray with same length as block_left
132
89
  block_right[j] := max {k | block_left[k] <= j}
133
- '''
90
+ """
134
91
  M = len(block_left)
135
92
  j = 0
136
93
  block_right = np.zeros(M)
@@ -142,54 +99,57 @@ def block_left_to_right(block_left):
142
99
  return block_right
143
100
 
144
101
 
145
- class GenotypeArrayInMemory(object):
146
- '''
102
+ class GenotypeArrayInMemory:
103
+ """
147
104
  Parent class for various classes containing interfaces for files with genotype
148
105
  matrices, e.g., plink .bed files, etc
149
- '''
106
+ """
150
107
 
151
108
  def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
152
109
  self.m = len(snp_list.IDList)
153
110
  self.n = n
154
111
  self.keep_snps = keep_snps
155
112
  self.keep_indivs = keep_indivs
156
- self.df = np.array(snp_list.df[['CHR', 'SNP', 'BP', 'CM']])
157
- self.colnames = ['CHR', 'SNP', 'BP', 'CM']
113
+ self.df = np.array(snp_list.df[["CHR", "SNP", "BP", "CM"]])
114
+ self.colnames = ["CHR", "SNP", "BP", "CM"]
158
115
  self.mafMin = mafMin if mafMin is not None else 0
159
116
  self._currentSNP = 0
160
117
  (self.nru, self.geno) = self.__read__(fname, self.m, n)
161
118
  # filter individuals
162
119
  if keep_indivs is not None:
163
- keep_indivs = np.array(keep_indivs, dtype='int')
120
+ keep_indivs = np.array(keep_indivs, dtype="int")
164
121
  if np.any(keep_indivs > self.n):
165
- raise ValueError('keep_indivs indices out of bounds')
122
+ raise ValueError("keep_indivs indices out of bounds")
166
123
  # -
167
- (self.geno, self.m, self.n) = self.__filter_indivs__(self.geno, keep_indivs, self.m, self.n)
124
+ (self.geno, self.m, self.n) = self.__filter_indivs__(
125
+ self.geno, keep_indivs, self.m, self.n
126
+ )
168
127
  # -
169
128
  if self.n > 0:
170
- print('After filtering, {n} individuals remain'.format(n=self.n))
129
+ print(f"After filtering, {self.n} individuals remain")
171
130
  else:
172
- raise ValueError('After filtering, no individuals remain')
131
+ raise ValueError("After filtering, no individuals remain")
173
132
  # -
174
133
  # filter SNPs
175
134
  if keep_snps is not None:
176
- keep_snps = np.array(keep_snps, dtype='int')
135
+ keep_snps = np.array(keep_snps, dtype="int")
177
136
  if np.any(keep_snps > self.m): # if keep_snps is None, this returns False
178
- raise ValueError('keep_snps indices out of bounds')
137
+ raise ValueError("keep_snps indices out of bounds")
179
138
  # -
180
139
  (self.geno, self.m, self.n, self.kept_snps, self.freq) = self.__filter_snps_maf__(
181
- self.geno, self.m, self.n, self.mafMin, keep_snps)
140
+ self.geno, self.m, self.n, self.mafMin, keep_snps
141
+ )
182
142
  # -
183
143
  if self.m > 0:
184
- print('After filtering, {m} SNPs remain'.format(m=self.m))
144
+ print(f"After filtering, {self.m} SNPs remain")
185
145
  else:
186
- raise ValueError('After filtering, no SNPs remain')
146
+ raise ValueError("After filtering, no SNPs remain")
187
147
  # -
188
148
  self.df = self.df[self.kept_snps, :]
189
149
  self.maf = np.minimum(self.freq, np.ones(self.m) - self.freq)
190
150
  self.sqrtpq = np.sqrt(self.freq * (np.ones(self.m) - self.freq))
191
151
  self.df = np.c_[self.df, self.maf]
192
- self.colnames.append('MAF')
152
+ self.colnames.append("MAF")
193
153
 
194
154
  # -
195
155
  def __read__(self, fname, m, n):
@@ -208,8 +168,11 @@ class GenotypeArrayInMemory(object):
208
168
 
209
169
  # -
210
170
  def ldScoreVarBlocks(self, block_left, c, annot=None):
211
- '''Computes an unbiased estimate of L2(j) for j=1,..,M.'''
212
- func = lambda x: self.__l2_unbiased__(x, self.n)
171
+ """Computes an unbiased estimate of L2(j) for j=1,..,M."""
172
+
173
+ def func(x):
174
+ return self.__l2_unbiased__(x, self.n)
175
+
213
176
  snp_getter = self.nextSNPs
214
177
  return self.__corSumVarBlocks__(block_left, c, func, snp_getter, annot)
215
178
 
@@ -225,7 +188,7 @@ class GenotypeArrayInMemory(object):
225
188
  # Methods for calculating sums of Pearson correlation coefficients (i.e.,ld-score)
226
189
  # c stands for the chunk size (default = 50)
227
190
  def __corSumVarBlocks__(self, block_left, c, func, snp_getter, annot=None):
228
- '''
191
+ """
229
192
  Parameters
230
193
  ----------
231
194
  block_left : np.ndarray with shape (M, )
@@ -243,11 +206,12 @@ class GenotypeArrayInMemory(object):
243
206
  The method to be used to get the next SNPs
244
207
  annot: numpy array with shape (m,n_a)
245
208
  SNP annotations.
209
+
246
210
  Returns
247
211
  -------
248
212
  cor_sum : np.ndarray with shape (M, num_annots)
249
213
  Estimates.
250
- '''
214
+ """
251
215
  m, n = self.m, self.n
252
216
  block_sizes = np.array(np.arange(m) - block_left)
253
217
  block_sizes = np.ceil(block_sizes / c) * c
@@ -256,7 +220,7 @@ class GenotypeArrayInMemory(object):
256
220
  else:
257
221
  annot_m = annot.shape[0]
258
222
  if annot_m != self.m:
259
- raise ValueError('Incorrect number of SNPs in annot')
223
+ raise ValueError("Incorrect number of SNPs in annot")
260
224
  # -
261
225
  n_a = annot.shape[1] # number of annotations
262
226
  cor_sum = np.zeros((m, n_a))
@@ -277,18 +241,18 @@ class GenotypeArrayInMemory(object):
277
241
  rfuncBB = np.zeros((c, c))
278
242
  # chunk inside of block
279
243
  for l_B in np.arange(0, b, c): # l_B := index of leftmost SNP in matrix B
280
- B = A[:, l_B:l_B + c]
244
+ B = A[:, l_B : l_B + c]
281
245
  # ld matrix
282
246
  np.dot(A.T, B / n, out=rfuncAB)
283
247
  # ld matrix square
284
248
  rfuncAB = func(rfuncAB)
285
- cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
249
+ cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
286
250
 
287
251
  # chunk to right of block
288
252
  b0 = b
289
253
  md = int(c * np.floor(m / c))
290
254
  end = md + 1 if md != m else md
291
- for l_B in tqdm(np.arange(b0, end, c), desc=f'Compute SNP Gene Weight'):
255
+ for l_B in tqdm(np.arange(b0, end, c), desc="Compute SNP Gene Weight"):
292
256
  # check if the annot matrix is all zeros for this block + chunk
293
257
  # this happens w/ sparse categories (i.e., pathways)
294
258
  # update the block
@@ -298,10 +262,10 @@ class GenotypeArrayInMemory(object):
298
262
  # block_size can't increase more than c
299
263
  # block_size can't be less than c unless it is zero
300
264
  # both of these things make sense
301
- A = np.hstack((A[:, old_b - b + c:old_b], B))
265
+ A = np.hstack((A[:, old_b - b + c : old_b], B))
302
266
  l_A += old_b - b + c
303
267
  elif l_B == b0 and b > 0:
304
- A = A[:, b0 - b:b0]
268
+ A = A[:, b0 - b : b0]
305
269
  l_A = b0 - b
306
270
  elif b == 0: # no SNPs to left in window, e.g., after a sequence gap
307
271
  A = np.array(()).reshape((n, 0))
@@ -314,44 +278,45 @@ class GenotypeArrayInMemory(object):
314
278
  rfuncAB = np.zeros((b, c))
315
279
  # -
316
280
  B = snp_getter(c)
317
- p1 = np.all(annot[l_A:l_A + b, :] == 0)
318
- p2 = np.all(annot[l_B:l_B + c, :] == 0)
281
+ p1 = np.all(annot[l_A : l_A + b, :] == 0)
282
+ p2 = np.all(annot[l_B : l_B + c, :] == 0)
319
283
  if p1 and p2:
320
284
  continue
321
285
  # -
322
286
  np.dot(A.T, B / n, out=rfuncAB)
323
287
  rfuncAB = func(rfuncAB)
324
- cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
325
- cor_sum[l_B:l_B + c, :] += np.dot(annot[l_A:l_A + b, :].T, rfuncAB).T
288
+ cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
289
+ cor_sum[l_B : l_B + c, :] += np.dot(annot[l_A : l_A + b, :].T, rfuncAB).T
326
290
  np.dot(B.T, B / n, out=rfuncBB)
327
291
  rfuncBB = func(rfuncBB)
328
- cor_sum[l_B:l_B + c, :] += np.dot(rfuncBB, annot[l_B:l_B + c, :])
292
+ cor_sum[l_B : l_B + c, :] += np.dot(rfuncBB, annot[l_B : l_B + c, :])
329
293
  # -
330
294
  return cor_sum
331
295
 
332
296
 
333
297
  class PlinkBEDFile(GenotypeArrayInMemory):
334
- '''
298
+ """
335
299
  Interface for Plink .bed format
336
- '''
300
+ """
337
301
 
338
302
  def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
339
303
  self._bedcode = {
340
- 2: ba.bitarray('11'),
341
- 9: ba.bitarray('10'),
342
- 1: ba.bitarray('01'),
343
- 0: ba.bitarray('00')
304
+ 2: ba.bitarray("11"),
305
+ 9: ba.bitarray("10"),
306
+ 1: ba.bitarray("01"),
307
+ 0: ba.bitarray("00"),
344
308
  }
345
309
  # -
346
- GenotypeArrayInMemory.__init__(self, fname, n, snp_list, keep_snps=keep_snps, keep_indivs=keep_indivs,
347
- mafMin=mafMin)
310
+ GenotypeArrayInMemory.__init__(
311
+ self, fname, n, snp_list, keep_snps=keep_snps, keep_indivs=keep_indivs, mafMin=mafMin
312
+ )
348
313
 
349
314
  # -
350
315
  def __read__(self, fname, m, n):
351
- if not fname.endswith('.bed'):
352
- raise ValueError('.bed filename must end in .bed')
316
+ if not fname.endswith(".bed"):
317
+ raise ValueError(".bed filename must end in .bed")
353
318
  # -
354
- fh = open(fname, 'rb')
319
+ fh = open(fname, "rb")
355
320
  magicNumber = ba.bitarray(endian="little")
356
321
  magicNumber.fromfile(fh, 2)
357
322
  bedMode = ba.bitarray(endian="little")
@@ -360,11 +325,11 @@ class PlinkBEDFile(GenotypeArrayInMemory):
360
325
  nru = n + e
361
326
  self.nru = nru
362
327
  # check magic number
363
- if magicNumber != ba.bitarray('0011011011011000'):
364
- raise IOError("Magic number from Plink .bed file not recognized")
328
+ if magicNumber != ba.bitarray("0011011011011000"):
329
+ raise OSError("Magic number from Plink .bed file not recognized")
365
330
  # -
366
- if bedMode != ba.bitarray('10000000'):
367
- raise IOError("Plink .bed file must be in default SNP-major mode")
331
+ if bedMode != ba.bitarray("10000000"):
332
+ raise OSError("Plink .bed file must be in default SNP-major mode")
368
333
  # check file length
369
334
  self.geno = ba.bitarray(endian="little")
370
335
  self.geno.fromfile(fh)
@@ -377,7 +342,7 @@ class PlinkBEDFile(GenotypeArrayInMemory):
377
342
  real_len = len(geno)
378
343
  if real_len != exp_len:
379
344
  s = "Plink .bed file has {n1} bits, expected {n2}"
380
- raise IOError(s.format(n1=real_len, n2=exp_len))
345
+ raise OSError(s.format(n1=real_len, n2=exp_len))
381
346
 
382
347
  # -
383
348
  def __filter_indivs__(self, geno, keep_indivs, m, n):
@@ -388,14 +353,14 @@ class PlinkBEDFile(GenotypeArrayInMemory):
388
353
  z = ba.bitarray(m * 2 * nru_new, endian="little")
389
354
  z.setall(0)
390
355
  for e, i in enumerate(keep_indivs):
391
- z[2 * e::2 * nru_new] = geno[2 * i::2 * nru]
392
- z[2 * e + 1::2 * nru_new] = geno[2 * i + 1::2 * nru]
356
+ z[2 * e :: 2 * nru_new] = geno[2 * i :: 2 * nru]
357
+ z[2 * e + 1 :: 2 * nru_new] = geno[2 * i + 1 :: 2 * nru]
393
358
  self.nru = nru_new
394
359
  return (z, m, n_new)
395
360
 
396
361
  # -
397
362
  def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps):
398
- '''
363
+ """
399
364
  Credit to Chris Chang and the Plink2 developers for this algorithm
400
365
  Modified from plink_filter.c
401
366
  https://github.com/chrchang/plink-ng/blob/master/plink_filter.c
@@ -414,7 +379,7 @@ class PlinkBEDFile(GenotypeArrayInMemory):
414
379
  major allele frequency = (b+c)/(2*(n-a+c))
415
380
  het ct + missing ct = a + b - 2*c
416
381
  Why does bitarray not have >> ????
417
- '''
382
+ """
418
383
  nru = self.nru
419
384
  m_poly = 0
420
385
  y = ba.bitarray()
@@ -423,7 +388,7 @@ class PlinkBEDFile(GenotypeArrayInMemory):
423
388
  kept_snps = []
424
389
  freq = []
425
390
  for e, j in enumerate(keep_snps):
426
- z = geno[2 * nru * j:2 * nru * (j + 1)]
391
+ z = geno[2 * nru * j : 2 * nru * (j + 1)]
427
392
  A = z[0::2]
428
393
  a = A.count()
429
394
  B = z[1::2]
@@ -443,9 +408,10 @@ class PlinkBEDFile(GenotypeArrayInMemory):
443
408
 
444
409
  # -
445
410
  def nextSNPs(self, b, minorRef=None):
446
- '''
411
+ """
447
412
  Unpacks the binary array of genotypes and returns an n x b matrix of floats of
448
413
  normalized genotypes for the next b SNPs, where n := number of samples.
414
+
449
415
  Parameters
450
416
  ----------
451
417
  b : int
@@ -453,29 +419,30 @@ class PlinkBEDFile(GenotypeArrayInMemory):
453
419
  minorRef: bool, default None
454
420
  Should we flip reference alleles so that the minor allele is the reference?
455
421
  (This is useful for computing l1 w.r.t. minor allele).
422
+
456
423
  Returns
457
424
  -------
458
425
  X : np.array with dtype float64 with shape (n, b), where n := number of samples
459
426
  Matrix of genotypes normalized to mean zero and variance one. If minorRef is
460
427
  not None, then the minor allele will be the positive allele (i.e., two copies
461
428
  of the minor allele --> a positive number).
462
- '''
429
+ """
463
430
  # -
464
431
  try:
465
432
  b = int(b)
466
433
  if b <= 0:
467
434
  raise ValueError("b must be > 0")
468
- except TypeError:
469
- raise TypeError("b must be an integer")
435
+ except TypeError as e:
436
+ raise TypeError("b must be an integer") from e
470
437
  # -
471
438
  if self._currentSNP + b > self.m:
472
- s = '{b} SNPs requested, {k} SNPs remain'
439
+ s = "{b} SNPs requested, {k} SNPs remain"
473
440
  raise ValueError(s.format(b=b, k=(self.m - self._currentSNP)))
474
441
  # -
475
442
  c = self._currentSNP
476
443
  n = self.n
477
444
  nru = self.nru
478
- slice = self.geno[2 * c * nru:2 * (c + b) * nru]
445
+ slice = self.geno[2 * c * nru : 2 * (c + b) * nru]
479
446
  X = np.array(slice.decode(self._bedcode), dtype="float64").reshape((b, nru)).T
480
447
  X = X[0:n, :]
481
448
  Y = np.zeros(X.shape)
@@ -498,238 +465,25 @@ class PlinkBEDFile(GenotypeArrayInMemory):
498
465
  return Y
499
466
 
500
467
 
501
- class PlinkBEDFileWithR2Cache(PlinkBEDFile):
502
- def compute_r2_cache(self,
503
- block_left,
504
- output_cache_file_dir: Path,
505
- chunk_size=500_000_000,
506
- c=500,
507
- r2_threshold=1e-4,
508
- annot=None):
509
-
510
- func = np.square
511
- snp_getter = self.nextSNPs
512
- data, rows, cols = [], [], []
513
-
514
- def add_rfuncAB(rfuncAB, l_A, l_B):
515
- non_zero_indices = np.nonzero(rfuncAB > r2_threshold)
516
- data.extend(rfuncAB[non_zero_indices])
517
- rows.extend(l_A + non_zero_indices[0])
518
- cols.extend(l_B + non_zero_indices[1])
519
-
520
- # def add_rfuncAB(rfuncAB, l_A, l_B):
521
- # # not need select non zero indices
522
- # data.extend(rfuncAB.flatten())
523
- # rows.extend(l_A + np.repeat(np.arange(rfuncAB.shape[0]), rfuncAB.shape[1]))
524
- # cols.extend(l_B + np.tile(np.arange(rfuncAB.shape[1]), rfuncAB.shape[0]))
525
-
526
- # def add_rfuncBB(rfuncBB, l_B):
527
- # non_zero_indices = np.nonzero(rfuncBB)
528
- # data.extend(rfuncBB[non_zero_indices])
529
- # rows.extend(l_B + non_zero_indices[0])
530
- # cols.extend(l_B + non_zero_indices[1])
531
-
532
- def add_rfuncBB(rfuncBB, l_B):
533
- non_zero_indices = np.nonzero(rfuncBB > r2_threshold)
534
- data.extend(rfuncBB[non_zero_indices])
535
- rows.extend(l_B + non_zero_indices[0])
536
- cols.extend(l_B + non_zero_indices[1])
537
- if len(data) > chunk_size:
538
- # save the cache
539
- print(f'Start saving the cache file: {output_cache_file_dir / f"{l_B}.npz"}')
540
- r2_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(self.m, self.m), dtype='float16')
541
- save_npz(output_cache_file_dir / f'{l_B}.npz', r2_sparse_matrix)
542
- # reset the data
543
- data.clear()
544
- rows.clear()
545
- cols.clear()
546
-
547
- m, n = self.m, self.n
548
- block_sizes = np.array(np.arange(m) - block_left)
549
- block_sizes = np.ceil(block_sizes / c) * c
550
- if annot is None:
551
- annot = np.ones((m, 1))
552
- else:
553
- annot_m = annot.shape[0]
554
- if annot_m != self.m:
555
- raise ValueError('Incorrect number of SNPs in annot')
556
- # -
557
- n_a = annot.shape[1] # number of annotations
558
- # cor_sum = np.zeros((m, n_a))
559
- # b = index of first SNP for which SNP 0 is not included in LD Score
560
- b = np.nonzero(block_left > 0)
561
- if np.any(b):
562
- b = b[0][0]
563
- else:
564
- b = m
565
- b = int(np.ceil(b / c) * c) # round up to a multiple of c
566
- if b > m:
567
- c = 1
568
- b = m
569
-
570
- l_A = 0 # l_A := index of leftmost SNP in matrix A
571
- A = snp_getter(b)
572
- rfuncAB = np.zeros((b, c))
573
- rfuncBB = np.zeros((c, c))
574
- # chunk inside of block
575
- for l_B in np.arange(0, b, c): # l_B := index of leftmost SNP in matrix B
576
- B = A[:, l_B:l_B + c]
577
- # ld matrix
578
- np.dot(A.T, B / n, out=rfuncAB)
579
- # ld matrix square
580
- rfuncAB = func(rfuncAB)
581
- add_rfuncAB(rfuncAB, l_A, l_B)
582
- # cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
468
+ def load_bfile(bfile_chr_prefix):
469
+ PlinkBIMFile = ID_List_Factory(
470
+ ["CHR", "SNP", "CM", "BP", "A1", "A2"], 1, ".bim", usecols=[0, 1, 2, 3, 4, 5]
471
+ )
472
+ PlinkFAMFile = ID_List_Factory(["IID"], 0, ".fam", usecols=[1])
583
473
 
584
- # chunk to right of block
585
- b0 = b
586
- md = int(c * np.floor(m / c))
587
- end = md + 1 if md != m else md
588
- for l_B in trange(b0, end, c, desc=f'Compute r2 cache for {output_cache_file_dir.name}'):
589
- # check if the annot matrix is all zeros for this block + chunk
590
- # this happens w/ sparse categories (i.e., pathways)
591
- # update the block
592
- old_b = b
593
- b = int(block_sizes[l_B])
594
- if l_B > b0 and b > 0:
595
- # block_size can't increase more than c
596
- # block_size can't be less than c unless it is zero
597
- # both of these things make sense
598
- A = np.hstack((A[:, old_b - b + c:old_b], B))
599
- l_A += old_b - b + c
600
- elif l_B == b0 and b > 0:
601
- A = A[:, b0 - b:b0]
602
- l_A = b0 - b
603
- elif b == 0: # no SNPs to left in window, e.g., after a sequence gap
604
- A = np.array(()).reshape((n, 0))
605
- l_A = l_B
606
- if l_B == md:
607
- c = m - md
608
- rfuncAB = np.zeros((b, c))
609
- rfuncBB = np.zeros((c, c))
610
- if b != old_b:
611
- rfuncAB = np.zeros((b, c))
612
- # -
613
- B = snp_getter(c)
614
- p1 = np.all(annot[l_A:l_A + b, :] == 0)
615
- p2 = np.all(annot[l_B:l_B + c, :] == 0)
616
- if p1 and p2:
617
- continue
618
- # -
619
- np.dot(A.T, B / n, out=rfuncAB)
620
- rfuncAB = func(rfuncAB)
621
- # cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
622
- # cor_sum[l_B:l_B + c, :] += np.dot(annot[l_A:l_A + b, :].T, rfuncAB).T
623
- add_rfuncAB(rfuncAB, l_A, l_B)
624
- add_rfuncAB(rfuncAB.T, l_B, l_A)
625
- np.dot(B.T, B / n, out=rfuncBB)
626
- rfuncBB = func(rfuncBB)
627
- # cor_sum[l_B:l_B + c, :] += np.dot(rfuncBB, annot[l_B:l_B + c, :])
628
- add_rfuncBB(rfuncBB, l_B)
629
- if len(data) > 0:
630
- # save remaining data
631
- # save the cache
632
- print(f'Start saving the cache file: {output_cache_file_dir / f"{l_B}.npz"}')
633
- r2_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(m, m), dtype='float16')
634
- save_npz(output_cache_file_dir / f'{l_B}.npz', r2_sparse_matrix)
635
- # combine the cache files
636
- print(f'Start combining the cache files in {output_cache_file_dir}')
637
- cached_r2_matrix_files = list(output_cache_file_dir.glob('*.npz'))
638
- combined_r2_matrix_files = self.load_r2_matrix_from_cache_files(output_cache_file_dir)
639
- # remove the cache files
640
- for cached_r2_matrix_file in cached_r2_matrix_files:
641
- cached_r2_matrix_file.unlink()
642
- # save the combined r2 matrix
643
- print(f'Start saving the combined r2 matrix in {output_cache_file_dir}')
644
- combined_r2_matrix_file = output_cache_file_dir / 'combined_r2_matrix.npz'
645
- save_npz(combined_r2_matrix_file, combined_r2_matrix_files)
646
-
647
- def get_ldscore_using_r2_cache(self, annot_matrix, cached_r2_matrix_dir):
648
- """
649
- Compute the r2 matrix multiplication with annot_matrix
650
- """
651
- # Compute the r2 matrix multiplication with annot_matrix
652
- cached_r2_matrix_dir = Path(cached_r2_matrix_dir)
653
- # iter the cached r2 matrix files
654
- result_matrix = np.zeros((self.m, annot_matrix.shape[1]))
655
- cached_r2_matrix_files = list(cached_r2_matrix_dir.glob('*.npz'))
656
- assert len(cached_r2_matrix_files) > 0, (f'No cached r2 matrix files in {cached_r2_matrix_dir}'
657
- f'Please run the function compute_r2_cache first!')
658
- for r2_matrix_file in tqdm(cached_r2_matrix_files, desc=f'Compute ld score for {cached_r2_matrix_dir.name}'):
659
- print(f'Compute r2 matrix multiplication for {r2_matrix_file}')
660
- r2_matrix = load_npz(r2_matrix_file)
661
- result_matrix += r2_matrix.dot(annot_matrix)
662
- return result_matrix
663
-
664
- def load_r2_matrix_from_cache_files(self, cached_r2_matrix_dir):
665
- """
666
- Load the r2 matrix from cache
667
- """
668
- cached_r2_matrix_dir = Path(cached_r2_matrix_dir)
669
- # iter the cached r2 matrix files
670
- cached_r2_matrix_files = list(cached_r2_matrix_dir.glob('*.npz'))
671
- assert len(cached_r2_matrix_files) > 0, (f'No cached r2 matrix files in {cached_r2_matrix_dir}'
672
- f'Please run the function compute_r2_cache first!')
673
- # load the r2 matrix
674
- r2_matrix = load_npz(cached_r2_matrix_files[0])
675
- for r2_matrix_file in tqdm(cached_r2_matrix_files[1:], desc=f'Load r2 matrix from {cached_r2_matrix_dir.name}'):
676
- print(f'Load r2 matrix from {r2_matrix_file}')
677
- r2_matrix += load_npz(r2_matrix_file)
678
- # to float16
679
- r2_matrix = r2_matrix.astype('float16')
680
- return r2_matrix
681
- def load_combined_r2_matrix(self, cached_r2_matrix_dir):
682
- """
683
- Load the combined r2 matrix
684
- """
685
- combined_r2_matrix_file = Path(cached_r2_matrix_dir) / 'combined_r2_matrix.npz'
686
- assert combined_r2_matrix_file.exists(), (f'No combined r2 matrix file in {cached_r2_matrix_dir}'
687
- f'Should delete the cache files and run the function compute_r2_cache first!')
688
- # load the r2 matrix
689
- r2_matrix = load_npz(combined_r2_matrix_file)
690
- # to float16
691
- r2_matrix = r2_matrix.astype('float16')
692
- return r2_matrix
474
+ snp_file = bfile_chr_prefix + ".bim"
475
+ array_snps = PlinkBIMFile(snp_file)
693
476
 
694
- def load_bfile(bfile_chr_prefix):
695
- PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
696
- PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])
697
-
698
- snp_file, snp_obj = bfile_chr_prefix + '.bim', PlinkBIMFile
699
- array_snps = snp_obj(snp_file)
700
- m = len(array_snps.IDList)
701
- print(f'Read list of {m} SNPs from {snp_file}')
702
- #
703
477
  # Load fam
704
- ind_file, ind_obj = bfile_chr_prefix + '.fam', PlinkFAMFile
705
- array_indivs = ind_obj(ind_file)
478
+ ind_file = bfile_chr_prefix + ".fam"
479
+ array_indivs = PlinkFAMFile(ind_file)
480
+
706
481
  n = len(array_indivs.IDList)
707
- print(f'Read list of {n} individuals from {ind_file}')
708
482
 
709
483
  # Load genotype array
710
- array_file, array_obj = bfile_chr_prefix + '.bed', PlinkBEDFileWithR2Cache
711
- geno_array = array_obj(array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None)
484
+ array_file = bfile_chr_prefix + ".bed"
485
+ geno_array = PlinkBEDFile(
486
+ array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None
487
+ )
712
488
 
713
489
  return array_snps, array_indivs, geno_array
714
-
715
-
716
- def generate_r2_matrix_chr_cache(bfile_chr_prefix, ld_wind_cm, output_cache_file_dir):
717
- # Load genotype array
718
- array_snps, array_indivs, geno_array = load_bfile(bfile_chr_prefix)
719
- # Compute block lefts
720
- block_left = getBlockLefts(geno_array.df[:, 3], ld_wind_cm)
721
- # Compute LD score
722
- r2_matrix = geno_array.load_r2_matrix_from_cache(output_cache_file_dir)
723
-
724
-
725
- def generate_r2_matrix_cache(bfile_prefix, chromosome_list, r2_cache_dir, ld_wind_cm=1):
726
- r2_cache_dir = Path(r2_cache_dir)
727
-
728
- for chr in chromosome_list:
729
- output_cache_file_prefix = r2_cache_dir / f'chr{chr}'
730
- output_cache_file_prefix.mkdir(parents=True, exist_ok=True)
731
- bfile_chr_prefix = bfile_prefix + '.' + str(chr)
732
- generate_r2_matrix_chr_cache(bfile_chr_prefix,
733
- ld_wind_cm=ld_wind_cm,
734
- output_cache_file_dir=output_cache_file_prefix)
735
- print(f'Compute r2 matrix for chr{chr} done!')