gsMap 1.71.2__py3-none-any.whl → 1.72.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,23 +1,23 @@
1
1
  from pathlib import Path
2
+
2
3
  import bitarray as ba
3
4
  import numpy as np
4
5
  import pandas as pd
5
- from scipy.sparse import csr_matrix
6
- from scipy.sparse import save_npz, load_npz
7
- from tqdm import trange, tqdm
6
+ from scipy.sparse import csr_matrix, load_npz, save_npz
7
+ from tqdm import tqdm, trange
8
8
 
9
9
 
10
10
  # Define the log class
11
- class Logger(object):
11
+ class Logger:
12
12
  # -
13
13
  def __init__(self, fh):
14
- self.log_fh = open(fh, 'w')
14
+ self.log_fh = open(fh, "w")
15
15
 
16
16
  # -
17
17
  def log(self, msg):
18
- '''
18
+ """
19
19
  Print to log file and stdout.
20
- '''
20
+ """
21
21
  print(msg, file=self.log_fh)
22
22
  print(msg)
23
23
 
@@ -28,11 +28,11 @@ class Logger(object):
28
28
 
29
29
  # Compute ld-score using cellular annotations
30
30
  def get_compression(fh):
31
- '''Which sort of compression should we use with read_csv?'''
32
- if fh.endswith('gz'):
33
- compression = 'gzip'
34
- elif fh.endswith('bz2'):
35
- compression = 'bz2'
31
+ """Which sort of compression should we use with read_csv?"""
32
+ if fh.endswith("gz"):
33
+ compression = "gzip"
34
+ elif fh.endswith("bz2"):
35
+ compression = "bz2"
36
36
  else:
37
37
  compression = None
38
38
  # -
@@ -42,7 +42,7 @@ def get_compression(fh):
42
42
  # Define the reading functions
43
43
  def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
44
44
  # -
45
- class IDContainer(object):
45
+ class IDContainer:
46
46
  """
47
47
  A class to read data from a file, store it as a DataFrame, and provide a method for a left outer join operation.
48
48
  """
@@ -66,14 +66,15 @@ def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
66
66
  """
67
67
  end = self.fname_end
68
68
  if end and not fname.endswith(end):
69
- raise ValueError('{f} filename must end in {f}'.format(f=end))
69
+ raise ValueError(f"{end} filename must end in {end}")
70
70
  comp = get_compression(fname)
71
- self.df = pd.read_csv(fname, header=self.header, usecols=self.usecols,
72
- sep='\s+', compression=comp)
71
+ self.df = pd.read_csv(
72
+ fname, header=self.header, usecols=self.usecols, sep=r"\s+", compression=comp
73
+ )
73
74
  if self.colnames:
74
75
  self.df.columns = self.colnames
75
76
  if self.keepcol is not None:
76
- self.IDList = self.df.iloc[:, [self.keepcol]].astype('object')
77
+ self.IDList = self.df.iloc[:, [self.keepcol]].astype("object")
77
78
 
78
79
  # -
79
80
  def loj(self, externalDf):
@@ -83,10 +84,9 @@ def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
83
84
  r = externalDf.columns[0]
84
85
  l = self.IDList.columns[0]
85
86
  merge_df = externalDf.iloc[:, [0]]
86
- merge_df['keep'] = True
87
- z = pd.merge(self.IDList, merge_df, how='left', left_on=l, right_on=r,
88
- sort=False)
89
- ii = z['keep'] == True
87
+ merge_df["keep"] = True
88
+ z = pd.merge(self.IDList, merge_df, how="left", left_on=l, right_on=r, sort=False)
89
+ ii = z["keep"]
90
90
  return np.nonzero(ii)[0]
91
91
 
92
92
  # -
@@ -94,20 +94,22 @@ def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
94
94
 
95
95
 
96
96
  def getBlockLefts(coords, max_dist):
97
- '''
97
+ """
98
98
  Converts coordinates + max block length to the a list of coordinates of the leftmost
99
99
  SNPs to be included in blocks.
100
+
100
101
  Parameters
101
102
  ----------
102
103
  coords : array
103
104
  Array of coordinates. Must be sorted.
104
105
  max_dist : float
105
106
  Maximum distance between SNPs included in the same window.
107
+
106
108
  Returns
107
109
  -------
108
110
  block_left : 1D np.ndarray with same length as block_left
109
111
  block_left[j] := min{k | dist(j, k) < max_dist}.
110
- '''
112
+ """
111
113
  M = len(coords)
112
114
  j = 0
113
115
  block_left = np.zeros(M)
@@ -120,17 +122,19 @@ def getBlockLefts(coords, max_dist):
120
122
 
121
123
 
122
124
  def block_left_to_right(block_left):
123
- '''
125
+ """
124
126
  Converts block lefts to block rights.
127
+
125
128
  Parameters
126
129
  ----------
127
130
  block_left : array
128
131
  Array of block lefts.
132
+
129
133
  Returns
130
134
  -------
131
135
  block_right : 1D np.ndarray with same length as block_left
132
136
  block_right[j] := max {k | block_left[k] <= j}
133
- '''
137
+ """
134
138
  M = len(block_left)
135
139
  j = 0
136
140
  block_right = np.zeros(M)
@@ -142,54 +146,57 @@ def block_left_to_right(block_left):
142
146
  return block_right
143
147
 
144
148
 
145
- class GenotypeArrayInMemory(object):
146
- '''
149
+ class GenotypeArrayInMemory:
150
+ """
147
151
  Parent class for various classes containing interfaces for files with genotype
148
152
  matrices, e.g., plink .bed files, etc
149
- '''
153
+ """
150
154
 
151
155
  def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
152
156
  self.m = len(snp_list.IDList)
153
157
  self.n = n
154
158
  self.keep_snps = keep_snps
155
159
  self.keep_indivs = keep_indivs
156
- self.df = np.array(snp_list.df[['CHR', 'SNP', 'BP', 'CM']])
157
- self.colnames = ['CHR', 'SNP', 'BP', 'CM']
160
+ self.df = np.array(snp_list.df[["CHR", "SNP", "BP", "CM"]])
161
+ self.colnames = ["CHR", "SNP", "BP", "CM"]
158
162
  self.mafMin = mafMin if mafMin is not None else 0
159
163
  self._currentSNP = 0
160
164
  (self.nru, self.geno) = self.__read__(fname, self.m, n)
161
165
  # filter individuals
162
166
  if keep_indivs is not None:
163
- keep_indivs = np.array(keep_indivs, dtype='int')
167
+ keep_indivs = np.array(keep_indivs, dtype="int")
164
168
  if np.any(keep_indivs > self.n):
165
- raise ValueError('keep_indivs indices out of bounds')
169
+ raise ValueError("keep_indivs indices out of bounds")
166
170
  # -
167
- (self.geno, self.m, self.n) = self.__filter_indivs__(self.geno, keep_indivs, self.m, self.n)
171
+ (self.geno, self.m, self.n) = self.__filter_indivs__(
172
+ self.geno, keep_indivs, self.m, self.n
173
+ )
168
174
  # -
169
175
  if self.n > 0:
170
- print('After filtering, {n} individuals remain'.format(n=self.n))
176
+ print(f"After filtering, {self.n} individuals remain")
171
177
  else:
172
- raise ValueError('After filtering, no individuals remain')
178
+ raise ValueError("After filtering, no individuals remain")
173
179
  # -
174
180
  # filter SNPs
175
181
  if keep_snps is not None:
176
- keep_snps = np.array(keep_snps, dtype='int')
182
+ keep_snps = np.array(keep_snps, dtype="int")
177
183
  if np.any(keep_snps > self.m): # if keep_snps is None, this returns False
178
- raise ValueError('keep_snps indices out of bounds')
184
+ raise ValueError("keep_snps indices out of bounds")
179
185
  # -
180
186
  (self.geno, self.m, self.n, self.kept_snps, self.freq) = self.__filter_snps_maf__(
181
- self.geno, self.m, self.n, self.mafMin, keep_snps)
187
+ self.geno, self.m, self.n, self.mafMin, keep_snps
188
+ )
182
189
  # -
183
190
  if self.m > 0:
184
- print('After filtering, {m} SNPs remain'.format(m=self.m))
191
+ print(f"After filtering, {self.m} SNPs remain")
185
192
  else:
186
- raise ValueError('After filtering, no SNPs remain')
193
+ raise ValueError("After filtering, no SNPs remain")
187
194
  # -
188
195
  self.df = self.df[self.kept_snps, :]
189
196
  self.maf = np.minimum(self.freq, np.ones(self.m) - self.freq)
190
197
  self.sqrtpq = np.sqrt(self.freq * (np.ones(self.m) - self.freq))
191
198
  self.df = np.c_[self.df, self.maf]
192
- self.colnames.append('MAF')
199
+ self.colnames.append("MAF")
193
200
 
194
201
  # -
195
202
  def __read__(self, fname, m, n):
@@ -208,8 +215,11 @@ class GenotypeArrayInMemory(object):
208
215
 
209
216
  # -
210
217
  def ldScoreVarBlocks(self, block_left, c, annot=None):
211
- '''Computes an unbiased estimate of L2(j) for j=1,..,M.'''
212
- func = lambda x: self.__l2_unbiased__(x, self.n)
218
+ """Computes an unbiased estimate of L2(j) for j=1,..,M."""
219
+
220
+ def func(x):
221
+ return self.__l2_unbiased__(x, self.n)
222
+
213
223
  snp_getter = self.nextSNPs
214
224
  return self.__corSumVarBlocks__(block_left, c, func, snp_getter, annot)
215
225
 
@@ -225,7 +235,7 @@ class GenotypeArrayInMemory(object):
225
235
  # Methods for calculating sums of Pearson correlation coefficients (i.e.,ld-score)
226
236
  # c stands for the chunk size (default = 50)
227
237
  def __corSumVarBlocks__(self, block_left, c, func, snp_getter, annot=None):
228
- '''
238
+ """
229
239
  Parameters
230
240
  ----------
231
241
  block_left : np.ndarray with shape (M, )
@@ -243,11 +253,12 @@ class GenotypeArrayInMemory(object):
243
253
  The method to be used to get the next SNPs
244
254
  annot: numpy array with shape (m,n_a)
245
255
  SNP annotations.
256
+
246
257
  Returns
247
258
  -------
248
259
  cor_sum : np.ndarray with shape (M, num_annots)
249
260
  Estimates.
250
- '''
261
+ """
251
262
  m, n = self.m, self.n
252
263
  block_sizes = np.array(np.arange(m) - block_left)
253
264
  block_sizes = np.ceil(block_sizes / c) * c
@@ -256,7 +267,7 @@ class GenotypeArrayInMemory(object):
256
267
  else:
257
268
  annot_m = annot.shape[0]
258
269
  if annot_m != self.m:
259
- raise ValueError('Incorrect number of SNPs in annot')
270
+ raise ValueError("Incorrect number of SNPs in annot")
260
271
  # -
261
272
  n_a = annot.shape[1] # number of annotations
262
273
  cor_sum = np.zeros((m, n_a))
@@ -277,18 +288,18 @@ class GenotypeArrayInMemory(object):
277
288
  rfuncBB = np.zeros((c, c))
278
289
  # chunk inside of block
279
290
  for l_B in np.arange(0, b, c): # l_B := index of leftmost SNP in matrix B
280
- B = A[:, l_B:l_B + c]
291
+ B = A[:, l_B : l_B + c]
281
292
  # ld matrix
282
293
  np.dot(A.T, B / n, out=rfuncAB)
283
294
  # ld matrix square
284
295
  rfuncAB = func(rfuncAB)
285
- cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
296
+ cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
286
297
 
287
298
  # chunk to right of block
288
299
  b0 = b
289
300
  md = int(c * np.floor(m / c))
290
301
  end = md + 1 if md != m else md
291
- for l_B in tqdm(np.arange(b0, end, c), desc=f'Compute SNP Gene Weight'):
302
+ for l_B in tqdm(np.arange(b0, end, c), desc="Compute SNP Gene Weight"):
292
303
  # check if the annot matrix is all zeros for this block + chunk
293
304
  # this happens w/ sparse categories (i.e., pathways)
294
305
  # update the block
@@ -298,10 +309,10 @@ class GenotypeArrayInMemory(object):
298
309
  # block_size can't increase more than c
299
310
  # block_size can't be less than c unless it is zero
300
311
  # both of these things make sense
301
- A = np.hstack((A[:, old_b - b + c:old_b], B))
312
+ A = np.hstack((A[:, old_b - b + c : old_b], B))
302
313
  l_A += old_b - b + c
303
314
  elif l_B == b0 and b > 0:
304
- A = A[:, b0 - b:b0]
315
+ A = A[:, b0 - b : b0]
305
316
  l_A = b0 - b
306
317
  elif b == 0: # no SNPs to left in window, e.g., after a sequence gap
307
318
  A = np.array(()).reshape((n, 0))
@@ -314,44 +325,45 @@ class GenotypeArrayInMemory(object):
314
325
  rfuncAB = np.zeros((b, c))
315
326
  # -
316
327
  B = snp_getter(c)
317
- p1 = np.all(annot[l_A:l_A + b, :] == 0)
318
- p2 = np.all(annot[l_B:l_B + c, :] == 0)
328
+ p1 = np.all(annot[l_A : l_A + b, :] == 0)
329
+ p2 = np.all(annot[l_B : l_B + c, :] == 0)
319
330
  if p1 and p2:
320
331
  continue
321
332
  # -
322
333
  np.dot(A.T, B / n, out=rfuncAB)
323
334
  rfuncAB = func(rfuncAB)
324
- cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
325
- cor_sum[l_B:l_B + c, :] += np.dot(annot[l_A:l_A + b, :].T, rfuncAB).T
335
+ cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
336
+ cor_sum[l_B : l_B + c, :] += np.dot(annot[l_A : l_A + b, :].T, rfuncAB).T
326
337
  np.dot(B.T, B / n, out=rfuncBB)
327
338
  rfuncBB = func(rfuncBB)
328
- cor_sum[l_B:l_B + c, :] += np.dot(rfuncBB, annot[l_B:l_B + c, :])
339
+ cor_sum[l_B : l_B + c, :] += np.dot(rfuncBB, annot[l_B : l_B + c, :])
329
340
  # -
330
341
  return cor_sum
331
342
 
332
343
 
333
344
  class PlinkBEDFile(GenotypeArrayInMemory):
334
- '''
345
+ """
335
346
  Interface for Plink .bed format
336
- '''
347
+ """
337
348
 
338
349
  def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
339
350
  self._bedcode = {
340
- 2: ba.bitarray('11'),
341
- 9: ba.bitarray('10'),
342
- 1: ba.bitarray('01'),
343
- 0: ba.bitarray('00')
351
+ 2: ba.bitarray("11"),
352
+ 9: ba.bitarray("10"),
353
+ 1: ba.bitarray("01"),
354
+ 0: ba.bitarray("00"),
344
355
  }
345
356
  # -
346
- GenotypeArrayInMemory.__init__(self, fname, n, snp_list, keep_snps=keep_snps, keep_indivs=keep_indivs,
347
- mafMin=mafMin)
357
+ GenotypeArrayInMemory.__init__(
358
+ self, fname, n, snp_list, keep_snps=keep_snps, keep_indivs=keep_indivs, mafMin=mafMin
359
+ )
348
360
 
349
361
  # -
350
362
  def __read__(self, fname, m, n):
351
- if not fname.endswith('.bed'):
352
- raise ValueError('.bed filename must end in .bed')
363
+ if not fname.endswith(".bed"):
364
+ raise ValueError(".bed filename must end in .bed")
353
365
  # -
354
- fh = open(fname, 'rb')
366
+ fh = open(fname, "rb")
355
367
  magicNumber = ba.bitarray(endian="little")
356
368
  magicNumber.fromfile(fh, 2)
357
369
  bedMode = ba.bitarray(endian="little")
@@ -360,11 +372,11 @@ class PlinkBEDFile(GenotypeArrayInMemory):
360
372
  nru = n + e
361
373
  self.nru = nru
362
374
  # check magic number
363
- if magicNumber != ba.bitarray('0011011011011000'):
364
- raise IOError("Magic number from Plink .bed file not recognized")
375
+ if magicNumber != ba.bitarray("0011011011011000"):
376
+ raise OSError("Magic number from Plink .bed file not recognized")
365
377
  # -
366
- if bedMode != ba.bitarray('10000000'):
367
- raise IOError("Plink .bed file must be in default SNP-major mode")
378
+ if bedMode != ba.bitarray("10000000"):
379
+ raise OSError("Plink .bed file must be in default SNP-major mode")
368
380
  # check file length
369
381
  self.geno = ba.bitarray(endian="little")
370
382
  self.geno.fromfile(fh)
@@ -377,7 +389,7 @@ class PlinkBEDFile(GenotypeArrayInMemory):
377
389
  real_len = len(geno)
378
390
  if real_len != exp_len:
379
391
  s = "Plink .bed file has {n1} bits, expected {n2}"
380
- raise IOError(s.format(n1=real_len, n2=exp_len))
392
+ raise OSError(s.format(n1=real_len, n2=exp_len))
381
393
 
382
394
  # -
383
395
  def __filter_indivs__(self, geno, keep_indivs, m, n):
@@ -388,14 +400,14 @@ class PlinkBEDFile(GenotypeArrayInMemory):
388
400
  z = ba.bitarray(m * 2 * nru_new, endian="little")
389
401
  z.setall(0)
390
402
  for e, i in enumerate(keep_indivs):
391
- z[2 * e::2 * nru_new] = geno[2 * i::2 * nru]
392
- z[2 * e + 1::2 * nru_new] = geno[2 * i + 1::2 * nru]
403
+ z[2 * e :: 2 * nru_new] = geno[2 * i :: 2 * nru]
404
+ z[2 * e + 1 :: 2 * nru_new] = geno[2 * i + 1 :: 2 * nru]
393
405
  self.nru = nru_new
394
406
  return (z, m, n_new)
395
407
 
396
408
  # -
397
409
  def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps):
398
- '''
410
+ """
399
411
  Credit to Chris Chang and the Plink2 developers for this algorithm
400
412
  Modified from plink_filter.c
401
413
  https://github.com/chrchang/plink-ng/blob/master/plink_filter.c
@@ -414,7 +426,7 @@ class PlinkBEDFile(GenotypeArrayInMemory):
414
426
  major allele frequency = (b+c)/(2*(n-a+c))
415
427
  het ct + missing ct = a + b - 2*c
416
428
  Why does bitarray not have >> ????
417
- '''
429
+ """
418
430
  nru = self.nru
419
431
  m_poly = 0
420
432
  y = ba.bitarray()
@@ -423,7 +435,7 @@ class PlinkBEDFile(GenotypeArrayInMemory):
423
435
  kept_snps = []
424
436
  freq = []
425
437
  for e, j in enumerate(keep_snps):
426
- z = geno[2 * nru * j:2 * nru * (j + 1)]
438
+ z = geno[2 * nru * j : 2 * nru * (j + 1)]
427
439
  A = z[0::2]
428
440
  a = A.count()
429
441
  B = z[1::2]
@@ -443,9 +455,10 @@ class PlinkBEDFile(GenotypeArrayInMemory):
443
455
 
444
456
  # -
445
457
  def nextSNPs(self, b, minorRef=None):
446
- '''
458
+ """
447
459
  Unpacks the binary array of genotypes and returns an n x b matrix of floats of
448
460
  normalized genotypes for the next b SNPs, where n := number of samples.
461
+
449
462
  Parameters
450
463
  ----------
451
464
  b : int
@@ -453,29 +466,30 @@ class PlinkBEDFile(GenotypeArrayInMemory):
453
466
  minorRef: bool, default None
454
467
  Should we flip reference alleles so that the minor allele is the reference?
455
468
  (This is useful for computing l1 w.r.t. minor allele).
469
+
456
470
  Returns
457
471
  -------
458
472
  X : np.array with dtype float64 with shape (n, b), where n := number of samples
459
473
  Matrix of genotypes normalized to mean zero and variance one. If minorRef is
460
474
  not None, then the minor allele will be the positive allele (i.e., two copies
461
475
  of the minor allele --> a positive number).
462
- '''
476
+ """
463
477
  # -
464
478
  try:
465
479
  b = int(b)
466
480
  if b <= 0:
467
481
  raise ValueError("b must be > 0")
468
- except TypeError:
469
- raise TypeError("b must be an integer")
482
+ except TypeError as e:
483
+ raise TypeError("b must be an integer") from e
470
484
  # -
471
485
  if self._currentSNP + b > self.m:
472
- s = '{b} SNPs requested, {k} SNPs remain'
486
+ s = "{b} SNPs requested, {k} SNPs remain"
473
487
  raise ValueError(s.format(b=b, k=(self.m - self._currentSNP)))
474
488
  # -
475
489
  c = self._currentSNP
476
490
  n = self.n
477
491
  nru = self.nru
478
- slice = self.geno[2 * c * nru:2 * (c + b) * nru]
492
+ slice = self.geno[2 * c * nru : 2 * (c + b) * nru]
479
493
  X = np.array(slice.decode(self._bedcode), dtype="float64").reshape((b, nru)).T
480
494
  X = X[0:n, :]
481
495
  Y = np.zeros(X.shape)
@@ -499,14 +513,15 @@ class PlinkBEDFile(GenotypeArrayInMemory):
499
513
 
500
514
 
501
515
  class PlinkBEDFileWithR2Cache(PlinkBEDFile):
502
- def compute_r2_cache(self,
503
- block_left,
504
- output_cache_file_dir: Path,
505
- chunk_size=500_000_000,
506
- c=500,
507
- r2_threshold=1e-4,
508
- annot=None):
509
-
516
+ def compute_r2_cache(
517
+ self,
518
+ block_left,
519
+ output_cache_file_dir: Path,
520
+ chunk_size=500_000_000,
521
+ c=500,
522
+ r2_threshold=1e-4,
523
+ annot=None,
524
+ ):
510
525
  func = np.square
511
526
  snp_getter = self.nextSNPs
512
527
  data, rows, cols = [], [], []
@@ -536,9 +551,11 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
536
551
  cols.extend(l_B + non_zero_indices[1])
537
552
  if len(data) > chunk_size:
538
553
  # save the cache
539
- print(f'Start saving the cache file: {output_cache_file_dir / f"{l_B}.npz"}')
540
- r2_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(self.m, self.m), dtype='float16')
541
- save_npz(output_cache_file_dir / f'{l_B}.npz', r2_sparse_matrix)
554
+ print(f"Start saving the cache file: {output_cache_file_dir / f'{l_B}.npz'}")
555
+ r2_sparse_matrix = csr_matrix(
556
+ (data, (rows, cols)), shape=(self.m, self.m), dtype="float16"
557
+ )
558
+ save_npz(output_cache_file_dir / f"{l_B}.npz", r2_sparse_matrix)
542
559
  # reset the data
543
560
  data.clear()
544
561
  rows.clear()
@@ -552,9 +569,9 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
552
569
  else:
553
570
  annot_m = annot.shape[0]
554
571
  if annot_m != self.m:
555
- raise ValueError('Incorrect number of SNPs in annot')
572
+ raise ValueError("Incorrect number of SNPs in annot")
556
573
  # -
557
- n_a = annot.shape[1] # number of annotations
574
+ # n_a = annot.shape[1] # number of annotations
558
575
  # cor_sum = np.zeros((m, n_a))
559
576
  # b = index of first SNP for which SNP 0 is not included in LD Score
560
577
  b = np.nonzero(block_left > 0)
@@ -573,7 +590,7 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
573
590
  rfuncBB = np.zeros((c, c))
574
591
  # chunk inside of block
575
592
  for l_B in np.arange(0, b, c): # l_B := index of leftmost SNP in matrix B
576
- B = A[:, l_B:l_B + c]
593
+ B = A[:, l_B : l_B + c]
577
594
  # ld matrix
578
595
  np.dot(A.T, B / n, out=rfuncAB)
579
596
  # ld matrix square
@@ -585,7 +602,7 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
585
602
  b0 = b
586
603
  md = int(c * np.floor(m / c))
587
604
  end = md + 1 if md != m else md
588
- for l_B in trange(b0, end, c, desc=f'Compute r2 cache for {output_cache_file_dir.name}'):
605
+ for l_B in trange(b0, end, c, desc=f"Compute r2 cache for {output_cache_file_dir.name}"):
589
606
  # check if the annot matrix is all zeros for this block + chunk
590
607
  # this happens w/ sparse categories (i.e., pathways)
591
608
  # update the block
@@ -595,10 +612,10 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
595
612
  # block_size can't increase more than c
596
613
  # block_size can't be less than c unless it is zero
597
614
  # both of these things make sense
598
- A = np.hstack((A[:, old_b - b + c:old_b], B))
615
+ A = np.hstack((A[:, old_b - b + c : old_b], B))
599
616
  l_A += old_b - b + c
600
617
  elif l_B == b0 and b > 0:
601
- A = A[:, b0 - b:b0]
618
+ A = A[:, b0 - b : b0]
602
619
  l_A = b0 - b
603
620
  elif b == 0: # no SNPs to left in window, e.g., after a sequence gap
604
621
  A = np.array(()).reshape((n, 0))
@@ -611,8 +628,8 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
611
628
  rfuncAB = np.zeros((b, c))
612
629
  # -
613
630
  B = snp_getter(c)
614
- p1 = np.all(annot[l_A:l_A + b, :] == 0)
615
- p2 = np.all(annot[l_B:l_B + c, :] == 0)
631
+ p1 = np.all(annot[l_A : l_A + b, :] == 0)
632
+ p2 = np.all(annot[l_B : l_B + c, :] == 0)
616
633
  if p1 and p2:
617
634
  continue
618
635
  # -
@@ -629,19 +646,19 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
629
646
  if len(data) > 0:
630
647
  # save remaining data
631
648
  # save the cache
632
- print(f'Start saving the cache file: {output_cache_file_dir / f"{l_B}.npz"}')
633
- r2_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(m, m), dtype='float16')
634
- save_npz(output_cache_file_dir / f'{l_B}.npz', r2_sparse_matrix)
649
+ print(f"Start saving the cache file: {output_cache_file_dir / f'{l_B}.npz'}")
650
+ r2_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(m, m), dtype="float16")
651
+ save_npz(output_cache_file_dir / f"{l_B}.npz", r2_sparse_matrix)
635
652
  # combine the cache files
636
- print(f'Start combining the cache files in {output_cache_file_dir}')
637
- cached_r2_matrix_files = list(output_cache_file_dir.glob('*.npz'))
653
+ print(f"Start combining the cache files in {output_cache_file_dir}")
654
+ cached_r2_matrix_files = list(output_cache_file_dir.glob("*.npz"))
638
655
  combined_r2_matrix_files = self.load_r2_matrix_from_cache_files(output_cache_file_dir)
639
656
  # remove the cache files
640
657
  for cached_r2_matrix_file in cached_r2_matrix_files:
641
658
  cached_r2_matrix_file.unlink()
642
659
  # save the combined r2 matrix
643
- print(f'Start saving the combined r2 matrix in {output_cache_file_dir}')
644
- combined_r2_matrix_file = output_cache_file_dir / 'combined_r2_matrix.npz'
660
+ print(f"Start saving the combined r2 matrix in {output_cache_file_dir}")
661
+ combined_r2_matrix_file = output_cache_file_dir / "combined_r2_matrix.npz"
645
662
  save_npz(combined_r2_matrix_file, combined_r2_matrix_files)
646
663
 
647
664
  def get_ldscore_using_r2_cache(self, annot_matrix, cached_r2_matrix_dir):
@@ -652,11 +669,15 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
652
669
  cached_r2_matrix_dir = Path(cached_r2_matrix_dir)
653
670
  # iter the cached r2 matrix files
654
671
  result_matrix = np.zeros((self.m, annot_matrix.shape[1]))
655
- cached_r2_matrix_files = list(cached_r2_matrix_dir.glob('*.npz'))
656
- assert len(cached_r2_matrix_files) > 0, (f'No cached r2 matrix files in {cached_r2_matrix_dir}'
657
- f'Please run the function compute_r2_cache first!')
658
- for r2_matrix_file in tqdm(cached_r2_matrix_files, desc=f'Compute ld score for {cached_r2_matrix_dir.name}'):
659
- print(f'Compute r2 matrix multiplication for {r2_matrix_file}')
672
+ cached_r2_matrix_files = list(cached_r2_matrix_dir.glob("*.npz"))
673
+ assert len(cached_r2_matrix_files) > 0, (
674
+ f"No cached r2 matrix files in {cached_r2_matrix_dir}"
675
+ f"Please run the function compute_r2_cache first!"
676
+ )
677
+ for r2_matrix_file in tqdm(
678
+ cached_r2_matrix_files, desc=f"Compute ld score for {cached_r2_matrix_dir.name}"
679
+ ):
680
+ print(f"Compute r2 matrix multiplication for {r2_matrix_file}")
660
681
  r2_matrix = load_npz(r2_matrix_file)
661
682
  result_matrix += r2_matrix.dot(annot_matrix)
662
683
  return result_matrix
@@ -667,48 +688,60 @@ class PlinkBEDFileWithR2Cache(PlinkBEDFile):
667
688
  """
668
689
  cached_r2_matrix_dir = Path(cached_r2_matrix_dir)
669
690
  # iter the cached r2 matrix files
670
- cached_r2_matrix_files = list(cached_r2_matrix_dir.glob('*.npz'))
671
- assert len(cached_r2_matrix_files) > 0, (f'No cached r2 matrix files in {cached_r2_matrix_dir}'
672
- f'Please run the function compute_r2_cache first!')
691
+ cached_r2_matrix_files = list(cached_r2_matrix_dir.glob("*.npz"))
692
+ assert len(cached_r2_matrix_files) > 0, (
693
+ f"No cached r2 matrix files in {cached_r2_matrix_dir}"
694
+ f"Please run the function compute_r2_cache first!"
695
+ )
673
696
  # load the r2 matrix
674
697
  r2_matrix = load_npz(cached_r2_matrix_files[0])
675
- for r2_matrix_file in tqdm(cached_r2_matrix_files[1:], desc=f'Load r2 matrix from {cached_r2_matrix_dir.name}'):
676
- print(f'Load r2 matrix from {r2_matrix_file}')
698
+ for r2_matrix_file in tqdm(
699
+ cached_r2_matrix_files[1:], desc=f"Load r2 matrix from {cached_r2_matrix_dir.name}"
700
+ ):
701
+ print(f"Load r2 matrix from {r2_matrix_file}")
677
702
  r2_matrix += load_npz(r2_matrix_file)
678
703
  # to float16
679
- r2_matrix = r2_matrix.astype('float16')
704
+ r2_matrix = r2_matrix.astype("float16")
680
705
  return r2_matrix
706
+
681
707
  def load_combined_r2_matrix(self, cached_r2_matrix_dir):
682
708
  """
683
709
  Load the combined r2 matrix
684
710
  """
685
- combined_r2_matrix_file = Path(cached_r2_matrix_dir) / 'combined_r2_matrix.npz'
686
- assert combined_r2_matrix_file.exists(), (f'No combined r2 matrix file in {cached_r2_matrix_dir}'
687
- f'Should delete the cache files and run the function compute_r2_cache first!')
711
+ combined_r2_matrix_file = Path(cached_r2_matrix_dir) / "combined_r2_matrix.npz"
712
+ assert combined_r2_matrix_file.exists(), (
713
+ f"No combined r2 matrix file in {cached_r2_matrix_dir}"
714
+ f"Should delete the cache files and run the function compute_r2_cache first!"
715
+ )
688
716
  # load the r2 matrix
689
717
  r2_matrix = load_npz(combined_r2_matrix_file)
690
718
  # to float16
691
- r2_matrix = r2_matrix.astype('float16')
719
+ r2_matrix = r2_matrix.astype("float16")
692
720
  return r2_matrix
693
721
 
722
+
694
723
  def load_bfile(bfile_chr_prefix):
695
- PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
696
- PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])
724
+ PlinkBIMFile = ID_List_Factory(
725
+ ["CHR", "SNP", "CM", "BP", "A1", "A2"], 1, ".bim", usecols=[0, 1, 2, 3, 4, 5]
726
+ )
727
+ PlinkFAMFile = ID_List_Factory(["IID"], 0, ".fam", usecols=[1])
697
728
 
698
- snp_file, snp_obj = bfile_chr_prefix + '.bim', PlinkBIMFile
729
+ snp_file, snp_obj = bfile_chr_prefix + ".bim", PlinkBIMFile
699
730
  array_snps = snp_obj(snp_file)
700
731
  m = len(array_snps.IDList)
701
- print(f'Read list of {m} SNPs from {snp_file}')
732
+ print(f"Read list of {m} SNPs from {snp_file}")
702
733
  #
703
734
  # Load fam
704
- ind_file, ind_obj = bfile_chr_prefix + '.fam', PlinkFAMFile
735
+ ind_file, ind_obj = bfile_chr_prefix + ".fam", PlinkFAMFile
705
736
  array_indivs = ind_obj(ind_file)
706
737
  n = len(array_indivs.IDList)
707
- print(f'Read list of {n} individuals from {ind_file}')
738
+ print(f"Read list of {n} individuals from {ind_file}")
708
739
 
709
740
  # Load genotype array
710
- array_file, array_obj = bfile_chr_prefix + '.bed', PlinkBEDFileWithR2Cache
711
- geno_array = array_obj(array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None)
741
+ array_file, array_obj = bfile_chr_prefix + ".bed", PlinkBEDFileWithR2Cache
742
+ geno_array = array_obj(
743
+ array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None
744
+ )
712
745
 
713
746
  return array_snps, array_indivs, geno_array
714
747
 
@@ -717,19 +750,19 @@ def generate_r2_matrix_chr_cache(bfile_chr_prefix, ld_wind_cm, output_cache_file
717
750
  # Load genotype array
718
751
  array_snps, array_indivs, geno_array = load_bfile(bfile_chr_prefix)
719
752
  # Compute block lefts
720
- block_left = getBlockLefts(geno_array.df[:, 3], ld_wind_cm)
753
+ # block_left = getBlockLefts(geno_array.df[:, 3], ld_wind_cm)
721
754
  # Compute LD score
722
- r2_matrix = geno_array.load_r2_matrix_from_cache(output_cache_file_dir)
755
+ # r2_matrix = geno_array.load_r2_matrix_from_cache(output_cache_file_dir)
723
756
 
724
757
 
725
758
  def generate_r2_matrix_cache(bfile_prefix, chromosome_list, r2_cache_dir, ld_wind_cm=1):
726
759
  r2_cache_dir = Path(r2_cache_dir)
727
760
 
728
761
  for chr in chromosome_list:
729
- output_cache_file_prefix = r2_cache_dir / f'chr{chr}'
762
+ output_cache_file_prefix = r2_cache_dir / f"chr{chr}"
730
763
  output_cache_file_prefix.mkdir(parents=True, exist_ok=True)
731
- bfile_chr_prefix = bfile_prefix + '.' + str(chr)
732
- generate_r2_matrix_chr_cache(bfile_chr_prefix,
733
- ld_wind_cm=ld_wind_cm,
734
- output_cache_file_dir=output_cache_file_prefix)
735
- print(f'Compute r2 matrix for chr{chr} done!')
764
+ bfile_chr_prefix = bfile_prefix + "." + str(chr)
765
+ generate_r2_matrix_chr_cache(
766
+ bfile_chr_prefix, ld_wind_cm=ld_wind_cm, output_cache_file_dir=output_cache_file_prefix
767
+ )
768
+ print(f"Compute r2 matrix for chr{chr} done!")