gsMap 1.67__py3-none-any.whl → 1.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/{GNN_VAE → GNN}/__init__.py +0 -0
 - gsMap/{GNN_VAE → GNN}/adjacency_matrix.py +75 -75
 - gsMap/{GNN_VAE → GNN}/model.py +89 -89
 - gsMap/{GNN_VAE → GNN}/train.py +88 -86
 - gsMap/__init__.py +5 -5
 - gsMap/__main__.py +2 -2
 - gsMap/cauchy_combination_test.py +141 -141
 - gsMap/config.py +805 -803
 - gsMap/diagnosis.py +273 -273
 - gsMap/find_latent_representation.py +133 -145
 - gsMap/format_sumstats.py +407 -407
 - gsMap/generate_ldscore.py +618 -618
 - gsMap/latent_to_gene.py +234 -234
 - gsMap/main.py +31 -31
 - gsMap/report.py +160 -160
 - gsMap/run_all_mode.py +194 -194
 - gsMap/setup.py +0 -0
 - gsMap/spatial_ldsc_multiple_sumstats.py +380 -380
 - gsMap/templates/report_template.html +198 -198
 - gsMap/utils/__init__.py +0 -0
 - gsMap/utils/generate_r2_matrix.py +735 -735
 - gsMap/utils/jackknife.py +514 -514
 - gsMap/utils/make_annotations.py +518 -518
 - gsMap/utils/manhattan_plot.py +639 -639
 - gsMap/utils/regression_read.py +294 -294
 - gsMap/visualize.py +198 -198
 - {gsmap-1.67.dist-info → gsmap-1.71.dist-info}/LICENSE +21 -21
 - {gsmap-1.67.dist-info → gsmap-1.71.dist-info}/METADATA +28 -22
 - gsmap-1.71.dist-info/RECORD +31 -0
 - gsmap-1.67.dist-info/RECORD +0 -31
 - {gsmap-1.67.dist-info → gsmap-1.71.dist-info}/WHEEL +0 -0
 - {gsmap-1.67.dist-info → gsmap-1.71.dist-info}/entry_points.txt +0 -0
 
| 
         @@ -1,735 +1,735 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            from pathlib import Path
         
     | 
| 
       2 
     | 
    
         
            -
            import bitarray as ba
         
     | 
| 
       3 
     | 
    
         
            -
            import numpy as np
         
     | 
| 
       4 
     | 
    
         
            -
            import pandas as pd
         
     | 
| 
       5 
     | 
    
         
            -
            from scipy.sparse import csr_matrix
         
     | 
| 
       6 
     | 
    
         
            -
            from scipy.sparse import save_npz, load_npz
         
     | 
| 
       7 
     | 
    
         
            -
            from tqdm import trange, tqdm
         
     | 
| 
       8 
     | 
    
         
            -
             
     | 
| 
       9 
     | 
    
         
            -
             
     | 
| 
       10 
     | 
    
         
            -
            # Define the log class
         
     | 
| 
       11 
     | 
    
         
            -
            class Logger(object):
                '''
                Minimal logger that echoes every message to a log file and to stdout.

                The file is opened for writing when the instance is created and stays
                open until ``close`` is called. The instance can also be used as a
                context manager, which closes the file on exit even if the body raises.
                '''

                def __init__(self, fh):
                    '''
                    Parameters
                    ----------
                    fh : str or path-like
                        Path of the log file; passed unchanged to ``open(fh, 'w')``.
                    '''
                    self.log_fh = open(fh, 'w')

                def __enter__(self):
                    '''Return the logger itself so it can be bound by ``with ... as``.'''
                    return self

                def __exit__(self, exc_type, exc_value, traceback):
                    '''Close the log file; exceptions from the with-body are not suppressed.'''
                    self.close()
                    return False

                def log(self, msg):
                    '''
                    Print to log file and stdout.
                    '''
                    print(msg, file=self.log_fh)
                    print(msg)

                def close(self):
                    '''Close the underlying log file handle.'''
                    self.log_fh.close()
         
     | 
| 
       27 
     | 
    
         
            -
             
     | 
| 
       28 
     | 
    
         
            -
             
     | 
| 
       29 
     | 
    
         
            -
            # Compute ld-score using cellular annotations
         
     | 
| 
       30 
     | 
    
         
            -
            def get_compression(fh):
                '''
                Choose the ``compression`` argument for ``read_csv`` from a file name.

                Parameters
                ----------
                fh : str
                    File name to inspect. Only its trailing characters are looked at.

                Returns
                -------
                str or None
                    'gzip' when the name ends in 'gz', 'bz2' when it ends in 'bz2',
                    otherwise None.
                '''
                # The two suffixes are mutually exclusive, so the order is irrelevant.
                for suffix, codec in (('gz', 'gzip'), ('bz2', 'bz2')):
                    if fh.endswith(suffix):
                        return codec
                return None
         
     | 
| 
       40 
     | 
    
         
            -
             
     | 
| 
       41 
     | 
    
         
            -
             
     | 
| 
       42 
     | 
    
         
            -
            # Define the reading functions
         
     | 
| 
       43 
     | 
    
         
            -
            def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
                '''
                Build a reader class for whitespace-delimited ID files.

                Parameters
                ----------
                colnames : list of str or None
                    If truthy, assigned as the column names of the loaded DataFrame.
                keepcol : int or None
                    Positional index of the column stored (as object dtype) in ``IDList``.
                    When None, ``IDList`` is not created.
                fname_end : str or None
                    Suffix that file names must end with; a falsy value disables the check.
                header : int or None
                    Forwarded to ``pandas.read_csv``.
                usecols : list or None
                    Forwarded to ``pandas.read_csv``.

                Returns
                -------
                type
                    The ``IDContainer`` class, configured with the arguments above.
                '''

                class IDContainer(object):
                    """
                    A class to read data from a file, store it as a DataFrame, and provide a method for a left outer join operation.
                    """

                    def __init__(self, fname):
                        """
                        Initialize the IDContainer with the given filename and reading options.
                        """
                        self.usecols = usecols
                        self.colnames = colnames
                        self.keepcol = keepcol
                        self.fname_end = fname_end
                        self.header = header
                        self.read(fname)
                        self.n = len(self.df)

                    def read(self, fname):
                        """
                        Read data from the given file and store it as a DataFrame.

                        Raises ValueError when a suffix is configured and ``fname`` does not
                        end with it.
                        """
                        end = self.fname_end
                        if end and not fname.endswith(end):
                            raise ValueError('{f} filename must end in {f}'.format(f=end))
                        comp = get_compression(fname)
                        # Raw string: '\s' in a plain literal is an invalid escape sequence
                        # that newer Python versions warn about.
                        self.df = pd.read_csv(fname, header=self.header, usecols=self.usecols,
                                              sep=r'\s+', compression=comp)
                        if self.colnames:
                            self.df.columns = self.colnames
                        if self.keepcol is not None:
                            self.IDList = self.df.iloc[:, [self.keepcol]].astype('object')

                    def loj(self, externalDf):
                        """
                        Perform a left outer join operation with the given external DataFrame.

                        Joins ``IDList`` against the first column of ``externalDf`` and returns
                        the row positions of the merged result that found a match.
                        """
                        right_key = externalDf.columns[0]
                        left_key = self.IDList.columns[0]
                        # Copy before adding the marker column so the caller's frame is never
                        # written through a slice (avoids SettingWithCopyWarning).
                        merge_df = externalDf.iloc[:, [0]].copy()
                        merge_df['keep'] = True
                        merged = pd.merge(self.IDList, merge_df, how='left', left_on=left_key,
                                          right_on=right_key, sort=False)
                        # Unmatched rows hold NaN in 'keep'; comparing with True maps them to False.
                        matched = merged['keep'] == True
                        return np.flatnonzero(matched.to_numpy())

                return IDContainer
         
     | 
| 
       94 
     | 
    
         
            -
             
     | 
| 
       95 
     | 
    
         
            -
             
     | 
| 
       96 
     | 
    
         
            -
            def getBlockLefts(coords, max_dist):
                '''
                Converts coordinates + max block length to an array holding, for every SNP,
                the index of the leftmost SNP to be included in its block.

                Parameters
                ----------
                coords : array
                    Array of coordinates. Must be sorted.
                max_dist : float
                    Maximum distance between SNPs included in the same window.

                Returns
                -------
                block_left : 1D np.ndarray (float) with same length as coords
                    block_left[i] := first k, scanning rightwards from block_left[i-1],
                    with |coords[k] - coords[i]| <= max_dist.
                '''
                n_snps = len(coords)
                lefts = np.zeros(n_snps)
                left = 0  # never moves backwards: the window edge only slides right
                for idx in range(n_snps):
                    while True:
                        if left >= n_snps:
                            break
                        if abs(coords[left] - coords[idx]) <= max_dist:
                            break
                        left += 1
                    lefts[idx] = left
                return lefts
         
     | 
| 
       120 
     | 
    
         
            -
             
     | 
| 
       121 
     | 
    
         
            -
             
     | 
| 
       122 
     | 
    
         
            -
            def block_left_to_right(block_left):
                '''
                Converts block lefts to block rights.

                Parameters
                ----------
                block_left : array
                    Array of block lefts.

                Returns
                -------
                block_right : 1D np.ndarray (float) with same length as block_left
                    block_right[i] := first k, scanning rightwards from block_right[i-1],
                    with block_left[k] > i (or len(block_left) if there is none).
                '''
                total = len(block_left)
                rights = np.zeros(total)
                cursor = 0  # shared across iterations, so it only ever advances
                for idx in range(total):
                    while not (cursor >= total or block_left[cursor] > idx):
                        cursor += 1
                    rights[idx] = cursor
                return rights
         
     | 
| 
       143 
     | 
    
         
            -
             
     | 
| 
       144 
     | 
    
         
            -
             
     | 
| 
       145 
     | 
    
         
            -
            class GenotypeArrayInMemory(object):
         
     | 
| 
       146 
     | 
    
         
            -
                '''
         
     | 
| 
       147 
     | 
    
         
            -
                Parent class for various classes containing interfaces for files with genotype
         
     | 
| 
       148 
     | 
    
         
            -
                matrices, e.g., plink .bed files, etc
         
     | 
| 
       149 
     | 
    
         
            -
                '''
         
     | 
| 
       150 
     | 
    
         
            -
             
     | 
| 
       151 
     | 
    
         
            -
                def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
         
     | 
| 
       152 
     | 
    
         
            -
                    self.m = len(snp_list.IDList)
         
     | 
| 
       153 
     | 
    
         
            -
                    self.n = n
         
     | 
| 
       154 
     | 
    
         
            -
                    self.keep_snps = keep_snps
         
     | 
| 
       155 
     | 
    
         
            -
                    self.keep_indivs = keep_indivs
         
     | 
| 
       156 
     | 
    
         
            -
                    self.df = np.array(snp_list.df[['CHR', 'SNP', 'BP', 'CM']])
         
     | 
| 
       157 
     | 
    
         
            -
                    self.colnames = ['CHR', 'SNP', 'BP', 'CM']
         
     | 
| 
       158 
     | 
    
         
            -
                    self.mafMin = mafMin if mafMin is not None else 0
         
     | 
| 
       159 
     | 
    
         
            -
                    self._currentSNP = 0
         
     | 
| 
       160 
     | 
    
         
            -
                    (self.nru, self.geno) = self.__read__(fname, self.m, n)
         
     | 
| 
       161 
     | 
    
         
            -
                    # filter individuals
         
     | 
| 
       162 
     | 
    
         
            -
                    if keep_indivs is not None:
         
     | 
| 
       163 
     | 
    
         
            -
                        keep_indivs = np.array(keep_indivs, dtype='int')
         
     | 
| 
       164 
     | 
    
         
            -
                        if np.any(keep_indivs > self.n):
         
     | 
| 
       165 
     | 
    
         
            -
                            raise ValueError('keep_indivs indices out of bounds')
         
     | 
| 
       166 
     | 
    
         
            -
                        # -
         
     | 
| 
       167 
     | 
    
         
            -
                        (self.geno, self.m, self.n) = self.__filter_indivs__(self.geno, keep_indivs, self.m, self.n)
         
     | 
| 
       168 
     | 
    
         
            -
                        # -
         
     | 
| 
       169 
     | 
    
         
            -
                        if self.n > 0:
         
     | 
| 
       170 
     | 
    
         
            -
                            print('After filtering, {n} individuals remain'.format(n=self.n))
         
     | 
| 
       171 
     | 
    
         
            -
                        else:
         
     | 
| 
       172 
     | 
    
         
            -
                            raise ValueError('After filtering, no individuals remain')
         
     | 
| 
       173 
     | 
    
         
            -
                    # -
         
     | 
| 
       174 
     | 
    
         
            -
                    # filter SNPs
         
     | 
| 
       175 
     | 
    
         
            -
                    if keep_snps is not None:
         
     | 
| 
       176 
     | 
    
         
            -
                        keep_snps = np.array(keep_snps, dtype='int')
         
     | 
| 
       177 
     | 
    
         
            -
                        if np.any(keep_snps > self.m):  # if keep_snps is None, this returns False
         
     | 
| 
       178 
     | 
    
         
            -
                            raise ValueError('keep_snps indices out of bounds')
         
     | 
| 
       179 
     | 
    
         
            -
                    # -
         
     | 
| 
       180 
     | 
    
         
            -
                    (self.geno, self.m, self.n, self.kept_snps, self.freq) = self.__filter_snps_maf__(
         
     | 
| 
       181 
     | 
    
         
            -
                        self.geno, self.m, self.n, self.mafMin, keep_snps)
         
     | 
| 
       182 
     | 
    
         
            -
                    # -
         
     | 
| 
       183 
     | 
    
         
            -
                    if self.m > 0:
         
     | 
| 
       184 
     | 
    
         
            -
                        print('After filtering, {m} SNPs remain'.format(m=self.m))
         
     | 
| 
       185 
     | 
    
         
            -
                    else:
         
     | 
| 
       186 
     | 
    
         
            -
                        raise ValueError('After filtering, no SNPs remain')
         
     | 
| 
       187 
     | 
    
         
            -
                    # -
         
     | 
| 
       188 
     | 
    
         
            -
                    self.df = self.df[self.kept_snps, :]
         
     | 
| 
       189 
     | 
    
         
            -
                    self.maf = np.minimum(self.freq, np.ones(self.m) - self.freq)
         
     | 
| 
       190 
     | 
    
         
            -
                    self.sqrtpq = np.sqrt(self.freq * (np.ones(self.m) - self.freq))
         
     | 
| 
       191 
     | 
    
         
            -
                    self.df = np.c_[self.df, self.maf]
         
     | 
| 
       192 
     | 
    
         
            -
                    self.colnames.append('MAF')
         
     | 
| 
       193 
     | 
    
         
            -
             
     | 
| 
       194 
     | 
    
         
            -
                # -
         
     | 
| 
       195 
     | 
    
         
            -
                def __read__(self, fname, m, n):
         
     | 
| 
       196 
     | 
    
         
            -
                    raise NotImplementedError
         
     | 
| 
       197 
     | 
    
         
            -
             
     | 
| 
       198 
     | 
    
         
            -
                def __restart__(self):
         
     | 
| 
       199 
     | 
    
         
            -
                    self._currentSNP = 0
         
     | 
| 
       200 
     | 
    
         
            -
             
     | 
| 
       201 
     | 
    
         
            -
                # -
         
     | 
| 
       202 
     | 
    
         
            -
                def __filter_indivs__(geno, keep_indivs, m, n):
         
     | 
| 
       203 
     | 
    
         
            -
                    raise NotImplementedError
         
     | 
| 
       204 
     | 
    
         
            -
             
     | 
| 
       205 
     | 
    
         
            -
                # -
         
     | 
| 
       206 
     | 
    
         
            -
                def __filter_maf_(geno, m, n, maf):
         
     | 
| 
       207 
     | 
    
         
            -
                    raise NotImplementedError
         
     | 
| 
       208 
     | 
    
         
            -
             
     | 
| 
       209 
     | 
    
         
            -
                # -
         
     | 
| 
       210 
     | 
    
         
            -
                def ldScoreVarBlocks(self, block_left, c, annot=None):
                    '''
                    Computes an unbiased estimate of L2(j) for j=1,..,M.

                    Delegates to ``__corSumVarBlocks__``, passing the bias-corrected r^2
                    transform (``__l2_unbiased__`` with the current ``self.n``) and
                    ``self.nextSNPs`` as the SNP getter.
                    '''
                    def unbiased_r2(r):
                        # self.n is looked up at call time, exactly as the transform needs it.
                        return self.__l2_unbiased__(r, self.n)

                    return self.__corSumVarBlocks__(block_left, c, unbiased_r2, self.nextSNPs, annot)
         
     | 
| 
       215 
     | 
    
         
            -
             
     | 
| 
       216 
     | 
    
         
            -
                # -
         
     | 
| 
       217 
     | 
    
         
            -
                # In small samples, the observed r^2 tends to be higher than the true r^2 due to sampling variability.
         
     | 
| 
       218 
     | 
    
         
            -
                # The bias correction term (1-sq) / denom adjusts for this bias by subtracting a small value that depends on the sample size and the observed r^2.
         
     | 
| 
       219 
     | 
    
         
            -
                def __l2_unbiased__(self, x, n):
         
     | 
| 
       220 
     | 
    
         
            -
                    denom = n - 2 if n > 2 else n  # allow n<2 for testing purposes
         
     | 
| 
       221 
     | 
    
         
            -
                    sq = np.square(x)
         
     | 
| 
       222 
     | 
    
         
            -
                    return sq - (1 - sq) / denom
         
     | 
| 
       223 
     | 
    
         
            -
             
     | 
| 
       224 
     | 
    
         
            -
                # -
         
     | 
| 
       225 
     | 
    
         
            -
                # Methods for calculating sums of Pearson correlation coefficients (i.e.,ld-score)
         
     | 
| 
       226 
     | 
    
         
            -
                # c stands for the chunk size (default = 50)
         
     | 
| 
       227 
     | 
    
         
            -
                def __corSumVarBlocks__(self, block_left, c, func, snp_getter, annot=None):
         
     | 
| 
       228 
     | 
    
         
            -
                    '''
         
     | 
| 
       229 
     | 
    
         
            -
                    Parameters
         
     | 
| 
       230 
     | 
    
         
            -
                    ----------
         
     | 
| 
       231 
     | 
    
         
            -
                    block_left : np.ndarray with shape (M, )
         
     | 
| 
       232 
     | 
    
         
            -
                        block_left[i] = index of leftmost SNP included in LD Score of SNP i.
         
     | 
| 
       233 
     | 
    
         
            -
                        if c > 1, then only entries that are multiples of c are examined, and it is
         
     | 
| 
       234 
     | 
    
         
            -
                        assumed that block_left[a*c+i] = block_left[a*c], except at
         
     | 
| 
       235 
     | 
    
         
            -
                        the beginning of the chromosome where the 0th SNP is included in the window.
         
     | 
| 
       236 
     | 
    
         
            -
                    c : int
         
     | 
| 
       237 
     | 
    
         
            -
                        Chunk size.
         
     | 
| 
       238 
     | 
    
         
            -
                    func : function
         
     | 
| 
       239 
     | 
    
         
            -
                        Function to be applied to the genotype correlation matrix. Before dotting with
         
     | 
| 
       240 
     | 
    
         
            -
                        annot. Examples: for biased L2, np.square. For biased L4,
         
     | 
| 
       241 
     | 
    
         
            -
                        lambda x: np.square(np.square(x)). For L1, lambda x: x.
         
     | 
| 
       242 
     | 
    
         
            -
                    snp_getter : function(int)
         
     | 
| 
       243 
     | 
    
         
            -
                        The method to be used to get the next SNPs
         
     | 
| 
       244 
     | 
    
         
            -
                    annot: numpy array with shape (m,n_a)
         
     | 
| 
       245 
     | 
    
         
            -
                        SNP annotations.
         
     | 
| 
       246 
     | 
    
         
            -
                    Returns
         
     | 
| 
       247 
     | 
    
         
            -
                    -------
         
     | 
| 
       248 
     | 
    
         
            -
                    cor_sum : np.ndarray with shape (M, num_annots)
         
     | 
| 
       249 
     | 
    
         
            -
                        Estimates.
         
     | 
| 
       250 
     | 
    
         
            -
                    '''
         
     | 
| 
       251 
     | 
    
         
            -
                    m, n = self.m, self.n
         
     | 
| 
       252 
     | 
    
         
            -
                    block_sizes = np.array(np.arange(m) - block_left)
         
     | 
| 
       253 
     | 
    
         
            -
                    block_sizes = np.ceil(block_sizes / c) * c
         
     | 
| 
       254 
     | 
    
         
            -
                    if annot is None:
         
     | 
| 
       255 
     | 
    
         
            -
                        annot = np.ones((m, 1))
         
     | 
| 
       256 
     | 
    
         
            -
                    else:
         
     | 
| 
       257 
     | 
    
         
            -
                        annot_m = annot.shape[0]
         
     | 
| 
       258 
     | 
    
         
            -
                        if annot_m != self.m:
         
     | 
| 
       259 
     | 
    
         
            -
                            raise ValueError('Incorrect number of SNPs in annot')
         
     | 
| 
       260 
     | 
    
         
            -
                    # -
         
     | 
| 
       261 
     | 
    
         
            -
                    n_a = annot.shape[1]  # number of annotations
         
     | 
| 
       262 
     | 
    
         
            -
                    cor_sum = np.zeros((m, n_a))
         
     | 
| 
       263 
     | 
    
         
            -
                    # b = index of first SNP for which SNP 0 is not included in LD Score
         
     | 
| 
       264 
     | 
    
         
            -
                    b = np.nonzero(block_left > 0)
         
     | 
| 
       265 
     | 
    
         
            -
                    if np.any(b):
         
     | 
| 
       266 
     | 
    
         
            -
                        b = b[0][0]
         
     | 
| 
       267 
     | 
    
         
            -
                    else:
         
     | 
| 
       268 
     | 
    
         
            -
                        b = m
         
     | 
| 
       269 
     | 
    
         
            -
                    b = int(np.ceil(b / c) * c)  # round up to a multiple of c
         
     | 
| 
       270 
     | 
    
         
            -
                    if b > m:
         
     | 
| 
       271 
     | 
    
         
            -
                        c = 1
         
     | 
| 
       272 
     | 
    
         
            -
                        b = m
         
     | 
| 
       273 
     | 
    
         
            -
             
     | 
| 
       274 
     | 
    
         
            -
                    l_A = 0  # l_A := index of leftmost SNP in matrix A
         
     | 
| 
       275 
     | 
    
         
            -
                    A = snp_getter(b)
         
     | 
| 
       276 
     | 
    
         
            -
                    rfuncAB = np.zeros((b, c))
         
     | 
| 
       277 
     | 
    
         
            -
                    rfuncBB = np.zeros((c, c))
         
     | 
| 
       278 
     | 
    
         
            -
                    # chunk inside of block
         
     | 
| 
       279 
     | 
    
         
            -
                    for l_B in np.arange(0, b, c):  # l_B := index of leftmost SNP in matrix B
         
     | 
| 
       280 
     | 
    
         
            -
                        B = A[:, l_B:l_B + c]
         
     | 
| 
       281 
     | 
    
         
            -
                        # ld matrix
         
     | 
| 
       282 
     | 
    
         
            -
                        np.dot(A.T, B / n, out=rfuncAB)
         
     | 
| 
       283 
     | 
    
         
            -
                        # ld matrix square
         
     | 
| 
       284 
     | 
    
         
            -
                        rfuncAB = func(rfuncAB)
         
     | 
| 
       285 
     | 
    
         
            -
                        cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
         
     | 
| 
       286 
     | 
    
         
            -
             
     | 
| 
       287 
     | 
    
         
            -
                    # chunk to right of block
         
     | 
| 
       288 
     | 
    
         
            -
                    b0 = b
         
     | 
| 
       289 
     | 
    
         
            -
                    md = int(c * np.floor(m / c))
         
     | 
| 
       290 
     | 
    
         
            -
                    end = md + 1 if md != m else md
         
     | 
| 
       291 
     | 
    
         
            -
                    for l_B in tqdm(np.arange(b0, end, c), desc=f'Compute SNP Gene Weight'):
         
     | 
| 
       292 
     | 
    
         
            -
                        # check if the annot matrix is all zeros for this block + chunk
         
     | 
| 
       293 
     | 
    
         
            -
                        # this happens w/ sparse categories (i.e., pathways)
         
     | 
| 
       294 
     | 
    
         
            -
                        # update the block
         
     | 
| 
       295 
     | 
    
         
            -
                        old_b = b
         
     | 
| 
       296 
     | 
    
         
            -
                        b = int(block_sizes[l_B])
         
     | 
| 
       297 
     | 
    
         
            -
                        if l_B > b0 and b > 0:
         
     | 
| 
       298 
     | 
    
         
            -
                            # block_size can't increase more than c
         
     | 
| 
       299 
     | 
    
         
            -
                            # block_size can't be less than c unless it is zero
         
     | 
| 
       300 
     | 
    
         
            -
                            # both of these things make sense
         
     | 
| 
       301 
     | 
    
         
            -
                            A = np.hstack((A[:, old_b - b + c:old_b], B))
         
     | 
| 
       302 
     | 
    
         
            -
                            l_A += old_b - b + c
         
     | 
| 
       303 
     | 
    
         
            -
                        elif l_B == b0 and b > 0:
         
     | 
| 
       304 
     | 
    
         
            -
                            A = A[:, b0 - b:b0]
         
     | 
| 
       305 
     | 
    
         
            -
                            l_A = b0 - b
         
     | 
| 
       306 
     | 
    
         
            -
                        elif b == 0:  # no SNPs to left in window, e.g., after a sequence gap
         
     | 
| 
       307 
     | 
    
         
            -
                            A = np.array(()).reshape((n, 0))
         
     | 
| 
       308 
     | 
    
         
            -
                            l_A = l_B
         
     | 
| 
       309 
     | 
    
         
            -
                        if l_B == md:
         
     | 
| 
       310 
     | 
    
         
            -
                            c = m - md
         
     | 
| 
       311 
     | 
    
         
            -
                            rfuncAB = np.zeros((b, c))
         
     | 
| 
       312 
     | 
    
         
            -
                            rfuncBB = np.zeros((c, c))
         
     | 
| 
       313 
     | 
    
         
            -
                        if b != old_b:
         
     | 
| 
       314 
     | 
    
         
            -
                            rfuncAB = np.zeros((b, c))
         
     | 
| 
       315 
     | 
    
         
            -
                        # -
         
     | 
| 
       316 
     | 
    
         
            -
                        B = snp_getter(c)
         
     | 
| 
       317 
     | 
    
         
            -
                        p1 = np.all(annot[l_A:l_A + b, :] == 0)
         
     | 
| 
       318 
     | 
    
         
            -
                        p2 = np.all(annot[l_B:l_B + c, :] == 0)
         
     | 
| 
       319 
     | 
    
         
            -
                        if p1 and p2:
         
     | 
| 
       320 
     | 
    
         
            -
                            continue
         
     | 
| 
       321 
     | 
    
         
            -
                        # -
         
     | 
| 
       322 
     | 
    
         
            -
                        np.dot(A.T, B / n, out=rfuncAB)
         
     | 
| 
       323 
     | 
    
         
            -
                        rfuncAB = func(rfuncAB)
         
     | 
| 
       324 
     | 
    
         
            -
                        cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
         
     | 
| 
       325 
     | 
    
         
            -
                        cor_sum[l_B:l_B + c, :] += np.dot(annot[l_A:l_A + b, :].T, rfuncAB).T
         
     | 
| 
       326 
     | 
    
         
            -
                        np.dot(B.T, B / n, out=rfuncBB)
         
     | 
| 
       327 
     | 
    
         
            -
                        rfuncBB = func(rfuncBB)
         
     | 
| 
       328 
     | 
    
         
            -
                        cor_sum[l_B:l_B + c, :] += np.dot(rfuncBB, annot[l_B:l_B + c, :])
         
     | 
| 
       329 
     | 
    
         
            -
                    # -
         
     | 
| 
       330 
     | 
    
         
            -
                    return cor_sum
         
     | 
| 
       331 
     | 
    
         
            -
             
     | 
| 
       332 
     | 
    
         
            -
             
     | 
| 
       333 
     | 
    
         
            -
            class PlinkBEDFile(GenotypeArrayInMemory):
                '''
                Interface for Plink .bed format.

                Genotypes are held as a little-endian bitarray with two bits per
                individual, SNP after SNP; every SNP is padded to `self.nru`
                individuals (the sample count rounded up to a multiple of 4).
                '''

                def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
                    '''
                    Set up the two-bit decoding table, then delegate to the base class.

                    All arguments are forwarded unchanged to
                    GenotypeArrayInMemory.__init__.
                    '''
                    # Prefix codes used by bitarray.decode(); the value 9 is what
                    # nextSNPs treats as a missing genotype.
                    self._bedcode = {
                        2: ba.bitarray('11'),
                        9: ba.bitarray('10'),
                        1: ba.bitarray('01'),
                        0: ba.bitarray('00')
                    }
                    # NOTE(review): _bedcode is assigned before the base initialiser,
                    # presumably because that initialiser drives the reading/filtering
                    # hooks below -- confirm against GenotypeArrayInMemory.
                    GenotypeArrayInMemory.__init__(self, fname, n, snp_list, keep_snps=keep_snps, keep_indivs=keep_indivs,
                                                   mafMin=mafMin)

                # -
                def __read__(self, fname, m, n):
                    '''
                    Load the genotype bits of a Plink .bed file.

                    Parameters
                    ----------
                    fname : str
                        Path of the file; must end in '.bed'.
                    m : int
                        Number of SNPs (not used here; the length check uses self.m).
                    n : int
                        Number of individuals.

                    Returns
                    -------
                    (nru, geno) : (int, bitarray)
                        The padded number of individuals per SNP and the raw bits.

                    Raises
                    ------
                    ValueError
                        If the filename does not end in '.bed'.
                    IOError
                        If the magic number or mode byte is wrong, or the payload
                        length does not match 2 * self.m * nru bits.
                    '''
                    if not fname.endswith('.bed'):
                        raise ValueError('.bed filename must end in .bed')
                    # -
                    # The context manager closes the handle on every path, including
                    # the IOError exits below (the handle used to be leaked).
                    with open(fname, 'rb') as fh:
                        magicNumber = ba.bitarray(endian="little")
                        magicNumber.fromfile(fh, 2)
                        bedMode = ba.bitarray(endian="little")
                        bedMode.fromfile(fh, 1)
                        # each SNP is stored in whole bytes, i.e. 4 individuals per byte
                        e = (4 - n % 4) if n % 4 != 0 else 0
                        nru = n + e
                        self.nru = nru
                        # check magic number
                        if magicNumber != ba.bitarray('0011011011011000'):
                            raise IOError("Magic number from Plink .bed file not recognized")
                        # -
                        if bedMode != ba.bitarray('10000000'):
                            raise IOError("Plink .bed file must be in default SNP-major mode")
                        # everything after the 3 header bytes is genotype data
                        self.geno = ba.bitarray(endian="little")
                        self.geno.fromfile(fh)
                    # check file length
                    self.__test_length__(self.geno, self.m, self.nru)
                    return (self.nru, self.geno)

                # -
                def __test_length__(self, geno, m, nru):
                    '''
                    Raise IOError unless `geno` holds exactly 2 * m * nru bits.
                    '''
                    exp_len = 2 * m * nru
                    real_len = len(geno)
                    if real_len != exp_len:
                        s = "Plink .bed file has {n1} bits, expected {n2}"
                        raise IOError(s.format(n1=real_len, n2=exp_len))

                # -
                def __filter_indivs__(self, geno, keep_indivs, m, n):
                    '''
                    Keep only the individuals listed in `keep_indivs`.

                    Parameters
                    ----------
                    geno : bitarray
                        Genotype bits laid out with self.nru individuals per SNP.
                    keep_indivs : sequence of int
                        Indices of the individuals to retain, in output order.
                    m : int
                        Number of SNPs.
                    n : int
                        Current number of individuals (not used in the computation).

                    Returns
                    -------
                    (z, m, n_new) : (bitarray, int, int)
                        Re-packed bits, the unchanged SNP count and the new number of
                        individuals. self.nru is updated to the new padded count.
                    '''
                    n_new = len(keep_indivs)
                    pad = (4 - n_new % 4) if n_new % 4 != 0 else 0
                    nru_new = n_new + pad
                    nru = self.nru
                    z = ba.bitarray(m * 2 * nru_new, endian="little")
                    z.setall(0)
                    for new_idx, i in enumerate(keep_indivs):
                        # strided copy: both bits of individual i, across all SNPs
                        z[2 * new_idx::2 * nru_new] = geno[2 * i::2 * nru]
                        z[2 * new_idx + 1::2 * nru_new] = geno[2 * i + 1::2 * nru]
                    self.nru = nru_new
                    return (z, m, n_new)

                # -
                def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps):
                    '''
                    Keep the SNPs in `keep_snps` that pass the frequency filter.

                    Credit to Chris Chang and the Plink2 developers for this algorithm
                    Modified from plink_filter.c
                    https://github.com/chrchang/plink-ng/blob/master/plink_filter.c
                    Genotypes are read forwards (since we are cheating and using endian="little")
                    A := (genotype) & 1010...
                    B := (genotype) & 0101...
                    C := (A >> 1) & B
                    Then
                    a := A.count() = missing ct + hom major ct
                    b := B.count() = het ct + hom major ct
                    c := C.count() = hom major ct
                    Which implies that
                    missing ct = a - c
                    # of indivs with nonmissing genotype = n - a + c
                    major allele ct = b + c
                    major allele frequency = (b+c)/(2*(n-a+c))
                    het ct + missing ct = a + b - 2*c

                    Parameters
                    ----------
                    geno : bitarray
                        Genotype bits laid out with self.nru individuals per SNP.
                    m : int
                        Number of SNPs in `geno`; used only when keep_snps is None.
                    n : int
                        Number of individuals.
                    mafMin : float
                        A SNP is kept only if min(f, 1 - f) is strictly greater.
                    keep_snps : iterable of int or None
                        SNP indices to consider; None means all m SNPs.

                    Returns
                    -------
                    (y, m_poly, n, kept_snps, freq)
                        Bits of the retained SNPs, their count, the unchanged n, their
                        indices and the frequency f computed for each of them.
                    '''
                    nru = self.nru
                    m_poly = 0
                    y = ba.bitarray()
                    if keep_snps is None:
                        keep_snps = range(m)
                    kept_snps = []
                    freq = []
                    for j in keep_snps:
                        z = geno[2 * nru * j:2 * nru * (j + 1)]
                        A = z[0::2]
                        a = A.count()
                        B = z[1::2]
                        b = B.count()
                        c = (A & B).count()
                        major_ct = b + c  # number of copies of the major allele
                        n_nomiss = n - a + c  # number of individuals with nonmissing genotypes
                        f = major_ct / (2 * n_nomiss) if n_nomiss > 0 else 0
                        het_miss_ct = a + b - 2 * c  # remove SNPs that are only either het or missing
                        if np.minimum(f, 1 - f) > mafMin and het_miss_ct < n:
                            freq.append(f)
                            y += z
                            m_poly += 1
                            kept_snps.append(j)
                    # -
                    return (y, m_poly, n, kept_snps, freq)

                # -
                def nextSNPs(self, b, minorRef=None):
                    '''
                    Unpacks the binary array of genotypes and returns an n x b matrix of floats of
                    normalized genotypes for the next b SNPs, where n := number of samples.
                    Advances the internal SNP cursor (self._currentSNP) by b.

                    Parameters
                    ----------
                    b : int
                        Number of SNPs to return.
                    minorRef: bool, default None
                        Should we flip reference alleles so that the minor allele is the reference?
                        (This is useful for computing l1 w.r.t. minor allele).

                    Returns
                    -------
                    X : np.array with dtype float64 with shape (n, b), where n := number of samples
                        Matrix of genotypes normalized to mean zero and variance one. If minorRef is
                        not None, then the minor allele will be the positive allele (i.e., two copies
                        of the minor allele --> a positive number).

                    Raises
                    ------
                    TypeError
                        If b cannot be converted to an integer.
                    ValueError
                        If b <= 0 or more SNPs are requested than remain.
                    '''
                    # -
                    try:
                        b = int(b)
                        if b <= 0:
                            raise ValueError("b must be > 0")
                    except TypeError:
                        raise TypeError("b must be an integer")
                    # -
                    if self._currentSNP + b > self.m:
                        s = '{b} SNPs requested, {k} SNPs remain'
                        raise ValueError(s.format(b=b, k=(self.m - self._currentSNP)))
                    # -
                    c = self._currentSNP
                    n = self.n
                    nru = self.nru
                    packed = self.geno[2 * c * nru:2 * (c + b) * nru]
                    # np.fromiter consumes both a list and an iterator, so this does
                    # not depend on which of the two bitarray.decode() returns.
                    decoded = np.fromiter(packed.decode(self._bedcode), dtype="float64")
                    X = decoded.reshape((b, nru)).T
                    X = X[0:n, :]  # drop the padding individuals
                    Y = np.zeros(X.shape)
                    # normalize the SNPs and impute the missing one with the mean
                    for j in range(0, b):
                        newsnp = X[:, j]
                        ii = newsnp != 9
                        avg = np.mean(newsnp[ii])
                        newsnp[np.logical_not(ii)] = avg
                        denom = np.std(newsnp)
                        if denom == 0:
                            # monomorphic column: avoid dividing by zero
                            denom = 1
                        # -
                        if minorRef is not None and self.freq[self._currentSNP + j] > 0.5:
                            # flip the sign so the minor allele is the positive one
                            denom = denom * -1
                        # -
                        Y[:, j] = (newsnp - avg) / denom
                    # -
                    self._currentSNP += b
                    return Y
         
     | 
| 
       499 
     | 
    
         
            -
             
     | 
| 
       500 
     | 
    
         
            -
             
     | 
| 
       501 
     | 
    
         
            -
            class PlinkBEDFileWithR2Cache(PlinkBEDFile):
         
     | 
| 
       502 
     | 
    
         
            -
                def compute_r2_cache(self,
         
     | 
| 
       503 
     | 
    
         
            -
                                     block_left,
         
     | 
| 
       504 
     | 
    
         
            -
                                     output_cache_file_dir: Path,
         
     | 
| 
       505 
     | 
    
         
            -
                                     chunk_size=500_000_000,
         
     | 
| 
       506 
     | 
    
         
            -
                                     c=500,
         
     | 
| 
       507 
     | 
    
         
            -
                                     r2_threshold=1e-4,
         
     | 
| 
       508 
     | 
    
         
            -
                                     annot=None):
         
     | 
| 
       509 
     | 
    
         
            -
             
     | 
| 
       510 
     | 
    
         
            -
                    func = np.square
         
     | 
| 
       511 
     | 
    
         
            -
                    snp_getter = self.nextSNPs
         
     | 
| 
       512 
     | 
    
         
            -
                    data, rows, cols = [], [], []
         
     | 
| 
       513 
     | 
    
         
            -
             
     | 
| 
       514 
     | 
    
         
            -
                    def add_rfuncAB(rfuncAB, l_A, l_B):
         
     | 
| 
       515 
     | 
    
         
            -
                        non_zero_indices = np.nonzero(rfuncAB > r2_threshold)
         
     | 
| 
       516 
     | 
    
         
            -
                        data.extend(rfuncAB[non_zero_indices])
         
     | 
| 
       517 
     | 
    
         
            -
                        rows.extend(l_A + non_zero_indices[0])
         
     | 
| 
       518 
     | 
    
         
            -
                        cols.extend(l_B + non_zero_indices[1])
         
     | 
| 
       519 
     | 
    
         
            -
             
     | 
| 
       520 
     | 
    
         
            -
                    # def add_rfuncAB(rfuncAB, l_A, l_B):
         
     | 
| 
       521 
     | 
    
         
            -
                    #     # not need select non zero indices
         
     | 
| 
       522 
     | 
    
         
            -
                    #     data.extend(rfuncAB.flatten())
         
     | 
| 
       523 
     | 
    
         
            -
                    #     rows.extend(l_A + np.repeat(np.arange(rfuncAB.shape[0]), rfuncAB.shape[1]))
         
     | 
| 
       524 
     | 
    
         
            -
                    #     cols.extend(l_B + np.tile(np.arange(rfuncAB.shape[1]), rfuncAB.shape[0]))
         
     | 
| 
       525 
     | 
    
         
            -
             
     | 
| 
       526 
     | 
    
         
            -
                    # def add_rfuncBB(rfuncBB, l_B):
         
     | 
| 
       527 
     | 
    
         
            -
                    #     non_zero_indices = np.nonzero(rfuncBB)
         
     | 
| 
       528 
     | 
    
         
            -
                    #     data.extend(rfuncBB[non_zero_indices])
         
     | 
| 
       529 
     | 
    
         
            -
                    #     rows.extend(l_B + non_zero_indices[0])
         
     | 
| 
       530 
     | 
    
         
            -
                    #     cols.extend(l_B + non_zero_indices[1])
         
     | 
| 
       531 
     | 
    
         
            -
             
     | 
| 
       532 
     | 
    
         
            -
                    def add_rfuncBB(rfuncBB, l_B):
         
     | 
| 
       533 
     | 
    
         
            -
                        non_zero_indices = np.nonzero(rfuncBB > r2_threshold)
         
     | 
| 
       534 
     | 
    
         
            -
                        data.extend(rfuncBB[non_zero_indices])
         
     | 
| 
       535 
     | 
    
         
            -
                        rows.extend(l_B + non_zero_indices[0])
         
     | 
| 
       536 
     | 
    
         
            -
                        cols.extend(l_B + non_zero_indices[1])
         
     | 
| 
       537 
     | 
    
         
            -
                        if len(data) > chunk_size:
         
     | 
| 
       538 
     | 
    
         
            -
                            # save the cache
         
     | 
| 
       539 
     | 
    
         
            -
                            print(f'Start saving the cache file: {output_cache_file_dir / f"{l_B}.npz"}')
         
     | 
| 
       540 
     | 
    
         
            -
                            r2_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(self.m, self.m), dtype='float16')
         
     | 
| 
       541 
     | 
    
         
            -
                            save_npz(output_cache_file_dir / f'{l_B}.npz', r2_sparse_matrix)
         
     | 
| 
       542 
     | 
    
         
            -
                            # reset the data
         
     | 
| 
       543 
     | 
    
         
            -
                            data.clear()
         
     | 
| 
       544 
     | 
    
         
            -
                            rows.clear()
         
     | 
| 
       545 
     | 
    
         
            -
                            cols.clear()
         
     | 
| 
       546 
     | 
    
         
            -
             
     | 
| 
       547 
     | 
    
         
            -
                    m, n = self.m, self.n
         
     | 
| 
       548 
     | 
    
         
            -
                    block_sizes = np.array(np.arange(m) - block_left)
         
     | 
| 
       549 
     | 
    
         
            -
                    block_sizes = np.ceil(block_sizes / c) * c
         
     | 
| 
       550 
     | 
    
         
            -
                    if annot is None:
         
     | 
| 
       551 
     | 
    
         
            -
                        annot = np.ones((m, 1))
         
     | 
| 
       552 
     | 
    
         
            -
                    else:
         
     | 
| 
       553 
     | 
    
         
            -
                        annot_m = annot.shape[0]
         
     | 
| 
       554 
     | 
    
         
            -
                        if annot_m != self.m:
         
     | 
| 
       555 
     | 
    
         
            -
                            raise ValueError('Incorrect number of SNPs in annot')
         
     | 
| 
       556 
     | 
    
         
            -
                    # -
         
     | 
| 
       557 
     | 
    
         
            -
                    n_a = annot.shape[1]  # number of annotations
         
     | 
| 
       558 
     | 
    
         
            -
                    # cor_sum = np.zeros((m, n_a))
         
     | 
| 
       559 
     | 
    
         
            -
                    # b = index of first SNP for which SNP 0 is not included in LD Score
         
     | 
| 
       560 
     | 
    
         
            -
                    b = np.nonzero(block_left > 0)
         
     | 
| 
       561 
     | 
    
         
            -
                    if np.any(b):
         
     | 
| 
       562 
     | 
    
         
            -
                        b = b[0][0]
         
     | 
| 
       563 
     | 
    
         
            -
                    else:
         
     | 
| 
       564 
     | 
    
         
            -
                        b = m
         
     | 
| 
       565 
     | 
    
         
            -
                    b = int(np.ceil(b / c) * c)  # round up to a multiple of c
         
     | 
| 
       566 
     | 
    
         
            -
                    if b > m:
         
     | 
| 
       567 
     | 
    
         
            -
                        c = 1
         
     | 
| 
       568 
     | 
    
         
            -
                        b = m
         
     | 
| 
       569 
     | 
    
         
            -
             
     | 
| 
       570 
     | 
    
         
            -
                    l_A = 0  # l_A := index of leftmost SNP in matrix A
         
     | 
| 
       571 
     | 
    
         
            -
                    A = snp_getter(b)
         
     | 
| 
       572 
     | 
    
         
            -
                    rfuncAB = np.zeros((b, c))
         
     | 
| 
       573 
     | 
    
         
            -
                    rfuncBB = np.zeros((c, c))
         
     | 
| 
       574 
     | 
    
         
            -
                    # chunk inside of block
         
     | 
| 
       575 
     | 
    
         
            -
                    for l_B in np.arange(0, b, c):  # l_B := index of leftmost SNP in matrix B
         
     | 
| 
       576 
     | 
    
         
            -
                        B = A[:, l_B:l_B + c]
         
     | 
| 
       577 
     | 
    
         
            -
                        # ld matrix
         
     | 
| 
       578 
     | 
    
         
            -
                        np.dot(A.T, B / n, out=rfuncAB)
         
     | 
| 
       579 
     | 
    
         
            -
                        # ld matrix square
         
     | 
| 
       580 
     | 
    
         
            -
                        rfuncAB = func(rfuncAB)
         
     | 
| 
       581 
     | 
    
         
            -
                        add_rfuncAB(rfuncAB, l_A, l_B)
         
     | 
| 
       582 
     | 
    
         
            -
                        # cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
         
     | 
| 
       583 
     | 
    
         
            -
             
     | 
| 
       584 
     | 
    
         
            -
                    # chunk to right of block
         
     | 
| 
       585 
     | 
    
         
            -
                    b0 = b
         
     | 
| 
       586 
     | 
    
         
            -
                    md = int(c * np.floor(m / c))
         
     | 
| 
       587 
     | 
    
         
            -
                    end = md + 1 if md != m else md
         
     | 
| 
       588 
     | 
    
         
            -
                    for l_B in trange(b0, end, c, desc=f'Compute r2 cache for {output_cache_file_dir.name}'):
         
     | 
| 
       589 
     | 
    
         
            -
                        # check if the annot matrix is all zeros for this block + chunk
         
     | 
| 
       590 
     | 
    
         
            -
                        # this happens w/ sparse categories (i.e., pathways)
         
     | 
| 
       591 
     | 
    
         
            -
                        # update the block
         
     | 
| 
       592 
     | 
    
         
            -
                        old_b = b
         
     | 
| 
       593 
     | 
    
         
            -
                        b = int(block_sizes[l_B])
         
     | 
| 
       594 
     | 
    
         
            -
                        if l_B > b0 and b > 0:
         
     | 
| 
       595 
     | 
    
         
            -
                            # block_size can't increase more than c
         
     | 
| 
       596 
     | 
    
         
            -
                            # block_size can't be less than c unless it is zero
         
     | 
| 
       597 
     | 
    
         
            -
                            # both of these things make sense
         
     | 
| 
       598 
     | 
    
         
            -
                            A = np.hstack((A[:, old_b - b + c:old_b], B))
         
     | 
| 
       599 
     | 
    
         
            -
                            l_A += old_b - b + c
         
     | 
| 
       600 
     | 
    
         
            -
                        elif l_B == b0 and b > 0:
         
     | 
| 
       601 
     | 
    
         
            -
                            A = A[:, b0 - b:b0]
         
     | 
| 
       602 
     | 
    
         
            -
                            l_A = b0 - b
         
     | 
| 
       603 
     | 
    
         
            -
                        elif b == 0:  # no SNPs to left in window, e.g., after a sequence gap
         
     | 
| 
       604 
     | 
    
         
            -
                            A = np.array(()).reshape((n, 0))
         
     | 
| 
       605 
     | 
    
         
            -
                            l_A = l_B
         
     | 
| 
       606 
     | 
    
         
            -
                        if l_B == md:
         
     | 
| 
       607 
     | 
    
         
            -
                            c = m - md
         
     | 
| 
       608 
     | 
    
         
            -
                            rfuncAB = np.zeros((b, c))
         
     | 
| 
       609 
     | 
    
         
            -
                            rfuncBB = np.zeros((c, c))
         
     | 
| 
       610 
     | 
    
         
            -
                        if b != old_b:
         
     | 
| 
       611 
     | 
    
         
            -
                            rfuncAB = np.zeros((b, c))
         
     | 
| 
       612 
     | 
    
         
            -
                        # -
         
     | 
| 
       613 
     | 
    
         
            -
                        B = snp_getter(c)
         
     | 
| 
       614 
     | 
    
         
            -
                        p1 = np.all(annot[l_A:l_A + b, :] == 0)
         
     | 
| 
       615 
     | 
    
         
            -
                        p2 = np.all(annot[l_B:l_B + c, :] == 0)
         
     | 
| 
       616 
     | 
    
         
            -
                        if p1 and p2:
         
     | 
| 
       617 
     | 
    
         
            -
                            continue
         
     | 
| 
       618 
     | 
    
         
            -
                        # -
         
     | 
| 
       619 
     | 
    
         
            -
                        np.dot(A.T, B / n, out=rfuncAB)
         
     | 
| 
       620 
     | 
    
         
            -
                        rfuncAB = func(rfuncAB)
         
     | 
| 
       621 
     | 
    
         
            -
                        # cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
         
     | 
| 
       622 
     | 
    
         
            -
                        # cor_sum[l_B:l_B + c, :] += np.dot(annot[l_A:l_A + b, :].T, rfuncAB).T
         
     | 
| 
       623 
     | 
    
         
            -
                        add_rfuncAB(rfuncAB, l_A, l_B)
         
     | 
| 
       624 
     | 
    
         
            -
                        add_rfuncAB(rfuncAB.T, l_B, l_A)
         
     | 
| 
       625 
     | 
    
         
            -
                        np.dot(B.T, B / n, out=rfuncBB)
         
     | 
| 
       626 
     | 
    
         
            -
                        rfuncBB = func(rfuncBB)
         
     | 
| 
       627 
     | 
    
         
            -
                        # cor_sum[l_B:l_B + c, :] += np.dot(rfuncBB, annot[l_B:l_B + c, :])
         
     | 
| 
       628 
     | 
    
         
            -
                        add_rfuncBB(rfuncBB, l_B)
         
     | 
| 
       629 
     | 
    
         
            -
                    if len(data) > 0:
         
     | 
| 
       630 
     | 
    
         
            -
                        # save remaining data
         
     | 
| 
       631 
     | 
    
         
            -
                        # save the cache
         
     | 
| 
       632 
     | 
    
         
            -
                        print(f'Start saving the cache file: {output_cache_file_dir / f"{l_B}.npz"}')
         
     | 
| 
       633 
     | 
    
         
            -
                        r2_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(m, m), dtype='float16')
         
     | 
| 
       634 
     | 
    
         
            -
                        save_npz(output_cache_file_dir / f'{l_B}.npz', r2_sparse_matrix)
         
     | 
| 
       635 
     | 
    
         
            -
                    # combine the cache files
         
     | 
| 
       636 
     | 
    
         
            -
                    print(f'Start combining the cache files in {output_cache_file_dir}')
         
     | 
| 
       637 
     | 
    
         
            -
                    cached_r2_matrix_files = list(output_cache_file_dir.glob('*.npz'))
         
     | 
| 
       638 
     | 
    
         
            -
                    combined_r2_matrix_files = self.load_r2_matrix_from_cache_files(output_cache_file_dir)
         
     | 
| 
       639 
     | 
    
         
            -
                    # remove the cache files
         
     | 
| 
       640 
     | 
    
         
            -
                    for cached_r2_matrix_file in cached_r2_matrix_files:
         
     | 
| 
       641 
     | 
    
         
            -
                        cached_r2_matrix_file.unlink()
         
     | 
| 
       642 
     | 
    
         
            -
                    # save the combined r2 matrix
         
     | 
| 
       643 
     | 
    
         
            -
                    print(f'Start saving the combined r2 matrix in {output_cache_file_dir}')
         
     | 
| 
       644 
     | 
    
         
            -
                    combined_r2_matrix_file = output_cache_file_dir / 'combined_r2_matrix.npz'
         
     | 
| 
       645 
     | 
    
         
            -
                    save_npz(combined_r2_matrix_file, combined_r2_matrix_files)
         
     | 
| 
       646 
     | 
    
         
            -
             
     | 
| 
       647 
     | 
    
         
            -
                def get_ldscore_using_r2_cache(self, annot_matrix, cached_r2_matrix_dir):
                    """
                    Multiply every cached r2 matrix with annot_matrix and sum the products.

                    Parameters
                    ----------
                    annot_matrix
                        Right-hand operand of each product; ``annot_matrix.shape[1]`` fixes
                        the number of columns of the result.
                    cached_r2_matrix_dir : str or Path
                        Directory searched (non-recursively) for ``*.npz`` files.

                    Returns
                    -------
                    numpy.ndarray
                        Array of shape ``(self.m, annot_matrix.shape[1])`` holding the sum of
                        ``r2_matrix.dot(annot_matrix)`` over all files found.

                    Raises
                    ------
                    AssertionError
                        If the directory contains no ``*.npz`` file.
                    """
                    cached_r2_matrix_dir = Path(cached_r2_matrix_dir)
                    cached_r2_matrix_files = list(cached_r2_matrix_dir.glob('*.npz'))
                    # Raised explicitly instead of via `assert` so the check is not stripped by
                    # `python -O` (which would make this method silently return zeros). The
                    # exception type is kept as AssertionError for existing callers.
                    if not cached_r2_matrix_files:
                        raise AssertionError(f'No cached r2 matrix files in {cached_r2_matrix_dir}. '
                                             f'Please run the function compute_r2_cache first!')
                    # accumulate the per-file products
                    result_matrix = np.zeros((self.m, annot_matrix.shape[1]))
                    for r2_matrix_file in tqdm(cached_r2_matrix_files, desc=f'Compute ld score for {cached_r2_matrix_dir.name}'):
                        print(f'Compute r2 matrix multiplication for {r2_matrix_file}')
                        r2_matrix = load_npz(r2_matrix_file)
                        result_matrix += r2_matrix.dot(annot_matrix)
                    return result_matrix
         
     | 
| 
       663 
     | 
    
         
            -
             
     | 
| 
       664 
     | 
    
         
            -
                def load_r2_matrix_from_cache_files(self, cached_r2_matrix_dir):
                    """
                    Load every ``*.npz`` file in a cache directory and add them together.

                    Parameters
                    ----------
                    cached_r2_matrix_dir : str or Path
                        Directory searched (non-recursively) for ``*.npz`` files.

                    Returns
                    -------
                    The sum of all loaded matrices, cast to ``float16``.

                    Raises
                    ------
                    AssertionError
                        If the directory contains no ``*.npz`` file.
                    """
                    cached_r2_matrix_dir = Path(cached_r2_matrix_dir)
                    cached_r2_matrix_files = list(cached_r2_matrix_dir.glob('*.npz'))
                    # Raised explicitly instead of via `assert` so the check is not stripped by
                    # `python -O` (which would turn an empty directory into a bare IndexError
                    # below). The exception type is kept as AssertionError for existing callers.
                    if not cached_r2_matrix_files:
                        raise AssertionError(f'No cached r2 matrix files in {cached_r2_matrix_dir}. '
                                             f'Please run the function compute_r2_cache first!')
                    # start from the first file, then add the remaining ones onto it
                    r2_matrix = load_npz(cached_r2_matrix_files[0])
                    for r2_matrix_file in tqdm(cached_r2_matrix_files[1:], desc=f'Load r2 matrix from {cached_r2_matrix_dir.name}'):
                        print(f'Load r2 matrix from {r2_matrix_file}')
                        r2_matrix += load_npz(r2_matrix_file)
                    # to float16
                    r2_matrix = r2_matrix.astype('float16')
                    return r2_matrix
         
     | 
| 
       681 
     | 
    
         
            -
                def load_combined_r2_matrix(self, cached_r2_matrix_dir):
                    """
                    Load ``combined_r2_matrix.npz`` from a cache directory.

                    Parameters
                    ----------
                    cached_r2_matrix_dir : str or Path
                        Directory expected to contain ``combined_r2_matrix.npz``.

                    Returns
                    -------
                    The loaded matrix, cast to ``float16``.

                    Raises
                    ------
                    AssertionError
                        If ``combined_r2_matrix.npz`` does not exist in the directory.
                    """
                    combined_r2_matrix_file = Path(cached_r2_matrix_dir) / 'combined_r2_matrix.npz'
                    # Raised explicitly instead of via `assert` so the check is not stripped by
                    # `python -O`. The exception type is kept as AssertionError for existing
                    # callers.
                    if not combined_r2_matrix_file.exists():
                        raise AssertionError(f'No combined r2 matrix file in {cached_r2_matrix_dir}. '
                                             f'Should delete the cache files and run the function compute_r2_cache first!')
                    # load the r2 matrix
                    r2_matrix = load_npz(combined_r2_matrix_file)
                    # to float16
                    r2_matrix = r2_matrix.astype('float16')
                    return r2_matrix
         
     | 
| 
       693 
     | 
    
         
            -
             
     | 
| 
       694 
     | 
    
         
            -
            def load_bfile(bfile_chr_prefix):
                """
                Read the ``.bim``, ``.fam`` and ``.bed`` files that share ``bfile_chr_prefix``.

                Returns the tuple ``(array_snps, array_indivs, geno_array)``.
                """
                # reader classes for the SNP table and the individual table
                bim_reader = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
                fam_reader = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])

                # SNP list
                snp_file = bfile_chr_prefix + '.bim'
                array_snps = bim_reader(snp_file)
                m = len(array_snps.IDList)
                print(f'Read list of {m} SNPs from {snp_file}')

                # individual list
                ind_file = bfile_chr_prefix + '.fam'
                array_indivs = fam_reader(ind_file)
                n = len(array_indivs.IDList)
                print(f'Read list of {n} individuals from {ind_file}')

                # genotype array, with no SNP / individual / MAF filtering requested
                geno_array = PlinkBEDFileWithR2Cache(
                    bfile_chr_prefix + '.bed',
                    n,
                    array_snps,
                    keep_snps=None,
                    keep_indivs=None,
                    mafMin=None,
                )

                return array_snps, array_indivs, geno_array
         
     | 
| 
       714 
     | 
    
         
            -
             
     | 
| 
       715 
     | 
    
         
            -
             
     | 
| 
       716 
     | 
    
         
            -
            def generate_r2_matrix_chr_cache(bfile_chr_prefix, ld_wind_cm, output_cache_file_dir):
                """
                Load the genotype data for ``bfile_chr_prefix``, compute the block lefts
                from column 3 of ``geno_array.df`` and ``ld_wind_cm``, then call
                ``geno_array.load_r2_matrix_from_cache(output_cache_file_dir)``.

                Returns None.

                NOTE(review): the block lefts and the loaded matrix are both discarded here;
                confirm whether a cache-building call that consumes the block lefts was
                intended instead.
                """
                # only the genotype array of the returned triple is needed
                _, _, geno_array = load_bfile(bfile_chr_prefix)
                # Compute block lefts (result unused; call kept so behaviour is unchanged)
                getBlockLefts(geno_array.df[:, 3], ld_wind_cm)
                # result intentionally not bound -- see NOTE(review) above
                geno_array.load_r2_matrix_from_cache(output_cache_file_dir)
         
     | 
| 
       723 
     | 
    
         
            -
             
     | 
| 
       724 
     | 
    
         
            -
             
     | 
| 
       725 
     | 
    
         
            -
            def generate_r2_matrix_cache(bfile_prefix, chromosome_list, r2_cache_dir, ld_wind_cm=1):
                """
                Run ``generate_r2_matrix_chr_cache`` once per entry of ``chromosome_list``.

                For each chromosome the directory ``<r2_cache_dir>/chr<chromosome>`` is created
                if missing and passed as the output directory; the per-chromosome input prefix
                is ``<bfile_prefix>.<chromosome>``.
                """
                cache_root = Path(r2_cache_dir)

                for chrom in chromosome_list:
                    chrom_cache_dir = cache_root / f'chr{chrom}'
                    chrom_cache_dir.mkdir(parents=True, exist_ok=True)
                    generate_r2_matrix_chr_cache(
                        bfile_prefix + '.' + str(chrom),
                        ld_wind_cm=ld_wind_cm,
                        output_cache_file_dir=chrom_cache_dir,
                    )
                    print(f'Compute r2 matrix for chr{chrom} done!')
         
     | 
| 
      
 1 
     | 
    
         
            +
            from pathlib import Path
         
     | 
| 
      
 2 
     | 
    
         
            +
            import bitarray as ba
         
     | 
| 
      
 3 
     | 
    
         
            +
            import numpy as np
         
     | 
| 
      
 4 
     | 
    
         
            +
            import pandas as pd
         
     | 
| 
      
 5 
     | 
    
         
            +
            from scipy.sparse import csr_matrix
         
     | 
| 
      
 6 
     | 
    
         
            +
            from scipy.sparse import save_npz, load_npz
         
     | 
| 
      
 7 
     | 
    
         
            +
            from tqdm import trange, tqdm
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
             
     | 
| 
      
 10 
     | 
    
         
            +
            # Define the log class
         
     | 
| 
      
 11 
     | 
    
         
            +
            class Logger(object):
                """
                Write messages to a log file and echo them to stdout.

                Instances may be used as context managers (``with Logger(path) as log:``)
                so that the log file is closed even if the body raises.
                """

                # -
                def __init__(self, fh):
                    # fh is handed to open(); mode 'w' truncates an existing file.
                    # The encoding is pinned so that non-ASCII messages do not depend on
                    # the platform's locale default.
                    self.log_fh = open(fh, 'w', encoding='utf-8')

                # -
                def __enter__(self):
                    return self

                # -
                def __exit__(self, exc_type, exc_value, traceback):
                    self.close()
                    # never suppress an exception raised inside the with-block
                    return False

                # -
                def log(self, msg):
                    '''
                    Print to log file and stdout.
                    '''
                    print(msg, file=self.log_fh)
                    print(msg)

                # -
                def close(self):
                    '''
                    Close the underlying log file.
                    '''
                    self.log_fh.close()
         
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
      
 29 
     | 
    
         
            +
            # Compute ld-score using cellular annotations
         
     | 
| 
      
 30 
     | 
    
         
            +
            def get_compression(fh):
                '''Return the read_csv compression name implied by the ending of fh, or None.'''
                # checked in order: a name ending in 'gz' wins before 'bz2' is considered
                for ending, compression in (('gz', 'gzip'), ('bz2', 'bz2')):
                    if fh.endswith(ending):
                        return compression
                return None
         
     | 
| 
      
 40 
     | 
    
         
            +
             
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
      
 42 
     | 
    
         
            +
            # Define the reading functions
         
     | 
| 
      
 43 
     | 
    
         
            +
            def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
                """
                Build a reader class for whitespace-delimited ID tables.

                Parameters
                ----------
                colnames : list of str or None
                    Column names assigned to the parsed table. When falsy, the names
                    produced by ``pd.read_csv`` are kept.
                keepcol : int or None
                    Positional index of the column stored as ``IDList``. ``None`` skips
                    building ``IDList``.
                fname_end : str or None
                    Required filename suffix. When falsy, the filename is not checked.
                header : int or None
                    Forwarded to ``pd.read_csv`` as ``header``.
                usecols : list or None
                    Forwarded to ``pd.read_csv`` as ``usecols``.

                Returns
                -------
                type
                    The ``IDContainer`` class, closed over the options above.
                """

                class IDContainer(object):
                    """
                    Read a table from a file, keep it as a DataFrame, and expose a
                    left-outer-join helper against an external DataFrame.
                    """

                    def __init__(self, fname):
                        """
                        Copy the factory options onto the instance, read ``fname`` and
                        record the number of rows in ``self.n``.
                        """
                        self.usecols = usecols
                        self.colnames = colnames
                        self.keepcol = keepcol
                        self.fname_end = fname_end
                        self.header = header
                        self.read(fname)
                        self.n = len(self.df)

                    def read(self, fname):
                        """
                        Parse ``fname`` into ``self.df`` (and ``self.IDList`` when
                        ``keepcol`` is set).

                        Raises
                        ------
                        ValueError
                            If a suffix is required and ``fname`` does not end with it.
                        """
                        end = self.fname_end
                        if end and not fname.endswith(end):
                            raise ValueError('{f} filename must end in {f}'.format(f=end))
                        comp = get_compression(fname)
                        # Raw string: '\s' in a plain literal is an invalid escape sequence.
                        self.df = pd.read_csv(fname, header=self.header, usecols=self.usecols,
                                              sep=r'\s+', compression=comp)
                        if self.colnames:
                            self.df.columns = self.colnames
                        if self.keepcol is not None:
                            # Double brackets keep IDList a one-column DataFrame.
                            self.IDList = self.df.iloc[:, [self.keepcol]].astype('object')

                    def loj(self, externalDf):
                        """
                        Left-outer-join ``self.IDList`` with the first column of
                        ``externalDf``.

                        Returns
                        -------
                        np.ndarray
                            Row positions of the merged table whose ID was found in
                            ``externalDf``.
                        """
                        r = externalDf.columns[0]
                        l = self.IDList.columns[0]
                        # Explicit copy so adding the marker column never touches (or
                        # warns about) the caller's DataFrame.
                        merge_df = externalDf.iloc[:, [0]].copy()
                        merge_df['keep'] = True
                        z = pd.merge(self.IDList, merge_df, how='left', left_on=l, right_on=r,
                                     sort=False)
                        # Unmatched rows carry NaN in 'keep'; matched rows carry True.
                        matched = z['keep'].notna().to_numpy()
                        return np.nonzero(matched)[0]

                return IDContainer
         
     | 
| 
      
 94 
     | 
    
         
            +
             
     | 
| 
      
 95 
     | 
    
         
            +
             
     | 
| 
      
 96 
     | 
    
         
            +
            def getBlockLefts(coords, max_dist):
                '''
                Map each SNP to the index of the leftmost SNP that shares its window.

                Parameters
                ----------
                coords : array
                    Sorted array of SNP coordinates.
                max_dist : float
                    Largest coordinate distance allowed between two SNPs of one window.

                Returns
                -------
                block_left : 1D np.ndarray with same length as coords
                    block_left[i] is the smallest index k (scanning left to right) with
                    abs(coords[k] - coords[i]) <= max_dist.
                '''
                n_snps = len(coords)
                lefts = np.zeros(n_snps)
                left = 0
                for idx in range(n_snps):
                    # The left pointer never moves backwards, so the scan is linear overall.
                    while left < n_snps and abs(coords[left] - coords[idx]) > max_dist:
                        left += 1

                    lefts[idx] = left
                return lefts
         
     | 
| 
      
 120 
     | 
    
         
            +
             
     | 
| 
      
 121 
     | 
    
         
            +
             
     | 
| 
      
 122 
     | 
    
         
            +
            def block_left_to_right(block_left):
                '''
                Derive right window boundaries from left window boundaries.

                Parameters
                ----------
                block_left : array
                    Array of block lefts.

                Returns
                -------
                block_right : 1D np.ndarray with same length as block_left
                    block_right[i] is the first index k (scanning left to right) with
                    block_left[k] > i, or len(block_left) when no such index exists.
                '''
                total = len(block_left)
                rights = np.zeros(total)
                right = 0
                for idx in range(total):
                    # Advance past every SNP whose window still reaches back to idx.
                    while right < total and block_left[right] <= idx:
                        right += 1
                    rights[idx] = right

                return rights
         
     | 
| 
      
 143 
     | 
    
         
            +
             
     | 
| 
      
 144 
     | 
    
         
            +
             
     | 
| 
      
 145 
     | 
    
         
            +
            class GenotypeArrayInMemory(object):
                '''
                Parent class for various classes containing interfaces for files with genotype
                matrices, e.g., plink .bed files, etc

                Subclasses are expected to provide ``__read__``, ``__filter_indivs__``,
                ``__filter_snps_maf__`` and a ``nextSNPs`` method (used by
                ``ldScoreVarBlocks``).
                '''

                def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
                    '''
                    Load the genotype data and apply individual, SNP and MAF filters.

                    Parameters
                    ----------
                    fname : str
                        Genotype file name, forwarded to ``__read__``.
                    n : int
                        Number of individuals in the file.
                    snp_list : object
                        Must expose ``IDList`` (its length gives the SNP count) and ``df``
                        with the columns 'CHR', 'SNP', 'BP' and 'CM'.
                    keep_snps : array-like of int or None
                        Zero-based SNP indices to keep; ``None`` keeps all.
                    keep_indivs : array-like of int or None
                        Zero-based individual indices to keep; ``None`` keeps all.
                    mafMin : float or None
                        Minimum minor allele frequency; ``None`` is treated as 0.

                    Raises
                    ------
                    ValueError
                        If an index is out of bounds, or if no individuals / SNPs remain
                        after filtering.
                    '''
                    self.m = len(snp_list.IDList)
                    self.n = n
                    self.keep_snps = keep_snps
                    self.keep_indivs = keep_indivs
                    self.df = np.array(snp_list.df[['CHR', 'SNP', 'BP', 'CM']])
                    self.colnames = ['CHR', 'SNP', 'BP', 'CM']
                    self.mafMin = mafMin if mafMin is not None else 0
                    self._currentSNP = 0
                    (self.nru, self.geno) = self.__read__(fname, self.m, n)
                    # filter individuals
                    if keep_indivs is not None:
                        keep_indivs = np.array(keep_indivs, dtype='int')
                        # Indices are zero-based, so an index equal to n is already invalid.
                        if np.any(keep_indivs >= self.n):
                            raise ValueError('keep_indivs indices out of bounds')
                        # -
                        (self.geno, self.m, self.n) = self.__filter_indivs__(self.geno, keep_indivs, self.m, self.n)
                        # -
                        if self.n > 0:
                            print('After filtering, {n} individuals remain'.format(n=self.n))
                        else:
                            raise ValueError('After filtering, no individuals remain')
                    # -
                    # filter SNPs
                    if keep_snps is not None:
                        keep_snps = np.array(keep_snps, dtype='int')
                        # Indices are zero-based, so an index equal to m is already invalid.
                        if np.any(keep_snps >= self.m):
                            raise ValueError('keep_snps indices out of bounds')
                    # -
                    # The MAF filter always runs; keep_snps may be None here.
                    (self.geno, self.m, self.n, self.kept_snps, self.freq) = self.__filter_snps_maf__(
                        self.geno, self.m, self.n, self.mafMin, keep_snps)
                    # -
                    if self.m > 0:
                        print('After filtering, {m} SNPs remain'.format(m=self.m))
                    else:
                        raise ValueError('After filtering, no SNPs remain')
                    # -
                    # Restrict the SNP table to the surviving SNPs and append their MAF.
                    self.df = self.df[self.kept_snps, :]
                    self.maf = np.minimum(self.freq, np.ones(self.m) - self.freq)
                    self.sqrtpq = np.sqrt(self.freq * (np.ones(self.m) - self.freq))
                    self.df = np.c_[self.df, self.maf]
                    self.colnames.append('MAF')

                # -
                def __read__(self, fname, m, n):
                    '''Read the genotype file; must return ``(nru, geno)``. Implemented by subclasses.'''
                    raise NotImplementedError

                def __restart__(self):
                    '''Reset the SNP cursor to the first SNP.'''
                    self._currentSNP = 0

                # -
                def __filter_indivs__(self, geno, keep_indivs, m, n):
                    '''Keep only ``keep_indivs``; must return ``(geno, m, n)``. Implemented by subclasses.'''
                    raise NotImplementedError

                # -
                def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps):
                    '''
                    Filter SNPs by index and MAF; must return
                    ``(geno, m, n, kept_snps, freq)``. Implemented by subclasses.
                    '''
                    raise NotImplementedError

                # -
                def __filter_maf_(self, geno, m, n, maf):
                    '''Legacy abstract stub, not called within this class.'''
                    raise NotImplementedError

                # -
                def ldScoreVarBlocks(self, block_left, c, annot=None):
                    '''Computes an unbiased estimate of L2(j) for j=1,..,M.'''
                    func = lambda x: self.__l2_unbiased__(x, self.n)
                    snp_getter = self.nextSNPs
                    return self.__corSumVarBlocks__(block_left, c, func, snp_getter, annot)

                # -
                # In small samples, the observed r^2 tends to be higher than the true r^2 due to sampling variability.
                # The bias correction term (1-sq) / denom adjusts for this bias by subtracting a small value that depends on the sample size and the observed r^2.
                def __l2_unbiased__(self, x, n):
                    '''Return ``x**2 - (1 - x**2) / denom`` with ``denom = n - 2`` (or ``n`` when ``n <= 2``).'''
                    denom = n - 2 if n > 2 else n  # allow n<2 for testing purposes
                    sq = np.square(x)
                    return sq - (1 - sq) / denom

                # -
                # Methods for calculating sums of Pearson correlation coefficients (i.e.,ld-score)
                # c stands for the chunk size (default = 50)
                def __corSumVarBlocks__(self, block_left, c, func, snp_getter, annot=None):
                    '''
                    Parameters
                    ----------
                    block_left : np.ndarray with shape (M, )
                        block_left[i] = index of leftmost SNP included in LD Score of SNP i.
                        if c > 1, then only entries that are multiples of c are examined, and it is
                        assumed that block_left[a*c+i] = block_left[a*c], except at
                        the beginning of the chromosome where the 0th SNP is included in the window.
                    c : int
                        Chunk size.
                    func : function
                        Function to be applied to the genotype correlation matrix. Before dotting with
                        annot. Examples: for biased L2, np.square. For biased L4,
                        lambda x: np.square(np.square(x)). For L1, lambda x: x.
                    snp_getter : function(int)
                        The method to be used to get the next SNPs
                    annot: numpy array with shape (m,n_a)
                        SNP annotations.
                    Returns
                    -------
                    cor_sum : np.ndarray with shape (M, num_annots)
                        Estimates.
                    Raises
                    ------
                    ValueError
                        If ``annot`` does not have one row per SNP.
                    '''
                    m, n = self.m, self.n
                    # Window width to the left of every SNP, rounded up to whole chunks.
                    block_sizes = np.array(np.arange(m) - block_left)
                    block_sizes = np.ceil(block_sizes / c) * c
                    if annot is None:
                        annot = np.ones((m, 1))
                    else:
                        annot_m = annot.shape[0]
                        if annot_m != self.m:
                            raise ValueError('Incorrect number of SNPs in annot')
                    # -
                    n_a = annot.shape[1]  # number of annotations
                    cor_sum = np.zeros((m, n_a))
                    # b = index of first SNP for which SNP 0 is not included in LD Score
                    b = np.nonzero(block_left > 0)
                    if np.any(b):
                        b = b[0][0]
                    else:
                        b = m
                    b = int(np.ceil(b / c) * c)  # round up to a multiple of c
                    if b > m:
                        c = 1
                        b = m

                    l_A = 0  # l_A := index of leftmost SNP in matrix A
                    A = snp_getter(b)
                    # Scratch buffers reused as the `out=` target of np.dot.
                    rfuncAB = np.zeros((b, c))
                    rfuncBB = np.zeros((c, c))
                    # chunk inside of block
                    for l_B in np.arange(0, b, c):  # l_B := index of leftmost SNP in matrix B
                        B = A[:, l_B:l_B + c]
                        # ld matrix
                        np.dot(A.T, B / n, out=rfuncAB)
                        # ld matrix square
                        rfuncAB = func(rfuncAB)
                        cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])

                    # chunk to right of block
                    b0 = b
                    md = int(c * np.floor(m / c))  # largest multiple of c not exceeding m
                    end = md + 1 if md != m else md  # include the final partial chunk, if any
                    for l_B in tqdm(np.arange(b0, end, c), desc='Compute SNP Gene Weight'):
                        # check if the annot matrix is all zeros for this block + chunk
                        # this happens w/ sparse categories (i.e., pathways)
                        # update the block
                        old_b = b
                        b = int(block_sizes[l_B])
                        if l_B > b0 and b > 0:
                            # block_size can't increase more than c
                            # block_size can't be less than c unless it is zero
                            # both of these things make sense
                            A = np.hstack((A[:, old_b - b + c:old_b], B))
                            l_A += old_b - b + c
                        elif l_B == b0 and b > 0:
                            A = A[:, b0 - b:b0]
                            l_A = b0 - b
                        elif b == 0:  # no SNPs to left in window, e.g., after a sequence gap
                            A = np.array(()).reshape((n, 0))
                            l_A = l_B
                        if l_B == md:
                            # Last, shorter chunk: shrink c and the scratch buffers to match.
                            c = m - md
                            rfuncAB = np.zeros((b, c))
                            rfuncBB = np.zeros((c, c))
                        if b != old_b:
                            rfuncAB = np.zeros((b, c))
                        # -
                        # B is fetched before the skip below so the getter is called on every pass.
                        B = snp_getter(c)
                        p1 = np.all(annot[l_A:l_A + b, :] == 0)
                        p2 = np.all(annot[l_B:l_B + c, :] == 0)
                        if p1 and p2:
                            continue
                        # -
                        np.dot(A.T, B / n, out=rfuncAB)
                        rfuncAB = func(rfuncAB)
                        # Cross terms are added in both directions: A rows gain from B and vice versa.
                        cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
                        cor_sum[l_B:l_B + c, :] += np.dot(annot[l_A:l_A + b, :].T, rfuncAB).T
                        np.dot(B.T, B / n, out=rfuncBB)
                        rfuncBB = func(rfuncBB)
                        cor_sum[l_B:l_B + c, :] += np.dot(rfuncBB, annot[l_B:l_B + c, :])
                    # -
                    return cor_sum
         
     | 
| 
      
 331 
     | 
    
         
            +
             
     | 
| 
      
 332 
     | 
    
         
            +
             
     | 
| 
      
 333 
     | 
    
         
            +
            class PlinkBEDFile(GenotypeArrayInMemory):
                '''
                Interface for Plink .bed format

                Genotypes are held as one packed bitarray with two bits per individual.
                Every SNP occupies ``2 * self.nru`` bits, where ``nru`` is the number of
                individuals rounded up to a multiple of 4 (so each SNP ends on a byte
                boundary).

                NOTE(review): ``self.m``, ``self.n``, ``self.freq`` and
                ``self._currentSNP`` are read here but never assigned in this class;
                presumably the base class sets them and drives the ``__read__`` /
                ``__filter_*__`` hooks -- confirm against GenotypeArrayInMemory.
                '''

                def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
                    # Prefix code handed to bitarray.decode(): each 2-bit pattern (in the
                    # order the bits are read) maps to a genotype value. 9 is the marker
                    # that nextSNPs() treats as a missing genotype.
                    self._bedcode = {
                        2: ba.bitarray('11'),
                        9: ba.bitarray('10'),
                        1: ba.bitarray('01'),
                        0: ba.bitarray('00')
                    }
                    # The code table is set before delegating so that it already exists
                    # whenever the base-class initializer needs it.
                    GenotypeArrayInMemory.__init__(self, fname, n, snp_list, keep_snps=keep_snps, keep_indivs=keep_indivs,
                                                   mafMin=mafMin)

                def __read__(self, fname, m, n):
                    '''
                    Load the packed genotypes from a Plink .bed file.

                    Parameters
                    ----------
                    fname : str
                        Path of the file; must end in '.bed'.
                    m : int
                        Number of SNPs (unused here; the length check uses ``self.m``).
                    n : int
                        Number of individuals stored in the file.

                    Returns
                    -------
                    (nru, geno) : tuple
                        ``nru`` is ``n`` rounded up to a multiple of 4, ``geno`` is the
                        little-endian bitarray holding everything after the 3-byte header.

                    Raises
                    ------
                    ValueError
                        If ``fname`` does not end in '.bed'.
                    IOError
                        If the magic number or the mode byte is not the expected one, or
                        if the amount of genotype data does not match ``self.m`` SNPs.
                    '''
                    if not fname.endswith('.bed'):
                        raise ValueError('.bed filename must end in .bed')

                    # The context manager guarantees the handle is closed on success and
                    # on every error path below.
                    with open(fname, 'rb') as fh:
                        # 3-byte header: 2 bytes of magic number, 1 byte of mode.
                        magicNumber = ba.bitarray(endian="little")
                        magicNumber.fromfile(fh, 2)
                        bedMode = ba.bitarray(endian="little")
                        bedMode.fromfile(fh, 1)

                        # Pad the individual count up to the next multiple of 4.
                        padding = (4 - n % 4) if n % 4 != 0 else 0
                        self.nru = n + padding

                        # check magic number
                        if magicNumber != ba.bitarray('0011011011011000'):
                            raise IOError("Magic number from Plink .bed file not recognized")

                        if bedMode != ba.bitarray('10000000'):
                            raise IOError("Plink .bed file must be in default SNP-major mode")

                        # Read the remainder of the file, then check its length.
                        self.geno = ba.bitarray(endian="little")
                        self.geno.fromfile(fh)

                    self.__test_length__(self.geno, self.m, self.nru)
                    return (self.nru, self.geno)

                def __test_length__(self, geno, m, nru):
                    '''
                    Raise IOError unless ``geno`` holds exactly ``2 * m * nru`` bits,
                    i.e. two bits for each of ``nru`` padded individuals at each of
                    ``m`` SNPs.
                    '''
                    exp_len = 2 * m * nru
                    real_len = len(geno)
                    if real_len != exp_len:
                        s = "Plink .bed file has {n1} bits, expected {n2}"
                        raise IOError(s.format(n1=real_len, n2=exp_len))

                def __filter_indivs__(self, geno, keep_indivs, m, n):
                    '''
                    Build a new packed genotype array containing only ``keep_indivs``.

                    Parameters
                    ----------
                    geno : bitarray
                        Packed genotypes laid out with the current ``self.nru`` stride.
                    keep_indivs : sequence of int
                        Indices of the individuals to keep, in output order.
                    m : int
                        Number of SNPs in ``geno``.
                    n : int
                        Current number of individuals (unused).

                    Returns
                    -------
                    (z, m, n_new) : tuple
                        The filtered bitarray, the unchanged SNP count, and the number
                        of kept individuals. ``self.nru`` is updated to the new padded
                        individual count as a side effect.
                    '''
                    n_new = len(keep_indivs)
                    padding = (4 - n_new % 4) if n_new % 4 != 0 else 0
                    nru_new = n_new + padding
                    nru = self.nru
                    z = ba.bitarray(m * 2 * nru_new, endian="little")
                    z.setall(0)
                    for new_idx, old_idx in enumerate(keep_indivs):
                        # Strided slices copy one individual's bit across all SNPs at
                        # once: first bit of the pair, then second bit of the pair.
                        z[2 * new_idx::2 * nru_new] = geno[2 * old_idx::2 * nru]
                        z[2 * new_idx + 1::2 * nru_new] = geno[2 * old_idx + 1::2 * nru]
                    self.nru = nru_new
                    return (z, m, n_new)

                def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps):
                    '''
                    Credit to Chris Chang and the Plink2 developers for this algorithm
                    Modified from plink_filter.c
                    https://github.com/chrchang/plink-ng/blob/master/plink_filter.c
                    Genotypes are read forwards (since we are cheating and using endian="little")
                    A := (genotype) & 1010...
                    B := (genotype) & 0101...
                    C := (A >> 1) & B
                    Then
                    a := A.count() = missing ct + hom major ct
                    b := B.count() = het ct + hom major ct
                    c := C.count() = hom major ct
                    Which implies that
                    missing ct = a - c
                    # of indivs with nonmissing genotype = n - a + c
                    major allele ct = b + c
                    major allele frequency = (b+c)/(2*(n-a+c))
                    het ct + missing ct = a + b - 2*c
                    (bitarray has no >> here, so A and B are taken as strided slices
                    and combined with &.)

                    Parameters
                    ----------
                    geno : bitarray
                        Packed genotypes with stride ``2 * self.nru`` bits per SNP.
                    m : int
                        Number of SNPs in ``geno``.
                    n : int
                        Number of individuals.
                    mafMin : number
                        A SNP is kept only if min(f, 1 - f) is strictly greater than this.
                    keep_snps : iterable of int or None
                        SNP indices to consider; None means all ``m`` SNPs.

                    Returns
                    -------
                    (y, m_poly, n, kept_snps, freq) : tuple
                        The packed genotypes of the kept SNPs, how many were kept, the
                        unchanged individual count, the kept SNP indices, and the
                        frequency ``f`` computed for each kept SNP.
                    '''
                    nru = self.nru
                    m_poly = 0
                    y = ba.bitarray()
                    if keep_snps is None:
                        keep_snps = range(m)
                    kept_snps = []
                    freq = []
                    for j in keep_snps:
                        z = geno[2 * nru * j:2 * nru * (j + 1)]
                        A = z[0::2]
                        a = A.count()
                        B = z[1::2]
                        b = B.count()
                        c = (A & B).count()
                        major_ct = b + c  # number of copies of the major allele
                        n_nomiss = n - a + c  # number of individuals with nonmissing genotypes
                        f = major_ct / (2 * n_nomiss) if n_nomiss > 0 else 0
                        het_miss_ct = a + b - 2 * c  # remove SNPs that are only either het or missing
                        if min(f, 1 - f) > mafMin and het_miss_ct < n:
                            freq.append(f)
                            y += z
                            m_poly += 1
                            kept_snps.append(j)

                    return (y, m_poly, n, kept_snps, freq)

                def nextSNPs(self, b, minorRef=None):
                    '''
                    Unpacks the binary array of genotypes and returns an n x b matrix of floats of
                    normalized genotypes for the next b SNPs, where n := number of samples.
                    Advances the internal SNP cursor (``self._currentSNP``) by b.

                    Parameters
                    ----------
                    b : int
                        Number of SNPs to return.
                    minorRef: bool, default None
                        Should we flip reference alleles so that the minor allele is the reference?
                        (This is useful for computing l1 w.r.t. minor allele).

                    Returns
                    -------
                    X : np.array with dtype float64 with shape (n, b), where n := number of samples
                        Matrix of genotypes normalized to mean zero and variance one. If minorRef is
                        not None, then the minor allele will be the positive allele (i.e., two copies
                        of the minor allele --> a positive number).

                    Raises
                    ------
                    TypeError
                        If b cannot be converted to an integer.
                    ValueError
                        If b <= 0 or more SNPs are requested than remain.
                    '''
                    try:
                        b = int(b)
                    except TypeError:
                        raise TypeError("b must be an integer")
                    if b <= 0:
                        raise ValueError("b must be > 0")

                    if self._currentSNP + b > self.m:
                        s = '{b} SNPs requested, {k} SNPs remain'
                        raise ValueError(s.format(b=b, k=(self.m - self._currentSNP)))

                    c = self._currentSNP
                    n = self.n
                    nru = self.nru
                    geno_slice = self.geno[2 * c * nru:2 * (c + b) * nru]
                    # decode() may hand back an iterator instead of a list depending on
                    # the installed bitarray version; materialize it so np.array always
                    # receives a sequence.
                    decoded = list(geno_slice.decode(self._bedcode))
                    X = np.array(decoded, dtype="float64").reshape((b, nru)).T
                    X = X[0:n, :]  # drop the padding rows beyond the n real individuals
                    Y = np.zeros(X.shape)
                    # normalize the SNPs and impute the missing one with the mean
                    for j in range(0, b):
                        newsnp = X[:, j]
                        ii = newsnp != 9  # 9 is the missing-genotype marker from _bedcode
                        avg = np.mean(newsnp[ii])
                        newsnp[np.logical_not(ii)] = avg
                        denom = np.std(newsnp)
                        if denom == 0:
                            # Constant column: avoid dividing by zero (result is all 0).
                            denom = 1

                        if minorRef is not None and self.freq[c + j] > 0.5:
                            # Flip the sign so the minor allele is the positive one.
                            denom = denom * -1

                        Y[:, j] = (newsnp - avg) / denom

                    self._currentSNP += b
                    return Y
         
     | 
| 
      
 499 
     | 
    
         
            +
             
     | 
| 
      
 500 
     | 
    
         
            +
             
     | 
| 
      
 501 
     | 
    
         
            +
            class PlinkBEDFileWithR2Cache(PlinkBEDFile):
         
     | 
| 
      
 502 
     | 
    
         
            +
                def compute_r2_cache(self,
         
     | 
| 
      
 503 
     | 
    
         
            +
                                     block_left,
         
     | 
| 
      
 504 
     | 
    
         
            +
                                     output_cache_file_dir: Path,
         
     | 
| 
      
 505 
     | 
    
         
            +
                                     chunk_size=500_000_000,
         
     | 
| 
      
 506 
     | 
    
         
            +
                                     c=500,
         
     | 
| 
      
 507 
     | 
    
         
            +
                                     r2_threshold=1e-4,
         
     | 
| 
      
 508 
     | 
    
         
            +
                                     annot=None):
         
     | 
| 
      
 509 
     | 
    
         
            +
             
     | 
| 
      
 510 
     | 
    
         
            +
                    func = np.square
         
     | 
| 
      
 511 
     | 
    
         
            +
                    snp_getter = self.nextSNPs
         
     | 
| 
      
 512 
     | 
    
         
            +
                    data, rows, cols = [], [], []
         
     | 
| 
      
 513 
     | 
    
         
            +
             
     | 
| 
      
 514 
     | 
    
         
            +
                    def add_rfuncAB(rfuncAB, l_A, l_B):
         
     | 
| 
      
 515 
     | 
    
         
            +
                        non_zero_indices = np.nonzero(rfuncAB > r2_threshold)
         
     | 
| 
      
 516 
     | 
    
         
            +
                        data.extend(rfuncAB[non_zero_indices])
         
     | 
| 
      
 517 
     | 
    
         
            +
                        rows.extend(l_A + non_zero_indices[0])
         
     | 
| 
      
 518 
     | 
    
         
            +
                        cols.extend(l_B + non_zero_indices[1])
         
     | 
| 
      
 519 
     | 
    
         
            +
             
     | 
| 
      
 520 
     | 
    
         
            +
                    # def add_rfuncAB(rfuncAB, l_A, l_B):
         
     | 
| 
      
 521 
     | 
    
         
            +
                    #     # not need select non zero indices
         
     | 
| 
      
 522 
     | 
    
         
            +
                    #     data.extend(rfuncAB.flatten())
         
     | 
| 
      
 523 
     | 
    
         
            +
                    #     rows.extend(l_A + np.repeat(np.arange(rfuncAB.shape[0]), rfuncAB.shape[1]))
         
     | 
| 
      
 524 
     | 
    
         
            +
                    #     cols.extend(l_B + np.tile(np.arange(rfuncAB.shape[1]), rfuncAB.shape[0]))
         
     | 
| 
      
 525 
     | 
    
         
            +
             
     | 
| 
      
 526 
     | 
    
         
            +
                    # def add_rfuncBB(rfuncBB, l_B):
         
     | 
| 
      
 527 
     | 
    
         
            +
                    #     non_zero_indices = np.nonzero(rfuncBB)
         
     | 
| 
      
 528 
     | 
    
         
            +
                    #     data.extend(rfuncBB[non_zero_indices])
         
     | 
| 
      
 529 
     | 
    
         
            +
                    #     rows.extend(l_B + non_zero_indices[0])
         
     | 
| 
      
 530 
     | 
    
         
            +
                    #     cols.extend(l_B + non_zero_indices[1])
         
     | 
| 
      
 531 
     | 
    
         
            +
             
     | 
| 
      
 532 
     | 
    
         
            +
                    def add_rfuncBB(rfuncBB, l_B):
         
     | 
| 
      
 533 
     | 
    
         
            +
                        non_zero_indices = np.nonzero(rfuncBB > r2_threshold)
         
     | 
| 
      
 534 
     | 
    
         
            +
                        data.extend(rfuncBB[non_zero_indices])
         
     | 
| 
      
 535 
     | 
    
         
            +
                        rows.extend(l_B + non_zero_indices[0])
         
     | 
| 
      
 536 
     | 
    
         
            +
                        cols.extend(l_B + non_zero_indices[1])
         
     | 
| 
      
 537 
     | 
    
         
            +
                        if len(data) > chunk_size:
         
     | 
| 
      
 538 
     | 
    
         
            +
                            # save the cache
         
     | 
| 
      
 539 
     | 
    
         
            +
                            print(f'Start saving the cache file: {output_cache_file_dir / f"{l_B}.npz"}')
         
     | 
| 
      
 540 
     | 
    
         
            +
                            r2_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(self.m, self.m), dtype='float16')
         
     | 
| 
      
 541 
     | 
    
         
            +
                            save_npz(output_cache_file_dir / f'{l_B}.npz', r2_sparse_matrix)
         
     | 
| 
      
 542 
     | 
    
         
            +
                            # reset the data
         
     | 
| 
      
 543 
     | 
    
         
            +
                            data.clear()
         
     | 
| 
      
 544 
     | 
    
         
            +
                            rows.clear()
         
     | 
| 
      
 545 
     | 
    
         
            +
                            cols.clear()
         
     | 
| 
      
 546 
     | 
    
         
            +
             
     | 
| 
      
 547 
     | 
    
         
            +
                    m, n = self.m, self.n
         
     | 
| 
      
 548 
     | 
    
         
            +
                    block_sizes = np.array(np.arange(m) - block_left)
         
     | 
| 
      
 549 
     | 
    
         
            +
                    block_sizes = np.ceil(block_sizes / c) * c
         
     | 
| 
      
 550 
     | 
    
         
            +
                    if annot is None:
         
     | 
| 
      
 551 
     | 
    
         
            +
                        annot = np.ones((m, 1))
         
     | 
| 
      
 552 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 553 
     | 
    
         
            +
                        annot_m = annot.shape[0]
         
     | 
| 
      
 554 
     | 
    
         
            +
                        if annot_m != self.m:
         
     | 
| 
      
 555 
     | 
    
         
            +
                            raise ValueError('Incorrect number of SNPs in annot')
         
     | 
| 
      
 556 
     | 
    
         
            +
                    # -
         
     | 
| 
      
 557 
     | 
    
         
            +
                    n_a = annot.shape[1]  # number of annotations
         
     | 
| 
      
 558 
     | 
    
         
            +
                    # cor_sum = np.zeros((m, n_a))
         
     | 
| 
      
 559 
     | 
    
         
            +
                    # b = index of first SNP for which SNP 0 is not included in LD Score
         
     | 
| 
      
 560 
     | 
    
         
            +
                    b = np.nonzero(block_left > 0)
         
     | 
| 
      
 561 
     | 
    
         
            +
                    if np.any(b):
         
     | 
| 
      
 562 
     | 
    
         
            +
                        b = b[0][0]
         
     | 
| 
      
 563 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 564 
     | 
    
         
            +
                        b = m
         
     | 
| 
      
 565 
     | 
    
         
            +
                    b = int(np.ceil(b / c) * c)  # round up to a multiple of c
         
     | 
| 
      
 566 
     | 
    
         
            +
                    if b > m:
         
     | 
| 
      
 567 
     | 
    
         
            +
                        c = 1
         
     | 
| 
      
 568 
     | 
    
         
            +
                        b = m
         
     | 
| 
      
 569 
     | 
    
         
            +
             
     | 
| 
      
 570 
     | 
    
         
            +
                    l_A = 0  # l_A := index of leftmost SNP in matrix A
         
     | 
| 
      
 571 
     | 
    
         
            +
                    A = snp_getter(b)
         
     | 
| 
      
 572 
     | 
    
         
            +
                    rfuncAB = np.zeros((b, c))
         
     | 
| 
      
 573 
     | 
    
         
            +
                    rfuncBB = np.zeros((c, c))
         
     | 
| 
      
 574 
     | 
    
         
            +
                    # chunk inside of block
         
     | 
| 
      
 575 
     | 
    
         
            +
                    for l_B in np.arange(0, b, c):  # l_B := index of leftmost SNP in matrix B
         
     | 
| 
      
 576 
     | 
    
         
            +
                        B = A[:, l_B:l_B + c]
         
     | 
| 
      
 577 
     | 
    
         
            +
                        # ld matrix
         
     | 
| 
      
 578 
     | 
    
         
            +
                        np.dot(A.T, B / n, out=rfuncAB)
         
     | 
| 
      
 579 
     | 
    
         
            +
                        # ld matrix square
         
     | 
| 
      
 580 
     | 
    
         
            +
                        rfuncAB = func(rfuncAB)
         
     | 
| 
      
 581 
     | 
    
         
            +
                        add_rfuncAB(rfuncAB, l_A, l_B)
         
     | 
| 
      
 582 
     | 
    
         
            +
                        # cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
         
     | 
| 
      
 583 
     | 
    
         
            +
             
     | 
| 
      
 584 
     | 
    
         
            +
                    # chunk to right of block
         
     | 
| 
      
 585 
     | 
    
         
            +
                    b0 = b
         
     | 
| 
      
 586 
     | 
    
         
            +
                    md = int(c * np.floor(m / c))
         
     | 
| 
      
 587 
     | 
    
         
            +
                    end = md + 1 if md != m else md
         
     | 
| 
      
 588 
     | 
    
         
            +
                    for l_B in trange(b0, end, c, desc=f'Compute r2 cache for {output_cache_file_dir.name}'):
         
     | 
| 
      
 589 
     | 
    
         
            +
                        # check if the annot matrix is all zeros for this block + chunk
         
     | 
| 
      
 590 
     | 
    
         
            +
                        # this happens w/ sparse categories (i.e., pathways)
         
     | 
| 
      
 591 
     | 
    
         
            +
                        # update the block
         
     | 
| 
      
 592 
     | 
    
         
            +
                        old_b = b
         
     | 
| 
      
 593 
     | 
    
         
            +
                        b = int(block_sizes[l_B])
         
     | 
| 
      
 594 
     | 
    
         
            +
                        if l_B > b0 and b > 0:
         
     | 
| 
      
 595 
     | 
    
         
            +
                            # block_size can't increase more than c
         
     | 
| 
      
 596 
     | 
    
         
            +
                            # block_size can't be less than c unless it is zero
         
     | 
| 
      
 597 
     | 
    
         
            +
                            # both of these things make sense
         
     | 
| 
      
 598 
     | 
    
         
            +
                            A = np.hstack((A[:, old_b - b + c:old_b], B))
         
     | 
| 
      
 599 
     | 
    
         
            +
                            l_A += old_b - b + c
         
     | 
| 
      
 600 
     | 
    
         
            +
                        elif l_B == b0 and b > 0:
         
     | 
| 
      
 601 
     | 
    
         
            +
                            A = A[:, b0 - b:b0]
         
     | 
| 
      
 602 
     | 
    
         
            +
                            l_A = b0 - b
         
     | 
| 
      
 603 
     | 
    
         
            +
                        elif b == 0:  # no SNPs to left in window, e.g., after a sequence gap
         
     | 
| 
      
 604 
     | 
    
         
            +
                            A = np.array(()).reshape((n, 0))
         
     | 
| 
      
 605 
     | 
    
         
            +
                            l_A = l_B
         
     | 
| 
      
 606 
     | 
    
         
            +
                        if l_B == md:
         
     | 
| 
      
 607 
     | 
    
         
            +
                            c = m - md
         
     | 
| 
      
 608 
     | 
    
         
            +
                            rfuncAB = np.zeros((b, c))
         
     | 
| 
      
 609 
     | 
    
         
            +
                            rfuncBB = np.zeros((c, c))
         
     | 
| 
      
 610 
     | 
    
         
            +
                        if b != old_b:
         
     | 
| 
      
 611 
     | 
    
         
            +
                            rfuncAB = np.zeros((b, c))
         
     | 
| 
      
 612 
     | 
    
         
            +
                        # -
         
     | 
| 
      
 613 
     | 
    
         
            +
                        B = snp_getter(c)
         
     | 
| 
      
 614 
     | 
    
         
            +
                        p1 = np.all(annot[l_A:l_A + b, :] == 0)
         
     | 
| 
      
 615 
     | 
    
         
            +
                        p2 = np.all(annot[l_B:l_B + c, :] == 0)
         
     | 
| 
      
 616 
     | 
    
         
            +
                        if p1 and p2:
         
     | 
| 
      
 617 
     | 
    
         
            +
                            continue
         
     | 
| 
      
 618 
     | 
    
         
            +
                        # -
         
     | 
| 
      
 619 
     | 
    
         
            +
                        np.dot(A.T, B / n, out=rfuncAB)
         
     | 
| 
      
 620 
     | 
    
         
            +
                        rfuncAB = func(rfuncAB)
         
     | 
| 
      
 621 
     | 
    
         
            +
                        # cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
         
     | 
| 
      
 622 
     | 
    
         
            +
                        # cor_sum[l_B:l_B + c, :] += np.dot(annot[l_A:l_A + b, :].T, rfuncAB).T
         
     | 
| 
      
 623 
     | 
    
         
            +
                        add_rfuncAB(rfuncAB, l_A, l_B)
         
     | 
| 
      
 624 
     | 
    
         
            +
                        add_rfuncAB(rfuncAB.T, l_B, l_A)
         
     | 
| 
      
 625 
     | 
    
         
            +
                        np.dot(B.T, B / n, out=rfuncBB)
         
     | 
| 
      
 626 
     | 
    
         
            +
                        rfuncBB = func(rfuncBB)
         
     | 
| 
      
 627 
     | 
    
         
            +
                        # cor_sum[l_B:l_B + c, :] += np.dot(rfuncBB, annot[l_B:l_B + c, :])
         
     | 
| 
      
 628 
     | 
    
         
            +
                        add_rfuncBB(rfuncBB, l_B)
         
     | 
| 
      
 629 
     | 
    
         
            +
                    if len(data) > 0:
         
     | 
| 
      
 630 
     | 
    
         
            +
                        # save remaining data
         
     | 
| 
      
 631 
     | 
    
         
            +
                        # save the cache
         
     | 
| 
      
 632 
     | 
    
         
            +
                        print(f'Start saving the cache file: {output_cache_file_dir / f"{l_B}.npz"}')
         
     | 
| 
      
 633 
     | 
    
         
            +
                        r2_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(m, m), dtype='float16')
         
     | 
| 
      
 634 
     | 
    
         
            +
                        save_npz(output_cache_file_dir / f'{l_B}.npz', r2_sparse_matrix)
         
     | 
| 
      
 635 
     | 
    
         
            +
                    # combine the cache files
         
     | 
| 
      
 636 
     | 
    
         
            +
                    print(f'Start combining the cache files in {output_cache_file_dir}')
         
     | 
| 
      
 637 
     | 
    
         
            +
                    cached_r2_matrix_files = list(output_cache_file_dir.glob('*.npz'))
         
     | 
| 
      
 638 
     | 
    
         
            +
                    combined_r2_matrix_files = self.load_r2_matrix_from_cache_files(output_cache_file_dir)
         
     | 
| 
      
 639 
     | 
    
         
            +
                    # remove the cache files
         
     | 
| 
      
 640 
     | 
    
         
            +
                    for cached_r2_matrix_file in cached_r2_matrix_files:
         
     | 
| 
      
 641 
     | 
    
         
            +
                        cached_r2_matrix_file.unlink()
         
     | 
| 
      
 642 
     | 
    
         
            +
                    # save the combined r2 matrix
         
     | 
| 
      
 643 
     | 
    
         
            +
                    print(f'Start saving the combined r2 matrix in {output_cache_file_dir}')
         
     | 
| 
      
 644 
     | 
    
         
            +
                    combined_r2_matrix_file = output_cache_file_dir / 'combined_r2_matrix.npz'
         
     | 
| 
      
 645 
     | 
    
         
            +
                    save_npz(combined_r2_matrix_file, combined_r2_matrix_files)
         
     | 
| 
      
 646 
     | 
    
         
            +
             
     | 
| 
      
 647 
     | 
    
         
            +
                def get_ldscore_using_r2_cache(self, annot_matrix, cached_r2_matrix_dir):
                    """
                    Multiply every cached r2 matrix by ``annot_matrix`` and sum the products.

                    Every ``*.npz`` file found directly inside ``cached_r2_matrix_dir`` is
                    loaded with ``load_npz``, multiplied by ``annot_matrix`` and accumulated
                    into a single dense result.

                    :param annot_matrix: annotation matrix; ``annot_matrix.shape[1]`` sets the
                        number of columns of the result.
                    :param cached_r2_matrix_dir: directory (``str`` or ``Path``) holding the
                        cached ``*.npz`` files.
                    :return: array of shape ``(self.m, annot_matrix.shape[1])`` holding the
                        summed products.
                    :raises AssertionError: if the directory contains no ``*.npz`` file.
                    """
                    cached_r2_matrix_dir = Path(cached_r2_matrix_dir)
                    result_matrix = np.zeros((self.m, annot_matrix.shape[1]))
                    cached_r2_matrix_files = list(cached_r2_matrix_dir.glob('*.npz'))
                    # Raise explicitly instead of using a bare ``assert``: the check then
                    # survives ``python -O`` while callers still see an AssertionError.
                    if not cached_r2_matrix_files:
                        raise AssertionError(f'No cached r2 matrix files in {cached_r2_matrix_dir}. '
                                             f'Please run the function compute_r2_cache first!')
                    for r2_matrix_file in tqdm(cached_r2_matrix_files, desc=f'Compute ld score for {cached_r2_matrix_dir.name}'):
                        print(f'Compute r2 matrix multiplication for {r2_matrix_file}')
                        r2_matrix = load_npz(r2_matrix_file)
                        result_matrix += r2_matrix.dot(annot_matrix)
                    return result_matrix
         
     | 
| 
      
 663 
     | 
    
         
            +
             
     | 
| 
      
 664 
     | 
    
         
            +
                def load_r2_matrix_from_cache_files(self, cached_r2_matrix_dir):
                    """
                    Sum all cached r2 matrices in a directory into one matrix.

                    Every ``*.npz`` file found directly inside ``cached_r2_matrix_dir`` is
                    loaded with ``load_npz``; the first file seeds the accumulator and the
                    remaining ones are added to it. The sum is cast to ``float16``.

                    :param cached_r2_matrix_dir: directory (``str`` or ``Path``) holding the
                        cached ``*.npz`` files.
                    :return: the summed matrix, cast to ``float16``.
                    :raises AssertionError: if the directory contains no ``*.npz`` file.
                    """
                    cached_r2_matrix_dir = Path(cached_r2_matrix_dir)
                    cached_r2_matrix_files = list(cached_r2_matrix_dir.glob('*.npz'))
                    # Raise explicitly instead of using a bare ``assert``: under ``python -O``
                    # a stripped assert would surface as an opaque IndexError on ``[0]`` below.
                    if not cached_r2_matrix_files:
                        raise AssertionError(f'No cached r2 matrix files in {cached_r2_matrix_dir}. '
                                             f'Please run the function compute_r2_cache first!')
                    # load the r2 matrix
                    r2_matrix = load_npz(cached_r2_matrix_files[0])
                    for r2_matrix_file in tqdm(cached_r2_matrix_files[1:], desc=f'Load r2 matrix from {cached_r2_matrix_dir.name}'):
                        print(f'Load r2 matrix from {r2_matrix_file}')
                        r2_matrix += load_npz(r2_matrix_file)
                    # to float16
                    r2_matrix = r2_matrix.astype('float16')
                    return r2_matrix
         
     | 
| 
      
 681 
     | 
    
         
            +
                def load_combined_r2_matrix(self, cached_r2_matrix_dir):
                    """
                    Load ``combined_r2_matrix.npz`` from ``cached_r2_matrix_dir``.

                    :param cached_r2_matrix_dir: directory (``str`` or ``Path``) expected to
                        contain ``combined_r2_matrix.npz``.
                    :return: the loaded matrix, cast to ``float16``.
                    :raises AssertionError: if ``combined_r2_matrix.npz`` does not exist there.
                    """
                    combined_r2_matrix_file = Path(cached_r2_matrix_dir) / 'combined_r2_matrix.npz'
                    # Raise explicitly instead of using a bare ``assert`` so the check is
                    # not stripped under ``python -O``; the exception type is unchanged.
                    if not combined_r2_matrix_file.exists():
                        raise AssertionError(f'No combined r2 matrix file in {cached_r2_matrix_dir}. '
                                             f'Should delete the cache files and run the function compute_r2_cache first!')
                    # load the r2 matrix
                    r2_matrix = load_npz(combined_r2_matrix_file)
                    # to float16
                    r2_matrix = r2_matrix.astype('float16')
                    return r2_matrix
         
     | 
| 
      
 693 
     | 
    
         
            +
             
     | 
| 
      
 694 
     | 
    
         
            +
            def load_bfile(bfile_chr_prefix):
                """
                Read the ``.bim``, ``.fam`` and ``.bed`` files sharing ``bfile_chr_prefix``.

                :param bfile_chr_prefix: path prefix; the three extensions are appended with
                    ``+``, so it has to support concatenation with ``str``.
                :return: tuple ``(array_snps, array_indivs, geno_array)``: the parsed
                    ``.bim`` list, the parsed ``.fam`` list and the
                    ``PlinkBEDFileWithR2Cache`` built from the ``.bed`` file.
                """
                # Reader classes for the two ID-list files.
                bim_reader = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
                fam_reader = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])

                # SNP list (.bim)
                bim_path = bfile_chr_prefix + '.bim'
                array_snps = bim_reader(bim_path)
                snp_count = len(array_snps.IDList)
                print(f'Read list of {snp_count} SNPs from {bim_path}')

                # Individual list (.fam)
                fam_path = bfile_chr_prefix + '.fam'
                array_indivs = fam_reader(fam_path)
                indiv_count = len(array_indivs.IDList)
                print(f'Read list of {indiv_count} individuals from {fam_path}')

                # Genotype array (.bed); no SNP, individual or MAF filtering is requested.
                bed_path = bfile_chr_prefix + '.bed'
                geno_array = PlinkBEDFileWithR2Cache(
                    bed_path,
                    indiv_count,
                    array_snps,
                    keep_snps=None,
                    keep_indivs=None,
                    mafMin=None,
                )

                return array_snps, array_indivs, geno_array
         
     | 
| 
      
 714 
     | 
    
         
            +
             
     | 
| 
      
 715 
     | 
    
         
            +
             
     | 
| 
      
 716 
     | 
    
         
            +
            def generate_r2_matrix_chr_cache(bfile_chr_prefix, ld_wind_cm, output_cache_file_dir):
                """
                Load one chromosome's PLINK files and read its r2 matrix from the cache dir.

                :param bfile_chr_prefix: PLINK file prefix, forwarded to ``load_bfile``.
                :param ld_wind_cm: window size forwarded to ``getBlockLefts``.
                :param output_cache_file_dir: directory forwarded to
                    ``geno_array.load_r2_matrix_from_cache``.
                :return: ``None``.
                """
                # Only the genotype array of the loaded triple is used here.
                _, _, geno_array = load_bfile(bfile_chr_prefix)
                # NOTE(review): the block-left result is computed but never used in this
                # function -- confirm whether it was meant to be passed to a cache-building call.
                getBlockLefts(geno_array.df[:, 3], ld_wind_cm)
                # NOTE(review): the loaded matrix is discarded; verify that
                # ``load_r2_matrix_from_cache`` is the intended method for a "generate" step.
                geno_array.load_r2_matrix_from_cache(output_cache_file_dir)
         
     | 
| 
      
 723 
     | 
    
         
            +
             
     | 
| 
      
 724 
     | 
    
         
            +
             
     | 
| 
      
 725 
     | 
    
         
            +
            def generate_r2_matrix_cache(bfile_prefix, chromosome_list, r2_cache_dir, ld_wind_cm=1):
                """
                Run ``generate_r2_matrix_chr_cache`` for every chromosome in ``chromosome_list``.

                For each chromosome a ``chr<chromosome>`` sub-directory is created under
                ``r2_cache_dir`` (parents included, existing directories tolerated) and
                handed to the per-chromosome function together with the PLINK prefix
                ``<bfile_prefix>.<chromosome>``.

                :param bfile_prefix: PLINK prefix shared by all chromosomes; concatenated
                    with ``'.'`` and the chromosome label.
                :param chromosome_list: iterable of chromosome labels.
                :param r2_cache_dir: root directory (``str`` or ``Path``) for the caches.
                :param ld_wind_cm: window size forwarded to the per-chromosome function.
                :return: ``None``.
                """
                cache_root = Path(r2_cache_dir)

                for chrom in chromosome_list:
                    chrom_cache_dir = cache_root / f'chr{chrom}'
                    chrom_cache_dir.mkdir(parents=True, exist_ok=True)
                    generate_r2_matrix_chr_cache(
                        bfile_prefix + '.' + str(chrom),
                        ld_wind_cm=ld_wind_cm,
                        output_cache_file_dir=chrom_cache_dir,
                    )
                    print(f'Compute r2 matrix for chr{chrom} done!')
         
     |