gsMap 1.67__py3-none-any.whl → 1.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
 - gsMap/{GNN_VAE → GNN}/__init__.py +0 -0
 - gsMap/{GNN_VAE → GNN}/adjacency_matrix.py +75 -75
 - gsMap/{GNN_VAE → GNN}/model.py +89 -89
 - gsMap/{GNN_VAE → GNN}/train.py +88 -86
 - gsMap/__init__.py +5 -5
 - gsMap/__main__.py +2 -2
 - gsMap/cauchy_combination_test.py +141 -141
 - gsMap/config.py +805 -803
 - gsMap/diagnosis.py +273 -273
 - gsMap/find_latent_representation.py +133 -145
 - gsMap/format_sumstats.py +407 -407
 - gsMap/generate_ldscore.py +618 -618
 - gsMap/latent_to_gene.py +234 -234
 - gsMap/main.py +31 -31
 - gsMap/report.py +160 -160
 - gsMap/run_all_mode.py +194 -194
 - gsMap/setup.py +0 -0
 - gsMap/spatial_ldsc_multiple_sumstats.py +380 -380
 - gsMap/templates/report_template.html +198 -198
 - gsMap/utils/__init__.py +0 -0
 - gsMap/utils/generate_r2_matrix.py +735 -735
 - gsMap/utils/jackknife.py +514 -514
 - gsMap/utils/make_annotations.py +518 -518
 - gsMap/utils/manhattan_plot.py +639 -639
 - gsMap/utils/regression_read.py +294 -294
 - gsMap/visualize.py +198 -198
 - {gsmap-1.67.dist-info → gsmap-1.71.dist-info}/LICENSE +21 -21
 - {gsmap-1.67.dist-info → gsmap-1.71.dist-info}/METADATA +28 -22
 - gsmap-1.71.dist-info/RECORD +31 -0
 - gsmap-1.67.dist-info/RECORD +0 -31
 - {gsmap-1.67.dist-info → gsmap-1.71.dist-info}/WHEEL +0 -0
 - {gsmap-1.67.dist-info → gsmap-1.71.dist-info}/entry_points.txt +0 -0
 
    
        gsMap/utils/regression_read.py
    CHANGED
    
    | 
         @@ -1,294 +1,294 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            import numpy as np
         
     | 
| 
       2 
     | 
    
         
            -
            import pandas as pd
         
     | 
| 
       3 
     | 
    
         
            -
            import os
         
     | 
| 
       4 
     | 
    
         
            -
             
     | 
| 
       5 
     | 
    
         
            -
             
     | 
| 
       6 
     | 
    
         
            -
            # Fun for reading gwas data
         
     | 
| 
       7 
     | 
    
         
            -
            def _read_sumstats(fh, alleles=False, dropna=False):
         
     | 
| 
       8 
     | 
    
         
            -
                '''
         
     | 
| 
       9 
     | 
    
         
            -
                Parse gwas summary statistics.
         
     | 
| 
       10 
     | 
    
         
            -
                '''
         
     | 
| 
       11 
     | 
    
         
            -
                print('Reading summary statistics from {S} ...'.format(S=fh))
         
     | 
| 
       12 
     | 
    
         
            -
                sumstats = ps_sumstats(fh, alleles=alleles, dropna=dropna)
         
     | 
| 
       13 
     | 
    
         
            -
                print('Read summary statistics for {N} SNPs.'.format(N=len(sumstats)))
         
     | 
| 
       14 
     | 
    
         
            -
             
     | 
| 
       15 
     | 
    
         
            -
                m = len(sumstats)
         
     | 
| 
       16 
     | 
    
         
            -
                sumstats = sumstats.drop_duplicates(subset='SNP')
         
     | 
| 
       17 
     | 
    
         
            -
                if m > len(sumstats):
         
     | 
| 
       18 
     | 
    
         
            -
                    print('Dropped {M} SNPs with duplicated rs numbers.'.format(M=m - len(sumstats)))
         
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
       20 
     | 
    
         
            -
                return sumstats
         
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
       22 
     | 
    
         
            -
             
     | 
| 
       23 
     | 
    
         
            -
            def ps_sumstats(fh, alleles=False, dropna=True):
         
     | 
| 
       24 
     | 
    
         
            -
                '''
         
     | 
| 
       25 
     | 
    
         
            -
                Parses .sumstats files. See docs/file_formats_sumstats.txt.
         
     | 
| 
       26 
     | 
    
         
            -
                '''
         
     | 
| 
       27 
     | 
    
         
            -
             
     | 
| 
       28 
     | 
    
         
            -
                dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str}
         
     | 
| 
       29 
     | 
    
         
            -
                compression = get_compression(fh)
         
     | 
| 
       30 
     | 
    
         
            -
                usecols = ['SNP', 'Z', 'N']
         
     | 
| 
       31 
     | 
    
         
            -
                if alleles:
         
     | 
| 
       32 
     | 
    
         
            -
                    usecols += ['A1', 'A2']
         
     | 
| 
       33 
     | 
    
         
            -
             
     | 
| 
       34 
     | 
    
         
            -
                try:
         
     | 
| 
       35 
     | 
    
         
            -
                    x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression)
         
     | 
| 
       36 
     | 
    
         
            -
                except (AttributeError, ValueError) as e:
         
     | 
| 
       37 
     | 
    
         
            -
                    raise ValueError('Improperly formatted sumstats file: ' + str(e.args))
         
     | 
| 
       38 
     | 
    
         
            -
             
     | 
| 
       39 
     | 
    
         
            -
                if dropna:
         
     | 
| 
       40 
     | 
    
         
            -
                    x = x.dropna(how='any')
         
     | 
| 
       41 
     | 
    
         
            -
             
     | 
| 
       42 
     | 
    
         
            -
                return x
         
     | 
| 
       43 
     | 
    
         
            -
             
     | 
| 
       44 
     | 
    
         
            -
             
     | 
| 
       45 
     | 
    
         
            -
            def get_compression(fh):
         
     | 
| 
       46 
     | 
    
         
            -
                '''
         
     | 
| 
       47 
     | 
    
         
            -
                Determin the format of compression used with read_csv?
         
     | 
| 
       48 
     | 
    
         
            -
                '''
         
     | 
| 
       49 
     | 
    
         
            -
                if fh.endswith('gz'):
         
     | 
| 
       50 
     | 
    
         
            -
                    compression = 'gzip'
         
     | 
| 
       51 
     | 
    
         
            -
                elif fh.endswith('bz2'):
         
     | 
| 
       52 
     | 
    
         
            -
                    compression = 'bz2'
         
     | 
| 
       53 
     | 
    
         
            -
                else:
         
     | 
| 
       54 
     | 
    
         
            -
                    compression = None
         
     | 
| 
       55 
     | 
    
         
            -
                # -
         
     | 
| 
       56 
     | 
    
         
            -
                return compression
         
     | 
| 
       57 
     | 
    
         
            -
             
     | 
| 
       58 
     | 
    
         
            -
             
     | 
| 
       59 
     | 
    
         
            -
            def read_csv(fh, **kwargs):
         
     | 
| 
       60 
     | 
    
         
            -
                '''
         
     | 
| 
       61 
     | 
    
         
            -
                Read the csv data
         
     | 
| 
       62 
     | 
    
         
            -
                '''
         
     | 
| 
       63 
     | 
    
         
            -
                return pd.read_csv(fh, sep='\s+', na_values='.', **kwargs)
         
     | 
| 
       64 
     | 
    
         
            -
             
     | 
| 
       65 
     | 
    
         
            -
             
     | 
| 
       66 
     | 
    
         
            -
            # Fun for reading loading LD scores 
         
     | 
| 
       67 
     | 
    
         
            -
            def which_compression(fh):
         
     | 
| 
       68 
     | 
    
         
            -
                '''
         
     | 
| 
       69 
     | 
    
         
            -
                Given a file prefix, figure out what sort of compression to use.
         
     | 
| 
       70 
     | 
    
         
            -
                '''
         
     | 
| 
       71 
     | 
    
         
            -
                if os.access(fh + '.bz2', 4):
         
     | 
| 
       72 
     | 
    
         
            -
                    suffix = '.bz2'
         
     | 
| 
       73 
     | 
    
         
            -
                    compression = 'bz2'
         
     | 
| 
       74 
     | 
    
         
            -
                elif os.access(fh + '.gz', 4):
         
     | 
| 
       75 
     | 
    
         
            -
                    suffix = '.gz'
         
     | 
| 
       76 
     | 
    
         
            -
                    compression = 'gzip'
         
     | 
| 
       77 
     | 
    
         
            -
                elif os.access(fh + '.parquet', 4):
         
     | 
| 
       78 
     | 
    
         
            -
                    suffix = '.parquet'
         
     | 
| 
       79 
     | 
    
         
            -
                    compression = 'parquet'
         
     | 
| 
       80 
     | 
    
         
            -
                elif os.access(fh + '.feather', 4):
         
     | 
| 
       81 
     | 
    
         
            -
                    suffix = '.feather'
         
     | 
| 
       82 
     | 
    
         
            -
                    compression = 'feather'
         
     | 
| 
       83 
     | 
    
         
            -
                elif os.access(fh, 4):
         
     | 
| 
       84 
     | 
    
         
            -
                    suffix = ''
         
     | 
| 
       85 
     | 
    
         
            -
                    compression = None
         
     | 
| 
       86 
     | 
    
         
            -
                else:
         
     | 
| 
       87 
     | 
    
         
            -
                    raise IOError('Could not open {F}[./gz/bz2/parquet/feather]'.format(F=fh))
         
     | 
| 
       88 
     | 
    
         
            -
                # -
         
     | 
| 
       89 
     | 
    
         
            -
                return suffix, compression
         
     | 
| 
       90 
     | 
    
         
            -
             
     | 
| 
       91 
     | 
    
         
            -
             
     | 
| 
       92 
     | 
    
         
            -
            def _read_ref_ld(ld_file):
         
     | 
| 
       93 
     | 
    
         
            -
                suffix = '.l2.ldscore'
         
     | 
| 
       94 
     | 
    
         
            -
                file = ld_file
         
     | 
| 
       95 
     | 
    
         
            -
                first_fh = f'{file}1{suffix}'
         
     | 
| 
       96 
     | 
    
         
            -
                s, compression = which_compression(first_fh)
         
     | 
| 
       97 
     | 
    
         
            -
                #
         
     | 
| 
       98 
     | 
    
         
            -
                ldscore_array = []
         
     | 
| 
       99 
     | 
    
         
            -
                print(f'Reading ld score annotations from {file}[1-22]{suffix}.{compression}')
         
     | 
| 
       100 
     | 
    
         
            -
             
     | 
| 
       101 
     | 
    
         
            -
                for chr in range(1, 23):
         
     | 
| 
       102 
     | 
    
         
            -
                    file_chr = f'{file}{chr}{suffix}{s}'
         
     | 
| 
       103 
     | 
    
         
            -
                    #
         
     | 
| 
       104 
     | 
    
         
            -
                    if compression == 'parquet':
         
     | 
| 
       105 
     | 
    
         
            -
                        x = pd.read_parquet(file_chr)
         
     | 
| 
       106 
     | 
    
         
            -
                    elif compression == 'feather':
         
     | 
| 
       107 
     | 
    
         
            -
                        x = pd.read_feather(file_chr)
         
     | 
| 
       108 
     | 
    
         
            -
                    else:
         
     | 
| 
       109 
     | 
    
         
            -
                        x = pd.read_csv(file_chr, compression=compression, sep='\t')
         
     | 
| 
       110 
     | 
    
         
            -
             
     | 
| 
       111 
     | 
    
         
            -
                    x = x.sort_values(by=['CHR', 'BP'])  # SEs will be wrong unless sorted
         
     | 
| 
       112 
     | 
    
         
            -
             
     | 
| 
       113 
     | 
    
         
            -
                    columns_to_drop = ['MAF', 'CM', 'Gene', 'TSS', 'CHR', 'BP']
         
     | 
| 
       114 
     | 
    
         
            -
                    columns_to_drop = [col for col in columns_to_drop if col in x.columns]
         
     | 
| 
       115 
     | 
    
         
            -
                    x = x.drop(columns_to_drop, axis=1)
         
     | 
| 
       116 
     | 
    
         
            -
             
     | 
| 
       117 
     | 
    
         
            -
                    ldscore_array.append(x)
         
     | 
| 
       118 
     | 
    
         
            -
                #
         
     | 
| 
       119 
     | 
    
         
            -
                ref_ld = pd.concat(ldscore_array, axis=0)
         
     | 
| 
       120 
     | 
    
         
            -
                return ref_ld
         
     | 
| 
       121 
     | 
    
         
            -
             
     | 
| 
       122 
     | 
    
         
            -
             
     | 
| 
       123 
     | 
    
         
            -
            def _read_ref_ld_v2(ld_file):
         
     | 
| 
       124 
     | 
    
         
            -
                suffix = '.l2.ldscore'
         
     | 
| 
       125 
     | 
    
         
            -
                file = ld_file
         
     | 
| 
       126 
     | 
    
         
            -
                first_fh = f'{file}1{suffix}'
         
     | 
| 
       127 
     | 
    
         
            -
                s, compression = which_compression(first_fh)
         
     | 
| 
       128 
     | 
    
         
            -
                print(f'Reading ld score annotations from {file}[1-22]{suffix}.{compression}')
         
     | 
| 
       129 
     | 
    
         
            -
                ref_ld = pd.concat(
         
     | 
| 
       130 
     | 
    
         
            -
                    [pd.read_feather(f'{file}{chr}{suffix}{s}') for chr in range(1, 23)], axis=0
         
     | 
| 
       131 
     | 
    
         
            -
                )
         
     | 
| 
       132 
     | 
    
         
            -
                # set first column as index
         
     | 
| 
       133 
     | 
    
         
            -
                ref_ld.rename(columns={'index': 'SNP'}, inplace=True)
         
     | 
| 
       134 
     | 
    
         
            -
                ref_ld.set_index('SNP', inplace=True)
         
     | 
| 
       135 
     | 
    
         
            -
                return ref_ld
         
     | 
| 
       136 
     | 
    
         
            -
             
     | 
| 
       137 
     | 
    
         
            -
            def _read_M_v2(ld_file, n_annot, not_M_5_50):
         
     | 
| 
       138 
     | 
    
         
            -
                suffix = '.l2.M'
         
     | 
| 
       139 
     | 
    
         
            -
                if not not_M_5_50:
         
     | 
| 
       140 
     | 
    
         
            -
                    suffix += '_5_50'
         
     | 
| 
       141 
     | 
    
         
            -
                M_annot= np.array(
         
     | 
| 
       142 
     | 
    
         
            -
                    [
         
     | 
| 
       143 
     | 
    
         
            -
                        np.loadtxt(f'{ld_file}{chr}{suffix}', )
         
     | 
| 
       144 
     | 
    
         
            -
                     for chr in range(1, 23)]
         
     | 
| 
       145 
     | 
    
         
            -
             
     | 
| 
       146 
     | 
    
         
            -
                )
         
     | 
| 
       147 
     | 
    
         
            -
                assert M_annot.shape == (22, n_annot)
         
     | 
| 
       148 
     | 
    
         
            -
                return M_annot.sum(axis=0).reshape((1, n_annot))
         
     | 
| 
       149 
     | 
    
         
            -
            # Fun for reading M annotations 
         
     | 
| 
       150 
     | 
    
         
            -
            def _read_M(ld_file, n_annot, not_M_5_50):
         
     | 
| 
       151 
     | 
    
         
            -
                '''
         
     | 
| 
       152 
     | 
    
         
            -
                Read M (--M, --M-file, etc).
         
     | 
| 
       153 
     | 
    
         
            -
                '''
         
     | 
| 
       154 
     | 
    
         
            -
                M_annot = M(ld_file, common=(not not_M_5_50))
         
     | 
| 
       155 
     | 
    
         
            -
             
     | 
| 
       156 
     | 
    
         
            -
                try:
         
     | 
| 
       157 
     | 
    
         
            -
                    M_annot = np.array(M_annot).reshape((1, n_annot))
         
     | 
| 
       158 
     | 
    
         
            -
                except ValueError as e:
         
     | 
| 
       159 
     | 
    
         
            -
                    raise ValueError('# terms in --M must match # of LD Scores in --ref-ld.\n' + str(e.args))
         
     | 
| 
       160 
     | 
    
         
            -
                return M_annot
         
     | 
| 
       161 
     | 
    
         
            -
             
     | 
| 
       162 
     | 
    
         
            -
             
     | 
| 
       163 
     | 
    
         
            -
            def M(fh, common=False):
         
     | 
| 
       164 
     | 
    
         
            -
                '''
         
     | 
| 
       165 
     | 
    
         
            -
                Parses .l{N}.M files, split across num chromosomes.
         
     | 
| 
       166 
     | 
    
         
            -
                '''
         
     | 
| 
       167 
     | 
    
         
            -
                suffix = '.l2.M'
         
     | 
| 
       168 
     | 
    
         
            -
                if common:
         
     | 
| 
       169 
     | 
    
         
            -
                    suffix += '_5_50'
         
     | 
| 
       170 
     | 
    
         
            -
                # -
         
     | 
| 
       171 
     | 
    
         
            -
                M_array = []
         
     | 
| 
       172 
     | 
    
         
            -
                for i in range(1, 23):
         
     | 
| 
       173 
     | 
    
         
            -
                    M_current = pd.read_csv(f'{fh}{i}' + suffix, header=None)
         
     | 
| 
       174 
     | 
    
         
            -
                    M_array.append(M_current)
         
     | 
| 
       175 
     | 
    
         
            -
             
     | 
| 
       176 
     | 
    
         
            -
                M_array = pd.concat(M_array, axis=1).sum(axis=1)
         
     | 
| 
       177 
     | 
    
         
            -
                # -
         
     | 
| 
       178 
     | 
    
         
            -
                return np.array(M_array).reshape((1, len(M_array)))
         
     | 
| 
       179 
     | 
    
         
            -
             
     | 
| 
       180 
     | 
    
         
            -
             
     | 
| 
       181 
     | 
    
         
            -
            def _check_variance(M_annot, ref_ld):
         
     | 
| 
       182 
     | 
    
         
            -
                '''
         
     | 
| 
       183 
     | 
    
         
            -
                Remove zero-variance LD Scores.
         
     | 
| 
       184 
     | 
    
         
            -
                '''
         
     | 
| 
       185 
     | 
    
         
            -
                ii = ref_ld.iloc[:, 1:].var() == 0  # NB there is a SNP column here
         
     | 
| 
       186 
     | 
    
         
            -
                if ii.all():
         
     | 
| 
       187 
     | 
    
         
            -
                    raise ValueError('All LD Scores have zero variance.')
         
     | 
| 
       188 
     | 
    
         
            -
                else:
         
     | 
| 
       189 
     | 
    
         
            -
                    print('Removing partitioned LD Scores with zero variance.')
         
     | 
| 
       190 
     | 
    
         
            -
                    ii_snp = np.array([True] + list(~ii))
         
     | 
| 
       191 
     | 
    
         
            -
                    ii_m = np.array(~ii)
         
     | 
| 
       192 
     | 
    
         
            -
                    ref_ld = ref_ld.iloc[:, ii_snp]
         
     | 
| 
       193 
     | 
    
         
            -
                    M_annot = M_annot[:, ii_m]
         
     | 
| 
       194 
     | 
    
         
            -
                # -
         
     | 
| 
       195 
     | 
    
         
            -
                return M_annot, ref_ld, ii
         
     | 
| 
       196 
     | 
    
         
            -
            def _check_variance_v2(M_annot, ref_ld):
         
     | 
| 
       197 
     | 
    
         
            -
                ii = ref_ld.var() == 0
         
     | 
| 
       198 
     | 
    
         
            -
                if ii.all():
         
     | 
| 
       199 
     | 
    
         
            -
                    raise ValueError('All LD Scores have zero variance.')
         
     | 
| 
       200 
     | 
    
         
            -
                elif not ii.any():
         
     | 
| 
       201 
     | 
    
         
            -
                    print('No partitioned LD Scores have zero variance.')
         
     | 
| 
       202 
     | 
    
         
            -
                else:
         
     | 
| 
       203 
     | 
    
         
            -
                    ii_snp= ii_m = np.array(~ii)
         
     | 
| 
       204 
     | 
    
         
            -
                    print(f'Removing {sum(ii)} partitioned LD Scores with zero variance.')
         
     | 
| 
       205 
     | 
    
         
            -
                    ref_ld = ref_ld.iloc[:, ii_snp]
         
     | 
| 
       206 
     | 
    
         
            -
                    M_annot = M_annot[:, ii_m]
         
     | 
| 
       207 
     | 
    
         
            -
                return M_annot, ref_ld
         
     | 
| 
       208 
     | 
    
         
            -
             
     | 
| 
       209 
     | 
    
         
            -
             
     | 
| 
       210 
     | 
    
         
            -
            # Fun for reading regression weights
         
     | 
| 
       211 
     | 
    
         
            -
            def which_compression(fh):
         
     | 
| 
       212 
     | 
    
         
            -
                '''
         
     | 
| 
       213 
     | 
    
         
            -
                Given a file prefix, figure out what sort of compression to use.
         
     | 
| 
       214 
     | 
    
         
            -
                '''
         
     | 
| 
       215 
     | 
    
         
            -
                if os.access(fh + '.bz2', 4):
         
     | 
| 
       216 
     | 
    
         
            -
                    suffix = '.bz2'
         
     | 
| 
       217 
     | 
    
         
            -
                    compression = 'bz2'
         
     | 
| 
       218 
     | 
    
         
            -
                elif os.access(fh + '.gz', 4):
         
     | 
| 
       219 
     | 
    
         
            -
                    suffix = '.gz'
         
     | 
| 
       220 
     | 
    
         
            -
                    compression = 'gzip'
         
     | 
| 
       221 
     | 
    
         
            -
                elif os.access(fh + '.parquet', 4):
         
     | 
| 
       222 
     | 
    
         
            -
                    suffix = '.parquet'
         
     | 
| 
       223 
     | 
    
         
            -
                    compression = 'parquet'
         
     | 
| 
       224 
     | 
    
         
            -
                elif os.access(fh + '.feather', 4):
         
     | 
| 
       225 
     | 
    
         
            -
                    suffix = '.feather'
         
     | 
| 
       226 
     | 
    
         
            -
                    compression = 'feather'
         
     | 
| 
       227 
     | 
    
         
            -
                elif os.access(fh, 4):
         
     | 
| 
       228 
     | 
    
         
            -
                    suffix = ''
         
     | 
| 
       229 
     | 
    
         
            -
                    compression = None
         
     | 
| 
       230 
     | 
    
         
            -
                else:
         
     | 
| 
       231 
     | 
    
         
            -
                    raise IOError('Could not open {F}[./gz/bz2/parquet/feather]'.format(F=fh))
         
     | 
| 
       232 
     | 
    
         
            -
                # -
         
     | 
| 
       233 
     | 
    
         
            -
                return suffix, compression
         
     | 
| 
       234 
     | 
    
         
            -
             
     | 
| 
       235 
     | 
    
         
            -
             
     | 
| 
       236 
     | 
    
         
            -
            def _read_w_ld(w_file):
         
     | 
| 
       237 
     | 
    
         
            -
                suffix = '.l2.ldscore'
         
     | 
| 
       238 
     | 
    
         
            -
                file = w_file
         
     | 
| 
       239 
     | 
    
         
            -
                first_fh = f'{file}1{suffix}'
         
     | 
| 
       240 
     | 
    
         
            -
                s, compression = which_compression(first_fh)
         
     | 
| 
       241 
     | 
    
         
            -
                #
         
     | 
| 
       242 
     | 
    
         
            -
                w_array = []
         
     | 
| 
       243 
     | 
    
         
            -
                print(f'Reading ld score annotations from {file}[1-22]{suffix}.{compression}')
         
     | 
| 
       244 
     | 
    
         
            -
             
     | 
| 
       245 
     | 
    
         
            -
                for chr in range(1, 23):
         
     | 
| 
       246 
     | 
    
         
            -
                    file_chr = f'{file}{chr}{suffix}{s}'
         
     | 
| 
       247 
     | 
    
         
            -
                    #
         
     | 
| 
       248 
     | 
    
         
            -
                    if compression == 'parquet':
         
     | 
| 
       249 
     | 
    
         
            -
                        x = pd.read_parquet(file_chr)
         
     | 
| 
       250 
     | 
    
         
            -
                    elif compression == 'feather':
         
     | 
| 
       251 
     | 
    
         
            -
                        x = pd.read_feather(file_chr)
         
     | 
| 
       252 
     | 
    
         
            -
                    else:
         
     | 
| 
       253 
     | 
    
         
            -
                        x = pd.read_csv(file_chr, compression=compression, sep='\t')
         
     | 
| 
       254 
     | 
    
         
            -
             
     | 
| 
       255 
     | 
    
         
            -
                    x = x.sort_values(by=['CHR', 'BP'])
         
     | 
| 
       256 
     | 
    
         
            -
             
     | 
| 
       257 
     | 
    
         
            -
                    columns_to_drop = ['MAF', 'CM', 'Gene', 'TSS', 'CHR', 'BP']
         
     | 
| 
       258 
     | 
    
         
            -
                    columns_to_drop = [col for col in columns_to_drop if col in x.columns]
         
     | 
| 
       259 
     | 
    
         
            -
                    x = x.drop(columns_to_drop, axis=1)
         
     | 
| 
       260 
     | 
    
         
            -
             
     | 
| 
       261 
     | 
    
         
            -
                    w_array.append(x)
         
     | 
| 
       262 
     | 
    
         
            -
                #
         
     | 
| 
       263 
     | 
    
         
            -
                w_ld = pd.concat(w_array, axis=0)
         
     | 
| 
       264 
     | 
    
         
            -
                w_ld.columns = ['SNP', 'LD_weights']
         
     | 
| 
       265 
     | 
    
         
            -
             
     | 
| 
       266 
     | 
    
         
            -
                return w_ld
         
     | 
| 
       267 
     | 
    
         
            -
             
     | 
| 
       268 
     | 
    
         
            -
             
     | 
| 
       269 
     | 
    
         
            -
            # Fun for merging
         
     | 
| 
       270 
     | 
    
         
            -
            def _merge_and_log(ld, sumstats, noun):
         
     | 
| 
       271 
     | 
    
         
            -
                '''
         
     | 
| 
       272 
     | 
    
         
            -
                Wrap smart merge with log messages about # of SNPs.
         
     | 
| 
       273 
     | 
    
         
            -
                '''
         
     | 
| 
       274 
     | 
    
         
            -
                sumstats = smart_merge(ld, sumstats)
         
     | 
| 
       275 
     | 
    
         
            -
                msg = 'After merging with {F}, {N} SNPs remain.'
         
     | 
| 
       276 
     | 
    
         
            -
                if len(sumstats) == 0:
         
     | 
| 
       277 
     | 
    
         
            -
                    raise ValueError(msg.format(N=len(sumstats), F=noun))
         
     | 
| 
       278 
     | 
    
         
            -
                else:
         
     | 
| 
       279 
     | 
    
         
            -
                    print(msg.format(N=len(sumstats), F=noun))
         
     | 
| 
       280 
     | 
    
         
            -
                # -
         
     | 
| 
       281 
     | 
    
         
            -
                return sumstats
         
     | 
| 
       282 
     | 
    
         
            -
             
     | 
| 
       283 
     | 
    
         
            -
             
     | 
| 
       284 
     | 
    
         
            -
            def smart_merge(x, y):
         
     | 
| 
       285 
     | 
    
         
            -
                '''
         
     | 
| 
       286 
     | 
    
         
            -
                Check if SNP columns are equal. If so, save time by using concat instead of merge.
         
     | 
| 
       287 
     | 
    
         
            -
                '''
         
     | 
| 
       288 
     | 
    
         
            -
                if len(x) == len(y) and (x.index == y.index).all() and (x.SNP == y.SNP).all():
         
     | 
| 
       289 
     | 
    
         
            -
                    x = x.reset_index(drop=True)
         
     | 
| 
       290 
     | 
    
         
            -
                    y = y.reset_index(drop=True).drop('SNP', 1)
         
     | 
| 
       291 
     | 
    
         
            -
                    out = pd.concat([x, y], axis=1)
         
     | 
| 
       292 
     | 
    
         
            -
                else:
         
     | 
| 
       293 
     | 
    
         
            -
                    out = pd.merge(x, y, how='inner', on='SNP')
         
     | 
| 
       294 
     | 
    
         
            -
                return out
         
     | 
| 
      
 1 
     | 
    
         
            +
            import numpy as np
         
     | 
| 
      
 2 
     | 
    
         
            +
            import pandas as pd
         
     | 
| 
      
 3 
     | 
    
         
            +
            import os
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            # Fun for reading gwas data
         
     | 
| 
      
 7 
     | 
    
         
            +
            def _read_sumstats(fh, alleles=False, dropna=False):
         
     | 
| 
      
 8 
     | 
    
         
            +
                '''
         
     | 
| 
      
 9 
     | 
    
         
            +
                Parse gwas summary statistics.
         
     | 
| 
      
 10 
     | 
    
         
            +
                '''
         
     | 
| 
      
 11 
     | 
    
         
            +
                print('Reading summary statistics from {S} ...'.format(S=fh))
         
     | 
| 
      
 12 
     | 
    
         
            +
                sumstats = ps_sumstats(fh, alleles=alleles, dropna=dropna)
         
     | 
| 
      
 13 
     | 
    
         
            +
                print('Read summary statistics for {N} SNPs.'.format(N=len(sumstats)))
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
                m = len(sumstats)
         
     | 
| 
      
 16 
     | 
    
         
            +
                sumstats = sumstats.drop_duplicates(subset='SNP')
         
     | 
| 
      
 17 
     | 
    
         
            +
                if m > len(sumstats):
         
     | 
| 
      
 18 
     | 
    
         
            +
                    print('Dropped {M} SNPs with duplicated rs numbers.'.format(M=m - len(sumstats)))
         
     | 
| 
      
 19 
     | 
    
         
            +
             
     | 
| 
      
 20 
     | 
    
         
            +
                return sumstats
         
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
            def ps_sumstats(fh, alleles=False, dropna=True):
         
     | 
| 
      
 24 
     | 
    
         
            +
                '''
         
     | 
| 
      
 25 
     | 
    
         
            +
                Parses .sumstats files. See docs/file_formats_sumstats.txt.
         
     | 
| 
      
 26 
     | 
    
         
            +
                '''
         
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
      
 28 
     | 
    
         
            +
                dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str}
         
     | 
| 
      
 29 
     | 
    
         
            +
                compression = get_compression(fh)
         
     | 
| 
      
 30 
     | 
    
         
            +
                usecols = ['SNP', 'Z', 'N']
         
     | 
| 
      
 31 
     | 
    
         
            +
                if alleles:
         
     | 
| 
      
 32 
     | 
    
         
            +
                    usecols += ['A1', 'A2']
         
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
                try:
         
     | 
| 
      
 35 
     | 
    
         
            +
                    x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression)
         
     | 
| 
      
 36 
     | 
    
         
            +
                except (AttributeError, ValueError) as e:
         
     | 
| 
      
 37 
     | 
    
         
            +
                    raise ValueError('Improperly formatted sumstats file: ' + str(e.args))
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
                if dropna:
         
     | 
| 
      
 40 
     | 
    
         
            +
                    x = x.dropna(how='any')
         
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
      
 42 
     | 
    
         
            +
                return x
         
     | 
| 
      
 43 
     | 
    
         
            +
             
     | 
| 
      
 44 
     | 
    
         
            +
             
     | 
| 
      
 45 
     | 
    
         
            +
            def get_compression(fh):
         
     | 
| 
      
 46 
     | 
    
         
            +
                '''
         
     | 
| 
      
 47 
     | 
    
         
            +
                Determin the format of compression used with read_csv?
         
     | 
| 
      
 48 
     | 
    
         
            +
                '''
         
     | 
| 
      
 49 
     | 
    
         
            +
                if fh.endswith('gz'):
         
     | 
| 
      
 50 
     | 
    
         
            +
                    compression = 'gzip'
         
     | 
| 
      
 51 
     | 
    
         
            +
                elif fh.endswith('bz2'):
         
     | 
| 
      
 52 
     | 
    
         
            +
                    compression = 'bz2'
         
     | 
| 
      
 53 
     | 
    
         
            +
                else:
         
     | 
| 
      
 54 
     | 
    
         
            +
                    compression = None
         
     | 
| 
      
 55 
     | 
    
         
            +
                # -
         
     | 
| 
      
 56 
     | 
    
         
            +
                return compression
         
     | 
| 
      
 57 
     | 
    
         
            +
             
     | 
| 
      
 58 
     | 
    
         
            +
             
     | 
| 
      
 59 
     | 
    
         
            +
            def read_csv(fh, **kwargs):
         
     | 
| 
      
 60 
     | 
    
         
            +
                '''
         
     | 
| 
      
 61 
     | 
    
         
            +
                Read the csv data
         
     | 
| 
      
 62 
     | 
    
         
            +
                '''
         
     | 
| 
      
 63 
     | 
    
         
            +
                return pd.read_csv(fh, sep='\s+', na_values='.', **kwargs)
         
     | 
| 
      
 64 
     | 
    
         
            +
             
     | 
| 
      
 65 
     | 
    
         
            +
             
     | 
| 
      
 66 
     | 
    
         
            +
            # Fun for reading loading LD scores 
         
     | 
| 
      
 67 
     | 
    
         
            +
            def which_compression(fh):
         
     | 
| 
      
 68 
     | 
    
         
            +
                '''
         
     | 
| 
      
 69 
     | 
    
         
            +
                Given a file prefix, figure out what sort of compression to use.
         
     | 
| 
      
 70 
     | 
    
         
            +
                '''
         
     | 
| 
      
 71 
     | 
    
         
            +
                if os.access(fh + '.bz2', 4):
         
     | 
| 
      
 72 
     | 
    
         
            +
                    suffix = '.bz2'
         
     | 
| 
      
 73 
     | 
    
         
            +
                    compression = 'bz2'
         
     | 
| 
      
 74 
     | 
    
         
            +
                elif os.access(fh + '.gz', 4):
         
     | 
| 
      
 75 
     | 
    
         
            +
                    suffix = '.gz'
         
     | 
| 
      
 76 
     | 
    
         
            +
                    compression = 'gzip'
         
     | 
| 
      
 77 
     | 
    
         
            +
                elif os.access(fh + '.parquet', 4):
         
     | 
| 
      
 78 
     | 
    
         
            +
                    suffix = '.parquet'
         
     | 
| 
      
 79 
     | 
    
         
            +
                    compression = 'parquet'
         
     | 
| 
      
 80 
     | 
    
         
            +
                elif os.access(fh + '.feather', 4):
         
     | 
| 
      
 81 
     | 
    
         
            +
                    suffix = '.feather'
         
     | 
| 
      
 82 
     | 
    
         
            +
                    compression = 'feather'
         
     | 
| 
      
 83 
     | 
    
         
            +
                elif os.access(fh, 4):
         
     | 
| 
      
 84 
     | 
    
         
            +
                    suffix = ''
         
     | 
| 
      
 85 
     | 
    
         
            +
                    compression = None
         
     | 
| 
      
 86 
     | 
    
         
            +
                else:
         
     | 
| 
      
 87 
     | 
    
         
            +
                    raise IOError('Could not open {F}[./gz/bz2/parquet/feather]'.format(F=fh))
         
     | 
| 
      
 88 
     | 
    
         
            +
                # -
         
     | 
| 
      
 89 
     | 
    
         
            +
                return suffix, compression
         
     | 
| 
      
 90 
     | 
    
         
            +
             
     | 
| 
      
 91 
     | 
    
         
            +
             
     | 
| 
      
 92 
     | 
    
         
            +
            def _read_ref_ld(ld_file):
         
     | 
| 
      
 93 
     | 
    
         
            +
                suffix = '.l2.ldscore'
         
     | 
| 
      
 94 
     | 
    
         
            +
                file = ld_file
         
     | 
| 
      
 95 
     | 
    
         
            +
                first_fh = f'{file}1{suffix}'
         
     | 
| 
      
 96 
     | 
    
         
            +
                s, compression = which_compression(first_fh)
         
     | 
| 
      
 97 
     | 
    
         
            +
                #
         
     | 
| 
      
 98 
     | 
    
         
            +
                ldscore_array = []
         
     | 
| 
      
 99 
     | 
    
         
            +
                print(f'Reading ld score annotations from {file}[1-22]{suffix}.{compression}')
         
     | 
| 
      
 100 
     | 
    
         
            +
             
     | 
| 
      
 101 
     | 
    
         
            +
                for chr in range(1, 23):
         
     | 
| 
      
 102 
     | 
    
         
            +
                    file_chr = f'{file}{chr}{suffix}{s}'
         
     | 
| 
      
 103 
     | 
    
         
            +
                    #
         
     | 
| 
      
 104 
     | 
    
         
            +
                    if compression == 'parquet':
         
     | 
| 
      
 105 
     | 
    
         
            +
                        x = pd.read_parquet(file_chr)
         
     | 
| 
      
 106 
     | 
    
         
            +
                    elif compression == 'feather':
         
     | 
| 
      
 107 
     | 
    
         
            +
                        x = pd.read_feather(file_chr)
         
     | 
| 
      
 108 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 109 
     | 
    
         
            +
                        x = pd.read_csv(file_chr, compression=compression, sep='\t')
         
     | 
| 
      
 110 
     | 
    
         
            +
             
     | 
| 
      
 111 
     | 
    
         
            +
                    x = x.sort_values(by=['CHR', 'BP'])  # SEs will be wrong unless sorted
         
     | 
| 
      
 112 
     | 
    
         
            +
             
     | 
| 
      
 113 
     | 
    
         
            +
                    columns_to_drop = ['MAF', 'CM', 'Gene', 'TSS', 'CHR', 'BP']
         
     | 
| 
      
 114 
     | 
    
         
            +
                    columns_to_drop = [col for col in columns_to_drop if col in x.columns]
         
     | 
| 
      
 115 
     | 
    
         
            +
                    x = x.drop(columns_to_drop, axis=1)
         
     | 
| 
      
 116 
     | 
    
         
            +
             
     | 
| 
      
 117 
     | 
    
         
            +
                    ldscore_array.append(x)
         
     | 
| 
      
 118 
     | 
    
         
            +
                #
         
     | 
| 
      
 119 
     | 
    
         
            +
                ref_ld = pd.concat(ldscore_array, axis=0)
         
     | 
| 
      
 120 
     | 
    
         
            +
                return ref_ld
         
     | 
| 
      
 121 
     | 
    
         
            +
             
     | 
| 
      
 122 
     | 
    
         
            +
             
     | 
| 
      
 123 
     | 
    
         
            +
            def _read_ref_ld_v2(ld_file):
         
     | 
| 
      
 124 
     | 
    
         
            +
                suffix = '.l2.ldscore'
         
     | 
| 
      
 125 
     | 
    
         
            +
                file = ld_file
         
     | 
| 
      
 126 
     | 
    
         
            +
                first_fh = f'{file}1{suffix}'
         
     | 
| 
      
 127 
     | 
    
         
            +
                s, compression = which_compression(first_fh)
         
     | 
| 
      
 128 
     | 
    
         
            +
                print(f'Reading ld score annotations from {file}[1-22]{suffix}.{compression}')
         
     | 
| 
      
 129 
     | 
    
         
            +
                ref_ld = pd.concat(
         
     | 
| 
      
 130 
     | 
    
         
            +
                    [pd.read_feather(f'{file}{chr}{suffix}{s}') for chr in range(1, 23)], axis=0
         
     | 
| 
      
 131 
     | 
    
         
            +
                )
         
     | 
| 
      
 132 
     | 
    
         
            +
                # set first column as index
         
     | 
| 
      
 133 
     | 
    
         
            +
                ref_ld.rename(columns={'index': 'SNP'}, inplace=True)
         
     | 
| 
      
 134 
     | 
    
         
            +
                ref_ld.set_index('SNP', inplace=True)
         
     | 
| 
      
 135 
     | 
    
         
            +
                return ref_ld
         
     | 
| 
      
 136 
     | 
    
         
            +
             
     | 
| 
      
 137 
     | 
    
         
            +
            def _read_M_v2(ld_file, n_annot, not_M_5_50):
         
     | 
| 
      
 138 
     | 
    
         
            +
                suffix = '.l2.M'
         
     | 
| 
      
 139 
     | 
    
         
            +
                if not not_M_5_50:
         
     | 
| 
      
 140 
     | 
    
         
            +
                    suffix += '_5_50'
         
     | 
| 
      
 141 
     | 
    
         
            +
                M_annot= np.array(
         
     | 
| 
      
 142 
     | 
    
         
            +
                    [
         
     | 
| 
      
 143 
     | 
    
         
            +
                        np.loadtxt(f'{ld_file}{chr}{suffix}', )
         
     | 
| 
      
 144 
     | 
    
         
            +
                     for chr in range(1, 23)]
         
     | 
| 
      
 145 
     | 
    
         
            +
             
     | 
| 
      
 146 
     | 
    
         
            +
                )
         
     | 
| 
      
 147 
     | 
    
         
            +
                assert M_annot.shape == (22, n_annot)
         
     | 
| 
      
 148 
     | 
    
         
            +
                return M_annot.sum(axis=0).reshape((1, n_annot))
         
     | 
| 
      
 149 
     | 
    
         
            +
            # Fun for reading M annotations 
         
     | 
| 
      
 150 
     | 
    
         
            +
            def _read_M(ld_file, n_annot, not_M_5_50):
         
     | 
| 
      
 151 
     | 
    
         
            +
                '''
         
     | 
| 
      
 152 
     | 
    
         
            +
                Read M (--M, --M-file, etc).
         
     | 
| 
      
 153 
     | 
    
         
            +
                '''
         
     | 
| 
      
 154 
     | 
    
         
            +
                M_annot = M(ld_file, common=(not not_M_5_50))
         
     | 
| 
      
 155 
     | 
    
         
            +
             
     | 
| 
      
 156 
     | 
    
         
            +
                try:
         
     | 
| 
      
 157 
     | 
    
         
            +
                    M_annot = np.array(M_annot).reshape((1, n_annot))
         
     | 
| 
      
 158 
     | 
    
         
            +
                except ValueError as e:
         
     | 
| 
      
 159 
     | 
    
         
            +
                    raise ValueError('# terms in --M must match # of LD Scores in --ref-ld.\n' + str(e.args))
         
     | 
| 
      
 160 
     | 
    
         
            +
                return M_annot
         
     | 
| 
      
 161 
     | 
    
         
            +
             
     | 
| 
      
 162 
     | 
    
         
            +
             
     | 
| 
      
 163 
     | 
    
         
            +
            def M(fh, common=False):
         
     | 
| 
      
 164 
     | 
    
         
            +
                '''
         
     | 
| 
      
 165 
     | 
    
         
            +
                Parses .l{N}.M files, split across num chromosomes.
         
     | 
| 
      
 166 
     | 
    
         
            +
                '''
         
     | 
| 
      
 167 
     | 
    
         
            +
                suffix = '.l2.M'
         
     | 
| 
      
 168 
     | 
    
         
            +
                if common:
         
     | 
| 
      
 169 
     | 
    
         
            +
                    suffix += '_5_50'
         
     | 
| 
      
 170 
     | 
    
         
            +
                # -
         
     | 
| 
      
 171 
     | 
    
         
            +
                M_array = []
         
     | 
| 
      
 172 
     | 
    
         
            +
                for i in range(1, 23):
         
     | 
| 
      
 173 
     | 
    
         
            +
                    M_current = pd.read_csv(f'{fh}{i}' + suffix, header=None)
         
     | 
| 
      
 174 
     | 
    
         
            +
                    M_array.append(M_current)
         
     | 
| 
      
 175 
     | 
    
         
            +
             
     | 
| 
      
 176 
     | 
    
         
            +
                M_array = pd.concat(M_array, axis=1).sum(axis=1)
         
     | 
| 
      
 177 
     | 
    
         
            +
                # -
         
     | 
| 
      
 178 
     | 
    
         
            +
                return np.array(M_array).reshape((1, len(M_array)))
         
     | 
| 
      
 179 
     | 
    
         
            +
             
     | 
| 
      
 180 
     | 
    
         
            +
             
     | 
| 
      
 181 
     | 
    
         
            +
            def _check_variance(M_annot, ref_ld):
         
     | 
| 
      
 182 
     | 
    
         
            +
                '''
         
     | 
| 
      
 183 
     | 
    
         
            +
                Remove zero-variance LD Scores.
         
     | 
| 
      
 184 
     | 
    
         
            +
                '''
         
     | 
| 
      
 185 
     | 
    
         
            +
                ii = ref_ld.iloc[:, 1:].var() == 0  # NB there is a SNP column here
         
     | 
| 
      
 186 
     | 
    
         
            +
                if ii.all():
         
     | 
| 
      
 187 
     | 
    
         
            +
                    raise ValueError('All LD Scores have zero variance.')
         
     | 
| 
      
 188 
     | 
    
         
            +
                else:
         
     | 
| 
      
 189 
     | 
    
         
            +
                    print('Removing partitioned LD Scores with zero variance.')
         
     | 
| 
      
 190 
     | 
    
         
            +
                    ii_snp = np.array([True] + list(~ii))
         
     | 
| 
      
 191 
     | 
    
         
            +
                    ii_m = np.array(~ii)
         
     | 
| 
      
 192 
     | 
    
         
            +
                    ref_ld = ref_ld.iloc[:, ii_snp]
         
     | 
| 
      
 193 
     | 
    
         
            +
                    M_annot = M_annot[:, ii_m]
         
     | 
| 
      
 194 
     | 
    
         
            +
                # -
         
     | 
| 
      
 195 
     | 
    
         
            +
                return M_annot, ref_ld, ii
         
     | 
| 
      
 196 
     | 
    
         
            +
            def _check_variance_v2(M_annot, ref_ld):
         
     | 
| 
      
 197 
     | 
    
         
            +
                ii = ref_ld.var() == 0
         
     | 
| 
      
 198 
     | 
    
         
            +
                if ii.all():
         
     | 
| 
      
 199 
     | 
    
         
            +
                    raise ValueError('All LD Scores have zero variance.')
         
     | 
| 
      
 200 
     | 
    
         
            +
                elif not ii.any():
         
     | 
| 
      
 201 
     | 
    
         
            +
                    print('No partitioned LD Scores have zero variance.')
         
     | 
| 
      
 202 
     | 
    
         
            +
                else:
         
     | 
| 
      
 203 
     | 
    
         
            +
                    ii_snp= ii_m = np.array(~ii)
         
     | 
| 
      
 204 
     | 
    
         
            +
                    print(f'Removing {sum(ii)} partitioned LD Scores with zero variance.')
         
     | 
| 
      
 205 
     | 
    
         
            +
                    ref_ld = ref_ld.iloc[:, ii_snp]
         
     | 
| 
      
 206 
     | 
    
         
            +
                    M_annot = M_annot[:, ii_m]
         
     | 
| 
      
 207 
     | 
    
         
            +
                return M_annot, ref_ld
         
     | 
| 
      
 208 
     | 
    
         
            +
             
     | 
| 
      
 209 
     | 
    
         
            +
             
     | 
| 
      
 210 
     | 
    
         
            +


# Functions for reading regression weights
def which_compression(fh):
    '''
    Given a file prefix, figure out what sort of compression to use.
    '''
    if os.access(fh + '.bz2', os.R_OK):
        suffix = '.bz2'
        compression = 'bz2'
    elif os.access(fh + '.gz', os.R_OK):
        suffix = '.gz'
        compression = 'gzip'
    elif os.access(fh + '.parquet', os.R_OK):
        suffix = '.parquet'
        compression = 'parquet'
    elif os.access(fh + '.feather', os.R_OK):
        suffix = '.feather'
        compression = 'feather'
    elif os.access(fh, os.R_OK):
        suffix = ''
        compression = None
    else:
        raise IOError('Could not open {F}[.bz2/.gz/.parquet/.feather]'.format(F=fh))
    # -
    return suffix, compression
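A brief usage sketch for which_compression (the prefix below is hypothetical): given a file prefix, it probes in order for a readable .bz2, .gz, .parquet, .feather, or uncompressed file and returns the suffix to append together with the matching read hint.

# Illustrative sketch; suppose only 'weights.1.l2.ldscore.gz' exists on disk.
suffix, compression = which_compression('weights.1.l2.ldscore')
print(suffix, compression)  # '.gz' 'gzip'
# Callers then open f'weights.1.l2.ldscore{suffix}' with the returned compression.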


def _read_w_ld(w_file):
    suffix = '.l2.ldscore'
    file = w_file
    first_fh = f'{file}1{suffix}'
    s, compression = which_compression(first_fh)
    #
    w_array = []
    print(f'Reading ld score annotations from {file}[1-22]{suffix}{s}')

    for chrom in range(1, 23):
        file_chr = f'{file}{chrom}{suffix}{s}'
        #
        if compression == 'parquet':
            x = pd.read_parquet(file_chr)
        elif compression == 'feather':
            x = pd.read_feather(file_chr)
        else:
            x = pd.read_csv(file_chr, compression=compression, sep='\t')

        x = x.sort_values(by=['CHR', 'BP'])

        # Keep only the SNP identifier and the single LD Score column.
        columns_to_drop = ['MAF', 'CM', 'Gene', 'TSS', 'CHR', 'BP']
        columns_to_drop = [col for col in columns_to_drop if col in x.columns]
        x = x.drop(columns_to_drop, axis=1)

        w_array.append(x)
    #
    w_ld = pd.concat(w_array, axis=0)
    w_ld.columns = ['SNP', 'LD_weights']

    return w_ld
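_read_w_ld assumes one regression-weight LD Score file per autosome, named as the prefix plus the chromosome number plus '.l2.ldscore' and whatever compression suffix which_compression detects, each containing SNP/CHR/BP plus a single LD Score column. A hedged usage sketch (the path is hypothetical and must exist on disk):

# Illustrative sketch; files on disk would be 'weights/weights.1.l2.ldscore.gz'
# through 'weights/weights.22.l2.ldscore.gz'.
w_ld = _read_w_ld('weights/weights.')
print(w_ld.columns.tolist())  # ['SNP', 'LD_weights']
print(len(w_ld))              # total SNP count across chromosomes 1-22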


# Functions for merging
def _merge_and_log(ld, sumstats, noun):
    '''
    Wrap smart merge with log messages about # of SNPs.
    '''
    sumstats = smart_merge(ld, sumstats)
    msg = 'After merging with {F}, {N} SNPs remain.'
    if len(sumstats) == 0:
        raise ValueError(msg.format(N=len(sumstats), F=noun))
    else:
        print(msg.format(N=len(sumstats), F=noun))
    # -
    return sumstats


def smart_merge(x, y):
    '''
    Check if SNP columns are equal. If so, save time by using concat instead of merge.
    '''
    if len(x) == len(y) and (x.index == y.index).all() and (x.SNP == y.SNP).all():
        x = x.reset_index(drop=True)
        y = y.reset_index(drop=True).drop('SNP', axis=1)  # positional axis removed in pandas 2.0
        out = pd.concat([x, y], axis=1)
    else:
        out = pd.merge(x, y, how='inner', on='SNP')
    return out
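To round off, a small sketch of the merge helpers with made-up frames: because the SNP columns below differ, smart_merge falls back to an inner join on SNP, and _merge_and_log reports the surviving SNP count (when the SNP columns and indices match exactly, the cheaper concat path is taken instead).

# Illustrative toy frames; real callers pass LD Scores and munged summary statistics.
ld = pd.DataFrame({'SNP': ['rs1', 'rs2', 'rs3'], 'L2': [1.0, 2.0, 3.0]})
ss = pd.DataFrame({'SNP': ['rs2', 'rs3', 'rs4'], 'Z': [0.5, -1.2, 2.0], 'N': [50000] * 3})

merged = _merge_and_log(ld, ss, 'regression weight LD')
# prints: After merging with regression weight LD, 2 SNPs remain.
print(merged)  # rows for rs2 and rs3 with columns SNP, L2, Z, N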