gsMap 1.67__py3-none-any.whl → 1.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/{GNN_VAE → GNN}/__init__.py +0 -0
 - gsMap/{GNN_VAE → GNN}/adjacency_matrix.py +75 -75
 - gsMap/{GNN_VAE → GNN}/model.py +89 -89
 - gsMap/{GNN_VAE → GNN}/train.py +88 -86
 - gsMap/__init__.py +5 -5
 - gsMap/__main__.py +2 -2
 - gsMap/cauchy_combination_test.py +141 -141
 - gsMap/config.py +805 -803
 - gsMap/diagnosis.py +273 -273
 - gsMap/find_latent_representation.py +133 -145
 - gsMap/format_sumstats.py +407 -407
 - gsMap/generate_ldscore.py +618 -618
 - gsMap/latent_to_gene.py +234 -234
 - gsMap/main.py +31 -31
 - gsMap/report.py +160 -160
 - gsMap/run_all_mode.py +194 -194
 - gsMap/setup.py +0 -0
 - gsMap/spatial_ldsc_multiple_sumstats.py +380 -380
 - gsMap/templates/report_template.html +198 -198
 - gsMap/utils/__init__.py +0 -0
 - gsMap/utils/generate_r2_matrix.py +735 -735
 - gsMap/utils/jackknife.py +514 -514
 - gsMap/utils/make_annotations.py +518 -518
 - gsMap/utils/manhattan_plot.py +639 -639
 - gsMap/utils/regression_read.py +294 -294
 - gsMap/visualize.py +198 -198
 - {gsmap-1.67.dist-info → gsmap-1.71.dist-info}/LICENSE +21 -21
 - {gsmap-1.67.dist-info → gsmap-1.71.dist-info}/METADATA +28 -22
 - gsmap-1.71.dist-info/RECORD +31 -0
 - gsmap-1.67.dist-info/RECORD +0 -31
 - {gsmap-1.67.dist-info → gsmap-1.71.dist-info}/WHEEL +0 -0
 - {gsmap-1.67.dist-info → gsmap-1.71.dist-info}/entry_points.txt +0 -0
 
    
        gsMap/generate_ldscore.py
    CHANGED
    
    | 
         @@ -1,618 +1,618 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            import logging
         
     | 
| 
       2 
     | 
    
         
            -
            import warnings
         
     | 
| 
       3 
     | 
    
         
            -
            from pathlib import Path
         
     | 
| 
       4 
     | 
    
         
            -
             
     | 
| 
       5 
     | 
    
         
            -
            import numpy as np
         
     | 
| 
       6 
     | 
    
         
            -
            import pandas as pd
         
     | 
| 
       7 
     | 
    
         
            -
            import pyranges as pr
         
     | 
| 
       8 
     | 
    
         
            -
            import zarr
         
     | 
| 
       9 
     | 
    
         
            -
            from scipy.sparse import csr_matrix
         
     | 
| 
       10 
     | 
    
         
            -
            from tqdm import trange
         
     | 
| 
       11 
     | 
    
         
            -
             
     | 
| 
       12 
     | 
    
         
            -
            from gsMap.config import GenerateLDScoreConfig
         
     | 
| 
       13 
     | 
    
         
            -
            from gsMap.utils.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
         
     | 
| 
       14 
     | 
    
         
            -
             
     | 
| 
       15 
     | 
    
         
            -
            warnings.filterwarnings("ignore", category=FutureWarning)
         
     | 
| 
       16 
     | 
    
         
            -
            logger = logging.getLogger(__name__)
         
     | 
| 
       17 
     | 
    
         
            -
             
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
            # %%
         
     | 
| 
       20 
     | 
    
         
            -
            # load gtf
         
     | 
| 
       21 
     | 
    
         
            -
            def load_gtf(gtf_file, mk_score, window_size):
                """
                Build windowed gene intervals from a GTF and align the marker scores to them.

                Only 'gene' features whose gene_name also appears in the index of
                ``mk_score`` are kept (first occurrence per gene_name). Each gene
                interval is extended by ``window_size`` on both sides, with the
                start clamped at 0. TSS/TED hold the un-windowed gene start/end,
                swapped for genes on the '-' strand.

                Returns a tuple ``(gtf_pr, mk_score)``: the PyRanges of windowed
                genes and ``mk_score`` restricted to the shared genes.
                """
                print("Loading gtf data")

                # Parse the annotation and keep gene-level records only
                annotation = pr.read_gtf(gtf_file).df
                genes = annotation[annotation['Feature'] == 'gene']

                # Restrict both tables to the genes they have in common
                shared_genes = np.intersect1d(mk_score.index, genes.gene_name)
                genes = genes[genes.gene_name.isin(shared_genes)]
                mk_score = mk_score[mk_score.index.isin(shared_genes)]

                # One record per gene name
                genes = genes.drop_duplicates(subset='gene_name', keep="first")

                # Remember the raw gene boundaries before widening the interval
                gtf_bed = genes[['Chromosome', 'Start', 'End', 'gene_name', 'Strand']].copy()
                gtf_bed['TSS'] = gtf_bed['Start']
                gtf_bed['TED'] = gtf_bed['End']

                # Extend by window_size on each side; coordinates cannot go below 0
                gtf_bed.loc[:, 'Start'] = gtf_bed['TSS'] - window_size
                gtf_bed.loc[:, 'End'] = gtf_bed['TED'] + window_size
                below_zero = gtf_bed['Start'] < 0
                gtf_bed.loc[below_zero, 'Start'] = 0

                # On the minus strand the transcription start is the interval end
                on_minus = gtf_bed['Strand'] == '-'
                minus_tss = gtf_bed.loc[on_minus, 'TSS']
                minus_ted = gtf_bed.loc[on_minus, 'TED']
                gtf_bed.loc[on_minus, 'TSS'] = minus_ted
                gtf_bed.loc[on_minus, 'TED'] = minus_tss

                gtf_bed = gtf_bed.drop(columns='Strand')

                return pr.PyRanges(gtf_bed), mk_score
         
     | 
| 
       60 
     | 
    
         
            -
             
     | 
| 
       61 
     | 
    
         
            -
             
     | 
| 
       62 
     | 
    
         
            -
            # %%
         
     | 
| 
       63 
     | 
    
         
            -
            def load_marker_score(mk_score_file):
         
     | 
| 
       64 
     | 
    
         
            -
                """
         
     | 
| 
       65 
     | 
    
         
            -
                Load marker scores of each cell.
         
     | 
| 
       66 
     | 
    
         
            -
                """
         
     | 
| 
       67 
     | 
    
         
            -
                mk_score = pd.read_feather(mk_score_file).set_index('HUMAN_GENE_SYM').rename_axis('gene_name')
         
     | 
| 
       68 
     | 
    
         
            -
                mk_score = mk_score.astype(np.float32, copy=False)
         
     | 
| 
       69 
     | 
    
         
            -
                return mk_score
         
     | 
| 
       70 
     | 
    
         
            -
             
     | 
| 
       71 
     | 
    
         
            -
             
     | 
| 
       72 
     | 
    
         
            -
            # %%
         
     | 
| 
       73 
     | 
    
         
            -
            # load mkscore get common gene
         
     | 
| 
       74 
     | 
    
         
            -
            # %%
         
     | 
| 
       75 
     | 
    
         
            -
            # load bim
         
     | 
| 
       76 
     | 
    
         
            -
            def load_bim(bfile_root, chrom):
                """
                Read ``{bfile_root}.{chrom}.bim`` and also expose it as PyRanges.

                Returns a tuple ``(bim, bim_pr)``: the tab-separated table with
                columns CHR, SNP, CM, BP, A1, A2, and a PyRanges in which every
                SNP is the interval [BP - 1, BP) on chromosome ``chr{chrom}``.
                """
                bim = pd.read_csv(f'{bfile_root}.{chrom}.bim', sep='\t', header=None)
                bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]

                # Relabel a copy with the column names PyRanges expects
                snp_intervals = bim.rename(columns={"CHR": "Chromosome", "BP": "Start"})

                # bim positions are 1-based: End keeps the position, Start shifts down by one
                positions = snp_intervals['Start']
                snp_intervals = snp_intervals.assign(End=positions, Start=positions - 1)

                bim_pr = pr.PyRanges(snp_intervals)
                bim_pr.Chromosome = f'chr{chrom}'
                return bim, bim_pr
         
     | 
| 
       93 
     | 
    
         
            -
             
     | 
| 
       94 
     | 
    
         
            -
             
     | 
| 
       95 
     | 
    
         
            -
            # %%
         
     | 
| 
       96 
     | 
    
         
            -
            def Overlaps_gtf_bim(gtf_pr, bim_pr):
                """
                Assign each SNP to the overlapping gene window with the nearest TSS.

                Joins the gene windows with the SNP intervals, measures
                |Start_b - TSS| for every SNP-gene pair and returns, per SNP,
                the row with the smallest distance.
                """
                # All (gene window, SNP) pairs that overlap
                pairs = gtf_pr.join(bim_pr).df
                pairs['Distance'] = (pairs['Start_b'] - pairs['TSS']).abs()

                # Row label of the closest gene for every SNP
                closest_rows = pairs.groupby('SNP')['Distance'].idxmin()
                return pairs.loc[closest_rows]
         
     | 
| 
       107 
     | 
    
         
            -
             
     | 
| 
       108 
     | 
    
         
            -
             
     | 
| 
       109 
     | 
    
         
            -
            # %%
         
     | 
| 
       110 
     | 
    
         
            -
            def filter_snps_by_keep_snp(bim_df, keep_snp_file):
                """
                Keep only the rows of ``bim_df`` whose SNP is listed in ``keep_snp_file``.

                The keep file is read without a header; its first column holds
                the SNP identifiers to retain.
                """
                keep_table = pd.read_csv(keep_snp_file, header=None)
                wanted_ids = keep_table[0].to_list()
                is_wanted = bim_df['SNP'].isin(wanted_ids)
                return bim_df[is_wanted]
         
     | 
| 
       115 
     | 
    
         
            -
             
     | 
| 
       116 
     | 
    
         
            -
             
     | 
| 
       117 
     | 
    
         
            -
            def get_snp_counts(config):
                """
                Count the SNPs per autosome (1-22) after the optional keep-SNP filter.

                Returns a dict with one entry per chromosome number, plus:
                  'total'                 - sum over all chromosomes
                  'chrom_snp_start_point' - cumulative offsets, starting with 0
                                            (23 values for the 22 chromosomes)
                """
                snp_counts = {}
                start_points = [0]

                for chrom in range(1, 23):
                    bim_df, _ = load_bim(config.bfile_root, chrom)

                    # Apply the per-chromosome keep list only when one is configured
                    if config.keep_snp_root:
                        bim_df = filter_snps_by_keep_snp(bim_df, f'{config.keep_snp_root}.{chrom}.snp')

                    n_snps = bim_df.shape[0]
                    snp_counts[chrom] = n_snps
                    start_points.append(start_points[-1] + n_snps)

                # The last offset is the grand total
                snp_counts['total'] = start_points[-1]
                snp_counts['chrom_snp_start_point'] = start_points

                return snp_counts
         
     | 
| 
       140 
     | 
    
         
            -
             
     | 
| 
       141 
     | 
    
         
            -
             
     | 
| 
       142 
     | 
    
         
            -
            # %%
         
     | 
| 
       143 
     | 
    
         
            -
            def get_snp_pass_maf(bfile_root, chrom, maf_min=0.05):
                """
                List the SNPs of one chromosome whose minor allele frequency exceeds ``maf_min``.

                Reads the PLINK fileset ``{bfile_root}.{chrom}`` (.bim, .fam, .bed),
                takes the per-SNP MAF from the genotype array and keeps the SNPs
                with MAF strictly greater than ``maf_min``.

                Args:
                    bfile_root: Path prefix of the per-chromosome PLINK files.
                    chrom: Chromosome identifier inserted after ``bfile_root``.
                    maf_min: MAF threshold; SNPs with MAF equal to it are dropped too.

                Returns:
                    A list with the SNP identifiers that pass the filter.
                """
                PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
                PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])

                bfile = f'{bfile_root}.{chrom}'

                # Load the bim file (SNP table)
                array_snps = PlinkBIMFile(bfile + '.bim')

                # Load the fam file; only the number of individuals is needed here
                array_indivs = PlinkFAMFile(bfile + '.fam')
                n = len(array_indivs.IDList)

                # No MAF filter is applied while reading the genotypes (mafMin=None);
                # the threshold is applied below on the computed frequencies.
                geno_array = PlinkBEDFileWithR2Cache(bfile + '.bed', n, array_snps, keep_snps=None, keep_indivs=None,
                                                     mafMin=None)
                ii = geno_array.maf > maf_min
                # NOTE(review): IDList is presumably a DataFrame with a 'SNP' column - confirm in generate_r2_matrix
                snp_pass_maf = array_snps.IDList[ii]
                print(f'After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain.')
                return snp_pass_maf.SNP.to_list()
         
     | 
| 
       166 
     | 
    
         
            -
             
     | 
| 
       167 
     | 
    
         
            -
             
     | 
| 
       168 
     | 
    
         
            -
            def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit='CM'):
                """
                Compute LD scores of ``annot_matrix`` for one chromosome.

                Reads the PLINK fileset ``{bfile_root}.{chrom}``, derives the left
                block boundaries from the LD window and returns the result of
                ``ldScoreVarBlocks`` wrapped in a DataFrame.

                ``ld_unit`` selects how the window ``ld_wind`` is measured:
                'SNP' (SNP index), 'KB' (base-pair position, window * 1000) or
                'CM' (genetic position). Any other value raises ValueError.
                """
                bim_reader = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
                fam_reader = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])

                prefix = f'{bfile_root}.{chrom}'

                # SNP table
                bim_path = prefix + '.bim'
                array_snps = bim_reader(bim_path)
                print(f'Read list of {len(array_snps.IDList)} SNPs from {bim_path}')

                # Individuals
                fam_path = prefix + '.fam'
                n_indiv = len(fam_reader(fam_path).IDList)
                print(f'Read list of {n_indiv} individuals from {fam_path}')

                # Genotypes, without any SNP / individual / MAF filtering
                geno_array = PlinkBEDFileWithR2Cache(prefix + '.bed', n_indiv, array_snps, keep_snps=None,
                                                     keep_indivs=None, mafMin=None)

                # Translate the window into coordinates and a maximum distance
                if ld_unit == 'SNP':
                    max_dist = ld_wind
                    coords = np.array(range(geno_array.m))
                elif ld_unit == 'KB' or ld_unit == 'CM':
                    in_kb = ld_unit == 'KB'
                    max_dist = ld_wind * 1000 if in_kb else ld_wind
                    position_column = 'BP' if in_kb else 'CM'
                    coords = np.array(array_snps.df[position_column])[geno_array.kept_snps]
                else:
                    raise ValueError(f'Invalid ld_wind_unit: {ld_unit}')

                block_left = getBlockLefts(coords, max_dist)

                # NOTE(review): 100 is presumably the chunk size of ldScoreVarBlocks - confirm
                return pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 100, annot=annot_matrix))
         
     | 
| 
       201 
     | 
    
         
            -
             
     | 
| 
       202 
     | 
    
         
            -
             
     | 
| 
       203 
     | 
    
         
            -
            # %%
         
     | 
| 
       204 
     | 
    
         
            -
            def calculate_ldscore_from_annotation(SNP_annotation_df, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
                """
                Compute the LD scores of one SNP annotation table.

                The values of ``SNP_annotation_df`` are passed to ``get_ldscore``;
                the result is cast to float32 and labelled with the index and
                columns of the input annotation.
                """
                ld_scores = get_ldscore(
                    bfile_root,
                    chrom,
                    SNP_annotation_df.values,
                    ld_wind=ld_wind,
                    ld_unit=ld_unit,
                ).astype(np.float32, copy=False)

                # Carry over the SNP and annotation labels
                ld_scores.index = SNP_annotation_df.index
                ld_scores.columns = SNP_annotation_df.columns
                return ld_scores
         
     | 
| 
       216 
     | 
    
         
            -
             
     | 
| 
       217 
     | 
    
         
            -
             
     | 
| 
       218 
     | 
    
         
            -
            def calculate_ldscore_from_multiple_annotation(SNP_annotation_df_list, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
                """
                Compute LD scores for several annotation tables in a single pass.

                The tables are concatenated column-wise, scored together with
                ``get_ldscore`` and split again, so the returned list holds one
                float32 DataFrame per input table, in the same order and with
                the same number of columns.
                """
                combined = pd.concat(SNP_annotation_df_list, axis=1).astype(np.float32, copy=False)

                ld_scores = get_ldscore(
                    bfile_root,
                    chrom,
                    combined.values,
                    ld_wind=ld_wind,
                    ld_unit=ld_unit,
                ).astype(np.float32, copy=False)
                ld_scores.index = combined.index
                ld_scores.columns = combined.columns

                # Column boundaries of each input table inside the combined matrix
                widths = [len(df.columns) for df in SNP_annotation_df_list]
                bounds = np.cumsum([0] + widths)
                return [ld_scores.iloc[:, lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]
         
     | 
| 
       235 
     | 
    
         
            -
             
     | 
| 
       236 
     | 
    
         
            -
             
     | 
| 
       237 
     | 
    
         
            -
            # %%
         
     | 
| 
       238 
     | 
    
         
            -
            class S_LDSC_Boost:
         
     | 
| 
       239 
     | 
    
         
            -
                def __init__(self, config: GenerateLDScoreConfig):
         
     | 
| 
       240 
     | 
    
         
            -
                    self.config = config
         
     | 
| 
       241 
     | 
    
         
            -
             
     | 
| 
       242 
     | 
    
         
            -
                    self.mk_score = load_marker_score(config.mkscore_feather_path)
         
     | 
| 
       243 
     | 
    
         
            -
             
     | 
| 
       244 
     | 
    
         
            -
                    # Load GTF and get common markers
         
     | 
| 
       245 
     | 
    
         
            -
                    self.gtf_pr, self.mk_score_common = load_gtf(config.gtf_annotation_file, self.mk_score,
         
     | 
| 
       246 
     | 
    
         
            -
                                                                 window_size=config.gene_window_size)
         
     | 
| 
       247 
     | 
    
         
            -
             
     | 
| 
       248 
     | 
    
         
            -
                    # Load enhancer
         
     | 
| 
       249 
     | 
    
         
            -
                    if config.enhancer_annotation_file is not None:
         
     | 
| 
       250 
     | 
    
         
            -
                        enhancer_df = pr.read_bed(config.enhancer_annotation_file, as_df=True)
         
     | 
| 
       251 
     | 
    
         
            -
                        enhancer_df.set_index('Name', inplace=True)
         
     | 
| 
       252 
     | 
    
         
            -
                        enhancer_df.index.name = 'gene_name'
         
     | 
| 
       253 
     | 
    
         
            -
             
     | 
| 
       254 
     | 
    
         
            -
                        # keep the common genes and add the enhancer score
         
     | 
| 
       255 
     | 
    
         
            -
                        avg_mkscore = pd.DataFrame(self.mk_score_common.mean(axis=1), columns=['avg_mkscore'])
         
     | 
| 
       256 
     | 
    
         
            -
                        enhancer_df = enhancer_df.join(avg_mkscore, how='inner', on='gene_name', )
         
     | 
| 
       257 
     | 
    
         
            -
             
     | 
| 
       258 
     | 
    
         
            -
                        # add distance to TSS
         
     | 
| 
       259 
     | 
    
         
            -
                        enhancer_df['TSS'] = self.gtf_pr.df.set_index('gene_name').reindex(enhancer_df.index)['TSS']
         
     | 
| 
       260 
     | 
    
         
            -
             
     | 
| 
       261 
     | 
    
         
            -
                        # convert to pyranges
         
     | 
| 
       262 
     | 
    
         
            -
                        self.enhancer_pr = pr.PyRanges(enhancer_df.reset_index())
         
     | 
| 
       263 
     | 
    
         
            -
             
     | 
| 
       264 
     | 
    
         
            -
                    else:
         
     | 
| 
       265 
     | 
    
         
            -
                        self.enhancer_pr = None
         
     | 
| 
       266 
     | 
    
         
            -
             
     | 
| 
       267 
     | 
    
         
            -
                    # create the zarr file
         
     | 
| 
       268 
     | 
    
         
            -
                    if config.ldscore_save_format == 'zarr':
         
     | 
| 
       269 
     | 
    
         
            -
             
     | 
| 
       270 
     | 
    
         
            -
                        chrom_snp_length_dict = get_snp_counts(config)
         
     | 
| 
       271 
     | 
    
         
            -
                        self.chrom_snp_start_point = chrom_snp_length_dict['chrom_snp_start_point']
         
     | 
| 
       272 
     | 
    
         
            -
             
     | 
| 
       273 
     | 
    
         
            -
                        zarr_path = Path(config.ldscore_save_dir) / f'{config.sample_name}.ldscore.zarr'
         
     | 
| 
       274 
     | 
    
         
            -
                        if not zarr_path.exists():
         
     | 
| 
       275 
     | 
    
         
            -
                            self.zarr_file = zarr.open(zarr_path.as_posix(), mode='a', dtype=np.float16,
         
     | 
| 
       276 
     | 
    
         
            -
                                                       chunks=config.zarr_chunk_size,
         
     | 
| 
       277 
     | 
    
         
            -
                                                       shape=(chrom_snp_length_dict['total'], self.mk_score_common.shape[1]))
         
     | 
| 
       278 
     | 
    
         
            -
                            zarr_path.mkdir(parents=True, exist_ok=True)
         
     | 
| 
       279 
     | 
    
         
            -
                            # save spot names
         
     | 
| 
       280 
     | 
    
         
            -
                            self.zarr_file.attrs['spot_names'] = self.mk_score_common.columns.to_list()
         
     | 
| 
       281 
     | 
    
         
            -
                            # save chrom_snp_length_dict
         
     | 
| 
       282 
     | 
    
         
            -
                            self.zarr_file.attrs['chrom_snp_start_point'] = self.chrom_snp_start_point
         
     | 
| 
       283 
     | 
    
         
            -
                        else:
         
     | 
| 
       284 
     | 
    
         
            -
                            self.zarr_file = zarr.open(zarr_path.as_posix(), mode='a')
         
     | 
| 
       285 
     | 
    
         
            -
             
     | 
| 
       286 
     | 
    
         
            -
                def process_chromosome(self, chrom: int) -> None:
                    """
                    Run the LD-score pipeline for a single chromosome.

                    Steps, in order:
                      1. Collect the SNPs passing the MAF filter (``maf_min=0.05``).
                      2. Build the SNP-by-gene dummy annotation via ``get_snp_gene_dummy``.
                      3. Optionally restrict the reported SNPs to the list in
                         ``{config.keep_snp_root}.{chrom}.snp``.
                      4. Compute the SNP-gene weight matrix; when
                         ``config.additional_baseline_annotation`` is set, its LD scores are
                         computed in the same call and written (together with ``.M`` and
                         ``.M_5_50`` files) under ``{ldscore_save_dir}/additional_baseline``.
                      5. Optionally dump the weight matrix to feather, then convert it to a
                         CSR sparse matrix.
                      6. Write the baseline LD scores and the per-chunk annotation LD scores.

                    Results are stored on ``self`` (``snp_pass_maf``, ``snp_gene_pair_dummy``,
                    ``keep_snp_mask``, ``snp_name``, ``snp_gene_weight_matrix``) and on disk;
                    nothing is returned.

                    :param chrom: chromosome identifier used in file names and passed to the helpers.
                    :raises AssertionError: if the additional baseline annotation file for this
                        chromosome does not exist.
                    """
                    self.snp_pass_maf = get_snp_pass_maf(self.config.bfile_root, chrom, maf_min=0.05)

                    # Get SNP-Gene dummy pairs
                    self.snp_gene_pair_dummy = self.get_snp_gene_dummy(chrom, )

                    if self.config.keep_snp_root is not None:
                        # The keep file is read without a header; its first column holds the SNP names.
                        keep_snp = pd.read_csv(f'{self.config.keep_snp_root}.{chrom}.snp', header=None)[0].to_list()
                        self.keep_snp_mask = self.snp_gene_pair_dummy.index.isin(keep_snp)
                        # names of the SNPs that are kept
                        self.snp_name = self.snp_gene_pair_dummy.index[self.keep_snp_mask].to_list()
                    else:
                        self.keep_snp_mask = None
                        self.snp_name = self.snp_gene_pair_dummy.index.to_list()

                    if self.config.additional_baseline_annotation is not None:
                        additional_baseline_annotation = Path(self.config.additional_baseline_annotation)
                        additional_baseline_annotation_file_path = additional_baseline_annotation / f'baseline.{chrom}.annot.gz'
                        # NOTE(review): assert is stripped under `python -O`; an explicit raise would be safer.
                        assert additional_baseline_annotation_file_path.exists(), f'additional_baseline_annotation_file_path not exists: {additional_baseline_annotation_file_path}'
                        additional_baseline_annotation_df = pd.read_csv(additional_baseline_annotation_file_path, sep='\t')
                        additional_baseline_annotation_df.set_index('SNP', inplace=True)

                        # drop the CHR, BP and CM columns if they exist
                        additional_baseline_annotation_df.drop(['CHR', 'BP', 'CM'], axis=1, inplace=True, errors='ignore')

                        # Align the annotation rows with the SNP-gene dummy index; SNPs that are
                        # missing from the additional annotation get 0 for every annotation column.
                        num_of_not_exist_snp = (~self.snp_gene_pair_dummy.index.isin(additional_baseline_annotation_df.index)).sum()
                        if num_of_not_exist_snp > 0:
                            logger.warning(
                                f'{num_of_not_exist_snp} SNPs not in additional_baseline_annotation_df but in the reference panel, so the additional baseline annotation of these SNP will set to 0')
                            additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
                                self.snp_gene_pair_dummy.index,
                                fill_value=0)
                        else:
                            additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
                                self.snp_gene_pair_dummy.index)

                        # Both annotations go through one call so that r2 is only calculated once
                        # (saves CPU time).
                        self.snp_gene_weight_matrix, additional_baseline_annotation_ldscore = (
                            calculate_ldscore_from_multiple_annotation(
                                [self.snp_gene_pair_dummy, additional_baseline_annotation_df],
                                chrom,
                                self.config.bfile_root,
                                ld_wind=self.config.ld_wind,
                                ld_unit=self.config.ld_unit))

                        # Keep only the reported SNPs, in the order of self.snp_name, because
                        # save_ldscore_to_feather labels the rows with self.snp_name.
                        additional_baseline_annotation_ldscore = additional_baseline_annotation_ldscore.loc[self.snp_name]

                        ld_score_file = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather'
                        M_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M'
                        M_5_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50'

                        # save additional baseline annotation ldscore
                        self.save_ldscore_to_feather(additional_baseline_annotation_ldscore.values,
                                                     column_names=additional_baseline_annotation_ldscore.columns,
                                                     save_file_name=ld_score_file,
                                                     )

                        # Calculate M (column sums over all SNPs) and M_5_50 (column sums over
                        # the SNPs passing the MAF filter) and save both as tab-delimited text.
                        save_dir = Path(M_file_path).parent
                        save_dir.mkdir(parents=True, exist_ok=True)
                        M_chr_chunk = additional_baseline_annotation_df.values.sum(axis=0, keepdims=True)
                        M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(axis=0, keepdims=True)
                        np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
                        np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )

                    else:
                        # Calculate SNP-Gene weight matrix
                        self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(self.snp_gene_pair_dummy, chrom,
                                                                                        self.config.bfile_root,
                                                                                        ld_wind=self.config.ld_wind,
                                                                                        ld_unit=self.config.ld_unit)
                    # only keep the SNPs listed in keep_snp_root (applied after the LD computation)
                    if self.keep_snp_mask is not None:
                        self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]

                    if self.config.save_pre_calculate_snp_gene_weight_matrix:
                        snp_gene_weight_matrix_save_dir = Path(self.config.ldscore_save_dir) / 'snp_gene_weight_matrix'
                        snp_gene_weight_matrix_save_dir.mkdir(parents=True, exist_ok=True)
                        logger.info(f'Saving snp_gene_weight_matrix for chr{chrom}...')
                        self.snp_gene_weight_matrix.reset_index().to_feather(
                            snp_gene_weight_matrix_save_dir / f'{chrom}.snp_gene_weight_matrix.feather')

                    # convert to sparse
                    self.snp_gene_weight_matrix = csr_matrix(self.snp_gene_weight_matrix)
                    logger.info(f'Compute snp_gene_weight_matrix finished. shape: {self.snp_gene_weight_matrix.shape}')

                    # calculate baseline ld score
                    logger.info(f'Calculating baseline ld score for chr{chrom}...')
                    self.calculate_ldscore_for_base_line(chrom, self.config.sample_name, self.config.ldscore_save_dir)

                    # calculate ld score for annotation
                    logger.info(f'Calculating ld score for annotation for chr{chrom}...')
                    # The marker scores are restricted to every dummy column except the last one
                    # (presumably the "no gene" dummy column -- TODO confirm against get_snp_gene_dummy).
                    self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
                        self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]],
                        chrom,
                        self.config.sample_name,
                        self.config.ldscore_save_dir,
                    )
         
     | 
| 
       386 
     | 
    
         
            -
             
     | 
| 
       387 
     | 
    
         
            -
                def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(self,
                                                                          mk_score_chunk,
                                                                          drop_dummy_na=True,
                                                                          ):
                    """
                    Multiply the SNP-gene weight matrix by one chunk of marker scores.

                    :param mk_score_chunk: right-hand operand of the matrix product; its row
                        count must match the number of weight-matrix columns that are used.
                    :param drop_dummy_na: when true, the last column of
                        ``self.snp_gene_weight_matrix`` is left out of the product.
                    :return: the result of ``weights @ mk_score_chunk``.
                    """
                    weights = self.snp_gene_weight_matrix
                    if drop_dummy_na:
                        # leave out the trailing dummy column
                        weights = weights[:, :-1]
                    return weights @ mk_score_chunk
         
     | 
| 
       398 
     | 
    
         
            -
             
     | 
| 
       399 
     | 
    
         
            -
                def save_ldscore_to_feather(self, ldscore_chr_chunk: np.ndarray, column_names, save_file_name):
                    """
                    Write one block of LD scores to a feather file as float16.

                    The rows are labelled with ``self.snp_name`` (stored in a ``SNP`` column) and
                    the columns with ``column_names``. The parent directory of ``save_file_name``
                    is created if needed.

                    Values outside the float16 range are saturated to the nearest representable
                    finite value with their sign preserved (previously a negative overflow was
                    written as the *positive* float16 maximum). The input array is never
                    modified in place.

                    :param ldscore_chr_chunk: 2-D array of LD scores, one row per entry of ``self.snp_name``.
                    :param column_names: column labels for the saved table.
                    :param save_file_name: destination path of the feather file.
                    """
                    save_dir = Path(save_file_name).parent
                    save_dir.mkdir(parents=True, exist_ok=True)

                    # Saturate in the source dtype *before* casting: this avoids the float16
                    # overflow to +/-inf, keeps the sign, and always produces a new array.
                    float16_max = float(np.finfo(np.float16).max)
                    ldscore_chr_chunk = np.clip(np.asarray(ldscore_chr_chunk),
                                                -float16_max, float16_max).astype(np.float16)

                    # save for each chunk
                    df = pd.DataFrame(ldscore_chr_chunk,
                                      index=self.snp_name,
                                      columns=column_names,
                                      )
                    df.index.name = 'SNP'
                    # feather cannot store an index, so the SNP names become a regular column
                    df.reset_index().to_feather(save_file_name)
         
     | 
| 
       416 
     | 
    
         
            -
             
     | 
| 
       417 
     | 
    
         
            -
                def save_ldscore_chunk_to_zarr(self, ldscore_chr_chunk: np.ndarray,
                                               chrom: int, start_col_index,
                                               ):
                    """
                    Write one block of LD scores into ``self.zarr_file`` as float16.

                    The rows written are ``chrom_snp_start_point[chrom - 1]`` up to (but not
                    including) ``chrom_snp_start_point[chrom]``; the columns start at
                    ``start_col_index`` and span the width of ``ldscore_chr_chunk``.

                    Values outside the float16 range are saturated to the nearest representable
                    finite value with their sign preserved (previously a negative overflow was
                    written as the *positive* float16 maximum). The input array is never
                    modified in place.

                    :param ldscore_chr_chunk: 2-D array of LD scores for this chromosome and column chunk.
                    :param chrom: 1-based position of the chromosome in ``self.chrom_snp_start_point``.
                    :param start_col_index: index of the first destination column.
                    """
                    # Saturate in the source dtype *before* casting: this avoids the float16
                    # overflow to +/-inf, keeps the sign, and always produces a new array.
                    float16_max = float(np.finfo(np.float16).max)
                    ldscore_chr_chunk = np.clip(np.asarray(ldscore_chr_chunk),
                                                -float16_max, float16_max).astype(np.float16)

                    # row range occupied by this chromosome
                    row_start = self.chrom_snp_start_point[chrom - 1]
                    row_stop = self.chrom_snp_start_point[chrom]
                    col_stop = start_col_index + ldscore_chr_chunk.shape[1]

                    self.zarr_file[row_start:row_stop, start_col_index:col_stop] = ldscore_chr_chunk
         
     | 
| 
       430 
     | 
    
         
            -
             
     | 
| 
       431 
     | 
    
         
            -
                def calculate_M_use_SNP_gene_pair_dummy_by_chunk(self,
                                                                 mk_score_chunk,
                                                                 M_file_path, M_5_file_path,
                                                                 drop_dummy_na=True,
                                                                 ):
                    '''
                    Compute the M and M_5_50 rows for one chunk of marker scores and save them.

                    The per-column sums of ``self.snp_gene_pair_dummy`` (taken over all SNPs,
                    and over the SNPs in ``self.snp_pass_maf`` only) are multiplied by
                    ``mk_score_chunk`` and written as tab-delimited text to ``M_file_path``
                    and ``M_5_file_path`` respectively. The parent directory of
                    ``M_file_path`` is created if it does not exist.

                    :param mk_score_chunk: right-hand operand of the matrix product.
                    :param M_file_path: output path for the sums over all SNPs.
                    :param M_5_file_path: output path for the sums over the MAF-filtered SNPs.
                    :param drop_dummy_na: when true, the last dummy column is excluded.
                    '''
                    dummy = self.snp_gene_pair_dummy
                    counts_all = dummy.values.sum(axis=0, keepdims=True)
                    counts_pass_maf = dummy.loc[self.snp_pass_maf].values.sum(axis=0, keepdims=True)

                    if drop_dummy_na:
                        # leave out the trailing dummy column from both count vectors
                        counts_all, counts_pass_maf = counts_all[:, :-1], counts_pass_maf[:, :-1]

                    Path(M_file_path).parent.mkdir(parents=True, exist_ok=True)

                    # both products are computed before anything is written
                    M_chr_chunk = counts_all @ mk_score_chunk
                    M_5_chr_chunk = counts_pass_maf @ mk_score_chunk
                    for out_path, values in ((M_file_path, M_chr_chunk), (M_5_file_path, M_5_chr_chunk)):
                        np.savetxt(out_path, values, delimiter='\t')
         
     | 
| 
       453 
     | 
    
         
            -
             
     | 
| 
       454 
     | 
    
         
            -
                def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(self, mk_score_common, chrom, sample_name, save_dir):
                    """
                    Compute and save the LD scores of one chromosome, chunk by chunk.

                    The columns of ``mk_score_common`` are processed in consecutive groups of
                    ``config.spots_per_chunk``. For every group the LD scores are written
                    either to a feather file or into the zarr array, depending on
                    ``config.ldscore_save_format``; the ``.M`` and ``.M_5_50`` files are
                    written for every group in both cases. Chunks are numbered from 1.

                    :param mk_score_common: table whose columns are sliced into chunks with ``.iloc``.
                    :param chrom: chromosome identifier used in file names.
                    :param sample_name: sample name used in directory and file names.
                    :param save_dir: root directory for the per-chunk output.
                    :raises ValueError: if ``config.ldscore_save_format`` is neither
                        ``'feather'`` nor ``'zarr'``.
                    """
                    chunk_starts = trange(0, mk_score_common.shape[1], self.config.spots_per_chunk,
                                          desc=f'Calculating LD score by chunk for chr{chrom}')
                    for chunk_index, i in enumerate(chunk_starts, start=1):
                        mk_score_chunk = mk_score_common.iloc[:, i:i + self.config.spots_per_chunk]

                        ld_score_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.ldscore.feather'
                        M_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M'
                        M_5_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50'

                        ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
                            mk_score_chunk,
                            drop_dummy_na=True,
                        )

                        # the format is checked only after the chunk has been computed
                        save_format = self.config.ldscore_save_format
                        if save_format not in ('feather', 'zarr'):
                            raise ValueError(f'Invalid ldscore_save_format: {self.config.ldscore_save_format}')
                        if save_format == 'feather':
                            self.save_ldscore_to_feather(ldscore_chr_chunk,
                                                         column_names=mk_score_chunk.columns,
                                                         save_file_name=ld_score_file,
                                                         )
                        else:
                            # the chunk's first column offset doubles as the zarr column index
                            self.save_ldscore_chunk_to_zarr(ldscore_chr_chunk,
                                                            chrom=chrom,
                                                            start_col_index=i,
                                                            )

                        self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
                            mk_score_chunk,
                            M_file,
                            M_5_file,
                            drop_dummy_na=True,
                        )
         
     | 
| 
       494 
     | 
    
         
            -
             
     | 
| 
       495 
     | 
    
         
            -
                def calculate_ldscore_for_base_line(self, chrom, sample_name, save_dir):
                    """
                    Compute and save the baseline LD score and M files for one chromosome.

                    A two-column score table ('all_gene', 'base') is built with one row per
                    column of ``self.snp_gene_pair_dummy``. Every entry is 1 except the
                    'all_gene' value of the last row, which is 0.

                    :param chrom: chromosome label, used in the output file names.
                    :param sample_name: accepted for interface compatibility; not used here.
                    :param save_dir: directory under which ``baseline/`` files are written.
                    """
                    dummy_matrix = self.snp_gene_pair_dummy
                    score_values = np.ones((dummy_matrix.shape[1], 2))
                    # Last dummy column is presumably the "no gene" (NaN) bucket, so it is
                    # excluded from 'all_gene' -- TODO confirm against how the dummy is built.
                    score_values[-1, 0] = 0  # all_gene
                    baseline_mk_score_df = pd.DataFrame(
                        score_values,
                        index=dummy_matrix.columns,
                        columns=['all_gene', 'base'],
                    )

                    # Output locations for this chromosome.
                    ld_score_file = f'{save_dir}/baseline/baseline.{chrom}.l2.ldscore.feather'
                    M_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M'
                    M_5_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M_5_50'

                    # Baseline LD scores: keep the NaN dummy column (drop_dummy_na=False).
                    baseline_ldscore = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
                        baseline_mk_score_df,
                        drop_dummy_na=False,
                    )
                    self.save_ldscore_to_feather(
                        baseline_ldscore,
                        column_names=baseline_mk_score_df.columns,
                        save_file_name=ld_score_file,
                    )

                    # Baseline M / M_5_50 counts.
                    self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
                        baseline_mk_score_df,
                        M_file,
                        M_5_file,
                        drop_dummy_na=False,
                    )
         
     | 
| 
       521 
     | 
    
         
            -
             
     | 
| 
       522 
     | 
    
         
            -
                def get_snp_gene_dummy(self, chrom, ):
                    """
                    Build the SNP-by-gene dummy (one-hot) matrix for one chromosome.

                    The SNP-to-gene assignment depends on
                    ``self.config.gene_window_enhancer_priority``:

                    * ``'gene_window_first'``: use the gtf assignment and fill SNPs without
                      a gene from the enhancer assignment.
                    * ``'enhancer_first'``: use the enhancer assignment and fill SNPs
                      without a gene from the gtf assignment.
                    * ``None``: use the gtf assignment only.
                    * ``'enhancer_only'``: use the enhancer assignment only.

                    The SNP-gene table is also written to
                    ``<ldscore_save_dir>/SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather``.

                    :param chrom: chromosome label passed to ``load_bim`` and used in the
                        output file name.
                    :return: ``pd.get_dummies`` of the ``gene_name`` column with
                        ``dummy_na=True`` (so SNPs without a gene get their own column).
                    :raises ValueError: if the priority setting is not one of the values
                        listed above.
                    """
                    # Load the bim file
                    print("Loading bim data")
                    bim, bim_pr = load_bim(self.config.bfile_root, chrom)

                    priority = self.config.gene_window_enhancer_priority
                    if priority in ('gene_window_first', 'enhancer_first'):
                        gtf_pairs = self.get_SNP_gene_pair_from_gtf(bim, bim_pr, )
                        enhancer_pairs = self.get_SNP_gene_pair_from_enhancer(bim, bim_pr, )

                        # The preferred source wins; the other one only fills its gaps.
                        if priority == 'gene_window_first':
                            SNP_gene_pair, fallback = gtf_pairs, enhancer_pairs
                        else:
                            SNP_gene_pair, fallback = enhancer_pairs, gtf_pairs

                        unassigned = SNP_gene_pair.gene_name.isna()
                        SNP_gene_pair.loc[unassigned, 'gene_name'] = fallback.loc[unassigned, 'gene_name']

                    elif priority is None:  # use gtf only
                        SNP_gene_pair = self.get_SNP_gene_pair_from_gtf(bim, bim_pr, )

                    elif priority == 'enhancer_only':
                        SNP_gene_pair = self.get_SNP_gene_pair_from_enhancer(bim, bim_pr, )

                    else:
                        # Report the setting that is actually wrong instead of a message
                        # about gtf_pr / enhancer_pr.
                        raise ValueError(
                            f'Invalid gene_window_enhancer_priority: {priority!r}; expected one of '
                            "'gene_window_first', 'enhancer_first', 'enhancer_only' or None")

                    # save the SNP_gene_pair to feather
                    SNP_gene_pair_save_path = Path(
                        self.config.ldscore_save_dir) / f'SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather'
                    SNP_gene_pair_save_path.parent.mkdir(parents=True, exist_ok=True)
                    SNP_gene_pair.reset_index().to_feather(SNP_gene_pair_save_path)

                    # Get the dummy matrix
                    SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair['gene_name'], dummy_na=True)
                    return SNP_gene_pair_dummy
         
     | 
| 
       570 
     | 
    
         
            -
             
     | 
| 
       571 
     | 
    
         
            -
                def get_SNP_gene_pair_from_gtf(self, bim, bim_pr):
                    """
                    Assign each SNP in ``bim`` to a gene using the gtf gene windows.

                    ``Overlaps_gtf_bim`` supplies the SNP/gene_name pairs; they are
                    right-joined onto the bim annotation so every SNP in ``bim`` is kept,
                    with a missing ``gene_name`` where no pair was found.

                    :param bim: DataFrame with at least CHR, BP, SNP and CM columns.
                    :param bim_pr: the same SNPs in the form expected by ``Overlaps_gtf_bim``.
                    :return: DataFrame indexed by SNP with gene_name, CHR, BP and CM.
                    """
                    logger.info(
                        "Get SNP-gene pair from gtf, if a SNP is in multiple genes, it will be assigned to the most nearby gene (TSS)")
                    gene_hits = Overlaps_gtf_bim(self.gtf_pr, bim_pr)

                    snp_to_gene = gene_hits[['SNP', 'gene_name']].set_index('SNP')
                    snp_annotation = bim[["CHR", "BP", "SNP", "CM"]].set_index('SNP')
                    # Right join: the bim table decides which SNPs appear in the result.
                    return snp_to_gene.join(snp_annotation, how='right')
         
     | 
| 
       579 
     | 
    
         
            -
             
     | 
| 
       580 
     | 
    
         
            -
                def get_SNP_gene_pair_from_enhancer(self, bim, bim_pr, ):
                    """
                    Assign each SNP in ``bim`` to a gene using enhancer overlaps.

                    Overlaps come from ``self.enhancer_pr.join(bim_pr)``. When a SNP has
                    several overlaps, ``self.config.snp_multiple_enhancer_strategy`` picks
                    one row per SNP:

                    * ``'max_mkscore'``: the row with the largest ``avg_mkscore``.
                    * ``'nearest_TSS'``: the row with the smallest ``|Start_b - TSS|``.

                    Any other strategy value leaves the overlaps unfiltered.

                    :param bim: DataFrame with at least CHR, BP, SNP and CM columns.
                    :param bim_pr: the same SNPs as an object accepted by ``enhancer_pr.join``.
                    :return: DataFrame indexed by SNP with gene_name, CHR, BP and CM; SNPs
                        without an overlap have a missing ``gene_name``.
                    """
                    logger.info(
                        "Get SNP-gene pair from enhancer, if a SNP is in multiple genes, it will be assigned to the gene with highest marker score")
                    enhancer_hits = self.enhancer_pr.join(bim_pr).df
                    snp_annotation = bim[["CHR", "BP", "SNP", "CM"]]

                    strategy = self.config.snp_multiple_enhancer_strategy
                    if strategy == 'max_mkscore':
                        logger.debug('select the gene with highest marker score')
                        best_rows = enhancer_hits.groupby('SNP').avg_mkscore.idxmax()
                        enhancer_hits = enhancer_hits.loc[best_rows]
                    elif strategy == 'nearest_TSS':
                        logger.debug('select the gene with nearest TSS')
                        tss_offset = enhancer_hits['Start_b'] - enhancer_hits['TSS']
                        enhancer_hits['Distance'] = np.abs(tss_offset)
                        closest_rows = enhancer_hits.groupby('SNP').Distance.idxmin()
                        enhancer_hits = enhancer_hits.loc[closest_rows]

                    snp_to_gene = enhancer_hits[['SNP', 'gene_name']].set_index('SNP')
                    # Right join: the bim table decides which SNPs appear in the result.
                    return snp_to_gene.join(snp_annotation.set_index('SNP'), how='right')
         
     | 
| 
       598 
     | 
    
         
            -
             
     | 
| 
       599 
     | 
    
         
            -
             
     | 
| 
       600 
     | 
    
         
            -
            def run_generate_ldscore(config: GenerateLDScoreConfig):
                """
                Entry point for LD score generation.

                In ``quick_mode`` no LD scores are computed. Instead ``baseline`` and
                ``SNP_gene_pair`` symlinks are created inside ``config.ldscore_save_dir``,
                pointing at ``config.baseline_annotation_dir`` and
                ``config.SNP_gene_pair_dir``. The save directory is created if needed and
                an entry that already exists is left untouched, so the step can be re-run.

                Otherwise an ``S_LDSC_Boost`` is built from ``config`` and
                ``process_chromosome`` is called for chromosomes 1-22 when
                ``config.chrom == 'all'``, or for ``config.chrom`` alone.

                :param config: settings for this run.
                """
                if config.ldscore_save_format == 'quick_mode':
                    logger.info('Running in quick_mode. Skip the process of generating ldscore. Using the pre-calculated ldscore.')
                    # Accept both str and Path, and make sure the links have a parent.
                    ldscore_save_dir = Path(config.ldscore_save_dir)
                    ldscore_save_dir.mkdir(parents=True, exist_ok=True)

                    link_targets = (
                        ('baseline', config.baseline_annotation_dir),  # baseline annotation
                        ('SNP_gene_pair', config.SNP_gene_pair_dir),  # SNP-gene pairs
                    )
                    for link_name, target in link_targets:
                        link_path = ldscore_save_dir / link_name
                        # is_symlink() also catches a dangling link, which exists() misses.
                        if link_path.is_symlink() or link_path.exists():
                            logger.warning('%s already exists; keeping it instead of linking to %s',
                                           link_path, target)
                            continue
                        link_path.symlink_to(Path(target), target_is_directory=True)
                    return

                s_ldsc_boost = S_LDSC_Boost(config)
                if config.chrom == 'all':
                    for chrom in range(1, 23):
                        s_ldsc_boost.process_chromosome(chrom)
                else:
                    s_ldsc_boost.process_chromosome(config.chrom)
         
     | 
| 
      
 1 
     | 
    
         
            +
            import logging
         
     | 
| 
      
 2 
     | 
    
         
            +
            import warnings
         
     | 
| 
      
 3 
     | 
    
         
            +
            from pathlib import Path
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            import numpy as np
         
     | 
| 
      
 6 
     | 
    
         
            +
            import pandas as pd
         
     | 
| 
      
 7 
     | 
    
         
            +
            import pyranges as pr
         
     | 
| 
      
 8 
     | 
    
         
            +
            import zarr
         
     | 
| 
      
 9 
     | 
    
         
            +
            from scipy.sparse import csr_matrix
         
     | 
| 
      
 10 
     | 
    
         
            +
            from tqdm import trange
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            from gsMap.config import GenerateLDScoreConfig
         
     | 
| 
      
 13 
     | 
    
         
            +
            from gsMap.utils.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
            warnings.filterwarnings("ignore", category=FutureWarning)
         
     | 
| 
      
 16 
     | 
    
         
            +
            logger = logging.getLogger(__name__)
         
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
            # %%
         
     | 
| 
      
 20 
     | 
    
         
            +
            # load gtf
         
     | 
| 
      
 21 
     | 
    
         
            +
            def load_gtf(gtf_file, mk_score, window_size):
                """
                Load gene annotations and build a window around every gene.

                Only 'gene' features whose ``gene_name`` also appears in the index of
                ``mk_score`` are kept, with one row per gene name (first occurrence).
                Each gene interval is extended by ``window_size`` on both sides, and the
                start is clipped at 0.

                :param gtf_file: path of the gtf annotation file.
                :param mk_score: marker score table indexed by gene name.
                :param window_size: number of bases added on each side of a gene.
                :return: ``(gtf_pr, mk_score)``: the windows as a PyRanges with extra
                    TSS/TED columns, and ``mk_score`` restricted to the shared genes.
                """
                print("Loading gtf data")

                # Load GTF file and keep gene-level records only
                gene_records = pr.read_gtf(gtf_file, ).df
                gene_records = gene_records[gene_records['Feature'] == 'gene']

                # Restrict both tables to the genes they have in common
                shared_genes = np.intersect1d(mk_score.index, gene_records.gene_name)
                gene_records = gene_records[gene_records.gene_name.isin(shared_genes)]
                mk_score = mk_score[mk_score.index.isin(shared_genes)]

                # Remove duplicated lines
                gene_records = gene_records.drop_duplicates(subset='gene_name', keep="first")

                # Remember the gene bounds, then open the window around them
                gtf_bed = gene_records[['Chromosome', 'Start', 'End', 'gene_name', 'Strand']].copy()
                gtf_bed.loc[:, 'TSS'] = gtf_bed['Start']
                gtf_bed.loc[:, 'TED'] = gtf_bed['End']

                gtf_bed.loc[:, 'Start'] = gtf_bed['TSS'] - window_size
                gtf_bed.loc[:, 'End'] = gtf_bed['TED'] + window_size
                gtf_bed.loc[gtf_bed['Start'] < 0, 'Start'] = 0

                # On the negative strand the gene starts at its End coordinate: swap TSS/TED
                on_minus_strand = gtf_bed['Strand'] == '-'
                minus_low = gtf_bed.loc[on_minus_strand, 'TSS']
                minus_high = gtf_bed.loc[on_minus_strand, 'TED']
                gtf_bed.loc[on_minus_strand, 'TSS'] = minus_high
                gtf_bed.loc[on_minus_strand, 'TED'] = minus_low
                gtf_bed = gtf_bed.drop('Strand', axis=1)

                # Transform the GTF to PyRanges
                return pr.PyRanges(gtf_bed), mk_score
         
     | 
| 
      
 60 
     | 
    
         
            +
             
     | 
| 
      
 61 
     | 
    
         
            +
             
     | 
| 
      
 62 
     | 
    
         
            +
            # %%
         
     | 
| 
      
 63 
     | 
    
         
            +
            def load_marker_score(mk_score_file):
                """
                Read a marker score table from a feather file.

                The 'HUMAN_GENE_SYM' column becomes the index (renamed to 'gene_name')
                and all remaining columns are cast to float32.

                :param mk_score_file: path of the feather file.
                :return: float32 DataFrame indexed by gene name.
                """
                raw_scores = pd.read_feather(mk_score_file)
                scores_by_gene = raw_scores.set_index('HUMAN_GENE_SYM').rename_axis('gene_name')
                return scores_by_gene.astype(np.float32, copy=False)
         
     | 
| 
      
 70 
     | 
    
         
            +
             
     | 
| 
      
 71 
     | 
    
         
            +
             
     | 
| 
      
 72 
     | 
    
         
            +
            # %%
         
     | 
| 
      
 73 
     | 
    
         
            +
            # load mkscore get common gene
         
     | 
| 
      
 74 
     | 
    
         
            +
            # %%
         
     | 
| 
      
 75 
     | 
    
         
            +
            # load bim
         
     | 
| 
      
 76 
     | 
    
         
            +
            def load_bim(bfile_root, chrom):
                """
                Load the bim file of one chromosome.

                Reads ``{bfile_root}.{chrom}.bim`` as a headerless tab-separated table
                with columns CHR, SNP, CM, BP, A1, A2.

                :param bfile_root: path prefix of the per-chromosome files.
                :param chrom: chromosome label used in the file name.
                :return: ``(bim, bim_pr)``: the table as a DataFrame, and a PyRanges
                    where each SNP spans ``[BP - 1, BP)`` on chromosome ``chr{chrom}``.
                """
                bim = pd.read_csv(f'{bfile_root}.{chrom}.bim', sep='\t', header=None)
                bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]

                # Same data with interval-style column names for PyRanges
                snp_intervals = bim.copy()
                snp_intervals.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]

                position = snp_intervals['Start'].copy()
                snp_intervals['End'] = position
                snp_intervals['Start'] = snp_intervals['Start'] - 1  # Due to bim file is 1-based

                bim_pr = pr.PyRanges(snp_intervals)
                bim_pr.Chromosome = f'chr{chrom}'
                return bim, bim_pr
         
     | 
| 
      
 93 
     | 
    
         
            +
             
     | 
| 
      
 94 
     | 
    
         
            +
             
     | 
| 
      
 95 
     | 
    
         
            +
            # %%
         
     | 
| 
      
 96 
     | 
    
         
            +
            def Overlaps_gtf_bim(gtf_pr, bim_pr):
                """
                Pair every SNP with the overlapping gene whose TSS is closest.

                ``gtf_pr.join(bim_pr)`` gives one row per (gene window, SNP) overlap. A
                ``Distance`` column (``|Start_b - TSS|``) is added and, per SNP, only the
                row with the smallest distance is returned.

                :param gtf_pr: gene windows; its joined table must have a TSS column.
                :param bim_pr: SNP intervals; contributes the SNP and Start_b columns.
                :return: DataFrame with one row per SNP, including ``Distance``.
                """
                # Select the overlapped regions (SNPs in gene windows)
                snp_gene_hits = gtf_pr.join(bim_pr).df
                tss_offset = snp_gene_hits['Start_b'] - snp_gene_hits['TSS']
                snp_gene_hits['Distance'] = np.abs(tss_offset)

                # Row label of the nearest gene for each SNP
                nearest_rows = snp_gene_hits.groupby('SNP').Distance.idxmin()
                return snp_gene_hits.loc[nearest_rows]
         
     | 
| 
      
 107 
     | 
    
         
            +
             
     | 
| 
      
 108 
     | 
    
         
            +
             
     | 
| 
      
 109 
     | 
    
         
            +
            # %%
         
     | 
| 
      
 110 
     | 
    
         
            +
            def filter_snps_by_keep_snp(bim_df, keep_snp_file):
                """
                Keep only the rows of ``bim_df`` whose SNP is listed in ``keep_snp_file``.

                :param bim_df: DataFrame with a 'SNP' column.
                :param keep_snp_file: headerless file; its first column holds the SNP
                    ids to keep.
                :return: the matching rows of ``bim_df``, in their original order.
                """
                allowed_ids = pd.read_csv(keep_snp_file, header=None)[0].to_list()
                is_kept = bim_df['SNP'].isin(allowed_ids)
                return bim_df[is_kept]
         
     | 
| 
      
 115 
     | 
    
         
            +
             
     | 
| 
      
 116 
     | 
    
         
            +
             
     | 
| 
      
 117 
     | 
    
         
            +
            def get_snp_counts(config):
                """
                Count the SNPs on chromosomes 1-22.

                For each chromosome the bim file under ``config.bfile_root`` is loaded
                and, when ``config.keep_snp_root`` is set, restricted to the SNPs listed
                in ``{keep_snp_root}.{chrom}.snp``.

                :param config: object providing ``bfile_root`` and ``keep_snp_root``.
                :return: dict with one count per chromosome number (1-22), the overall
                    count under 'total', and under 'chrom_snp_start_point' a list of 23
                    cumulative offsets starting at 0.
                """
                autosomes = range(1, 23)
                snp_counts = {}

                for chrom in autosomes:
                    chrom_bim, _ = load_bim(config.bfile_root, chrom)
                    if config.keep_snp_root:
                        keep_snp_file = f'{config.keep_snp_root}.{chrom}.snp'
                        chrom_bim = filter_snps_by_keep_snp(chrom_bim, keep_snp_file)
                    snp_counts[chrom] = chrom_bim.shape[0]

                per_chrom_counts = [snp_counts[chrom] for chrom in autosomes]
                snp_counts['total'] = sum(per_chrom_counts)

                # Offset of each chromosome's first SNP in the concatenated SNP order
                cumulative_counts = np.array(per_chrom_counts).cumsum()
                snp_counts['chrom_snp_start_point'] = [0] + cumulative_counts.tolist()

                return snp_counts
         
     | 
| 
      
 140 
     | 
    
         
            +
             
     | 
| 
      
 141 
     | 
    
         
            +
             
     | 
| 
      
 142 
     | 
    
         
            +
            # %%
         
     | 
| 
      
 143 
     | 
    
         
            +
            def get_snp_pass_maf(bfile_root, chrom, maf_min=0.05):
                """
                Return the IDs of the SNPs on one chromosome whose MAF is above ``maf_min``.

                The PLINK fileset ``{bfile_root}.{chrom}`` (.bim / .fam / .bed) is opened with
                ``keep_snps``, ``keep_indivs`` and ``mafMin`` all set to None, and the rows of the
                SNP table where ``geno_array.maf > maf_min`` are selected.

                :param bfile_root: prefix of the per-chromosome PLINK files.
                :param chrom: chromosome label appended to ``bfile_root``.
                :param maf_min: exclusive lower bound applied to ``geno_array.maf``.
                :return: list with the ``SNP`` column values of the selected rows.
                """
                # Reader classes for the .bim (SNP) and .fam (individual) tables
                bim_reader = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
                fam_reader = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])

                prefix = f'{bfile_root}.{chrom}'

                # SNP table
                array_snps = bim_reader(f'{prefix}.bim')

                # Individual table; only its row count is handed to the .bed reader
                n_indiv = len(fam_reader(f'{prefix}.fam').IDList)

                geno_array = PlinkBEDFileWithR2Cache(f'{prefix}.bed', n_indiv, array_snps,
                                                     keep_snps=None, keep_indivs=None, mafMin=None)

                # Boolean mask over the SNP table
                passing = array_snps.IDList[geno_array.maf > maf_min]
                print(f'After filtering SNPs with MAF < {maf_min}, {len(passing)} SNPs remain.')
                return passing.SNP.to_list()
         
     | 
| 
      
 166 
     | 
    
         
            +
             
     | 
| 
      
 167 
     | 
    
         
            +
             
     | 
| 
      
 168 
     | 
    
         
            +
            def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit='CM'):
                """
                Compute LD scores of ``annot_matrix`` on one chromosome of a PLINK reference panel.

                :param bfile_root: prefix of the per-chromosome PLINK files (``{bfile_root}.{chrom}.bim`` etc.).
                :param chrom: chromosome label appended to ``bfile_root``.
                :param annot_matrix: annotation values forwarded as ``annot`` to ``ldScoreVarBlocks``.
                :param ld_wind: window size, interpreted according to ``ld_unit``.
                :param ld_unit: ``'SNP'`` (window counted in SNP positions), ``'KB'`` (``ld_wind * 1000`` on the
                    ``BP`` column) or ``'CM'`` (``ld_wind`` on the ``CM`` column).
                :return: ``pd.DataFrame`` wrapping the result of ``ldScoreVarBlocks``.
                :raises ValueError: if ``ld_unit`` is none of the three supported values.
                """
                bim_reader = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
                fam_reader = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])

                prefix = f'{bfile_root}.{chrom}'

                # SNP table
                snp_file = f'{prefix}.bim'
                array_snps = bim_reader(snp_file)
                n_snps = len(array_snps.IDList)
                print(f'Read list of {n_snps} SNPs from {snp_file}')

                # Individual table
                ind_file = f'{prefix}.fam'
                n_indiv = len(fam_reader(ind_file).IDList)
                print(f'Read list of {n_indiv} individuals from {ind_file}')

                geno_array = PlinkBEDFileWithR2Cache(f'{prefix}.bed', n_indiv, array_snps,
                                                     keep_snps=None, keep_indivs=None, mafMin=None)

                # Translate the window into coordinates plus a maximum distance on those coordinates
                if ld_unit == 'SNP':
                    max_dist = ld_wind
                    coords = np.array(range(geno_array.m))
                elif ld_unit in ('KB', 'CM'):
                    in_kb = ld_unit == 'KB'
                    max_dist = ld_wind * 1000 if in_kb else ld_wind
                    coord_column = 'BP' if in_kb else 'CM'
                    coords = np.array(array_snps.df[coord_column])[geno_array.kept_snps]
                else:
                    raise ValueError(f'Invalid ld_wind_unit: {ld_unit}')

                block_left = getBlockLefts(coords, max_dist)

                # LD scores; 100 is passed as the second positional argument of ldScoreVarBlocks
                return pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 100, annot=annot_matrix))
         
     | 
| 
      
 201 
     | 
    
         
            +
             
     | 
| 
      
 202 
     | 
    
         
            +
             
     | 
| 
      
 203 
     | 
    
         
            +
            # %%
         
     | 
| 
      
 204 
     | 
    
         
            +
            def calculate_ldscore_from_annotation(SNP_annotation_df, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
                """
                Compute LD scores for a single annotation table and label them like the input.

                The values of ``SNP_annotation_df`` are passed to ``get_ldscore``; the result is cast to
                float32 and given the index and columns of ``SNP_annotation_df``.

                :return: DataFrame of float32 LD scores with the input's index and columns.
                """
                ld_scores = get_ldscore(
                    bfile_root,
                    chrom,
                    SNP_annotation_df.values,
                    ld_wind=ld_wind,
                    ld_unit=ld_unit,
                )
                ld_scores = ld_scores.astype(np.float32, copy=False)

                # Restore the SNP / annotation labels that were lost by passing raw values
                ld_scores.index = SNP_annotation_df.index
                ld_scores.columns = SNP_annotation_df.columns
                return ld_scores
         
     | 
| 
      
 216 
     | 
    
         
            +
             
     | 
| 
      
 217 
     | 
    
         
            +
             
     | 
| 
      
 218 
     | 
    
         
            +
            def calculate_ldscore_from_multiple_annotation(SNP_annotation_df_list, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
                """
                Compute LD scores for several annotation tables with a single ``get_ldscore`` call.

                The tables are concatenated column-wise, scored together, and the result is cut
                back into one DataFrame per input table, in the same order.

                :param SNP_annotation_df_list: list of annotation DataFrames to score together.
                :return: list of LD-score DataFrames, one per entry of ``SNP_annotation_df_list``.
                """
                combined = pd.concat(SNP_annotation_df_list, axis=1).astype(np.float32, copy=False)

                ld_scores = get_ldscore(
                    bfile_root,
                    chrom,
                    combined.values,
                    ld_wind=ld_wind,
                    ld_unit=ld_unit,
                )
                ld_scores = ld_scores.astype(np.float32, copy=False)
                ld_scores.index = combined.index
                ld_scores.columns = combined.columns

                # Cut the combined result into consecutive column ranges, one per input table
                pieces = []
                left = 0
                for annotation_df in SNP_annotation_df_list:
                    right = left + len(annotation_df.columns)
                    pieces.append(ld_scores.iloc[:, left:right])
                    left = right
                return pieces
         
     | 
| 
      
 235 
     | 
    
         
            +
             
     | 
| 
      
 236 
     | 
    
         
            +
             
     | 
| 
      
 237 
     | 
    
         
            +
            # %%
         
     | 
| 
      
 238 
     | 
    
         
            +
            class S_LDSC_Boost:
         
     | 
| 
      
 239 
     | 
    
         
            +
                def __init__(self, config: GenerateLDScoreConfig):
                    """
                    Load the inputs named in ``config`` and, for zarr output, open the result store.

                    Sets ``mk_score``, ``gtf_pr``, ``mk_score_common`` and ``enhancer_pr`` (None when no
                    enhancer annotation file is configured). When ``config.ldscore_save_format`` is
                    ``'zarr'`` it also sets ``chrom_snp_start_point`` and ``zarr_file``.
                    """
                    self.config = config
                    self.mk_score = load_marker_score(config.mkscore_feather_path)

                    # GTF annotation plus the marker scores returned alongside it by load_gtf
                    self.gtf_pr, self.mk_score_common = load_gtf(
                        config.gtf_annotation_file,
                        self.mk_score,
                        window_size=config.gene_window_size,
                    )

                    # Optional enhancer annotation
                    if config.enhancer_annotation_file is None:
                        self.enhancer_pr = None
                    else:
                        enhancers = (
                            pr.read_bed(config.enhancer_annotation_file, as_df=True)
                            .set_index('Name')
                            .rename_axis('gene_name')
                        )

                        # Inner join keeps only enhancers whose gene has a row in mk_score_common
                        avg_mkscore = pd.DataFrame(self.mk_score_common.mean(axis=1), columns=['avg_mkscore'])
                        enhancers = enhancers.join(avg_mkscore, how='inner', on='gene_name')

                        # Attach the TSS column of the GTF table, aligned by gene name
                        tss_by_gene = self.gtf_pr.df.set_index('gene_name')
                        enhancers['TSS'] = tss_by_gene.reindex(enhancers.index)['TSS']

                        self.enhancer_pr = pr.PyRanges(enhancers.reset_index())

                    # Everything below is only needed for the zarr output format
                    if config.ldscore_save_format != 'zarr':
                        return

                    snp_counts = get_snp_counts(config)
                    self.chrom_snp_start_point = snp_counts['chrom_snp_start_point']

                    zarr_path = Path(config.ldscore_save_dir) / f'{config.sample_name}.ldscore.zarr'
                    store = zarr_path.as_posix()

                    # An existing store is reopened as is
                    if zarr_path.exists():
                        self.zarr_file = zarr.open(store, mode='a')
                        return

                    # New store: one row per SNP, one column per column of mk_score_common
                    self.zarr_file = zarr.open(
                        store,
                        mode='a',
                        dtype=np.float16,
                        chunks=config.zarr_chunk_size,
                        shape=(snp_counts['total'], self.mk_score_common.shape[1]),
                    )
                    zarr_path.mkdir(parents=True, exist_ok=True)
                    # Metadata stored next to the array
                    self.zarr_file.attrs['spot_names'] = self.mk_score_common.columns.to_list()
                    self.zarr_file.attrs['chrom_snp_start_point'] = self.chrom_snp_start_point
         
     | 
| 
      
 285 
     | 
    
         
            +
             
     | 
| 
      
 286 
     | 
    
         
            +
                def process_chromosome(self, chrom: int):
                    """
                    Run the LD-score computation for one chromosome.

                    Steps, in order: collect the SNPs passing the MAF filter, build the SNP-gene
                    dummy matrix, optionally restrict to the SNPs listed in
                    ``{config.keep_snp_root}.{chrom}.snp``, compute the SNP-gene weight matrix
                    (together with the additional baseline annotation when one is configured, in
                    which case its LD scores and M / M_5_50 files are written under
                    ``{config.ldscore_save_dir}/additional_baseline``), optionally save the weight
                    matrix as feather, then compute the baseline and the annotation LD scores.

                    Side effects: sets ``snp_pass_maf``, ``snp_gene_pair_dummy``, ``keep_snp_mask``,
                    ``snp_name`` and ``snp_gene_weight_matrix`` on ``self``.

                    :param chrom: chromosome label used in every per-chromosome file name.
                    """
                    # NOTE(review): the MAF threshold is hard-coded here rather than read from config
                    self.snp_pass_maf = get_snp_pass_maf(self.config.bfile_root, chrom, maf_min=0.05)

                    # Get SNP-Gene dummy pairs
                    self.snp_gene_pair_dummy = self.get_snp_gene_dummy(chrom, )

                    if self.config.keep_snp_root is not None:
                        # The keep file has no header; its first column holds the SNP names
                        keep_snp = pd.read_csv(f'{self.config.keep_snp_root}.{chrom}.snp', header=None)[0].to_list()
                        self.keep_snp_mask = self.snp_gene_pair_dummy.index.isin(keep_snp)
                        # names of the SNPs that are kept
                        self.snp_name = self.snp_gene_pair_dummy.index[self.keep_snp_mask].to_list()
                    else:
                        self.keep_snp_mask = None
                        self.snp_name = self.snp_gene_pair_dummy.index.to_list()

                    if self.config.additional_baseline_annotation is not None:
                        additional_baseline_annotation = Path(self.config.additional_baseline_annotation)
                        additional_baseline_annotation_file_path = additional_baseline_annotation / f'baseline.{chrom}.annot.gz'
                        # NOTE(review): assert is stripped under `python -O`; an explicit raise would be more robust
                        assert additional_baseline_annotation_file_path.exists(), f'additional_baseline_annotation_file_path not exists: {additional_baseline_annotation_file_path}'
                        additional_baseline_annotation_df = pd.read_csv(additional_baseline_annotation_file_path, sep='\t')
                        additional_baseline_annotation_df.set_index('SNP', inplace=True)

                        # drop the CHR, BP and CM columns if they exist
                        additional_baseline_annotation_df.drop(['CHR', 'BP', 'CM'], axis=1, inplace=True, errors='ignore')

                        # reindex to the dummy matrix's SNPs; SNPs missing from the additional annotation get 0
                        num_of_not_exist_snp = (~self.snp_gene_pair_dummy.index.isin(additional_baseline_annotation_df.index)).sum()
                        if num_of_not_exist_snp > 0:
                            logger.warning(
                                f'{num_of_not_exist_snp} SNPs not in additional_baseline_annotation_df but in the reference panel, so the additional baseline annotation of these SNP will set to 0')
                            additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
                                self.snp_gene_pair_dummy.index,
                                fill_value=0)
                        else:
                            # nothing is missing, so the reindex only reorders / selects rows
                            additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
                                self.snp_gene_pair_dummy.index)

                        # score both annotation tables in one call to save CPU time: r2 is calculated only once
                        self.snp_gene_weight_matrix, additional_baseline_annotation_ldscore = (
                            calculate_ldscore_from_multiple_annotation(
                                [self.snp_gene_pair_dummy, additional_baseline_annotation_df],
                                chrom,
                                self.config.bfile_root,
                                ld_wind=self.config.ld_wind,
                                ld_unit=self.config.ld_unit))

                        # restrict the additional baseline LD scores to the kept SNPs
                        additional_baseline_annotation_ldscore = additional_baseline_annotation_ldscore.loc[self.snp_name]

                        ld_score_file = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather'
                        M_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M'
                        M_5_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50'

                        # save additional baseline annotation ldscore
                        self.save_ldscore_to_feather(additional_baseline_annotation_ldscore.values,
                                                     column_names=additional_baseline_annotation_ldscore.columns,
                                                     save_file_name=ld_score_file,
                                                     )

                        # calculate M (column sums over all SNPs) and M_5_50 (column sums over the
                        # SNPs in self.snp_pass_maf), then save both as tab-delimited text
                        save_dir = Path(M_file_path).parent
                        save_dir.mkdir(parents=True, exist_ok=True)
                        M_chr_chunk = additional_baseline_annotation_df.values.sum(axis=0, keepdims=True)
                        M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(axis=0, keepdims=True)
                        np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
                        np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )

                    else:
                        # Calculate SNP-Gene weight matrix
                        self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(self.snp_gene_pair_dummy, chrom,
                                                                                        self.config.bfile_root,
                                                                                        ld_wind=self.config.ld_wind,
                                                                                        ld_unit=self.config.ld_unit)
                    # only keep the SNPs listed in keep_snp_root
                    if self.keep_snp_mask is not None:
                        self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]

                    if self.config.save_pre_calculate_snp_gene_weight_matrix:
                        snp_gene_weight_matrix_save_dir = Path(self.config.ldscore_save_dir) / 'snp_gene_weight_matrix'
                        snp_gene_weight_matrix_save_dir.mkdir(parents=True, exist_ok=True)
                        logger.info(f'Saving snp_gene_weight_matrix for chr{chrom}...')
                        # reset_index turns the SNP index into a regular column before writing
                        self.snp_gene_weight_matrix.reset_index().to_feather(
                            snp_gene_weight_matrix_save_dir / f'{chrom}.snp_gene_weight_matrix.feather')

                    # convert to sparse; from here on snp_gene_weight_matrix is a csr_matrix, not a DataFrame
                    self.snp_gene_weight_matrix = csr_matrix(self.snp_gene_weight_matrix)
                    logger.info(f'Compute snp_gene_weight_matrix finished. shape: {self.snp_gene_weight_matrix.shape}')

                    # calculate baseline ld score
                    logger.info(f'Calculating baseline ld score for chr{chrom}...')
                    self.calculate_ldscore_for_base_line(chrom, self.config.sample_name, self.config.ldscore_save_dir)

                    # calculate ld score for annotation
                    logger.info(f'Calculating ld score for annotation for chr{chrom}...')
                    self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
                        # the last column of the dummy matrix is left out of the marker-score lookup
                        # (presumably a placeholder column with no gene of its own; confirm in get_snp_gene_dummy)
                        self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]],
                        chrom,
                        self.config.sample_name,
                        self.config.ldscore_save_dir,
                    )
         
     | 
| 
      
 386 
     | 
    
         
            +
             
     | 
| 
      
 387 
     | 
    
         
            +
                def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(self,
                                                                          mk_score_chunk,
                                                                          drop_dummy_na=True,
                                                                          ):
                    """
                    Multiply the SNP-gene weight matrix with one chunk of marker scores.

                    :param mk_score_chunk: right-hand operand of the matrix product; its rows have to line
                        up with the weight-matrix columns that are kept.
                    :param drop_dummy_na: when True, the last column of ``self.snp_gene_weight_matrix`` is
                        excluded from the product (presumably the NaN dummy column -- confirm with the
                        code that builds the matrix).
                    :return: ``weights @ mk_score_chunk``.
                    """
                    weights = self.snp_gene_weight_matrix
                    if drop_dummy_na:
                        # Leave out the trailing column before multiplying.
                        weights = weights[:, :-1]
                    return weights @ mk_score_chunk
         
     | 
| 
      
 398 
     | 
    
         
            +
             
     | 
| 
      
 399 
     | 
    
         
            +
                def save_ldscore_to_feather(self, ldscore_chr_chunk: np.ndarray, column_names, save_file_name):
                    """
                    Write one chunk of LD scores to a feather file as float16.

                    The parent directory of ``save_file_name`` is created when it does not exist. Values
                    that overflow float16 are clamped to the largest finite float16 value, keeping their
                    sign.

                    :param ldscore_chr_chunk: 2-D array with one row per entry of ``self.snp_name`` and one
                        column per entry of ``column_names``. If it is already float16 it is modified in
                        place, because the cast is done with ``copy=False``.
                    :param column_names: column labels of the saved table.
                    :param save_file_name: path of the feather file to write.
                    """
                    save_dir = Path(save_file_name).parent
                    save_dir.mkdir(parents=True, exist_ok=True)

                    ldscore_chr_chunk = ldscore_chr_chunk.astype(np.float16, copy=False)
                    # Avoid overflow of float16: +inf becomes the float16 maximum and -inf its negative,
                    # so that a large negative value is not turned into a large positive one.
                    float16_max = np.finfo(np.float16).max
                    ldscore_chr_chunk[np.isposinf(ldscore_chr_chunk)] = float16_max
                    ldscore_chr_chunk[np.isneginf(ldscore_chr_chunk)] = -float16_max

                    # save for each chunk
                    df = pd.DataFrame(ldscore_chr_chunk,
                                      index=self.snp_name,
                                      columns=column_names,
                                      )
                    df.index.name = 'SNP'
                    # Feather does not store the index, so the SNP names are moved into a regular column.
                    df.reset_index().to_feather(save_file_name)
         
     | 
| 
      
 416 
     | 
    
         
            +
             
     | 
| 
      
 417 
     | 
    
         
            +
                def save_ldscore_chunk_to_zarr(self, ldscore_chr_chunk: np.ndarray,
                                               chrom: int, start_col_index,
                                               ):
                    """
                    Store one chunk of LD scores for one chromosome in ``self.zarr_file`` as float16.

                    The destination rows run from ``self.chrom_snp_start_point[chrom - 1]`` up to (not
                    including) ``self.chrom_snp_start_point[chrom]``; the destination columns start at
                    ``start_col_index`` and span the width of the chunk. Values that overflow float16 are
                    clamped to the largest finite float16 value, keeping their sign.

                    :param ldscore_chr_chunk: 2-D array of LD scores. If it is already float16 it is
                        modified in place, because the cast is done with ``copy=False``.
                    :param chrom: chromosome number; ``chrom - 1`` and ``chrom`` are used as positions in
                        ``self.chrom_snp_start_point``.
                    :param start_col_index: first destination column in ``self.zarr_file``.
                    """
                    ldscore_chr_chunk = ldscore_chr_chunk.astype(np.float16, copy=False)
                    # Avoid overflow of float16: +inf becomes the float16 maximum and -inf its negative,
                    # so that a large negative value is not turned into a large positive one.
                    float16_max = np.finfo(np.float16).max
                    ldscore_chr_chunk[np.isposinf(ldscore_chr_chunk)] = float16_max
                    ldscore_chr_chunk[np.isneginf(ldscore_chr_chunk)] = -float16_max

                    # save for each chunk
                    chrom_snp_start_point = self.chrom_snp_start_point[chrom - 1]
                    chrom_snp_end_point = self.chrom_snp_start_point[chrom]
                    end_col_index = start_col_index + ldscore_chr_chunk.shape[1]

                    self.zarr_file[chrom_snp_start_point:chrom_snp_end_point,
                                   start_col_index:end_col_index] = ldscore_chr_chunk
         
     | 
| 
      
 430 
     | 
    
         
            +
             
     | 
| 
      
 431 
     | 
    
         
            +
                def calculate_M_use_SNP_gene_pair_dummy_by_chunk(self,
                                                                 mk_score_chunk,
                                                                 M_file_path, M_5_file_path,
                                                                 drop_dummy_na=True,
                                                                 ):
                    """
                    Compute the M and M_5_50 rows for one chunk of marker scores and save them as text.

                    The columns of ``self.snp_gene_pair_dummy`` are summed over all SNPs (for M) and over
                    the SNPs selected by ``self.snp_pass_maf`` (for M_5_50); each row of sums is then
                    multiplied with ``mk_score_chunk`` and written tab-delimited.

                    :param mk_score_chunk: right-hand operand of the matrix products.
                    :param M_file_path: output path for M; its parent directory is created if missing.
                    :param M_5_file_path: output path for M_5_50.
                    :param drop_dummy_na: when True, the last dummy column is left out of both sums.
                    """
                    dummy = self.snp_gene_pair_dummy
                    per_gene_count = dummy.values.sum(axis=0, keepdims=True)
                    per_gene_count_pass_maf = dummy.loc[self.snp_pass_maf].values.sum(axis=0, keepdims=True)

                    if drop_dummy_na:
                        per_gene_count = per_gene_count[:, :-1]
                        per_gene_count_pass_maf = per_gene_count_pass_maf[:, :-1]

                    # Only the directory of the M file is created.
                    Path(M_file_path).parent.mkdir(parents=True, exist_ok=True)

                    # Both products are computed before anything is written.
                    outputs = (
                        (M_file_path, per_gene_count @ mk_score_chunk),
                        (M_5_file_path, per_gene_count_pass_maf @ mk_score_chunk),
                    )
                    for out_path, values in outputs:
                        np.savetxt(out_path, values, delimiter='\t')
         
     | 
| 
      
 453 
     | 
    
         
            +
             
     | 
| 
      
 454 
     | 
    
         
            +
                def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(self, mk_score_common, chrom, sample_name, save_dir):
                    """
                    Compute and store LD scores and M values for one chromosome, chunk by chunk.

                    The columns of ``mk_score_common`` are processed in consecutive slices of
                    ``self.config.spots_per_chunk`` columns. For every slice the LD scores are computed and
                    written either to a per-chunk feather file or into the zarr store, depending on
                    ``self.config.ldscore_save_format``. The M and M_5_50 files are written per chunk in
                    both cases.

                    :param mk_score_common: marker-score table; sliced by column position with ``iloc``.
                    :param chrom: chromosome identifier used in the file names and for the zarr row range.
                    :param sample_name: sample name used in the per-chunk directory and file names.
                    :param save_dir: root directory of the per-chunk output.
                    :raises ValueError: if ``self.config.ldscore_save_format`` is neither ``'feather'`` nor
                        ``'zarr'``.
                    """
                    spots_per_chunk = self.config.spots_per_chunk
                    chunk_starts = trange(0, mk_score_common.shape[1], spots_per_chunk,
                                          desc=f'Calculating LD score by chunk for chr{chrom}')

                    # Chunk directories are numbered from 1, hence start=1.
                    for chunk_index, col_start in enumerate(chunk_starts, start=1):
                        mk_score_chunk = mk_score_common.iloc[:, col_start:col_start + spots_per_chunk]

                        ld_score_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.ldscore.feather'
                        M_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M'
                        M_5_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50'

                        ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
                            mk_score_chunk,
                            drop_dummy_na=True,
                        )

                        save_format = self.config.ldscore_save_format
                        if save_format == 'feather':
                            self.save_ldscore_to_feather(
                                ldscore_chr_chunk,
                                column_names=mk_score_chunk.columns,
                                save_file_name=ld_score_file,
                            )
                        elif save_format == 'zarr':
                            # In the zarr store the chunk is addressed by its first column position.
                            self.save_ldscore_chunk_to_zarr(
                                ldscore_chr_chunk,
                                chrom=chrom,
                                start_col_index=col_start,
                            )
                        else:
                            raise ValueError(f'Invalid ldscore_save_format: {self.config.ldscore_save_format}')

                        self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
                            mk_score_chunk,
                            M_file,
                            M_5_file,
                            drop_dummy_na=True,
                        )
         
     | 
| 
      
 494 
     | 
    
         
            +
             
     | 
| 
      
 495 
     | 
    
         
            +
                def calculate_ldscore_for_base_line(self, chrom, sample_name, save_dir):
                    """
                    Compute and save the baseline LD scores and baseline M values for one chromosome.

                    Two baseline annotations are built over the columns of ``self.snp_gene_pair_dummy``:
                    ``'base'`` is 1 for every column, ``'all_gene'`` is 1 for every column except the last
                    one. The results are written below ``{save_dir}/baseline``.

                    :param chrom: chromosome identifier used in the output file names.
                    :param sample_name: not used by this method.
                    :param save_dir: root output directory.
                    """
                    gene_columns = self.snp_gene_pair_dummy.columns
                    n_columns = self.snp_gene_pair_dummy.shape[1]

                    all_gene = np.ones(n_columns)
                    all_gene[-1] = 0  # the last dummy column does not count towards 'all_gene'
                    base = np.ones(n_columns)
                    baseline_mk_score_df = pd.DataFrame(
                        np.column_stack([all_gene, base]),
                        index=gene_columns,
                        columns=['all_gene', 'base'],
                    )

                    ld_score_file = f'{save_dir}/baseline/baseline.{chrom}.l2.ldscore.feather'
                    M_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M'
                    M_5_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M_5_50'

                    # Baseline LD scores: the full weight matrix is used, the last column is kept.
                    baseline_ldscore = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
                        baseline_mk_score_df, drop_dummy_na=False)
                    self.save_ldscore_to_feather(
                        baseline_ldscore,
                        column_names=baseline_mk_score_df.columns,
                        save_file_name=ld_score_file,
                    )

                    # Baseline M / M_5_50, again with the last column kept.
                    self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
                        baseline_mk_score_df, M_file, M_5_file, drop_dummy_na=False)
         
     | 
| 
      
 521 
     | 
    
         
            +
             
     | 
| 
      
 522 
     | 
    
         
            +
                def get_snp_gene_dummy(self, chrom, ):
                    """
                    Build the dummy (one-hot) matrix of SNP-gene pairs for one chromosome.

                    ``self.config.gene_window_enhancer_priority`` selects the source of the pairs:

                    * ``None``: gene windows (gtf) only.
                    * ``'enhancer_only'``: enhancers only.
                    * ``'gene_window_first'``: gtf pairs, with SNPs that have no gene filled in from the
                      enhancer pairs.
                    * ``'enhancer_first'``: enhancer pairs, with SNPs that have no gene filled in from the
                      gtf pairs.

                    The resulting SNP-gene table is also written to
                    ``{ldscore_save_dir}/SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather``.

                    :param chrom: chromosome passed on to ``load_bim`` and used in the feather file name.
                    :return: ``pd.get_dummies`` of the ``gene_name`` column, including the NaN indicator
                        column (``dummy_na=True``).
                    :raises ValueError: if ``gene_window_enhancer_priority`` is not one of the values
                        listed above.
                    """
                    priority = self.config.gene_window_enhancer_priority
                    # Validate before any file is read so that a bad configuration fails immediately.
                    if priority not in (None, 'gene_window_first', 'enhancer_first', 'enhancer_only'):
                        raise ValueError(
                            f'Invalid self.config.gene_window_enhancer_priority: {priority}')

                    # Load the bim file
                    logger.info("Loading bim data")
                    bim, bim_pr = load_bim(self.config.bfile_root, chrom)

                    if priority is None:  # use gtf only
                        SNP_gene_pair = self.get_SNP_gene_pair_from_gtf(bim, bim_pr, )
                    elif priority == 'enhancer_only':
                        SNP_gene_pair = self.get_SNP_gene_pair_from_enhancer(bim, bim_pr, )
                    else:
                        SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(bim, bim_pr, )
                        SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(bim, bim_pr, )

                        if priority == 'gene_window_first':
                            SNP_gene_pair, fallback = SNP_gene_pair_gtf, SNP_gene_pair_enhancer
                        else:  # 'enhancer_first'
                            SNP_gene_pair, fallback = SNP_gene_pair_enhancer, SNP_gene_pair_gtf

                        # SNPs without a gene in the preferred source take the gene of the other source.
                        # NOTE(review): the boolean mask of one table is applied to the other, which
                        # assumes both tables share the same SNP index -- confirm.
                        mask_of_nan = SNP_gene_pair.gene_name.isna()
                        SNP_gene_pair.loc[mask_of_nan, 'gene_name'] = fallback.loc[mask_of_nan, 'gene_name']

                    # save the SNP_gene_pair to feather
                    SNP_gene_pair_save_path = Path(
                        self.config.ldscore_save_dir) / f'SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather'
                    SNP_gene_pair_save_path.parent.mkdir(parents=True, exist_ok=True)
                    SNP_gene_pair.reset_index().to_feather(SNP_gene_pair_save_path)

                    # Get the dummy matrix
                    SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair['gene_name'], dummy_na=True)
                    return SNP_gene_pair_dummy
         
     | 
| 
      
 570 
     | 
    
         
            +
             
     | 
| 
      
 571 
     | 
    
         
            +
                def get_SNP_gene_pair_from_gtf(self, bim, bim_pr):
                    """
                    Build the SNP-gene table from the overlap between the gtf ranges and the bim SNPs.

                    :param bim: bim table; its ``CHR``, ``BP``, ``SNP`` and ``CM`` columns are used.
                    :param bim_pr: range representation of the bim SNPs, passed to ``Overlaps_gtf_bim``.
                    :return: table indexed by ``SNP`` holding ``gene_name`` plus the bim annotation
                        columns. The right join keeps every bim SNP, so SNPs without an overlap get a
                        missing ``gene_name``.
                    """
                    logger.info(
                        "Get SNP-gene pair from gtf, if a SNP is in multiple genes, it will be assigned to the most nearby gene (TSS)")
                    overlaps_small = Overlaps_gtf_bim(self.gtf_pr, bim_pr)

                    # Get the SNP-gene pair
                    snp_to_gene = overlaps_small[['SNP', 'gene_name']].set_index('SNP')
                    snp_annotation = bim[["CHR", "BP", "SNP", "CM"]].set_index('SNP')
                    return snp_to_gene.join(snp_annotation, how='right')
         
     | 
| 
      
 579 
     | 
    
         
            +
             
     | 
| 
      
 580 
     | 
    
         
            +
                def get_SNP_gene_pair_from_enhancer(self, bim, bim_pr, ):
                    """
                    Build the SNP-gene table from the overlap between the enhancer ranges and the bim SNPs.

                    When a SNP overlaps several enhancer records, one of them is selected according to
                    ``self.config.snp_multiple_enhancer_strategy``:

                    * ``'max_mkscore'``: the record with the largest ``avg_mkscore``.
                    * ``'nearest_TSS'``: the record with the smallest ``|Start_b - TSS|``.

                    Any other value leaves the overlaps as they are; a warning is logged in that case.

                    :param bim: bim table; its ``CHR``, ``BP``, ``SNP`` and ``CM`` columns are used.
                    :param bim_pr: range representation of the bim SNPs, joined with ``self.enhancer_pr``.
                    :return: table indexed by ``SNP`` holding ``gene_name`` plus the bim annotation
                        columns. The right join keeps every bim SNP.
                    """
                    strategy = self.config.snp_multiple_enhancer_strategy
                    tie_break = {
                        'max_mkscore': 'the gene with highest marker score',
                        'nearest_TSS': 'the gene with nearest TSS',
                    }.get(strategy)
                    if tie_break is None:
                        # Previously this case was silent; the data handling is unchanged, only reported.
                        logger.warning(
                            f"Unknown snp_multiple_enhancer_strategy: {strategy}. "
                            "SNPs overlapping multiple enhancers are kept as multiple SNP-gene pairs.")
                    else:
                        # The message now names the strategy that is actually applied.
                        logger.info(
                            f"Get SNP-gene pair from enhancer, if a SNP is in multiple genes, it will be assigned to {tie_break}")

                    # Get the SNP-gene pair
                    overlaps_small = self.enhancer_pr.join(bim_pr).df
                    annot = bim[["CHR", "BP", "SNP", "CM"]]
                    if strategy == 'max_mkscore':
                        logger.debug('select the gene with highest marker score')
                        overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').avg_mkscore.idxmax()]

                    elif strategy == 'nearest_TSS':
                        logger.debug('select the gene with nearest TSS')
                        overlaps_small['Distance'] = np.abs(overlaps_small['Start_b'] - overlaps_small['TSS'])
                        overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').Distance.idxmin()]

                    SNP_gene_pair = overlaps_small[['SNP', 'gene_name']].set_index('SNP').join(annot.set_index('SNP'), how='right')

                    return SNP_gene_pair
         
     | 
| 
      
 598 
     | 
    
         
            +
             
     | 
| 
      
 599 
     | 
    
         
            +
             
     | 
| 
      
 600 
     | 
    
         
            +
            def run_generate_ldscore(config: GenerateLDScoreConfig):
                """
                Entry point of the generate-ldscore step.

                In ``quick_mode`` no LD score is computed: the pre-calculated baseline annotation and
                SNP-gene-pair directories are symlinked into ``config.ldscore_save_dir`` as ``baseline``
                and ``SNP_gene_pair``. Otherwise an ``S_LDSC_Boost`` is created and run for chromosomes
                1-22 (``config.chrom == 'all'``) or for the single chromosome ``config.chrom``.

                :param config: configuration of the step.
                """
                if config.ldscore_save_format == 'quick_mode':
                    logger.info('Running in quick_mode. Skip the process of generating ldscore. Using the pre-calculated ldscore.')
                    # The directory may be configured as a plain string; the `/` operator needs a Path.
                    ldscore_save_dir = Path(config.ldscore_save_dir)
                    # symlink_to() fails when the parent directory is missing.
                    ldscore_save_dir.mkdir(parents=True, exist_ok=True)

                    links = (
                        ('baseline', config.baseline_annotation_dir),  # link the baseline annotation
                        ('SNP_gene_pair', config.SNP_gene_pair_dir),  # link the SNP_gene_pair
                    )
                    for link_name, target_dir in links:
                        link_path = ldscore_save_dir / link_name
                        # A second run would otherwise stop with FileExistsError.
                        # is_symlink() also covers a dangling link, for which exists() is False.
                        if link_path.is_symlink() or link_path.exists():
                            logger.info(f'{link_path} already exists, skip linking it to {target_dir}.')
                            continue
                        link_path.symlink_to(Path(target_dir), target_is_directory=True)
                    return
                s_ldsc_boost = S_LDSC_Boost(config)
                if config.chrom == 'all':
                    for chrom in range(1, 23):
                        s_ldsc_boost.process_chromosome(chrom)
                else:
                    s_ldsc_boost.process_chromosome(config.chrom)
         
     |