gsMap 1.70__py3-none-any.whl → 1.71.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsMap/generate_ldscore.py CHANGED
@@ -1,618 +1,618 @@
1
- import logging
2
- import warnings
3
- from pathlib import Path
4
-
5
- import numpy as np
6
- import pandas as pd
7
- import pyranges as pr
8
- import zarr
9
- from scipy.sparse import csr_matrix
10
- from tqdm import trange
11
-
12
- from gsMap.config import GenerateLDScoreConfig
13
- from gsMap.utils.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
14
-
15
- warnings.filterwarnings("ignore", category=FutureWarning)
16
- logger = logging.getLogger(__name__)
17
-
18
-
19
- # %%
20
- # load gtf
21
- def load_gtf(gtf_file, mk_score, window_size):
22
- """
23
- Load the gene annotation file (gtf).
24
- """
25
- print("Loading gtf data")
26
- #
27
- # Load GTF file
28
- gtf = pr.read_gtf(gtf_file, )
29
- gtf = gtf.df
30
- #
31
- # Select the common genes
32
- gtf = gtf[gtf['Feature'] == 'gene']
33
- common_gene = np.intersect1d(mk_score.index, gtf.gene_name)
34
- #
35
- gtf = gtf[gtf.gene_name.isin(common_gene)]
36
- mk_score = mk_score[mk_score.index.isin(common_gene)]
37
- #
38
- # Remove duplicated lines
39
- gtf = gtf.drop_duplicates(subset='gene_name', keep="first")
40
- #
41
- # Process the GTF (open 100-KB window: Tss - Ted)
42
- gtf_bed = gtf[['Chromosome', 'Start', 'End', 'gene_name', 'Strand']].copy()
43
- gtf_bed.loc[:, 'TSS'] = gtf_bed['Start']
44
- gtf_bed.loc[:, 'TED'] = gtf_bed['End']
45
-
46
- gtf_bed.loc[:, 'Start'] = gtf_bed['TSS'] - window_size
47
- gtf_bed.loc[:, 'End'] = gtf_bed['TED'] + window_size
48
- gtf_bed.loc[gtf_bed['Start'] < 0, 'Start'] = 0
49
- #
50
- # Correct the negative strand
51
- tss_neg = gtf_bed.loc[gtf_bed['Strand'] == '-', 'TSS']
52
- ted_neg = gtf_bed.loc[gtf_bed['Strand'] == '-', 'TED']
53
- gtf_bed.loc[gtf_bed['Strand'] == '-', 'TSS'] = ted_neg
54
- gtf_bed.loc[gtf_bed['Strand'] == '-', 'TED'] = tss_neg
55
- gtf_bed = gtf_bed.drop('Strand', axis=1)
56
- #
57
- # Transform the GTF to PyRanges
58
- gtf_pr = pr.PyRanges(gtf_bed)
59
- return gtf_pr, mk_score
60
-
61
-
62
- # %%
63
- def load_marker_score(mk_score_file):
64
- """
65
- Load marker scores of each cell.
66
- """
67
- mk_score = pd.read_feather(mk_score_file).set_index('HUMAN_GENE_SYM').rename_axis('gene_name')
68
- mk_score = mk_score.astype(np.float32, copy=False)
69
- return mk_score
70
-
71
-
72
- # %%
73
- # load mkscore get common gene
74
- # %%
75
- # load bim
76
- def load_bim(bfile_root, chrom):
77
- """
78
- Load the bim file.
79
- """
80
- bim = pd.read_csv(f'{bfile_root}.{chrom}.bim', sep='\t', header=None)
81
- bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]
82
- #
83
- # Transform bim to PyRanges
84
- bim_pr = bim.copy()
85
- bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
86
-
87
- bim_pr['End'] = bim_pr['Start'].copy()
88
- bim_pr['Start'] = bim_pr['Start'] - 1 # Due to bim file is 1-based
89
-
90
- bim_pr = pr.PyRanges(bim_pr)
91
- bim_pr.Chromosome = f'chr{chrom}'
92
- return bim, bim_pr
93
-
94
-
95
- # %%
96
- def Overlaps_gtf_bim(gtf_pr, bim_pr):
97
- """
98
- Find overlaps between gtf and bim file.
99
- """
100
- # Select the overlapped regions (SNPs in gene windows)
101
- overlaps = gtf_pr.join(bim_pr)
102
- overlaps = overlaps.df
103
- overlaps['Distance'] = np.abs(overlaps['Start_b'] - overlaps['TSS'])
104
- overlaps_small = overlaps.copy()
105
- overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').Distance.idxmin()]
106
- return overlaps_small
107
-
108
-
109
- # %%
110
- def filter_snps_by_keep_snp(bim_df, keep_snp_file):
111
- # Load the keep_snp file and filter the BIM DataFrame
112
- keep_snp = pd.read_csv(keep_snp_file, header=None)[0].to_list()
113
- filtered_bim_df = bim_df[bim_df['SNP'].isin(keep_snp)]
114
- return filtered_bim_df
115
-
116
-
117
- def get_snp_counts(config):
118
- snp_counts = {}
119
- total_snp = 0
120
-
121
- for chrom in range(1, 23):
122
- bim_df, _ = load_bim(config.bfile_root, chrom)
123
-
124
- if config.keep_snp_root:
125
- keep_snp_file = f'{config.keep_snp_root}.{chrom}.snp'
126
- filtered_bim_df = filter_snps_by_keep_snp(bim_df, keep_snp_file)
127
- else:
128
- filtered_bim_df = bim_df
129
-
130
- snp_counts[chrom] = filtered_bim_df.shape[0]
131
- total_snp += snp_counts[chrom]
132
-
133
- snp_counts['total'] = total_snp
134
-
135
- chrom_snp_length_array = np.array([snp_counts[chrom] for chrom in range(1, 23)]).cumsum()
136
-
137
- snp_counts['chrom_snp_start_point'] = [0] + chrom_snp_length_array.tolist()
138
-
139
- return snp_counts
140
-
141
-
142
- # %%
143
- def get_snp_pass_maf(bfile_root, chrom, maf_min=0.05):
144
- """
145
- Get the dummy matrix of SNP-gene pairs.
146
- """
147
- # Load the bim file
148
- PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
149
- PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])
150
-
151
- bfile = f'{bfile_root}.{chrom}'
152
- snp_file, snp_obj = bfile + '.bim', PlinkBIMFile
153
- array_snps = snp_obj(snp_file)
154
- m = len(array_snps.IDList)
155
-
156
- # Load fam
157
- ind_file, ind_obj = bfile + '.fam', PlinkFAMFile
158
- array_indivs = ind_obj(ind_file)
159
- n = len(array_indivs.IDList)
160
- array_file, array_obj = bfile + '.bed', PlinkBEDFileWithR2Cache
161
- geno_array = array_obj(array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None)
162
- ii = geno_array.maf > maf_min
163
- snp_pass_maf = array_snps.IDList[ii]
164
- print(f'After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain.')
165
- return snp_pass_maf.SNP.to_list()
166
-
167
-
168
- def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit='CM'):
169
- PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
170
- PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])
171
-
172
- bfile = f'{bfile_root}.{chrom}'
173
- snp_file, snp_obj = bfile + '.bim', PlinkBIMFile
174
- array_snps = snp_obj(snp_file)
175
- m = len(array_snps.IDList)
176
- print(f'Read list of {m} SNPs from {snp_file}')
177
-
178
- # Load fam
179
- ind_file, ind_obj = bfile + '.fam', PlinkFAMFile
180
- array_indivs = ind_obj(ind_file)
181
- n = len(array_indivs.IDList)
182
- print(f'Read list of {n} individuals from {ind_file}')
183
- array_file, array_obj = bfile + '.bed', PlinkBEDFileWithR2Cache
184
- geno_array = array_obj(array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None)
185
- # Load the annotations of the baseline
186
- if ld_unit == 'SNP':
187
- max_dist = ld_wind
188
- coords = np.array(range(geno_array.m))
189
- elif ld_unit == 'KB':
190
- max_dist = ld_wind * 1000
191
- coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
192
- elif ld_unit == 'CM':
193
- max_dist = ld_wind
194
- coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]
195
- else:
196
- raise ValueError(f'Invalid ld_wind_unit: {ld_unit}')
197
- block_left = getBlockLefts(coords, max_dist)
198
- # Calculate the LD score
199
- lN_df = pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 100, annot=annot_matrix))
200
- return lN_df
201
-
202
-
203
- # %%
204
- def calculate_ldscore_from_annotation(SNP_annotation_df, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
205
- """
206
- Calculate the SNP-gene weight matrix.
207
- """
208
- # Get the dummy matrix
209
- # Get the SNP-gene weight matrix
210
- snp_gene_weight_matrix = get_ldscore(bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind,
211
- ld_unit=ld_unit)
212
- snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
213
- snp_gene_weight_matrix.index = SNP_annotation_df.index
214
- snp_gene_weight_matrix.columns = SNP_annotation_df.columns
215
- return snp_gene_weight_matrix
216
-
217
-
218
- def calculate_ldscore_from_multiple_annotation(SNP_annotation_df_list, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
219
- SNP_annotation_df = pd.concat(SNP_annotation_df_list, axis=1).astype(np.float32, copy=False)
220
-
221
- snp_gene_weight_matrix = get_ldscore(bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind,
222
- ld_unit=ld_unit)
223
- snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
224
- snp_gene_weight_matrix.index = SNP_annotation_df.index
225
- snp_gene_weight_matrix.columns = SNP_annotation_df.columns
226
-
227
- # split to each annotation
228
- snp_annotation_len_list = [len(df.columns) for df in SNP_annotation_df_list]
229
- snp_gene_weight_matrix_list = []
230
- start = 0
231
- for snp_annotation_len in snp_annotation_len_list:
232
- snp_gene_weight_matrix_list.append(snp_gene_weight_matrix.iloc[:, start:start + snp_annotation_len])
233
- start += snp_annotation_len
234
- return snp_gene_weight_matrix_list
235
-
236
-
237
- # %%
238
- class S_LDSC_Boost:
239
- def __init__(self, config: GenerateLDScoreConfig):
240
- self.config = config
241
-
242
- self.mk_score = load_marker_score(config.mkscore_feather_path)
243
-
244
- # Load GTF and get common markers
245
- self.gtf_pr, self.mk_score_common = load_gtf(config.gtf_annotation_file, self.mk_score,
246
- window_size=config.gene_window_size)
247
-
248
- # Load enhancer
249
- if config.enhancer_annotation_file is not None:
250
- enhancer_df = pr.read_bed(config.enhancer_annotation_file, as_df=True)
251
- enhancer_df.set_index('Name', inplace=True)
252
- enhancer_df.index.name = 'gene_name'
253
-
254
- # keep the common genes and add the enhancer score
255
- avg_mkscore = pd.DataFrame(self.mk_score_common.mean(axis=1), columns=['avg_mkscore'])
256
- enhancer_df = enhancer_df.join(avg_mkscore, how='inner', on='gene_name', )
257
-
258
- # add distance to TSS
259
- enhancer_df['TSS'] = self.gtf_pr.df.set_index('gene_name').reindex(enhancer_df.index)['TSS']
260
-
261
- # convert to pyranges
262
- self.enhancer_pr = pr.PyRanges(enhancer_df.reset_index())
263
-
264
- else:
265
- self.enhancer_pr = None
266
-
267
- # create tha zarr file
268
- if config.ldscore_save_format == 'zarr':
269
-
270
- chrom_snp_length_dict = get_snp_counts(config)
271
- self.chrom_snp_start_point = chrom_snp_length_dict['chrom_snp_start_point']
272
-
273
- zarr_path = Path(config.ldscore_save_dir) / f'{config.sample_name}.ldscore.zarr'
274
- if not zarr_path.exists():
275
- self.zarr_file = zarr.open(zarr_path.as_posix(), mode='a', dtype=np.float16,
276
- chunks=config.zarr_chunk_size,
277
- shape=(chrom_snp_length_dict['total'], self.mk_score_common.shape[1]))
278
- zarr_path.mkdir(parents=True, exist_ok=True)
279
- # save spot names
280
- self.zarr_file.attrs['spot_names'] = self.mk_score_common.columns.to_list()
281
- # save chrom_snp_length_dict
282
- self.zarr_file.attrs['chrom_snp_start_point'] = self.chrom_snp_start_point
283
- else:
284
- self.zarr_file = zarr.open(zarr_path.as_posix(), mode='a')
285
-
286
- def process_chromosome(self, chrom: int):
287
- self.snp_pass_maf = get_snp_pass_maf(self.config.bfile_root, chrom, maf_min=0.05)
288
-
289
- # Get SNP-Gene dummy pairs
290
- self.snp_gene_pair_dummy = self.get_snp_gene_dummy(chrom, )
291
-
292
- if self.config.keep_snp_root is not None:
293
- keep_snp = pd.read_csv(f'{self.config.keep_snp_root}.{chrom}.snp', header=None)[0].to_list()
294
- self.keep_snp_mask = self.snp_gene_pair_dummy.index.isin(keep_snp)
295
- # the SNP name of keeped
296
- self.snp_name = self.snp_gene_pair_dummy.index[self.keep_snp_mask].to_list()
297
- else:
298
- self.keep_snp_mask = None
299
- self.snp_name = self.snp_gene_pair_dummy.index.to_list()
300
-
301
- if self.config.additional_baseline_annotation is not None:
302
- additional_baseline_annotation = Path(self.config.additional_baseline_annotation)
303
- additional_baseline_annotation_file_path = additional_baseline_annotation / f'baseline.{chrom}.annot.gz'
304
- assert additional_baseline_annotation_file_path.exists(), f'additional_baseline_annotation_file_path not exists: {additional_baseline_annotation_file_path}'
305
- additional_baseline_annotation_df = pd.read_csv(additional_baseline_annotation_file_path, sep='\t')
306
- additional_baseline_annotation_df.set_index('SNP', inplace=True)
307
-
308
- # drop these columns if exists CHR BP CM]
309
- additional_baseline_annotation_df.drop(['CHR', 'BP', 'CM'], axis=1, inplace=True, errors='ignore')
310
-
311
- # reindex, for those SNPs not in additional_baseline_annotation_df, set to 0
312
- num_of_not_exist_snp = (~self.snp_gene_pair_dummy.index.isin(additional_baseline_annotation_df.index)).sum()
313
- if num_of_not_exist_snp > 0:
314
- logger.warning(
315
- f'{num_of_not_exist_snp} SNPs not in additional_baseline_annotation_df but in the reference panel, so the additional baseline annotation of these SNP will set to 0')
316
- additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
317
- self.snp_gene_pair_dummy.index,
318
- fill_value=0)
319
- else:
320
- additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
321
- self.snp_gene_pair_dummy.index)
322
-
323
- # do this for saving the cpu time, only calculate r2 once
324
- self.snp_gene_weight_matrix, additional_baseline_annotation_ldscore = (
325
- calculate_ldscore_from_multiple_annotation(
326
- [self.snp_gene_pair_dummy, additional_baseline_annotation_df],
327
- chrom,
328
- self.config.bfile_root,
329
- ld_wind=self.config.ld_wind,
330
- ld_unit=self.config.ld_unit))
331
-
332
- additional_baseline_annotation_ldscore = additional_baseline_annotation_ldscore.loc[self.snp_name]
333
- # print(additional_baseline_annotation_ldscore.index.to_list()==self.snp_name)
334
-
335
- ld_score_file = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather'
336
- M_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M'
337
- M_5_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50'
338
-
339
- # save additional baseline annotation ldscore
340
- self.save_ldscore_to_feather(additional_baseline_annotation_ldscore.values,
341
- column_names=additional_baseline_annotation_ldscore.columns,
342
- save_file_name=ld_score_file,
343
- )
344
-
345
- # caculate the M and save
346
- save_dir = Path(M_file_path).parent
347
- save_dir.mkdir(parents=True, exist_ok=True)
348
- M_chr_chunk = additional_baseline_annotation_df.values.sum(axis=0, keepdims=True)
349
- M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(axis=0, keepdims=True)
350
- np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
351
- np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )
352
-
353
- else:
354
- # Calculate SNP-Gene weight matrix
355
- self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(self.snp_gene_pair_dummy, chrom,
356
- self.config.bfile_root,
357
- ld_wind=self.config.ld_wind,
358
- ld_unit=self.config.ld_unit)
359
- # only keep the snp in keep_snp_root
360
- if self.keep_snp_mask is not None:
361
- self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]
362
-
363
- if self.config.save_pre_calculate_snp_gene_weight_matrix:
364
- snp_gene_weight_matrix_save_dir = Path(self.config.ldscore_save_dir) / 'snp_gene_weight_matrix'
365
- snp_gene_weight_matrix_save_dir.mkdir(parents=True, exist_ok=True)
366
- logger.info(f'Saving snp_gene_weight_matrix for chr{chrom}...')
367
- self.snp_gene_weight_matrix.reset_index().to_feather(
368
- snp_gene_weight_matrix_save_dir / f'{chrom}.snp_gene_weight_matrix.feather')
369
-
370
- # convert to sparse
371
- self.snp_gene_weight_matrix = csr_matrix(self.snp_gene_weight_matrix)
372
- logger.info(f'Compute snp_gene_weight_matrix finished. shape: {self.snp_gene_weight_matrix.shape}')
373
-
374
- # calculate baseline ld score
375
- logger.info(f'Calculating baseline ld score for chr{chrom}...')
376
- self.calculate_ldscore_for_base_line(chrom, self.config.sample_name, self.config.ldscore_save_dir)
377
-
378
- # calculate ld score for annotation
379
- logger.info(f'Calculating ld score for annotation for chr{chrom}...')
380
- self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
381
- self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]],
382
- chrom,
383
- self.config.sample_name,
384
- self.config.ldscore_save_dir,
385
- )
386
-
387
- def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(self,
388
- mk_score_chunk,
389
- drop_dummy_na=True,
390
- ):
391
-
392
- if drop_dummy_na:
393
- ldscore_chr_chunk = self.snp_gene_weight_matrix[:, :-1] @ mk_score_chunk
394
- else:
395
- ldscore_chr_chunk = self.snp_gene_weight_matrix @ mk_score_chunk
396
-
397
- return ldscore_chr_chunk
398
-
399
- def save_ldscore_to_feather(self, ldscore_chr_chunk: np.ndarray, column_names, save_file_name):
400
- save_dir = Path(save_file_name).parent
401
- save_dir.mkdir(parents=True, exist_ok=True)
402
-
403
- ldscore_chr_chunk = ldscore_chr_chunk.astype(np.float16, copy=False)
404
- # avoid overflow of float16, if inf, set to max of float16
405
- ldscore_chr_chunk[np.isinf(ldscore_chr_chunk)] = np.finfo(np.float16).max
406
- # ldscore_chr_chunk = ldscore_chr_chunk if self.config.keep_snp_root is None else ldscore_chr_chunk[
407
- # self.keep_snp_mask]
408
-
409
- # save for each chunk
410
- df = pd.DataFrame(ldscore_chr_chunk,
411
- index=self.snp_name,
412
- columns=column_names,
413
- )
414
- df.index.name = 'SNP'
415
- df.reset_index().to_feather(save_file_name)
416
-
417
- def save_ldscore_chunk_to_zarr(self, ldscore_chr_chunk: np.ndarray,
418
- chrom: int, start_col_index,
419
- ):
420
- ldscore_chr_chunk = ldscore_chr_chunk.astype(np.float16, copy=False)
421
- # avoid overflow of float16, if inf, set to max of float16
422
- ldscore_chr_chunk[np.isinf(ldscore_chr_chunk)] = np.finfo(np.float16).max
423
-
424
- # save for each chunk
425
- chrom_snp_start_point = self.chrom_snp_start_point[chrom - 1]
426
- chrom_snp_end_point = self.chrom_snp_start_point[chrom]
427
-
428
- self.zarr_file[chrom_snp_start_point:chrom_snp_end_point,
429
- start_col_index:start_col_index + ldscore_chr_chunk.shape[1]] = ldscore_chr_chunk
430
-
431
- def calculate_M_use_SNP_gene_pair_dummy_by_chunk(self,
432
- mk_score_chunk,
433
- M_file_path, M_5_file_path,
434
- drop_dummy_na=True,
435
- ):
436
- '''
437
- calculate M use SNP_gene_pair_dummy_sumed_along_snp_axis and mk_score_chunk
438
- '''
439
- SNP_gene_pair_dummy_sumed_along_snp_axis = self.snp_gene_pair_dummy.values.sum(axis=0, keepdims=True)
440
- SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf = self.snp_gene_pair_dummy.loc[self.snp_pass_maf].values.sum(
441
- axis=0,
442
- keepdims=True)
443
- if drop_dummy_na:
444
- SNP_gene_pair_dummy_sumed_along_snp_axis = SNP_gene_pair_dummy_sumed_along_snp_axis[:, :-1]
445
- SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf = SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf[:,
446
- :-1]
447
- save_dir = Path(M_file_path).parent
448
- save_dir.mkdir(parents=True, exist_ok=True)
449
- M_chr_chunk = SNP_gene_pair_dummy_sumed_along_snp_axis @ mk_score_chunk
450
- M_5_chr_chunk = SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf @ mk_score_chunk
451
- np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
452
- np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )
453
-
454
- def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(self, mk_score_common, chrom, sample_name, save_dir):
455
- """
456
- Calculate the LD score using the SNP-gene weight matrix.
457
- :param sample_name:
458
- """
459
- # Calculate the LD score
460
- chunk_index = 1
461
- for i in trange(0, mk_score_common.shape[1], self.config.spots_per_chunk,
462
- desc=f'Calculating LD score by chunk for chr{chrom}'):
463
- mk_score_chunk = mk_score_common.iloc[:, i:i + self.config.spots_per_chunk]
464
-
465
- ld_score_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.ldscore.feather'
466
- M_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M'
467
- M_5_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50'
468
-
469
- ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
470
- mk_score_chunk,
471
- drop_dummy_na=True,
472
- )
473
- if self.config.ldscore_save_format == 'feather':
474
- self.save_ldscore_to_feather(ldscore_chr_chunk,
475
- column_names=mk_score_chunk.columns,
476
- save_file_name=ld_score_file,
477
- )
478
- elif self.config.ldscore_save_format == 'zarr':
479
- self.save_ldscore_chunk_to_zarr(ldscore_chr_chunk,
480
- chrom=chrom,
481
- start_col_index=i,
482
- )
483
- else:
484
- raise ValueError(f'Invalid ldscore_save_format: {self.config.ldscore_save_format}')
485
-
486
- self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
487
- mk_score_chunk,
488
- M_file,
489
- M_5_file,
490
- drop_dummy_na=True,
491
- )
492
-
493
- chunk_index += 1
494
-
495
- def calculate_ldscore_for_base_line(self, chrom, sample_name, save_dir):
496
- # save baseline ld score
497
- baseline_mk_score = np.ones((self.snp_gene_pair_dummy.shape[1], 2))
498
- baseline_mk_score[-1, 0] = 0 # all_gene
499
- baseline_mk_score_df = pd.DataFrame(baseline_mk_score, index=self.snp_gene_pair_dummy.columns,
500
- columns=['all_gene', 'base'])
501
- ld_score_file = f'{save_dir}/baseline/baseline.{chrom}.l2.ldscore.feather'
502
- M_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M'
503
- M_5_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M_5_50'
504
-
505
- ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
506
- baseline_mk_score_df,
507
- drop_dummy_na=False,
508
- )
509
-
510
- self.save_ldscore_to_feather(ldscore_chr_chunk,
511
- column_names=baseline_mk_score_df.columns,
512
- save_file_name=ld_score_file,
513
- )
514
- # save baseline M
515
- self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
516
- baseline_mk_score_df,
517
- M_file,
518
- M_5_file,
519
- drop_dummy_na=False,
520
- )
521
-
522
- def get_snp_gene_dummy(self, chrom, ):
523
- """
524
- Get the dummy matrix of SNP-gene pairs.
525
- """
526
- # Load the bim file
527
- print("Loading bim data")
528
- bim, bim_pr = load_bim(self.config.bfile_root, chrom)
529
-
530
- if self.config.gene_window_enhancer_priority in ['gene_window_first', 'enhancer_first']:
531
-
532
- SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(bim, bim_pr, )
533
- SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(bim, bim_pr, )
534
- # total_SNP_gene_pair = SNP_gene_pair_gtf.join(SNP_gene_pair_enhancer, how='outer', lsuffix='_gtf', )
535
-
536
- mask_of_nan_gtf = SNP_gene_pair_gtf.gene_name.isna()
537
- mask_of_nan_enhancer = SNP_gene_pair_enhancer.gene_name.isna()
538
-
539
- if self.config.gene_window_enhancer_priority == 'gene_window_first':
540
- SNP_gene_pair = SNP_gene_pair_gtf
541
- SNP_gene_pair.loc[mask_of_nan_gtf, 'gene_name'] = SNP_gene_pair_enhancer.loc[
542
- mask_of_nan_gtf, 'gene_name']
543
- elif self.config.gene_window_enhancer_priority == 'enhancer_first':
544
- SNP_gene_pair = SNP_gene_pair_enhancer
545
- SNP_gene_pair.loc[mask_of_nan_enhancer, 'gene_name'] = SNP_gene_pair_gtf.loc[
546
- mask_of_nan_enhancer, 'gene_name']
547
- else:
548
- raise ValueError(
549
- f'Invalid self.config.gene_window_enhancer_priority: {self.config.gene_window_enhancer_priority}')
550
-
551
- elif self.config.gene_window_enhancer_priority is None: # use gtf only
552
- SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(bim, bim_pr, )
553
- SNP_gene_pair = SNP_gene_pair_gtf
554
-
555
- elif self.config.gene_window_enhancer_priority == 'enhancer_only':
556
- SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(bim, bim_pr, )
557
- SNP_gene_pair = SNP_gene_pair_enhancer
558
- else:
559
- raise ValueError('gtf_pr and enhancer_pr cannot be None at the same time')
560
-
561
- # save the SNP_gene_pair to feather
562
- SNP_gene_pair_save_path = Path(
563
- self.config.ldscore_save_dir) / f'SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather'
564
- SNP_gene_pair_save_path.parent.mkdir(parents=True, exist_ok=True)
565
- SNP_gene_pair.reset_index().to_feather(SNP_gene_pair_save_path)
566
-
567
- # Get the dummy matrix
568
- SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair['gene_name'], dummy_na=True)
569
- return SNP_gene_pair_dummy
570
-
571
- def get_SNP_gene_pair_from_gtf(self, bim, bim_pr):
572
- logger.info(
573
- "Get SNP-gene pair from gtf, if a SNP is in multiple genes, it will be assigned to the most nearby gene (TSS)")
574
- overlaps_small = Overlaps_gtf_bim(self.gtf_pr, bim_pr)
575
- # Get the SNP-gene pair
576
- annot = bim[["CHR", "BP", "SNP", "CM"]]
577
- SNP_gene_pair = overlaps_small[['SNP', 'gene_name']].set_index('SNP').join(annot.set_index('SNP'), how='right')
578
- return SNP_gene_pair
579
-
580
- def get_SNP_gene_pair_from_enhancer(self, bim, bim_pr, ):
581
- logger.info(
582
- "Get SNP-gene pair from enhancer, if a SNP is in multiple genes, it will be assigned to the gene with highest marker score")
583
- # Get the SNP-gene pair
584
- overlaps_small = self.enhancer_pr.join(bim_pr).df
585
- annot = bim[["CHR", "BP", "SNP", "CM"]]
586
- if self.config.snp_multiple_enhancer_strategy == 'max_mkscore':
587
- logger.debug('select the gene with highest marker score')
588
- overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').avg_mkscore.idxmax()]
589
-
590
- elif self.config.snp_multiple_enhancer_strategy == 'nearest_TSS':
591
- logger.debug('select the gene with nearest TSS')
592
- overlaps_small['Distance'] = np.abs(overlaps_small['Start_b'] - overlaps_small['TSS'])
593
- overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').Distance.idxmin()]
594
-
595
- SNP_gene_pair = overlaps_small[['SNP', 'gene_name']].set_index('SNP').join(annot.set_index('SNP'), how='right')
596
-
597
- return SNP_gene_pair
598
-
599
-
600
- def run_generate_ldscore(config: GenerateLDScoreConfig):
601
- if config.ldscore_save_format == 'quick_mode':
602
- logger.info('Running in quick_mode. Skip the process of generating ldscore. Using the pre-calculated ldscore.')
603
- ldscore_save_dir = config.ldscore_save_dir
604
-
605
- # link the baseline annotation
606
- baseline_annotation_dir = Path(config.baseline_annotation_dir)
607
- (ldscore_save_dir / 'baseline').symlink_to(baseline_annotation_dir, target_is_directory=True)
608
-
609
- # link the SNP_gene_pair
610
- SNP_gene_pair_dir = Path(config.SNP_gene_pair_dir)
611
- (ldscore_save_dir / 'SNP_gene_pair').symlink_to(SNP_gene_pair_dir, target_is_directory=True)
612
- return
613
- s_ldsc_boost = S_LDSC_Boost(config)
614
- if config.chrom == 'all':
615
- for chrom in range(1, 23):
616
- s_ldsc_boost.process_chromosome(chrom)
617
- else:
618
- s_ldsc_boost.process_chromosome(config.chrom)
1
+ import logging
2
+ import warnings
3
+ from pathlib import Path
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import pyranges as pr
8
+ import zarr
9
+ from scipy.sparse import csr_matrix
10
+ from tqdm import trange
11
+
12
+ from gsMap.config import GenerateLDScoreConfig
13
+ from gsMap.utils.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
14
+
15
+ warnings.filterwarnings("ignore", category=FutureWarning)
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ # %%
20
+ # load gtf
21
+ def load_gtf(gtf_file, mk_score, window_size):
22
+ """
23
+ Load the gene annotation file (gtf).
24
+ """
25
+ print("Loading gtf data")
26
+ #
27
+ # Load GTF file
28
+ gtf = pr.read_gtf(gtf_file, )
29
+ gtf = gtf.df
30
+ #
31
+ # Select the common genes
32
+ gtf = gtf[gtf['Feature'] == 'gene']
33
+ common_gene = np.intersect1d(mk_score.index, gtf.gene_name)
34
+ #
35
+ gtf = gtf[gtf.gene_name.isin(common_gene)]
36
+ mk_score = mk_score[mk_score.index.isin(common_gene)]
37
+ #
38
+ # Remove duplicated lines
39
+ gtf = gtf.drop_duplicates(subset='gene_name', keep="first")
40
+ #
41
+ # Process the GTF (open 100-KB window: Tss - Ted)
42
+ gtf_bed = gtf[['Chromosome', 'Start', 'End', 'gene_name', 'Strand']].copy()
43
+ gtf_bed.loc[:, 'TSS'] = gtf_bed['Start']
44
+ gtf_bed.loc[:, 'TED'] = gtf_bed['End']
45
+
46
+ gtf_bed.loc[:, 'Start'] = gtf_bed['TSS'] - window_size
47
+ gtf_bed.loc[:, 'End'] = gtf_bed['TED'] + window_size
48
+ gtf_bed.loc[gtf_bed['Start'] < 0, 'Start'] = 0
49
+ #
50
+ # Correct the negative strand
51
+ tss_neg = gtf_bed.loc[gtf_bed['Strand'] == '-', 'TSS']
52
+ ted_neg = gtf_bed.loc[gtf_bed['Strand'] == '-', 'TED']
53
+ gtf_bed.loc[gtf_bed['Strand'] == '-', 'TSS'] = ted_neg
54
+ gtf_bed.loc[gtf_bed['Strand'] == '-', 'TED'] = tss_neg
55
+ gtf_bed = gtf_bed.drop('Strand', axis=1)
56
+ #
57
+ # Transform the GTF to PyRanges
58
+ gtf_pr = pr.PyRanges(gtf_bed)
59
+ return gtf_pr, mk_score
60
+
61
+
62
+ # %%
63
+ def load_marker_score(mk_score_file):
64
+ """
65
+ Load marker scores of each cell.
66
+ """
67
+ mk_score = pd.read_feather(mk_score_file).set_index('HUMAN_GENE_SYM').rename_axis('gene_name')
68
+ mk_score = mk_score.astype(np.float32, copy=False)
69
+ return mk_score
70
+
71
+
72
+ # %%
73
+ # load mkscore get common gene
74
+ # %%
75
+ # load bim
76
+ def load_bim(bfile_root, chrom):
77
+ """
78
+ Load the bim file.
79
+ """
80
+ bim = pd.read_csv(f'{bfile_root}.{chrom}.bim', sep='\t', header=None)
81
+ bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]
82
+ #
83
+ # Transform bim to PyRanges
84
+ bim_pr = bim.copy()
85
+ bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
86
+
87
+ bim_pr['End'] = bim_pr['Start'].copy()
88
+ bim_pr['Start'] = bim_pr['Start'] - 1 # Due to bim file is 1-based
89
+
90
+ bim_pr = pr.PyRanges(bim_pr)
91
+ bim_pr.Chromosome = f'chr{chrom}'
92
+ return bim, bim_pr
93
+
94
+
95
+ # %%
96
+ def Overlaps_gtf_bim(gtf_pr, bim_pr):
97
+ """
98
+ Find overlaps between gtf and bim file.
99
+ """
100
+ # Select the overlapped regions (SNPs in gene windows)
101
+ overlaps = gtf_pr.join(bim_pr)
102
+ overlaps = overlaps.df
103
+ overlaps['Distance'] = np.abs(overlaps['Start_b'] - overlaps['TSS'])
104
+ overlaps_small = overlaps.copy()
105
+ overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').Distance.idxmin()]
106
+ return overlaps_small
107
+
108
+
109
+ # %%
110
+ def filter_snps_by_keep_snp(bim_df, keep_snp_file):
111
+ # Load the keep_snp file and filter the BIM DataFrame
112
+ keep_snp = pd.read_csv(keep_snp_file, header=None)[0].to_list()
113
+ filtered_bim_df = bim_df[bim_df['SNP'].isin(keep_snp)]
114
+ return filtered_bim_df
115
+
116
+
117
+ def get_snp_counts(config):
118
+ snp_counts = {}
119
+ total_snp = 0
120
+
121
+ for chrom in range(1, 23):
122
+ bim_df, _ = load_bim(config.bfile_root, chrom)
123
+
124
+ if config.keep_snp_root:
125
+ keep_snp_file = f'{config.keep_snp_root}.{chrom}.snp'
126
+ filtered_bim_df = filter_snps_by_keep_snp(bim_df, keep_snp_file)
127
+ else:
128
+ filtered_bim_df = bim_df
129
+
130
+ snp_counts[chrom] = filtered_bim_df.shape[0]
131
+ total_snp += snp_counts[chrom]
132
+
133
+ snp_counts['total'] = total_snp
134
+
135
+ chrom_snp_length_array = np.array([snp_counts[chrom] for chrom in range(1, 23)]).cumsum()
136
+
137
+ snp_counts['chrom_snp_start_point'] = [0] + chrom_snp_length_array.tolist()
138
+
139
+ return snp_counts
140
+
141
+
142
+ # %%
143
def get_snp_pass_maf(bfile_root, chrom, maf_min=0.05):
    """
    List the SNP ids of one chromosome whose minor allele frequency exceeds ``maf_min``.

    :param bfile_root: PLINK prefix; ``{bfile_root}.{chrom}.bim``, ``.fam`` and
        ``.bed`` are read.
    :param chrom: chromosome label inserted into the file names.
    :param maf_min: exclusive lower bound applied to ``geno_array.maf``.
    :return: list with the ``SNP`` values of the variants that pass the filter.
    """
    PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
    PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])

    bfile = f'{bfile_root}.{chrom}'

    # Load bim
    array_snps = PlinkBIMFile(bfile + '.bim')

    # Load fam
    array_indivs = PlinkFAMFile(bfile + '.fam')
    n = len(array_indivs.IDList)

    geno_array = PlinkBEDFileWithR2Cache(bfile + '.bed', n, array_snps,
                                         keep_snps=None, keep_indivs=None, mafMin=None)

    # NOTE(review): assumes geno_array.maf is aligned row-for-row with
    # array_snps.IDList (i.e. no SNP was dropped while loading) - confirm.
    ii = geno_array.maf > maf_min
    snp_pass_maf = array_snps.IDList[ii]
    print(f'After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain.')
    return snp_pass_maf.SNP.to_list()
166
+
167
+
168
def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit='CM'):
    """
    Compute LD scores of an annotation matrix on one chromosome of a PLINK panel.

    :param bfile_root: PLINK prefix; ``{bfile_root}.{chrom}.bim``, ``.fam`` and
        ``.bed`` are read.
    :param chrom: chromosome label inserted into the file names.
    :param annot_matrix: annotation values forwarded as ``annot`` to
        ``ldScoreVarBlocks``.
    :param ld_wind: window size expressed in ``ld_unit``.
    :param ld_unit: ``'SNP'`` (window over SNP positions in the kept set),
        ``'KB'`` (``BP`` column, window multiplied by 1000) or ``'CM'``
        (``CM`` column).
    :return: DataFrame wrapping the result of ``ldScoreVarBlocks``.
    :raises ValueError: if ``ld_unit`` is none of the three supported values.
    """
    bim_reader = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
    fam_reader = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])

    prefix = f'{bfile_root}.{chrom}'
    bim_path = prefix + '.bim'
    fam_path = prefix + '.fam'
    bed_path = prefix + '.bed'

    # Load bim
    array_snps = bim_reader(bim_path)
    print(f'Read list of {len(array_snps.IDList)} SNPs from {bim_path}')

    # Load fam
    array_indivs = fam_reader(fam_path)
    n_indiv = len(array_indivs.IDList)
    print(f'Read list of {n_indiv} individuals from {fam_path}')

    geno_array = PlinkBEDFileWithR2Cache(bed_path, n_indiv, array_snps,
                                         keep_snps=None, keep_indivs=None, mafMin=None)

    # Translate the requested window into per-SNP coordinates and a maximum distance.
    if ld_unit == 'SNP':
        coords = np.array(range(geno_array.m))
        max_dist = ld_wind
    elif ld_unit == 'KB':
        coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
        max_dist = ld_wind * 1000
    elif ld_unit == 'CM':
        coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]
        max_dist = ld_wind
    else:
        raise ValueError(f'Invalid ld_wind_unit: {ld_unit}')

    block_left = getBlockLefts(coords, max_dist)

    # Calculate the LD score
    ld_scores = geno_array.ldScoreVarBlocks(block_left, 100, annot=annot_matrix)
    return pd.DataFrame(ld_scores)
201
+
202
+
203
+ # %%
204
def calculate_ldscore_from_annotation(SNP_annotation_df, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
    """
    Run ``get_ldscore`` on one annotation table and relabel the result.

    :param SNP_annotation_df: annotation DataFrame; its values are passed to
        ``get_ldscore`` and its index/columns are copied onto the result.
    :param chrom: chromosome label forwarded to ``get_ldscore``.
    :param bfile_root: PLINK prefix forwarded to ``get_ldscore``.
    :param ld_wind: LD window size forwarded to ``get_ldscore``.
    :param ld_unit: LD window unit forwarded to ``get_ldscore``.
    :return: float32 DataFrame carrying the index and columns of
        ``SNP_annotation_df``.
    """
    ldscore_df = get_ldscore(
        bfile_root,
        chrom,
        SNP_annotation_df.values,
        ld_wind=ld_wind,
        ld_unit=ld_unit,
    )
    ldscore_df = ldscore_df.astype(np.float32, copy=False)

    # get_ldscore returns positional labels; restore those of the annotation table.
    ldscore_df.columns = SNP_annotation_df.columns
    ldscore_df.index = SNP_annotation_df.index
    return ldscore_df
216
+
217
+
218
def calculate_ldscore_from_multiple_annotation(SNP_annotation_df_list, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
    """
    Run ``get_ldscore`` once for several annotation tables and split the result.

    The tables are concatenated column-wise, scored in a single ``get_ldscore``
    call, and the scored columns are then cut back into one DataFrame per input
    table.

    :param SNP_annotation_df_list: list of annotation DataFrames to concatenate
        along ``axis=1``.
    :param chrom: chromosome label forwarded to ``get_ldscore``.
    :param bfile_root: PLINK prefix forwarded to ``get_ldscore``.
    :param ld_wind: LD window size forwarded to ``get_ldscore``.
    :param ld_unit: LD window unit forwarded to ``get_ldscore``.
    :return: list of float32 DataFrames, one per input table and in the same
        order, each holding as many columns as its input table.
    """
    stacked_annotation = pd.concat(SNP_annotation_df_list, axis=1).astype(np.float32, copy=False)

    ldscore_df = get_ldscore(
        bfile_root,
        chrom,
        stacked_annotation.values,
        ld_wind=ld_wind,
        ld_unit=ld_unit,
    )
    ldscore_df = ldscore_df.astype(np.float32, copy=False)
    ldscore_df.columns = stacked_annotation.columns
    ldscore_df.index = stacked_annotation.index

    # split to each annotation: consecutive column ranges, one per input table
    widths = [len(df.columns) for df in SNP_annotation_df_list]
    edges = np.cumsum([0] + widths).tolist()
    return [ldscore_df.iloc[:, left:right] for left, right in zip(edges[:-1], edges[1:])]
235
+
236
+
237
+ # %%
238
class S_LDSC_Boost:
    """
    Per-chromosome generator of LD-score files from gene marker scores.

    The constructor loads the marker scores, the gene annotation and (optionally)
    an enhancer annotation. ``process_chromosome`` then builds a SNP-by-gene
    dummy matrix, turns it into a SNP-gene weight matrix through the LD-score
    helpers of this module, and writes baseline and per-chunk LD scores together
    with their ``.M`` / ``.M_5_50`` files.
    """

    def __init__(self, config: 'GenerateLDScoreConfig'):
        """
        Load all chromosome-independent inputs.

        :param config: run configuration. When ``ldscore_save_format`` is
            ``'zarr'`` the output array is created (or reopened) under
            ``ldscore_save_dir``.
        """
        self.config = config

        self.mk_score = load_marker_score(config.mkscore_feather_path)

        # Load GTF and get common markers
        self.gtf_pr, self.mk_score_common = load_gtf(config.gtf_annotation_file, self.mk_score,
                                                     window_size=config.gene_window_size)

        # Load enhancer
        if config.enhancer_annotation_file is not None:
            enhancer_df = pr.read_bed(config.enhancer_annotation_file, as_df=True)
            enhancer_df.set_index('Name', inplace=True)
            enhancer_df.index.name = 'gene_name'

            # keep the common genes and add the enhancer score
            avg_mkscore = pd.DataFrame(self.mk_score_common.mean(axis=1), columns=['avg_mkscore'])
            enhancer_df = enhancer_df.join(avg_mkscore, how='inner', on='gene_name', )

            # add distance to TSS
            enhancer_df['TSS'] = self.gtf_pr.df.set_index('gene_name').reindex(enhancer_df.index)['TSS']

            # convert to pyranges
            self.enhancer_pr = pr.PyRanges(enhancer_df.reset_index())

        else:
            self.enhancer_pr = None

        # create the zarr file
        if config.ldscore_save_format == 'zarr':

            chrom_snp_length_dict = get_snp_counts(config)
            self.chrom_snp_start_point = chrom_snp_length_dict['chrom_snp_start_point']

            zarr_path = Path(config.ldscore_save_dir) / f'{config.sample_name}.ldscore.zarr'
            if not zarr_path.exists():
                self.zarr_file = zarr.open(zarr_path.as_posix(), mode='a', dtype=np.float16,
                                           chunks=config.zarr_chunk_size,
                                           shape=(chrom_snp_length_dict['total'], self.mk_score_common.shape[1]))
                zarr_path.mkdir(parents=True, exist_ok=True)
                # save spot names
                self.zarr_file.attrs['spot_names'] = self.mk_score_common.columns.to_list()
                # save chrom_snp_length_dict
                self.zarr_file.attrs['chrom_snp_start_point'] = self.chrom_snp_start_point
            else:
                self.zarr_file = zarr.open(zarr_path.as_posix(), mode='a')

    @staticmethod
    def _saturate_to_float16(values):
        """
        Cast ``values`` to float16 without producing infinities.

        Magnitudes beyond the float16 range are clamped to the largest finite
        float16 of the same sign; NaN is preserved. A new array is returned, so
        the caller's data is never modified.
        """
        limits = np.finfo(np.float16)
        clamped = np.clip(np.asarray(values), float(limits.min), float(limits.max))
        return clamped.astype(np.float16)

    def process_chromosome(self, chrom: int):
        """
        Generate and save every LD-score output of one chromosome.

        :param chrom: chromosome number; it selects the PLINK, keep-SNP and
            additional-baseline files and names the output files.
        :raises FileNotFoundError: if ``additional_baseline_annotation`` is
            configured but ``baseline.{chrom}.annot.gz`` is missing.
        """
        self.snp_pass_maf = get_snp_pass_maf(self.config.bfile_root, chrom, maf_min=0.05)

        # Get SNP-Gene dummy pairs
        self.snp_gene_pair_dummy = self.get_snp_gene_dummy(chrom, )

        if self.config.keep_snp_root is not None:
            keep_snp = pd.read_csv(f'{self.config.keep_snp_root}.{chrom}.snp', header=None)[0].to_list()
            self.keep_snp_mask = self.snp_gene_pair_dummy.index.isin(keep_snp)
            # the names of the kept SNPs
            self.snp_name = self.snp_gene_pair_dummy.index[self.keep_snp_mask].to_list()
        else:
            self.keep_snp_mask = None
            self.snp_name = self.snp_gene_pair_dummy.index.to_list()

        if self.config.additional_baseline_annotation is not None:
            additional_baseline_annotation = Path(self.config.additional_baseline_annotation)
            additional_baseline_annotation_file_path = additional_baseline_annotation / f'baseline.{chrom}.annot.gz'
            # Explicit raise instead of assert: asserts vanish under ``python -O``.
            if not additional_baseline_annotation_file_path.exists():
                raise FileNotFoundError(
                    f'additional_baseline_annotation_file_path not exists: {additional_baseline_annotation_file_path}')
            additional_baseline_annotation_df = pd.read_csv(additional_baseline_annotation_file_path, sep='\t')
            additional_baseline_annotation_df.set_index('SNP', inplace=True)

            # drop the columns CHR, BP and CM if they exist
            additional_baseline_annotation_df.drop(['CHR', 'BP', 'CM'], axis=1, inplace=True, errors='ignore')

            # reindex, for those SNPs not in additional_baseline_annotation_df, set to 0
            num_of_not_exist_snp = (~self.snp_gene_pair_dummy.index.isin(additional_baseline_annotation_df.index)).sum()
            if num_of_not_exist_snp > 0:
                logger.warning(
                    f'{num_of_not_exist_snp} SNPs not in additional_baseline_annotation_df but in the reference panel, so the additional baseline annotation of these SNP will set to 0')
                additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
                    self.snp_gene_pair_dummy.index,
                    fill_value=0)
            else:
                additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
                    self.snp_gene_pair_dummy.index)

            # do this for saving the cpu time, only calculate r2 once
            self.snp_gene_weight_matrix, additional_baseline_annotation_ldscore = (
                calculate_ldscore_from_multiple_annotation(
                    [self.snp_gene_pair_dummy, additional_baseline_annotation_df],
                    chrom,
                    self.config.bfile_root,
                    ld_wind=self.config.ld_wind,
                    ld_unit=self.config.ld_unit))

            additional_baseline_annotation_ldscore = additional_baseline_annotation_ldscore.loc[self.snp_name]

            ld_score_file = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather'
            M_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M'
            M_5_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50'

            # save additional baseline annotation ldscore
            self.save_ldscore_to_feather(additional_baseline_annotation_ldscore.values,
                                         column_names=additional_baseline_annotation_ldscore.columns,
                                         save_file_name=ld_score_file,
                                         )

            # calculate the M and save
            save_dir = Path(M_file_path).parent
            save_dir.mkdir(parents=True, exist_ok=True)
            M_chr_chunk = additional_baseline_annotation_df.values.sum(axis=0, keepdims=True)
            M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(axis=0, keepdims=True)
            np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
            np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )

        else:
            # Calculate SNP-Gene weight matrix
            self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(self.snp_gene_pair_dummy, chrom,
                                                                            self.config.bfile_root,
                                                                            ld_wind=self.config.ld_wind,
                                                                            ld_unit=self.config.ld_unit)
        # only keep the snp in keep_snp_root
        if self.keep_snp_mask is not None:
            self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]

        if self.config.save_pre_calculate_snp_gene_weight_matrix:
            snp_gene_weight_matrix_save_dir = Path(self.config.ldscore_save_dir) / 'snp_gene_weight_matrix'
            snp_gene_weight_matrix_save_dir.mkdir(parents=True, exist_ok=True)
            logger.info(f'Saving snp_gene_weight_matrix for chr{chrom}...')
            self.snp_gene_weight_matrix.reset_index().to_feather(
                snp_gene_weight_matrix_save_dir / f'{chrom}.snp_gene_weight_matrix.feather')

        # convert to sparse
        self.snp_gene_weight_matrix = csr_matrix(self.snp_gene_weight_matrix)
        logger.info(f'Compute snp_gene_weight_matrix finished. shape: {self.snp_gene_weight_matrix.shape}')

        # calculate baseline ld score
        logger.info(f'Calculating baseline ld score for chr{chrom}...')
        self.calculate_ldscore_for_base_line(chrom, self.config.sample_name, self.config.ldscore_save_dir)

        # calculate ld score for annotation
        logger.info(f'Calculating ld score for annotation for chr{chrom}...')
        # The last dummy column is the NaN ("no gene") indicator, hence ``[:-1]``.
        self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
            self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]],
            chrom,
            self.config.sample_name,
            self.config.ldscore_save_dir,
        )

    def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(self,
                                                              mk_score_chunk,
                                                              drop_dummy_na=True,
                                                              ):
        """
        Multiply the SNP-gene weight matrix with a block of marker scores.

        :param mk_score_chunk: matrix whose rows match the weight-matrix columns
            that are used.
        :param drop_dummy_na: when true, the last weight-matrix column (the NaN
            dummy) is left out of the product.
        :return: the matrix product.
        """
        weights = self.snp_gene_weight_matrix[:, :-1] if drop_dummy_na else self.snp_gene_weight_matrix
        return weights @ mk_score_chunk

    def save_ldscore_to_feather(self, ldscore_chr_chunk: np.ndarray, column_names, save_file_name):
        """
        Write one LD-score block as a feather file indexed by ``self.snp_name``.

        :param ldscore_chr_chunk: values with one row per entry of
            ``self.snp_name``.
        :param column_names: column labels of the saved table.
        :param save_file_name: target path; parent directories are created.
        """
        save_dir = Path(save_file_name).parent
        save_dir.mkdir(parents=True, exist_ok=True)

        # avoid overflow of float16: out-of-range values saturate with their sign kept
        ldscore_chr_chunk = self._saturate_to_float16(ldscore_chr_chunk)

        # save for each chunk
        df = pd.DataFrame(ldscore_chr_chunk,
                          index=self.snp_name,
                          columns=column_names,
                          )
        df.index.name = 'SNP'
        df.reset_index().to_feather(save_file_name)

    def save_ldscore_chunk_to_zarr(self, ldscore_chr_chunk: np.ndarray,
                                   chrom: int, start_col_index,
                                   ):
        """
        Write one LD-score block into the rows of ``chrom`` in the zarr array.

        :param ldscore_chr_chunk: values covering every SNP row of ``chrom``.
        :param chrom: 1-based chromosome number indexing
            ``self.chrom_snp_start_point``.
        :param start_col_index: first target column of the block.
        """
        # avoid overflow of float16: out-of-range values saturate with their sign kept
        ldscore_chr_chunk = self._saturate_to_float16(ldscore_chr_chunk)

        # save for each chunk
        chrom_snp_start_point = self.chrom_snp_start_point[chrom - 1]
        chrom_snp_end_point = self.chrom_snp_start_point[chrom]

        self.zarr_file[chrom_snp_start_point:chrom_snp_end_point,
        start_col_index:start_col_index + ldscore_chr_chunk.shape[1]] = ldscore_chr_chunk

    def calculate_M_use_SNP_gene_pair_dummy_by_chunk(self,
                                                     mk_score_chunk,
                                                     M_file_path, M_5_file_path,
                                                     drop_dummy_na=True,
                                                     ):
        """
        Compute and save the ``M`` and ``M_5_50`` rows for a block of marker scores.

        The dummy matrix is summed over SNPs (all SNPs for ``M``, only those in
        ``self.snp_pass_maf`` for ``M_5_50``) and multiplied with
        ``mk_score_chunk``.

        :param mk_score_chunk: marker-score block with one row per used gene column.
        :param M_file_path: output path of the all-SNP result.
        :param M_5_file_path: output path of the MAF-filtered result.
        :param drop_dummy_na: when true, the last (NaN dummy) column is dropped
            before the multiplication.
        """
        gene_counts_all = self.snp_gene_pair_dummy.values.sum(axis=0, keepdims=True)
        gene_counts_pass_maf = self.snp_gene_pair_dummy.loc[self.snp_pass_maf].values.sum(
            axis=0,
            keepdims=True)
        if drop_dummy_na:
            gene_counts_all = gene_counts_all[:, :-1]
            gene_counts_pass_maf = gene_counts_pass_maf[:, :-1]

        save_dir = Path(M_file_path).parent
        save_dir.mkdir(parents=True, exist_ok=True)
        M_chr_chunk = gene_counts_all @ mk_score_chunk
        M_5_chr_chunk = gene_counts_pass_maf @ mk_score_chunk
        np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
        np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )

    def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(self, mk_score_common, chrom, sample_name, save_dir):
        """
        Calculate and save the LD scores of one chromosome chunk by chunk.

        The columns of ``mk_score_common`` are processed in groups of
        ``config.spots_per_chunk``; each group yields one LD-score block (feather
        file or zarr slice) and one pair of ``M`` files.

        :param mk_score_common: marker scores, genes as rows.
        :param chrom: chromosome number used in file names and zarr offsets.
        :param sample_name: prefix of the chunk directories and files.
        :param save_dir: root directory of the chunk directories.
        :raises ValueError: if ``config.ldscore_save_format`` is neither
            ``'feather'`` nor ``'zarr'``.
        """
        chunk_starts = trange(0, mk_score_common.shape[1], self.config.spots_per_chunk,
                              desc=f'Calculating LD score by chunk for chr{chrom}')
        for chunk_index, i in enumerate(chunk_starts, start=1):
            mk_score_chunk = mk_score_common.iloc[:, i:i + self.config.spots_per_chunk]

            chunk_prefix = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}'
            ld_score_file = f'{chunk_prefix}.l2.ldscore.feather'
            M_file = f'{chunk_prefix}.l2.M'
            M_5_file = f'{chunk_prefix}.l2.M_5_50'

            ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
                mk_score_chunk,
                drop_dummy_na=True,
            )
            if self.config.ldscore_save_format == 'feather':
                self.save_ldscore_to_feather(ldscore_chr_chunk,
                                             column_names=mk_score_chunk.columns,
                                             save_file_name=ld_score_file,
                                             )
            elif self.config.ldscore_save_format == 'zarr':
                self.save_ldscore_chunk_to_zarr(ldscore_chr_chunk,
                                                chrom=chrom,
                                                start_col_index=i,
                                                )
            else:
                raise ValueError(f'Invalid ldscore_save_format: {self.config.ldscore_save_format}')

            self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
                mk_score_chunk,
                M_file,
                M_5_file,
                drop_dummy_na=True,
            )

    def calculate_ldscore_for_base_line(self, chrom, sample_name, save_dir):
        """
        Save the two baseline LD-score columns ``all_gene`` and ``base``.

        Both columns weight every dummy column with 1, except that ``all_gene``
        gives weight 0 to the last (NaN dummy) column.

        :param chrom: chromosome number used in the file names.
        :param sample_name: unused here; kept for signature compatibility.
        :param save_dir: root directory; files go to ``{save_dir}/baseline``.
        """
        # save baseline ld score
        baseline_mk_score = np.ones((self.snp_gene_pair_dummy.shape[1], 2))
        baseline_mk_score[-1, 0] = 0  # all_gene
        baseline_mk_score_df = pd.DataFrame(baseline_mk_score, index=self.snp_gene_pair_dummy.columns,
                                            columns=['all_gene', 'base'])
        ld_score_file = f'{save_dir}/baseline/baseline.{chrom}.l2.ldscore.feather'
        M_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M'
        M_5_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M_5_50'

        ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
            baseline_mk_score_df,
            drop_dummy_na=False,
        )

        self.save_ldscore_to_feather(ldscore_chr_chunk,
                                     column_names=baseline_mk_score_df.columns,
                                     save_file_name=ld_score_file,
                                     )
        # save baseline M
        self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
            baseline_mk_score_df,
            M_file,
            M_5_file,
            drop_dummy_na=False,
        )

    def get_snp_gene_dummy(self, chrom, ):
        """
        Get the dummy matrix of SNP-gene pairs.

        Depending on ``config.gene_window_enhancer_priority`` the SNP-to-gene
        assignment comes from the gene windows (``None``), from the enhancers
        (``'enhancer_only'``), or from both, with one source filling the SNPs the
        other left unassigned (``'gene_window_first'`` / ``'enhancer_first'``).
        The assignment table is also saved under
        ``{ldscore_save_dir}/SNP_gene_pair``.

        :param chrom: chromosome whose BIM file is loaded.
        :return: ``pd.get_dummies`` of the assigned gene names with a trailing
            NaN column.
        :raises ValueError: if ``gene_window_enhancer_priority`` is not one of
            the supported values.
        """
        # Load the bim file
        print("Loading bim data")
        bim, bim_pr = load_bim(self.config.bfile_root, chrom)

        priority = self.config.gene_window_enhancer_priority
        if priority in ['gene_window_first', 'enhancer_first']:
            SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(bim, bim_pr, )
            SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(bim, bim_pr, )

            if priority == 'gene_window_first':
                SNP_gene_pair, fallback = SNP_gene_pair_gtf, SNP_gene_pair_enhancer
            else:
                SNP_gene_pair, fallback = SNP_gene_pair_enhancer, SNP_gene_pair_gtf

            # SNPs without a gene in the preferred source take the gene of the other source.
            unassigned = SNP_gene_pair.gene_name.isna()
            SNP_gene_pair.loc[unassigned, 'gene_name'] = fallback.loc[unassigned, 'gene_name']

        elif priority is None:  # use gtf only
            SNP_gene_pair = self.get_SNP_gene_pair_from_gtf(bim, bim_pr, )

        elif priority == 'enhancer_only':
            SNP_gene_pair = self.get_SNP_gene_pair_from_enhancer(bim, bim_pr, )

        else:
            raise ValueError(
                f'Invalid self.config.gene_window_enhancer_priority: {self.config.gene_window_enhancer_priority}')

        # save the SNP_gene_pair to feather
        SNP_gene_pair_save_path = Path(
            self.config.ldscore_save_dir) / f'SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather'
        SNP_gene_pair_save_path.parent.mkdir(parents=True, exist_ok=True)
        SNP_gene_pair.reset_index().to_feather(SNP_gene_pair_save_path)

        # Get the dummy matrix
        SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair['gene_name'], dummy_na=True)
        return SNP_gene_pair_dummy

    def get_SNP_gene_pair_from_gtf(self, bim, bim_pr):
        """
        Assign each SNP to a gene through the gene windows.

        :param bim: BIM DataFrame with ``CHR``, ``BP``, ``SNP`` and ``CM`` columns.
        :param bim_pr: the same SNPs in the form accepted by ``Overlaps_gtf_bim``.
        :return: DataFrame indexed by ``SNP`` with one row per SNP of ``bim``;
            ``gene_name`` is NaN for SNPs without an overlap.
        """
        logger.info(
            "Get SNP-gene pair from gtf, if a SNP is in multiple genes, it will be assigned to the most nearby gene (TSS)")
        overlaps_small = Overlaps_gtf_bim(self.gtf_pr, bim_pr)
        # Get the SNP-gene pair
        annot = bim[["CHR", "BP", "SNP", "CM"]]
        SNP_gene_pair = overlaps_small[['SNP', 'gene_name']].set_index('SNP').join(annot.set_index('SNP'), how='right')
        return SNP_gene_pair

    def get_SNP_gene_pair_from_enhancer(self, bim, bim_pr, ):
        """
        Assign each SNP to a gene through the enhancer annotation.

        A SNP overlapping several enhancers keeps a single gene, chosen by
        ``config.snp_multiple_enhancer_strategy``: the highest ``avg_mkscore``
        (``'max_mkscore'``) or the smallest distance to the TSS
        (``'nearest_TSS'``).

        :param bim: BIM DataFrame with ``CHR``, ``BP``, ``SNP`` and ``CM`` columns.
        :param bim_pr: the same SNPs as a PyRanges object.
        :return: DataFrame indexed by ``SNP`` with one row per SNP of ``bim``;
            ``gene_name`` is NaN for SNPs without an overlap.
        :raises ValueError: if no enhancer annotation was loaded or the strategy
            is unknown.
        """
        if self.enhancer_pr is None:
            raise ValueError(
                'The enhancer annotation is required but config.enhancer_annotation_file was not provided')
        strategy = self.config.snp_multiple_enhancer_strategy
        # Fail early: an unknown strategy would otherwise leave several rows per SNP.
        if strategy not in ('max_mkscore', 'nearest_TSS'):
            raise ValueError(f'Invalid snp_multiple_enhancer_strategy: {strategy}')

        logger.info(
            "Get SNP-gene pair from enhancer, if a SNP is in multiple genes, it will be assigned to the gene with highest marker score")
        # Get the SNP-gene pair
        overlaps_small = self.enhancer_pr.join(bim_pr).df
        annot = bim[["CHR", "BP", "SNP", "CM"]]
        if strategy == 'max_mkscore':
            logger.debug('select the gene with highest marker score')
            overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').avg_mkscore.idxmax()]
        else:
            logger.debug('select the gene with nearest TSS')
            overlaps_small['Distance'] = np.abs(overlaps_small['Start_b'] - overlaps_small['TSS'])
            overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').Distance.idxmin()]

        SNP_gene_pair = overlaps_small[['SNP', 'gene_name']].set_index('SNP').join(annot.set_index('SNP'), how='right')

        return SNP_gene_pair
598
+
599
+
600
def run_generate_ldscore(config: GenerateLDScoreConfig):
    """
    Entry point of the LD-score generation step.

    In ``'quick_mode'`` nothing is computed: the pre-calculated ``baseline`` and
    ``SNP_gene_pair`` directories are symlinked into ``config.ldscore_save_dir``.
    Otherwise an ``S_LDSC_Boost`` is built and run on chromosomes 1-22
    (``config.chrom == 'all'``) or on the single chromosome ``config.chrom``.

    :param config: run configuration.
    """
    if config.ldscore_save_format == 'quick_mode':
        logger.info('Running in quick_mode. Skip the process of generating ldscore. Using the pre-calculated ldscore.')
        # The rest of this module treats ldscore_save_dir as a plain string, so it
        # must be coerced before ``/`` can be applied to it.
        ldscore_save_dir = Path(config.ldscore_save_dir)
        ldscore_save_dir.mkdir(parents=True, exist_ok=True)

        link_targets = (
            ('baseline', config.baseline_annotation_dir),  # link the baseline annotation
            ('SNP_gene_pair', config.SNP_gene_pair_dir),  # link the SNP_gene_pair
        )
        for link_name, target_dir in link_targets:
            link_path = ldscore_save_dir / link_name
            # A previous run may already have created the link; symlink_to would
            # raise FileExistsError, so keep what is there and say so.
            if link_path.is_symlink() or link_path.exists():
                logger.warning(f'{link_path} already exists, skip linking it to {target_dir}')
                continue
            link_path.symlink_to(Path(target_dir), target_is_directory=True)
        return

    s_ldsc_boost = S_LDSC_Boost(config)
    if config.chrom == 'all':
        for chrom in range(1, 23):
            s_ldsc_boost.process_chromosome(chrom)
    else:
        s_ldsc_boost.process_chromosome(config.chrom)