gsMap 1.60__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,551 @@
1
+ import argparse
2
+ import logging
3
+ from pathlib import Path
4
+
5
+ import numpy as np
6
+ # %%
7
+ import pandas as pd
8
+ import pyranges as pr
9
+ from scipy.sparse import csr_matrix
10
+ from tqdm import trange
11
+
12
+ from gsMap.config import GenerateLDScoreConfig, add_generate_ldscore_args
13
+ # %%
14
+ from gsMap.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
15
+
16
# Module-level logging: a stream handler with a brace-style format is attached
# to this module's logger, which is opened up to DEBUG.
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter('[{asctime}] {levelname:6s} {message}', style='{')
)

logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
22
+
23
+
24
+ # %%
25
+ # load gtf
26
def load_gtf(gtf_file, mk_score, window_size):
    """
    Read a GTF annotation and build windowed gene intervals for the genes
    that also appear in the marker-score table.

    :param gtf_file: path handed to ``pyranges.read_gtf``.
    :param mk_score: marker-score table whose index holds gene names.
    :param window_size: number of bases added on both sides of each gene.
    :return: ``(gtf_pr, mk_score)`` - a PyRanges of the windowed genes (with
        ``TSS`` / ``TED`` columns) and ``mk_score`` restricted to shared genes.
    """
    print("Loading gtf data")
    annotation = pr.read_gtf(gtf_file).df

    # Only gene-level records are used.
    gene_rows = annotation[annotation['Feature'] == 'gene']

    # Restrict both tables to the genes they have in common.
    shared_genes = np.intersect1d(mk_score.index, gene_rows.gene_name)
    gene_rows = gene_rows[gene_rows.gene_name.isin(shared_genes)]
    mk_score = mk_score[mk_score.index.isin(shared_genes)]

    # One record per gene name; the first occurrence wins.
    gene_rows = gene_rows.drop_duplicates(subset='gene_name', keep="first")

    windows = gene_rows[['Chromosome', 'Start', 'End', 'gene_name', 'Strand']].copy()
    # Remember the original gene boundaries before widening the interval.
    windows.loc[:, 'TSS'] = windows['Start']
    windows.loc[:, 'TED'] = windows['End']

    windows.loc[:, 'Start'] = windows['TSS'] - window_size
    windows.loc[:, 'End'] = windows['TED'] + window_size
    # A window may not start before position 0.
    windows.loc[windows['Start'] < 0, 'Start'] = 0

    # On the minus strand the two gene ends trade places.
    on_minus = windows['Strand'] == '-'
    minus_tss = windows.loc[on_minus, 'TSS']
    minus_ted = windows.loc[on_minus, 'TED']
    windows.loc[on_minus, 'TSS'] = minus_ted
    windows.loc[on_minus, 'TED'] = minus_tss
    windows = windows.drop('Strand', axis=1)

    return pr.PyRanges(windows), mk_score
65
+
66
+
67
+ # %%
68
def load_marker_score(mk_score_file):
    """
    Read the marker-score feather file.

    The ``HUMAN_GENE_SYM`` column becomes the index (renamed ``gene_name``)
    and all remaining values are cast to float32.
    """
    raw_table = pd.read_feather(mk_score_file)
    by_gene = raw_table.set_index('HUMAN_GENE_SYM')
    by_gene = by_gene.rename_axis('gene_name')
    return by_gene.astype(np.float32, copy=False)
75
+
76
+
77
+ # %%
78
+ # load mkscore get common gene
79
+ # %%
80
+ # load bim
81
def load_bim(bfile_root, chrom):
    """
    Read ``{bfile_root}.{chrom}.bim`` and derive a PyRanges view of it.

    :return: ``(bim, bim_pr)`` - the raw table with columns
        CHR/SNP/CM/BP/A1/A2, and a PyRanges in which every SNP is an interval
        with ``Start == End`` and the chromosome is set to ``chr{chrom}``.
    """
    print("Loading bim data")
    bim = pd.read_csv(f'{bfile_root}.{chrom}.bim', sep='\t', header=None)
    bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]

    # PyRanges expects Chromosome/Start/End column names.
    interval_table = bim.rename(columns={"CHR": "Chromosome", "BP": "Start"})
    interval_table['End'] = interval_table['Start']

    bim_pr = pr.PyRanges(interval_table)
    bim_pr.Chromosome = f'chr{chrom}'
    return bim, bim_pr
96
+
97
+
98
+ # %%
99
def Overlaps_gtf_bim(gtf_pr, bim_pr):
    """
    Join gene windows with SNP positions and keep one gene per SNP.

    For every SNP the joined row with the smallest absolute distance between
    the SNP position (``Start_b``) and the gene ``TSS`` is retained.
    """
    joined = gtf_pr.join(bim_pr).df
    joined['Distance'] = np.abs(joined['Start_b'] - joined['TSS'])

    # Row label of the closest gene for each SNP.
    closest_rows = joined.groupby('SNP').Distance.idxmin()
    return joined.loc[closest_rows]
110
+
111
+
112
+ # %%
113
+
114
+
115
+ # %%
116
def get_snp_pass_maf(bfile_root, chrom, maf_min=0.05):
    """
    List the SNP IDs of one chromosome whose MAF is above ``maf_min``.

    The PLINK file set ``{bfile_root}.{chrom}`` (.bim / .fam / .bed) is opened
    without any SNP, individual or MAF pre-filtering, and the SNPs are then
    selected with ``geno_array.maf > maf_min``.

    :param bfile_root: PLINK prefix without the chromosome suffix.
    :param chrom: chromosome identifier inserted into the file names.
    :param maf_min: exclusive lower MAF bound.
    :return: list taken from the ``SNP`` column of the selected bim entries.
    """
    PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
    PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])

    bfile = f'{bfile_root}.{chrom}'

    # SNP list (bim)
    array_snps = PlinkBIMFile(bfile + '.bim')

    # Individuals (fam); only their count is needed to read the bed file.
    array_indivs = PlinkFAMFile(bfile + '.fam')
    n = len(array_indivs.IDList)

    geno_array = PlinkBEDFileWithR2Cache(bfile + '.bed', n, array_snps,
                                         keep_snps=None, keep_indivs=None, mafMin=None)

    # NOTE(review): assumes geno_array.maf is aligned with array_snps.IDList - confirm.
    ii = geno_array.maf > maf_min
    snp_pass_maf = array_snps.IDList[ii]
    print(f'After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain.')
    return snp_pass_maf.SNP.to_list()
139
+
140
+
141
def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit='CM'):
    """
    Compute annotation-weighted LD scores for one chromosome.

    :param bfile_root: PLINK prefix without the chromosome suffix.
    :param chrom: chromosome identifier inserted into the file names.
    :param annot_matrix: annotation matrix passed to ``ldScoreVarBlocks``.
    :param ld_wind: window size, interpreted according to ``ld_unit``.
    :param ld_unit: ``'SNP'``, ``'KB'`` or ``'CM'``.
    :return: DataFrame wrapping the result of ``ldScoreVarBlocks``.
    :raises ValueError: if ``ld_unit`` is none of the supported units.
    """
    PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
    PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])

    bfile = f'{bfile_root}.{chrom}'

    # SNP list (bim)
    snp_file = bfile + '.bim'
    array_snps = PlinkBIMFile(snp_file)
    m = len(array_snps.IDList)
    print(f'Read list of {m} SNPs from {snp_file}')

    # Individuals (fam)
    ind_file = bfile + '.fam'
    array_indivs = PlinkFAMFile(ind_file)
    n = len(array_indivs.IDList)
    print(f'Read list of {n} individuals from {ind_file}')

    # Genotypes (bed), no pre-filtering
    geno_array = PlinkBEDFileWithR2Cache(bfile + '.bed', n, array_snps,
                                         keep_snps=None, keep_indivs=None, mafMin=None)

    # Pick the coordinate system and the window width in that system.
    if ld_unit == 'CM':
        max_dist = ld_wind
        coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]
    elif ld_unit == 'KB':
        max_dist = ld_wind * 1000
        coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
    elif ld_unit == 'SNP':
        max_dist = ld_wind
        coords = np.arange(geno_array.m)
    else:
        raise ValueError(f'Invalid ld_wind_unit: {ld_unit}')

    block_left = getBlockLefts(coords, max_dist)

    # LD scores, computed in chunks of 100 as in the ldScoreVarBlocks call.
    scores = geno_array.ldScoreVarBlocks(block_left, 100, annot=annot_matrix)
    return pd.DataFrame(scores)
174
+
175
+
176
+ # %%
177
def calculate_ldscore_from_annotation(SNP_annotation_df, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
    """
    Compute LD scores for a single annotation table.

    The values of ``SNP_annotation_df`` are passed to ``get_ldscore``; the
    result is cast to float32 and given the index and columns of the input.
    """
    weights = get_ldscore(
        bfile_root,
        chrom,
        SNP_annotation_df.values,
        ld_wind=ld_wind,
        ld_unit=ld_unit,
    ).astype(np.float32, copy=False)

    # Re-attach the labels of the annotation table.
    weights.index = SNP_annotation_df.index
    weights.columns = SNP_annotation_df.columns
    return weights
189
+
190
+
191
def calculate_ldscore_from_multiple_annotation(SNP_annotation_df_list, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
    """
    Compute LD scores for several annotation tables with one ``get_ldscore`` call.

    The tables are concatenated column-wise, scored together, and the result
    is cut back into one float32 DataFrame per input table (same order).
    """
    combined = pd.concat(SNP_annotation_df_list, axis=1)

    weights = get_ldscore(
        bfile_root,
        chrom,
        combined.values,
        ld_wind=ld_wind,
        ld_unit=ld_unit,
    ).astype(np.float32, copy=False)
    weights.index = combined.index
    weights.columns = combined.columns

    # Column boundaries of each input table inside the combined matrix.
    widths = [len(df.columns) for df in SNP_annotation_df_list]
    stops = np.cumsum(widths)
    starts = stops - widths
    return [weights.iloc[:, lo:hi] for lo, hi in zip(starts, stops)]
208
+
209
+
210
+ # %%
211
class S_LDSC_Boost:
    """
    Generate per-chromosome LD-score and M files from marker scores.

    SNPs are linked to genes through gene windows (GTF) and/or enhancer
    intervals, the resulting SNP-by-gene indicator matrix is turned into
    LD scores, and those are combined with the marker-score table chunk by
    chunk and written below ``config.ldscore_save_dir``.
    """

    def __init__(self, config: "GenerateLDScoreConfig"):
        """
        Load the marker scores, the gene windows and, if configured, the
        enhancer intervals.

        :param config: run configuration; the fields read here are
            ``mkscore_feather_file``, ``gtf_annotation_file``,
            ``gene_window_size`` and ``enhancer_annotation_file``.
        """
        self.config = config

        self.mk_score = load_marker_score(config.mkscore_feather_file)

        # Load GTF and get common markers
        self.gtf_pr, self.mk_score_common = load_gtf(config.gtf_annotation_file, self.mk_score,
                                                     window_size=config.gene_window_size)

        # Load enhancer
        if config.enhancer_annotation_file is not None:
            enhancer_df = pr.read_bed(config.enhancer_annotation_file, as_df=True)
            enhancer_df.set_index('Name', inplace=True)
            enhancer_df.index.name = 'gene_name'

            # keep the common genes and add the average marker score per gene
            avg_mkscore = pd.DataFrame(self.mk_score_common.mean(axis=1), columns=['avg_mkscore'])
            enhancer_df = enhancer_df.join(avg_mkscore, how='inner', on='gene_name', )

            # add the TSS of the linked gene
            enhancer_df['TSS'] = self.gtf_pr.df.set_index('gene_name').reindex(enhancer_df.index)['TSS']

            # convert to pyranges
            self.enhancer_pr = pr.PyRanges(enhancer_df.reset_index())
        else:
            self.enhancer_pr = None

    def process_chromosome(self, chrom: int):
        """
        Run the whole pipeline for one chromosome and write its output files.

        :param chrom: chromosome identifier used in all input/output file names.
        :raises FileNotFoundError: if an additional baseline directory is
            configured but ``baseline.{chrom}.annot.gz`` is missing in it.
        """
        self.snp_pass_maf = get_snp_pass_maf(self.config.bfile_root, chrom, maf_min=0.05)

        # Get SNP-Gene dummy pairs
        self.snp_gene_pair_dummy = self.get_snp_gene_dummy(chrom, )

        if self.config.keep_snp_root is not None:
            keep_snp = pd.read_csv(f'{self.config.keep_snp_root}.{chrom}.snp', header=None)[0].to_list()
            self.keep_snp_mask = self.snp_gene_pair_dummy.index.isin(keep_snp)
            # names of the SNPs that are kept
            self.snp_name = self.snp_gene_pair_dummy.index[self.keep_snp_mask].to_list()
        else:
            self.keep_snp_mask = None
            self.snp_name = self.snp_gene_pair_dummy.index.to_list()

        if self.config.additional_baseline_annotation_dir_path is not None:
            additional_baseline_annotation_dir_path = Path(self.config.additional_baseline_annotation_dir_path)
            additional_baseline_annotation_file_path = additional_baseline_annotation_dir_path / f'baseline.{chrom}.annot.gz'
            # Explicit check instead of `assert`, which is removed under `python -O`.
            if not additional_baseline_annotation_file_path.exists():
                raise FileNotFoundError(
                    f'additional_baseline_annotation_file_path not exists: {additional_baseline_annotation_file_path}')
            additional_baseline_annotation_df = pd.read_csv(additional_baseline_annotation_file_path, sep='\t')
            additional_baseline_annotation_df.set_index('SNP', inplace=True)

            # drop the positional columns if they exist
            additional_baseline_annotation_df.drop(['CHR', 'BP', 'CM'], axis=1, inplace=True, errors='ignore')

            # SNPs of the reference panel that are absent from the annotation get 0
            num_of_not_exist_snp = (~self.snp_gene_pair_dummy.index.isin(additional_baseline_annotation_df.index)).sum()
            if num_of_not_exist_snp > 0:
                logger.warning(
                    f'{num_of_not_exist_snp} SNPs not in additional_baseline_annotation_df but in the reference panel, so the additional baseline annotation of these SNP will set to 0')
            # fill_value only matters when SNPs are missing, so one call covers both cases
            additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
                self.snp_gene_pair_dummy.index,
                fill_value=0)

            # score both annotation tables in one pass so r2 is computed only once
            self.snp_gene_weight_matrix, additional_baseline_annotation_ldscore = (
                calculate_ldscore_from_multiple_annotation(
                    [self.snp_gene_pair_dummy, additional_baseline_annotation_df],
                    chrom,
                    self.config.bfile_root,
                    ld_wind=self.config.ld_wind,
                    ld_unit=self.config.ld_unit))

            ld_score_file = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather'
            M_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M'
            M_5_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50'

            # save additional baseline annotation ldscore
            self.save_ldscore(additional_baseline_annotation_ldscore.values,
                              column_names=additional_baseline_annotation_ldscore.columns,
                              save_file_name=ld_score_file,
                              )

            # calculate M (all SNPs) and M_5_50 (SNPs passing the MAF filter) and save
            save_dir = Path(M_file_path).parent
            save_dir.mkdir(parents=True, exist_ok=True)
            M_chr_chunk = additional_baseline_annotation_df.values.sum(axis=0, keepdims=True)
            M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(axis=0, keepdims=True)
            np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
            np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )

        else:
            # Calculate SNP-Gene weight matrix
            self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(self.snp_gene_pair_dummy, chrom,
                                                                            self.config.bfile_root,
                                                                            ld_wind=self.config.ld_wind,
                                                                            ld_unit=self.config.ld_unit)
        # convert to sparse
        self.snp_gene_weight_matrix = csr_matrix(self.snp_gene_weight_matrix)

        # calculate baseline ld score
        self.calculate_ldscore_for_base_line(chrom, self.config.sample_name, self.config.ldscore_save_dir)

        # calculate ld score for annotation; the last dummy column (NaN gene) has no marker score
        self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
            self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]],
            chrom,
            self.config.sample_name,
            self.config.ldscore_save_dir,
        )

    def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(self,
                                                              mk_score_chunk,
                                                              save_file_name,
                                                              drop_dummy_na=True,
                                                              ):
        """
        Multiply the SNP-gene weight matrix by one marker-score chunk and save it.

        :param mk_score_chunk: DataFrame with genes as rows; its columns name the output columns.
        :param save_file_name: feather file to write.
        :param drop_dummy_na: when true, the last column of the weight matrix
            (the NaN dummy) is excluded from the product.
        """
        if drop_dummy_na:
            ldscore_chr_chunk = self.snp_gene_weight_matrix[:, :-1] @ mk_score_chunk
        else:
            ldscore_chr_chunk = self.snp_gene_weight_matrix @ mk_score_chunk

        self.save_ldscore(ldscore_chr_chunk,
                          column_names=mk_score_chunk.columns,
                          save_file_name=save_file_name,
                          )

    @staticmethod
    def _saturate_to_float16(values):
        """Cast ``values`` to float16, replacing +/-inf by +/- the largest finite float16."""
        # Overflow during the down-cast is expected and repaired just below.
        with np.errstate(over='ignore'):
            values = values.astype(np.float16, copy=False)
        limit = np.finfo(np.float16).max
        overflowed = np.isinf(values)
        too_large = overflowed & (values > 0)
        too_small = overflowed & (values < 0)
        values[too_large] = limit
        # Keep the sign: negative overflow must not become a large positive score.
        values[too_small] = -limit
        return values

    def save_ldscore(self, ldscore_chr_chunk: np.ndarray, column_names, save_file_name):
        """
        Write an LD-score matrix as a feather file with a leading ``SNP`` column.

        Values are stored as float16 (infinite values are saturated to the
        signed float16 maximum). If ``config.keep_snp_root`` is set, only the
        rows selected by ``self.keep_snp_mask`` are written.
        """
        save_dir = Path(save_file_name).parent
        save_dir.mkdir(parents=True, exist_ok=True)

        ldscore_chr_chunk = self._saturate_to_float16(ldscore_chr_chunk)
        if self.config.keep_snp_root is not None:
            ldscore_chr_chunk = ldscore_chr_chunk[self.keep_snp_mask]

        df = pd.DataFrame(ldscore_chr_chunk,
                          index=self.snp_name,
                          columns=column_names,
                          )
        df.index.name = 'SNP'
        df.reset_index().to_feather(save_file_name)

    def calculate_M_use_SNP_gene_pair_dummy_by_chunk(self,
                                                     mk_score_chunk,
                                                     M_file_path, M_5_file_path,
                                                     drop_dummy_na=True,
                                                     ):
        """
        Compute and save M and M_5_50 for one marker-score chunk.

        The SNP-gene dummy matrix is summed over SNPs (once for all SNPs, once
        for the SNPs in ``self.snp_pass_maf``) and each sum is multiplied by
        ``mk_score_chunk``. With ``drop_dummy_na`` the last (NaN) dummy column
        is left out. Results are written tab-separated with ``np.savetxt``.
        """
        per_gene_count = self.snp_gene_pair_dummy.values.sum(axis=0, keepdims=True)
        per_gene_count_pass_maf = self.snp_gene_pair_dummy.loc[self.snp_pass_maf].values.sum(
            axis=0,
            keepdims=True)
        if drop_dummy_na:
            per_gene_count = per_gene_count[:, :-1]
            per_gene_count_pass_maf = per_gene_count_pass_maf[:, :-1]

        save_dir = Path(M_file_path).parent
        save_dir.mkdir(parents=True, exist_ok=True)
        M_chr_chunk = per_gene_count @ mk_score_chunk
        M_5_chr_chunk = per_gene_count_pass_maf @ mk_score_chunk
        np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
        np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )

    def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(self, mk_score_common, chrom, sample_name, save_dir):
        """
        Write LD-score and M files for every column chunk of ``mk_score_common``.

        Columns are processed ``config.spots_per_chunk`` at a time; chunk ``k``
        (1-based) is written to ``{save_dir}/{sample_name}_chunk{k}/``.

        :param mk_score_common: marker scores, genes as rows.
        :param chrom: chromosome identifier used in the file names.
        :param sample_name: prefix of the output directories and files.
        :param save_dir: root output directory.
        """
        chunk_starts = trange(0, mk_score_common.shape[1], self.config.spots_per_chunk,
                              desc=f'Calculating LD score by chunk for chr{chrom}')
        for chunk_index, i in enumerate(chunk_starts, start=1):
            mk_score_chunk = mk_score_common.iloc[:, i:i + self.config.spots_per_chunk]

            ld_score_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.ldscore.feather'
            M_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M'
            M_5_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50'

            self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
                mk_score_chunk,
                save_file_name=ld_score_file,
                drop_dummy_na=True,
            )
            self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
                mk_score_chunk,
                M_file,
                M_5_file,
                drop_dummy_na=True,
            )

    def calculate_ldscore_for_base_line(self, chrom, sample_name, save_dir):
        """
        Write the two baseline annotations ``all_gene`` and ``base``.

        Both weight every dummy column by 1, except that ``all_gene`` gives
        weight 0 to the last (NaN) dummy column. ``sample_name`` is accepted
        but not used in the output paths.
        """
        baseline_mk_score = np.ones((self.snp_gene_pair_dummy.shape[1], 2))
        baseline_mk_score[-1, 0] = 0  # all_gene ignores the NaN dummy column
        baseline_mk_score_df = pd.DataFrame(baseline_mk_score, index=self.snp_gene_pair_dummy.columns,
                                            columns=['all_gene', 'base'])
        ld_score_file = f'{save_dir}/baseline/baseline.{chrom}.l2.ldscore.feather'
        M_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M'
        M_5_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M_5_50'

        self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
            baseline_mk_score_df,
            save_file_name=ld_score_file,
            drop_dummy_na=False,
        )
        # save baseline M
        self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
            baseline_mk_score_df,
            M_file,
            M_5_file,
            drop_dummy_na=False,
        )

    def get_snp_gene_dummy(self, chrom, ):
        """
        Build the SNP-by-gene indicator matrix for one chromosome.

        ``config.gene_window_enhancer_priority`` selects the source of the
        SNP-gene links: ``None`` (gene windows only), ``'enhancer_only'``,
        or ``'gene_window_first'`` / ``'enhancer_first'`` (the first source
        wins, the other one fills SNPs left without a gene).

        :return: ``pd.get_dummies`` of the gene names with a trailing NaN column.
        :raises ValueError: for any other priority value.
        """
        priority = self.config.gene_window_enhancer_priority

        # Load the bim file
        bim, bim_pr = load_bim(self.config.bfile_root, chrom)

        if priority in ('gene_window_first', 'enhancer_first'):
            SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(bim, bim_pr, )
            SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(bim, bim_pr, )

            if priority == 'gene_window_first':
                SNP_gene_pair, fallback = SNP_gene_pair_gtf, SNP_gene_pair_enhancer
            else:
                SNP_gene_pair, fallback = SNP_gene_pair_enhancer, SNP_gene_pair_gtf

            # SNPs without a gene in the preferred source take the gene of the other source
            unassigned = SNP_gene_pair.gene_name.isna()
            SNP_gene_pair.loc[unassigned, 'gene_name'] = fallback.loc[unassigned, 'gene_name']

        elif priority is None:  # use gtf only
            SNP_gene_pair = self.get_SNP_gene_pair_from_gtf(bim, bim_pr, )

        elif priority == 'enhancer_only':
            SNP_gene_pair = self.get_SNP_gene_pair_from_enhancer(bim, bim_pr, )

        else:
            raise ValueError(
                f'Invalid self.config.gene_window_enhancer_priority: {self.config.gene_window_enhancer_priority}')

        # Get the dummy matrix
        SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair['gene_name'], dummy_na=True)
        return SNP_gene_pair_dummy

    def get_SNP_gene_pair_from_gtf(self, bim, bim_pr):
        """
        Link every SNP of ``bim`` to at most one gene through the gene windows.

        :return: DataFrame indexed by SNP with ``gene_name`` (NaN when the SNP
            lies in no window) plus the CHR/BP/CM columns of ``bim``.
        """
        logger.info(
            "Get SNP-gene pair from gtf, if a SNP is in multiple genes, it will be assigned to the most nearby gene (TSS)")
        overlaps_small = Overlaps_gtf_bim(self.gtf_pr, bim_pr)
        # Right join so that every SNP of the bim file is present
        annot = bim[["CHR", "BP", "SNP", "CM"]]
        SNP_gene_pair = overlaps_small[['SNP', 'gene_name']].set_index('SNP').join(annot.set_index('SNP'), how='right')
        return SNP_gene_pair

    def get_SNP_gene_pair_from_enhancer(self, bim, bim_pr, ):
        """
        Link every SNP of ``bim`` to at most one gene through enhancer intervals.

        A SNP overlapping enhancers of several genes is resolved according to
        ``config.snp_multiple_enhancer_strategy``: ``'max_mkscore'`` (highest
        average marker score) or ``'nearest_TSS'`` (smallest distance to TSS).

        :return: DataFrame indexed by SNP with ``gene_name`` plus CHR/BP/CM.
        :raises ValueError: if no enhancer annotation was loaded, or the
            strategy is not one of the two supported values.
        """
        logger.info(
            "Get SNP-gene pair from enhancer, if a SNP is in multiple genes, it will be assigned to the gene with highest marker score")
        if self.enhancer_pr is None:
            raise ValueError(
                'enhancer_annotation_file is required when SNP-gene pairs are taken from enhancers')

        # Get the SNP-gene pair
        overlaps_small = self.enhancer_pr.join(bim_pr).df
        annot = bim[["CHR", "BP", "SNP", "CM"]]
        strategy = self.config.snp_multiple_enhancer_strategy
        if strategy == 'max_mkscore':
            logger.debug('select the gene with highest marker score')
            overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').avg_mkscore.idxmax()]

        elif strategy == 'nearest_TSS':
            logger.debug('select the gene with nearest TSS')
            overlaps_small['Distance'] = np.abs(overlaps_small['Start_b'] - overlaps_small['TSS'])
            overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').Distance.idxmin()]

        else:
            # Without de-duplication a SNP could appear several times in the result.
            raise ValueError(
                f'Invalid self.config.snp_multiple_enhancer_strategy: {strategy}')

        SNP_gene_pair = overlaps_small[['SNP', 'gene_name']].set_index('SNP').join(annot.set_index('SNP'), how='right')

        return SNP_gene_pair
502
+
503
+
504
def run_generate_ldscore(config: GenerateLDScoreConfig):
    """
    Entry point: process the configured chromosome(s).

    ``config.chrom == 'all'`` iterates over chromosomes 1 to 22; any other
    value is forwarded unchanged as a single chromosome.
    """
    s_ldsc_boost = S_LDSC_Boost(config)
    chromosomes = range(1, 23) if config.chrom == 'all' else (config.chrom,)
    for chrom in chromosomes:
        s_ldsc_boost.process_chromosome(chrom)
511
+
512
+
513
+ # %%
514
if __name__ == '__main__':
    # Developer switch. It was hard-coded to True, which made the command-line
    # branch below unreachable and always ran against the absolute paths of
    # the development machine. Set it to True only for local debugging.
    TEST = False
    if TEST:
        # %%
        sample_name = 'Cortex_151507'
        chrom = 'all'
        save_dir = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/Cortex_151507/snp_annotation/test/0101/sparse'
        # %%
        gtf_file = '/storage/yangjianLab/songliyang/ReferenceGenome/GRCh37/gencode.v39lift37.annotation.gtf'
        mkscore_feather_file = f'/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/{sample_name}/gene_markers/{sample_name}_rank.feather'
        bfile_root = '/storage/yangjianLab/sharedata/LDSC_resource/1000G_EUR_Phase3_plink/1000G.EUR.QC'
        window_size = 50000
        keep_snp_root = '/storage/yangjianLab/sharedata/LDSC_resource/hapmap3_snps/hm'
        spots_per_chunk = 10_000
        enhancer_annotation = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/resource/epigenome/cleaned_data/by_tissue/BRN/ABC_roadmap_merged.bed'
        # %%
        config = GenerateLDScoreConfig(
            sample_name=sample_name,
            chrom=chrom,
            ldscore_save_dir=save_dir,
            gtf_annotation_file=gtf_file,
            mkscore_feather_file=mkscore_feather_file,
            bfile_root=bfile_root,
            keep_snp_root=keep_snp_root,
            gene_window_size=window_size,
            spots_per_chunk=spots_per_chunk,
            enhancer_annotation_file=enhancer_annotation,
            gene_window_enhancer_priority='enhancer_first',
            additional_baseline_annotation_dir_path='/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/resource/ldsc/baseline_v1.2/remove_base'
        )
        # %%
        run_generate_ldscore(config)
    else:
        # Regular command-line use: build the configuration from the parsed arguments.
        parser = argparse.ArgumentParser(description="Configuration for the application.")
        add_generate_ldscore_args(parser)
        args = parser.parse_args()
        config = GenerateLDScoreConfig(**vars(args))
        run_generate_ldscore(config)