gsMap 1.60__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN_VAE/__init__.py +0 -0
- gsMap/GNN_VAE/adjacency_matrix.py +95 -0
- gsMap/GNN_VAE/model.py +87 -0
- gsMap/GNN_VAE/train.py +97 -0
- gsMap/__init__.py +5 -0
- gsMap/__main__.py +3 -0
- gsMap/cauchy_combination_test.py +163 -0
- gsMap/config.py +734 -0
- gsMap/find_latent_representation.py +209 -0
- gsMap/format_sumstats.py +410 -0
- gsMap/generate_ldscore.py +551 -0
- gsMap/generate_r2_matrix.py +743 -0
- gsMap/jackknife.py +514 -0
- gsMap/latent_to_gene.py +257 -0
- gsMap/main.py +39 -0
- gsMap/make_annotations.py +560 -0
- gsMap/regression_read.py +294 -0
- gsMap/spatial_ldsc_multiple_sumstats.py +307 -0
- gsMap/visualize.py +154 -0
- gsmap-1.60.dist-info/LICENSE +21 -0
- gsmap-1.60.dist-info/METADATA +124 -0
- gsmap-1.60.dist-info/RECORD +24 -0
- gsmap-1.60.dist-info/WHEEL +4 -0
- gsmap-1.60.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,551 @@
|
|
1
|
+
import argparse
|
2
|
+
import logging
|
3
|
+
from pathlib import Path
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
# %%
|
7
|
+
import pandas as pd
|
8
|
+
import pyranges as pr
|
9
|
+
from scipy.sparse import csr_matrix
|
10
|
+
from tqdm import trange
|
11
|
+
|
12
|
+
from gsMap.config import GenerateLDScoreConfig, add_generate_ldscore_args
|
13
|
+
# %%
|
14
|
+
from gsMap.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
|
15
|
+
|
16
|
+
# Module-level logging: a dedicated stream handler that renders records as
# "[time] LEVEL message", attached to this module's logger at DEBUG level.
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter(fmt='[{asctime}] {levelname:6s} {message}', style='{')
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)
|
22
|
+
|
23
|
+
|
24
|
+
# %%
|
25
|
+
# load gtf
|
26
|
+
def load_gtf(gtf_file, mk_score, window_size):
    """
    Read a GTF annotation and build one windowed interval per gene.

    Only rows whose ``Feature`` is ``'gene'`` and whose ``gene_name`` also
    occurs in the index of ``mk_score`` are kept (first row per gene name).
    Each interval is widened by ``window_size`` on both sides, with the start
    clipped at 0. The unwidened start/end are stored in the ``TSS``/``TED``
    columns and swapped for genes on the '-' strand.

    :param gtf_file: path handed to ``pyranges.read_gtf``.
    :param mk_score: DataFrame indexed by gene name.
    :param window_size: amount added on each side of every gene.
    :return: ``(gtf_pr, mk_score)`` - the windowed genes as a PyRanges object
        and ``mk_score`` restricted to the genes shared with the GTF.
    """
    print("Loading gtf data")

    # Gene-level rows only
    annotation = pr.read_gtf(gtf_file).df
    annotation = annotation[annotation['Feature'] == 'gene']

    # Restrict both tables to the genes they have in common
    shared_genes = np.intersect1d(mk_score.index, annotation.gene_name)
    annotation = annotation[annotation.gene_name.isin(shared_genes)]
    mk_score = mk_score[mk_score.index.isin(shared_genes)]

    # One row per gene name
    annotation = annotation.drop_duplicates(subset='gene_name', keep="first")

    # Remember the original boundaries, then open the window around the gene
    gtf_bed = annotation[['Chromosome', 'Start', 'End', 'gene_name', 'Strand']].copy()
    gtf_bed.loc[:, 'TSS'] = gtf_bed['Start']
    gtf_bed.loc[:, 'TED'] = gtf_bed['End']

    gtf_bed.loc[:, 'Start'] = gtf_bed['TSS'] - window_size
    gtf_bed.loc[:, 'End'] = gtf_bed['TED'] + window_size
    gtf_bed.loc[gtf_bed['Start'] < 0, 'Start'] = 0

    # On the negative strand the transcription start is the interval end
    on_minus = gtf_bed['Strand'] == '-'
    start_on_minus = gtf_bed.loc[on_minus, 'TSS']
    end_on_minus = gtf_bed.loc[on_minus, 'TED']
    gtf_bed.loc[on_minus, 'TSS'] = end_on_minus
    gtf_bed.loc[on_minus, 'TED'] = start_on_minus
    gtf_bed = gtf_bed.drop('Strand', axis=1)

    return pr.PyRanges(gtf_bed), mk_score
|
65
|
+
|
66
|
+
|
67
|
+
# %%
|
68
|
+
def load_marker_score(mk_score_file):
    """
    Read the marker-score table from a feather file.

    The ``HUMAN_GENE_SYM`` column becomes the index (renamed to
    ``gene_name``) and all remaining columns are cast to float32.

    :param mk_score_file: path of the feather file.
    :return: float32 DataFrame indexed by ``gene_name``.
    """
    table = pd.read_feather(mk_score_file)
    table = table.set_index('HUMAN_GENE_SYM')
    table = table.rename_axis('gene_name')
    return table.astype(np.float32, copy=False)
|
75
|
+
|
76
|
+
|
77
|
+
# %%
|
78
|
+
# load mkscore get common gene
|
79
|
+
# %%
|
80
|
+
# load bim
|
81
|
+
def load_bim(bfile_root, chrom):
    """
    Read ``{bfile_root}.{chrom}.bim`` and expose it both as a table and as ranges.

    :param bfile_root: path prefix of the per-chromosome PLINK files.
    :param chrom: chromosome label inserted into the file name.
    :return: ``(bim, bim_pr)`` - the raw table with columns
        CHR/SNP/CM/BP/A1/A2, and a PyRanges object in which every SNP is the
        interval ``Start == End == BP`` on chromosome ``chr{chrom}``.
    """
    print("Loading bim data")
    bim_path = f'{bfile_root}.{chrom}.bim'
    bim = pd.read_csv(bim_path, sep='\t', header=None)
    bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]

    # PyRanges needs Chromosome/Start/End columns
    as_ranges = bim.rename(columns={"CHR": "Chromosome", "BP": "Start"})
    as_ranges['End'] = as_ranges['Start']
    bim_pr = pr.PyRanges(as_ranges)
    # Use the "chr"-prefixed chromosome label for every SNP
    bim_pr.Chromosome = f'chr{chrom}'
    return bim, bim_pr
|
96
|
+
|
97
|
+
|
98
|
+
# %%
|
99
|
+
def Overlaps_gtf_bim(gtf_pr, bim_pr):
    """
    Assign every SNP that falls inside a gene window to a single gene.

    ``gtf_pr`` is joined with ``bim_pr``; for each SNP the joined row with
    the smallest ``|Start_b - TSS|`` is kept.

    :param gtf_pr: windowed gene ranges (must carry a ``TSS`` column).
    :param bim_pr: SNP ranges (must carry a ``SNP`` column).
    :return: DataFrame with one row per overlapping SNP, including the added
        ``Distance`` column.
    """
    hits = gtf_pr.join(bim_pr).df
    hits['Distance'] = (hits['Start_b'] - hits['TSS']).abs()
    # Row label of the closest gene for each SNP
    closest_rows = hits.groupby('SNP').Distance.idxmin()
    return hits.loc[closest_rows]
|
110
|
+
|
111
|
+
|
112
|
+
# %%
|
113
|
+
|
114
|
+
|
115
|
+
# %%
|
116
|
+
def get_snp_pass_maf(bfile_root, chrom, maf_min=0.05):
    """
    List the SNPs of one chromosome whose ``maf`` value exceeds ``maf_min``.

    The ``.bim``/``.fam``/``.bed`` files at ``{bfile_root}.{chrom}`` are
    opened through ``ID_List_Factory`` and ``PlinkBEDFileWithR2Cache``; the
    ``maf`` attribute of the resulting genotype object drives the filter.

    :param bfile_root: path prefix of the per-chromosome PLINK files.
    :param chrom: chromosome label inserted into the file names.
    :param maf_min: exclusive lower bound on ``maf``.
    :return: list of SNP identifiers that pass the filter.
    """
    bim_reader = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
    fam_reader = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])

    prefix = f'{bfile_root}.{chrom}'
    array_snps = bim_reader(prefix + '.bim')

    # The number of individuals is needed to open the bed file
    n_individuals = len(fam_reader(prefix + '.fam').IDList)
    geno_array = PlinkBEDFileWithR2Cache(prefix + '.bed', n_individuals, array_snps,
                                         keep_snps=None, keep_indivs=None, mafMin=None)

    snp_pass_maf = array_snps.IDList[geno_array.maf > maf_min]
    print(f'After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain.')
    return snp_pass_maf.SNP.to_list()
|
139
|
+
|
140
|
+
|
141
|
+
def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit='CM'):
    """
    Compute LD scores of ``annot_matrix`` on one chromosome.

    :param bfile_root: path prefix of the per-chromosome PLINK files.
    :param chrom: chromosome label inserted into the file names.
    :param annot_matrix: annotation array forwarded as ``annot`` to
        ``ldScoreVarBlocks``.
    :param ld_wind: window size, interpreted according to ``ld_unit``.
    :param ld_unit: ``'SNP'`` (window counted in SNP indices), ``'KB'``
        (``ld_wind * 1000`` on the BP column) or ``'CM'`` (CM column).
    :return: DataFrame wrapping the result of ``ldScoreVarBlocks``.
    :raises ValueError: if ``ld_unit`` is none of the three values above.
    """
    bim_reader = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
    fam_reader = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])
    prefix = f'{bfile_root}.{chrom}'

    snp_file = prefix + '.bim'
    array_snps = bim_reader(snp_file)
    print(f'Read list of {len(array_snps.IDList)} SNPs from {snp_file}')

    ind_file = prefix + '.fam'
    n = len(fam_reader(ind_file).IDList)
    print(f'Read list of {n} individuals from {ind_file}')

    geno_array = PlinkBEDFileWithR2Cache(prefix + '.bed', n, array_snps,
                                         keep_snps=None, keep_indivs=None, mafMin=None)

    # Pick the coordinate system in which the LD window is measured
    if ld_unit == 'SNP':
        max_dist = ld_wind
        coords = np.array(range(geno_array.m))
    elif ld_unit in ('KB', 'CM'):
        in_kb = ld_unit == 'KB'
        max_dist = ld_wind * 1000 if in_kb else ld_wind
        coords = np.array(array_snps.df['BP' if in_kb else 'CM'])[geno_array.kept_snps]
    else:
        raise ValueError(f'Invalid ld_wind_unit: {ld_unit}')

    block_left = getBlockLefts(coords, max_dist)
    # 100 is the second positional argument of ldScoreVarBlocks
    # (presumably its chunk size - confirm in generate_r2_matrix).
    return pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 100, annot=annot_matrix))
|
174
|
+
|
175
|
+
|
176
|
+
# %%
|
177
|
+
def calculate_ldscore_from_annotation(SNP_annotation_df, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
    """
    Compute LD scores for a single annotation table.

    The values of ``SNP_annotation_df`` are passed to ``get_ldscore``; the
    result is cast to float32 and relabelled with the index and columns of
    ``SNP_annotation_df``.

    :param SNP_annotation_df: annotation DataFrame (rows labelled by SNP).
    :param chrom: chromosome label.
    :param bfile_root: path prefix of the per-chromosome PLINK files.
    :param ld_wind: LD window size.
    :param ld_unit: unit of ``ld_wind`` (``'SNP'``, ``'KB'`` or ``'CM'``).
    :return: float32 DataFrame with the same labels as ``SNP_annotation_df``.
    """
    ld_scores = get_ldscore(
        bfile_root,
        chrom,
        SNP_annotation_df.values,
        ld_wind=ld_wind,
        ld_unit=ld_unit,
    ).astype(np.float32, copy=False)
    ld_scores.index = SNP_annotation_df.index
    ld_scores.columns = SNP_annotation_df.columns
    return ld_scores
|
189
|
+
|
190
|
+
|
191
|
+
def calculate_ldscore_from_multiple_annotation(SNP_annotation_df_list, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
    """
    Compute LD scores for several annotation tables in one pass.

    The tables are concatenated column-wise, scored with a single
    ``get_ldscore`` call, and the float32 result is cut back into one
    DataFrame per input table, in the input order.

    :param SNP_annotation_df_list: list of annotation DataFrames.
    :param chrom: chromosome label.
    :param bfile_root: path prefix of the per-chromosome PLINK files.
    :param ld_wind: LD window size.
    :param ld_unit: unit of ``ld_wind`` (``'SNP'``, ``'KB'`` or ``'CM'``).
    :return: list of DataFrames, one per input table.
    """
    combined = pd.concat(SNP_annotation_df_list, axis=1)

    ld_scores = get_ldscore(bfile_root, chrom, combined.values, ld_wind=ld_wind, ld_unit=ld_unit)
    ld_scores = ld_scores.astype(np.float32, copy=False)
    ld_scores.index = combined.index
    ld_scores.columns = combined.columns

    # Hand back the column slice that belongs to each input table
    per_annotation = []
    offset = 0
    for annotation_df in SNP_annotation_df_list:
        width = len(annotation_df.columns)
        per_annotation.append(ld_scores.iloc[:, offset:offset + width])
        offset += width
    return per_annotation
|
208
|
+
|
209
|
+
|
210
|
+
# %%
|
211
|
+
class S_LDSC_Boost:
    """
    Generate per-chromosome LD-score and M files from marker scores.

    For each chromosome, every SNP is assigned to at most one gene (via gene
    windows from the GTF, enhancer regions, or both), the one-hot SNP-gene
    matrix is turned into an LD-score weight matrix, and that matrix is
    multiplied with chunks of the marker-score table. Results are written
    below ``config.ldscore_save_dir``.
    """

    def __init__(self, config: "GenerateLDScoreConfig"):
        """
        Load the marker scores, the gene windows and (optionally) the enhancers.

        :param config: run configuration; the attributes read here are
            ``mkscore_feather_file``, ``gtf_annotation_file``,
            ``gene_window_size`` and ``enhancer_annotation_file``.
        """
        self.config = config

        self.mk_score = load_marker_score(config.mkscore_feather_file)

        # Load GTF and get common markers
        self.gtf_pr, self.mk_score_common = load_gtf(config.gtf_annotation_file, self.mk_score,
                                                     window_size=config.gene_window_size)

        # Load enhancer
        if config.enhancer_annotation_file is not None:
            enhancer_df = pr.read_bed(config.enhancer_annotation_file, as_df=True)
            enhancer_df.set_index('Name', inplace=True)
            enhancer_df.index.name = 'gene_name'

            # keep the common genes and add the average marker score
            avg_mkscore = pd.DataFrame(self.mk_score_common.mean(axis=1), columns=['avg_mkscore'])
            enhancer_df = enhancer_df.join(avg_mkscore, how='inner', on='gene_name', )

            # add the TSS of the linked gene (used for the nearest_TSS strategy)
            enhancer_df['TSS'] = self.gtf_pr.df.set_index('gene_name').reindex(enhancer_df.index)['TSS']

            # convert to pyranges
            self.enhancer_pr = pr.PyRanges(enhancer_df.reset_index())

        else:
            self.enhancer_pr = None

    def process_chromosome(self, chrom: int):
        """
        Run the whole pipeline for one chromosome and write all output files.

        :param chrom: chromosome label used in the input and output file names.
        :raises FileNotFoundError: if an additional baseline annotation
            directory is configured but the file for ``chrom`` is missing.
        """
        self.snp_pass_maf = get_snp_pass_maf(self.config.bfile_root, chrom, maf_min=0.05)

        # Get SNP-Gene dummy pairs
        self.snp_gene_pair_dummy = self.get_snp_gene_dummy(chrom, )

        if self.config.keep_snp_root is not None:
            keep_snp = pd.read_csv(f'{self.config.keep_snp_root}.{chrom}.snp', header=None)[0].to_list()
            self.keep_snp_mask = self.snp_gene_pair_dummy.index.isin(keep_snp)
            # names of the SNPs that are kept
            self.snp_name = self.snp_gene_pair_dummy.index[self.keep_snp_mask].to_list()
        else:
            self.keep_snp_mask = None
            self.snp_name = self.snp_gene_pair_dummy.index.to_list()

        if self.config.additional_baseline_annotation_dir_path is not None:
            additional_baseline_annotation_dir_path = Path(self.config.additional_baseline_annotation_dir_path)
            additional_baseline_annotation_file_path = additional_baseline_annotation_dir_path / f'baseline.{chrom}.annot.gz'
            # Explicit check instead of `assert`, which is removed under `python -O`
            if not additional_baseline_annotation_file_path.exists():
                raise FileNotFoundError(
                    f'additional_baseline_annotation_file_path not exists: {additional_baseline_annotation_file_path}')
            additional_baseline_annotation_df = pd.read_csv(additional_baseline_annotation_file_path, sep='\t')
            additional_baseline_annotation_df.set_index('SNP', inplace=True)

            # drop the coordinate columns (CHR, BP, CM) if they exist
            additional_baseline_annotation_df.drop(['CHR', 'BP', 'CM'], axis=1, inplace=True, errors='ignore')

            # align with the reference panel; SNPs missing from the annotation get 0
            num_of_not_exist_snp = (~self.snp_gene_pair_dummy.index.isin(additional_baseline_annotation_df.index)).sum()
            if num_of_not_exist_snp > 0:
                logger.warning(
                    f'{num_of_not_exist_snp} SNPs not in additional_baseline_annotation_df but in the reference panel, so the additional baseline annotation of these SNP will set to 0')
            # fill_value only matters when labels are missing, so one call covers both cases
            additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
                self.snp_gene_pair_dummy.index,
                fill_value=0)

            # score both annotation sets in one call so r2 is only calculated once
            self.snp_gene_weight_matrix, additional_baseline_annotation_ldscore = (
                calculate_ldscore_from_multiple_annotation(
                    [self.snp_gene_pair_dummy, additional_baseline_annotation_df],
                    chrom,
                    self.config.bfile_root,
                    ld_wind=self.config.ld_wind,
                    ld_unit=self.config.ld_unit))

            ld_score_file = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather'
            M_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M'
            M_5_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50'

            # save additional baseline annotation ldscore
            self.save_ldscore(additional_baseline_annotation_ldscore.values,
                              column_names=additional_baseline_annotation_ldscore.columns,
                              save_file_name=ld_score_file,
                              )

            # calculate the M values (column sums over all / MAF-passing SNPs) and save
            save_dir = Path(M_file_path).parent
            save_dir.mkdir(parents=True, exist_ok=True)
            M_chr_chunk = additional_baseline_annotation_df.values.sum(axis=0, keepdims=True)
            M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(axis=0, keepdims=True)
            np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
            np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )

        else:
            # Calculate SNP-Gene weight matrix
            self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(self.snp_gene_pair_dummy, chrom,
                                                                            self.config.bfile_root,
                                                                            ld_wind=self.config.ld_wind,
                                                                            ld_unit=self.config.ld_unit)
        # convert to sparse
        self.snp_gene_weight_matrix = csr_matrix(self.snp_gene_weight_matrix)

        # calculate baseline ld score
        self.calculate_ldscore_for_base_line(chrom, self.config.sample_name, self.config.ldscore_save_dir)

        # calculate ld score for annotation; the last dummy column is the
        # NaN ("no gene") column and has no marker score, hence [:-1]
        self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
            self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]],
            chrom,
            self.config.sample_name,
            self.config.ldscore_save_dir,
        )

    def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(self,
                                                              mk_score_chunk,
                                                              save_file_name,
                                                              drop_dummy_na=True,
                                                              ):
        """
        Multiply the SNP-gene weight matrix with a marker-score chunk and save it.

        :param mk_score_chunk: DataFrame of marker scores (genes x columns).
        :param save_file_name: path of the feather file to write.
        :param drop_dummy_na: when true, the last column of the weight matrix
            (the NaN dummy column) is left out of the product.
        """
        if drop_dummy_na:
            ldscore_chr_chunk = self.snp_gene_weight_matrix[:, :-1] @ mk_score_chunk
        else:
            ldscore_chr_chunk = self.snp_gene_weight_matrix @ mk_score_chunk

        self.save_ldscore(ldscore_chr_chunk,
                          column_names=mk_score_chunk.columns,
                          save_file_name=save_file_name,
                          )

    @staticmethod
    def _to_float16(values):
        """Return ``values`` as a new float16 array, saturating at the float16 limits."""
        limit = float(np.finfo(np.float16).max)
        # Clip before casting so that out-of-range values keep their sign
        # instead of overflowing to +/-inf, and the input is never modified.
        return np.clip(values, -limit, limit).astype(np.float16)

    def save_ldscore(self, ldscore_chr_chunk: np.ndarray, column_names, save_file_name):
        """
        Write an LD-score array to a feather file with a leading ``SNP`` column.

        Values are stored as float16; values outside the float16 range are
        saturated at the nearest float16 limit. When ``config.keep_snp_root``
        is set, only the rows selected by ``self.keep_snp_mask`` are written.

        :param ldscore_chr_chunk: array with one row per SNP.
        :param column_names: column labels of the output table.
        :param save_file_name: path of the feather file; parent directories
            are created if needed.
        """
        save_dir = Path(save_file_name).parent
        save_dir.mkdir(parents=True, exist_ok=True)

        ldscore_chr_chunk = self._to_float16(ldscore_chr_chunk)
        if self.config.keep_snp_root is not None:
            ldscore_chr_chunk = ldscore_chr_chunk[self.keep_snp_mask]

        # save for each chunk
        df = pd.DataFrame(ldscore_chr_chunk,
                          index=self.snp_name,
                          columns=column_names,
                          )
        df.index.name = 'SNP'
        df.reset_index().to_feather(save_file_name)

    def calculate_M_use_SNP_gene_pair_dummy_by_chunk(self,
                                                     mk_score_chunk,
                                                     M_file_path, M_5_file_path,
                                                     drop_dummy_na=True,
                                                     ):
        '''
        Calculate M from the per-gene SNP counts and ``mk_score_chunk`` and save it.

        Two files are written: ``M_file_path`` uses the counts over all SNPs,
        ``M_5_file_path`` the counts over the SNPs in ``self.snp_pass_maf``.

        :param mk_score_chunk: DataFrame of marker scores (genes x columns).
        :param M_file_path: output path for M.
        :param M_5_file_path: output path for the MAF-filtered M.
        :param drop_dummy_na: when true, the last (NaN dummy) column of the
            counts is left out of the product.
        '''
        SNP_gene_pair_dummy_sumed_along_snp_axis = self.snp_gene_pair_dummy.values.sum(axis=0, keepdims=True)
        SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf = self.snp_gene_pair_dummy.loc[self.snp_pass_maf].values.sum(
            axis=0,
            keepdims=True)
        if drop_dummy_na:
            SNP_gene_pair_dummy_sumed_along_snp_axis = SNP_gene_pair_dummy_sumed_along_snp_axis[:, :-1]
            SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf = SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf[:,
                                                                :-1]
        save_dir = Path(M_file_path).parent
        save_dir.mkdir(parents=True, exist_ok=True)
        M_chr_chunk = SNP_gene_pair_dummy_sumed_along_snp_axis @ mk_score_chunk
        M_5_chr_chunk = SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf @ mk_score_chunk
        np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
        np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )

    def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(self, mk_score_common, chrom, sample_name, save_dir):
        """
        Calculate and save LD scores and M files chunk by chunk for one chromosome.

        The columns of ``mk_score_common`` are processed in slices of
        ``config.spots_per_chunk``; slice ``k`` (counting from 1) is written
        to ``{save_dir}/{sample_name}_chunk{k}/``.

        :param mk_score_common: marker scores (genes x columns).
        :param chrom: chromosome label used in the file names.
        :param sample_name: sample label used in the directory and file names.
        :param save_dir: root output directory.
        """
        chunk_starts = trange(0, mk_score_common.shape[1], self.config.spots_per_chunk,
                              desc=f'Calculating LD score by chunk for chr{chrom}')
        for chunk_index, i in enumerate(chunk_starts, start=1):
            mk_score_chunk = mk_score_common.iloc[:, i:i + self.config.spots_per_chunk]

            ld_score_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.ldscore.feather'
            M_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M'
            M_5_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50'

            self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
                mk_score_chunk,
                save_file_name=ld_score_file,
                drop_dummy_na=True,
            )
            self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
                mk_score_chunk,
                M_file,
                M_5_file,
                drop_dummy_na=True,
            )

    def calculate_ldscore_for_base_line(self, chrom, sample_name, save_dir):
        """
        Save the baseline LD scores and M files for one chromosome.

        Two baseline columns are produced from an all-ones score table:
        ``base`` weights every dummy column (including the NaN column) by 1,
        ``all_gene`` sets the weight of the last (NaN) column to 0.

        :param chrom: chromosome label used in the file names.
        :param sample_name: unused here; kept for a uniform call signature.
        :param save_dir: root output directory.
        """
        # save baseline ld score
        baseline_mk_score = np.ones((self.snp_gene_pair_dummy.shape[1], 2))
        baseline_mk_score[-1, 0] = 0  # all_gene
        baseline_mk_score_df = pd.DataFrame(baseline_mk_score, index=self.snp_gene_pair_dummy.columns,
                                            columns=['all_gene', 'base'])
        ld_score_file = f'{save_dir}/baseline/baseline.{chrom}.l2.ldscore.feather'
        M_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M'
        M_5_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M_5_50'

        self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
            baseline_mk_score_df,
            save_file_name=ld_score_file,
            drop_dummy_na=False,
        )
        # save baseline M
        self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
            baseline_mk_score_df,
            M_file,
            M_5_file,
            drop_dummy_na=False,
        )

    def get_snp_gene_dummy(self, chrom, ):
        """
        Get the dummy (one-hot) matrix of SNP-gene pairs for one chromosome.

        ``config.gene_window_enhancer_priority`` selects the source of the
        assignment: ``None`` (gene windows only), ``'enhancer_only'``,
        ``'gene_window_first'`` or ``'enhancer_first'`` (use the first source
        and fall back to the other for SNPs it leaves unassigned).

        :param chrom: chromosome label.
        :return: DataFrame from ``pd.get_dummies(..., dummy_na=True)``; the
            last column marks SNPs without a gene.
        :raises ValueError: if the priority is none of the values above.
        """
        priority = self.config.gene_window_enhancer_priority
        # Validate before any file is read so a bad setting fails fast
        if priority not in (None, 'enhancer_only', 'gene_window_first', 'enhancer_first'):
            raise ValueError(
                f'Invalid self.config.gene_window_enhancer_priority: {priority}')

        # Load the bim file
        bim, bim_pr = load_bim(self.config.bfile_root, chrom)

        if priority is None:  # use gtf only
            SNP_gene_pair = self.get_SNP_gene_pair_from_gtf(bim, bim_pr, )

        elif priority == 'enhancer_only':
            SNP_gene_pair = self.get_SNP_gene_pair_from_enhancer(bim, bim_pr, )

        else:
            SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(bim, bim_pr, )
            SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(bim, bim_pr, )

            if priority == 'gene_window_first':
                SNP_gene_pair, fallback = SNP_gene_pair_gtf, SNP_gene_pair_enhancer
            else:  # 'enhancer_first'
                SNP_gene_pair, fallback = SNP_gene_pair_enhancer, SNP_gene_pair_gtf

            # SNPs the preferred source left unassigned take the other source's gene
            unassigned = SNP_gene_pair.gene_name.isna()
            SNP_gene_pair.loc[unassigned, 'gene_name'] = fallback.loc[unassigned, 'gene_name']

        # Get the dummy matrix
        SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair['gene_name'], dummy_na=True)
        return SNP_gene_pair_dummy

    def get_SNP_gene_pair_from_gtf(self, bim, bim_pr):
        """
        Assign SNPs to genes through the gene windows.

        :param bim: bim table (columns CHR, BP, SNP, CM are used).
        :param bim_pr: the same SNPs as PyRanges.
        :return: DataFrame indexed by SNP with one row per SNP in ``bim``;
            ``gene_name`` is NaN for SNPs outside every window.
        """
        logger.info(
            "Get SNP-gene pair from gtf, if a SNP is in multiple genes, it will be assigned to the most nearby gene (TSS)")
        overlaps_small = Overlaps_gtf_bim(self.gtf_pr, bim_pr)
        # Get the SNP-gene pair
        annot = bim[["CHR", "BP", "SNP", "CM"]]
        SNP_gene_pair = overlaps_small[['SNP', 'gene_name']].set_index('SNP').join(annot.set_index('SNP'), how='right')
        return SNP_gene_pair

    def get_SNP_gene_pair_from_enhancer(self, bim, bim_pr, ):
        """
        Assign SNPs to genes through the enhancer regions.

        A SNP overlapping enhancers of several genes is resolved according to
        ``config.snp_multiple_enhancer_strategy``: ``'max_mkscore'`` keeps
        the gene with the highest average marker score, ``'nearest_TSS'`` the
        gene whose TSS is closest to the SNP.

        :param bim: bim table (columns CHR, BP, SNP, CM are used).
        :param bim_pr: the same SNPs as PyRanges.
        :return: DataFrame indexed by SNP with one row per SNP in ``bim``;
            ``gene_name`` is NaN for SNPs outside every enhancer.
        :raises ValueError: if no enhancer annotation was loaded or the
            strategy is unknown.
        """
        if self.enhancer_pr is None:
            raise ValueError(
                'An enhancer-based SNP-gene assignment was requested but no enhancer_annotation_file was provided')
        logger.info(
            "Get SNP-gene pair from enhancer, if a SNP is in multiple genes, it will be assigned to the gene with highest marker score")
        # Get the SNP-gene pair
        overlaps_small = self.enhancer_pr.join(bim_pr).df
        annot = bim[["CHR", "BP", "SNP", "CM"]]
        strategy = self.config.snp_multiple_enhancer_strategy
        if strategy == 'max_mkscore':
            logger.debug('select the gene with highest marker score')
            overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').avg_mkscore.idxmax()]

        elif strategy == 'nearest_TSS':
            logger.debug('select the gene with nearest TSS')
            overlaps_small['Distance'] = np.abs(overlaps_small['Start_b'] - overlaps_small['TSS'])
            overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').Distance.idxmin()]

        else:
            # Without de-duplication a SNP could appear several times in the result
            raise ValueError(f'Invalid self.config.snp_multiple_enhancer_strategy: {strategy}')

        SNP_gene_pair = overlaps_small[['SNP', 'gene_name']].set_index('SNP').join(annot.set_index('SNP'), how='right')

        return SNP_gene_pair
|
502
|
+
|
503
|
+
|
504
|
+
def run_generate_ldscore(config: GenerateLDScoreConfig):
    """
    Generate LD scores for the chromosome(s) selected in ``config``.

    ``config.chrom == 'all'`` processes chromosomes 1 to 22 in order; any
    other value is passed to ``process_chromosome`` as a single chromosome.

    :param config: run configuration used to build ``S_LDSC_Boost``.
    """
    s_ldsc_boost = S_LDSC_Boost(config)
    chromosomes = range(1, 23) if config.chrom == 'all' else [config.chrom]
    for chrom in chromosomes:
        s_ldsc_boost.process_chromosome(chrom)
|
511
|
+
|
512
|
+
|
513
|
+
# %%
|
514
|
+
if __name__ == '__main__':
    # Developer-only switch. It must stay False in released code: when True
    # the script ignores the command line and runs against the hard-coded
    # paths below, which only exist on the original development cluster.
    TEST = False
    if TEST:
        # %%
        sample_name = 'Cortex_151507'
        chrom = 'all'
        save_dir = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/Cortex_151507/snp_annotation/test/0101/sparse'
        # %%
        gtf_file = '/storage/yangjianLab/songliyang/ReferenceGenome/GRCh37/gencode.v39lift37.annotation.gtf'
        mkscore_feather_file = f'/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/{sample_name}/gene_markers/{sample_name}_rank.feather'
        bfile_root = '/storage/yangjianLab/sharedata/LDSC_resource/1000G_EUR_Phase3_plink/1000G.EUR.QC'
        window_size = 50000
        keep_snp_root = '/storage/yangjianLab/sharedata/LDSC_resource/hapmap3_snps/hm'
        spots_per_chunk = 10_000
        enhancer_annotation = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/resource/epigenome/cleaned_data/by_tissue/BRN/ABC_roadmap_merged.bed'
        # %%
        config = GenerateLDScoreConfig(
            sample_name=sample_name,
            chrom=chrom,
            ldscore_save_dir=save_dir,
            gtf_annotation_file=gtf_file,
            mkscore_feather_file=mkscore_feather_file,
            bfile_root=bfile_root,
            keep_snp_root=keep_snp_root,
            gene_window_size=window_size,
            spots_per_chunk=spots_per_chunk,
            enhancer_annotation_file=enhancer_annotation,
            gene_window_enhancer_priority='enhancer_first',
            additional_baseline_annotation_dir_path='/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/resource/ldsc/baseline_v1.2/remove_base'
        )
        # %%
        run_generate_ldscore(config)
    else:
        # Normal entry point: build the configuration from command-line arguments
        parser = argparse.ArgumentParser(description="Configuration for the application.")
        add_generate_ldscore_args(parser)
        args = parser.parse_args()
        config = GenerateLDScoreConfig(**vars(args))
        run_generate_ldscore(config)
|