gsMap 1.71.2__py3-none-any.whl → 1.72.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN/adjacency_matrix.py +25 -27
- gsMap/GNN/model.py +9 -7
- gsMap/GNN/train.py +8 -11
- gsMap/__init__.py +3 -3
- gsMap/__main__.py +3 -2
- gsMap/cauchy_combination_test.py +75 -72
- gsMap/config.py +822 -316
- gsMap/create_slice_mean.py +154 -0
- gsMap/diagnosis.py +179 -101
- gsMap/find_latent_representation.py +28 -26
- gsMap/format_sumstats.py +233 -201
- gsMap/generate_ldscore.py +353 -209
- gsMap/latent_to_gene.py +92 -60
- gsMap/main.py +23 -14
- gsMap/report.py +39 -25
- gsMap/run_all_mode.py +86 -46
- gsMap/setup.py +1 -1
- gsMap/spatial_ldsc_multiple_sumstats.py +154 -80
- gsMap/utils/generate_r2_matrix.py +173 -140
- gsMap/utils/jackknife.py +84 -80
- gsMap/utils/manhattan_plot.py +180 -207
- gsMap/utils/regression_read.py +105 -122
- gsMap/visualize.py +82 -64
- {gsmap-1.71.2.dist-info → gsmap-1.72.3.dist-info}/METADATA +21 -6
- gsmap-1.72.3.dist-info/RECORD +31 -0
- {gsmap-1.71.2.dist-info → gsmap-1.72.3.dist-info}/WHEEL +1 -1
- gsMap/utils/make_annotations.py +0 -518
- gsmap-1.71.2.dist-info/RECORD +0 -31
- {gsmap-1.71.2.dist-info → gsmap-1.72.3.dist-info}/LICENSE +0 -0
- {gsmap-1.71.2.dist-info → gsmap-1.72.3.dist-info}/entry_points.txt +0 -0
gsMap/generate_ldscore.py
CHANGED
@@ -10,7 +10,7 @@ from scipy.sparse import csr_matrix
|
|
10
10
|
from tqdm import trange
|
11
11
|
|
12
12
|
from gsMap.config import GenerateLDScoreConfig
|
13
|
-
from gsMap.utils.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts
|
13
|
+
from gsMap.utils.generate_r2_matrix import ID_List_Factory, PlinkBEDFileWithR2Cache, getBlockLefts
|
14
14
|
|
15
15
|
warnings.filterwarnings("ignore", category=FutureWarning)
|
16
16
|
logger = logging.getLogger(__name__)
|
@@ -25,34 +25,36 @@ def load_gtf(gtf_file, mk_score, window_size):
|
|
25
25
|
print("Loading gtf data")
|
26
26
|
#
|
27
27
|
# Load GTF file
|
28
|
-
gtf = pr.read_gtf(
|
28
|
+
gtf = pr.read_gtf(
|
29
|
+
gtf_file,
|
30
|
+
)
|
29
31
|
gtf = gtf.df
|
30
32
|
#
|
31
33
|
# Select the common genes
|
32
|
-
gtf = gtf[gtf[
|
34
|
+
gtf = gtf[gtf["Feature"] == "gene"]
|
33
35
|
common_gene = np.intersect1d(mk_score.index, gtf.gene_name)
|
34
36
|
#
|
35
37
|
gtf = gtf[gtf.gene_name.isin(common_gene)]
|
36
38
|
mk_score = mk_score[mk_score.index.isin(common_gene)]
|
37
39
|
#
|
38
40
|
# Remove duplicated lines
|
39
|
-
gtf = gtf.drop_duplicates(subset=
|
41
|
+
gtf = gtf.drop_duplicates(subset="gene_name", keep="first")
|
40
42
|
#
|
41
43
|
# Process the GTF (open 100-KB window: Tss - Ted)
|
42
|
-
gtf_bed = gtf[[
|
43
|
-
gtf_bed.loc[:,
|
44
|
-
gtf_bed.loc[:,
|
44
|
+
gtf_bed = gtf[["Chromosome", "Start", "End", "gene_name", "Strand"]].copy()
|
45
|
+
gtf_bed.loc[:, "TSS"] = gtf_bed["Start"]
|
46
|
+
gtf_bed.loc[:, "TED"] = gtf_bed["End"]
|
45
47
|
|
46
|
-
gtf_bed.loc[:,
|
47
|
-
gtf_bed.loc[:,
|
48
|
-
gtf_bed.loc[gtf_bed[
|
48
|
+
gtf_bed.loc[:, "Start"] = gtf_bed["TSS"] - window_size
|
49
|
+
gtf_bed.loc[:, "End"] = gtf_bed["TED"] + window_size
|
50
|
+
gtf_bed.loc[gtf_bed["Start"] < 0, "Start"] = 0
|
49
51
|
#
|
50
52
|
# Correct the negative strand
|
51
|
-
tss_neg = gtf_bed.loc[gtf_bed[
|
52
|
-
ted_neg = gtf_bed.loc[gtf_bed[
|
53
|
-
gtf_bed.loc[gtf_bed[
|
54
|
-
gtf_bed.loc[gtf_bed[
|
55
|
-
gtf_bed = gtf_bed.drop(
|
53
|
+
tss_neg = gtf_bed.loc[gtf_bed["Strand"] == "-", "TSS"]
|
54
|
+
ted_neg = gtf_bed.loc[gtf_bed["Strand"] == "-", "TED"]
|
55
|
+
gtf_bed.loc[gtf_bed["Strand"] == "-", "TSS"] = ted_neg
|
56
|
+
gtf_bed.loc[gtf_bed["Strand"] == "-", "TED"] = tss_neg
|
57
|
+
gtf_bed = gtf_bed.drop("Strand", axis=1)
|
56
58
|
#
|
57
59
|
# Transform the GTF to PyRanges
|
58
60
|
gtf_pr = pr.PyRanges(gtf_bed)
|
@@ -64,7 +66,7 @@ def load_marker_score(mk_score_file):
|
|
64
66
|
"""
|
65
67
|
Load marker scores of each cell.
|
66
68
|
"""
|
67
|
-
mk_score = pd.read_feather(mk_score_file).set_index(
|
69
|
+
mk_score = pd.read_feather(mk_score_file).set_index("HUMAN_GENE_SYM").rename_axis("gene_name")
|
68
70
|
mk_score = mk_score.astype(np.float32, copy=False)
|
69
71
|
return mk_score
|
70
72
|
|
@@ -77,18 +79,18 @@ def load_bim(bfile_root, chrom):
|
|
77
79
|
"""
|
78
80
|
Load the bim file.
|
79
81
|
"""
|
80
|
-
bim = pd.read_csv(f
|
82
|
+
bim = pd.read_csv(f"{bfile_root}.{chrom}.bim", sep="\t", header=None)
|
81
83
|
bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]
|
82
84
|
#
|
83
85
|
# Transform bim to PyRanges
|
84
86
|
bim_pr = bim.copy()
|
85
87
|
bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
|
86
88
|
|
87
|
-
bim_pr[
|
88
|
-
bim_pr[
|
89
|
+
bim_pr["End"] = bim_pr["Start"].copy()
|
90
|
+
bim_pr["Start"] = bim_pr["Start"] - 1 # Due to bim file is 1-based
|
89
91
|
|
90
92
|
bim_pr = pr.PyRanges(bim_pr)
|
91
|
-
bim_pr.Chromosome = f
|
93
|
+
bim_pr.Chromosome = f"chr{chrom}"
|
92
94
|
return bim, bim_pr
|
93
95
|
|
94
96
|
|
@@ -100,9 +102,9 @@ def Overlaps_gtf_bim(gtf_pr, bim_pr):
|
|
100
102
|
# Select the overlapped regions (SNPs in gene windows)
|
101
103
|
overlaps = gtf_pr.join(bim_pr)
|
102
104
|
overlaps = overlaps.df
|
103
|
-
overlaps[
|
105
|
+
overlaps["Distance"] = np.abs(overlaps["Start_b"] - overlaps["TSS"])
|
104
106
|
overlaps_small = overlaps.copy()
|
105
|
-
overlaps_small = overlaps_small.loc[overlaps_small.groupby(
|
107
|
+
overlaps_small = overlaps_small.loc[overlaps_small.groupby("SNP").Distance.idxmin()]
|
106
108
|
return overlaps_small
|
107
109
|
|
108
110
|
|
@@ -110,7 +112,7 @@ def Overlaps_gtf_bim(gtf_pr, bim_pr):
|
|
110
112
|
def filter_snps_by_keep_snp(bim_df, keep_snp_file):
|
111
113
|
# Load the keep_snp file and filter the BIM DataFrame
|
112
114
|
keep_snp = pd.read_csv(keep_snp_file, header=None)[0].to_list()
|
113
|
-
filtered_bim_df = bim_df[bim_df[
|
115
|
+
filtered_bim_df = bim_df[bim_df["SNP"].isin(keep_snp)]
|
114
116
|
return filtered_bim_df
|
115
117
|
|
116
118
|
|
@@ -122,7 +124,7 @@ def get_snp_counts(config):
|
|
122
124
|
bim_df, _ = load_bim(config.bfile_root, chrom)
|
123
125
|
|
124
126
|
if config.keep_snp_root:
|
125
|
-
keep_snp_file = f
|
127
|
+
keep_snp_file = f"{config.keep_snp_root}.{chrom}.snp"
|
126
128
|
filtered_bim_df = filter_snps_by_keep_snp(bim_df, keep_snp_file)
|
127
129
|
else:
|
128
130
|
filtered_bim_df = bim_df
|
@@ -130,11 +132,11 @@ def get_snp_counts(config):
|
|
130
132
|
snp_counts[chrom] = filtered_bim_df.shape[0]
|
131
133
|
total_snp += snp_counts[chrom]
|
132
134
|
|
133
|
-
snp_counts[
|
135
|
+
snp_counts["total"] = total_snp
|
134
136
|
|
135
137
|
chrom_snp_length_array = np.array([snp_counts[chrom] for chrom in range(1, 23)]).cumsum()
|
136
138
|
|
137
|
-
snp_counts[
|
139
|
+
snp_counts["chrom_snp_start_point"] = [0] + chrom_snp_length_array.tolist()
|
138
140
|
|
139
141
|
return snp_counts
|
140
142
|
|
@@ -145,55 +147,63 @@ def get_snp_pass_maf(bfile_root, chrom, maf_min=0.05):
|
|
145
147
|
Get the dummy matrix of SNP-gene pairs.
|
146
148
|
"""
|
147
149
|
# Load the bim file
|
148
|
-
PlinkBIMFile = ID_List_Factory(
|
149
|
-
|
150
|
+
PlinkBIMFile = ID_List_Factory(
|
151
|
+
["CHR", "SNP", "CM", "BP", "A1", "A2"], 1, ".bim", usecols=[0, 1, 2, 3, 4, 5]
|
152
|
+
)
|
153
|
+
PlinkFAMFile = ID_List_Factory(["IID"], 0, ".fam", usecols=[1])
|
150
154
|
|
151
|
-
bfile = f
|
152
|
-
snp_file, snp_obj = bfile +
|
155
|
+
bfile = f"{bfile_root}.{chrom}"
|
156
|
+
snp_file, snp_obj = bfile + ".bim", PlinkBIMFile
|
153
157
|
array_snps = snp_obj(snp_file)
|
154
|
-
m = len(array_snps.IDList)
|
158
|
+
# m = len(array_snps.IDList)
|
155
159
|
|
156
160
|
# Load fam
|
157
|
-
ind_file, ind_obj = bfile +
|
161
|
+
ind_file, ind_obj = bfile + ".fam", PlinkFAMFile
|
158
162
|
array_indivs = ind_obj(ind_file)
|
159
163
|
n = len(array_indivs.IDList)
|
160
|
-
array_file, array_obj = bfile +
|
161
|
-
geno_array = array_obj(
|
164
|
+
array_file, array_obj = bfile + ".bed", PlinkBEDFileWithR2Cache
|
165
|
+
geno_array = array_obj(
|
166
|
+
array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None
|
167
|
+
)
|
162
168
|
ii = geno_array.maf > maf_min
|
163
169
|
snp_pass_maf = array_snps.IDList[ii]
|
164
|
-
print(f
|
170
|
+
print(f"After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain.")
|
165
171
|
return snp_pass_maf.SNP.to_list()
|
166
172
|
|
167
173
|
|
168
|
-
def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit=
|
169
|
-
PlinkBIMFile = ID_List_Factory(
|
170
|
-
|
174
|
+
def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit="CM"):
|
175
|
+
PlinkBIMFile = ID_List_Factory(
|
176
|
+
["CHR", "SNP", "CM", "BP", "A1", "A2"], 1, ".bim", usecols=[0, 1, 2, 3, 4, 5]
|
177
|
+
)
|
178
|
+
PlinkFAMFile = ID_List_Factory(["IID"], 0, ".fam", usecols=[1])
|
171
179
|
|
172
|
-
bfile = f
|
173
|
-
snp_file, snp_obj = bfile +
|
180
|
+
bfile = f"{bfile_root}.{chrom}"
|
181
|
+
snp_file, snp_obj = bfile + ".bim", PlinkBIMFile
|
174
182
|
array_snps = snp_obj(snp_file)
|
175
183
|
m = len(array_snps.IDList)
|
176
|
-
print(f
|
184
|
+
print(f"Read list of {m} SNPs from {snp_file}")
|
177
185
|
|
178
186
|
# Load fam
|
179
|
-
ind_file, ind_obj = bfile +
|
187
|
+
ind_file, ind_obj = bfile + ".fam", PlinkFAMFile
|
180
188
|
array_indivs = ind_obj(ind_file)
|
181
189
|
n = len(array_indivs.IDList)
|
182
|
-
print(f
|
183
|
-
array_file, array_obj = bfile +
|
184
|
-
geno_array = array_obj(
|
190
|
+
print(f"Read list of {n} individuals from {ind_file}")
|
191
|
+
array_file, array_obj = bfile + ".bed", PlinkBEDFileWithR2Cache
|
192
|
+
geno_array = array_obj(
|
193
|
+
array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None
|
194
|
+
)
|
185
195
|
# Load the annotations of the baseline
|
186
|
-
if ld_unit ==
|
196
|
+
if ld_unit == "SNP":
|
187
197
|
max_dist = ld_wind
|
188
198
|
coords = np.array(range(geno_array.m))
|
189
|
-
elif ld_unit ==
|
199
|
+
elif ld_unit == "KB":
|
190
200
|
max_dist = ld_wind * 1000
|
191
|
-
coords = np.array(array_snps.df[
|
192
|
-
elif ld_unit ==
|
201
|
+
coords = np.array(array_snps.df["BP"])[geno_array.kept_snps]
|
202
|
+
elif ld_unit == "CM":
|
193
203
|
max_dist = ld_wind
|
194
|
-
coords = np.array(array_snps.df[
|
204
|
+
coords = np.array(array_snps.df["CM"])[geno_array.kept_snps]
|
195
205
|
else:
|
196
|
-
raise ValueError(f
|
206
|
+
raise ValueError(f"Invalid ld_wind_unit: {ld_unit}")
|
197
207
|
block_left = getBlockLefts(coords, max_dist)
|
198
208
|
# Calculate the LD score
|
199
209
|
lN_df = pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 100, annot=annot_matrix))
|
@@ -201,25 +211,31 @@ def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit='CM'):
|
|
201
211
|
|
202
212
|
|
203
213
|
# %%
|
204
|
-
def calculate_ldscore_from_annotation(
|
214
|
+
def calculate_ldscore_from_annotation(
|
215
|
+
SNP_annotation_df, chrom, bfile_root, ld_wind=1, ld_unit="CM"
|
216
|
+
):
|
205
217
|
"""
|
206
218
|
Calculate the SNP-gene weight matrix.
|
207
219
|
"""
|
208
220
|
# Get the dummy matrix
|
209
221
|
# Get the SNP-gene weight matrix
|
210
|
-
snp_gene_weight_matrix = get_ldscore(
|
211
|
-
|
222
|
+
snp_gene_weight_matrix = get_ldscore(
|
223
|
+
bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind, ld_unit=ld_unit
|
224
|
+
)
|
212
225
|
snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
|
213
226
|
snp_gene_weight_matrix.index = SNP_annotation_df.index
|
214
227
|
snp_gene_weight_matrix.columns = SNP_annotation_df.columns
|
215
228
|
return snp_gene_weight_matrix
|
216
229
|
|
217
230
|
|
218
|
-
def calculate_ldscore_from_multiple_annotation(
|
231
|
+
def calculate_ldscore_from_multiple_annotation(
|
232
|
+
SNP_annotation_df_list, chrom, bfile_root, ld_wind=1, ld_unit="CM"
|
233
|
+
):
|
219
234
|
SNP_annotation_df = pd.concat(SNP_annotation_df_list, axis=1).astype(np.float32, copy=False)
|
220
235
|
|
221
|
-
snp_gene_weight_matrix = get_ldscore(
|
222
|
-
|
236
|
+
snp_gene_weight_matrix = get_ldscore(
|
237
|
+
bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind, ld_unit=ld_unit
|
238
|
+
)
|
223
239
|
snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
|
224
240
|
snp_gene_weight_matrix.index = SNP_annotation_df.index
|
225
241
|
snp_gene_weight_matrix.columns = SNP_annotation_df.columns
|
@@ -229,7 +245,9 @@ def calculate_ldscore_from_multiple_annotation(SNP_annotation_df_list, chrom, bf
|
|
229
245
|
snp_gene_weight_matrix_list = []
|
230
246
|
start = 0
|
231
247
|
for snp_annotation_len in snp_annotation_len_list:
|
232
|
-
snp_gene_weight_matrix_list.append(
|
248
|
+
snp_gene_weight_matrix_list.append(
|
249
|
+
snp_gene_weight_matrix.iloc[:, start : start + snp_annotation_len]
|
250
|
+
)
|
233
251
|
start += snp_annotation_len
|
234
252
|
return snp_gene_weight_matrix_list
|
235
253
|
|
@@ -242,21 +260,28 @@ class S_LDSC_Boost:
|
|
242
260
|
self.mk_score = load_marker_score(config.mkscore_feather_path)
|
243
261
|
|
244
262
|
# Load GTF and get common markers
|
245
|
-
self.gtf_pr, self.mk_score_common = load_gtf(
|
246
|
-
|
263
|
+
self.gtf_pr, self.mk_score_common = load_gtf(
|
264
|
+
config.gtf_annotation_file, self.mk_score, window_size=config.gene_window_size
|
265
|
+
)
|
247
266
|
|
248
267
|
# Load enhancer
|
249
268
|
if config.enhancer_annotation_file is not None:
|
250
269
|
enhancer_df = pr.read_bed(config.enhancer_annotation_file, as_df=True)
|
251
|
-
enhancer_df.set_index(
|
252
|
-
enhancer_df.index.name =
|
270
|
+
enhancer_df.set_index("Name", inplace=True)
|
271
|
+
enhancer_df.index.name = "gene_name"
|
253
272
|
|
254
273
|
# keep the common genes and add the enhancer score
|
255
|
-
avg_mkscore = pd.DataFrame(self.mk_score_common.mean(axis=1), columns=[
|
256
|
-
enhancer_df = enhancer_df.join(
|
274
|
+
avg_mkscore = pd.DataFrame(self.mk_score_common.mean(axis=1), columns=["avg_mkscore"])
|
275
|
+
enhancer_df = enhancer_df.join(
|
276
|
+
avg_mkscore,
|
277
|
+
how="inner",
|
278
|
+
on="gene_name",
|
279
|
+
)
|
257
280
|
|
258
281
|
# add distance to TSS
|
259
|
-
enhancer_df[
|
282
|
+
enhancer_df["TSS"] = self.gtf_pr.df.set_index("gene_name").reindex(enhancer_df.index)[
|
283
|
+
"TSS"
|
284
|
+
]
|
260
285
|
|
261
286
|
# convert to pyranges
|
262
287
|
self.enhancer_pr = pr.PyRanges(enhancer_df.reset_index())
|
@@ -265,32 +290,39 @@ class S_LDSC_Boost:
|
|
265
290
|
self.enhancer_pr = None
|
266
291
|
|
267
292
|
# create the zarr file
|
268
|
-
if config.ldscore_save_format ==
|
269
|
-
|
293
|
+
if config.ldscore_save_format == "zarr":
|
270
294
|
chrom_snp_length_dict = get_snp_counts(config)
|
271
|
-
self.chrom_snp_start_point = chrom_snp_length_dict[
|
295
|
+
self.chrom_snp_start_point = chrom_snp_length_dict["chrom_snp_start_point"]
|
272
296
|
|
273
|
-
zarr_path = Path(config.ldscore_save_dir) / f
|
297
|
+
zarr_path = Path(config.ldscore_save_dir) / f"{config.sample_name}.ldscore.zarr"
|
274
298
|
if not zarr_path.exists():
|
275
|
-
self.zarr_file = zarr.open(
|
276
|
-
|
277
|
-
|
299
|
+
self.zarr_file = zarr.open(
|
300
|
+
zarr_path.as_posix(),
|
301
|
+
mode="a",
|
302
|
+
dtype=np.float16,
|
303
|
+
chunks=config.zarr_chunk_size,
|
304
|
+
shape=(chrom_snp_length_dict["total"], self.mk_score_common.shape[1]),
|
305
|
+
)
|
278
306
|
zarr_path.mkdir(parents=True, exist_ok=True)
|
279
307
|
# save spot names
|
280
|
-
self.zarr_file.attrs[
|
308
|
+
self.zarr_file.attrs["spot_names"] = self.mk_score_common.columns.to_list()
|
281
309
|
# save chrom_snp_length_dict
|
282
|
-
self.zarr_file.attrs[
|
310
|
+
self.zarr_file.attrs["chrom_snp_start_point"] = self.chrom_snp_start_point
|
283
311
|
else:
|
284
|
-
self.zarr_file = zarr.open(zarr_path.as_posix(), mode=
|
312
|
+
self.zarr_file = zarr.open(zarr_path.as_posix(), mode="a")
|
285
313
|
|
286
314
|
def process_chromosome(self, chrom: int):
|
287
315
|
self.snp_pass_maf = get_snp_pass_maf(self.config.bfile_root, chrom, maf_min=0.05)
|
288
316
|
|
289
317
|
# Get SNP-Gene dummy pairs
|
290
|
-
self.snp_gene_pair_dummy = self.get_snp_gene_dummy(
|
318
|
+
self.snp_gene_pair_dummy = self.get_snp_gene_dummy(
|
319
|
+
chrom,
|
320
|
+
)
|
291
321
|
|
292
322
|
if self.config.keep_snp_root is not None:
|
293
|
-
keep_snp = pd.read_csv(f
|
323
|
+
keep_snp = pd.read_csv(f"{self.config.keep_snp_root}.{chrom}.snp", header=None)[
|
324
|
+
0
|
325
|
+
].to_list()
|
294
326
|
self.keep_snp_mask = self.snp_gene_pair_dummy.index.isin(keep_snp)
|
295
327
|
# the names of the kept SNPs
|
296
328
|
self.snp_name = self.snp_gene_pair_dummy.index[self.keep_snp_mask].to_list()
|
@@ -300,25 +332,37 @@ class S_LDSC_Boost:
|
|
300
332
|
|
301
333
|
if self.config.additional_baseline_annotation is not None:
|
302
334
|
additional_baseline_annotation = Path(self.config.additional_baseline_annotation)
|
303
|
-
additional_baseline_annotation_file_path =
|
304
|
-
|
305
|
-
|
306
|
-
|
335
|
+
additional_baseline_annotation_file_path = (
|
336
|
+
additional_baseline_annotation / f"baseline.{chrom}.annot.gz"
|
337
|
+
)
|
338
|
+
assert additional_baseline_annotation_file_path.exists(), (
|
339
|
+
f"additional_baseline_annotation_file_path not exists: {additional_baseline_annotation_file_path}"
|
340
|
+
)
|
341
|
+
additional_baseline_annotation_df = pd.read_csv(
|
342
|
+
additional_baseline_annotation_file_path, sep="\t"
|
343
|
+
)
|
344
|
+
additional_baseline_annotation_df.set_index("SNP", inplace=True)
|
307
345
|
|
308
346
|
# drop these columns if they exist: [CHR, BP, CM]
|
309
|
-
additional_baseline_annotation_df.drop(
|
347
|
+
additional_baseline_annotation_df.drop(
|
348
|
+
["CHR", "BP", "CM"], axis=1, inplace=True, errors="ignore"
|
349
|
+
)
|
310
350
|
|
311
351
|
# reindex, for those SNPs not in additional_baseline_annotation_df, set to 0
|
312
|
-
num_of_not_exist_snp = (
|
352
|
+
num_of_not_exist_snp = (
|
353
|
+
~self.snp_gene_pair_dummy.index.isin(additional_baseline_annotation_df.index)
|
354
|
+
).sum()
|
313
355
|
if num_of_not_exist_snp > 0:
|
314
356
|
logger.warning(
|
315
|
-
f
|
357
|
+
f"{num_of_not_exist_snp} SNPs not in additional_baseline_annotation_df but in the reference panel, so the additional baseline annotation of these SNP will set to 0"
|
358
|
+
)
|
316
359
|
additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
|
317
|
-
self.snp_gene_pair_dummy.index,
|
318
|
-
|
360
|
+
self.snp_gene_pair_dummy.index, fill_value=0
|
361
|
+
)
|
319
362
|
else:
|
320
363
|
additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
|
321
|
-
self.snp_gene_pair_dummy.index
|
364
|
+
self.snp_gene_pair_dummy.index
|
365
|
+
)
|
322
366
|
|
323
367
|
# do this for saving the cpu time, only calculate r2 once
|
324
368
|
self.snp_gene_weight_matrix, additional_baseline_annotation_ldscore = (
|
@@ -327,56 +371,85 @@ class S_LDSC_Boost:
|
|
327
371
|
chrom,
|
328
372
|
self.config.bfile_root,
|
329
373
|
ld_wind=self.config.ld_wind,
|
330
|
-
ld_unit=self.config.ld_unit
|
374
|
+
ld_unit=self.config.ld_unit,
|
375
|
+
)
|
376
|
+
)
|
331
377
|
|
332
|
-
additional_baseline_annotation_ldscore = additional_baseline_annotation_ldscore.loc[
|
378
|
+
additional_baseline_annotation_ldscore = additional_baseline_annotation_ldscore.loc[
|
379
|
+
self.snp_name
|
380
|
+
]
|
333
381
|
# print(additional_baseline_annotation_ldscore.index.to_list()==self.snp_name)
|
334
382
|
|
335
|
-
ld_score_file = f
|
336
|
-
M_file_path =
|
337
|
-
|
383
|
+
ld_score_file = f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather"
|
384
|
+
M_file_path = (
|
385
|
+
f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M"
|
386
|
+
)
|
387
|
+
M_5_file_path = (
|
388
|
+
f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50"
|
389
|
+
)
|
338
390
|
|
339
391
|
# save additional baseline annotation ldscore
|
340
|
-
self.save_ldscore_to_feather(
|
341
|
-
|
342
|
-
|
343
|
-
|
392
|
+
self.save_ldscore_to_feather(
|
393
|
+
additional_baseline_annotation_ldscore.values,
|
394
|
+
column_names=additional_baseline_annotation_ldscore.columns,
|
395
|
+
save_file_name=ld_score_file,
|
396
|
+
)
|
344
397
|
|
345
398
|
# calculate the M and save
|
346
399
|
save_dir = Path(M_file_path).parent
|
347
400
|
save_dir.mkdir(parents=True, exist_ok=True)
|
348
401
|
M_chr_chunk = additional_baseline_annotation_df.values.sum(axis=0, keepdims=True)
|
349
|
-
M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(
|
350
|
-
|
351
|
-
|
402
|
+
M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(
|
403
|
+
axis=0, keepdims=True
|
404
|
+
)
|
405
|
+
np.savetxt(
|
406
|
+
M_file_path,
|
407
|
+
M_chr_chunk,
|
408
|
+
delimiter="\t",
|
409
|
+
)
|
410
|
+
np.savetxt(
|
411
|
+
M_5_file_path,
|
412
|
+
M_5_chr_chunk,
|
413
|
+
delimiter="\t",
|
414
|
+
)
|
352
415
|
|
353
416
|
else:
|
354
417
|
# Calculate SNP-Gene weight matrix
|
355
|
-
self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(
|
356
|
-
|
357
|
-
|
358
|
-
|
418
|
+
self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(
|
419
|
+
self.snp_gene_pair_dummy,
|
420
|
+
chrom,
|
421
|
+
self.config.bfile_root,
|
422
|
+
ld_wind=self.config.ld_wind,
|
423
|
+
ld_unit=self.config.ld_unit,
|
424
|
+
)
|
359
425
|
# only keep the snp in keep_snp_root
|
360
426
|
if self.keep_snp_mask is not None:
|
361
427
|
self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]
|
362
428
|
|
363
429
|
if self.config.save_pre_calculate_snp_gene_weight_matrix:
|
364
|
-
snp_gene_weight_matrix_save_dir =
|
430
|
+
snp_gene_weight_matrix_save_dir = (
|
431
|
+
Path(self.config.ldscore_save_dir) / "snp_gene_weight_matrix"
|
432
|
+
)
|
365
433
|
snp_gene_weight_matrix_save_dir.mkdir(parents=True, exist_ok=True)
|
366
|
-
logger.info(f
|
434
|
+
logger.info(f"Saving snp_gene_weight_matrix for chr{chrom}...")
|
367
435
|
self.snp_gene_weight_matrix.reset_index().to_feather(
|
368
|
-
snp_gene_weight_matrix_save_dir / f
|
436
|
+
snp_gene_weight_matrix_save_dir / f"{chrom}.snp_gene_weight_matrix.feather"
|
437
|
+
)
|
369
438
|
|
370
439
|
# convert to sparse
|
371
440
|
self.snp_gene_weight_matrix = csr_matrix(self.snp_gene_weight_matrix)
|
372
|
-
logger.info(
|
441
|
+
logger.info(
|
442
|
+
f"Compute snp_gene_weight_matrix finished. shape: {self.snp_gene_weight_matrix.shape}"
|
443
|
+
)
|
373
444
|
|
374
445
|
# calculate baseline ld score
|
375
|
-
logger.info(f
|
376
|
-
self.calculate_ldscore_for_base_line(
|
446
|
+
logger.info(f"Calculating baseline ld score for chr{chrom}...")
|
447
|
+
self.calculate_ldscore_for_base_line(
|
448
|
+
chrom, self.config.sample_name, self.config.ldscore_save_dir
|
449
|
+
)
|
377
450
|
|
378
451
|
# calculate ld score for annotation
|
379
|
-
logger.info(f
|
452
|
+
logger.info(f"Calculating ld score for annotation for chr{chrom}...")
|
380
453
|
self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
|
381
454
|
self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]],
|
382
455
|
chrom,
|
@@ -384,11 +457,11 @@ class S_LDSC_Boost:
|
|
384
457
|
self.config.ldscore_save_dir,
|
385
458
|
)
|
386
459
|
|
387
|
-
def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
460
|
+
def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
|
461
|
+
self,
|
462
|
+
mk_score_chunk,
|
463
|
+
drop_dummy_na=True,
|
464
|
+
):
|
392
465
|
if drop_dummy_na:
|
393
466
|
ldscore_chr_chunk = self.snp_gene_weight_matrix[:, :-1] @ mk_score_chunk
|
394
467
|
else:
|
@@ -407,16 +480,20 @@ class S_LDSC_Boost:
|
|
407
480
|
# self.keep_snp_mask]
|
408
481
|
|
409
482
|
# save for each chunk
|
410
|
-
df = pd.DataFrame(
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
483
|
+
df = pd.DataFrame(
|
484
|
+
ldscore_chr_chunk,
|
485
|
+
index=self.snp_name,
|
486
|
+
columns=column_names,
|
487
|
+
)
|
488
|
+
df.index.name = "SNP"
|
415
489
|
df.reset_index().to_feather(save_file_name)
|
416
490
|
|
417
|
-
def save_ldscore_chunk_to_zarr(
|
418
|
-
|
419
|
-
|
491
|
+
def save_ldscore_chunk_to_zarr(
|
492
|
+
self,
|
493
|
+
ldscore_chr_chunk: np.ndarray,
|
494
|
+
chrom: int,
|
495
|
+
start_col_index,
|
496
|
+
):
|
420
497
|
ldscore_chr_chunk = ldscore_chr_chunk.astype(np.float16, copy=False)
|
421
498
|
# avoid overflow of float16, if inf, set to max of float16
|
422
499
|
ldscore_chr_chunk[np.isinf(ldscore_chr_chunk)] = np.finfo(np.float16).max
|
@@ -425,63 +502,90 @@ class S_LDSC_Boost:
|
|
425
502
|
chrom_snp_start_point = self.chrom_snp_start_point[chrom - 1]
|
426
503
|
chrom_snp_end_point = self.chrom_snp_start_point[chrom]
|
427
504
|
|
428
|
-
self.zarr_file[
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
505
|
+
self.zarr_file[
|
506
|
+
chrom_snp_start_point:chrom_snp_end_point,
|
507
|
+
start_col_index : start_col_index + ldscore_chr_chunk.shape[1],
|
508
|
+
] = ldscore_chr_chunk
|
509
|
+
|
510
|
+
def calculate_M_use_SNP_gene_pair_dummy_by_chunk(
|
511
|
+
self,
|
512
|
+
mk_score_chunk,
|
513
|
+
M_file_path,
|
514
|
+
M_5_file_path,
|
515
|
+
drop_dummy_na=True,
|
516
|
+
):
|
517
|
+
"""
|
518
|
+
Calculate M use SNP_gene_pair_dummy_sumed_along_snp_axis and mk_score_chunk
|
519
|
+
"""
|
520
|
+
SNP_gene_pair_dummy_sumed_along_snp_axis = self.snp_gene_pair_dummy.values.sum(
|
521
|
+
axis=0, keepdims=True
|
522
|
+
)
|
523
|
+
SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf = self.snp_gene_pair_dummy.loc[
|
524
|
+
self.snp_pass_maf
|
525
|
+
].values.sum(axis=0, keepdims=True)
|
443
526
|
if drop_dummy_na:
|
444
|
-
SNP_gene_pair_dummy_sumed_along_snp_axis = SNP_gene_pair_dummy_sumed_along_snp_axis[
|
445
|
-
|
446
|
-
|
527
|
+
SNP_gene_pair_dummy_sumed_along_snp_axis = SNP_gene_pair_dummy_sumed_along_snp_axis[
|
528
|
+
:, :-1
|
529
|
+
]
|
530
|
+
SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf = (
|
531
|
+
SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf[:, :-1]
|
532
|
+
)
|
447
533
|
save_dir = Path(M_file_path).parent
|
448
534
|
save_dir.mkdir(parents=True, exist_ok=True)
|
449
535
|
M_chr_chunk = SNP_gene_pair_dummy_sumed_along_snp_axis @ mk_score_chunk
|
450
536
|
M_5_chr_chunk = SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf @ mk_score_chunk
|
451
|
-
np.savetxt(
|
452
|
-
|
537
|
+
np.savetxt(
|
538
|
+
M_file_path,
|
539
|
+
M_chr_chunk,
|
540
|
+
delimiter="\t",
|
541
|
+
)
|
542
|
+
np.savetxt(
|
543
|
+
M_5_file_path,
|
544
|
+
M_5_chr_chunk,
|
545
|
+
delimiter="\t",
|
546
|
+
)
|
453
547
|
|
454
|
-
def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
|
548
|
+
def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
|
549
|
+
self, mk_score_common, chrom, sample_name, save_dir
|
550
|
+
):
|
455
551
|
"""
|
456
552
|
Calculate the LD score using the SNP-gene weight matrix.
|
457
553
|
:param sample_name:
|
458
554
|
"""
|
459
555
|
# Calculate the LD score
|
460
556
|
chunk_index = 1
|
461
|
-
for i in trange(
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
557
|
+
for i in trange(
|
558
|
+
0,
|
559
|
+
mk_score_common.shape[1],
|
560
|
+
self.config.spots_per_chunk,
|
561
|
+
desc=f"Calculating LD score by chunk for chr{chrom}",
|
562
|
+
):
|
563
|
+
mk_score_chunk = mk_score_common.iloc[:, i : i + self.config.spots_per_chunk]
|
564
|
+
|
565
|
+
ld_score_file = f"{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.ldscore.feather"
|
566
|
+
M_file = f"{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M"
|
567
|
+
M_5_file = (
|
568
|
+
f"{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50"
|
569
|
+
)
|
468
570
|
|
469
571
|
ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
|
470
572
|
mk_score_chunk,
|
471
573
|
drop_dummy_na=True,
|
472
574
|
)
|
473
|
-
if self.config.ldscore_save_format ==
|
474
|
-
self.save_ldscore_to_feather(
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
575
|
+
if self.config.ldscore_save_format == "feather":
|
576
|
+
self.save_ldscore_to_feather(
|
577
|
+
ldscore_chr_chunk,
|
578
|
+
column_names=mk_score_chunk.columns,
|
579
|
+
save_file_name=ld_score_file,
|
580
|
+
)
|
581
|
+
elif self.config.ldscore_save_format == "zarr":
|
582
|
+
self.save_ldscore_chunk_to_zarr(
|
583
|
+
ldscore_chr_chunk,
|
584
|
+
chrom=chrom,
|
585
|
+
start_col_index=i,
|
586
|
+
)
|
483
587
|
else:
|
484
|
-
raise ValueError(f
|
588
|
+
raise ValueError(f"Invalid ldscore_save_format: {self.config.ldscore_save_format}")
|
485
589
|
|
486
590
|
self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
|
487
591
|
mk_score_chunk,
|
@@ -496,21 +600,23 @@ class S_LDSC_Boost:
|
|
496
600
|
# save baseline ld score
|
497
601
|
baseline_mk_score = np.ones((self.snp_gene_pair_dummy.shape[1], 2))
|
498
602
|
baseline_mk_score[-1, 0] = 0 # all_gene
|
499
|
-
baseline_mk_score_df = pd.DataFrame(
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
603
|
+
baseline_mk_score_df = pd.DataFrame(
|
604
|
+
baseline_mk_score, index=self.snp_gene_pair_dummy.columns, columns=["all_gene", "base"]
|
605
|
+
)
|
606
|
+
ld_score_file = f"{save_dir}/baseline/baseline.{chrom}.l2.ldscore.feather"
|
607
|
+
M_file = f"{save_dir}/baseline/baseline.{chrom}.l2.M"
|
608
|
+
M_5_file = f"{save_dir}/baseline/baseline.{chrom}.l2.M_5_50"
|
504
609
|
|
505
610
|
ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
|
506
611
|
baseline_mk_score_df,
|
507
612
|
drop_dummy_na=False,
|
508
613
|
)
|
509
614
|
|
510
|
-
self.save_ldscore_to_feather(
|
511
|
-
|
512
|
-
|
513
|
-
|
615
|
+
self.save_ldscore_to_feather(
|
616
|
+
ldscore_chr_chunk,
|
617
|
+
column_names=baseline_mk_score_df.columns,
|
618
|
+
save_file_name=ld_score_file,
|
619
|
+
)
|
514
620
|
# save baseline M
|
515
621
|
self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
|
516
622
|
baseline_mk_score_df,
|
@@ -519,7 +625,10 @@ class S_LDSC_Boost:
|
|
519
625
|
drop_dummy_na=False,
|
520
626
|
)
|
521
627
|
|
522
|
-
def get_snp_gene_dummy(
|
628
|
+
def get_snp_gene_dummy(
|
629
|
+
self,
|
630
|
+
chrom,
|
631
|
+
):
|
523
632
|
"""
|
524
633
|
Get the dummy matrix of SNP-gene pairs.
|
525
634
|
"""
|
@@ -527,91 +636,126 @@ class S_LDSC_Boost:
|
|
527
636
|
print("Loading bim data")
|
528
637
|
bim, bim_pr = load_bim(self.config.bfile_root, chrom)
|
529
638
|
|
530
|
-
if self.config.gene_window_enhancer_priority in [
|
531
|
-
|
532
|
-
|
533
|
-
|
639
|
+
if self.config.gene_window_enhancer_priority in ["gene_window_first", "enhancer_first"]:
|
640
|
+
SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(
|
641
|
+
bim,
|
642
|
+
bim_pr,
|
643
|
+
)
|
644
|
+
SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(
|
645
|
+
bim,
|
646
|
+
bim_pr,
|
647
|
+
)
|
534
648
|
# total_SNP_gene_pair = SNP_gene_pair_gtf.join(SNP_gene_pair_enhancer, how='outer', lsuffix='_gtf', )
|
535
649
|
|
536
650
|
mask_of_nan_gtf = SNP_gene_pair_gtf.gene_name.isna()
|
537
651
|
mask_of_nan_enhancer = SNP_gene_pair_enhancer.gene_name.isna()
|
538
652
|
|
539
|
-
if self.config.gene_window_enhancer_priority ==
|
653
|
+
if self.config.gene_window_enhancer_priority == "gene_window_first":
|
540
654
|
SNP_gene_pair = SNP_gene_pair_gtf
|
541
|
-
SNP_gene_pair.loc[mask_of_nan_gtf,
|
542
|
-
mask_of_nan_gtf,
|
543
|
-
|
655
|
+
SNP_gene_pair.loc[mask_of_nan_gtf, "gene_name"] = SNP_gene_pair_enhancer.loc[
|
656
|
+
mask_of_nan_gtf, "gene_name"
|
657
|
+
]
|
658
|
+
elif self.config.gene_window_enhancer_priority == "enhancer_first":
|
544
659
|
SNP_gene_pair = SNP_gene_pair_enhancer
|
545
|
-
SNP_gene_pair.loc[mask_of_nan_enhancer,
|
546
|
-
mask_of_nan_enhancer,
|
660
|
+
SNP_gene_pair.loc[mask_of_nan_enhancer, "gene_name"] = SNP_gene_pair_gtf.loc[
|
661
|
+
mask_of_nan_enhancer, "gene_name"
|
662
|
+
]
|
547
663
|
else:
|
548
664
|
raise ValueError(
|
549
|
-
f
|
665
|
+
f"Invalid self.config.gene_window_enhancer_priority: {self.config.gene_window_enhancer_priority}"
|
666
|
+
)
|
550
667
|
|
551
668
|
elif self.config.gene_window_enhancer_priority is None: # use gtf only
|
552
|
-
SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(
|
669
|
+
SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(
|
670
|
+
bim,
|
671
|
+
bim_pr,
|
672
|
+
)
|
553
673
|
SNP_gene_pair = SNP_gene_pair_gtf
|
554
674
|
|
555
|
-
elif self.config.gene_window_enhancer_priority ==
|
556
|
-
SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(
|
675
|
+
elif self.config.gene_window_enhancer_priority == "enhancer_only":
|
676
|
+
SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(
|
677
|
+
bim,
|
678
|
+
bim_pr,
|
679
|
+
)
|
557
680
|
SNP_gene_pair = SNP_gene_pair_enhancer
|
558
681
|
else:
|
559
|
-
raise ValueError(
|
682
|
+
raise ValueError("gtf_pr and enhancer_pr cannot be None at the same time")
|
560
683
|
|
561
684
|
# save the SNP_gene_pair to feather
|
562
|
-
SNP_gene_pair_save_path =
|
563
|
-
self.config.ldscore_save_dir) / f
|
685
|
+
SNP_gene_pair_save_path = (
|
686
|
+
Path(self.config.ldscore_save_dir) / f"SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather"
|
687
|
+
)
|
564
688
|
SNP_gene_pair_save_path.parent.mkdir(parents=True, exist_ok=True)
|
565
689
|
SNP_gene_pair.reset_index().to_feather(SNP_gene_pair_save_path)
|
566
690
|
|
567
691
|
# Get the dummy matrix
|
568
|
-
SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair[
|
692
|
+
SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair["gene_name"], dummy_na=True)
|
569
693
|
return SNP_gene_pair_dummy
|
570
694
|
|
571
695
|
def get_SNP_gene_pair_from_gtf(self, bim, bim_pr):
|
572
696
|
logger.info(
|
573
|
-
"Get SNP-gene pair from gtf, if a SNP is in multiple genes, it will be assigned to the most nearby gene (TSS)"
|
697
|
+
"Get SNP-gene pair from gtf, if a SNP is in multiple genes, it will be assigned to the most nearby gene (TSS)"
|
698
|
+
)
|
574
699
|
overlaps_small = Overlaps_gtf_bim(self.gtf_pr, bim_pr)
|
575
700
|
# Get the SNP-gene pair
|
576
701
|
annot = bim[["CHR", "BP", "SNP", "CM"]]
|
577
|
-
SNP_gene_pair =
|
702
|
+
SNP_gene_pair = (
|
703
|
+
overlaps_small[["SNP", "gene_name"]]
|
704
|
+
.set_index("SNP")
|
705
|
+
.join(annot.set_index("SNP"), how="right")
|
706
|
+
)
|
578
707
|
return SNP_gene_pair
|
579
708
|
|
580
|
-
def get_SNP_gene_pair_from_enhancer(
|
709
|
+
def get_SNP_gene_pair_from_enhancer(
|
710
|
+
self,
|
711
|
+
bim,
|
712
|
+
bim_pr,
|
713
|
+
):
|
581
714
|
logger.info(
|
582
|
-
"Get SNP-gene pair from enhancer, if a SNP is in multiple genes, it will be assigned to the gene with highest marker score"
|
715
|
+
"Get SNP-gene pair from enhancer, if a SNP is in multiple genes, it will be assigned to the gene with highest marker score"
|
716
|
+
)
|
583
717
|
# Get the SNP-gene pair
|
584
718
|
overlaps_small = self.enhancer_pr.join(bim_pr).df
|
585
719
|
annot = bim[["CHR", "BP", "SNP", "CM"]]
|
586
|
-
if self.config.snp_multiple_enhancer_strategy ==
|
587
|
-
logger.debug(
|
588
|
-
overlaps_small = overlaps_small.loc[overlaps_small.groupby(
|
589
|
-
|
590
|
-
elif self.config.snp_multiple_enhancer_strategy ==
|
591
|
-
logger.debug(
|
592
|
-
overlaps_small[
|
593
|
-
overlaps_small = overlaps_small.loc[overlaps_small.groupby(
|
594
|
-
|
595
|
-
SNP_gene_pair =
|
720
|
+
if self.config.snp_multiple_enhancer_strategy == "max_mkscore":
|
721
|
+
logger.debug("select the gene with highest marker score")
|
722
|
+
overlaps_small = overlaps_small.loc[overlaps_small.groupby("SNP").avg_mkscore.idxmax()]
|
723
|
+
|
724
|
+
elif self.config.snp_multiple_enhancer_strategy == "nearest_TSS":
|
725
|
+
logger.debug("select the gene with nearest TSS")
|
726
|
+
overlaps_small["Distance"] = np.abs(overlaps_small["Start_b"] - overlaps_small["TSS"])
|
727
|
+
overlaps_small = overlaps_small.loc[overlaps_small.groupby("SNP").Distance.idxmin()]
|
728
|
+
|
729
|
+
SNP_gene_pair = (
|
730
|
+
overlaps_small[["SNP", "gene_name"]]
|
731
|
+
.set_index("SNP")
|
732
|
+
.join(annot.set_index("SNP"), how="right")
|
733
|
+
)
|
596
734
|
|
597
735
|
return SNP_gene_pair
|
598
736
|
|
599
737
|
|
600
738
|
def run_generate_ldscore(config: GenerateLDScoreConfig):
|
601
|
-
if config.ldscore_save_format ==
|
602
|
-
logger.info(
|
739
|
+
if config.ldscore_save_format == "quick_mode":
|
740
|
+
logger.info(
|
741
|
+
"Running in quick_mode. Skip the process of generating ldscore. Using the pre-calculated ldscore."
|
742
|
+
)
|
603
743
|
ldscore_save_dir = config.ldscore_save_dir
|
604
744
|
|
605
745
|
# link the baseline annotation
|
606
746
|
baseline_annotation_dir = Path(config.baseline_annotation_dir)
|
607
|
-
(ldscore_save_dir /
|
747
|
+
(ldscore_save_dir / "baseline").symlink_to(
|
748
|
+
baseline_annotation_dir, target_is_directory=True
|
749
|
+
)
|
608
750
|
|
609
751
|
# link the SNP_gene_pair
|
610
752
|
SNP_gene_pair_dir = Path(config.SNP_gene_pair_dir)
|
611
|
-
(ldscore_save_dir /
|
753
|
+
(ldscore_save_dir / "SNP_gene_pair").symlink_to(
|
754
|
+
SNP_gene_pair_dir, target_is_directory=True
|
755
|
+
)
|
612
756
|
return
|
613
757
|
s_ldsc_boost = S_LDSC_Boost(config)
|
614
|
-
if config.chrom ==
|
758
|
+
if config.chrom == "all":
|
615
759
|
for chrom in range(1, 23):
|
616
760
|
s_ldsc_boost.process_chromosome(chrom)
|
617
761
|
else:
|