gsMap 1.71.2__py3-none-any.whl → 1.73.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN/adjacency_matrix.py +25 -27
- gsMap/GNN/model.py +9 -7
- gsMap/GNN/train.py +8 -11
- gsMap/__init__.py +3 -3
- gsMap/__main__.py +3 -2
- gsMap/cauchy_combination_test.py +78 -75
- gsMap/config.py +948 -322
- gsMap/create_slice_mean.py +168 -0
- gsMap/diagnosis.py +179 -101
- gsMap/find_latent_representation.py +29 -27
- gsMap/format_sumstats.py +239 -201
- gsMap/generate_ldscore.py +334 -222
- gsMap/latent_to_gene.py +128 -68
- gsMap/main.py +23 -14
- gsMap/report.py +39 -25
- gsMap/run_all_mode.py +87 -46
- gsMap/setup.py +1 -1
- gsMap/spatial_ldsc_multiple_sumstats.py +154 -80
- gsMap/utils/generate_r2_matrix.py +100 -346
- gsMap/utils/jackknife.py +84 -80
- gsMap/utils/manhattan_plot.py +180 -207
- gsMap/utils/regression_read.py +83 -176
- gsMap/visualize.py +82 -64
- gsmap-1.73.0.dist-info/METADATA +169 -0
- gsmap-1.73.0.dist-info/RECORD +31 -0
- {gsmap-1.71.2.dist-info → gsmap-1.73.0.dist-info}/WHEEL +1 -1
- {gsmap-1.71.2.dist-info → gsmap-1.73.0.dist-info/licenses}/LICENSE +6 -6
- gsMap/utils/make_annotations.py +0 -518
- gsmap-1.71.2.dist-info/METADATA +0 -105
- gsmap-1.71.2.dist-info/RECORD +0 -31
- {gsmap-1.71.2.dist-info → gsmap-1.73.0.dist-info}/entry_points.txt +0 -0
gsMap/generate_ldscore.py
CHANGED
@@ -10,7 +10,7 @@ from scipy.sparse import csr_matrix
|
|
10
10
|
from tqdm import trange
|
11
11
|
|
12
12
|
from gsMap.config import GenerateLDScoreConfig
|
13
|
-
from gsMap.utils.generate_r2_matrix import
|
13
|
+
from gsMap.utils.generate_r2_matrix import getBlockLefts, load_bfile
|
14
14
|
|
15
15
|
warnings.filterwarnings("ignore", category=FutureWarning)
|
16
16
|
logger = logging.getLogger(__name__)
|
@@ -25,34 +25,36 @@ def load_gtf(gtf_file, mk_score, window_size):
|
|
25
25
|
print("Loading gtf data")
|
26
26
|
#
|
27
27
|
# Load GTF file
|
28
|
-
gtf = pr.read_gtf(
|
28
|
+
gtf = pr.read_gtf(
|
29
|
+
gtf_file,
|
30
|
+
)
|
29
31
|
gtf = gtf.df
|
30
32
|
#
|
31
33
|
# Select the common genes
|
32
|
-
gtf = gtf[gtf[
|
34
|
+
gtf = gtf[gtf["Feature"] == "gene"]
|
33
35
|
common_gene = np.intersect1d(mk_score.index, gtf.gene_name)
|
34
36
|
#
|
35
37
|
gtf = gtf[gtf.gene_name.isin(common_gene)]
|
36
38
|
mk_score = mk_score[mk_score.index.isin(common_gene)]
|
37
39
|
#
|
38
40
|
# Remove duplicated lines
|
39
|
-
gtf = gtf.drop_duplicates(subset=
|
41
|
+
gtf = gtf.drop_duplicates(subset="gene_name", keep="first")
|
40
42
|
#
|
41
43
|
# Process the GTF (open 100-KB window: Tss - Ted)
|
42
|
-
gtf_bed = gtf[[
|
43
|
-
gtf_bed.loc[:,
|
44
|
-
gtf_bed.loc[:,
|
44
|
+
gtf_bed = gtf[["Chromosome", "Start", "End", "gene_name", "Strand"]].copy()
|
45
|
+
gtf_bed.loc[:, "TSS"] = gtf_bed["Start"]
|
46
|
+
gtf_bed.loc[:, "TED"] = gtf_bed["End"]
|
45
47
|
|
46
|
-
gtf_bed.loc[:,
|
47
|
-
gtf_bed.loc[:,
|
48
|
-
gtf_bed.loc[gtf_bed[
|
48
|
+
gtf_bed.loc[:, "Start"] = gtf_bed["TSS"] - window_size
|
49
|
+
gtf_bed.loc[:, "End"] = gtf_bed["TED"] + window_size
|
50
|
+
gtf_bed.loc[gtf_bed["Start"] < 0, "Start"] = 0
|
49
51
|
#
|
50
52
|
# Correct the negative strand
|
51
|
-
tss_neg = gtf_bed.loc[gtf_bed[
|
52
|
-
ted_neg = gtf_bed.loc[gtf_bed[
|
53
|
-
gtf_bed.loc[gtf_bed[
|
54
|
-
gtf_bed.loc[gtf_bed[
|
55
|
-
gtf_bed = gtf_bed.drop(
|
53
|
+
tss_neg = gtf_bed.loc[gtf_bed["Strand"] == "-", "TSS"]
|
54
|
+
ted_neg = gtf_bed.loc[gtf_bed["Strand"] == "-", "TED"]
|
55
|
+
gtf_bed.loc[gtf_bed["Strand"] == "-", "TSS"] = ted_neg
|
56
|
+
gtf_bed.loc[gtf_bed["Strand"] == "-", "TED"] = tss_neg
|
57
|
+
gtf_bed = gtf_bed.drop("Strand", axis=1)
|
56
58
|
#
|
57
59
|
# Transform the GTF to PyRanges
|
58
60
|
gtf_pr = pr.PyRanges(gtf_bed)
|
@@ -64,31 +66,28 @@ def load_marker_score(mk_score_file):
|
|
64
66
|
"""
|
65
67
|
Load marker scores of each cell.
|
66
68
|
"""
|
67
|
-
mk_score = pd.read_feather(mk_score_file).set_index(
|
69
|
+
mk_score = pd.read_feather(mk_score_file).set_index("HUMAN_GENE_SYM").rename_axis("gene_name")
|
68
70
|
mk_score = mk_score.astype(np.float32, copy=False)
|
69
71
|
return mk_score
|
70
72
|
|
71
73
|
|
72
|
-
# %%
|
73
|
-
# load mkscore get common gene
|
74
|
-
# %%
|
75
74
|
# load bim
|
76
75
|
def load_bim(bfile_root, chrom):
|
77
76
|
"""
|
78
77
|
Load the bim file.
|
79
78
|
"""
|
80
|
-
bim = pd.read_csv(f
|
79
|
+
bim = pd.read_csv(f"{bfile_root}.{chrom}.bim", sep="\t", header=None)
|
81
80
|
bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]
|
82
81
|
#
|
83
82
|
# Transform bim to PyRanges
|
84
83
|
bim_pr = bim.copy()
|
85
84
|
bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
|
86
85
|
|
87
|
-
bim_pr[
|
88
|
-
bim_pr[
|
86
|
+
bim_pr["End"] = bim_pr["Start"].copy()
|
87
|
+
bim_pr["Start"] = bim_pr["Start"] - 1 # Due to bim file is 1-based
|
89
88
|
|
90
89
|
bim_pr = pr.PyRanges(bim_pr)
|
91
|
-
bim_pr.Chromosome = f
|
90
|
+
bim_pr.Chromosome = f"chr{chrom}"
|
92
91
|
return bim, bim_pr
|
93
92
|
|
94
93
|
|
@@ -100,9 +99,9 @@ def Overlaps_gtf_bim(gtf_pr, bim_pr):
|
|
100
99
|
# Select the overlapped regions (SNPs in gene windows)
|
101
100
|
overlaps = gtf_pr.join(bim_pr)
|
102
101
|
overlaps = overlaps.df
|
103
|
-
overlaps[
|
102
|
+
overlaps["Distance"] = np.abs(overlaps["Start_b"] - overlaps["TSS"])
|
104
103
|
overlaps_small = overlaps.copy()
|
105
|
-
overlaps_small = overlaps_small.loc[overlaps_small.groupby(
|
104
|
+
overlaps_small = overlaps_small.loc[overlaps_small.groupby("SNP").Distance.idxmin()]
|
106
105
|
return overlaps_small
|
107
106
|
|
108
107
|
|
@@ -110,7 +109,7 @@ def Overlaps_gtf_bim(gtf_pr, bim_pr):
|
|
110
109
|
def filter_snps_by_keep_snp(bim_df, keep_snp_file):
|
111
110
|
# Load the keep_snp file and filter the BIM DataFrame
|
112
111
|
keep_snp = pd.read_csv(keep_snp_file, header=None)[0].to_list()
|
113
|
-
filtered_bim_df = bim_df[bim_df[
|
112
|
+
filtered_bim_df = bim_df[bim_df["SNP"].isin(keep_snp)]
|
114
113
|
return filtered_bim_df
|
115
114
|
|
116
115
|
|
@@ -122,7 +121,7 @@ def get_snp_counts(config):
|
|
122
121
|
bim_df, _ = load_bim(config.bfile_root, chrom)
|
123
122
|
|
124
123
|
if config.keep_snp_root:
|
125
|
-
keep_snp_file = f
|
124
|
+
keep_snp_file = f"{config.keep_snp_root}.{chrom}.snp"
|
126
125
|
filtered_bim_df = filter_snps_by_keep_snp(bim_df, keep_snp_file)
|
127
126
|
else:
|
128
127
|
filtered_bim_df = bim_df
|
@@ -130,11 +129,11 @@ def get_snp_counts(config):
|
|
130
129
|
snp_counts[chrom] = filtered_bim_df.shape[0]
|
131
130
|
total_snp += snp_counts[chrom]
|
132
131
|
|
133
|
-
snp_counts[
|
132
|
+
snp_counts["total"] = total_snp
|
134
133
|
|
135
134
|
chrom_snp_length_array = np.array([snp_counts[chrom] for chrom in range(1, 23)]).cumsum()
|
136
135
|
|
137
|
-
snp_counts[
|
136
|
+
snp_counts["chrom_snp_start_point"] = [0] + chrom_snp_length_array.tolist()
|
138
137
|
|
139
138
|
return snp_counts
|
140
139
|
|
@@ -144,56 +143,35 @@ def get_snp_pass_maf(bfile_root, chrom, maf_min=0.05):
|
|
144
143
|
"""
|
145
144
|
Get the dummy matrix of SNP-gene pairs.
|
146
145
|
"""
|
147
|
-
|
148
|
-
PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
|
149
|
-
PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])
|
146
|
+
array_snps, array_indivs, geno_array = load_bfile(bfile_chr_prefix=f"{bfile_root}.{chrom}")
|
150
147
|
|
151
|
-
bfile = f'{bfile_root}.{chrom}'
|
152
|
-
snp_file, snp_obj = bfile + '.bim', PlinkBIMFile
|
153
|
-
array_snps = snp_obj(snp_file)
|
154
148
|
m = len(array_snps.IDList)
|
155
|
-
|
156
|
-
# Load fam
|
157
|
-
ind_file, ind_obj = bfile + '.fam', PlinkFAMFile
|
158
|
-
array_indivs = ind_obj(ind_file)
|
159
149
|
n = len(array_indivs.IDList)
|
160
|
-
|
161
|
-
|
150
|
+
logger.info(
|
151
|
+
f"Loading genotype data for {m} SNPs and {n} individuals from {bfile_root}.{chrom}"
|
152
|
+
)
|
153
|
+
|
162
154
|
ii = geno_array.maf > maf_min
|
163
155
|
snp_pass_maf = array_snps.IDList[ii]
|
164
|
-
|
156
|
+
logger.info(f"After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain.")
|
165
157
|
return snp_pass_maf.SNP.to_list()
|
166
158
|
|
167
159
|
|
168
|
-
def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit=
|
169
|
-
|
170
|
-
PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])
|
160
|
+
def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit="CM"):
|
161
|
+
array_snps, array_indivs, geno_array = load_bfile(bfile_chr_prefix=f"{bfile_root}.{chrom}")
|
171
162
|
|
172
|
-
bfile = f'{bfile_root}.{chrom}'
|
173
|
-
snp_file, snp_obj = bfile + '.bim', PlinkBIMFile
|
174
|
-
array_snps = snp_obj(snp_file)
|
175
|
-
m = len(array_snps.IDList)
|
176
|
-
print(f'Read list of {m} SNPs from {snp_file}')
|
177
|
-
|
178
|
-
# Load fam
|
179
|
-
ind_file, ind_obj = bfile + '.fam', PlinkFAMFile
|
180
|
-
array_indivs = ind_obj(ind_file)
|
181
|
-
n = len(array_indivs.IDList)
|
182
|
-
print(f'Read list of {n} individuals from {ind_file}')
|
183
|
-
array_file, array_obj = bfile + '.bed', PlinkBEDFileWithR2Cache
|
184
|
-
geno_array = array_obj(array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None)
|
185
163
|
# Load the annotations of the baseline
|
186
|
-
if ld_unit ==
|
164
|
+
if ld_unit == "SNP":
|
187
165
|
max_dist = ld_wind
|
188
166
|
coords = np.array(range(geno_array.m))
|
189
|
-
elif ld_unit ==
|
167
|
+
elif ld_unit == "KB":
|
190
168
|
max_dist = ld_wind * 1000
|
191
|
-
coords = np.array(array_snps.df[
|
192
|
-
elif ld_unit ==
|
169
|
+
coords = np.array(array_snps.df["BP"])[geno_array.kept_snps]
|
170
|
+
elif ld_unit == "CM":
|
193
171
|
max_dist = ld_wind
|
194
|
-
coords = np.array(array_snps.df[
|
172
|
+
coords = np.array(array_snps.df["CM"])[geno_array.kept_snps]
|
195
173
|
else:
|
196
|
-
raise ValueError(f
|
174
|
+
raise ValueError(f"Invalid ld_wind_unit: {ld_unit}")
|
197
175
|
block_left = getBlockLefts(coords, max_dist)
|
198
176
|
# Calculate the LD score
|
199
177
|
lN_df = pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 100, annot=annot_matrix))
|
@@ -201,25 +179,31 @@ def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit='CM'):
|
|
201
179
|
|
202
180
|
|
203
181
|
# %%
|
204
|
-
def calculate_ldscore_from_annotation(
|
182
|
+
def calculate_ldscore_from_annotation(
|
183
|
+
SNP_annotation_df, chrom, bfile_root, ld_wind=1, ld_unit="CM"
|
184
|
+
):
|
205
185
|
"""
|
206
186
|
Calculate the SNP-gene weight matrix.
|
207
187
|
"""
|
208
188
|
# Get the dummy matrix
|
209
189
|
# Get the SNP-gene weight matrix
|
210
|
-
snp_gene_weight_matrix = get_ldscore(
|
211
|
-
|
190
|
+
snp_gene_weight_matrix = get_ldscore(
|
191
|
+
bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind, ld_unit=ld_unit
|
192
|
+
)
|
212
193
|
snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
|
213
194
|
snp_gene_weight_matrix.index = SNP_annotation_df.index
|
214
195
|
snp_gene_weight_matrix.columns = SNP_annotation_df.columns
|
215
196
|
return snp_gene_weight_matrix
|
216
197
|
|
217
198
|
|
218
|
-
def calculate_ldscore_from_multiple_annotation(
|
199
|
+
def calculate_ldscore_from_multiple_annotation(
|
200
|
+
SNP_annotation_df_list, chrom, bfile_root, ld_wind=1, ld_unit="CM"
|
201
|
+
):
|
219
202
|
SNP_annotation_df = pd.concat(SNP_annotation_df_list, axis=1).astype(np.float32, copy=False)
|
220
203
|
|
221
|
-
snp_gene_weight_matrix = get_ldscore(
|
222
|
-
|
204
|
+
snp_gene_weight_matrix = get_ldscore(
|
205
|
+
bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind, ld_unit=ld_unit
|
206
|
+
)
|
223
207
|
snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
|
224
208
|
snp_gene_weight_matrix.index = SNP_annotation_df.index
|
225
209
|
snp_gene_weight_matrix.columns = SNP_annotation_df.columns
|
@@ -229,7 +213,9 @@ def calculate_ldscore_from_multiple_annotation(SNP_annotation_df_list, chrom, bf
|
|
229
213
|
snp_gene_weight_matrix_list = []
|
230
214
|
start = 0
|
231
215
|
for snp_annotation_len in snp_annotation_len_list:
|
232
|
-
snp_gene_weight_matrix_list.append(
|
216
|
+
snp_gene_weight_matrix_list.append(
|
217
|
+
snp_gene_weight_matrix.iloc[:, start : start + snp_annotation_len]
|
218
|
+
)
|
233
219
|
start += snp_annotation_len
|
234
220
|
return snp_gene_weight_matrix_list
|
235
221
|
|
@@ -242,21 +228,28 @@ class S_LDSC_Boost:
|
|
242
228
|
self.mk_score = load_marker_score(config.mkscore_feather_path)
|
243
229
|
|
244
230
|
# Load GTF and get common markers
|
245
|
-
self.gtf_pr, self.mk_score_common = load_gtf(
|
246
|
-
|
231
|
+
self.gtf_pr, self.mk_score_common = load_gtf(
|
232
|
+
config.gtf_annotation_file, self.mk_score, window_size=config.gene_window_size
|
233
|
+
)
|
247
234
|
|
248
235
|
# Load enhancer
|
249
236
|
if config.enhancer_annotation_file is not None:
|
250
237
|
enhancer_df = pr.read_bed(config.enhancer_annotation_file, as_df=True)
|
251
|
-
enhancer_df.set_index(
|
252
|
-
enhancer_df.index.name =
|
238
|
+
enhancer_df.set_index("Name", inplace=True)
|
239
|
+
enhancer_df.index.name = "gene_name"
|
253
240
|
|
254
241
|
# keep the common genes and add the enhancer score
|
255
|
-
avg_mkscore = pd.DataFrame(self.mk_score_common.mean(axis=1), columns=[
|
256
|
-
enhancer_df = enhancer_df.join(
|
242
|
+
avg_mkscore = pd.DataFrame(self.mk_score_common.mean(axis=1), columns=["avg_mkscore"])
|
243
|
+
enhancer_df = enhancer_df.join(
|
244
|
+
avg_mkscore,
|
245
|
+
how="inner",
|
246
|
+
on="gene_name",
|
247
|
+
)
|
257
248
|
|
258
249
|
# add distance to TSS
|
259
|
-
enhancer_df[
|
250
|
+
enhancer_df["TSS"] = self.gtf_pr.df.set_index("gene_name").reindex(enhancer_df.index)[
|
251
|
+
"TSS"
|
252
|
+
]
|
260
253
|
|
261
254
|
# convert to pyranges
|
262
255
|
self.enhancer_pr = pr.PyRanges(enhancer_df.reset_index())
|
@@ -265,32 +258,39 @@ class S_LDSC_Boost:
|
|
265
258
|
self.enhancer_pr = None
|
266
259
|
|
267
260
|
# create tha zarr file
|
268
|
-
if config.ldscore_save_format ==
|
269
|
-
|
261
|
+
if config.ldscore_save_format == "zarr":
|
270
262
|
chrom_snp_length_dict = get_snp_counts(config)
|
271
|
-
self.chrom_snp_start_point = chrom_snp_length_dict[
|
263
|
+
self.chrom_snp_start_point = chrom_snp_length_dict["chrom_snp_start_point"]
|
272
264
|
|
273
|
-
zarr_path = Path(config.ldscore_save_dir) / f
|
265
|
+
zarr_path = Path(config.ldscore_save_dir) / f"{config.sample_name}.ldscore.zarr"
|
274
266
|
if not zarr_path.exists():
|
275
|
-
self.zarr_file = zarr.open(
|
276
|
-
|
277
|
-
|
267
|
+
self.zarr_file = zarr.open(
|
268
|
+
zarr_path.as_posix(),
|
269
|
+
mode="a",
|
270
|
+
dtype=np.float16,
|
271
|
+
chunks=config.zarr_chunk_size,
|
272
|
+
shape=(chrom_snp_length_dict["total"], self.mk_score_common.shape[1]),
|
273
|
+
)
|
278
274
|
zarr_path.mkdir(parents=True, exist_ok=True)
|
279
275
|
# save spot names
|
280
|
-
self.zarr_file.attrs[
|
276
|
+
self.zarr_file.attrs["spot_names"] = self.mk_score_common.columns.to_list()
|
281
277
|
# save chrom_snp_length_dict
|
282
|
-
self.zarr_file.attrs[
|
278
|
+
self.zarr_file.attrs["chrom_snp_start_point"] = self.chrom_snp_start_point
|
283
279
|
else:
|
284
|
-
self.zarr_file = zarr.open(zarr_path.as_posix(), mode=
|
280
|
+
self.zarr_file = zarr.open(zarr_path.as_posix(), mode="a")
|
285
281
|
|
286
282
|
def process_chromosome(self, chrom: int):
|
287
283
|
self.snp_pass_maf = get_snp_pass_maf(self.config.bfile_root, chrom, maf_min=0.05)
|
288
284
|
|
289
285
|
# Get SNP-Gene dummy pairs
|
290
|
-
self.snp_gene_pair_dummy = self.get_snp_gene_dummy(
|
286
|
+
self.snp_gene_pair_dummy = self.get_snp_gene_dummy(
|
287
|
+
chrom,
|
288
|
+
)
|
291
289
|
|
292
290
|
if self.config.keep_snp_root is not None:
|
293
|
-
keep_snp = pd.read_csv(f
|
291
|
+
keep_snp = pd.read_csv(f"{self.config.keep_snp_root}.{chrom}.snp", header=None)[
|
292
|
+
0
|
293
|
+
].to_list()
|
294
294
|
self.keep_snp_mask = self.snp_gene_pair_dummy.index.isin(keep_snp)
|
295
295
|
# the SNP name of keeped
|
296
296
|
self.snp_name = self.snp_gene_pair_dummy.index[self.keep_snp_mask].to_list()
|
@@ -300,25 +300,37 @@ class S_LDSC_Boost:
|
|
300
300
|
|
301
301
|
if self.config.additional_baseline_annotation is not None:
|
302
302
|
additional_baseline_annotation = Path(self.config.additional_baseline_annotation)
|
303
|
-
additional_baseline_annotation_file_path =
|
304
|
-
|
305
|
-
|
306
|
-
|
303
|
+
additional_baseline_annotation_file_path = (
|
304
|
+
additional_baseline_annotation / f"baseline.{chrom}.annot.gz"
|
305
|
+
)
|
306
|
+
assert additional_baseline_annotation_file_path.exists(), (
|
307
|
+
f"additional_baseline_annotation_file_path not exists: {additional_baseline_annotation_file_path}"
|
308
|
+
)
|
309
|
+
additional_baseline_annotation_df = pd.read_csv(
|
310
|
+
additional_baseline_annotation_file_path, sep="\t"
|
311
|
+
)
|
312
|
+
additional_baseline_annotation_df.set_index("SNP", inplace=True)
|
307
313
|
|
308
314
|
# drop these columns if exists CHR BP CM]
|
309
|
-
additional_baseline_annotation_df.drop(
|
315
|
+
additional_baseline_annotation_df.drop(
|
316
|
+
["CHR", "BP", "CM"], axis=1, inplace=True, errors="ignore"
|
317
|
+
)
|
310
318
|
|
311
319
|
# reindex, for those SNPs not in additional_baseline_annotation_df, set to 0
|
312
|
-
num_of_not_exist_snp = (
|
320
|
+
num_of_not_exist_snp = (
|
321
|
+
~self.snp_gene_pair_dummy.index.isin(additional_baseline_annotation_df.index)
|
322
|
+
).sum()
|
313
323
|
if num_of_not_exist_snp > 0:
|
314
324
|
logger.warning(
|
315
|
-
f
|
325
|
+
f"{num_of_not_exist_snp} SNPs not in additional_baseline_annotation_df but in the reference panel, so the additional baseline annotation of these SNP will set to 0"
|
326
|
+
)
|
316
327
|
additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
|
317
|
-
self.snp_gene_pair_dummy.index,
|
318
|
-
|
328
|
+
self.snp_gene_pair_dummy.index, fill_value=0
|
329
|
+
)
|
319
330
|
else:
|
320
331
|
additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
|
321
|
-
self.snp_gene_pair_dummy.index
|
332
|
+
self.snp_gene_pair_dummy.index
|
333
|
+
)
|
322
334
|
|
323
335
|
# do this for saving the cpu time, only calculate r2 once
|
324
336
|
self.snp_gene_weight_matrix, additional_baseline_annotation_ldscore = (
|
@@ -327,56 +339,85 @@ class S_LDSC_Boost:
|
|
327
339
|
chrom,
|
328
340
|
self.config.bfile_root,
|
329
341
|
ld_wind=self.config.ld_wind,
|
330
|
-
ld_unit=self.config.ld_unit
|
342
|
+
ld_unit=self.config.ld_unit,
|
343
|
+
)
|
344
|
+
)
|
331
345
|
|
332
|
-
additional_baseline_annotation_ldscore = additional_baseline_annotation_ldscore.loc[
|
346
|
+
additional_baseline_annotation_ldscore = additional_baseline_annotation_ldscore.loc[
|
347
|
+
self.snp_name
|
348
|
+
]
|
333
349
|
# print(additional_baseline_annotation_ldscore.index.to_list()==self.snp_name)
|
334
350
|
|
335
|
-
ld_score_file = f
|
336
|
-
M_file_path =
|
337
|
-
|
351
|
+
ld_score_file = f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather"
|
352
|
+
M_file_path = (
|
353
|
+
f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M"
|
354
|
+
)
|
355
|
+
M_5_file_path = (
|
356
|
+
f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50"
|
357
|
+
)
|
338
358
|
|
339
359
|
# save additional baseline annotation ldscore
|
340
|
-
self.save_ldscore_to_feather(
|
341
|
-
|
342
|
-
|
343
|
-
|
360
|
+
self.save_ldscore_to_feather(
|
361
|
+
additional_baseline_annotation_ldscore.values,
|
362
|
+
column_names=additional_baseline_annotation_ldscore.columns,
|
363
|
+
save_file_name=ld_score_file,
|
364
|
+
)
|
344
365
|
|
345
366
|
# caculate the M and save
|
346
367
|
save_dir = Path(M_file_path).parent
|
347
368
|
save_dir.mkdir(parents=True, exist_ok=True)
|
348
369
|
M_chr_chunk = additional_baseline_annotation_df.values.sum(axis=0, keepdims=True)
|
349
|
-
M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(
|
350
|
-
|
351
|
-
|
370
|
+
M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(
|
371
|
+
axis=0, keepdims=True
|
372
|
+
)
|
373
|
+
np.savetxt(
|
374
|
+
M_file_path,
|
375
|
+
M_chr_chunk,
|
376
|
+
delimiter="\t",
|
377
|
+
)
|
378
|
+
np.savetxt(
|
379
|
+
M_5_file_path,
|
380
|
+
M_5_chr_chunk,
|
381
|
+
delimiter="\t",
|
382
|
+
)
|
352
383
|
|
353
384
|
else:
|
354
385
|
# Calculate SNP-Gene weight matrix
|
355
|
-
self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(
|
356
|
-
|
357
|
-
|
358
|
-
|
386
|
+
self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(
|
387
|
+
self.snp_gene_pair_dummy,
|
388
|
+
chrom,
|
389
|
+
self.config.bfile_root,
|
390
|
+
ld_wind=self.config.ld_wind,
|
391
|
+
ld_unit=self.config.ld_unit,
|
392
|
+
)
|
359
393
|
# only keep the snp in keep_snp_root
|
360
394
|
if self.keep_snp_mask is not None:
|
361
395
|
self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]
|
362
396
|
|
363
397
|
if self.config.save_pre_calculate_snp_gene_weight_matrix:
|
364
|
-
snp_gene_weight_matrix_save_dir =
|
398
|
+
snp_gene_weight_matrix_save_dir = (
|
399
|
+
Path(self.config.ldscore_save_dir) / "snp_gene_weight_matrix"
|
400
|
+
)
|
365
401
|
snp_gene_weight_matrix_save_dir.mkdir(parents=True, exist_ok=True)
|
366
|
-
logger.info(f
|
402
|
+
logger.info(f"Saving snp_gene_weight_matrix for chr{chrom}...")
|
367
403
|
self.snp_gene_weight_matrix.reset_index().to_feather(
|
368
|
-
snp_gene_weight_matrix_save_dir / f
|
404
|
+
snp_gene_weight_matrix_save_dir / f"{chrom}.snp_gene_weight_matrix.feather"
|
405
|
+
)
|
369
406
|
|
370
407
|
# convert to sparse
|
371
408
|
self.snp_gene_weight_matrix = csr_matrix(self.snp_gene_weight_matrix)
|
372
|
-
logger.info(
|
409
|
+
logger.info(
|
410
|
+
f"Compute snp_gene_weight_matrix finished. shape: {self.snp_gene_weight_matrix.shape}"
|
411
|
+
)
|
373
412
|
|
374
413
|
# calculate baseline ld score
|
375
|
-
logger.info(f
|
376
|
-
self.calculate_ldscore_for_base_line(
|
414
|
+
logger.info(f"Calculating baseline ld score for chr{chrom}...")
|
415
|
+
self.calculate_ldscore_for_base_line(
|
416
|
+
chrom, self.config.sample_name, self.config.ldscore_save_dir
|
417
|
+
)
|
377
418
|
|
378
419
|
# calculate ld score for annotation
|
379
|
-
logger.info(f
|
420
|
+
logger.info(f"Calculating ld score for annotation for chr{chrom}...")
|
380
421
|
self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
|
381
422
|
self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]],
|
382
423
|
chrom,
|
@@ -384,11 +425,11 @@ class S_LDSC_Boost:
|
|
384
425
|
self.config.ldscore_save_dir,
|
385
426
|
)
|
386
427
|
|
387
|
-
def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
428
|
+
def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
|
429
|
+
self,
|
430
|
+
mk_score_chunk,
|
431
|
+
drop_dummy_na=True,
|
432
|
+
):
|
392
433
|
if drop_dummy_na:
|
393
434
|
ldscore_chr_chunk = self.snp_gene_weight_matrix[:, :-1] @ mk_score_chunk
|
394
435
|
else:
|
@@ -407,16 +448,20 @@ class S_LDSC_Boost:
|
|
407
448
|
# self.keep_snp_mask]
|
408
449
|
|
409
450
|
# save for each chunk
|
410
|
-
df = pd.DataFrame(
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
451
|
+
df = pd.DataFrame(
|
452
|
+
ldscore_chr_chunk,
|
453
|
+
index=self.snp_name,
|
454
|
+
columns=column_names,
|
455
|
+
)
|
456
|
+
df.index.name = "SNP"
|
415
457
|
df.reset_index().to_feather(save_file_name)
|
416
458
|
|
417
|
-
def save_ldscore_chunk_to_zarr(
|
418
|
-
|
419
|
-
|
459
|
+
def save_ldscore_chunk_to_zarr(
|
460
|
+
self,
|
461
|
+
ldscore_chr_chunk: np.ndarray,
|
462
|
+
chrom: int,
|
463
|
+
start_col_index,
|
464
|
+
):
|
420
465
|
ldscore_chr_chunk = ldscore_chr_chunk.astype(np.float16, copy=False)
|
421
466
|
# avoid overflow of float16, if inf, set to max of float16
|
422
467
|
ldscore_chr_chunk[np.isinf(ldscore_chr_chunk)] = np.finfo(np.float16).max
|
@@ -425,63 +470,90 @@ class S_LDSC_Boost:
|
|
425
470
|
chrom_snp_start_point = self.chrom_snp_start_point[chrom - 1]
|
426
471
|
chrom_snp_end_point = self.chrom_snp_start_point[chrom]
|
427
472
|
|
428
|
-
self.zarr_file[
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
473
|
+
self.zarr_file[
|
474
|
+
chrom_snp_start_point:chrom_snp_end_point,
|
475
|
+
start_col_index : start_col_index + ldscore_chr_chunk.shape[1],
|
476
|
+
] = ldscore_chr_chunk
|
477
|
+
|
478
|
+
def calculate_M_use_SNP_gene_pair_dummy_by_chunk(
|
479
|
+
self,
|
480
|
+
mk_score_chunk,
|
481
|
+
M_file_path,
|
482
|
+
M_5_file_path,
|
483
|
+
drop_dummy_na=True,
|
484
|
+
):
|
485
|
+
"""
|
486
|
+
Calculate M use SNP_gene_pair_dummy_sumed_along_snp_axis and mk_score_chunk
|
487
|
+
"""
|
488
|
+
SNP_gene_pair_dummy_sumed_along_snp_axis = self.snp_gene_pair_dummy.values.sum(
|
489
|
+
axis=0, keepdims=True
|
490
|
+
)
|
491
|
+
SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf = self.snp_gene_pair_dummy.loc[
|
492
|
+
self.snp_pass_maf
|
493
|
+
].values.sum(axis=0, keepdims=True)
|
443
494
|
if drop_dummy_na:
|
444
|
-
SNP_gene_pair_dummy_sumed_along_snp_axis = SNP_gene_pair_dummy_sumed_along_snp_axis[
|
445
|
-
|
446
|
-
|
495
|
+
SNP_gene_pair_dummy_sumed_along_snp_axis = SNP_gene_pair_dummy_sumed_along_snp_axis[
|
496
|
+
:, :-1
|
497
|
+
]
|
498
|
+
SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf = (
|
499
|
+
SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf[:, :-1]
|
500
|
+
)
|
447
501
|
save_dir = Path(M_file_path).parent
|
448
502
|
save_dir.mkdir(parents=True, exist_ok=True)
|
449
503
|
M_chr_chunk = SNP_gene_pair_dummy_sumed_along_snp_axis @ mk_score_chunk
|
450
504
|
M_5_chr_chunk = SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf @ mk_score_chunk
|
451
|
-
np.savetxt(
|
452
|
-
|
505
|
+
np.savetxt(
|
506
|
+
M_file_path,
|
507
|
+
M_chr_chunk,
|
508
|
+
delimiter="\t",
|
509
|
+
)
|
510
|
+
np.savetxt(
|
511
|
+
M_5_file_path,
|
512
|
+
M_5_chr_chunk,
|
513
|
+
delimiter="\t",
|
514
|
+
)
|
453
515
|
|
454
|
-
def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
|
516
|
+
def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
|
517
|
+
self, mk_score_common, chrom, sample_name, save_dir
|
518
|
+
):
|
455
519
|
"""
|
456
520
|
Calculate the LD score using the SNP-gene weight matrix.
|
457
521
|
:param sample_name:
|
458
522
|
"""
|
459
523
|
# Calculate the LD score
|
460
524
|
chunk_index = 1
|
461
|
-
for i in trange(
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
525
|
+
for i in trange(
|
526
|
+
0,
|
527
|
+
mk_score_common.shape[1],
|
528
|
+
self.config.spots_per_chunk,
|
529
|
+
desc=f"Calculating LD score by chunk for chr{chrom}",
|
530
|
+
):
|
531
|
+
mk_score_chunk = mk_score_common.iloc[:, i : i + self.config.spots_per_chunk]
|
532
|
+
|
533
|
+
ld_score_file = f"{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.ldscore.feather"
|
534
|
+
M_file = f"{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M"
|
535
|
+
M_5_file = (
|
536
|
+
f"{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50"
|
537
|
+
)
|
468
538
|
|
469
539
|
ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
|
470
540
|
mk_score_chunk,
|
471
541
|
drop_dummy_na=True,
|
472
542
|
)
|
473
|
-
if self.config.ldscore_save_format ==
|
474
|
-
self.save_ldscore_to_feather(
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
543
|
+
if self.config.ldscore_save_format == "feather":
|
544
|
+
self.save_ldscore_to_feather(
|
545
|
+
ldscore_chr_chunk,
|
546
|
+
column_names=mk_score_chunk.columns,
|
547
|
+
save_file_name=ld_score_file,
|
548
|
+
)
|
549
|
+
elif self.config.ldscore_save_format == "zarr":
|
550
|
+
self.save_ldscore_chunk_to_zarr(
|
551
|
+
ldscore_chr_chunk,
|
552
|
+
chrom=chrom,
|
553
|
+
start_col_index=i,
|
554
|
+
)
|
483
555
|
else:
|
484
|
-
raise ValueError(f
|
556
|
+
raise ValueError(f"Invalid ldscore_save_format: {self.config.ldscore_save_format}")
|
485
557
|
|
486
558
|
self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
|
487
559
|
mk_score_chunk,
|
@@ -496,21 +568,23 @@ class S_LDSC_Boost:
|
|
496
568
|
# save baseline ld score
|
497
569
|
baseline_mk_score = np.ones((self.snp_gene_pair_dummy.shape[1], 2))
|
498
570
|
baseline_mk_score[-1, 0] = 0 # all_gene
|
499
|
-
baseline_mk_score_df = pd.DataFrame(
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
571
|
+
baseline_mk_score_df = pd.DataFrame(
|
572
|
+
baseline_mk_score, index=self.snp_gene_pair_dummy.columns, columns=["all_gene", "base"]
|
573
|
+
)
|
574
|
+
ld_score_file = f"{save_dir}/baseline/baseline.{chrom}.l2.ldscore.feather"
|
575
|
+
M_file = f"{save_dir}/baseline/baseline.{chrom}.l2.M"
|
576
|
+
M_5_file = f"{save_dir}/baseline/baseline.{chrom}.l2.M_5_50"
|
504
577
|
|
505
578
|
ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
|
506
579
|
baseline_mk_score_df,
|
507
580
|
drop_dummy_na=False,
|
508
581
|
)
|
509
582
|
|
510
|
-
self.save_ldscore_to_feather(
|
511
|
-
|
512
|
-
|
513
|
-
|
583
|
+
self.save_ldscore_to_feather(
|
584
|
+
ldscore_chr_chunk,
|
585
|
+
column_names=baseline_mk_score_df.columns,
|
586
|
+
save_file_name=ld_score_file,
|
587
|
+
)
|
514
588
|
# save baseline M
|
515
589
|
self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
|
516
590
|
baseline_mk_score_df,
|
@@ -519,7 +593,10 @@ class S_LDSC_Boost:
|
|
519
593
|
drop_dummy_na=False,
|
520
594
|
)
|
521
595
|
|
522
|
-
def get_snp_gene_dummy(
|
596
|
+
def get_snp_gene_dummy(
|
597
|
+
self,
|
598
|
+
chrom,
|
599
|
+
):
|
523
600
|
"""
|
524
601
|
Get the dummy matrix of SNP-gene pairs.
|
525
602
|
"""
|
@@ -527,91 +604,126 @@ class S_LDSC_Boost:
|
|
527
604
|
print("Loading bim data")
|
528
605
|
bim, bim_pr = load_bim(self.config.bfile_root, chrom)
|
529
606
|
|
530
|
-
if self.config.gene_window_enhancer_priority in [
|
531
|
-
|
532
|
-
|
533
|
-
|
607
|
+
if self.config.gene_window_enhancer_priority in ["gene_window_first", "enhancer_first"]:
|
608
|
+
SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(
|
609
|
+
bim,
|
610
|
+
bim_pr,
|
611
|
+
)
|
612
|
+
SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(
|
613
|
+
bim,
|
614
|
+
bim_pr,
|
615
|
+
)
|
534
616
|
# total_SNP_gene_pair = SNP_gene_pair_gtf.join(SNP_gene_pair_enhancer, how='outer', lsuffix='_gtf', )
|
535
617
|
|
536
618
|
mask_of_nan_gtf = SNP_gene_pair_gtf.gene_name.isna()
|
537
619
|
mask_of_nan_enhancer = SNP_gene_pair_enhancer.gene_name.isna()
|
538
620
|
|
539
|
-
if self.config.gene_window_enhancer_priority ==
|
621
|
+
if self.config.gene_window_enhancer_priority == "gene_window_first":
|
540
622
|
SNP_gene_pair = SNP_gene_pair_gtf
|
541
|
-
SNP_gene_pair.loc[mask_of_nan_gtf,
|
542
|
-
mask_of_nan_gtf,
|
543
|
-
|
623
|
+
SNP_gene_pair.loc[mask_of_nan_gtf, "gene_name"] = SNP_gene_pair_enhancer.loc[
|
624
|
+
mask_of_nan_gtf, "gene_name"
|
625
|
+
]
|
626
|
+
elif self.config.gene_window_enhancer_priority == "enhancer_first":
|
544
627
|
SNP_gene_pair = SNP_gene_pair_enhancer
|
545
|
-
SNP_gene_pair.loc[mask_of_nan_enhancer,
|
546
|
-
mask_of_nan_enhancer,
|
628
|
+
SNP_gene_pair.loc[mask_of_nan_enhancer, "gene_name"] = SNP_gene_pair_gtf.loc[
|
629
|
+
mask_of_nan_enhancer, "gene_name"
|
630
|
+
]
|
547
631
|
else:
|
548
632
|
raise ValueError(
|
549
|
-
f
|
633
|
+
f"Invalid self.config.gene_window_enhancer_priority: {self.config.gene_window_enhancer_priority}"
|
634
|
+
)
|
550
635
|
|
551
636
|
elif self.config.gene_window_enhancer_priority is None: # use gtf only
|
552
|
-
SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(
|
637
|
+
SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(
|
638
|
+
bim,
|
639
|
+
bim_pr,
|
640
|
+
)
|
553
641
|
SNP_gene_pair = SNP_gene_pair_gtf
|
554
642
|
|
555
|
-
elif self.config.gene_window_enhancer_priority ==
|
556
|
-
SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(
|
643
|
+
elif self.config.gene_window_enhancer_priority == "enhancer_only":
|
644
|
+
SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(
|
645
|
+
bim,
|
646
|
+
bim_pr,
|
647
|
+
)
|
557
648
|
SNP_gene_pair = SNP_gene_pair_enhancer
|
558
649
|
else:
|
559
|
-
raise ValueError(
|
650
|
+
raise ValueError("gtf_pr and enhancer_pr cannot be None at the same time")
|
560
651
|
|
561
652
|
# save the SNP_gene_pair to feather
|
562
|
-
SNP_gene_pair_save_path =
|
563
|
-
self.config.ldscore_save_dir) / f
|
653
|
+
SNP_gene_pair_save_path = (
|
654
|
+
Path(self.config.ldscore_save_dir) / f"SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather"
|
655
|
+
)
|
564
656
|
SNP_gene_pair_save_path.parent.mkdir(parents=True, exist_ok=True)
|
565
657
|
SNP_gene_pair.reset_index().to_feather(SNP_gene_pair_save_path)
|
566
658
|
|
567
659
|
# Get the dummy matrix
|
568
|
-
SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair[
|
660
|
+
SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair["gene_name"], dummy_na=True)
|
569
661
|
return SNP_gene_pair_dummy
|
570
662
|
|
571
663
|
def get_SNP_gene_pair_from_gtf(self, bim, bim_pr):
|
572
664
|
logger.info(
|
573
|
-
"Get SNP-gene pair from gtf, if a SNP is in multiple genes, it will be assigned to the most nearby gene (TSS)"
|
665
|
+
"Get SNP-gene pair from gtf, if a SNP is in multiple genes, it will be assigned to the most nearby gene (TSS)"
|
666
|
+
)
|
574
667
|
overlaps_small = Overlaps_gtf_bim(self.gtf_pr, bim_pr)
|
575
668
|
# Get the SNP-gene pair
|
576
669
|
annot = bim[["CHR", "BP", "SNP", "CM"]]
|
577
|
-
SNP_gene_pair =
|
670
|
+
SNP_gene_pair = (
|
671
|
+
overlaps_small[["SNP", "gene_name"]]
|
672
|
+
.set_index("SNP")
|
673
|
+
.join(annot.set_index("SNP"), how="right")
|
674
|
+
)
|
578
675
|
return SNP_gene_pair
|
579
676
|
|
580
|
-
def get_SNP_gene_pair_from_enhancer(
|
677
|
+
def get_SNP_gene_pair_from_enhancer(
|
678
|
+
self,
|
679
|
+
bim,
|
680
|
+
bim_pr,
|
681
|
+
):
|
581
682
|
logger.info(
|
582
|
-
"Get SNP-gene pair from enhancer, if a SNP is in multiple genes, it will be assigned to the gene with highest marker score"
|
683
|
+
"Get SNP-gene pair from enhancer, if a SNP is in multiple genes, it will be assigned to the gene with highest marker score"
|
684
|
+
)
|
583
685
|
# Get the SNP-gene pair
|
584
686
|
overlaps_small = self.enhancer_pr.join(bim_pr).df
|
585
687
|
annot = bim[["CHR", "BP", "SNP", "CM"]]
|
586
|
-
if self.config.snp_multiple_enhancer_strategy ==
|
587
|
-
logger.debug(
|
588
|
-
overlaps_small = overlaps_small.loc[overlaps_small.groupby(
|
589
|
-
|
590
|
-
elif self.config.snp_multiple_enhancer_strategy ==
|
591
|
-
logger.debug(
|
592
|
-
overlaps_small[
|
593
|
-
overlaps_small = overlaps_small.loc[overlaps_small.groupby(
|
594
|
-
|
595
|
-
SNP_gene_pair =
|
688
|
+
if self.config.snp_multiple_enhancer_strategy == "max_mkscore":
|
689
|
+
logger.debug("select the gene with highest marker score")
|
690
|
+
overlaps_small = overlaps_small.loc[overlaps_small.groupby("SNP").avg_mkscore.idxmax()]
|
691
|
+
|
692
|
+
elif self.config.snp_multiple_enhancer_strategy == "nearest_TSS":
|
693
|
+
logger.debug("select the gene with nearest TSS")
|
694
|
+
overlaps_small["Distance"] = np.abs(overlaps_small["Start_b"] - overlaps_small["TSS"])
|
695
|
+
overlaps_small = overlaps_small.loc[overlaps_small.groupby("SNP").Distance.idxmin()]
|
696
|
+
|
697
|
+
SNP_gene_pair = (
|
698
|
+
overlaps_small[["SNP", "gene_name"]]
|
699
|
+
.set_index("SNP")
|
700
|
+
.join(annot.set_index("SNP"), how="right")
|
701
|
+
)
|
596
702
|
|
597
703
|
return SNP_gene_pair
|
598
704
|
|
599
705
|
|
600
706
|
def run_generate_ldscore(config: GenerateLDScoreConfig):
|
601
|
-
if config.ldscore_save_format ==
|
602
|
-
logger.info(
|
707
|
+
if config.ldscore_save_format == "quick_mode":
|
708
|
+
logger.info(
|
709
|
+
"Running in quick_mode. Skip the process of generating ldscore. Using the pre-calculated ldscore."
|
710
|
+
)
|
603
711
|
ldscore_save_dir = config.ldscore_save_dir
|
604
712
|
|
605
713
|
# link the baseline annotation
|
606
714
|
baseline_annotation_dir = Path(config.baseline_annotation_dir)
|
607
|
-
(ldscore_save_dir /
|
715
|
+
(ldscore_save_dir / "baseline").symlink_to(
|
716
|
+
baseline_annotation_dir, target_is_directory=True
|
717
|
+
)
|
608
718
|
|
609
719
|
# link the SNP_gene_pair
|
610
720
|
SNP_gene_pair_dir = Path(config.SNP_gene_pair_dir)
|
611
|
-
(ldscore_save_dir /
|
721
|
+
(ldscore_save_dir / "SNP_gene_pair").symlink_to(
|
722
|
+
SNP_gene_pair_dir, target_is_directory=True
|
723
|
+
)
|
612
724
|
return
|
613
725
|
s_ldsc_boost = S_LDSC_Boost(config)
|
614
|
-
if config.chrom ==
|
726
|
+
if config.chrom == "all":
|
615
727
|
for chrom in range(1, 23):
|
616
728
|
s_ldsc_boost.process_chromosome(chrom)
|
617
729
|
else:
|