gsMap 1.62-py3-none-any.whl → 1.63-py3-none-any.whl
This diff shows the content of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- gsMap/GNN_VAE/adjacency_matrix.py +1 -1
- gsMap/GNN_VAE/model.py +5 -5
- gsMap/GNN_VAE/train.py +1 -1
- gsMap/__init__.py +1 -1
- gsMap/cauchy_combination_test.py +14 -36
- gsMap/config.py +473 -404
- gsMap/diagnosis.py +273 -0
- gsMap/find_latent_representation.py +22 -86
- gsMap/format_sumstats.py +79 -82
- gsMap/generate_ldscore.py +145 -78
- gsMap/latent_to_gene.py +65 -104
- gsMap/main.py +1 -9
- gsMap/report.py +160 -0
- gsMap/run_all_mode.py +195 -0
- gsMap/spatial_ldsc_multiple_sumstats.py +187 -112
- gsMap/templates/report_template.html +198 -0
- gsMap/utils/__init__.py +0 -0
- gsMap/{generate_r2_matrix.py → utils/generate_r2_matrix.py} +1 -9
- gsMap/{make_annotations.py → utils/make_annotations.py} +1 -43
- gsMap/utils/manhattan_plot.py +639 -0
- gsMap/{regression_read.py → utils/regression_read.py} +1 -1
- gsMap/visualize.py +100 -55
- {gsmap-1.62.dist-info → gsmap-1.63.dist-info}/METADATA +16 -46
- gsmap-1.63.dist-info/RECORD +30 -0
- gsmap-1.62.dist-info/RECORD +0 -24
- /gsMap/{jackknife.py → utils/jackknife.py} +0 -0
- {gsmap-1.62.dist-info → gsmap-1.63.dist-info}/LICENSE +0 -0
- {gsmap-1.62.dist-info → gsmap-1.63.dist-info}/WHEEL +0 -0
- {gsmap-1.62.dist-info → gsmap-1.63.dist-info}/entry_points.txt +0 -0
gsMap/spatial_ldsc_multiple_sumstats.py
@@ -1,19 +1,19 @@
-import
-import numpy as np
-import pandas as pd
-
-import argparse
+import gc
 import logging
-import
+import os
 from collections import defaultdict
 from pathlib import Path
 
+import anndata as ad
+import numpy as np
+import pandas as pd
+import zarr
 from scipy.stats import norm
-from tqdm.contrib.concurrent import
+from tqdm.contrib.concurrent import thread_map
 
-import gsMap.jackknife as jk
-from gsMap.config import
-from gsMap.regression_read import _read_sumstats, _read_w_ld, _read_ref_ld_v2
+import gsMap.utils.jackknife as jk
+from gsMap.config import SpatialLDSCConfig
+from gsMap.utils.regression_read import _read_sumstats, _read_w_ld, _read_ref_ld_v2
 
 logger = logging.getLogger(__name__)
 
@@ -21,7 +21,8 @@ logger = logging.getLogger(__name__)
 # %%
 def _coef_new(jknife):
     # return coef[0], coef_se[0], z[0]]
-    est_ = jknife.est[0, 0] / Nbar
+    # est_ = jknife.est[0, 0] / Nbar
+    est_ = jknife.jknife_est[0, 0] / Nbar
     se_ = jknife.jknife_se[0, 0] / Nbar
     return est_, se_
 
@@ -68,16 +69,19 @@ def weights(ld, w_ld, N, M, hsq, intercept=1):
 
 def jackknife_for_processmap(spot_id):
     # calculate the initial weight for each spot
+    spot_spatial_annotation = spatial_annotation[:, spot_id]
+    spot_x_tot_precomputed = spot_spatial_annotation + ref_ld_baseline_column_sum
     initial_w = (
-        get_weight_optimized(sumstats,
+        get_weight_optimized(sumstats, x_tot_precomputed=spot_x_tot_precomputed,
+                             M_tot=10000, w_ld=w_ld_common_snp, intercept=1)
         .astype(np.float32)
         .reshape((-1, 1)))
 
     # apply the weight to baseline annotation, spatial annotation and CHISQ
     initial_w_scaled = initial_w / np.sum(initial_w)
     baseline_annotation_spot = baseline_annotation * initial_w_scaled
-    spatial_annotation_spot =
-    CHISQ = sumstats.chisq.
+    spatial_annotation_spot = spot_spatial_annotation.reshape((-1, 1)) * initial_w_scaled
+    CHISQ = sumstats.chisq.values.reshape((-1, 1))
    y = CHISQ * initial_w_scaled
 
     # run the jackknife
@@ -113,6 +117,9 @@ def _preprocess_sumstats(trait_name, sumstat_file_path, baseline_and_w_ld_common
         logger.warning(f'WARNING: number of SNPs less than 200k; for {trait_name} this is almost always bad.')
 
     sumstats = sumstats.loc[common_snp]
+
+    # get the common index position of baseline_and_w_ld_common_snp for quick access
+    sumstats['common_index_pos'] = pd.Index(baseline_and_w_ld_common_snp).get_indexer(sumstats.index)
     return sumstats
 
 
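The new `common_index_pos` column above is built with `pandas.Index.get_indexer`, the same trick used later for `common_snp_among_all_sumstats_pos`: it maps SNP labels to their integer positions inside the larger baseline/w_ld index so rows can later be fetched positionally instead of by label. A minimal illustration with made-up SNP IDs, not gsMap data:

```python
import pandas as pd

# Toy reference index of SNPs and a subset whose positions we want.
baseline_index = pd.Index(['rs1', 'rs2', 'rs3', 'rs4', 'rs5'])
sumstats_index = pd.Index(['rs2', 'rs4', 'rs5'])

# get_indexer returns integer positions (-1 would mark labels not found).
positions = baseline_index.get_indexer(sumstats_index)
print(positions)  # [1 3 4] -- usable with .iloc or plain NumPy indexing
```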
@@ -132,8 +139,77 @@ def _get_sumstats_from_sumstats_dict(sumstats_config_dict: dict, baseline_and_w_
     return sumstats_cleaned_dict
 
 
+class S_LDSC_Boost_with_pre_calculate_SNP_Gene_weight_matrix:
+    def __init__(self, config: SpatialLDSCConfig, common_snp_among_all_sumstats_pos):
+        self.config = config
+        mk_score = pd.read_feather(config.mkscore_feather_path).set_index('HUMAN_GENE_SYM')
+        mk_score_genes = mk_score.index
+
+        snp_gene_weight_adata = ad.read_h5ad(config.snp_gene_weight_adata_path)
+        common_genes = mk_score_genes.intersection(snp_gene_weight_adata.var.index)
+        common_snps = snp_gene_weight_adata.obs.index
+        # self.snp_gene_weight_adata = snp_gene_weight_adata[common_snp_among_all_sumstats:, common_genes.to_list()]
+        self.snp_gene_weight_matrix = snp_gene_weight_adata[common_snp_among_all_sumstats_pos, common_genes.to_list()].X
+        self.mk_score_common = mk_score.loc[common_genes]
+
+        # calculate the chunk number
+        self.chunk_starts = list(range(0, self.mk_score_common.shape[1], self.config.spots_per_chunk_quick_mode))
+
+    def fetch_ldscore_by_chunk(self, chunk_index):
+        chunk_start = self.chunk_starts[chunk_index]
+        mk_score_chunk = self.mk_score_common.iloc[:,
+                         chunk_start:chunk_start + self.config.spots_per_chunk_quick_mode]
+        ldscore_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
+            mk_score_chunk,
+            drop_dummy_na=False,
+        )
+
+        spots_name = self.mk_score_common.columns[chunk_start:chunk_start + self.config.spots_per_chunk_quick_mode]
+        return ldscore_chunk, spots_name
+
+    def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(self,
+                                                              mk_score_chunk,
+                                                              drop_dummy_na=True,
+                                                              ):
+
+        if drop_dummy_na:
+            ldscore_chr_chunk = self.snp_gene_weight_matrix[:, :-1] @ mk_score_chunk
+        else:
+            ldscore_chr_chunk = self.snp_gene_weight_matrix @ mk_score_chunk
+
+        return ldscore_chr_chunk
+
+
+def _get_sumstats_with_common_snp_from_sumstats_dict(sumstats_config_dict: dict, baseline_and_w_ld_common_snp: pd.Index,
+                                                     chisq_max=None):
+    # first validate if all sumstats file exists
+    logger.info('Validating sumstats files...')
+    for trait_name, sumstat_file_path in sumstats_config_dict.items():
+        if not os.path.exists(sumstat_file_path):
+            raise FileNotFoundError(f'{sumstat_file_path} not found')
+    # then load all sumstats
+    sumstats_cleaned_dict = {}
+    for trait_name, sumstat_file_path in sumstats_config_dict.items():
+        sumstats_cleaned_dict[trait_name] = _preprocess_sumstats(trait_name, sumstat_file_path,
+                                                                 baseline_and_w_ld_common_snp, chisq_max)
+    # get the common snps among all sumstats
+    common_snp_among_all_sumstats = None
+    for trait_name, sumstats in sumstats_cleaned_dict.items():
+        if common_snp_among_all_sumstats is None:
+            common_snp_among_all_sumstats = sumstats.index
+        else:
+            common_snp_among_all_sumstats = common_snp_among_all_sumstats.intersection(sumstats.index)
+
+    # filter the common snps among all sumstats
+    for trait_name, sumstats in sumstats_cleaned_dict.items():
+        sumstats_cleaned_dict[trait_name] = sumstats.loc[common_snp_among_all_sumstats]
+
+    logger.info(f'!Common SNPs among all sumstats: {len(common_snp_among_all_sumstats)}')
+    return sumstats_cleaned_dict, common_snp_among_all_sumstats
+
+
 def run_spatial_ldsc(config: SpatialLDSCConfig):
-    global spatial_annotation, baseline_annotation, n_blocks, Nbar, sumstats,
+    global spatial_annotation, baseline_annotation, n_blocks, Nbar, sumstats, ref_ld_baseline_column_sum, w_ld_common_snp
     # config
     n_blocks = config.n_blocks
     sample_name = config.sample_name
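The `quick_mode` class added above skips per-chunk LD score files entirely: LD scores for a block of spots are obtained by multiplying a precomputed SNP-by-gene weight matrix with the matching gene-by-spot slice of the marker score matrix. Below is a rough NumPy sketch of that chunked matrix product with arbitrary toy dimensions; the variable names only mirror the diff, and nothing here touches gsMap's real inputs:

```python
import numpy as np

rng = np.random.default_rng(0)
n_snps, n_genes, n_spots = 1_000, 50, 230
spots_per_chunk = 100  # plays the role of spots_per_chunk_quick_mode

snp_gene_weight = rng.random((n_snps, n_genes))  # stands in for snp_gene_weight_adata.X
mk_score = rng.random((n_genes, n_spots))        # gene specificity scores per spot

for chunk_start in range(0, n_spots, spots_per_chunk):
    mk_score_chunk = mk_score[:, chunk_start:chunk_start + spots_per_chunk]
    # SNP x spot LD scores for this chunk of spots
    ldscore_chunk = snp_gene_weight @ mk_score_chunk
    assert ldscore_chunk.shape == (n_snps, mk_score_chunk.shape[1])
```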
@@ -144,72 +220,107 @@ def run_spatial_ldsc(config: SpatialLDSCConfig):
     w_ld_cname = w_ld.columns[1]
     w_ld.set_index('SNP', inplace=True)
 
-
-
+    ld_file_baseline = f'{config.ldscore_save_dir}/baseline/baseline.'
+
     ref_ld_baseline = _read_ref_ld_v2(ld_file_baseline)
-    n_annot_baseline = len(ref_ld_baseline.columns)
-    M_annot_baseline = _read_M_v2(ld_file_baseline, n_annot_baseline, config.not_M_5_50)
+    # n_annot_baseline = len(ref_ld_baseline.columns)
+    # M_annot_baseline = _read_M_v2(ld_file_baseline, n_annot_baseline, config.not_M_5_50)
 
     # common snp between baseline and w_ld
     baseline_and_w_ld_common_snp = ref_ld_baseline.index.intersection(w_ld.index)
-
-
-
+    baseline_and_w_ld_common_snp_pos = pd.Index(ref_ld_baseline.index).get_indexer(baseline_and_w_ld_common_snp)
+
+    # Clean the sumstats
+    sumstats_cleaned_dict, common_snp_among_all_sumstats = _get_sumstats_with_common_snp_from_sumstats_dict(
+        config.sumstats_config_dict, baseline_and_w_ld_common_snp,
+        chisq_max=config.chisq_max)
+    common_snp_among_all_sumstats_pos = ref_ld_baseline.index.get_indexer(common_snp_among_all_sumstats)
+
+    # insure the order is monotonic
+    assert pd.Series(
+        common_snp_among_all_sumstats_pos).is_monotonic_increasing, 'common_snp_among_all_sumstats_pos is not monotonic increasing'
+
+    if len(common_snp_among_all_sumstats) < 200000:
+        logger.warning(
+            f'!!!!! WARNING: number of SNPs less than 200k; for {sample_name} this is almost always bad. Please check the sumstats files.')
+
+    ref_ld_baseline = ref_ld_baseline.loc[common_snp_among_all_sumstats]
+    w_ld = w_ld.loc[common_snp_among_all_sumstats]
 
     # load additional baseline annotations
     if config.use_additional_baseline_annotation:
-
+        print('Using additional baseline annotations')
+        ld_file_baseline_additional = f'{config.ldscore_save_dir}/additional_baseline/baseline.'
         ref_ld_baseline_additional = _read_ref_ld_v2(ld_file_baseline_additional)
         n_annot_baseline_additional = len(ref_ld_baseline_additional.columns)
         logger.info(f'{len(ref_ld_baseline_additional.columns)} additional baseline annotations loaded')
         # M_annot_baseline_additional = _read_M_v2(ld_file_baseline_additional, n_annot_baseline_additional,
         #                                          config.not_M_5_50)
-        ref_ld_baseline_additional = ref_ld_baseline_additional.loc[
+        ref_ld_baseline_additional = ref_ld_baseline_additional.loc[common_snp_among_all_sumstats]
         ref_ld_baseline = pd.concat([ref_ld_baseline, ref_ld_baseline_additional], axis=1)
         del ref_ld_baseline_additional
 
-
-
-
-
-
+    # Detect available chunk files
+    if config.ldscore_save_format == 'quick_mode':
+        s_ldsc = S_LDSC_Boost_with_pre_calculate_SNP_Gene_weight_matrix(config, common_snp_among_all_sumstats_pos)
+        total_chunk_number_found = len(s_ldsc.chunk_starts)
+        print(f'Split data into {total_chunk_number_found} chunks')
+    else:
+        all_file = os.listdir(config.ldscore_save_dir)
+        total_chunk_number_found = sum('chunk' in name for name in all_file)
+        print(f'Find {total_chunk_number_found} chunked files in {config.ldscore_save_dir}')
 
-    # Detect avalable chunk files
-    all_file = os.listdir(config.ldscore_input_dir)
     if config.all_chunk is None:
-
-
-
+        if config.chunk_range is not None:
+            assert config.chunk_range[0] >= 1 and config.chunk_range[
+                1] <= total_chunk_number_found, 'Chunk range out of bound. It should be in [1, all_chunk]'
+            print(
+                f'chunk range provided, using chunked files from {config.chunk_range[0]} to {config.chunk_range[1]}')
+            start_chunk, end_chunk = config.chunk_range
+        else:
+            start_chunk, end_chunk = 1, total_chunk_number_found
     else:
         all_chunk = config.all_chunk
         print(f'using {all_chunk} chunked files by provided argument')
         print(f'\t')
         print(f'Input {all_chunk} chunked files')
+        start_chunk, end_chunk = 1, all_chunk
+
+    running_chunk_number = end_chunk - start_chunk + 1
 
     # Process each chunk
     output_dict = defaultdict(list)
-
-
-
-
-
-
-
-
+    zarr_path = Path(config.ldscore_save_dir) / f'{config.sample_name}.ldscore.zarr'
+    if config.ldscore_save_format == 'zarr':
+        assert zarr_path.exists(), f'{zarr_path} not found, which is required for zarr format'
+        zarr_file = zarr.open(str(zarr_path))
+        spots_name = zarr_file.attrs['spot_names']
+
+    for chunk_index in range(start_chunk, end_chunk + 1):
+        if config.ldscore_save_format == 'feather':
+            ref_ld_spatial, spatial_annotation_cnames = load_ldscore_chunk_from_feather(chunk_index,
+                                                                                        common_snp_among_all_sumstats_pos,
+                                                                                        config,
+                                                                                        )
+        elif config.ldscore_save_format == 'zarr':
+            ref_ld_spatial = zarr_file.blocks[:, chunk_index - 1][common_snp_among_all_sumstats_pos]
+            start_spot = (chunk_index - 1) * zarr_file.chunks[1]
+            ref_ld_spatial = ref_ld_spatial.astype(np.float32, copy=False)
+            spatial_annotation_cnames = spots_name[start_spot:start_spot + zarr_file.chunks[1]]
+        elif config.ldscore_save_format == 'quick_mode':
+            ref_ld_spatial, spatial_annotation_cnames = s_ldsc.fetch_ldscore_by_chunk(chunk_index - 1)
+        else:
+            raise ValueError(f'Invalid ld score save format: {config.ldscore_save_format}')
 
         # get the x_tot_precomputed matrix by adding baseline and spatial annotation
-
+        ref_ld_baseline_column_sum = ref_ld_baseline.sum(axis=1).values
+        # x_tot_precomputed = ref_ld_spatial + ref_ld_baseline_column_sum
 
         for trait_name, sumstats in sumstats_cleaned_dict.items():
-            logger.info(f'Processing {trait_name}...')
 
-
-
-
-            spatial_annotation_cnames = spatial_annotation.columns
-            baseline_annotation = ref_ld_baseline.loc[common_snp].astype(np.float32, copy=False)
-            w_ld_common_snp = w_ld.loc[common_snp].astype(np.float32, copy=False)
-            x_tot_precomputed_common_snp = x_tot_precomputed.loc[common_snp].values
+            spatial_annotation = ref_ld_spatial.astype(np.float32, copy=False)
+            baseline_annotation = ref_ld_baseline.copy().astype(np.float32, copy=False)
+            w_ld_common_snp = w_ld.astype(np.float32, copy=False)
 
             # weight the baseline annotation by N
             baseline_annotation = baseline_annotation * sumstats.N.values.reshape((-1, 1)) / sumstats.N.mean()
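Three on-disk layouts are dispatched on above: `feather` (one directory per chunk), `zarr` (a single 2-D array chunked along the spot axis and read back block by block via `zarr_file.blocks`), and `quick_mode` (computed on the fly as shown earlier). The sketch below demonstrates the zarr access pattern on a small in-memory array; the sizes, the `spot_names` attribute layout, and the zarr-python 2.x style API are assumptions for illustration only:

```python
import numpy as np
import zarr

n_snps, n_spots, spots_per_chunk = 500, 40, 10

# Toy array chunked along the spot (column) axis, like <sample_name>.ldscore.zarr.
z = zarr.zeros((n_snps, n_spots), chunks=(n_snps, spots_per_chunk), dtype='f4')
z[:] = np.random.rand(n_snps, n_spots)
z.attrs['spot_names'] = [f'spot_{i}' for i in range(n_spots)]

chunk_index = 2  # zero-based here; the diff passes chunk_index - 1
block = z.blocks[:, chunk_index]              # all SNPs, one spot-chunk
start_spot = chunk_index * z.chunks[1]
names = z.attrs['spot_names'][start_spot:start_spot + z.chunks[1]]
print(block.shape, names[:3])                 # (500, 10) ['spot_20', 'spot_21', 'spot_22']
```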
@@ -219,10 +330,11 @@ def run_spatial_ldsc(config: SpatialLDSCConfig):
             # Run the jackknife
             Nbar = sumstats.N.mean()
             chunk_size = spatial_annotation.shape[1]
-            out_chunk =
-
-
-
+            out_chunk = thread_map(jackknife_for_processmap, range(chunk_size),
+                                   max_workers=config.num_processes,
+                                   chunksize=10,
+                                   desc=f'Chunk-{chunk_index}/Total-chunk-{running_chunk_number} for {trait_name}',
+                                   )
 
             # cache the results
             out_chunk = pd.DataFrame.from_records(out_chunk,
@@ -230,7 +342,8 @@ def run_spatial_ldsc(config: SpatialLDSCConfig):
                                                   index=spatial_annotation_cnames)
             # get the spots with nan
             nan_spots = out_chunk[out_chunk.isna().any(axis=1)].index
-
+            if len(nan_spots) > 0:
+                logger.info(f'Nan spots: {nan_spots} in chunk-{chunk_index} for {trait_name}. They are removed.')
             # drop the nan
             out_chunk = out_chunk.dropna()
 
@@ -238,70 +351,32 @@ def run_spatial_ldsc(config: SpatialLDSCConfig):
             out_chunk['p'] = norm.sf(out_chunk['z'])
             output_dict[trait_name].append(out_chunk)
 
-
-
+        del ref_ld_spatial, spatial_annotation, baseline_annotation, w_ld_common_snp
+        gc.collect()
 
     # Save the results
-    out_dir =
-    out_dir.mkdir(parents=True, exist_ok=True, mode=0o777)
+    out_dir = config.ldsc_save_dir
     for trait_name, out_chunk_list in output_dict.items():
         out_all = pd.concat(out_chunk_list, axis=0)
-
+        if running_chunk_number == total_chunk_number_found:
+            out_file_name = out_dir / f'{sample_name}_{trait_name}.csv.gz'
+        else:
+            out_file_name = out_dir / f'{sample_name}_{trait_name}_chunk{start_chunk}-{end_chunk}.csv.gz'
         out_all['spot'] = out_all.index
         out_all = out_all[['spot', 'beta', 'se', 'z', 'p']]
         out_all.to_csv(out_file_name, compression='gzip', index=False)
+
         logger.info(f'Output saved to {out_file_name} for {trait_name}')
     logger.info(f'------Spatial LDSC for {sample_name} finished!')
 
 
-
-
-
-
-
-
-
-
-
-
-    gwas_trait = "/storage/yangjianLab/songliyang/GWAS_trait/GWAS_Public_Use_MaxPower.csv"
-    root = "/storage/yangjianLab/songliyang/SpatialData/Data/Brain/Human/Nature_Neuroscience_2021/processed/h5ad"
-
-    name = 'Cortex_151507'
-    spe_name = name
-    # ld_pth = f"/storage/yangjianLab/songliyang/SpatialData/Data/Brain/Human/Nature_Neuroscience_2021/annotation/{spe_name}/snp_annotation"
-    ld_pth = f"/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/snake_workdir/{name}/generate_ldscore"
-    out_pth = f"/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/snake_workdir/{name}/ldsc"
-    gwas_file = "ADULT1_ADULT2_ONSET_ASTHMA"
-    # Prepare the arguments list using f-strings
-    args_list = [
-        "--h2", f"{gwas_root}/{gwas_file}.sumstats.gz",
-        "--w_file", "/storage/yangjianLab/sharedata/LDSC_resource/LDSC_SEG_ldscores/weights_hm3_no_hla/weights.",
-        "--sample_name", spe_name,
-        "--num_processes", '4',
-        "--ldscore_input_dir", ld_pth,
-        "--ldsc_save_dir", out_pth,
-        '--trait_name', 'adult1_adult2_onset_asthma'
-    ]
-    # args = parser.parse_args(args_list)
-    else:
-        args = parser.parse_args()
-
-    os.chdir('/storage/yangjianLab/chenwenhao/tmp/gsMap_Height_debug')
-    TASK_ID = 16
-    spe_name = f'E{TASK_ID}.5_E1S1'
-    config = SpatialLDSCConfig(**{'all_chunk': None,
-                                  'chisq_max': None,
-                                  # 'sumstats_file': '/storage/yangjianLab/songliyang/GWAS_trait/LDSC/GIANT_EUR_Height_2022_Nature.sumstats.gz',
-                                  'ldsc_save_dir': f'{spe_name}/ldsc_results_three_row_sum_sub_config_traits',
-                                  'ldscore_input_dir': '/storage/yangjianLab/songliyang/SpatialData/Data/Embryo/Mice/Cell_MOSTA/annotation/E16.5_E1S1/generate_ldscore_new',
-                                  'n_blocks': 200,
-                                  'not_M_5_50': False,
-                                  'num_processes': 15,
-                                  'sample_name': spe_name,
-                                  # 'trait_name': 'GIANT_EUR_Height_2022_Nature',
-                                  'sumstats_config_file': '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/src/gsMap/example/sumstats_config_sub.yaml',
-                                  'w_file': '/storage/yangjianLab/sharedata/LDSC_resource/LDSC_SEG_ldscores/weights_hm3_no_hla/weights.'
-                                  })
-    # config = SpatialLDSCConfig(**vars(args))
-    run_spatial_ldsc(config)
+def load_ldscore_chunk_from_feather(chunk_index, common_snp_among_all_sumstats_pos, config, ):
+    # Load the spatial annotations for this chunk
+    sample_name = config.sample_name
+    ld_file_spatial = f'{config.ldscore_save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.'
+    ref_ld_spatial = _read_ref_ld_v2(ld_file_spatial)
+    ref_ld_spatial = ref_ld_spatial.iloc[common_snp_among_all_sumstats_pos]
+    ref_ld_spatial = ref_ld_spatial.astype(np.float32, copy=False)
+
+    spatial_annotation_cnames = ref_ld_spatial.columns
+    return ref_ld_spatial.values, spatial_annotation_cnames
gsMap/templates/report_template.html (new file)
@@ -0,0 +1,198 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>{{ title }}</title>
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <!-- Bootstrap CSS -->
+    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
+    <!-- Custom Styles -->
+    <style>
+        body {
+            padding: 20px;
+            font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
+        }
+        .plot-container {
+            margin-bottom: 50px;
+        }
+        .section-description {
+            color: #6c757d;
+            font-size: 0.95rem;
+            margin-bottom: 20px;
+        }
+        .scrollable-table {
+            max-height: 400px;
+            overflow-y: auto;
+        }
+        .table thead th {
+            position: sticky;
+            top: 0;
+            background-color: #f8f9fa;
+        }
+        img {
+            max-width: 100%;
+            height: auto;
+            border: 1px solid #dee2e6;
+            border-radius: 5px;
+        }
+        .gene-select-label {
+            font-weight: bold;
+            margin-bottom: 10px;
+        }
+        .collapse-toggle {
+            cursor: pointer;
+            color: #0d6efd;
+            text-decoration: underline;
+        }
+    </style>
+</head>
+<body>
+<div class="container-fluid">
+    <h1 class="mb-4">{{ title }}</h1>
+
+    <!-- Genetic Spatial Mapping Plot -->
+    <div class="plot-container">
+        <h2>Genetic Spatial Mapping Plot</h2>
+        <p class="section-description">This plot shows the spatial genetic mapping results across different tissues.</p>
+        <div class="border rounded p-3">
+            {{ genetic_mapping_plot|safe }}
+        </div>
+    </div>
+
+    <!-- Cauchy Combination Result Table -->
+    <div class="plot-container">
+        <h2>Cauchy Combination Result</h2>
+        <p class="section-description">This table presents the results of the Cauchy combination test, summarizing the genetic associations.</p>
+        <div class="scrollable-table">
+            <table class="table table-hover table-bordered">
+                <thead class="table-light">
+                <tr>
+                    <th>Annotation</th>
+                    <th>P Cauchy</th>
+                    <th>P Median</th>
+                </tr>
+                </thead>
+                <tbody>
+                {% for row in cauchy_table %}
+                <tr>
+                    <td>{{ row.annotation }}</td>
+                    <td>{{ "%.4e"|format(row.p_cauchy) }}</td>
+                    <td>{{ "%.4e"|format(row.p_median) }}</td>
+                </tr>
+                {% endfor %}
+                </tbody>
+            </table>
+        </div>
+    </div>
+
+    <!-- Manhattan Plot -->
+    <div class="plot-container">
+        <h2>Diagnosis Manhattan Plot</h2>
+        <p class="section-description">The Manhattan plot shows the association of SNPs with the top associated gene across the genome.</p>
+        <div class="border rounded p-3">
+            {{ manhattan_plot|safe }}
+        </div>
+    </div>
+
+    <!-- Gene Expression and GSS Distribution -->
+    <div class="plot-container">
+        <h2>Gene Expression and GSS Distribution</h2>
+        <p class="section-description">Select a gene to view its expression distribution and gene specificity score (GSS).</p>
+        <label for="geneSelect" class="gene-select-label">Select a gene:</label>
+        <select id="geneSelect" class="form-select mb-4">
+            {% for gene in gene_plots %}
+            <option value="{{ gene.name }}">{{ gene.name }}</option>
+            {% endfor %}
+        </select>
+        <div id="genePlots" class="row">
+            <div class="col-md-6 mb-4">
+                <h5>Expression Distribution</h5>
+                <img src="{{ gene_plots[0].expression_plot }}" alt="{{ gene_plots[0].name }} Expression Distribution" id="expressionPlotImg" class="img-fluid">
+            </div>
+            <div class="col-md-6 mb-4">
+                <h5>Gene Specificity Score (GSS)</h5>
+                <img src="{{ gene_plots[0].gss_plot }}" alt="{{ gene_plots[0].name }} GSS Distribution" id="gssPlotImg" class="img-fluid">
+            </div>
+        </div>
+    </div>
+
+    <!-- Gene Diagnostic Info Table -->
+    <div class="plot-container">
+        <h2>Top 50 Gene Diagnostic Info</h2>
+        <p class="section-description">This table lists the top 50 genes based on diagnostic criteria, including the gene specificity score (GSS) and PCC.</p>
+        <div class="scrollable-table">
+            <table class="table table-hover table-bordered">
+                <thead class="table-light">
+                <tr>
+                    <th>Gene</th>
+                    <th>Annotation</th>
+                    <th>Median GSS</th>
+                    <th>PCC</th>
+                </tr>
+                </thead>
+                <tbody>
+                {% for row in gene_diagnostic_info %}
+                <tr>
+                    <td>{{ row.Gene }}</td>
+                    <td>{{ row.Annotation }}</td>
+                    <td>{{ "%.4f"|format(row.Median_GSS) }}</td>
+                    <td>{{ "%.4f"|format(row.PCC) }}</td>
+                </tr>
+                {% endfor %}
+                </tbody>
+            </table>
+        </div>
+    </div>
+
+    <!-- Running Info (collapsible) -->
+    <div class="plot-container">
+        <h2>Running Info</h2>
+        <p class="section-description">Click to view detailed run information and parameters.</p>
+        <p class="collapse-toggle" data-bs-toggle="collapse" href="#runningInfo" role="button" aria-expanded="false" aria-controls="runningInfo">
+            Show/Hide Running Info
+        </p>
+        <div class="collapse" id="runningInfo">
+            <div class="card card-body">
+                <p><strong>gsMap Version:</strong> {{ gsmap_version }}</p>
+                <p><strong>Parameters:</strong></p>
+                <ul class="mb-0">
+                    {% for key, value in parameters.items() %}
+                    <li><strong>{{ key }}:</strong> {{ value }}</li>
+                    {% endfor %}
+                </ul>
+            </div>
+        </div>
+    </div>
+</div>
+
+<!-- JavaScript for Gene Plots -->
+<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
+<script>
+    (function() {
+        const geneSelect = document.getElementById('geneSelect');
+        const expressionPlotImg = document.getElementById('expressionPlotImg');
+        const gssPlotImg = document.getElementById('gssPlotImg');
+
+        const genePlots = {
+            {% for gene in gene_plots %}
+            "{{ gene.name }}": {
+                expression_plot: "{{ gene.expression_plot }}",
+                gss_plot: "{{ gene.gss_plot }}"
+            }{% if not loop.last %},{% endif %}
+            {% endfor %}
+        };
+
+        geneSelect.addEventListener('change', function() {
+            const selectedGene = this.value;
+            const selectedGenePlots = genePlots[selectedGene];
+
+            // Update images
+            expressionPlotImg.src = selectedGenePlots.expression_plot;
+            expressionPlotImg.alt = `${selectedGene} Expression Distribution`;
+            gssPlotImg.src = selectedGenePlots.gss_plot;
+            gssPlotImg.alt = `${selectedGene} GSS Distribution`;
+        });
+    })();
+</script>
+</body>
+</html>
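`report_template.html` is a Jinja2 template; the new `gsMap/report.py` presumably renders it with the plots and tables produced by the diagnosis step. A minimal rendering sketch with placeholder data, not the actual `report.py` logic (all values below are made up):

```python
from pathlib import Path
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader('gsMap/templates'))
template = env.get_template('report_template.html')

html = template.render(
    title='gsMap Report (example)',
    genetic_mapping_plot='<div>embedded plot placeholder</div>',
    manhattan_plot='<div>embedded plot placeholder</div>',
    cauchy_table=[{'annotation': 'Layer1', 'p_cauchy': 1.2e-8, 'p_median': 3.4e-5}],
    gene_plots=[{'name': 'GENE1', 'expression_plot': 'GENE1_exp.png', 'gss_plot': 'GENE1_gss.png'}],
    gene_diagnostic_info=[{'Gene': 'GENE1', 'Annotation': 'Layer1', 'Median_GSS': 2.1, 'PCC': 0.45}],
    gsmap_version='1.63',
    parameters={'sample_name': 'example_sample', 'trait_name': 'example_trait'},
)
Path('gsMap_report_example.html').write_text(html)
```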
gsMap/utils/__init__.py: ADDED (empty file, no content to show)
gsMap/{generate_r2_matrix.py → utils/generate_r2_matrix.py}
@@ -69,7 +69,7 @@ def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
             raise ValueError('{f} filename must end in {f}'.format(f=end))
         comp = get_compression(fname)
         self.df = pd.read_csv(fname, header=self.header, usecols=self.usecols,
-
+                              sep='\s+', compression=comp)
         if self.colnames:
             self.df.columns = self.colnames
         if self.keepcol is not None:
@@ -733,11 +733,3 @@ def generate_r2_matrix_cache(bfile_prefix, chromosome_list, r2_cache_dir, ld_win
             ld_wind_cm=ld_wind_cm,
             output_cache_file_dir=output_cache_file_prefix)
         print(f'Compute r2 matrix for chr{chr} done!')
-
-
-if __name__ == '__main__':
-    bfile_prefix = '/storage/yangjianLab/sharedata/LDSC_resource/1000G_EUR_Phase3_plink/1000G.EUR.QC'
-    chromosome_list = range(1, 22)
-    r2_cache_dir = Path('/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/r2_matrix')
-    ld_wind_cm = 1
-    generate_r2_matrix_cache(bfile_prefix, chromosome_list, r2_cache_dir, ld_wind_cm)