gsMap 1.62-py3-none-any.whl → 1.64-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,27 +1,28 @@
-import os
-import numpy as np
-import pandas as pd
-
-import argparse
+import gc
 import logging
-import multiprocessing
+import os
 from collections import defaultdict
 from pathlib import Path
 
+import anndata as ad
+import numpy as np
+import pandas as pd
+import zarr
 from scipy.stats import norm
-from tqdm.contrib.concurrent import process_map
+from tqdm.contrib.concurrent import thread_map
 
-import gsMap.jackknife as jk
-from gsMap.config import add_spatial_ldsc_args, SpatialLDSCConfig
-from gsMap.regression_read import _read_sumstats, _read_w_ld, _read_ref_ld_v2, _read_M_v2
+import gsMap.utils.jackknife as jk
+from gsMap.config import SpatialLDSCConfig
+from gsMap.utils.regression_read import _read_sumstats, _read_w_ld, _read_ref_ld_v2
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger('gsMap.spatial_ldsc')
 
 
 # %%
 def _coef_new(jknife):
     # return coef[0], coef_se[0], z[0]]
-    est_ = jknife.est[0, 0] / Nbar
+    # est_ = jknife.est[0, 0] / Nbar
+    est_ = jknife.jknife_est[0, 0] / Nbar
     se_ = jknife.jknife_se[0, 0] / Nbar
     return est_, se_
 
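Note: 1.64 swaps tqdm's process_map for thread_map in the per-spot jackknife, so workers share the parent process's large global annotation arrays instead of pickling them into subprocesses. A minimal sketch of the thread_map call pattern (the squaring function is an illustrative stand-in for jackknife_for_processmap, not gsMap code):

    from tqdm.contrib.concurrent import thread_map

    def work(spot_id):
        # stand-in for jackknife_for_processmap: reads shared arrays, returns per-spot stats
        return spot_id * spot_id

    # max_workers and desc mirror the keyword arguments used later in this diff
    results = thread_map(work, range(100), max_workers=4, desc='demo')
    print(results[:5])  # [0, 1, 4, 9, 16]

Threads avoid per-task serialization of the inputs; since the heavy lifting here happens inside numpy/BLAS calls that release the GIL, a thread pool can still occupy multiple cores.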
@@ -68,16 +69,19 @@ def weights(ld, w_ld, N, M, hsq, intercept=1):
 
 def jackknife_for_processmap(spot_id):
     # calculate the initial weight for each spot
+    spot_spatial_annotation = spatial_annotation[:, spot_id]
+    spot_x_tot_precomputed = spot_spatial_annotation + ref_ld_baseline_column_sum
     initial_w = (
-        get_weight_optimized(sumstats, x_tot_precomputed_common_snp[:, spot_id], 10000, w_ld_common_snp, intercept=1)
+        get_weight_optimized(sumstats, x_tot_precomputed=spot_x_tot_precomputed,
+                             M_tot=10000, w_ld=w_ld_common_snp, intercept=1)
         .astype(np.float32)
         .reshape((-1, 1)))
 
     # apply the weight to baseline annotation, spatial annotation and CHISQ
     initial_w_scaled = initial_w / np.sum(initial_w)
     baseline_annotation_spot = baseline_annotation * initial_w_scaled
-    spatial_annotation_spot = spatial_annotation.iloc[:, spot_id].values.reshape((-1, 1)) * initial_w_scaled
-    CHISQ = sumstats.chisq.to_numpy(dtype=np.float32).reshape((-1, 1)).copy()
+    spatial_annotation_spot = spot_spatial_annotation.reshape((-1, 1)) * initial_w_scaled
+    CHISQ = sumstats.chisq.values.reshape((-1, 1))
     y = CHISQ * initial_w_scaled
 
     # run the jackknife
@@ -113,6 +117,9 @@ def _preprocess_sumstats(trait_name, sumstat_file_path, baseline_and_w_ld_common
         logger.warning(f'WARNING: number of SNPs less than 200k; for {trait_name} this is almost always bad.')
 
     sumstats = sumstats.loc[common_snp]
+
+    # get the common index position of baseline_and_w_ld_common_snp for quick access
+    sumstats['common_index_pos'] = pd.Index(baseline_and_w_ld_common_snp).get_indexer(sumstats.index)
     return sumstats
 
 
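Note: the new common_index_pos column stores integer positions rather than SNP labels, so later code can slice numpy or zarr arrays positionally instead of doing label-based .loc lookups. A minimal sketch of pd.Index.get_indexer (the rsIDs are illustrative):

    import pandas as pd

    reference = pd.Index(['rs1', 'rs2', 'rs3', 'rs4'])  # e.g. baseline_and_w_ld_common_snp
    query = pd.Index(['rs2', 'rs4'])                    # e.g. one trait's filtered SNPs
    print(reference.get_indexer(query))                 # [1 3]; labels absent from reference map to -1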
@@ -132,8 +139,77 @@ def _get_sumstats_from_sumstats_dict(sumstats_config_dict: dict, baseline_and_w_
     return sumstats_cleaned_dict
 
 
+class S_LDSC_Boost_with_pre_calculate_SNP_Gene_weight_matrix:
+    def __init__(self, config: SpatialLDSCConfig, common_snp_among_all_sumstats_pos):
+        self.config = config
+        mk_score = pd.read_feather(config.mkscore_feather_path).set_index('HUMAN_GENE_SYM')
+        mk_score_genes = mk_score.index
+
+        snp_gene_weight_adata = ad.read_h5ad(config.snp_gene_weight_adata_path)
+        common_genes = mk_score_genes.intersection(snp_gene_weight_adata.var.index)
+        common_snps = snp_gene_weight_adata.obs.index
+        # self.snp_gene_weight_adata = snp_gene_weight_adata[common_snp_among_all_sumstats:, common_genes.to_list()]
+        self.snp_gene_weight_matrix = snp_gene_weight_adata[common_snp_among_all_sumstats_pos, common_genes.to_list()].X
+        self.mk_score_common = mk_score.loc[common_genes]
+
+        # calculate the chunk number
+        self.chunk_starts = list(range(0, self.mk_score_common.shape[1], self.config.spots_per_chunk_quick_mode))
+
+    def fetch_ldscore_by_chunk(self, chunk_index):
+        chunk_start = self.chunk_starts[chunk_index]
+        mk_score_chunk = self.mk_score_common.iloc[:,
+                         chunk_start:chunk_start + self.config.spots_per_chunk_quick_mode]
+        ldscore_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
+            mk_score_chunk,
+            drop_dummy_na=False,
+        )
+
+        spots_name = self.mk_score_common.columns[chunk_start:chunk_start + self.config.spots_per_chunk_quick_mode]
+        return ldscore_chunk, spots_name
+
+    def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(self,
+                                                              mk_score_chunk,
+                                                              drop_dummy_na=True,
+                                                              ):
+
+        if drop_dummy_na:
+            ldscore_chr_chunk = self.snp_gene_weight_matrix[:, :-1] @ mk_score_chunk
+        else:
+            ldscore_chr_chunk = self.snp_gene_weight_matrix @ mk_score_chunk
+
+        return ldscore_chr_chunk
+
+
+def _get_sumstats_with_common_snp_from_sumstats_dict(sumstats_config_dict: dict, baseline_and_w_ld_common_snp: pd.Index,
+                                                     chisq_max=None):
+    # first validate that all sumstats files exist
+    logger.info('Validating sumstats files...')
+    for trait_name, sumstat_file_path in sumstats_config_dict.items():
+        if not os.path.exists(sumstat_file_path):
+            raise FileNotFoundError(f'{sumstat_file_path} not found')
+    # then load all sumstats
+    sumstats_cleaned_dict = {}
+    for trait_name, sumstat_file_path in sumstats_config_dict.items():
+        sumstats_cleaned_dict[trait_name] = _preprocess_sumstats(trait_name, sumstat_file_path,
+                                                                 baseline_and_w_ld_common_snp, chisq_max)
+    # get the common SNPs among all sumstats
+    common_snp_among_all_sumstats = None
+    for trait_name, sumstats in sumstats_cleaned_dict.items():
+        if common_snp_among_all_sumstats is None:
+            common_snp_among_all_sumstats = sumstats.index
+        else:
+            common_snp_among_all_sumstats = common_snp_among_all_sumstats.intersection(sumstats.index)
+
+    # keep only the SNPs shared by every sumstats table
+    for trait_name, sumstats in sumstats_cleaned_dict.items():
+        sumstats_cleaned_dict[trait_name] = sumstats.loc[common_snp_among_all_sumstats]
+
+    logger.info(f'Common SNPs among all sumstats: {len(common_snp_among_all_sumstats)}')
+    return sumstats_cleaned_dict, common_snp_among_all_sumstats
+
+
 def run_spatial_ldsc(config: SpatialLDSCConfig):
-    global spatial_annotation, baseline_annotation, n_blocks, Nbar, sumstats, x_tot_precomputed_common_snp, w_ld_common_snp
+    global spatial_annotation, baseline_annotation, n_blocks, Nbar, sumstats, ref_ld_baseline_column_sum, w_ld_common_snp
     # config
     n_blocks = config.n_blocks
     sample_name = config.sample_name
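Note: the quick_mode class above collapses per-spot LD score computation into a single matrix product: a fixed SNP × gene weight matrix times a gene × spot marker-score chunk yields a SNP × spot block of LD scores. A toy illustration with dense numpy arrays (the real weight matrix is loaded from an AnnData file and may be sparse, in which case the @ operator still applies):

    import numpy as np

    n_snp, n_gene, n_spot = 5, 3, 4
    snp_gene_weight = np.random.rand(n_snp, n_gene)  # fixed once per reference panel
    mk_score_chunk = np.random.rand(n_gene, n_spot)  # gene specificity scores for one chunk of spots

    ldscore_chunk = snp_gene_weight @ mk_score_chunk  # shape (n_snp, n_spot)
    assert ldscore_chunk.shape == (n_snp, n_spot)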
@@ -144,72 +220,107 @@ def run_spatial_ldsc(config: SpatialLDSCConfig):
     w_ld_cname = w_ld.columns[1]
     w_ld.set_index('SNP', inplace=True)
 
-    # Load the baseline annotations
-    ld_file_baseline = f'{config.ldscore_input_dir}/baseline/baseline.'
+    ld_file_baseline = f'{config.ldscore_save_dir}/baseline/baseline.'
+
     ref_ld_baseline = _read_ref_ld_v2(ld_file_baseline)
-    n_annot_baseline = len(ref_ld_baseline.columns)
-    M_annot_baseline = _read_M_v2(ld_file_baseline, n_annot_baseline, config.not_M_5_50)
+    # n_annot_baseline = len(ref_ld_baseline.columns)
+    # M_annot_baseline = _read_M_v2(ld_file_baseline, n_annot_baseline, config.not_M_5_50)
 
     # common snp between baseline and w_ld
     baseline_and_w_ld_common_snp = ref_ld_baseline.index.intersection(w_ld.index)
-    if len(baseline_and_w_ld_common_snp) < 200000:
-        logger.warning(f'WARNING: number of SNPs less than 200k; for {sample_name} this is almost always bad.')
-    ref_ld_baseline = ref_ld_baseline.loc[baseline_and_w_ld_common_snp]
+    baseline_and_w_ld_common_snp_pos = pd.Index(ref_ld_baseline.index).get_indexer(baseline_and_w_ld_common_snp)
+
+    # Clean the sumstats
+    sumstats_cleaned_dict, common_snp_among_all_sumstats = _get_sumstats_with_common_snp_from_sumstats_dict(
+        config.sumstats_config_dict, baseline_and_w_ld_common_snp,
+        chisq_max=config.chisq_max)
+    common_snp_among_all_sumstats_pos = ref_ld_baseline.index.get_indexer(common_snp_among_all_sumstats)
+
+    # ensure the order is monotonic
+    assert pd.Series(
+        common_snp_among_all_sumstats_pos).is_monotonic_increasing, 'common_snp_among_all_sumstats_pos is not monotonic increasing'
+
+    if len(common_snp_among_all_sumstats) < 200000:
+        logger.warning(
+            f'!!!!! WARNING: number of SNPs less than 200k; for {sample_name} this is almost always bad. Please check the sumstats files.')
+
+    ref_ld_baseline = ref_ld_baseline.loc[common_snp_among_all_sumstats]
+    w_ld = w_ld.loc[common_snp_among_all_sumstats]
 
     # load additional baseline annotations
     if config.use_additional_baseline_annotation:
-        ld_file_baseline_additional = f'{config.ldscore_input_dir}/additional_baseline/baseline.'
+        print('Using additional baseline annotations')
+        ld_file_baseline_additional = f'{config.ldscore_save_dir}/additional_baseline/baseline.'
         ref_ld_baseline_additional = _read_ref_ld_v2(ld_file_baseline_additional)
         n_annot_baseline_additional = len(ref_ld_baseline_additional.columns)
         logger.info(f'{len(ref_ld_baseline_additional.columns)} additional baseline annotations loaded')
         # M_annot_baseline_additional = _read_M_v2(ld_file_baseline_additional, n_annot_baseline_additional,
         #                                          config.not_M_5_50)
-        ref_ld_baseline_additional = ref_ld_baseline_additional.loc[baseline_and_w_ld_common_snp]
+        ref_ld_baseline_additional = ref_ld_baseline_additional.loc[common_snp_among_all_sumstats]
         ref_ld_baseline = pd.concat([ref_ld_baseline, ref_ld_baseline_additional], axis=1)
         del ref_ld_baseline_additional
 
-    w_ld = w_ld.loc[baseline_and_w_ld_common_snp]
-
-    # Clean the sumstats
-    sumstats_cleaned_dict = _get_sumstats_from_sumstats_dict(config.sumstats_config_dict, baseline_and_w_ld_common_snp,
-                                                             chisq_max=config.chisq_max)
+    # Detect available chunk files
+    if config.ldscore_save_format == 'quick_mode':
+        s_ldsc = S_LDSC_Boost_with_pre_calculate_SNP_Gene_weight_matrix(config, common_snp_among_all_sumstats_pos)
+        total_chunk_number_found = len(s_ldsc.chunk_starts)
+        print(f'Split data into {total_chunk_number_found} chunks')
+    else:
+        all_file = os.listdir(config.ldscore_save_dir)
+        total_chunk_number_found = sum('chunk' in name for name in all_file)
+        print(f'Find {total_chunk_number_found} chunked files in {config.ldscore_save_dir}')
 
-    # Detect avalable chunk files
-    all_file = os.listdir(config.ldscore_input_dir)
     if config.all_chunk is None:
-        all_chunk = sum('chunk' in name for name in all_file)
-        print(f'\t')
-        print(f'Find {all_chunk} chunked files')
+        if config.chunk_range is not None:
+            assert config.chunk_range[0] >= 1 and config.chunk_range[
+                1] <= total_chunk_number_found, 'Chunk range out of bound. It should be in [1, all_chunk]'
+            print(
+                f'chunk range provided, using chunked files from {config.chunk_range[0]} to {config.chunk_range[1]}')
+            start_chunk, end_chunk = config.chunk_range
+        else:
+            start_chunk, end_chunk = 1, total_chunk_number_found
     else:
         all_chunk = config.all_chunk
         print(f'using {all_chunk} chunked files by provided argument')
         print(f'\t')
         print(f'Input {all_chunk} chunked files')
+        start_chunk, end_chunk = 1, all_chunk
+
+    running_chunk_number = end_chunk - start_chunk + 1
 
     # Process each chunk
     output_dict = defaultdict(list)
-    for chunk_index in range(1, all_chunk + 1):
-        print(f'------Processing chunk-{chunk_index}')
-
-        # Load the spatial annotations for this chunk
-        ld_file_spatial = f'{config.ldscore_input_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.'
-        ref_ld_spatial = _read_ref_ld_v2(ld_file_spatial)
-        ref_ld_spatial = ref_ld_spatial.loc[baseline_and_w_ld_common_snp]
-        ref_ld_spatial = ref_ld_spatial.astype(np.float32, copy=False)
+    zarr_path = Path(config.ldscore_save_dir) / f'{config.sample_name}.ldscore.zarr'
+    if config.ldscore_save_format == 'zarr':
+        assert zarr_path.exists(), f'{zarr_path} not found, which is required for zarr format'
+        zarr_file = zarr.open(str(zarr_path))
+        spots_name = zarr_file.attrs['spot_names']
+
+    for chunk_index in range(start_chunk, end_chunk + 1):
+        if config.ldscore_save_format == 'feather':
+            ref_ld_spatial, spatial_annotation_cnames = load_ldscore_chunk_from_feather(chunk_index,
+                                                                                        common_snp_among_all_sumstats_pos,
+                                                                                        config,
+                                                                                        )
+        elif config.ldscore_save_format == 'zarr':
+            ref_ld_spatial = zarr_file.blocks[:, chunk_index - 1][common_snp_among_all_sumstats_pos]
+            start_spot = (chunk_index - 1) * zarr_file.chunks[1]
+            ref_ld_spatial = ref_ld_spatial.astype(np.float32, copy=False)
+            spatial_annotation_cnames = spots_name[start_spot:start_spot + zarr_file.chunks[1]]
+        elif config.ldscore_save_format == 'quick_mode':
+            ref_ld_spatial, spatial_annotation_cnames = s_ldsc.fetch_ldscore_by_chunk(chunk_index - 1)
+        else:
+            raise ValueError(f'Invalid ld score save format: {config.ldscore_save_format}')
 
         # get the x_tot_precomputed matrix by adding baseline and spatial annotation
-        x_tot_precomputed = ref_ld_spatial + ref_ld_baseline.sum(axis=1).values.reshape((-1, 1))
+        ref_ld_baseline_column_sum = ref_ld_baseline.sum(axis=1).values
+        # x_tot_precomputed = ref_ld_spatial + ref_ld_baseline_column_sum
 
         for trait_name, sumstats in sumstats_cleaned_dict.items():
-            logger.info(f'Processing {trait_name}...')
 
-            # filter ldscore by common snp
-            common_snp = sumstats.index
-            spatial_annotation = ref_ld_spatial.loc[common_snp].astype(np.float32, copy=False)
-            spatial_annotation_cnames = spatial_annotation.columns
-            baseline_annotation = ref_ld_baseline.loc[common_snp].astype(np.float32, copy=False)
-            w_ld_common_snp = w_ld.loc[common_snp].astype(np.float32, copy=False)
-            x_tot_precomputed_common_snp = x_tot_precomputed.loc[common_snp].values
+            spatial_annotation = ref_ld_spatial.astype(np.float32, copy=False)
+            baseline_annotation = ref_ld_baseline.copy().astype(np.float32, copy=False)
+            w_ld_common_snp = w_ld.astype(np.float32, copy=False)
 
             # weight the baseline annotation by N
             baseline_annotation = baseline_annotation * sumstats.N.values.reshape((-1, 1)) / sumstats.N.mean()
@@ -219,10 +330,11 @@ def run_spatial_ldsc(config: SpatialLDSCConfig):
             # Run the jackknife
             Nbar = sumstats.N.mean()
             chunk_size = spatial_annotation.shape[1]
-            out_chunk = process_map(jackknife_for_processmap, range(chunk_size),
-                                    max_workers=config.num_processes,
-                                    chunksize=10,
-                                    desc=f'LDSC chunk-{chunk_index}: {trait_name}')
+            out_chunk = thread_map(jackknife_for_processmap, range(chunk_size),
+                                   max_workers=config.num_processes,
+                                   chunksize=10,
+                                   desc=f'Chunk-{chunk_index}/Total-chunk-{running_chunk_number} for {trait_name}',
+                                   )
 
             # cache the results
             out_chunk = pd.DataFrame.from_records(out_chunk,
@@ -230,7 +342,8 @@ def run_spatial_ldsc(config: SpatialLDSCConfig):
                                                   index=spatial_annotation_cnames)
             # get the spots with nan
             nan_spots = out_chunk[out_chunk.isna().any(axis=1)].index
-            logger.info(f'Nan spots: {nan_spots} in chunk-{chunk_index} for {trait_name}. They are removed.')
+            if len(nan_spots) > 0:
+                logger.info(f'Nan spots: {nan_spots} in chunk-{chunk_index} for {trait_name}. They are removed.')
             # drop the nan
             out_chunk = out_chunk.dropna()
 
@@ -238,70 +351,32 @@ def run_spatial_ldsc(config: SpatialLDSCConfig):
             out_chunk['p'] = norm.sf(out_chunk['z'])
             output_dict[trait_name].append(out_chunk)
 
-            # garbage collection
-            del spatial_annotation
+        del ref_ld_spatial, spatial_annotation, baseline_annotation, w_ld_common_snp
+        gc.collect()
 
     # Save the results
-    out_dir = Path(config.ldsc_save_dir)
-    out_dir.mkdir(parents=True, exist_ok=True, mode=0o777)
+    out_dir = config.ldsc_save_dir
     for trait_name, out_chunk_list in output_dict.items():
         out_all = pd.concat(out_chunk_list, axis=0)
-        out_file_name = out_dir / f'{sample_name}_{trait_name}.csv.gz'
+        if running_chunk_number == total_chunk_number_found:
+            out_file_name = out_dir / f'{sample_name}_{trait_name}.csv.gz'
+        else:
+            out_file_name = out_dir / f'{sample_name}_{trait_name}_chunk{start_chunk}-{end_chunk}.csv.gz'
         out_all['spot'] = out_all.index
         out_all = out_all[['spot', 'beta', 'se', 'z', 'p']]
         out_all.to_csv(out_file_name, compression='gzip', index=False)
+
         logger.info(f'Output saved to {out_file_name} for {trait_name}')
     logger.info(f'------Spatial LDSC for {sample_name} finished!')
 
 
-# %%
-if __name__ == '__main__':
-    # Main function of analysis
-    parser = argparse.ArgumentParser(
-        description="Run Spatial LD Score Regression (LDSC) analysis for GWAS and spatial transcriptomic data."
-    )
-    parser = add_spatial_ldsc_args(parser)
-    TEST = True
-    if TEST:
-        gwas_root = "/storage/yangjianLab/songliyang/GWAS_trait/LDSC"
-        gwas_trait = "/storage/yangjianLab/songliyang/GWAS_trait/GWAS_Public_Use_MaxPower.csv"
-        root = "/storage/yangjianLab/songliyang/SpatialData/Data/Brain/Human/Nature_Neuroscience_2021/processed/h5ad"
-
-        name = 'Cortex_151507'
-        spe_name = name
-        # ld_pth = f"/storage/yangjianLab/songliyang/SpatialData/Data/Brain/Human/Nature_Neuroscience_2021/annotation/{spe_name}/snp_annotation"
-        ld_pth = f"/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/snake_workdir/{name}/generate_ldscore"
-        out_pth = f"/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/snake_workdir/{name}/ldsc"
-        gwas_file = "ADULT1_ADULT2_ONSET_ASTHMA"
-        # Prepare the arguments list using f-strings
-        args_list = [
-            "--h2", f"{gwas_root}/{gwas_file}.sumstats.gz",
-            "--w_file", "/storage/yangjianLab/sharedata/LDSC_resource/LDSC_SEG_ldscores/weights_hm3_no_hla/weights.",
-            "--sample_name", spe_name,
-            "--num_processes", '4',
-            "--ldscore_input_dir", ld_pth,
-            "--ldsc_save_dir", out_pth,
-            '--trait_name', 'adult1_adult2_onset_asthma'
-        ]
-        # args = parser.parse_args(args_list)
-    else:
-        args = parser.parse_args()
-
-    os.chdir('/storage/yangjianLab/chenwenhao/tmp/gsMap_Height_debug')
-    TASK_ID = 16
-    spe_name = f'E{TASK_ID}.5_E1S1'
-    config = SpatialLDSCConfig(**{'all_chunk': None,
-                                  'chisq_max': None,
-                                  # 'sumstats_file': '/storage/yangjianLab/songliyang/GWAS_trait/LDSC/GIANT_EUR_Height_2022_Nature.sumstats.gz',
-                                  'ldsc_save_dir': f'{spe_name}/ldsc_results_three_row_sum_sub_config_traits',
-                                  'ldscore_input_dir': '/storage/yangjianLab/songliyang/SpatialData/Data/Embryo/Mice/Cell_MOSTA/annotation/E16.5_E1S1/generate_ldscore_new',
-                                  'n_blocks': 200,
-                                  'not_M_5_50': False,
-                                  'num_processes': 15,
-                                  'sample_name': spe_name,
-                                  # 'trait_name': 'GIANT_EUR_Height_2022_Nature',
-                                  'sumstats_config_file': '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/src/gsMap/example/sumstats_config_sub.yaml',
-                                  'w_file': '/storage/yangjianLab/sharedata/LDSC_resource/LDSC_SEG_ldscores/weights_hm3_no_hla/weights.'
-                                  })
-    # config = SpatialLDSCConfig(**vars(args))
-    run_spatial_ldsc(config)
+def load_ldscore_chunk_from_feather(chunk_index, common_snp_among_all_sumstats_pos, config, ):
+    # Load the spatial annotations for this chunk
+    sample_name = config.sample_name
+    ld_file_spatial = f'{config.ldscore_save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.'
+    ref_ld_spatial = _read_ref_ld_v2(ld_file_spatial)
+    ref_ld_spatial = ref_ld_spatial.iloc[common_snp_among_all_sumstats_pos]
+    ref_ld_spatial = ref_ld_spatial.astype(np.float32, copy=False)
+
+    spatial_annotation_cnames = ref_ld_spatial.columns
+    return ref_ld_spatial.values, spatial_annotation_cnames
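Note: the zarr branch in the hunk above pulls one column-block of the (SNP x spot) LD score matrix per chunk via Array.blocks, then row-filters it with the precomputed integer SNP positions. A small self-contained sketch of that access pattern (paths, shapes, and names are illustrative; Array.blocks requires zarr-python >= 2.12):

    import numpy as np
    import zarr

    # write a toy (snp x spot) LD score matrix, chunked along the spot axis
    z = zarr.open('ldscore_demo.zarr', mode='w', shape=(100, 40), chunks=(100, 10), dtype='f4')
    z[:] = np.random.rand(100, 40).astype('f4')
    z.attrs['spot_names'] = [f'spot_{i}' for i in range(40)]

    # read back the second column-block and keep a subset of SNP rows
    snp_pos = np.array([0, 5, 7])   # stand-in for common_snp_among_all_sumstats_pos
    block = z.blocks[:, 1]          # spots 10..19, all SNPs, as a numpy array
    subset = block[snp_pos]         # positional row filter
    print(subset.shape)             # (3, 10)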
@@ -0,0 +1,198 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>{{ title }}</title>
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <!-- Bootstrap CSS -->
+    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
+    <!-- Custom Styles -->
+    <style>
+        body {
+            padding: 20px;
+            font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
+        }
+        .plot-container {
+            margin-bottom: 50px;
+        }
+        .section-description {
+            color: #6c757d;
+            font-size: 0.95rem;
+            margin-bottom: 20px;
+        }
+        .scrollable-table {
+            max-height: 400px;
+            overflow-y: auto;
+        }
+        .table thead th {
+            position: sticky;
+            top: 0;
+            background-color: #f8f9fa;
+        }
+        img {
+            max-width: 100%;
+            height: auto;
+            border: 1px solid #dee2e6;
+            border-radius: 5px;
+        }
+        .gene-select-label {
+            font-weight: bold;
+            margin-bottom: 10px;
+        }
+        .collapse-toggle {
+            cursor: pointer;
+            color: #0d6efd;
+            text-decoration: underline;
+        }
+    </style>
+</head>
+<body>
+<div class="container-fluid">
+    <h1 class="mb-4">{{ title }}</h1>
+
+    <!-- Genetic Spatial Mapping Plot -->
+    <div class="plot-container">
+        <h2>Genetic Spatial Mapping Plot</h2>
+        <p class="section-description">This plot shows the spatial genetic mapping results across different tissues.</p>
+        <div class="border rounded p-3">
+            {{ genetic_mapping_plot|safe }}
+        </div>
+    </div>
+
+    <!-- Cauchy Combination Result Table -->
+    <div class="plot-container">
+        <h2>Cauchy Combination Result</h2>
+        <p class="section-description">This table presents the results of the Cauchy combination test, summarizing the genetic associations.</p>
+        <div class="scrollable-table">
+            <table class="table table-hover table-bordered">
+                <thead class="table-light">
+                <tr>
+                    <th>Annotation</th>
+                    <th>P Cauchy</th>
+                    <th>P Median</th>
+                </tr>
+                </thead>
+                <tbody>
+                {% for row in cauchy_table %}
+                <tr>
+                    <td>{{ row.annotation }}</td>
+                    <td>{{ "%.4e"|format(row.p_cauchy) }}</td>
+                    <td>{{ "%.4e"|format(row.p_median) }}</td>
+                </tr>
+                {% endfor %}
+                </tbody>
+            </table>
+        </div>
+    </div>
+
+    <!-- Manhattan Plot -->
+    <div class="plot-container">
+        <h2>Diagnosis Manhattan Plot</h2>
+        <p class="section-description">The Manhattan plot shows the association of SNPs with the top associated gene across the genome.</p>
+        <div class="border rounded p-3">
+            {{ manhattan_plot|safe }}
+        </div>
+    </div>
+
+    <!-- Gene Expression and GSS Distribution -->
+    <div class="plot-container">
+        <h2>Gene Expression and GSS Distribution</h2>
+        <p class="section-description">Select a gene to view its expression distribution and gene specificity score (GSS).</p>
+        <label for="geneSelect" class="gene-select-label">Select a gene:</label>
+        <select id="geneSelect" class="form-select mb-4">
+            {% for gene in gene_plots %}
+            <option value="{{ gene.name }}">{{ gene.name }}</option>
+            {% endfor %}
+        </select>
+        <div id="genePlots" class="row">
+            <div class="col-md-6 mb-4">
+                <h5>Expression Distribution</h5>
+                <img src="{{ gene_plots[0].expression_plot }}" alt="{{ gene_plots[0].name }} Expression Distribution" id="expressionPlotImg" class="img-fluid">
+            </div>
+            <div class="col-md-6 mb-4">
+                <h5>Gene Specificity Score (GSS)</h5>
+                <img src="{{ gene_plots[0].gss_plot }}" alt="{{ gene_plots[0].name }} GSS Distribution" id="gssPlotImg" class="img-fluid">
+            </div>
+        </div>
+    </div>
+
+    <!-- Gene Diagnostic Info Table -->
+    <div class="plot-container">
+        <h2>Top 50 Gene Diagnostic Info</h2>
+        <p class="section-description">This table lists the top 50 genes based on diagnostic criteria, including the gene specificity score (GSS) and PCC.</p>
+        <div class="scrollable-table">
+            <table class="table table-hover table-bordered">
+                <thead class="table-light">
+                <tr>
+                    <th>Gene</th>
+                    <th>Annotation</th>
+                    <th>Median GSS</th>
+                    <th>PCC</th>
+                </tr>
+                </thead>
+                <tbody>
+                {% for row in gene_diagnostic_info %}
+                <tr>
+                    <td>{{ row.Gene }}</td>
+                    <td>{{ row.Annotation }}</td>
+                    <td>{{ "%.4f"|format(row.Median_GSS) }}</td>
+                    <td>{{ "%.4f"|format(row.PCC) }}</td>
+                </tr>
+                {% endfor %}
+                </tbody>
+            </table>
+        </div>
+    </div>
+
+    <!-- Running Info (collapsible) -->
+    <div class="plot-container">
+        <h2>Running Info</h2>
+        <p class="section-description">Click to view detailed run information and parameters.</p>
+        <p class="collapse-toggle" data-bs-toggle="collapse" href="#runningInfo" role="button" aria-expanded="false" aria-controls="runningInfo">
+            Show/Hide Running Info
+        </p>
+        <div class="collapse" id="runningInfo">
+            <div class="card card-body">
+                <p><strong>gsMap Version:</strong> {{ gsmap_version }}</p>
+                <p><strong>Parameters:</strong></p>
+                <ul class="mb-0">
+                    {% for key, value in parameters.items() %}
+                    <li><strong>{{ key }}:</strong> {{ value }}</li>
+                    {% endfor %}
+                </ul>
+            </div>
+        </div>
+    </div>
+</div>
+
+<!-- JavaScript for Gene Plots -->
+<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
+<script>
+    (function() {
+        const geneSelect = document.getElementById('geneSelect');
+        const expressionPlotImg = document.getElementById('expressionPlotImg');
+        const gssPlotImg = document.getElementById('gssPlotImg');
+
+        const genePlots = {
+            {% for gene in gene_plots %}
+            "{{ gene.name }}": {
+                expression_plot: "{{ gene.expression_plot }}",
+                gss_plot: "{{ gene.gss_plot }}"
+            }{% if not loop.last %},{% endif %}
+            {% endfor %}
+        };
+
+        geneSelect.addEventListener('change', function() {
+            const selectedGene = this.value;
+            const selectedGenePlots = genePlots[selectedGene];
+
+            // Update images
+            expressionPlotImg.src = selectedGenePlots.expression_plot;
+            expressionPlotImg.alt = `${selectedGene} Expression Distribution`;
+            gssPlotImg.src = selectedGenePlots.gss_plot;
+            gssPlotImg.alt = `${selectedGene} GSS Distribution`;
+        });
+    })();
+</script>
+</body>
+</html>
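Note: the new file above is a Jinja2 report template (placeholders such as {{ title }}, filters such as "%.4e"|format(...)). A minimal sketch of how a template like this might be rendered; the file name, loader path, and context values are assumptions for illustration, not gsMap's actual report code:

    from jinja2 import Environment, FileSystemLoader

    env = Environment(loader=FileSystemLoader('templates'))
    template = env.get_template('report.html')  # hypothetical location of the template above
    html = template.render(
        title='gsMap Report',
        genetic_mapping_plot='<div>plot html</div>',
        manhattan_plot='<div>plot html</div>',
        cauchy_table=[{'annotation': 'Layer1', 'p_cauchy': 1.2e-5, 'p_median': 3.4e-3}],
        gene_plots=[{'name': 'GFAP', 'expression_plot': 'GFAP_exp.png', 'gss_plot': 'GFAP_gss.png'}],
        gene_diagnostic_info=[{'Gene': 'GFAP', 'Annotation': 'Layer1', 'Median_GSS': 2.1, 'PCC': 0.83}],
        gsmap_version='1.64',
        parameters={'sample_name': 'sample_X'},
    )
    with open('gsMap_report.html', 'w') as fh:
        fh.write(html)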
File without changes
@@ -2,7 +2,7 @@ from pathlib import Path
 import bitarray as ba
 import numpy as np
 import pandas as pd
-from scipy.sparse import csr_matrix,csc_matrix
+from scipy.sparse import csr_matrix
 from scipy.sparse import save_npz, load_npz
 from tqdm import trange, tqdm
 
@@ -69,7 +69,7 @@ def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
                 raise ValueError('{f} filename must end in {f}'.format(f=end))
             comp = get_compression(fname)
             self.df = pd.read_csv(fname, header=self.header, usecols=self.usecols,
-                                  delim_whitespace=True, compression=comp)
+                                  sep='\s+', compression=comp)
             if self.colnames:
                 self.df.columns = self.colnames
             if self.keepcol is not None:
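Note: delim_whitespace=True was deprecated in pandas 2.2 in favor of sep='\s+'; both split on runs of whitespace, so the change above is behavior-preserving. A quick equivalence check with toy data:

    import io
    import pandas as pd

    text = 'SNP CHR BP\nrs1 1 1000\nrs2 1 2000\n'
    df = pd.read_csv(io.StringIO(text), sep=r'\s+')
    print(df.columns.tolist())  # ['SNP', 'CHR', 'BP']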
@@ -733,11 +733,3 @@ def generate_r2_matrix_cache(bfile_prefix, chromosome_list, r2_cache_dir, ld_win
                                 ld_wind_cm=ld_wind_cm,
                                 output_cache_file_dir=output_cache_file_prefix)
         print(f'Compute r2 matrix for chr{chr} done!')
-
-
-if __name__ == '__main__':
-    bfile_prefix = '/storage/yangjianLab/sharedata/LDSC_resource/1000G_EUR_Phase3_plink/1000G.EUR.QC'
-    chromosome_list = range(1, 22)
-    r2_cache_dir = Path('/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/r2_matrix')
-    ld_wind_cm = 1
-    generate_r2_matrix_cache(bfile_prefix, chromosome_list, r2_cache_dir, ld_wind_cm)