gsMap 1.62__py3-none-any.whl → 1.63__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN_VAE/adjacency_matrix.py +1 -1
- gsMap/GNN_VAE/model.py +5 -5
- gsMap/GNN_VAE/train.py +1 -1
- gsMap/__init__.py +1 -1
- gsMap/cauchy_combination_test.py +14 -36
- gsMap/config.py +473 -404
- gsMap/diagnosis.py +273 -0
- gsMap/find_latent_representation.py +22 -86
- gsMap/format_sumstats.py +79 -82
- gsMap/generate_ldscore.py +145 -78
- gsMap/latent_to_gene.py +65 -104
- gsMap/main.py +1 -9
- gsMap/report.py +160 -0
- gsMap/run_all_mode.py +195 -0
- gsMap/spatial_ldsc_multiple_sumstats.py +187 -112
- gsMap/templates/report_template.html +198 -0
- gsMap/utils/__init__.py +0 -0
- gsMap/{generate_r2_matrix.py → utils/generate_r2_matrix.py} +1 -9
- gsMap/{make_annotations.py → utils/make_annotations.py} +1 -43
- gsMap/utils/manhattan_plot.py +639 -0
- gsMap/{regression_read.py → utils/regression_read.py} +1 -1
- gsMap/visualize.py +100 -55
- {gsmap-1.62.dist-info → gsmap-1.63.dist-info}/METADATA +16 -46
- gsmap-1.63.dist-info/RECORD +30 -0
- gsmap-1.62.dist-info/RECORD +0 -24
- /gsMap/{jackknife.py → utils/jackknife.py} +0 -0
- {gsmap-1.62.dist-info → gsmap-1.63.dist-info}/LICENSE +0 -0
- {gsmap-1.62.dist-info → gsmap-1.63.dist-info}/WHEEL +0 -0
- {gsmap-1.62.dist-info → gsmap-1.63.dist-info}/entry_points.txt +0 -0
gsMap/generate_ldscore.py
CHANGED
@@ -1,24 +1,19 @@
|
|
1
|
-
import argparse
|
2
1
|
import logging
|
2
|
+
import warnings
|
3
3
|
from pathlib import Path
|
4
4
|
|
5
5
|
import numpy as np
|
6
|
-
# %%
|
7
6
|
import pandas as pd
|
8
7
|
import pyranges as pr
|
8
|
+
import zarr
|
9
9
|
from scipy.sparse import csr_matrix
|
10
10
|
from tqdm import trange
|
11
11
|
|
12
|
-
from gsMap.config import GenerateLDScoreConfig
|
13
|
-
|
14
|
-
from gsMap.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
|
12
|
+
from gsMap.config import GenerateLDScoreConfig
|
13
|
+
from gsMap.utils.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
|
15
14
|
|
15
|
+
warnings.filterwarnings("ignore", category=FutureWarning)
|
16
16
|
logger = logging.getLogger(__name__)
|
17
|
-
logger.setLevel(logging.DEBUG)
|
18
|
-
handler = logging.StreamHandler()
|
19
|
-
handler.setFormatter(logging.Formatter(
|
20
|
-
'[{asctime}] {levelname:6s} {message}', style='{'))
|
21
|
-
logger.addHandler(handler)
|
22
17
|
|
23
18
|
|
24
19
|
# %%
|
@@ -30,7 +25,7 @@ def load_gtf(gtf_file, mk_score, window_size):
|
|
30
25
|
print("Loading gtf data")
|
31
26
|
#
|
32
27
|
# Load GTF file
|
33
|
-
gtf = pr.read_gtf(gtf_file)
|
28
|
+
gtf = pr.read_gtf(gtf_file, )
|
34
29
|
gtf = gtf.df
|
35
30
|
#
|
36
31
|
# Select the common genes
|
@@ -82,14 +77,16 @@ def load_bim(bfile_root, chrom):
|
|
82
77
|
"""
|
83
78
|
Load the bim file.
|
84
79
|
"""
|
85
|
-
print("Loading bim data")
|
86
80
|
bim = pd.read_csv(f'{bfile_root}.{chrom}.bim', sep='\t', header=None)
|
87
81
|
bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]
|
88
82
|
#
|
89
83
|
# Transform bim to PyRanges
|
90
84
|
bim_pr = bim.copy()
|
91
85
|
bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
|
92
|
-
|
86
|
+
|
87
|
+
bim_pr['End'] = bim_pr['Start'].copy()
|
88
|
+
bim_pr['Start'] = bim_pr['Start'] - 1 # Due to bim file is 1-based
|
89
|
+
|
93
90
|
bim_pr = pr.PyRanges(bim_pr)
|
94
91
|
bim_pr.Chromosome = f'chr{chrom}'
|
95
92
|
return bim, bim_pr
|
@@ -110,6 +107,36 @@ def Overlaps_gtf_bim(gtf_pr, bim_pr):
|
|
110
107
|
|
111
108
|
|
112
109
|
# %%
|
110
|
+
def filter_snps_by_keep_snp(bim_df, keep_snp_file):
|
111
|
+
# Load the keep_snp file and filter the BIM DataFrame
|
112
|
+
keep_snp = pd.read_csv(keep_snp_file, header=None)[0].to_list()
|
113
|
+
filtered_bim_df = bim_df[bim_df['SNP'].isin(keep_snp)]
|
114
|
+
return filtered_bim_df
|
115
|
+
|
116
|
+
|
117
|
+
def get_snp_counts(config):
|
118
|
+
snp_counts = {}
|
119
|
+
total_snp = 0
|
120
|
+
|
121
|
+
for chrom in range(1, 23):
|
122
|
+
bim_df, _ = load_bim(config.bfile_root, chrom)
|
123
|
+
|
124
|
+
if config.keep_snp_root:
|
125
|
+
keep_snp_file = f'{config.keep_snp_root}.{chrom}.snp'
|
126
|
+
filtered_bim_df = filter_snps_by_keep_snp(bim_df, keep_snp_file)
|
127
|
+
else:
|
128
|
+
filtered_bim_df = bim_df
|
129
|
+
|
130
|
+
snp_counts[chrom] = filtered_bim_df.shape[0]
|
131
|
+
total_snp += snp_counts[chrom]
|
132
|
+
|
133
|
+
snp_counts['total'] = total_snp
|
134
|
+
|
135
|
+
chrom_snp_length_array = np.array([snp_counts[chrom] for chrom in range(1, 23)]).cumsum()
|
136
|
+
|
137
|
+
snp_counts['chrom_snp_start_point'] = [0] + chrom_snp_length_array.tolist()
|
138
|
+
|
139
|
+
return snp_counts
|
113
140
|
|
114
141
|
|
115
142
|
# %%
|
@@ -189,7 +216,7 @@ def calculate_ldscore_from_annotation(SNP_annotation_df, chrom, bfile_root, ld_w
|
|
189
216
|
|
190
217
|
|
191
218
|
def calculate_ldscore_from_multiple_annotation(SNP_annotation_df_list, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
|
192
|
-
SNP_annotation_df = pd.concat(SNP_annotation_df_list, axis=1)
|
219
|
+
SNP_annotation_df = pd.concat(SNP_annotation_df_list, axis=1).astype(np.float32, copy=False)
|
193
220
|
|
194
221
|
snp_gene_weight_matrix = get_ldscore(bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind,
|
195
222
|
ld_unit=ld_unit)
|
@@ -212,7 +239,7 @@ class S_LDSC_Boost:
|
|
212
239
|
def __init__(self, config: GenerateLDScoreConfig):
|
213
240
|
self.config = config
|
214
241
|
|
215
|
-
self.mk_score = load_marker_score(config.
|
242
|
+
self.mk_score = load_marker_score(config.mkscore_feather_path)
|
216
243
|
|
217
244
|
# Load GTF and get common markers
|
218
245
|
self.gtf_pr, self.mk_score_common = load_gtf(config.gtf_annotation_file, self.mk_score,
|
@@ -237,6 +264,25 @@ class S_LDSC_Boost:
|
|
237
264
|
else:
|
238
265
|
self.enhancer_pr = None
|
239
266
|
|
267
|
+
# create tha zarr file
|
268
|
+
if config.ldscore_save_format == 'zarr':
|
269
|
+
|
270
|
+
chrom_snp_length_dict = get_snp_counts(config)
|
271
|
+
self.chrom_snp_start_point = chrom_snp_length_dict['chrom_snp_start_point']
|
272
|
+
|
273
|
+
zarr_path = Path(config.ldscore_save_dir) / f'{config.sample_name}.ldscore.zarr'
|
274
|
+
if not zarr_path.exists():
|
275
|
+
self.zarr_file = zarr.open(zarr_path.as_posix(), mode='a', dtype=np.float16,
|
276
|
+
chunks=config.zarr_chunk_size,
|
277
|
+
shape=(chrom_snp_length_dict['total'], self.mk_score_common.shape[1]))
|
278
|
+
zarr_path.mkdir(parents=True, exist_ok=True)
|
279
|
+
# save spot names
|
280
|
+
self.zarr_file.attrs['spot_names'] = self.mk_score_common.columns.to_list()
|
281
|
+
# save chrom_snp_length_dict
|
282
|
+
self.zarr_file.attrs['chrom_snp_start_point'] = self.chrom_snp_start_point
|
283
|
+
else:
|
284
|
+
self.zarr_file = zarr.open(zarr_path.as_posix(), mode='a')
|
285
|
+
|
240
286
|
def process_chromosome(self, chrom: int):
|
241
287
|
self.snp_pass_maf = get_snp_pass_maf(self.config.bfile_root, chrom, maf_min=0.05)
|
242
288
|
|
@@ -252,9 +298,9 @@ class S_LDSC_Boost:
|
|
252
298
|
self.keep_snp_mask = None
|
253
299
|
self.snp_name = self.snp_gene_pair_dummy.index.to_list()
|
254
300
|
|
255
|
-
if self.config.
|
256
|
-
|
257
|
-
additional_baseline_annotation_file_path =
|
301
|
+
if self.config.additional_baseline_annotation is not None:
|
302
|
+
additional_baseline_annotation = Path(self.config.additional_baseline_annotation)
|
303
|
+
additional_baseline_annotation_file_path = additional_baseline_annotation / f'baseline.{chrom}.annot.gz'
|
258
304
|
assert additional_baseline_annotation_file_path.exists(), f'additional_baseline_annotation_file_path not exists: {additional_baseline_annotation_file_path}'
|
259
305
|
additional_baseline_annotation_df = pd.read_csv(additional_baseline_annotation_file_path, sep='\t')
|
260
306
|
additional_baseline_annotation_df.set_index('SNP', inplace=True)
|
@@ -274,7 +320,7 @@ class S_LDSC_Boost:
|
|
274
320
|
additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
|
275
321
|
self.snp_gene_pair_dummy.index)
|
276
322
|
|
277
|
-
# do this for saving the cpu time,
|
323
|
+
# do this for saving the cpu time, only calculate r2 once
|
278
324
|
self.snp_gene_weight_matrix, additional_baseline_annotation_ldscore = (
|
279
325
|
calculate_ldscore_from_multiple_annotation(
|
280
326
|
[self.snp_gene_pair_dummy, additional_baseline_annotation_df],
|
@@ -283,21 +329,24 @@ class S_LDSC_Boost:
|
|
283
329
|
ld_wind=self.config.ld_wind,
|
284
330
|
ld_unit=self.config.ld_unit))
|
285
331
|
|
332
|
+
additional_baseline_annotation_ldscore = additional_baseline_annotation_ldscore.loc[self.snp_name]
|
333
|
+
# print(additional_baseline_annotation_ldscore.index.to_list()==self.snp_name)
|
334
|
+
|
286
335
|
ld_score_file = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather'
|
287
336
|
M_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M'
|
288
337
|
M_5_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50'
|
289
338
|
|
290
339
|
# save additional baseline annotation ldscore
|
291
|
-
self.
|
292
|
-
|
293
|
-
|
294
|
-
|
340
|
+
self.save_ldscore_to_feather(additional_baseline_annotation_ldscore.values,
|
341
|
+
column_names=additional_baseline_annotation_ldscore.columns,
|
342
|
+
save_file_name=ld_score_file,
|
343
|
+
)
|
295
344
|
|
296
345
|
# caculate the M and save
|
297
346
|
save_dir = Path(M_file_path).parent
|
298
347
|
save_dir.mkdir(parents=True, exist_ok=True)
|
299
348
|
M_chr_chunk = additional_baseline_annotation_df.values.sum(axis=0, keepdims=True)
|
300
|
-
M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(axis=0,keepdims=True)
|
349
|
+
M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(axis=0, keepdims=True)
|
301
350
|
np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
|
302
351
|
np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )
|
303
352
|
|
@@ -307,13 +356,27 @@ class S_LDSC_Boost:
|
|
307
356
|
self.config.bfile_root,
|
308
357
|
ld_wind=self.config.ld_wind,
|
309
358
|
ld_unit=self.config.ld_unit)
|
359
|
+
# only keep the snp in keep_snp_root
|
360
|
+
if self.keep_snp_mask is not None:
|
361
|
+
self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]
|
362
|
+
|
363
|
+
if self.config.save_pre_calculate_snp_gene_weight_matrix:
|
364
|
+
snp_gene_weight_matrix_save_dir = Path(self.config.ldscore_save_dir) / 'snp_gene_weight_matrix'
|
365
|
+
snp_gene_weight_matrix_save_dir.mkdir(parents=True, exist_ok=True)
|
366
|
+
logger.info(f'Saving snp_gene_weight_matrix for chr{chrom}...')
|
367
|
+
self.snp_gene_weight_matrix.reset_index().to_feather(
|
368
|
+
snp_gene_weight_matrix_save_dir / f'{chrom}.snp_gene_weight_matrix.feather')
|
369
|
+
|
310
370
|
# convert to sparse
|
311
371
|
self.snp_gene_weight_matrix = csr_matrix(self.snp_gene_weight_matrix)
|
372
|
+
logger.info(f'Compute snp_gene_weight_matrix finished. shape: {self.snp_gene_weight_matrix.shape}')
|
312
373
|
|
313
374
|
# calculate baseline ld score
|
375
|
+
logger.info(f'Calculating baseline ld score for chr{chrom}...')
|
314
376
|
self.calculate_ldscore_for_base_line(chrom, self.config.sample_name, self.config.ldscore_save_dir)
|
315
377
|
|
316
378
|
# calculate ld score for annotation
|
379
|
+
logger.info(f'Calculating ld score for annotation for chr{chrom}...')
|
317
380
|
self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
|
318
381
|
self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]],
|
319
382
|
chrom,
|
@@ -323,7 +386,6 @@ class S_LDSC_Boost:
|
|
323
386
|
|
324
387
|
def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(self,
|
325
388
|
mk_score_chunk,
|
326
|
-
save_file_name,
|
327
389
|
drop_dummy_na=True,
|
328
390
|
):
|
329
391
|
|
@@ -332,20 +394,18 @@ class S_LDSC_Boost:
|
|
332
394
|
else:
|
333
395
|
ldscore_chr_chunk = self.snp_gene_weight_matrix @ mk_score_chunk
|
334
396
|
|
335
|
-
|
336
|
-
column_names=mk_score_chunk.columns,
|
337
|
-
save_file_name=save_file_name,
|
338
|
-
)
|
397
|
+
return ldscore_chr_chunk
|
339
398
|
|
340
|
-
def
|
399
|
+
def save_ldscore_to_feather(self, ldscore_chr_chunk: np.ndarray, column_names, save_file_name):
|
341
400
|
save_dir = Path(save_file_name).parent
|
342
401
|
save_dir.mkdir(parents=True, exist_ok=True)
|
343
402
|
|
344
403
|
ldscore_chr_chunk = ldscore_chr_chunk.astype(np.float16, copy=False)
|
345
404
|
# avoid overflow of float16, if inf, set to max of float16
|
346
405
|
ldscore_chr_chunk[np.isinf(ldscore_chr_chunk)] = np.finfo(np.float16).max
|
347
|
-
ldscore_chr_chunk = ldscore_chr_chunk if self.config.keep_snp_root is None else ldscore_chr_chunk[
|
348
|
-
|
406
|
+
# ldscore_chr_chunk = ldscore_chr_chunk if self.config.keep_snp_root is None else ldscore_chr_chunk[
|
407
|
+
# self.keep_snp_mask]
|
408
|
+
|
349
409
|
# save for each chunk
|
350
410
|
df = pd.DataFrame(ldscore_chr_chunk,
|
351
411
|
index=self.snp_name,
|
@@ -354,6 +414,20 @@ class S_LDSC_Boost:
|
|
354
414
|
df.index.name = 'SNP'
|
355
415
|
df.reset_index().to_feather(save_file_name)
|
356
416
|
|
417
|
+
def save_ldscore_chunk_to_zarr(self, ldscore_chr_chunk: np.ndarray,
|
418
|
+
chrom: int, start_col_index,
|
419
|
+
):
|
420
|
+
ldscore_chr_chunk = ldscore_chr_chunk.astype(np.float16, copy=False)
|
421
|
+
# avoid overflow of float16, if inf, set to max of float16
|
422
|
+
ldscore_chr_chunk[np.isinf(ldscore_chr_chunk)] = np.finfo(np.float16).max
|
423
|
+
|
424
|
+
# save for each chunk
|
425
|
+
chrom_snp_start_point = self.chrom_snp_start_point[chrom - 1]
|
426
|
+
chrom_snp_end_point = self.chrom_snp_start_point[chrom]
|
427
|
+
|
428
|
+
self.zarr_file[chrom_snp_start_point:chrom_snp_end_point,
|
429
|
+
start_col_index:start_col_index + ldscore_chr_chunk.shape[1]] = ldscore_chr_chunk
|
430
|
+
|
357
431
|
def calculate_M_use_SNP_gene_pair_dummy_by_chunk(self,
|
358
432
|
mk_score_chunk,
|
359
433
|
M_file_path, M_5_file_path,
|
@@ -377,7 +451,6 @@ class S_LDSC_Boost:
|
|
377
451
|
np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
|
378
452
|
np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )
|
379
453
|
|
380
|
-
|
381
454
|
def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(self, mk_score_common, chrom, sample_name, save_dir):
|
382
455
|
"""
|
383
456
|
Calculate the LD score using the SNP-gene weight matrix.
|
@@ -393,11 +466,23 @@ class S_LDSC_Boost:
|
|
393
466
|
M_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M'
|
394
467
|
M_5_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50'
|
395
468
|
|
396
|
-
self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
|
469
|
+
ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
|
397
470
|
mk_score_chunk,
|
398
|
-
save_file_name=ld_score_file,
|
399
471
|
drop_dummy_na=True,
|
400
472
|
)
|
473
|
+
if self.config.ldscore_save_format == 'feather':
|
474
|
+
self.save_ldscore_to_feather(ldscore_chr_chunk,
|
475
|
+
column_names=mk_score_chunk.columns,
|
476
|
+
save_file_name=ld_score_file,
|
477
|
+
)
|
478
|
+
elif self.config.ldscore_save_format == 'zarr':
|
479
|
+
self.save_ldscore_chunk_to_zarr(ldscore_chr_chunk,
|
480
|
+
chrom=chrom,
|
481
|
+
start_col_index=i,
|
482
|
+
)
|
483
|
+
else:
|
484
|
+
raise ValueError(f'Invalid ldscore_save_format: {self.config.ldscore_save_format}')
|
485
|
+
|
401
486
|
self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
|
402
487
|
mk_score_chunk,
|
403
488
|
M_file,
|
@@ -417,11 +502,15 @@ class S_LDSC_Boost:
|
|
417
502
|
M_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M'
|
418
503
|
M_5_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M_5_50'
|
419
504
|
|
420
|
-
self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
|
505
|
+
ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
|
421
506
|
baseline_mk_score_df,
|
422
|
-
save_file_name=ld_score_file,
|
423
507
|
drop_dummy_na=False,
|
424
508
|
)
|
509
|
+
|
510
|
+
self.save_ldscore_to_feather(ldscore_chr_chunk,
|
511
|
+
column_names=baseline_mk_score_df.columns,
|
512
|
+
save_file_name=ld_score_file,
|
513
|
+
)
|
425
514
|
# save baseline M
|
426
515
|
self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
|
427
516
|
baseline_mk_score_df,
|
@@ -435,6 +524,7 @@ class S_LDSC_Boost:
|
|
435
524
|
Get the dummy matrix of SNP-gene pairs.
|
436
525
|
"""
|
437
526
|
# Load the bim file
|
527
|
+
print("Loading bim data")
|
438
528
|
bim, bim_pr = load_bim(self.config.bfile_root, chrom)
|
439
529
|
|
440
530
|
if self.config.gene_window_enhancer_priority in ['gene_window_first', 'enhancer_first']:
|
@@ -468,6 +558,12 @@ class S_LDSC_Boost:
|
|
468
558
|
else:
|
469
559
|
raise ValueError('gtf_pr and enhancer_pr cannot be None at the same time')
|
470
560
|
|
561
|
+
# save the SNP_gene_pair to feather
|
562
|
+
SNP_gene_pair_save_path = Path(
|
563
|
+
self.config.ldscore_save_dir) / f'SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather'
|
564
|
+
SNP_gene_pair_save_path.parent.mkdir(parents=True, exist_ok=True)
|
565
|
+
SNP_gene_pair.reset_index().to_feather(SNP_gene_pair_save_path)
|
566
|
+
|
471
567
|
# Get the dummy matrix
|
472
568
|
SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair['gene_name'], dummy_na=True)
|
473
569
|
return SNP_gene_pair_dummy
|
@@ -502,50 +598,21 @@ class S_LDSC_Boost:
|
|
502
598
|
|
503
599
|
|
504
600
|
def run_generate_ldscore(config: GenerateLDScoreConfig):
|
601
|
+
if config.ldscore_save_format == 'quick_mode':
|
602
|
+
logger.info('Running in quick_mode. Skip the process of generating ldscore. Using the pre-calculated ldscore.')
|
603
|
+
ldscore_save_dir = config.ldscore_save_dir
|
604
|
+
|
605
|
+
# link the baseline annotation
|
606
|
+
baseline_annotation_dir = Path(config.baseline_annotation_dir)
|
607
|
+
(ldscore_save_dir / 'baseline').symlink_to(baseline_annotation_dir, target_is_directory=True)
|
608
|
+
|
609
|
+
# link the SNP_gene_pair
|
610
|
+
SNP_gene_pair_dir = Path(config.SNP_gene_pair_dir)
|
611
|
+
(ldscore_save_dir / 'SNP_gene_pair').symlink_to(SNP_gene_pair_dir, target_is_directory=True)
|
612
|
+
return
|
505
613
|
s_ldsc_boost = S_LDSC_Boost(config)
|
506
614
|
if config.chrom == 'all':
|
507
615
|
for chrom in range(1, 23):
|
508
616
|
s_ldsc_boost.process_chromosome(chrom)
|
509
617
|
else:
|
510
618
|
s_ldsc_boost.process_chromosome(config.chrom)
|
511
|
-
|
512
|
-
|
513
|
-
# %%
|
514
|
-
if __name__ == '__main__':
|
515
|
-
TEST = True
|
516
|
-
if TEST:
|
517
|
-
# %%
|
518
|
-
sample_name = 'Cortex_151507'
|
519
|
-
chrom = 'all'
|
520
|
-
save_dir = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/Cortex_151507/snp_annotation/test/0101/sparse'
|
521
|
-
# %%
|
522
|
-
gtf_file = '/storage/yangjianLab/songliyang/ReferenceGenome/GRCh37/gencode.v39lift37.annotation.gtf'
|
523
|
-
mkscore_feather_file = f'/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/{sample_name}/gene_markers/{sample_name}_rank.feather'
|
524
|
-
bfile_root = '/storage/yangjianLab/sharedata/LDSC_resource/1000G_EUR_Phase3_plink/1000G.EUR.QC'
|
525
|
-
window_size = 50000
|
526
|
-
keep_snp_root = '/storage/yangjianLab/sharedata/LDSC_resource/hapmap3_snps/hm'
|
527
|
-
spots_per_chunk = 10_000
|
528
|
-
enhancer_annotation = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/resource/epigenome/cleaned_data/by_tissue/BRN/ABC_roadmap_merged.bed'
|
529
|
-
# %%
|
530
|
-
config = GenerateLDScoreConfig(
|
531
|
-
sample_name=sample_name,
|
532
|
-
chrom=chrom,
|
533
|
-
ldscore_save_dir=save_dir,
|
534
|
-
gtf_annotation_file=gtf_file,
|
535
|
-
mkscore_feather_file=mkscore_feather_file,
|
536
|
-
bfile_root=bfile_root,
|
537
|
-
keep_snp_root=keep_snp_root,
|
538
|
-
gene_window_size=window_size,
|
539
|
-
spots_per_chunk=spots_per_chunk,
|
540
|
-
enhancer_annotation_file=enhancer_annotation,
|
541
|
-
gene_window_enhancer_priority='enhancer_first',
|
542
|
-
additional_baseline_annotation_dir_path='/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/resource/ldsc/baseline_v1.2/remove_base'
|
543
|
-
)
|
544
|
-
# %%
|
545
|
-
run_generate_ldscore(config)
|
546
|
-
else:
|
547
|
-
parser = argparse.ArgumentParser(description="Configuration for the application.")
|
548
|
-
add_generate_ldscore_args(parser)
|
549
|
-
args = parser.parse_args()
|
550
|
-
config = GenerateLDScoreConfig(**vars(args))
|
551
|
-
run_generate_ldscore(config)
|
gsMap/latent_to_gene.py
CHANGED
@@ -1,8 +1,4 @@
|
|
1
|
-
import argparse
|
2
1
|
import logging
|
3
|
-
import multiprocessing
|
4
|
-
import pprint
|
5
|
-
import time
|
6
2
|
from pathlib import Path
|
7
3
|
|
8
4
|
import numpy as np
|
@@ -14,14 +10,9 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
14
10
|
from sklearn.neighbors import NearestNeighbors
|
15
11
|
from tqdm import tqdm
|
16
12
|
|
17
|
-
from gsMap.config import
|
13
|
+
from gsMap.config import LatentToGeneConfig
|
18
14
|
|
19
15
|
logger = logging.getLogger(__name__)
|
20
|
-
logger.setLevel(logging.DEBUG)
|
21
|
-
handler = logging.StreamHandler()
|
22
|
-
handler.setFormatter(logging.Formatter(
|
23
|
-
'[{asctime}] {levelname:8s} {filename} {message}', style='{'))
|
24
|
-
logger.addHandler(handler)
|
25
16
|
|
26
17
|
|
27
18
|
def find_Neighbors(coor, num_neighbour):
|
@@ -49,31 +40,31 @@ def _build_spatial_net(adata, annotation, num_neighbour):
|
|
49
40
|
"""
|
50
41
|
1 Build spatial neighbourhood matrix for each spot (cell) based on the spatial coord
|
51
42
|
"""
|
52
|
-
|
43
|
+
logger.info(f'------Building spatial graph based on spatial coordinates...')
|
53
44
|
|
54
45
|
coor = pd.DataFrame(adata.obsm['spatial'])
|
55
46
|
coor.index = adata.obs.index
|
56
47
|
|
57
48
|
if not annotation is None:
|
58
|
-
|
49
|
+
logger.info(f'Cell annotations are provided...')
|
59
50
|
spatial_net = pd.DataFrame()
|
60
51
|
# Cells with annotations
|
61
52
|
for ct in adata.obs[annotation].dropna().unique():
|
62
53
|
coor_temp = coor.loc[adata.obs[annotation] == ct, :]
|
63
54
|
spatial_net_temp = find_Neighbors(coor_temp, min(num_neighbour, coor_temp.shape[0]))
|
64
55
|
spatial_net = pd.concat((spatial_net, spatial_net_temp), axis=0)
|
65
|
-
|
56
|
+
logger.info(f'{ct}: {coor_temp.shape[0]} cells')
|
66
57
|
|
67
58
|
# Cells labeled as nan
|
68
59
|
if pd.isnull(adata.obs[annotation]).any():
|
69
60
|
cell_nan = adata.obs.index[np.where(pd.isnull(adata.obs[annotation]))[0]]
|
70
|
-
|
61
|
+
logger.info(f'Nan: {len(cell_nan)} cells')
|
71
62
|
|
72
63
|
spatial_net_temp = find_Neighbors(coor, num_neighbour)
|
73
64
|
spatial_net_temp = spatial_net_temp.loc[spatial_net_temp.Cell1.isin(cell_nan), :]
|
74
65
|
spatial_net = pd.concat((spatial_net, spatial_net_temp), axis=0)
|
75
66
|
else:
|
76
|
-
|
67
|
+
logger.info(f'Cell annotations are not provided...')
|
77
68
|
spatial_net = find_Neighbors(coor, num_neighbour)
|
78
69
|
|
79
70
|
return spatial_net
|
@@ -117,7 +108,7 @@ def _compute_regional_mkscore(cell_tg, ):
|
|
117
108
|
# Simultaneously consider the ratio of expression fractions and ranks
|
118
109
|
gene_ranks_region = (gene_ranks_region * frac_region).values
|
119
110
|
|
120
|
-
mkscore = np.exp(gene_ranks_region **
|
111
|
+
mkscore = np.exp(gene_ranks_region ** 1.5) - 1
|
121
112
|
return mkscore.astype(np.float16, copy=False)
|
122
113
|
|
123
114
|
|
@@ -125,31 +116,39 @@ def run_latent_to_gene(config: LatentToGeneConfig):
|
|
125
116
|
global adata, coor_latent, spatial_net, ranks, frac_whole, args, spatial_net_dict, expressed_mask
|
126
117
|
args = config
|
127
118
|
# Load and process the spatial data
|
128
|
-
|
129
|
-
adata = sc.read_h5ad(config.
|
119
|
+
logger.info('------Loading the spatial data...')
|
120
|
+
adata = sc.read_h5ad(config.hdf5_with_latent_path)
|
121
|
+
|
122
|
+
logger.info('------Ranking the spatial data...')
|
123
|
+
adata.layers['rank'] = rankdata(adata.X.toarray().astype(np.float32), axis=1).astype(np.float32)
|
124
|
+
|
130
125
|
if not config.annotation is None:
|
131
|
-
|
126
|
+
logger.info(f'------Cell annotations are provided as {config.annotation}...')
|
132
127
|
adata = adata[~pd.isnull(adata.obs[config.annotation]), :]
|
133
|
-
# Homologs transformation
|
134
|
-
if not config.species is None:
|
135
|
-
print(f'------Transforming the {config.species} to HUMAN_GENE_SYM...')
|
136
|
-
homologs = pd.read_csv(config.gs_species, sep='\t')
|
137
|
-
homologs.index = homologs[config.species]
|
138
|
-
adata = adata[:, adata.var_names.isin(homologs[config.species])]
|
139
|
-
print(f'{adata.shape[1]} genes left after homologs transformation.')
|
140
|
-
adata.var_names = homologs.loc[adata.var_names, 'HUMAN_GENE_SYM']
|
141
|
-
# Process the data
|
142
|
-
if config.type == 'count':
|
143
|
-
adata.X = adata.layers[config.type]
|
144
|
-
sc.pp.normalize_total(adata, target_sum=1e4)
|
145
|
-
sc.pp.log1p(adata)
|
146
|
-
else:
|
147
|
-
adata.X = adata.layers[config.type]
|
148
128
|
|
149
|
-
|
150
|
-
|
129
|
+
# Homologs transformation
|
130
|
+
if not config.homolog_file is None:
|
131
|
+
logger.info(f'------Transforming the {config.species} to HUMAN_GENE_SYM...')
|
132
|
+
homologs = pd.read_csv(config.homolog_file, sep='\t')
|
133
|
+
if homologs.shape[1] != 2:
|
134
|
+
raise ValueError(
|
135
|
+
"Homologs file must have two columns: one for the species and one for the human gene symbol.")
|
136
|
+
|
137
|
+
homologs.columns = [config.species, 'HUMAN_GENE_SYM']
|
138
|
+
homologs.set_index(config.species, inplace=True)
|
139
|
+
adata = adata[:, adata.var_names.isin(homologs.index)]
|
140
|
+
# Log the number of genes left after homolog transformation
|
141
|
+
logger.info(f"{adata.shape[1]} genes retained after homolog transformation.")
|
142
|
+
if adata.shape[1] < 100:
|
143
|
+
raise ValueError("Too few genes retained in ST data (<100).")
|
144
|
+
adata.var_names = homologs.loc[adata.var_names, 'HUMAN_GENE_SYM'].values
|
145
|
+
# drop duplicated genes
|
146
|
+
adata = adata[:, ~adata.var_names.duplicated()]
|
147
|
+
|
148
|
+
# Remove cells that do not express any genes after transformation, and genes that are not expressed in any cells.
|
149
|
+
logger.info(f'Number of cells, genes of the input data: {adata.shape[0]},{adata.shape[1]}')
|
151
150
|
adata = adata[adata.X.sum(axis=1) > 0, adata.X.sum(axis=0) > 0]
|
152
|
-
|
151
|
+
logger.info(f'Number of cells, genes after transformation: {adata.shape[0]},{adata.shape[1]}')
|
153
152
|
# Buid the spatial graph
|
154
153
|
spatial_net = _build_spatial_net(adata, config.annotation, config.num_neighbour_spatial)
|
155
154
|
spatial_net.set_index('Cell1', inplace=True)
|
@@ -163,27 +162,31 @@ def run_latent_to_gene(config: LatentToGeneConfig):
|
|
163
162
|
cell_list = adata.obs.index.tolist()
|
164
163
|
|
165
164
|
# Load the geometrical mean across slices
|
166
|
-
if
|
167
|
-
|
165
|
+
if config.gM_slices is not None:
|
166
|
+
logger.info('Geometrical mean across multiple slices is provided.')
|
168
167
|
gM = pd.read_parquet(config.gM_slices)
|
169
|
-
|
168
|
+
if config.species is not None:
|
169
|
+
homologs = pd.read_csv(config.homolog_file, sep='\t', header=None)
|
170
|
+
if homologs.shape[1] < 2:
|
171
|
+
raise ValueError(
|
172
|
+
"Homologs file must have at least two columns: one for the species and one for the human gene symbol.")
|
173
|
+
homologs.columns = [config.species, 'HUMAN_GENE_SYM']
|
174
|
+
homologs.set_index(config.species, inplace=True)
|
175
|
+
gM = gM.loc[gM.index.isin(homologs.index)]
|
176
|
+
gM.index = homologs.loc[gM.index, 'HUMAN_GENE_SYM'].values
|
170
177
|
common_gene = np.intersect1d(adata.var_names, gM.index)
|
171
178
|
gM = gM.loc[common_gene]
|
172
|
-
gM = gM['G_Mean'].
|
173
|
-
print('------Ranking the spatial data...')
|
179
|
+
gM = gM['G_Mean'].to_numpy()
|
174
180
|
adata = adata[:, common_gene]
|
175
|
-
ranks = np.apply_along_axis(rankdata, 1, adata.X.toarray())
|
176
181
|
else:
|
177
|
-
|
178
|
-
ranks = rankdata(adata.X.toarray().astype(np.float32), axis=1).astype(np.float32)
|
179
|
-
gM = gmean(ranks, axis=0)
|
182
|
+
gM = gmean(adata.layers['rank'], axis=0)
|
180
183
|
|
181
184
|
# Compute the fraction of each gene across cells
|
182
185
|
expressed_mask = pd.DataFrame((adata.X > 0).toarray(), index=adata.obs.index, columns=adata.var.index)
|
183
|
-
# frac_whole = np.array((
|
186
|
+
# frac_whole = np.array((adata_layer > 0).sum(axis=0))[0] / (adata.shape[0])
|
184
187
|
frac_whole = np.array(expressed_mask.sum(axis=0)) / (adata.shape[0])
|
185
188
|
# Normalize the geometrical mean
|
186
|
-
ranks =
|
189
|
+
ranks = adata.layers['rank'] / gM
|
187
190
|
ranks = pd.DataFrame(ranks, index=adata.obs_names)
|
188
191
|
ranks.columns = adata.var.index
|
189
192
|
mk_score = [
|
@@ -192,66 +195,24 @@ def run_latent_to_gene(config: LatentToGeneConfig):
|
|
192
195
|
desc="Finding markers (Rank-based approach) | cells")
|
193
196
|
]
|
194
197
|
# Normalize the marker scores
|
195
|
-
mk_score = pd.DataFrame(np.vstack(mk_score).T, index=adata.
|
198
|
+
mk_score = pd.DataFrame(np.vstack(mk_score).T, index=adata.var_names, columns=cell_list)
|
196
199
|
# mk_score_normalized = mk_score.div(mk_score.sum())*1e+2
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
mk_score = mk_score[
|
201
|
-
|
200
|
+
|
201
|
+
# Remove the mitochondrial genes from mk_score
|
202
|
+
mt_gene_mask = ~adata.var_names.str.startswith(('MT-', 'mt-'))
|
203
|
+
mk_score = mk_score[mt_gene_mask]
|
204
|
+
adata = adata[:, mt_gene_mask]
|
205
|
+
|
206
|
+
# # Save the mk_score DataFrame to an adata layer
|
207
|
+
# adata.layers['mkscore'] = mk_score.values.T
|
208
|
+
|
202
209
|
# Save the marker scores
|
203
|
-
|
204
|
-
output_file_path = Path(config.
|
210
|
+
logger.info(f'------Saving marker scores ...')
|
211
|
+
output_file_path = Path(config.mkscore_feather_path)
|
205
212
|
output_file_path.parent.mkdir(parents=True, exist_ok=True, mode=0o755)
|
206
213
|
mk_score.reset_index(inplace=True)
|
207
214
|
mk_score.rename(columns={mk_score.columns[0]: 'HUMAN_GENE_SYM'}, inplace=True)
|
208
215
|
mk_score.to_feather(output_file_path)
|
209
216
|
|
210
|
-
|
211
|
-
|
212
|
-
parser = argparse.ArgumentParser(description="Process latent to gene data.")
|
213
|
-
add_latent_to_gene_args(parser)
|
214
|
-
TEST = True
|
215
|
-
if TEST:
|
216
|
-
name = 'Cortex_151507'
|
217
|
-
test_dir = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021'
|
218
|
-
|
219
|
-
args = parser.parse_args([
|
220
|
-
'--input_hdf5_with_latent_path', f'{test_dir}/{name}/hdf5/{name}_add_latent.h5ad',
|
221
|
-
'--sample_name', f'{name}',
|
222
|
-
'--output_feather_path', f'{test_dir}/{name}/gene_markers/{name}_rank.feather',
|
223
|
-
'--method', 'rank',
|
224
|
-
'--latent_representation', 'latent_GVAE',
|
225
|
-
'--type', 'count',
|
226
|
-
'--annotation', 'layer_guess',
|
227
|
-
'--num_neighbour', '51',
|
228
|
-
# '--no_expression_fraction',
|
229
|
-
|
230
|
-
])
|
231
|
-
|
232
|
-
# config = LatentToGeneConfig(
|
233
|
-
# **{'annotation': 'SubClass',
|
234
|
-
# 'fold': 1.0,
|
235
|
-
# 'gM_slices': None,
|
236
|
-
# 'gs_species': '/storage/yangjianLab/songliyang/SpatialData/homologs/macaque_human_homologs.txt',
|
237
|
-
# 'input_hdf5_with_latent_path': '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/macaque/T121_macaque1/find_latent_representations/T121_macaque1_add_latent.h5ad',
|
238
|
-
# 'latent_representation': 'latent_GVAE',
|
239
|
-
# 'method': 'rank',
|
240
|
-
# 'num_neighbour': 51,
|
241
|
-
# 'num_neighbour_spatial': 201,
|
242
|
-
# 'output_feather_path': '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/macaque/T121_macaque1/latent_to_gene/T121_macaque1_gene_marker_score.feather',
|
243
|
-
# 'pst': 0.2,
|
244
|
-
# 'sample_name': 'T121_macaque1',
|
245
|
-
# 'species': 'MACAQUE_GENE_SYM',
|
246
|
-
# 'type': 'SCT'}
|
247
|
-
# )
|
248
|
-
else:
|
249
|
-
args = parser.parse_args()
|
250
|
-
config = LatentToGeneConfig(**vars(args))
|
251
|
-
logger.info(f'Latent to gene for {args.sample_name}...')
|
252
|
-
pprint.pprint(config)
|
253
|
-
start_time = time.time()
|
254
|
-
run_latent_to_gene(config)
|
255
|
-
end_time = time.time()
|
256
|
-
logger.info(
|
257
|
-
f'Latent to gene for {config.sample_name} finished. Time spent: {(end_time - start_time) / 60:.2f} min.')
|
217
|
+
# Save the modified adata object to disk
|
218
|
+
adata.write(config.hdf5_with_latent_path)
|