gsMap 1.62__py3-none-any.whl → 1.64__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsMap/generate_ldscore.py CHANGED
@@ -1,24 +1,19 @@
1
- import argparse
2
1
  import logging
2
+ import warnings
3
3
  from pathlib import Path
4
4
 
5
5
  import numpy as np
6
- # %%
7
6
  import pandas as pd
8
7
  import pyranges as pr
8
+ import zarr
9
9
  from scipy.sparse import csr_matrix
10
10
  from tqdm import trange
11
11
 
12
- from gsMap.config import GenerateLDScoreConfig, add_generate_ldscore_args
13
- # %%
14
- from gsMap.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
12
+ from gsMap.config import GenerateLDScoreConfig
13
+ from gsMap.utils.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
15
14
 
15
+ warnings.filterwarnings("ignore", category=FutureWarning)
16
16
  logger = logging.getLogger(__name__)
17
- logger.setLevel(logging.DEBUG)
18
- handler = logging.StreamHandler()
19
- handler.setFormatter(logging.Formatter(
20
- '[{asctime}] {levelname:6s} {message}', style='{'))
21
- logger.addHandler(handler)
22
17
 
23
18
 
24
19
  # %%
@@ -30,7 +25,7 @@ def load_gtf(gtf_file, mk_score, window_size):
30
25
  print("Loading gtf data")
31
26
  #
32
27
  # Load GTF file
33
- gtf = pr.read_gtf(gtf_file)
28
+ gtf = pr.read_gtf(gtf_file, )
34
29
  gtf = gtf.df
35
30
  #
36
31
  # Select the common genes
@@ -82,14 +77,16 @@ def load_bim(bfile_root, chrom):
82
77
  """
83
78
  Load the bim file.
84
79
  """
85
- print("Loading bim data")
86
80
  bim = pd.read_csv(f'{bfile_root}.{chrom}.bim', sep='\t', header=None)
87
81
  bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]
88
82
  #
89
83
  # Transform bim to PyRanges
90
84
  bim_pr = bim.copy()
91
85
  bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
92
- bim_pr['End'] = bim_pr['Start']
86
+
87
+ bim_pr['End'] = bim_pr['Start'].copy()
88
+ bim_pr['Start'] = bim_pr['Start'] - 1 # Due to bim file is 1-based
89
+
93
90
  bim_pr = pr.PyRanges(bim_pr)
94
91
  bim_pr.Chromosome = f'chr{chrom}'
95
92
  return bim, bim_pr
@@ -110,6 +107,36 @@ def Overlaps_gtf_bim(gtf_pr, bim_pr):
110
107
 
111
108
 
112
109
  # %%
110
def filter_snps_by_keep_snp(bim_df, keep_snp_file):
    """
    Keep only the rows of a BIM table whose SNP id is listed in a keep file.

    Parameters
    ----------
    bim_df : pandas.DataFrame
        BIM table; must contain a ``SNP`` column.
    keep_snp_file : str or path-like
        Header-less file readable by ``pandas.read_csv``; only its first
        column is used as the list of SNP ids to keep.

    Returns
    -------
    pandas.DataFrame
        The rows of ``bim_df`` whose ``SNP`` value occurs in the keep file,
        in their original order and with their original index labels.
    """
    # The first column can be handed to ``isin`` directly (matching is done on
    # values, not on index labels), so there is no need to build a Python list.
    keep_snp = pd.read_csv(keep_snp_file, header=None)[0]
    return bim_df[bim_df['SNP'].isin(keep_snp)]
115
+
116
+
117
def get_snp_counts(config):
    """
    Count the SNPs of chromosomes 1-22 and derive their row offsets.

    For every chromosome the BIM file under ``config.bfile_root`` is loaded;
    when ``config.keep_snp_root`` is set, the table is first restricted to the
    SNPs listed in ``{keep_snp_root}.{chrom}.snp``.

    Returns a dict holding the per-chromosome SNP count (keyed by the integer
    chromosome number), the overall count under ``'total'``, and under
    ``'chrom_snp_start_point'`` a list of 23 cumulative offsets starting at 0,
    so that entries ``chrom - 1`` and ``chrom`` bracket that chromosome's rows.
    """
    autosomes = range(1, 23)
    snp_counts = {}

    for chrom in autosomes:
        chrom_bim, _ = load_bim(config.bfile_root, chrom)
        if config.keep_snp_root:
            chrom_bim = filter_snps_by_keep_snp(chrom_bim, f'{config.keep_snp_root}.{chrom}.snp')
        snp_counts[chrom] = chrom_bim.shape[0]

    per_chrom_counts = [snp_counts[chrom] for chrom in autosomes]
    snp_counts['total'] = sum(per_chrom_counts)
    # Prepend 0 so that offsets[c - 1]:offsets[c] is the row slice of chromosome c.
    snp_counts['chrom_snp_start_point'] = [0] + np.cumsum(per_chrom_counts).tolist()
    return snp_counts
113
140
 
114
141
 
115
142
  # %%
@@ -189,7 +216,7 @@ def calculate_ldscore_from_annotation(SNP_annotation_df, chrom, bfile_root, ld_w
189
216
 
190
217
 
191
218
  def calculate_ldscore_from_multiple_annotation(SNP_annotation_df_list, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
192
- SNP_annotation_df = pd.concat(SNP_annotation_df_list, axis=1)
219
+ SNP_annotation_df = pd.concat(SNP_annotation_df_list, axis=1).astype(np.float32, copy=False)
193
220
 
194
221
  snp_gene_weight_matrix = get_ldscore(bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind,
195
222
  ld_unit=ld_unit)
@@ -212,7 +239,7 @@ class S_LDSC_Boost:
212
239
  def __init__(self, config: GenerateLDScoreConfig):
213
240
  self.config = config
214
241
 
215
- self.mk_score = load_marker_score(config.mkscore_feather_file)
242
+ self.mk_score = load_marker_score(config.mkscore_feather_path)
216
243
 
217
244
  # Load GTF and get common markers
218
245
  self.gtf_pr, self.mk_score_common = load_gtf(config.gtf_annotation_file, self.mk_score,
@@ -237,6 +264,25 @@ class S_LDSC_Boost:
237
264
  else:
238
265
  self.enhancer_pr = None
239
266
 
267
+ # create tha zarr file
268
+ if config.ldscore_save_format == 'zarr':
269
+
270
+ chrom_snp_length_dict = get_snp_counts(config)
271
+ self.chrom_snp_start_point = chrom_snp_length_dict['chrom_snp_start_point']
272
+
273
+ zarr_path = Path(config.ldscore_save_dir) / f'{config.sample_name}.ldscore.zarr'
274
+ if not zarr_path.exists():
275
+ self.zarr_file = zarr.open(zarr_path.as_posix(), mode='a', dtype=np.float16,
276
+ chunks=config.zarr_chunk_size,
277
+ shape=(chrom_snp_length_dict['total'], self.mk_score_common.shape[1]))
278
+ zarr_path.mkdir(parents=True, exist_ok=True)
279
+ # save spot names
280
+ self.zarr_file.attrs['spot_names'] = self.mk_score_common.columns.to_list()
281
+ # save chrom_snp_length_dict
282
+ self.zarr_file.attrs['chrom_snp_start_point'] = self.chrom_snp_start_point
283
+ else:
284
+ self.zarr_file = zarr.open(zarr_path.as_posix(), mode='a')
285
+
240
286
  def process_chromosome(self, chrom: int):
241
287
  self.snp_pass_maf = get_snp_pass_maf(self.config.bfile_root, chrom, maf_min=0.05)
242
288
 
@@ -252,9 +298,9 @@ class S_LDSC_Boost:
252
298
  self.keep_snp_mask = None
253
299
  self.snp_name = self.snp_gene_pair_dummy.index.to_list()
254
300
 
255
- if self.config.additional_baseline_annotation_dir_path is not None:
256
- additional_baseline_annotation_dir_path = Path(self.config.additional_baseline_annotation_dir_path)
257
- additional_baseline_annotation_file_path = additional_baseline_annotation_dir_path / f'baseline.{chrom}.annot.gz'
301
+ if self.config.additional_baseline_annotation is not None:
302
+ additional_baseline_annotation = Path(self.config.additional_baseline_annotation)
303
+ additional_baseline_annotation_file_path = additional_baseline_annotation / f'baseline.{chrom}.annot.gz'
258
304
  assert additional_baseline_annotation_file_path.exists(), f'additional_baseline_annotation_file_path not exists: {additional_baseline_annotation_file_path}'
259
305
  additional_baseline_annotation_df = pd.read_csv(additional_baseline_annotation_file_path, sep='\t')
260
306
  additional_baseline_annotation_df.set_index('SNP', inplace=True)
@@ -274,7 +320,7 @@ class S_LDSC_Boost:
274
320
  additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
275
321
  self.snp_gene_pair_dummy.index)
276
322
 
277
- # do this for saving the cpu time, by only calculate r2 once
323
+ # do this for saving the cpu time, only calculate r2 once
278
324
  self.snp_gene_weight_matrix, additional_baseline_annotation_ldscore = (
279
325
  calculate_ldscore_from_multiple_annotation(
280
326
  [self.snp_gene_pair_dummy, additional_baseline_annotation_df],
@@ -283,21 +329,24 @@ class S_LDSC_Boost:
283
329
  ld_wind=self.config.ld_wind,
284
330
  ld_unit=self.config.ld_unit))
285
331
 
332
+ additional_baseline_annotation_ldscore = additional_baseline_annotation_ldscore.loc[self.snp_name]
333
+ # print(additional_baseline_annotation_ldscore.index.to_list()==self.snp_name)
334
+
286
335
  ld_score_file = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather'
287
336
  M_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M'
288
337
  M_5_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50'
289
338
 
290
339
  # save additional baseline annotation ldscore
291
- self.save_ldscore(additional_baseline_annotation_ldscore.values,
292
- column_names=additional_baseline_annotation_ldscore.columns,
293
- save_file_name=ld_score_file,
294
- )
340
+ self.save_ldscore_to_feather(additional_baseline_annotation_ldscore.values,
341
+ column_names=additional_baseline_annotation_ldscore.columns,
342
+ save_file_name=ld_score_file,
343
+ )
295
344
 
296
345
  # caculate the M and save
297
346
  save_dir = Path(M_file_path).parent
298
347
  save_dir.mkdir(parents=True, exist_ok=True)
299
348
  M_chr_chunk = additional_baseline_annotation_df.values.sum(axis=0, keepdims=True)
300
- M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(axis=0,keepdims=True)
349
+ M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(axis=0, keepdims=True)
301
350
  np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
302
351
  np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )
303
352
 
@@ -307,13 +356,27 @@ class S_LDSC_Boost:
307
356
  self.config.bfile_root,
308
357
  ld_wind=self.config.ld_wind,
309
358
  ld_unit=self.config.ld_unit)
359
+ # only keep the snp in keep_snp_root
360
+ if self.keep_snp_mask is not None:
361
+ self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]
362
+
363
+ if self.config.save_pre_calculate_snp_gene_weight_matrix:
364
+ snp_gene_weight_matrix_save_dir = Path(self.config.ldscore_save_dir) / 'snp_gene_weight_matrix'
365
+ snp_gene_weight_matrix_save_dir.mkdir(parents=True, exist_ok=True)
366
+ logger.info(f'Saving snp_gene_weight_matrix for chr{chrom}...')
367
+ self.snp_gene_weight_matrix.reset_index().to_feather(
368
+ snp_gene_weight_matrix_save_dir / f'{chrom}.snp_gene_weight_matrix.feather')
369
+
310
370
  # convert to sparse
311
371
  self.snp_gene_weight_matrix = csr_matrix(self.snp_gene_weight_matrix)
372
+ logger.info(f'Compute snp_gene_weight_matrix finished. shape: {self.snp_gene_weight_matrix.shape}')
312
373
 
313
374
  # calculate baseline ld score
375
+ logger.info(f'Calculating baseline ld score for chr{chrom}...')
314
376
  self.calculate_ldscore_for_base_line(chrom, self.config.sample_name, self.config.ldscore_save_dir)
315
377
 
316
378
  # calculate ld score for annotation
379
+ logger.info(f'Calculating ld score for annotation for chr{chrom}...')
317
380
  self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
318
381
  self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]],
319
382
  chrom,
@@ -323,7 +386,6 @@ class S_LDSC_Boost:
323
386
 
324
387
  def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(self,
325
388
  mk_score_chunk,
326
- save_file_name,
327
389
  drop_dummy_na=True,
328
390
  ):
329
391
 
@@ -332,20 +394,18 @@ class S_LDSC_Boost:
332
394
  else:
333
395
  ldscore_chr_chunk = self.snp_gene_weight_matrix @ mk_score_chunk
334
396
 
335
- self.save_ldscore(ldscore_chr_chunk,
336
- column_names=mk_score_chunk.columns,
337
- save_file_name=save_file_name,
338
- )
397
+ return ldscore_chr_chunk
339
398
 
340
- def save_ldscore(self, ldscore_chr_chunk: np.ndarray, column_names, save_file_name):
399
+ def save_ldscore_to_feather(self, ldscore_chr_chunk: np.ndarray, column_names, save_file_name):
341
400
  save_dir = Path(save_file_name).parent
342
401
  save_dir.mkdir(parents=True, exist_ok=True)
343
402
 
344
403
  ldscore_chr_chunk = ldscore_chr_chunk.astype(np.float16, copy=False)
345
404
  # avoid overflow of float16, if inf, set to max of float16
346
405
  ldscore_chr_chunk[np.isinf(ldscore_chr_chunk)] = np.finfo(np.float16).max
347
- ldscore_chr_chunk = ldscore_chr_chunk if self.config.keep_snp_root is None else ldscore_chr_chunk[
348
- self.keep_snp_mask]
406
+ # ldscore_chr_chunk = ldscore_chr_chunk if self.config.keep_snp_root is None else ldscore_chr_chunk[
407
+ # self.keep_snp_mask]
408
+
349
409
  # save for each chunk
350
410
  df = pd.DataFrame(ldscore_chr_chunk,
351
411
  index=self.snp_name,
@@ -354,6 +414,20 @@ class S_LDSC_Boost:
354
414
  df.index.name = 'SNP'
355
415
  df.reset_index().to_feather(save_file_name)
356
416
 
417
def save_ldscore_chunk_to_zarr(self, ldscore_chr_chunk: np.ndarray,
                               chrom: int, start_col_index,
                               ):
    """
    Write one chromosome's LD-score chunk into the shared zarr array.

    Parameters
    ----------
    ldscore_chr_chunk : np.ndarray
        2-D array of LD scores for ``chrom``; it is written as float16.
    chrom : int
        1-based chromosome number, used to look up the destination rows in
        ``self.chrom_snp_start_point``.
    start_col_index : int
        Index of the first destination column; the chunk fills
        ``ldscore_chr_chunk.shape[1]`` consecutive columns from there.
    """
    ldscore_chr_chunk = ldscore_chr_chunk.astype(np.float16, copy=False)

    # Values outside the float16 range become +/-inf in the cast. Saturate them
    # at the largest finite float16 while keeping their sign, so a negative
    # overflow is not turned into a large positive score.
    overflow_mask = np.isinf(ldscore_chr_chunk)
    ldscore_chr_chunk[overflow_mask] = np.copysign(np.finfo(np.float16).max,
                                                   ldscore_chr_chunk[overflow_mask])

    # Rows of this chromosome inside the genome-wide array.
    chrom_snp_start_point = self.chrom_snp_start_point[chrom - 1]
    chrom_snp_end_point = self.chrom_snp_start_point[chrom]

    self.zarr_file[chrom_snp_start_point:chrom_snp_end_point,
                   start_col_index:start_col_index + ldscore_chr_chunk.shape[1]] = ldscore_chr_chunk
430
+
357
431
  def calculate_M_use_SNP_gene_pair_dummy_by_chunk(self,
358
432
  mk_score_chunk,
359
433
  M_file_path, M_5_file_path,
@@ -377,7 +451,6 @@ class S_LDSC_Boost:
377
451
  np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
378
452
  np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )
379
453
 
380
-
381
454
  def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(self, mk_score_common, chrom, sample_name, save_dir):
382
455
  """
383
456
  Calculate the LD score using the SNP-gene weight matrix.
@@ -393,11 +466,23 @@ class S_LDSC_Boost:
393
466
  M_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M'
394
467
  M_5_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50'
395
468
 
396
- self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
469
+ ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
397
470
  mk_score_chunk,
398
- save_file_name=ld_score_file,
399
471
  drop_dummy_na=True,
400
472
  )
473
+ if self.config.ldscore_save_format == 'feather':
474
+ self.save_ldscore_to_feather(ldscore_chr_chunk,
475
+ column_names=mk_score_chunk.columns,
476
+ save_file_name=ld_score_file,
477
+ )
478
+ elif self.config.ldscore_save_format == 'zarr':
479
+ self.save_ldscore_chunk_to_zarr(ldscore_chr_chunk,
480
+ chrom=chrom,
481
+ start_col_index=i,
482
+ )
483
+ else:
484
+ raise ValueError(f'Invalid ldscore_save_format: {self.config.ldscore_save_format}')
485
+
401
486
  self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
402
487
  mk_score_chunk,
403
488
  M_file,
@@ -417,11 +502,15 @@ class S_LDSC_Boost:
417
502
  M_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M'
418
503
  M_5_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M_5_50'
419
504
 
420
- self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
505
+ ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
421
506
  baseline_mk_score_df,
422
- save_file_name=ld_score_file,
423
507
  drop_dummy_na=False,
424
508
  )
509
+
510
+ self.save_ldscore_to_feather(ldscore_chr_chunk,
511
+ column_names=baseline_mk_score_df.columns,
512
+ save_file_name=ld_score_file,
513
+ )
425
514
  # save baseline M
426
515
  self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
427
516
  baseline_mk_score_df,
@@ -435,6 +524,7 @@ class S_LDSC_Boost:
435
524
  Get the dummy matrix of SNP-gene pairs.
436
525
  """
437
526
  # Load the bim file
527
+ print("Loading bim data")
438
528
  bim, bim_pr = load_bim(self.config.bfile_root, chrom)
439
529
 
440
530
  if self.config.gene_window_enhancer_priority in ['gene_window_first', 'enhancer_first']:
@@ -468,6 +558,12 @@ class S_LDSC_Boost:
468
558
  else:
469
559
  raise ValueError('gtf_pr and enhancer_pr cannot be None at the same time')
470
560
 
561
+ # save the SNP_gene_pair to feather
562
+ SNP_gene_pair_save_path = Path(
563
+ self.config.ldscore_save_dir) / f'SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather'
564
+ SNP_gene_pair_save_path.parent.mkdir(parents=True, exist_ok=True)
565
+ SNP_gene_pair.reset_index().to_feather(SNP_gene_pair_save_path)
566
+
471
567
  # Get the dummy matrix
472
568
  SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair['gene_name'], dummy_na=True)
473
569
  return SNP_gene_pair_dummy
@@ -502,50 +598,21 @@ class S_LDSC_Boost:
502
598
 
503
599
 
504
600
  def run_generate_ldscore(config: GenerateLDScoreConfig):
601
+ if config.ldscore_save_format == 'quick_mode':
602
+ logger.info('Running in quick_mode. Skip the process of generating ldscore. Using the pre-calculated ldscore.')
603
+ ldscore_save_dir = config.ldscore_save_dir
604
+
605
+ # link the baseline annotation
606
+ baseline_annotation_dir = Path(config.baseline_annotation_dir)
607
+ (ldscore_save_dir / 'baseline').symlink_to(baseline_annotation_dir, target_is_directory=True)
608
+
609
+ # link the SNP_gene_pair
610
+ SNP_gene_pair_dir = Path(config.SNP_gene_pair_dir)
611
+ (ldscore_save_dir / 'SNP_gene_pair').symlink_to(SNP_gene_pair_dir, target_is_directory=True)
612
+ return
505
613
  s_ldsc_boost = S_LDSC_Boost(config)
506
614
  if config.chrom == 'all':
507
615
  for chrom in range(1, 23):
508
616
  s_ldsc_boost.process_chromosome(chrom)
509
617
  else:
510
618
  s_ldsc_boost.process_chromosome(config.chrom)
511
-
512
-
513
- # %%
514
- if __name__ == '__main__':
515
- TEST = True
516
- if TEST:
517
- # %%
518
- sample_name = 'Cortex_151507'
519
- chrom = 'all'
520
- save_dir = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/Cortex_151507/snp_annotation/test/0101/sparse'
521
- # %%
522
- gtf_file = '/storage/yangjianLab/songliyang/ReferenceGenome/GRCh37/gencode.v39lift37.annotation.gtf'
523
- mkscore_feather_file = f'/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/{sample_name}/gene_markers/{sample_name}_rank.feather'
524
- bfile_root = '/storage/yangjianLab/sharedata/LDSC_resource/1000G_EUR_Phase3_plink/1000G.EUR.QC'
525
- window_size = 50000
526
- keep_snp_root = '/storage/yangjianLab/sharedata/LDSC_resource/hapmap3_snps/hm'
527
- spots_per_chunk = 10_000
528
- enhancer_annotation = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/resource/epigenome/cleaned_data/by_tissue/BRN/ABC_roadmap_merged.bed'
529
- # %%
530
- config = GenerateLDScoreConfig(
531
- sample_name=sample_name,
532
- chrom=chrom,
533
- ldscore_save_dir=save_dir,
534
- gtf_annotation_file=gtf_file,
535
- mkscore_feather_file=mkscore_feather_file,
536
- bfile_root=bfile_root,
537
- keep_snp_root=keep_snp_root,
538
- gene_window_size=window_size,
539
- spots_per_chunk=spots_per_chunk,
540
- enhancer_annotation_file=enhancer_annotation,
541
- gene_window_enhancer_priority='enhancer_first',
542
- additional_baseline_annotation_dir_path='/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/resource/ldsc/baseline_v1.2/remove_base'
543
- )
544
- # %%
545
- run_generate_ldscore(config)
546
- else:
547
- parser = argparse.ArgumentParser(description="Configuration for the application.")
548
- add_generate_ldscore_args(parser)
549
- args = parser.parse_args()
550
- config = GenerateLDScoreConfig(**vars(args))
551
- run_generate_ldscore(config)
gsMap/latent_to_gene.py CHANGED
@@ -1,8 +1,4 @@
1
- import argparse
2
1
  import logging
3
- import multiprocessing
4
- import pprint
5
- import time
6
2
  from pathlib import Path
7
3
 
8
4
  import numpy as np
@@ -14,14 +10,9 @@ from sklearn.metrics.pairwise import cosine_similarity
14
10
  from sklearn.neighbors import NearestNeighbors
15
11
  from tqdm import tqdm
16
12
 
17
- from gsMap.config import add_latent_to_gene_args, LatentToGeneConfig
13
+ from gsMap.config import LatentToGeneConfig
18
14
 
19
15
  logger = logging.getLogger(__name__)
20
- logger.setLevel(logging.DEBUG)
21
- handler = logging.StreamHandler()
22
- handler.setFormatter(logging.Formatter(
23
- '[{asctime}] {levelname:8s} {filename} {message}', style='{'))
24
- logger.addHandler(handler)
25
16
 
26
17
 
27
18
  def find_Neighbors(coor, num_neighbour):
@@ -49,31 +40,31 @@ def _build_spatial_net(adata, annotation, num_neighbour):
49
40
  """
50
41
  1 Build spatial neighbourhood matrix for each spot (cell) based on the spatial coord
51
42
  """
52
- print(f'------Building spatial graph based on spatial coordinates...')
43
+ logger.info(f'------Building spatial graph based on spatial coordinates...')
53
44
 
54
45
  coor = pd.DataFrame(adata.obsm['spatial'])
55
46
  coor.index = adata.obs.index
56
47
 
57
48
  if not annotation is None:
58
- print(f'Cell annotations are provided...')
49
+ logger.info(f'Cell annotations are provided...')
59
50
  spatial_net = pd.DataFrame()
60
51
  # Cells with annotations
61
52
  for ct in adata.obs[annotation].dropna().unique():
62
53
  coor_temp = coor.loc[adata.obs[annotation] == ct, :]
63
54
  spatial_net_temp = find_Neighbors(coor_temp, min(num_neighbour, coor_temp.shape[0]))
64
55
  spatial_net = pd.concat((spatial_net, spatial_net_temp), axis=0)
65
- print(f'{ct}: {coor_temp.shape[0]} cells')
56
+ logger.info(f'{ct}: {coor_temp.shape[0]} cells')
66
57
 
67
58
  # Cells labeled as nan
68
59
  if pd.isnull(adata.obs[annotation]).any():
69
60
  cell_nan = adata.obs.index[np.where(pd.isnull(adata.obs[annotation]))[0]]
70
- print(f'Nan: {len(cell_nan)} cells')
61
+ logger.info(f'Nan: {len(cell_nan)} cells')
71
62
 
72
63
  spatial_net_temp = find_Neighbors(coor, num_neighbour)
73
64
  spatial_net_temp = spatial_net_temp.loc[spatial_net_temp.Cell1.isin(cell_nan), :]
74
65
  spatial_net = pd.concat((spatial_net, spatial_net_temp), axis=0)
75
66
  else:
76
- print(f'Cell annotations are not provided...')
67
+ logger.info(f'Cell annotations are not provided...')
77
68
  spatial_net = find_Neighbors(coor, num_neighbour)
78
69
 
79
70
  return spatial_net
@@ -117,7 +108,7 @@ def _compute_regional_mkscore(cell_tg, ):
117
108
  # Simultaneously consider the ratio of expression fractions and ranks
118
109
  gene_ranks_region = (gene_ranks_region * frac_region).values
119
110
 
120
- mkscore = np.exp(gene_ranks_region ** 2) - 1
111
+ mkscore = np.exp(gene_ranks_region ** 1.5) - 1
121
112
  return mkscore.astype(np.float16, copy=False)
122
113
 
123
114
 
@@ -125,31 +116,39 @@ def run_latent_to_gene(config: LatentToGeneConfig):
125
116
  global adata, coor_latent, spatial_net, ranks, frac_whole, args, spatial_net_dict, expressed_mask
126
117
  args = config
127
118
  # Load and process the spatial data
128
- print('------Loading the spatial data...')
129
- adata = sc.read_h5ad(config.input_hdf5_with_latent_path)
119
+ logger.info('------Loading the spatial data...')
120
+ adata = sc.read_h5ad(config.hdf5_with_latent_path)
121
+
122
+ logger.info('------Ranking the spatial data...')
123
+ adata.layers['rank'] = rankdata(adata.X.toarray().astype(np.float32), axis=1).astype(np.float32)
124
+
130
125
  if not config.annotation is None:
131
- print(f'------Cell annotations are provided as {config.annotation}...')
126
+ logger.info(f'------Cell annotations are provided as {config.annotation}...')
132
127
  adata = adata[~pd.isnull(adata.obs[config.annotation]), :]
133
- # Homologs transformation
134
- if not config.species is None:
135
- print(f'------Transforming the {config.species} to HUMAN_GENE_SYM...')
136
- homologs = pd.read_csv(config.gs_species, sep='\t')
137
- homologs.index = homologs[config.species]
138
- adata = adata[:, adata.var_names.isin(homologs[config.species])]
139
- print(f'{adata.shape[1]} genes left after homologs transformation.')
140
- adata.var_names = homologs.loc[adata.var_names, 'HUMAN_GENE_SYM']
141
- # Process the data
142
- if config.type == 'count':
143
- adata.X = adata.layers[config.type]
144
- sc.pp.normalize_total(adata, target_sum=1e4)
145
- sc.pp.log1p(adata)
146
- else:
147
- adata.X = adata.layers[config.type]
148
128
 
149
- # Remove cells that do not express any genes after transformation, and genes that are not expressed in any cells.
150
- print(f'Number of cells, genes of the input data: {adata.shape[0]},{adata.shape[1]}')
129
+ # Homologs transformation
130
+ if not config.homolog_file is None:
131
+ logger.info(f'------Transforming the {config.species} to HUMAN_GENE_SYM...')
132
+ homologs = pd.read_csv(config.homolog_file, sep='\t')
133
+ if homologs.shape[1] != 2:
134
+ raise ValueError(
135
+ "Homologs file must have two columns: one for the species and one for the human gene symbol.")
136
+
137
+ homologs.columns = [config.species, 'HUMAN_GENE_SYM']
138
+ homologs.set_index(config.species, inplace=True)
139
+ adata = adata[:, adata.var_names.isin(homologs.index)]
140
+ # Log the number of genes left after homolog transformation
141
+ logger.info(f"{adata.shape[1]} genes retained after homolog transformation.")
142
+ if adata.shape[1] < 100:
143
+ raise ValueError("Too few genes retained in ST data (<100).")
144
+ adata.var_names = homologs.loc[adata.var_names, 'HUMAN_GENE_SYM'].values
145
+ # drop duplicated genes
146
+ adata = adata[:, ~adata.var_names.duplicated()]
147
+
148
+ # Remove cells that do not express any genes after transformation, and genes that are not expressed in any cells.
149
+ logger.info(f'Number of cells, genes of the input data: {adata.shape[0]},{adata.shape[1]}')
151
150
  adata = adata[adata.X.sum(axis=1) > 0, adata.X.sum(axis=0) > 0]
152
- print(f'Number of cells, genes after transformation: {adata.shape[0]},{adata.shape[1]}')
151
+ logger.info(f'Number of cells, genes after transformation: {adata.shape[0]},{adata.shape[1]}')
153
152
  # Buid the spatial graph
154
153
  spatial_net = _build_spatial_net(adata, config.annotation, config.num_neighbour_spatial)
155
154
  spatial_net.set_index('Cell1', inplace=True)
@@ -163,27 +162,31 @@ def run_latent_to_gene(config: LatentToGeneConfig):
163
162
  cell_list = adata.obs.index.tolist()
164
163
 
165
164
  # Load the geometrical mean across slices
166
- if not config.gM_slices is None:
167
- print('Geometrical mean across multiple slices are provided.')
165
+ if config.gM_slices is not None:
166
+ logger.info('Geometrical mean across multiple slices is provided.')
168
167
  gM = pd.read_parquet(config.gM_slices)
169
- # Select the common gene
168
+ if config.species is not None:
169
+ homologs = pd.read_csv(config.homolog_file, sep='\t', header=None)
170
+ if homologs.shape[1] < 2:
171
+ raise ValueError(
172
+ "Homologs file must have at least two columns: one for the species and one for the human gene symbol.")
173
+ homologs.columns = [config.species, 'HUMAN_GENE_SYM']
174
+ homologs.set_index(config.species, inplace=True)
175
+ gM = gM.loc[gM.index.isin(homologs.index)]
176
+ gM.index = homologs.loc[gM.index, 'HUMAN_GENE_SYM'].values
170
177
  common_gene = np.intersect1d(adata.var_names, gM.index)
171
178
  gM = gM.loc[common_gene]
172
- gM = gM['G_Mean'].to_list()
173
- print('------Ranking the spatial data...')
179
+ gM = gM['G_Mean'].to_numpy()
174
180
  adata = adata[:, common_gene]
175
- ranks = np.apply_along_axis(rankdata, 1, adata.X.toarray())
176
181
  else:
177
- print('------Ranking the spatial data...')
178
- ranks = rankdata(adata.X.toarray().astype(np.float32), axis=1).astype(np.float32)
179
- gM = gmean(ranks, axis=0)
182
+ gM = gmean(adata.layers['rank'], axis=0)
180
183
 
181
184
  # Compute the fraction of each gene across cells
182
185
  expressed_mask = pd.DataFrame((adata.X > 0).toarray(), index=adata.obs.index, columns=adata.var.index)
183
- # frac_whole = np.array((adata.X > 0).sum(axis=0))[0] / (adata.shape[0])
186
+ # frac_whole = np.array((adata_layer > 0).sum(axis=0))[0] / (adata.shape[0])
184
187
  frac_whole = np.array(expressed_mask.sum(axis=0)) / (adata.shape[0])
185
188
  # Normalize the geometrical mean
186
- ranks = ranks / gM
189
+ ranks = adata.layers['rank'] / gM
187
190
  ranks = pd.DataFrame(ranks, index=adata.obs_names)
188
191
  ranks.columns = adata.var.index
189
192
  mk_score = [
@@ -192,66 +195,24 @@ def run_latent_to_gene(config: LatentToGeneConfig):
192
195
  desc="Finding markers (Rank-based approach) | cells")
193
196
  ]
194
197
  # Normalize the marker scores
195
- mk_score = pd.DataFrame(np.vstack(mk_score).T, index=adata.var.index, columns=cell_list)
198
+ mk_score = pd.DataFrame(np.vstack(mk_score).T, index=adata.var_names, columns=cell_list)
196
199
  # mk_score_normalized = mk_score.div(mk_score.sum())*1e+2
197
- # Remove the mitochondrial genes
198
- mt_genes = [gene for gene in mk_score.index if gene.startswith('MT-') or gene.startswith('mt-')]
199
- mask = ~mk_score.index.isin(set(mt_genes))
200
- mk_score = mk_score[mask] # Apply the mask to mk_score
201
- print(mk_score.shape)
200
+
201
+ # Remove the mitochondrial genes from mk_score
202
+ mt_gene_mask = ~adata.var_names.str.startswith(('MT-', 'mt-'))
203
+ mk_score = mk_score[mt_gene_mask]
204
+ adata = adata[:, mt_gene_mask]
205
+
206
+ # # Save the mk_score DataFrame to an adata layer
207
+ # adata.layers['mkscore'] = mk_score.values.T
208
+
202
209
  # Save the marker scores
203
- print(f'------Saving marker scores ...')
204
- output_file_path = Path(config.output_feather_path)
210
+ logger.info(f'------Saving marker scores ...')
211
+ output_file_path = Path(config.mkscore_feather_path)
205
212
  output_file_path.parent.mkdir(parents=True, exist_ok=True, mode=0o755)
206
213
  mk_score.reset_index(inplace=True)
207
214
  mk_score.rename(columns={mk_score.columns[0]: 'HUMAN_GENE_SYM'}, inplace=True)
208
215
  mk_score.to_feather(output_file_path)
209
216
 
210
- #%%
211
- if __name__ == '__main__':
212
- parser = argparse.ArgumentParser(description="Process latent to gene data.")
213
- add_latent_to_gene_args(parser)
214
- TEST = True
215
- if TEST:
216
- name = 'Cortex_151507'
217
- test_dir = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021'
218
-
219
- args = parser.parse_args([
220
- '--input_hdf5_with_latent_path', f'{test_dir}/{name}/hdf5/{name}_add_latent.h5ad',
221
- '--sample_name', f'{name}',
222
- '--output_feather_path', f'{test_dir}/{name}/gene_markers/{name}_rank.feather',
223
- '--method', 'rank',
224
- '--latent_representation', 'latent_GVAE',
225
- '--type', 'count',
226
- '--annotation', 'layer_guess',
227
- '--num_neighbour', '51',
228
- # '--no_expression_fraction',
229
-
230
- ])
231
-
232
- # config = LatentToGeneConfig(
233
- # **{'annotation': 'SubClass',
234
- # 'fold': 1.0,
235
- # 'gM_slices': None,
236
- # 'gs_species': '/storage/yangjianLab/songliyang/SpatialData/homologs/macaque_human_homologs.txt',
237
- # 'input_hdf5_with_latent_path': '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/macaque/T121_macaque1/find_latent_representations/T121_macaque1_add_latent.h5ad',
238
- # 'latent_representation': 'latent_GVAE',
239
- # 'method': 'rank',
240
- # 'num_neighbour': 51,
241
- # 'num_neighbour_spatial': 201,
242
- # 'output_feather_path': '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/macaque/T121_macaque1/latent_to_gene/T121_macaque1_gene_marker_score.feather',
243
- # 'pst': 0.2,
244
- # 'sample_name': 'T121_macaque1',
245
- # 'species': 'MACAQUE_GENE_SYM',
246
- # 'type': 'SCT'}
247
- # )
248
- else:
249
- args = parser.parse_args()
250
- config = LatentToGeneConfig(**vars(args))
251
- logger.info(f'Latent to gene for {args.sample_name}...')
252
- pprint.pprint(config)
253
- start_time = time.time()
254
- run_latent_to_gene(config)
255
- end_time = time.time()
256
- logger.info(
257
- f'Latent to gene for {config.sample_name} finished. Time spent: {(end_time - start_time) / 60:.2f} min.')
217
+ # Save the modified adata object to disk
218
+ adata.write(config.hdf5_with_latent_path)