gsMap 1.60__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,257 @@
1
+ import argparse
2
+ import logging
3
+ import multiprocessing
4
+ import pprint
5
+ import time
6
+ from pathlib import Path
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import scanpy as sc
11
+ from scipy.stats import gmean
12
+ from scipy.stats import rankdata
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+ from sklearn.neighbors import NearestNeighbors
15
+ from tqdm import tqdm
16
+
17
+ from gsMap.config import add_latent_to_gene_args, LatentToGeneConfig
18
+
19
# Module-level logger writing to a stream handler with a timestamp / level / file prefix.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(
    '[{asctime}] {levelname:8s} {filename} {message}', style='{'))
# Attach the handler only when the logger has none yet: logging.getLogger() returns the
# same object every time this code runs, so an unconditional addHandler() would emit each
# message once per re-execution (module reload, re-running this cell interactively).
if not logger.handlers:
    logger.addHandler(handler)
25
+
26
+
27
def find_Neighbors(coor, num_neighbour):
    """
    Find the nearest neighbours of each cell based on its spatial coordinates.

    Args:
        coor: DataFrame with one row of coordinates per cell; its index supplies
            the cell identifiers used in the result.
        num_neighbour: Number of neighbours requested from ``NearestNeighbors``
            for every cell.

    Returns:
        DataFrame with columns ``'Cell1'`` (query cell id), ``'Cell2'``
        (neighbour cell id) and ``'Distance'``. Rows are grouped by query cell in
        the order of ``coor``; within a group they follow the order returned by
        ``kneighbors`` and are indexed ``0 .. k-1``.
    """
    nbrs = NearestNeighbors(n_neighbors=num_neighbour).fit(coor)
    distances, indices = nbrs.kneighbors(coor, return_distance=True)

    # Build the long-format table in one shot instead of creating one small
    # DataFrame per cell and concatenating them.
    n_cells, n_found = indices.shape
    cell_ids = np.array(coor.index)
    spatial_net = pd.DataFrame(
        {
            # Each query cell id repeated once per neighbour found for it.
            'Cell1': np.repeat(cell_ids, n_found),
            # Positional neighbour indices translated to cell ids (row-major order
            # keeps every cell's neighbours contiguous, matching 'Cell1').
            'Cell2': cell_ids[indices.ravel()],
            'Distance': distances.ravel(),
        },
        # Same per-cell 0..k-1 index the per-cell concatenation used to produce.
        index=np.tile(np.arange(n_found), n_cells),
    )

    return spatial_net
46
+
47
+
48
def _build_spatial_net(adata, annotation, num_neighbour):
    """
    Build the spatial neighbourhood table for each spot (cell) from the spatial coordinates.

    When ``annotation`` is given, neighbours are searched separately inside every
    annotation category, so a labelled cell is only paired with cells carrying
    the same label. Cells whose annotation is missing are paired with their
    nearest neighbours among all cells.

    Args:
        adata: Object providing ``obsm['spatial']`` (coordinates) and ``obs``
            (per-cell table indexed by cell id).
        annotation: Name of the ``adata.obs`` column holding the cell annotation,
            or ``None`` to ignore annotations.
        num_neighbour: Number of spatial neighbours requested per cell; capped at
            the group size for annotation groups smaller than this.

    Returns:
        The concatenated output of ``find_Neighbors`` (columns ``'Cell1'``,
        ``'Cell2'``, ``'Distance'``); an empty DataFrame if there is nothing to
        concatenate.
    """
    print('------Building spatial graph based on spatial coordinates...')

    coor = pd.DataFrame(adata.obsm['spatial'])
    coor.index = adata.obs.index

    if annotation is None:
        print('Cell annotations are not provided...')
        return find_Neighbors(coor, num_neighbour)

    print('Cell annotations are provided...')
    labels = adata.obs[annotation]

    # Gather the per-group tables and concatenate once at the end: concatenating
    # inside the loop copies the accumulated frame on every iteration, and seeding
    # it with an empty DataFrame relies on deprecated dtype handling in pandas.
    nets = []

    # Cells with annotations
    for ct in labels.dropna().unique():
        coor_temp = coor.loc[labels == ct, :]
        # A group cannot supply more neighbours than it has cells.
        nets.append(find_Neighbors(coor_temp, min(num_neighbour, coor_temp.shape[0])))
        print(f'{ct}: {coor_temp.shape[0]} cells')

    # Cells labeled as nan
    missing = pd.isnull(labels)
    if missing.any():
        cell_nan = adata.obs.index[np.where(missing)[0]]
        print(f'Nan: {len(cell_nan)} cells')

        # Search among all cells, then keep only the rows whose query cell is unlabelled.
        spatial_net_temp = find_Neighbors(coor, num_neighbour)
        nets.append(spatial_net_temp.loc[spatial_net_temp.Cell1.isin(cell_nan), :])

    if not nets:
        return pd.DataFrame()
    return pd.concat(nets, axis=0)
80
+
81
+
82
def find_Neighbors_Regional(cell):
    """
    Pick the spatial neighbours of ``cell`` that are most similar to it in latent space.

    Reads the module-level ``spatial_net_dict``, ``coor_latent``, ``adata`` and
    ``args`` that ``run_latent_to_gene`` assigns before calling this function.

    Args:
        cell: Identifier of the focal cell (a key of ``spatial_net_dict``).

    Returns:
        List of at most ``args.num_neighbour`` cell ids taken from the focal
        cell's spatial neighbours, ordered by decreasing cosine similarity of
        their latent representation to the focal cell's. When
        ``args.annotation`` is set, only neighbours sharing the focal cell's
        annotation are considered.
    """
    cell_use = spatial_net_dict[cell]
    similarity = cosine_similarity(coor_latent.loc[cell].values.reshape(1, -1),
                                   coor_latent.loc[cell_use].values).reshape(-1)
    if args.annotation is not None:
        annotation = adata.obs[args.annotation]
        df = pd.DataFrame({'Cell2': cell_use, 'Similarity': similarity, 'Annotation': annotation[cell_use]})
        # Look the focal label up in the annotation column itself rather than in the
        # neighbour frame, which raises KeyError if the focal cell is missing from its
        # own neighbour list (e.g. ties between cells at identical coordinates).
        df = df.loc[df['Annotation'] == annotation.loc[cell]]
    else:
        df = pd.DataFrame({'Cell2': cell_use, 'Similarity': similarity})

    df = df.sort_values(by='Similarity', ascending=False)
    # Explicit positional slice: keep the top-ranked rows regardless of index labels.
    cell_select = df.Cell2.iloc[:args.num_neighbour].to_list()

    return cell_select
97
+
98
+
99
def _compute_regional_mkscore(cell_tg):
    """
    Compute the marker score of every gene for the region around one cell.

    The region is the list returned by ``find_Neighbors_Regional(cell_tg)``. The
    function reads the module-level ``ranks``, ``expressed_mask``, ``frac_whole``
    and ``args`` assigned by ``run_latent_to_gene``.

    Returns:
        float16 array with one score per gene: ``exp(s ** 2) - 1``, where ``s`` is
        the regional geometric mean of ``ranks`` (zeroed where it is <= 1) and,
        unless ``args.no_expression_fraction`` is set, further zeroed for genes
        whose regional expression fraction does not exceed ``frac_whole``.
    """
    region_cells = find_Neighbors_Regional(cell_tg)

    # Geometric mean of the (normalised) ranks over the region, gene by gene;
    # values not above 1 carry no signal and are dropped to 0.
    region_score = gmean(ranks.loc[region_cells], axis=0)
    region_score[region_score <= 1] = 0

    use_fraction = not args.no_expression_fraction
    if use_fraction:
        # Fraction of region cells expressing each gene, relative to the
        # fraction over all cells, reduced to a 0/1 indicator (> 1 -> 1).
        n_region = len(region_cells)
        region_fraction = expressed_mask.loc[region_cells].sum(0) / n_region
        enrichment = region_fraction / frac_whole
        enrichment[enrichment <= 1] = 0
        enrichment[enrichment > 1] = 1

        # Keep the rank score only for genes passing the fraction test.
        region_score = (region_score * enrichment).values

    marker_score = np.exp(region_score ** 2) - 1
    return marker_score.astype(np.float16, copy=False)
122
+
123
+
124
def run_latent_to_gene(config: LatentToGeneConfig):
    """
    Compute gene marker scores for every cell and save them as a feather file.

    Pipeline, in order:
      1. read the h5ad file at ``config.input_hdf5_with_latent_path``;
      2. drop cells without annotation (when ``config.annotation`` is set);
      3. rename genes to ``HUMAN_GENE_SYM`` via the homolog table
         ``config.gs_species`` (when ``config.species`` is set);
      4. load ``adata.layers[config.type]`` into ``X`` (normalised and
         log-transformed when the type is ``'count'``) and drop all-zero cells
         and genes;
      5. build the spatial neighbour table and the cell -> neighbours dictionary;
      6. rank genes within each cell and divide by the per-gene geometric mean
         (taken from ``config.gM_slices`` when provided, otherwise computed here);
      7. score every cell with ``_compute_regional_mkscore``, drop genes whose
         name starts with ``MT-``/``mt-`` and write the genes x cells table to
         ``config.output_feather_path``.

    Side effects: assigns the module globals ``adata``, ``coor_latent``,
    ``spatial_net``, ``ranks``, ``frac_whole``, ``args``, ``spatial_net_dict`` and
    ``expressed_mask`` (read by the per-cell helpers), prints progress to stdout
    and creates the output directory if needed.
    """
    global adata, coor_latent, spatial_net, ranks, frac_whole, args, spatial_net_dict, expressed_mask
    args = config
    # Load and process the spatial data
    print('------Loading the spatial data...')
    adata = sc.read_h5ad(config.input_hdf5_with_latent_path)
    if config.annotation is not None:
        print(f'------Cell annotations are provided as {config.annotation}...')
        adata = adata[~pd.isnull(adata.obs[config.annotation]), :]
    # Homologs transformation
    if config.species is not None:
        print(f'------Transforming the {config.species} to HUMAN_GENE_SYM...')
        homologs = pd.read_csv(config.gs_species, sep='\t')
        homologs.index = homologs[config.species]
        adata = adata[:, adata.var_names.isin(homologs[config.species])]
        print(f'{adata.shape[1]} genes left after homologs transformation.')
        adata.var_names = homologs.loc[adata.var_names, 'HUMAN_GENE_SYM']
    # Process the data: the requested layer becomes X; raw counts are additionally
    # normalised per cell and log-transformed.
    adata.X = adata.layers[config.type]
    if config.type == 'count':
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

    # Remove cells that do not express any genes after transformation, and genes that are not expressed in any cells.
    print(f'Number of cells, genes of the input data: {adata.shape[0]},{adata.shape[1]}')
    adata = adata[adata.X.sum(axis=1) > 0, adata.X.sum(axis=0) > 0]
    print(f'Number of cells, genes after transformation: {adata.shape[0]},{adata.shape[1]}')
    # Build the spatial graph
    spatial_net = _build_spatial_net(adata, config.annotation, config.num_neighbour_spatial)
    spatial_net.set_index('Cell1', inplace=True)
    # convert the spatial graph to a dictionary cell1 to cells in the neighbourhood
    spatial_net_dict = spatial_net.groupby(spatial_net.index).Cell2.apply(list).to_dict()

    # Extract the latent representation
    coor_latent = pd.DataFrame(adata.obsm[config.latent_representation])
    coor_latent.index = adata.obs.index
    # Find marker genes
    cell_list = adata.obs.index.tolist()

    # Load the geometrical mean across slices
    if config.gM_slices is not None:
        print('Geometrical mean across multiple slices are provided.')
        gM = pd.read_parquet(config.gM_slices)
        # Select the common gene
        common_gene = np.intersect1d(adata.var_names, gM.index)
        gM = gM.loc[common_gene]
        gM = gM['G_Mean'].to_list()
        print('------Ranking the spatial data...')
        adata = adata[:, common_gene]
        # Rank genes within each cell (row-wise), as the other branch does.
        ranks = rankdata(adata.X.toarray(), axis=1)
    else:
        print('------Ranking the spatial data...')
        ranks = rankdata(adata.X.toarray().astype(np.float32), axis=1).astype(np.float32)
        gM = gmean(ranks, axis=0)

    # Compute the fraction of each gene across cells
    expressed_mask = pd.DataFrame((adata.X > 0).toarray(), index=adata.obs.index, columns=adata.var.index)
    frac_whole = np.array(expressed_mask.sum(axis=0)) / (adata.shape[0])
    # Normalize the geometrical mean
    ranks = ranks / gM
    ranks = pd.DataFrame(ranks, index=adata.obs_names)
    ranks.columns = adata.var.index
    mk_score = [
        _compute_regional_mkscore(cell_tg)
        for cell_tg in tqdm(cell_list,
                            desc="Finding markers (Rank-based approach) | cells")
    ]
    # Assemble the per-cell score vectors into a genes x cells table
    mk_score = pd.DataFrame(np.vstack(mk_score).T, index=adata.var.index, columns=cell_list)
    # Remove the mitochondrial genes
    mt_genes = {gene for gene in mk_score.index if gene.startswith(('MT-', 'mt-'))}
    mk_score = mk_score[~mk_score.index.isin(mt_genes)]
    print(mk_score.shape)
    # Save the marker scores
    print('------Saving marker scores ...')
    output_file_path = Path(config.output_feather_path)
    output_file_path.parent.mkdir(parents=True, exist_ok=True, mode=0o755)
    # Feather stores no index: move the gene names into a regular first column.
    mk_score.reset_index(inplace=True)
    mk_score.rename(columns={mk_score.columns[0]: 'HUMAN_GENE_SYM'}, inplace=True)
    mk_score.to_feather(output_file_path)
209
+
210
+ #%%
211
# Script entry point: parse the latent-to-gene options, build the config and run the step.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Process latent to gene data.")
    add_latent_to_gene_args(parser)
    # Developer switch: when True the command line is ignored and the hard-coded
    # arguments below (paths on the development machine) are used instead. It must
    # stay False in released code so that the real command-line arguments are honoured.
    TEST = False
    if TEST:
        name = 'Cortex_151507'
        test_dir = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021'

        args = parser.parse_args([
            '--input_hdf5_with_latent_path', f'{test_dir}/{name}/hdf5/{name}_add_latent.h5ad',
            '--sample_name', f'{name}',
            '--output_feather_path', f'{test_dir}/{name}/gene_markers/{name}_rank.feather',
            '--method', 'rank',
            '--latent_representation', 'latent_GVAE',
            '--type', 'count',
            '--annotation', 'layer_guess',
            '--num_neighbour', '51',
            # '--no_expression_fraction',
        ])
    else:
        args = parser.parse_args()
    config = LatentToGeneConfig(**vars(args))
    logger.info(f'Latent to gene for {args.sample_name}...')
    pprint.pprint(config)
    start_time = time.time()
    run_latent_to_gene(config)
    end_time = time.time()
    logger.info(
        f'Latent to gene for {config.sample_name} finished. Time spent: {(end_time - start_time) / 60:.2f} min.')
gsMap/main.py ADDED
@@ -0,0 +1,39 @@
1
+ from gsMap import (__version__)
2
+ from gsMap.config import *
3
+
4
# Module-level logger writing to a stream handler with a timestamp / level / file prefix.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(
    '[{asctime}] {levelname:8s} {filename} {message}', style='{'))
# Attach the handler only when the logger has none yet: logging.getLogger() returns the
# same object every time this code runs, so an unconditional addHandler() would emit each
# message once per re-execution (e.g. after a module reload).
if not logger.handlers:
    logger.addHandler(handler)
10
+
11
+
12
def main():
    """
    Command-line entry point: parse the arguments and run the chosen subcommand.

    Prints the help text and exits with status 1 when no subcommand is given;
    otherwise calls the function registered for the subcommand with the parsed
    arguments.

    Raises:
        SystemExit: with status 1 when no subcommand is supplied (argparse itself
            also exits on ``--help``, ``--version`` or invalid arguments).
    """
    parser = create_parser()
    args = parser.parse_args()
    if args.subcommand is None:
        parser.print_help()
        # The bare `exit` builtin is injected by the `site` module and is missing
        # under `python -S` or in frozen builds; parser.exit() raises the same
        # SystemExit without depending on it.
        parser.exit(status=1)
    args.func(args)
21
+
22
def create_parser():
    """
    Build the top-level ``gsMap`` argument parser.

    Adds a ``--version``/``-v`` flag and one subparser per entry of
    ``cli_function_registry``. Each subparser gets its options from the entry's
    ``add_args_function`` and stores the entry's ``func`` as the ``func`` default,
    so the caller can dispatch with ``args.func(args)``. The selected subcommand
    name is stored in ``args.subcommand``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    root_parser = argparse.ArgumentParser(
        description=" gsMap: Genetics-informed pathogenic spatial mapping",
        formatter_class=argparse.RawTextHelpFormatter,
        prog='gsMap',
    )
    root_parser.add_argument(
        '--version', '-v',
        action='version',
        version=f'gsMap version {__version__}',
    )

    subcommand_group = root_parser.add_subparsers(
        dest="subcommand",
        help="Subcommands",
        title="Available subcommands",
    )
    for entry in cli_function_registry.values():
        entry_parser = subcommand_group.add_parser(
            entry.name,
            help=entry.description,
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        )
        # Let the subcommand declare its own options, then remember its handler.
        entry.add_args_function(entry_parser)
        entry_parser.set_defaults(func=entry.func)

    return root_parser
36
+
37
+
38
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()