gsMap 1.60__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN_VAE/__init__.py +0 -0
- gsMap/GNN_VAE/adjacency_matrix.py +95 -0
- gsMap/GNN_VAE/model.py +87 -0
- gsMap/GNN_VAE/train.py +97 -0
- gsMap/__init__.py +5 -0
- gsMap/__main__.py +3 -0
- gsMap/cauchy_combination_test.py +163 -0
- gsMap/config.py +734 -0
- gsMap/find_latent_representation.py +209 -0
- gsMap/format_sumstats.py +410 -0
- gsMap/generate_ldscore.py +551 -0
- gsMap/generate_r2_matrix.py +743 -0
- gsMap/jackknife.py +514 -0
- gsMap/latent_to_gene.py +257 -0
- gsMap/main.py +39 -0
- gsMap/make_annotations.py +560 -0
- gsMap/regression_read.py +294 -0
- gsMap/spatial_ldsc_multiple_sumstats.py +307 -0
- gsMap/visualize.py +154 -0
- gsmap-1.60.dist-info/LICENSE +21 -0
- gsmap-1.60.dist-info/METADATA +124 -0
- gsmap-1.60.dist-info/RECORD +24 -0
- gsmap-1.60.dist-info/WHEEL +4 -0
- gsmap-1.60.dist-info/entry_points.txt +3 -0
gsMap/latent_to_gene.py
ADDED
@@ -0,0 +1,257 @@
|
|
1
|
+
import argparse
|
2
|
+
import logging
|
3
|
+
import multiprocessing
|
4
|
+
import pprint
|
5
|
+
import time
|
6
|
+
from pathlib import Path
|
7
|
+
|
8
|
+
import numpy as np
|
9
|
+
import pandas as pd
|
10
|
+
import scanpy as sc
|
11
|
+
from scipy.stats import gmean
|
12
|
+
from scipy.stats import rankdata
|
13
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
14
|
+
from sklearn.neighbors import NearestNeighbors
|
15
|
+
from tqdm import tqdm
|
16
|
+
|
17
|
+
from gsMap.config import add_latent_to_gene_args, LatentToGeneConfig
|
18
|
+
|
19
|
+
# Module-level logger: DEBUG threshold, a single stream handler, and a
# '{'-style format carrying timestamp, padded level name, file name and message.
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter(fmt='[{asctime}] {levelname:8s} {filename} {message}', style='{')
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)
|
25
|
+
|
26
|
+
|
27
|
+
def find_Neighbors(coor, num_neighbour):
    """
    Find the nearest neighbours of each cell (based on spatial coordinates).

    Parameters
    ----------
    coor : pandas.DataFrame
        One row of coordinates per cell; the index holds the cell identifiers.
    num_neighbour : int
        Number of neighbours requested from ``NearestNeighbors`` for every cell.

    Returns
    -------
    pandas.DataFrame
        Long-format table with columns ``Cell1`` (query cell), ``Cell2``
        (neighbour) and ``Distance``, holding ``num_neighbour`` consecutive rows
        per query cell in the order returned by ``kneighbors``.
    """
    nbrs = NearestNeighbors(n_neighbors=num_neighbour).fit(coor)
    # The fitted points are queried against themselves.
    # NOTE(review): a cell is therefore normally its own first neighbour - confirm
    # for data sets that contain duplicated coordinates.
    distances, indices = nbrs.kneighbors(coor, return_distance=True)

    n_cells, n_found = indices.shape
    cell_ids = np.array(coor.index)

    # Build the whole table at once rather than one small DataFrame per cell.
    # The index repeats 0..n_found-1 for every cell, which is what concatenating
    # the per-cell frames used to produce.
    spatial_net = pd.DataFrame(
        {
            'Cell1': np.repeat(cell_ids, n_found),
            'Cell2': cell_ids[indices.ravel()],
            'Distance': distances.ravel(),
        },
        index=np.tile(np.arange(n_found, dtype=np.int64), n_cells),
    )

    return spatial_net
|
46
|
+
|
47
|
+
|
48
|
+
def _build_spatial_net(adata, annotation, num_neighbour):
    """
    Build the spatial neighbourhood table for each spot (cell) from the spatial coordinates.

    Parameters
    ----------
    adata
        Object exposing ``obsm['spatial']`` (coordinates) and ``obs`` (cell table).
    annotation : str or None
        Name of the ``adata.obs`` column holding cell annotations. When given,
        neighbours of an annotated cell are searched only among cells with the
        same label; cells whose label is missing are searched against all cells.
    num_neighbour : int
        Number of neighbours per cell (capped at the group size for annotated groups).

    Returns
    -------
    pandas.DataFrame
        Concatenated output of ``find_Neighbors`` (columns ``Cell1``, ``Cell2``,
        ``Distance``); an empty DataFrame if there is nothing to concatenate.
    """
    print('------Building spatial graph based on spatial coordinates...')

    coor = pd.DataFrame(adata.obsm['spatial'])
    coor.index = adata.obs.index

    if annotation is None:
        print('Cell annotations are not provided...')
        return find_Neighbors(coor, num_neighbour)

    print('Cell annotations are provided...')
    labels = adata.obs[annotation]

    # Collect the pieces and concatenate once: growing a DataFrame with
    # pd.concat inside the loop re-copies all previous rows on every pass.
    nets = []

    # Cells with annotations
    for ct in labels.dropna().unique():
        coor_temp = coor.loc[labels == ct, :]
        nets.append(find_Neighbors(coor_temp, min(num_neighbour, coor_temp.shape[0])))
        print(f'{ct}: {coor_temp.shape[0]} cells')

    # Cells labeled as nan
    is_missing = pd.isnull(labels)
    if is_missing.any():
        cell_nan = adata.obs.index[np.where(is_missing)[0]]
        print(f'Nan: {len(cell_nan)} cells')

        net_all_cells = find_Neighbors(coor, num_neighbour)
        nets.append(net_all_cells.loc[net_all_cells.Cell1.isin(cell_nan), :])

    if not nets:
        # Nothing to concatenate: keep returning an empty frame as before.
        return pd.DataFrame()

    return pd.concat(nets, axis=0)
|
80
|
+
|
81
|
+
|
82
|
+
def find_Neighbors_Regional(cell):
    """
    Select, among the spatial neighbours of ``cell``, those closest in latent space.

    Reads the module-level globals ``spatial_net_dict``, ``coor_latent``, ``args``
    and ``adata`` that ``run_latent_to_gene`` assigns before calling this function.

    Parameters
    ----------
    cell
        Identifier of the focal cell (a key of ``spatial_net_dict``).

    Returns
    -------
    list
        Up to ``args.num_neighbour`` neighbour identifiers, ordered by decreasing
        cosine similarity to the focal cell. When ``args.annotation`` is set,
        only neighbours sharing the focal cell's annotation are kept.
    """
    cell_use = spatial_net_dict[cell]
    similarity = cosine_similarity(coor_latent.loc[cell].values.reshape(1, -1),
                                   coor_latent.loc[cell_use].values).reshape(-1)
    if args.annotation is not None:
        annotation = adata.obs[args.annotation]
        df = pd.DataFrame({'Cell2': cell_use, 'Similarity': similarity, 'Annotation': annotation[cell_use]})
        # Read the focal label from the full annotation column: looking it up in
        # `df` raises KeyError whenever the focal cell is missing from its own
        # spatial neighbour list.
        focal_label = annotation.loc[cell]
        df = df.loc[(df['Annotation'] == focal_label).to_numpy()]
    else:
        df = pd.DataFrame({'Cell2': cell_use, 'Similarity': similarity})

    df = df.sort_values(by='Similarity', ascending=False)
    # Explicit positional slice: the top `num_neighbour` rows after sorting.
    cell_select = df['Cell2'].iloc[:args.num_neighbour].to_list()

    return cell_select
|
97
|
+
|
98
|
+
|
99
|
+
def _compute_regional_mkscore(cell_tg):
    """
    Compute the marker score of every gene for the region around ``cell_tg``.

    The region is the list returned by ``find_Neighbors_Regional``. The score is
    the geometric mean of the (already normalised) global ``ranks`` over that
    region, zeroed where it does not exceed 1. Unless
    ``args.no_expression_fraction`` is set, it is also multiplied by a 0/1
    indicator of whether the regional expression fraction exceeds the global one
    (``frac_whole``). The result is ``exp(score ** 2) - 1`` cast to float16.
    """
    region_cells = find_Neighbors_Regional(cell_tg)

    # Geometric mean of the rank ratios; values not above 1 carry no signal.
    region_score = gmean(ranks.loc[region_cells], axis=0)
    region_score[region_score <= 1] = 0

    if not args.no_expression_fraction:
        # Expression fraction within the region relative to the whole data set,
        # collapsed in place to an indicator (1 if enriched, 0 otherwise).
        frac_region = expressed_mask.loc[region_cells].sum(0) / len(region_cells) / frac_whole
        frac_region[frac_region <= 1] = 0
        frac_region[frac_region > 1] = 1

        # Combine the rank ratio with the expression-fraction indicator.
        region_score = (region_score * frac_region).values

    return (np.exp(region_score ** 2) - 1).astype(np.float16, copy=False)
|
122
|
+
|
123
|
+
|
124
|
+
def run_latent_to_gene(config: LatentToGeneConfig):
    """
    Compute per-cell gene marker scores and write them to a feather file.

    Steps, in order: read the h5ad file, optionally drop cells without
    annotation, optionally rename genes through a homolog table, select the
    expression layer (normalising and log-transforming it for ``'count'``), drop
    empty cells and genes, build the spatial neighbour dictionary, rank genes
    within each cell, normalise ranks by a per-gene geometric mean, score every
    cell with ``_compute_regional_mkscore``, remove ``MT-``/``mt-`` genes and
    save the table to ``config.output_feather_path``.

    The per-cell helpers read their inputs from module-level globals, which this
    function assigns.
    """
    global adata, coor_latent, spatial_net, ranks, frac_whole, args, spatial_net_dict, expressed_mask
    args = config

    # ---- Load the spatial data -------------------------------------------
    print('------Loading the spatial data...')
    adata = sc.read_h5ad(config.input_hdf5_with_latent_path)
    if config.annotation is not None:
        print(f'------Cell annotations are provided as {config.annotation}...')
        has_label = ~pd.isnull(adata.obs[config.annotation])
        adata = adata[has_label, :]

    # ---- Homolog transformation ------------------------------------------
    if config.species is not None:
        print(f'------Transforming the {config.species} to HUMAN_GENE_SYM...')
        homolog_table = pd.read_csv(config.gs_species, sep='\t')
        homolog_table.index = homolog_table[config.species]
        adata = adata[:, adata.var_names.isin(homolog_table[config.species])]
        print(f'{adata.shape[1]} genes left after homologs transformation.')
        adata.var_names = homolog_table.loc[adata.var_names, 'HUMAN_GENE_SYM']

    # ---- Select the expression layer -------------------------------------
    # Every type reads its layer into X; only raw counts are normalised.
    adata.X = adata.layers[config.type]
    if config.type == 'count':
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

    # Remove cells that express no gene and genes expressed in no cell.
    print(f'Number of cells, genes of the input data: {adata.shape[0]},{adata.shape[1]}')
    adata = adata[adata.X.sum(axis=1) > 0, adata.X.sum(axis=0) > 0]
    print(f'Number of cells, genes after transformation: {adata.shape[0]},{adata.shape[1]}')

    # ---- Spatial graph ----------------------------------------------------
    spatial_net = _build_spatial_net(adata, config.annotation, config.num_neighbour_spatial)
    spatial_net = spatial_net.set_index('Cell1')
    # Map each cell to the list of cells in its spatial neighbourhood.
    spatial_net_dict = spatial_net.groupby(spatial_net.index).Cell2.apply(list).to_dict()

    # ---- Latent representation -------------------------------------------
    coor_latent = pd.DataFrame(adata.obsm[config.latent_representation])
    coor_latent.index = adata.obs.index

    cells = adata.obs.index.tolist()

    # ---- Rank genes within each cell -------------------------------------
    if config.gM_slices is not None:
        print('Geometrical mean across multiple slices are provided.')
        slice_gmean = pd.read_parquet(config.gM_slices)
        # Keep only the genes present in both the data and the supplied table.
        shared_genes = np.intersect1d(adata.var_names, slice_gmean.index)
        gene_gmean = slice_gmean.loc[shared_genes, 'G_Mean'].to_list()
        print('------Ranking the spatial data...')
        adata = adata[:, shared_genes]
        ranks = np.apply_along_axis(rankdata, 1, adata.X.toarray())
    else:
        print('------Ranking the spatial data...')
        ranks = rankdata(adata.X.toarray().astype(np.float32), axis=1).astype(np.float32)
        gene_gmean = gmean(ranks, axis=0)

    # ---- Fraction of cells expressing each gene --------------------------
    expressed_mask = pd.DataFrame((adata.X > 0).toarray(), index=adata.obs.index, columns=adata.var.index)
    frac_whole = np.array(expressed_mask.sum(axis=0)) / (adata.shape[0])

    # ---- Normalise ranks by the per-gene geometric mean ------------------
    ranks = pd.DataFrame(ranks / gene_gmean, index=adata.obs_names, columns=adata.var.index)

    # ---- Score every cell --------------------------------------------------
    per_cell_scores = [
        _compute_regional_mkscore(cell_tg)
        for cell_tg in tqdm(cells, desc="Finding markers (Rank-based approach) | cells")
    ]
    score_table = pd.DataFrame(np.vstack(per_cell_scores).T, index=adata.var.index, columns=cells)

    # ---- Remove the mitochondrial genes ----------------------------------
    mito_genes = {gene for gene in score_table.index if gene.startswith(('MT-', 'mt-'))}
    score_table = score_table[~score_table.index.isin(mito_genes)]
    print(score_table.shape)

    # ---- Save the marker scores ------------------------------------------
    print('------Saving marker scores ...')
    destination = Path(config.output_feather_path)
    destination.parent.mkdir(parents=True, exist_ok=True, mode=0o755)
    score_table = score_table.reset_index()
    score_table = score_table.rename(columns={score_table.columns[0]: 'HUMAN_GENE_SYM'})
    score_table.to_feather(destination)
|
209
|
+
|
210
|
+
#%%
|
211
|
+
if __name__ == '__main__':
    # Script entry point: parse the latent-to-gene arguments, build the config
    # and run the step, logging the elapsed time.
    parser = argparse.ArgumentParser(description="Process latent to gene data.")
    add_latent_to_gene_args(parser)

    # Developer smoke-test switch. It must be False in released code: when True,
    # the real command line is ignored and the hard-coded, machine-specific
    # arguments below are used instead.
    TEST = False
    if TEST:
        name = 'Cortex_151507'
        test_dir = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021'

        args = parser.parse_args([
            '--input_hdf5_with_latent_path', f'{test_dir}/{name}/hdf5/{name}_add_latent.h5ad',
            '--sample_name', f'{name}',
            '--output_feather_path', f'{test_dir}/{name}/gene_markers/{name}_rank.feather',
            '--method', 'rank',
            '--latent_representation', 'latent_GVAE',
            '--type', 'count',
            '--annotation', 'layer_guess',
            '--num_neighbour', '51',
            # '--no_expression_fraction',
        ])
    else:
        args = parser.parse_args()

    config = LatentToGeneConfig(**vars(args))
    logger.info(f'Latent to gene for {args.sample_name}...')
    pprint.pprint(config)
    start_time = time.time()
    run_latent_to_gene(config)
    end_time = time.time()
    logger.info(
        f'Latent to gene for {config.sample_name} finished. Time spent: {(end_time - start_time) / 60:.2f} min.')
|
gsMap/main.py
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
from gsMap import (__version__)
|
2
|
+
from gsMap.config import *
|
3
|
+
|
4
|
+
# Module-level logger: DEBUG threshold, a single stream handler, and a
# '{'-style format carrying timestamp, padded level name, file name and message.
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter(fmt='[{asctime}] {levelname:8s} {filename} {message}', style='{')
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)
|
10
|
+
|
11
|
+
|
12
|
+
def main():
    """
    Command-line entry point.

    Parses the arguments with the parser from ``create_parser`` and calls the
    function attached to the chosen subcommand, passing it the parsed namespace.
    If no subcommand is given, prints the help text and exits with status 1.
    """
    parser = create_parser()
    args = parser.parse_args()
    if args.subcommand is None:
        parser.print_help()
        # Use the parser's own exit: the bare `exit()` builtin is injected by the
        # `site` module and is missing when Python runs without it.
        parser.exit(status=1)
    args.func(args)
|
21
|
+
|
22
|
+
def create_parser():
    """
    Build the top-level ``gsMap`` argument parser.

    Adds a ``--version``/``-v`` flag and one sub-parser per entry of
    ``cli_function_registry``. Each entry adds its own arguments, and its
    function is stored as the ``func`` default of its sub-parser.
    """
    top_parser = argparse.ArgumentParser(
        prog='gsMap',
        description=" gsMap: Genetics-informed pathogenic spatial mapping",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    top_parser.add_argument('--version', '-v', action='version', version=f'gsMap version {__version__}')

    command_parsers = top_parser.add_subparsers(
        title="Available subcommands",
        dest="subcommand",
        help="Subcommands",
    )
    for entry in cli_function_registry.values():
        entry_parser = command_parsers.add_parser(
            entry.name,
            help=entry.description,
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        )
        # Let the subcommand declare its own options, then remember its handler.
        entry.add_args_function(entry_parser)
        entry_parser.set_defaults(func=entry.func)
    return top_parser
|
36
|
+
|
37
|
+
|
38
|
+
# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()
|