gsMap 1.65__py3-none-any.whl → 1.66__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/__init__.py +1 -1
- gsMap/latent_to_gene.py +129 -106
- {gsmap-1.65.dist-info → gsmap-1.66.dist-info}/METADATA +1 -1
- {gsmap-1.65.dist-info → gsmap-1.66.dist-info}/RECORD +7 -7
- {gsmap-1.65.dist-info → gsmap-1.66.dist-info}/LICENSE +0 -0
- {gsmap-1.65.dist-info → gsmap-1.66.dist-info}/WHEEL +0 -0
- {gsmap-1.65.dist-info → gsmap-1.66.dist-info}/entry_points.txt +0 -0
gsMap/__init__.py
CHANGED
gsMap/latent_to_gene.py
CHANGED
@@ -4,10 +4,12 @@ from pathlib import Path
|
|
4
4
|
import numpy as np
|
5
5
|
import pandas as pd
|
6
6
|
import scanpy as sc
|
7
|
+
from scipy.sparse import csr_matrix
|
7
8
|
from scipy.stats import gmean
|
8
9
|
from scipy.stats import rankdata
|
9
10
|
from sklearn.metrics.pairwise import cosine_similarity
|
10
11
|
from sklearn.neighbors import NearestNeighbors
|
12
|
+
from joblib import Parallel, delayed
|
11
13
|
from tqdm import tqdm
|
12
14
|
|
13
15
|
from gsMap.config import LatentToGeneConfig
|
@@ -15,119 +17,126 @@ from gsMap.config import LatentToGeneConfig
|
|
15
17
|
logger = logging.getLogger(__name__)
|
16
18
|
|
17
19
|
|
18
|
-
def
|
20
|
+
def find_neighbors(coor, num_neighbour):
|
19
21
|
"""
|
20
|
-
|
22
|
+
Find Neighbors of each cell (based on spatial coordinates).
|
21
23
|
"""
|
22
24
|
nbrs = NearestNeighbors(n_neighbors=num_neighbour).fit(coor)
|
23
25
|
distances, indices = nbrs.kneighbors(coor, return_distance=True)
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
spatial_net = KNN_df.copy()
|
31
|
-
id_cell_trans = dict(zip(range(coor.shape[0]), np.array(coor.index)))
|
32
|
-
|
33
|
-
spatial_net['Cell1'] = spatial_net['Cell1'].map(id_cell_trans)
|
34
|
-
spatial_net['Cell2'] = spatial_net['Cell2'].map(id_cell_trans)
|
35
|
-
|
26
|
+
cell_indices = np.arange(coor.shape[0])
|
27
|
+
cell1 = np.repeat(cell_indices, indices.shape[1])
|
28
|
+
cell2 = indices.flatten()
|
29
|
+
distance = distances.flatten()
|
30
|
+
spatial_net = pd.DataFrame({'Cell1': cell1, 'Cell2': cell2, 'Distance': distance})
|
36
31
|
return spatial_net
|
37
32
|
|
38
33
|
|
39
|
-
def
|
34
|
+
def build_spatial_net(adata, annotation, num_neighbour):
|
40
35
|
"""
|
41
|
-
|
36
|
+
Build spatial neighbourhood matrix for each spot (cell) based on the spatial coordinates.
|
42
37
|
"""
|
43
38
|
logger.info(f'------Building spatial graph based on spatial coordinates...')
|
44
39
|
|
45
|
-
coor =
|
46
|
-
|
47
|
-
|
48
|
-
if not annotation is None:
|
40
|
+
coor = adata.obsm['spatial']
|
41
|
+
if annotation is not None:
|
49
42
|
logger.info(f'Cell annotations are provided...')
|
50
|
-
|
43
|
+
spatial_net_list = []
|
51
44
|
# Cells with annotations
|
52
45
|
for ct in adata.obs[annotation].dropna().unique():
|
53
|
-
|
54
|
-
|
55
|
-
|
46
|
+
idx = np.where(adata.obs[annotation] == ct)[0]
|
47
|
+
coor_temp = coor[idx, :]
|
48
|
+
spatial_net_temp = find_neighbors(coor_temp, min(num_neighbour, coor_temp.shape[0]))
|
49
|
+
# Map back to original indices
|
50
|
+
spatial_net_temp['Cell1'] = idx[spatial_net_temp['Cell1'].values]
|
51
|
+
spatial_net_temp['Cell2'] = idx[spatial_net_temp['Cell2'].values]
|
52
|
+
spatial_net_list.append(spatial_net_temp)
|
56
53
|
logger.info(f'{ct}: {coor_temp.shape[0]} cells')
|
57
54
|
|
58
55
|
# Cells labeled as nan
|
59
56
|
if pd.isnull(adata.obs[annotation]).any():
|
60
|
-
|
61
|
-
logger.info(f'Nan: {len(
|
62
|
-
|
63
|
-
spatial_net_temp =
|
64
|
-
|
65
|
-
|
57
|
+
idx_nan = np.where(pd.isnull(adata.obs[annotation]))[0]
|
58
|
+
logger.info(f'Nan: {len(idx_nan)} cells')
|
59
|
+
spatial_net_temp = find_neighbors(coor, num_neighbour)
|
60
|
+
spatial_net_temp = spatial_net_temp[spatial_net_temp['Cell1'].isin(idx_nan)]
|
61
|
+
spatial_net_list.append(spatial_net_temp)
|
62
|
+
spatial_net = pd.concat(spatial_net_list, axis=0)
|
66
63
|
else:
|
67
64
|
logger.info(f'Cell annotations are not provided...')
|
68
|
-
spatial_net =
|
65
|
+
spatial_net = find_neighbors(coor, num_neighbour)
|
69
66
|
|
70
67
|
return spatial_net
|
71
68
|
|
72
69
|
|
73
|
-
def
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
70
|
+
def find_neighbors_regional(cell_pos, spatial_net_dict, coor_latent, config, cell_annotations):
|
71
|
+
num_neighbour = config.num_neighbour
|
72
|
+
annotations = config.annotation
|
73
|
+
|
74
|
+
cell_use_pos = spatial_net_dict.get(cell_pos, [])
|
75
|
+
if len(cell_use_pos) == 0:
|
76
|
+
return []
|
77
|
+
|
78
|
+
cell_latent = coor_latent[cell_pos, :].reshape(1, -1)
|
79
|
+
neighbors_latent = coor_latent[cell_use_pos, :]
|
80
|
+
similarity = cosine_similarity(cell_latent, neighbors_latent).reshape(-1)
|
83
81
|
|
84
|
-
|
85
|
-
|
82
|
+
if annotations is not None:
|
83
|
+
cell_annotation = cell_annotations[cell_pos]
|
84
|
+
neighbor_annotations = cell_annotations[cell_use_pos]
|
85
|
+
mask = neighbor_annotations == cell_annotation
|
86
|
+
if not np.any(mask):
|
87
|
+
return []
|
88
|
+
similarity = similarity[mask]
|
89
|
+
cell_use_pos = cell_use_pos[mask]
|
86
90
|
|
87
|
-
|
91
|
+
if len(similarity) == 0:
|
92
|
+
return []
|
88
93
|
|
94
|
+
indices = np.argsort(-similarity) # descending order
|
95
|
+
top_indices = indices[:num_neighbour]
|
96
|
+
cell_select_pos = cell_use_pos[top_indices]
|
97
|
+
return cell_select_pos
|
89
98
|
|
90
|
-
|
99
|
+
|
100
|
+
def compute_regional_mkscore(cell_pos, spatial_net_dict, coor_latent, config, cell_annotations,
|
101
|
+
ranks, frac_whole, adata_X_bool):
|
91
102
|
"""
|
92
|
-
|
103
|
+
Compute gmean ranks of a region.
|
93
104
|
"""
|
94
|
-
|
105
|
+
cell_select_pos = find_neighbors_regional(
|
106
|
+
cell_pos, spatial_net_dict, coor_latent, config, cell_annotations
|
107
|
+
)
|
108
|
+
if len(cell_select_pos) == 0:
|
109
|
+
return np.zeros(ranks.shape[1], dtype=np.float16)
|
95
110
|
|
96
111
|
# Ratio of expression ranks
|
97
|
-
ranks_tg = ranks
|
112
|
+
ranks_tg = ranks[cell_select_pos, :]
|
98
113
|
gene_ranks_region = gmean(ranks_tg, axis=0)
|
99
114
|
gene_ranks_region[gene_ranks_region <= 1] = 0
|
100
115
|
|
101
|
-
if not
|
116
|
+
if not config.no_expression_fraction:
|
102
117
|
# Ratio of expression fractions
|
103
|
-
frac_focal =
|
118
|
+
frac_focal = adata_X_bool[cell_select_pos, :].sum(axis=0).A1 / len(cell_select_pos)
|
104
119
|
frac_region = frac_focal / frac_whole
|
105
120
|
frac_region[frac_region <= 1] = 0
|
106
121
|
frac_region[frac_region > 1] = 1
|
107
122
|
|
108
123
|
# Simultaneously consider the ratio of expression fractions and ranks
|
109
|
-
gene_ranks_region =
|
124
|
+
gene_ranks_region = gene_ranks_region * frac_region
|
110
125
|
|
111
126
|
mkscore = np.exp(gene_ranks_region ** 1.5) - 1
|
112
127
|
return mkscore.astype(np.float16, copy=False)
|
113
128
|
|
114
129
|
|
115
130
|
def run_latent_to_gene(config: LatentToGeneConfig):
|
116
|
-
global adata, coor_latent, spatial_net, ranks, frac_whole, args, spatial_net_dict, expressed_mask
|
117
|
-
args = config
|
118
|
-
# Load and process the spatial data
|
119
131
|
logger.info('------Loading the spatial data...')
|
120
132
|
adata = sc.read_h5ad(config.hdf5_with_latent_path)
|
121
133
|
|
122
|
-
|
123
|
-
adata.layers['rank'] = rankdata(adata.X.toarray().astype(np.float32), axis=1).astype(np.float32)
|
124
|
-
|
125
|
-
if not config.annotation is None:
|
134
|
+
if config.annotation is not None:
|
126
135
|
logger.info(f'------Cell annotations are provided as {config.annotation}...')
|
127
136
|
adata = adata[~pd.isnull(adata.obs[config.annotation]), :]
|
128
137
|
|
129
138
|
# Homologs transformation
|
130
|
-
if
|
139
|
+
if config.homolog_file is not None:
|
131
140
|
logger.info(f'------Transforming the {config.species} to HUMAN_GENE_SYM...')
|
132
141
|
homologs = pd.read_csv(config.homolog_file, sep='\t')
|
133
142
|
if homologs.shape[1] != 2:
|
@@ -137,34 +146,47 @@ def run_latent_to_gene(config: LatentToGeneConfig):
|
|
137
146
|
homologs.columns = [config.species, 'HUMAN_GENE_SYM']
|
138
147
|
homologs.set_index(config.species, inplace=True)
|
139
148
|
adata = adata[:, adata.var_names.isin(homologs.index)]
|
140
|
-
# Log the number of genes left after homolog transformation
|
141
149
|
logger.info(f"{adata.shape[1]} genes retained after homolog transformation.")
|
142
150
|
if adata.shape[1] < 100:
|
143
151
|
raise ValueError("Too few genes retained in ST data (<100).")
|
144
152
|
adata.var_names = homologs.loc[adata.var_names, 'HUMAN_GENE_SYM'].values
|
145
|
-
# drop duplicated genes
|
146
153
|
adata = adata[:, ~adata.var_names.duplicated()]
|
147
154
|
|
148
|
-
# Remove cells
|
155
|
+
# Remove cells and genes that are not expressed
|
149
156
|
logger.info(f'Number of cells, genes of the input data: {adata.shape[0]},{adata.shape[1]}')
|
150
157
|
adata = adata[adata.X.sum(axis=1) > 0, adata.X.sum(axis=0) > 0]
|
151
158
|
logger.info(f'Number of cells, genes after transformation: {adata.shape[0]},{adata.shape[1]}')
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
159
|
+
|
160
|
+
# Create mappings
|
161
|
+
n_cells = adata.n_obs
|
162
|
+
n_genes = adata.n_vars
|
163
|
+
|
164
|
+
if config.annotation is not None:
|
165
|
+
cell_annotations = adata.obs[config.annotation].values
|
166
|
+
else:
|
167
|
+
cell_annotations = None
|
168
|
+
|
169
|
+
# Build the spatial graph
|
170
|
+
spatial_net = build_spatial_net(adata, config.annotation, config.num_neighbour_spatial)
|
171
|
+
spatial_net_dict = spatial_net.groupby('Cell1')['Cell2'].apply(np.array).to_dict()
|
157
172
|
|
158
173
|
# Extract the latent representation
|
159
|
-
coor_latent =
|
160
|
-
coor_latent
|
161
|
-
|
162
|
-
|
174
|
+
coor_latent = adata.obsm[config.latent_representation]
|
175
|
+
coor_latent = coor_latent.astype(np.float32)
|
176
|
+
|
177
|
+
# Compute ranks
|
178
|
+
logger.info('------Ranking the spatial data...')
|
179
|
+
adata_X = adata.X.tocsr()
|
180
|
+
ranks = np.zeros((n_cells, n_genes), dtype=np.float32)
|
163
181
|
|
164
|
-
|
182
|
+
for i in tqdm(range(n_cells), desc="Computing ranks per cell"):
|
183
|
+
data = adata_X[i, :].toarray().flatten()
|
184
|
+
ranks[i, :] = rankdata(data, method='average')
|
185
|
+
|
186
|
+
# Geometric mean across slices
|
165
187
|
if config.gM_slices is not None:
|
166
188
|
logger.info('Geometrical mean across multiple slices is provided.')
|
167
|
-
|
189
|
+
gM_df = pd.read_parquet(config.gM_slices)
|
168
190
|
if config.species is not None:
|
169
191
|
homologs = pd.read_csv(config.homolog_file, sep='\t', header=None)
|
170
192
|
if homologs.shape[1] < 2:
|
@@ -172,47 +194,48 @@ def run_latent_to_gene(config: LatentToGeneConfig):
|
|
172
194
|
"Homologs file must have at least two columns: one for the species and one for the human gene symbol.")
|
173
195
|
homologs.columns = [config.species, 'HUMAN_GENE_SYM']
|
174
196
|
homologs.set_index(config.species, inplace=True)
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
gM =
|
180
|
-
adata = adata[:,
|
197
|
+
gM_df = gM_df.loc[gM_df.index.isin(homologs.index)]
|
198
|
+
gM_df.index = homologs.loc[gM_df.index, 'HUMAN_GENE_SYM'].values
|
199
|
+
common_genes = np.intersect1d(adata.var_names, gM_df.index)
|
200
|
+
gM_df = gM_df.loc[common_genes]
|
201
|
+
gM = gM_df['G_Mean'].values
|
202
|
+
adata = adata[:, common_genes]
|
203
|
+
ranks = ranks[:, np.isin(adata.var_names, common_genes)]
|
181
204
|
else:
|
182
|
-
gM = gmean(
|
205
|
+
gM = gmean(ranks, axis=0)
|
183
206
|
|
184
207
|
# Compute the fraction of each gene across cells
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
# Normalize the
|
189
|
-
ranks =
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
# adata.layers['mkscore'] = mk_score.values.T
|
208
|
+
adata_X_bool = adata_X.astype(bool)
|
209
|
+
frac_whole = np.asarray(adata_X_bool.sum(axis=0)).flatten() / n_cells
|
210
|
+
|
211
|
+
# Normalize the ranks
|
212
|
+
ranks = ranks / gM
|
213
|
+
|
214
|
+
# Compute marker scores in parallel
|
215
|
+
logger.info('------Computing marker scores...')
|
216
|
+
|
217
|
+
def compute_mk_score_wrapper(cell_pos):
|
218
|
+
return compute_regional_mkscore(
|
219
|
+
cell_pos, spatial_net_dict, coor_latent, config, cell_annotations, ranks, frac_whole, adata_X_bool
|
220
|
+
)
|
221
|
+
|
222
|
+
mk_scores = [compute_mk_score_wrapper(cell_pos) for cell_pos in tqdm(range(n_cells), desc="Calculating marker scores")]
|
223
|
+
mk_score = np.vstack(mk_scores).T
|
224
|
+
|
225
|
+
# Remove mitochondrial genes
|
226
|
+
gene_names = adata.var_names.values.astype(str)
|
227
|
+
mt_gene_mask = ~(np.char.startswith(gene_names, 'MT-') | np.char.startswith(gene_names, 'mt-'))
|
228
|
+
mk_score = mk_score[mt_gene_mask, :]
|
229
|
+
gene_names = gene_names[mt_gene_mask]
|
208
230
|
|
209
231
|
# Save the marker scores
|
210
232
|
logger.info(f'------Saving marker scores ...')
|
211
233
|
output_file_path = Path(config.mkscore_feather_path)
|
212
234
|
output_file_path.parent.mkdir(parents=True, exist_ok=True, mode=0o755)
|
213
|
-
|
214
|
-
|
215
|
-
|
235
|
+
mk_score_df = pd.DataFrame(mk_score, index=gene_names, columns=adata.obs_names)
|
236
|
+
mk_score_df.reset_index(inplace=True)
|
237
|
+
mk_score_df.rename(columns={'index': 'HUMAN_GENE_SYM'}, inplace=True)
|
238
|
+
mk_score_df.to_feather(output_file_path)
|
216
239
|
|
217
240
|
# Save the modified adata object to disk
|
218
241
|
adata.write(config.hdf5_with_latent_path)
|
@@ -1,4 +1,4 @@
|
|
1
|
-
gsMap/__init__.py,sha256=
|
1
|
+
gsMap/__init__.py,sha256=eQ-mfdcGTJtKS2KIu5PEQMqgx_9j9W5KKTBVr-iI4yo,78
|
2
2
|
gsMap/__main__.py,sha256=jR-HT42Zzfj2f-7kFJy0bkWjNxcV1MyfQHXFpef2nSE,62
|
3
3
|
gsMap/cauchy_combination_test.py,sha256=zBPR7DOaNkr7rRoua4tAjRZL7ArjCyMRSQlPSUdHNSE,5694
|
4
4
|
gsMap/config.py,sha256=hMUvlwlKZXeRdTJZfMINz_8DadVhEIT6X6fyJf11M9E,41134
|
@@ -6,7 +6,7 @@ gsMap/diagnosis.py,sha256=pp3ONVaWCOoNCog1_6eud38yicBFxL-XhH7D8iTBgF4,13220
|
|
6
6
|
gsMap/find_latent_representation.py,sha256=BVv4dyTolrlciHG3I-vwNDh2ruPpTf9jiT1hMKZnpto,6044
|
7
7
|
gsMap/format_sumstats.py,sha256=9OBxuunoOLml3LKZvvRsPEEjQvT1Cuqb0w6lqsRIYPw,13714
|
8
8
|
gsMap/generate_ldscore.py,sha256=2JfQoMWeQ0-B-zRHakmwq8ovkeewlnWHUCnih6od6ZE,29089
|
9
|
-
gsMap/latent_to_gene.py,sha256=
|
9
|
+
gsMap/latent_to_gene.py,sha256=MwoGQd0EDvDmvpuMoVD83SI1EeGJXXzMW8YZp_6wxI8,10082
|
10
10
|
gsMap/main.py,sha256=skyBtESdjvuXd9HNq5c83OPxQTNgLVErkYhwuJm8tE4,1285
|
11
11
|
gsMap/report.py,sha256=H0uYAru2L5-d41_LFHPPdoJbtiTzP4f8kX-mirUNAfc,6963
|
12
12
|
gsMap/run_all_mode.py,sha256=sPEct9fRw7aAQuU7BNChxk-I8YQcXuq--mtBn-2wTTY,8388
|
@@ -24,8 +24,8 @@ gsMap/utils/jackknife.py,sha256=nEDPVQJOPQ_uqfUCGX_v5cQwokgCqUmJTT_8rVFuIQo,1824
|
|
24
24
|
gsMap/utils/make_annotations.py,sha256=lCbtahT27WFOwLgZrEUE5QcNRuMXmAFYUfsFR-cT-m0,22197
|
25
25
|
gsMap/utils/manhattan_plot.py,sha256=k3n-NNgMsov9-8UQrirVqG560FUfJ4d6wNG8C0OeCjY,26235
|
26
26
|
gsMap/utils/regression_read.py,sha256=n_hZZzQXHU-CSLvSofXmQM5Jw4Zpufv3U2HoUW344ko,8768
|
27
|
-
gsmap-1.
|
28
|
-
gsmap-1.
|
29
|
-
gsmap-1.
|
30
|
-
gsmap-1.
|
31
|
-
gsmap-1.
|
27
|
+
gsmap-1.66.dist-info/entry_points.txt,sha256=s_P2Za22O077tc1FPLKMinbdRVXaN_HTcDBgWMYpqA4,41
|
28
|
+
gsmap-1.66.dist-info/LICENSE,sha256=Ni2F-lLSv_H1xaVT3CoSrkiKzMvlgwh-dq8PE1esGyI,1094
|
29
|
+
gsmap-1.66.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
|
30
|
+
gsmap-1.66.dist-info/METADATA,sha256=HXeRNmaP_UPzG2Qjn5s-jcLBvrfLgPYl7qVGDAKJG5Y,3376
|
31
|
+
gsmap-1.66.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|