mcDETECT 2.0.9.tar.gz → 2.0.10.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of mcDETECT has been flagged as potentially problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mcDETECT
- Version: 2.0.9
+ Version: 2.0.10
  Summary: Uncovering the dark transcriptome in polarized neuronal compartments with mcDETECT
  Home-page: https://github.com/chen-yang-yuan/mcDETECT
  Author: Chenyang Yuan
@@ -0,0 +1,6 @@
+ __version__ = "2.0.10"
+
+ from . import model
+ from . import utils
+
+ __all__ = ["model", "utils"]
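The new top-level __init__.py pins the version string and re-exports the two subpackages. A minimal usage sketch (hypothetical session, assuming this release is installed):

# Hypothetical usage sketch: with the new __init__.py, the version string
# and the re-exported subpackages are reachable from the top-level package.
import mcDETECT

print(mcDETECT.__version__)          # "2.0.10"
from mcDETECT import model, utils    # the names listed in __all__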
@@ -122,8 +122,10 @@ class mcDETECT:
                      continue
                  temp_in_nucleus = np.sum(target["overlaps_nucleus"].values[mask])
                  temp_size = coords.shape[0]
-                 coords_unique = np.unique(coords, axis=0)
-                 center, r2 = miniball.get_bounding_ball(coords_unique, epsilon=1e-8)
+                 # coords_unique = np.unique(coords, axis=0)
+                 temp = pd.DataFrame(coords, columns=["global_x", "global_y", "global_z"])
+                 temp = temp.drop_duplicates()
+                 center, r2 = miniball.get_bounding_ball(np.array(temp), epsilon=1e-8)
                  if self.type != "Xenium":
                      closest_z = closest(z_grid, center[2])
                  else:
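This is the main code change in the release: deduplication of cluster coordinates moves from np.unique to pandas drop_duplicates before the minimum-enclosing-ball call. A standalone sketch contrasting the two paths (illustrative only, with made-up coordinates, not code from the package): np.unique(..., axis=0) also sorts the rows, while DataFrame.drop_duplicates keeps input order, and since the minimum enclosing ball of a point set is unique, both should agree up to numerical tolerance.

# Illustrative comparison of the 2.0.9 and 2.0.10 deduplication paths (made-up data).
import miniball
import numpy as np
import pandas as pd

coords = np.array([[0.0, 0.0, 1.0],
                   [2.0, 0.0, 1.0],
                   [0.0, 0.0, 1.0],   # duplicate point
                   [1.0, 1.0, 2.0]])

old = np.unique(coords, axis=0)   # 2.0.9: dedup, rows come back sorted
new = pd.DataFrame(coords, columns=["global_x", "global_y", "global_z"])
new = np.array(new.drop_duplicates())   # 2.0.10: dedup, input order kept

c_old, r2_old = miniball.get_bounding_ball(old, epsilon=1e-8)
c_new, r2_new = miniball.get_bounding_ball(new, epsilon=1e-8)

# The minimum enclosing ball is unique, so both deduplications should
# yield the same center and squared radius up to numerical tolerance.
assert np.allclose(c_old, c_new, atol=1e-6)
assert np.isclose(r2_old, r2_new, atol=1e-6)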
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mcDETECT
- Version: 2.0.9
+ Version: 2.0.10
  Summary: Uncovering the dark transcriptome in polarized neuronal compartments with mcDETECT
  Home-page: https://github.com/chen-yang-yuan/mcDETECT
  Author: Chenyang Yuan
@@ -3,7 +3,6 @@ README.md
  setup.py
  mcDETECT/__init__.py
  mcDETECT/model.py
- mcDETECT/model_new_incorrect.py
  mcDETECT/utils.py
  mcDETECT.egg-info/PKG-INFO
  mcDETECT.egg-info/SOURCES.txt
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages

  setup(
      name = "mcDETECT",
-     version = "2.0.9",
+     version = "2.0.10",
      packages = find_packages(),
      install_requires = ["anndata", "miniball", "numpy", "pandas", "rtree", "scanpy", "scikit-learn", "scipy", "shapely"],
      author = "Chenyang Yuan",
@@ -1,6 +0,0 @@
- __version__ = "2.0.9"
-
- from . import model
- from . import utils
-
- __all__ = ["model", "utils"]
@@ -1,625 +0,0 @@
- import anndata
- import math
- import miniball
- import numpy as np
- import pandas as pd
- import scanpy as sc
- from collections import Counter
- from rtree import index
- from scipy.sparse import csr_matrix
- from scipy.spatial import cKDTree
- from scipy.stats import poisson
- from shapely.geometry import Point
- from sklearn.cluster import DBSCAN
- from sklearn.preprocessing import OneHotEncoder
-
-
- from .utils import *
-
-
- class mcDETECT:
-
-
-     def __init__(self, type, transcripts, gnl_genes, nc_genes = None, eps = 1.5, minspl = None, grid_len = 1.0, cutoff_prob = 0.95, alpha = 5.0, low_bound = 3,
-                  size_thr = 4.0, in_nucleus_thr = (0.5, 0.5), l = 1.0, rho = 0.2, s = 1.0, nc_top = 20, nc_thr = 0.1):
-
-         self.type = type # string, iST platform, now support MERSCOPE, Xenium, and CosMx
-         self.transcripts = transcripts # dataframe, transcripts file
-         self.gnl_genes = gnl_genes # list, string, all granule markers
-         self.nc_genes = nc_genes # list, string, all negative controls
-         self.eps = eps # numeric, searching radius epsilon
-         self.minspl = minspl # integer, manually select min_samples, i.e., no automatic parameter selection
-         self.grid_len = grid_len # numeric, length of grids for computing the tissue area
-         self.cutoff_prob = cutoff_prob # numeric, cutoff probability in parameter selection for min_samples
-         self.alpha = alpha # numeric, scaling factor in parameter selection for min_samples
-         self.low_bound = low_bound # integer, lower bound in parameter selection for min_samples
-         self.size_thr = size_thr # numeric, threshold for maximum radius of an aggregation
-         self.in_nucleus_thr = in_nucleus_thr # 2-d tuple, threshold for low- and high-in-nucleus ratio
-         self.l = l # numeric, scaling factor for seaching overlapped spheres
-         self.rho = rho # numeric, threshold for determining overlaps
-         self.s = s # numeric, scaling factor for merging overlapped spheres
-         self.nc_top = nc_top # integer, number of negative controls retained for filtering
-         self.nc_thr = nc_thr # numeric, threshold for negative control filtering
-
-
-     # [INNER] construct grids, input for tissue_area()
-     def construct_grid(self, grid_len = None):
-         if grid_len is None:
-             grid_len = self.grid_len
-         x_min, x_max = np.min(self.transcripts["global_x"]), np.max(self.transcripts["global_x"])
-         y_min, y_max = np.min(self.transcripts["global_y"]), np.max(self.transcripts["global_y"])
-         x_min = np.floor(x_min / grid_len) * grid_len
-         x_max = np.ceil(x_max / grid_len) * grid_len
-         y_min = np.floor(y_min / grid_len) * grid_len
-         y_max = np.ceil(y_max / grid_len) * grid_len
-         x_bins = np.arange(x_min, x_max + grid_len, grid_len)
-         y_bins = np.arange(y_min, y_max + grid_len, grid_len)
-         return x_bins, y_bins
-
-
-     # [INNER] calculate tissue area, input for poisson_select()
-     def tissue_area(self):
-         if not hasattr(self, "_cached_area"):
-             x_bins, y_bins = self.construct_grid(grid_len = None)
-             hist, _, _ = np.histogram2d(self.transcripts["global_x"], self.transcripts["global_y"], bins = [x_bins, y_bins])
-             self._cached_area = np.count_nonzero(hist) * (self.grid_len ** 2)
-         return self._cached_area
-
-
-     # [INNER] calculate optimal min_samples, input for dbscan()
-     def poisson_select(self, gene_name):
-         num_trans = np.sum(self.transcripts["target"] == gene_name)
-         bg_density = num_trans / self.tissue_area()
-         cutoff_density = poisson.ppf(self.cutoff_prob, mu = self.alpha * bg_density * (np.pi * self.eps ** 2))
-         optimal_m = int(max(cutoff_density, self.low_bound))
-         return optimal_m
-
-
-     # [INTERMEDIATE] dictionary, low- and high-in-nucleus spheres for each granule marker
-     def dbscan(self, target_names = None, record_cell_id = False, write_csv = False, write_path = "./"):
-
-         if self.type != "Xenium":
-             z_grid = list(self.transcripts["global_z"].unique())
-             z_grid.sort()
-
-         if target_names is None:
-             target_names = self.gnl_genes
-
-         transcripts = self.transcripts[self.transcripts["target"].isin(target_names)]
-         grouped = {g: df for g, df in transcripts.groupby("target")}
-
-         num_individual, data_low, data_high = [], {}, {}
-
-         for j in target_names:
-
-             # split transcripts
-             target = grouped[j]
-             others = pd.concat([grouped[g] for g in target_names if g != j], ignore_index = True)
-             tree = make_tree(d1 = np.array(others["global_x"]), d2 = np.array(others["global_y"]), d3 = np.array(others["global_z"]))
-
-             # 3D DBSCAN
-             if self.minspl is None:
-                 min_spl = self.poisson_select(j)
-             else:
-                 min_spl = self.minspl
-             X = np.array(target[["global_x", "global_y", "global_z"]])
-             db = DBSCAN(eps = self.eps, min_samples = min_spl, algorithm = "kd_tree").fit(X)
-             labels = db.labels_
-             n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
-
-             # iterate over all aggregations
-             cell_id, sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score = [], [], [], [], [], [], [], [], []
-
-             for k in range(n_clusters):
-
-                 # record cell ids
-                 if record_cell_id:
-                     temp = target[labels == k]
-                     temp_cell_id_mode = temp["cell_id"].mode()[0]
-                     cell_id.append(temp_cell_id_mode)
-
-                 # find minimum enclosing spheres
-                 mask = (labels == k)
-                 coords = X[mask]
-                 if coords.shape[0] == 0:
-                     continue
-                 temp_in_nucleus = np.sum(target["overlaps_nucleus"].values[mask])
-                 temp_size = coords.shape[0]
-                 coords_unique = np.unique(coords, axis=0)
-                 center, r2 = miniball.get_bounding_ball(coords_unique, epsilon=1e-8)
-                 if self.type != "Xenium":
-                     closest_z = closest(z_grid, center[2])
-                 else:
-                     closest_z = center[2]
-
-                 # calculate size, composition, and in-nucleus score
-                 other_idx = tree.query_ball_point([center[0], center[1], center[2]], np.sqrt(r2))
-                 other_trans = others.iloc[other_idx]
-                 other_in_nucleus = np.sum(other_trans["overlaps_nucleus"])
-                 other_size = other_trans.shape[0]
-                 other_comp = len(other_trans["target"].unique())
-                 total_size = temp_size + other_size
-                 total_comp = 1 + other_comp
-                 local_score = (temp_in_nucleus + other_in_nucleus) / total_size
-
-                 # record coordinate, radius, size, composition, and in-nucleus score
-                 sphere_x.append(center[0])
-                 sphere_y.append(center[1])
-                 sphere_z.append(center[2])
-                 layer_z.append(closest_z)
-                 sphere_r.append(np.sqrt(r2))
-                 sphere_size.append(total_size)
-                 sphere_comp.append(total_comp)
-                 sphere_score.append(local_score)
-
-             # basic features for all spheres from each granule marker
-             sphere = pd.DataFrame(list(zip(sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score, [j] * len(sphere_x))),
-                                   columns = ["sphere_x", "sphere_y", "sphere_z", "layer_z", "sphere_r", "size", "comp", "in_nucleus", "gene"])
-             sphere = sphere.astype({"sphere_x": float, "sphere_y": float, "sphere_z": float, "layer_z": float, "sphere_r": float, "size": float, "comp": float, "in_nucleus": float, "gene": str})
-             if record_cell_id:
-                 sphere["cell_id"] = cell_id
-                 sphere = sphere.astype({"cell_id": str})
-
-             # split low- and high-in-nucleus spheres
-             sphere_low = sphere[(sphere["sphere_r"] < self.size_thr) & (sphere["in_nucleus"] < self.in_nucleus_thr[0])]
-             sphere_high = sphere[(sphere["sphere_r"] < self.size_thr) & (sphere["in_nucleus"] > self.in_nucleus_thr[1])]
-
-             if write_csv:
-                 sphere_low.to_csv(write_path + j + " sphere.csv", index=0)
-                 sphere_high.to_csv(write_path + j + " sphere_high.csv", index=0)
-
-             num_individual.append(sphere_low.shape[0])
-             data_low[target_names.index(j)] = sphere_low
-             data_high[target_names.index(j)] = sphere_high
-             print(f"{target_names.index(j) + 1} of {len(target_names)} genes processed!")
-
-         return np.sum(num_individual), data_low, data_high
-
-
-     # [INNER] merge points from two overlapped spheres, input for remove_overlaps()
-     def find_points(self, sphere_a, sphere_b):
-         transcripts = self.transcripts[self.transcripts["target"].isin(self.gnl_genes)]
-         tree_temp = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))
-         idx_a = tree_temp.query_ball_point([sphere_a["sphere_x"], sphere_a["sphere_y"], sphere_a["sphere_z"]], sphere_a["sphere_r"])
-         points_a = transcripts.iloc[idx_a]
-         points_a = points_a[points_a["target"] == sphere_a["gene"]]
-         idx_b = tree_temp.query_ball_point([sphere_b["sphere_x"], sphere_b["sphere_y"], sphere_b["sphere_z"]], sphere_b["sphere_r"])
-         points_b = transcripts.iloc[idx_b]
-         points_b = points_b[points_b["target"] == sphere_b["gene"]]
-         points = pd.concat([points_a, points_b])
-         points = points[["global_x", "global_y", "global_z"]]
-         return points
-
-
-     def remove_overlaps(self, set_a, set_b):
-
-         set_a = set_a.copy()
-         set_b = set_b.copy()
-
-         # find possible overlaps on 2D by r-tree
-         idx_b = make_rtree(set_b)
-         for i, sphere_a in set_a.iterrows():
-             center_a_3D = np.array([sphere_a.sphere_x, sphere_a.sphere_y, sphere_a.sphere_z])
-             bounds_a = (sphere_a.sphere_x - sphere_a.sphere_r,
-                         sphere_a.sphere_y - sphere_a.sphere_r,
-                         sphere_a.sphere_x + sphere_a.sphere_r,
-                         sphere_a.sphere_y + sphere_a.sphere_r)
-             possible_overlaps = idx_b.intersection(bounds_a)
-
-             # search 3D overlaps within possible overlaps
-             for j in possible_overlaps:
-                 if j in set_b.index:
-                     sphere_b = set_b.loc[j]
-                     center_b_3D = np.array([sphere_b.sphere_x, sphere_b.sphere_y, sphere_b.sphere_z])
-                     dist = np.linalg.norm(center_a_3D - center_b_3D)
-                     radius_sum = sphere_a.sphere_r + sphere_b.sphere_r
-                     radius_diff = sphere_a.sphere_r - sphere_b.sphere_r
-
-                     # relative positions (0: internal & intersect, 1: internal, 2: intersect)
-                     c0 = (dist < self.l * radius_sum)
-                     c1 = (dist <= self.l * np.abs(radius_diff))
-                     c1_1 = (radius_diff > 0)
-                     c2_1 = (dist < self.rho * self.l * radius_sum)
-
-                     # operations on dataframes
-                     if c0:
-                         if c1 and c1_1: # keep A and remove B
-                             set_b.drop(index = j, inplace = True)
-                         elif c1 and not c1_1: # replace A with B and remove B
-                             set_a.loc[i] = set_b.loc[j]
-                             set_b.drop(index = j, inplace = True)
-                         elif not c1 and c2_1: # replace A with new sphere and remove B
-                             points_union = np.array(self.find_points(sphere_a, sphere_b))
-                             new_center, new_radius = miniball.get_bounding_ball(points_union, epsilon=1e-8)
-                             set_a.loc[i, "sphere_x"] = new_center[0]
-                             set_a.loc[i, "sphere_y"] = new_center[1]
-                             set_a.loc[i, "sphere_z"] = new_center[2]
-                             set_a.loc[i, "sphere_r"] = self.s * new_radius
-                             set_b.drop(index = j, inplace = True)
-
-         set_a = set_a.reset_index(drop = True)
-         set_b = set_b.reset_index(drop = True)
-         return set_a, set_b
-
-
-     # [INNER] merge spheres from different granule markers, input for detect()
-     def merge_sphere(self, sphere_dict):
-         sphere = sphere_dict[0].copy()
-         for j in range(1, len(self.gnl_genes)):
-             target_sphere = sphere_dict[j]
-             sphere, target_sphere_new = self.remove_overlaps(sphere, target_sphere)
-             sphere = pd.concat([sphere, target_sphere_new])
-             sphere = sphere.reset_index(drop = True)
-         return sphere
-
-
-     # [INNER] negative control filtering, input for detect()
-     def nc_filter(self, sphere_low, sphere_high):
-
-         # negative control gene profiling
-         adata_low = self.profile(sphere_low, self.nc_genes)
-         adata_high = self.profile(sphere_high, self.nc_genes)
-         adata = anndata.concat([adata_low, adata_high], axis = 0, merge = "same")
-         adata.var["genes"] = adata.var.index
-         adata.obs_keys = list(np.arange(adata.shape[0]))
-         adata.obs["type"] = ["low"] * adata_low.shape[0] + ["high"] * adata_high.shape[0]
-         adata.obs["type"] = pd.Categorical(adata.obs["type"], categories = ["low", "high"], ordered = True)
-
-         # DE analysis of negative control genes
-         sc.tl.rank_genes_groups(adata, "type", method = "t-test")
-         names = adata.uns["rank_genes_groups"]["names"]
-         names = pd.DataFrame(names)
-         logfc = adata.uns["rank_genes_groups"]["logfoldchanges"]
-         logfc = pd.DataFrame(logfc)
-         pvals = adata.uns["rank_genes_groups"]["pvals"]
-         pvals = pd.DataFrame(pvals)
-
-         # select top upregulated negative control genes
-         df = pd.DataFrame({"names": names["high"], "logfc": logfc["high"], "pvals": pvals["high"]})
-         df = df[df["logfc"] >= 0]
-         df = df.sort_values(by = ["pvals"], ascending = True)
-         nc_genes_final = list(df["names"].head(self.nc_top))
-
-         # negative control filtering
-         nc_transcripts_final = self.transcripts[self.transcripts["target"].isin(nc_genes_final)]
-         tree = make_tree(d1 = np.array(nc_transcripts_final["global_x"]), d2 = np.array(nc_transcripts_final["global_y"]), d3 = np.array(nc_transcripts_final["global_z"]))
-         centers = sphere_low[["sphere_x", "sphere_y", "sphere_z"]].to_numpy()
-         radii = sphere_low["sphere_r"].to_numpy()
-         sizes = sphere_low["size"].to_numpy()
-         counts = np.array([len(tree.query_ball_point(c, r)) for c, r in zip(centers, radii)])
-         pass_idx = (counts == 0) | (counts / sizes < self.nc_thr)
-         sphere = sphere_low[pass_idx].reset_index(drop = True)
-         return sphere
-
-
-     # [MAIN] dataframe, granule metadata
-     def detect(self, record_cell_id = False):
-
-         _, data_low, data_high = self.dbscan(record_cell_id = record_cell_id)
-
-         print("Merging spheres...")
-         sphere_low, sphere_high = self.merge_sphere(data_low), self.merge_sphere(data_high)
-
-         if self.nc_genes is None:
-             return sphere_low
-         else:
-             print("Negative control filtering...")
-             return self.nc_filter(sphere_low, sphere_high)
-
-
-     # [MAIN] anndata, granule spatial transcriptome profile
-     def profile(self, granule, genes = None, print_itr = False):
-
-         if genes is None:
-             genes = list(self.transcripts["target"].unique())
-             transcripts = self.transcripts
-         else:
-             transcripts = self.transcripts[self.transcripts["target"].isin(genes)]
-
-         gene_to_idx = {g: i for i, g in enumerate(genes)}
-         gene_array = transcripts["target"].to_numpy()
-         tree = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))
-
-         n_gnl = granule.shape[0]
-         n_gene = len(genes)
-         data, row_idx, col_idx = [], [], []
-
-         # iterate over all granules to count nearby transcripts
-         for i in range(n_gnl):
-             temp = granule.iloc[i]
-             target_idx = tree.query_ball_point([temp["sphere_x"], temp["sphere_y"], temp["layer_z"]], temp["sphere_r"])
-             if not target_idx:
-                 continue
-             local_genes = gene_array[target_idx] # extract genes for those nearby transcripts
-             counts = Counter(local_genes) # count how many times each gene occurs
-             for g, cnt in counts.items(): # append nonzero entries to sparse matrix lists
-                 j = gene_to_idx[g] # get gene column index
-                 data.append(cnt) # nonzero count
-                 row_idx.append(i) # row index = granule index
-                 col_idx.append(j) # column index = gene index
-             if print_itr and (i % 5000 == 0):
-                 print(f"{i} out of {n_gnl} granules profiled!")
-
-         # construct sparse spatial transcriptome profile, (n_granules × n_genes)
-         X = csr_matrix((data, (row_idx, col_idx)), shape = (n_gnl, n_gene), dtype = np.float32)
-         adata = anndata.AnnData(X = X, obs = granule.copy())
-         adata.obs["granule_id"] = [f"gnl_{i}" for i in range(n_gnl)]
-         adata.obs = adata.obs.astype({"granule_id": str})
-         adata.obs.rename(columns = {"sphere_x": "global_x", "sphere_y": "global_y", "sphere_z": "global_z"}, inplace = True)
-         adata.var["genes"] = genes
-         adata.var_names = genes
-         adata.var_keys = genes
-         return adata
-
-
-     # [MAIN] anndata, spot-level gene expression
-     def spot_expression(self, grid_len, genes = None):
-
-         if genes is None:
-             genes = list(self.transcripts["target"].unique())
-             transcripts = self.transcripts
-         else:
-             transcripts = self.transcripts[self.transcripts["target"].isin(genes)]
-
-         # construct bins
-         x_bins, y_bins = self.construct_grid(grid_len = grid_len)
-
-         # initialize data
-         X = np.zeros((len(genes), (len(x_bins) - 1) * (len(y_bins) - 1)))
-         global_x, global_y = [], []
-
-         # coordinates
-         for i in list(x_bins)[:-1]:
-             center_x = i + 0.5 * grid_len
-             for j in list(y_bins)[:-1]:
-                 center_y = j + 0.5 * grid_len
-                 global_x.append(center_x)
-                 global_y.append(center_y)
-
-         # count matrix
-         for k_idx, k in enumerate(genes):
-             target_gene = transcripts[transcripts["target"] == k]
-             count_gene, _, _ = np.histogram2d(target_gene["global_x"], target_gene["global_y"], bins = [x_bins, y_bins])
-             X[k_idx, :] = count_gene.flatten()
-             if k_idx % 100 == 0:
-                 print(f"{k_idx} out of {len(genes)} genes profiled!")
-
-         # spot id
-         spot_id = []
-         for i in range(len(global_x)):
-             id = "spot_" + str(i)
-             spot_id.append(id)
-
-         # assemble data
-         adata = anndata.AnnData(X = np.transpose(X))
-         adata.obs["spot_id"] = spot_id
-         adata.obs["global_x"] = global_x
-         adata.obs["global_y"] = global_y
-         adata.var["genes"] = genes
-         adata.var_names = genes
-         adata.var_keys = genes
-         return adata
-
-
-     # [MAIN] anndata, spot-level neuron metadata
-     def spot_neuron(adata_neuron, spot, grid_len = 50, neuron_loc_key = ["global_x", "global_y"], spot_loc_key = ["global_x", "global_y"]):
-
-         adata_neuron = adata_neuron.copy()
-         neurons = adata_neuron.obs
-         spot = spot.copy()
-
-         half_len = grid_len / 2
-
-         indicator, neuron_count = [], []
-
-         for _, row in spot.obs.iterrows():
-
-             x = row[spot_loc_key[0]]
-             y = row[spot_loc_key[1]]
-             neuron_temp = neurons[(neurons[neuron_loc_key[0]] > x - half_len) & (neurons[neuron_loc_key[0]] < x + half_len) & (neurons[neuron_loc_key[1]] > y - half_len) & (neurons[neuron_loc_key[1]] < y + half_len)]
-             indicator.append(int(len(neuron_temp) > 0))
-             neuron_count.append(len(neuron_temp))
-
-         spot.obs["indicator"] = indicator
-         spot.obs["neuron_count"] = neuron_count
-         return spot
-
-
-     # [MAIN] anndata, spot-level granule metadata
-     def spot_granule(granule, spot, grid_len = 50, gnl_loc_key = ["sphere_x", "sphere_y"], spot_loc_key = ["global_x", "global_y"]):
-
-         granule = granule.copy()
-         spot = spot.copy()
-
-         half_len = grid_len / 2
-
-         indicator, granule_count, granule_radius, granule_size, granule_score = [], [], [], [], []
-
-         for _, row in spot.obs.iterrows():
-
-             x = row[spot_loc_key[0]]
-             y = row[spot_loc_key[1]]
-             gnl_temp = granule[(granule[gnl_loc_key[0]] >= x - half_len) & (granule[gnl_loc_key[0]] < x + half_len) & (granule[gnl_loc_key[1]] >= y - half_len) & (granule[gnl_loc_key[1]] < y + half_len)]
-             indicator.append(int(len(gnl_temp) > 0))
-             granule_count.append(len(gnl_temp))
-
-             if len(gnl_temp) == 0:
-                 granule_radius.append(0)
-                 granule_size.append(0)
-                 granule_score.append(0)
-             else:
-                 granule_radius.append(np.nanmean(gnl_temp["sphere_r"]))
-                 granule_size.append(np.nanmean(gnl_temp["size"]))
-                 granule_score.append(np.nanmean(gnl_temp["in_nucleus"]))
-
-         spot.obs["indicator"] = indicator
-         spot.obs["gnl_count"] = granule_count
-         spot.obs["gnl_radius"] = granule_radius
-         spot.obs["gnl_size"] = granule_size
-         spot.obs["gnl_score"] = granule_score
-         return spot
-
-
-     # [Main] anndata, neuron-granule colocalization
-     def neighbor_granule(adata_neuron, granule_adata, radius = 10, sigma = None, loc_key = ["global_x", "global_y"]):
-
-         adata_neuron = adata_neuron.copy()
-         granule_adata = granule_adata.copy()
-
-         if sigma is None:
-             sigma = radius / 2
-
-         # neuron and granule coordinates
-         neuron_coords = adata_neuron.obs[loc_key].values
-         gnl_coords = granule_adata.obs[loc_key].values
-
-         # make tree
-         tree = make_tree(d1 = gnl_coords[:, 0], d2 = gnl_coords[:, 1])
-
-         # query neighboring granules for each neuron
-         neighbor_indices = tree.query_ball_point(neuron_coords, r = radius)
-
-         # record count and indices
-         granule_counts = np.array([len(indices) for indices in neighbor_indices])
-         adata_neuron.obs["neighbor_gnl_count"] = granule_counts
-         adata_neuron.uns["neighbor_gnl_indices"] = neighbor_indices
-
-         # ---------- neighboring granule expression matrix ---------- #
-         n_neurons, n_genes = adata_neuron.n_obs, adata_neuron.n_vars
-         weighted_expr = np.zeros((n_neurons, n_genes))
-
-         for i, indices in enumerate(neighbor_indices):
-             if len(indices) == 0:
-                 continue
-             distances = np.linalg.norm(gnl_coords[indices] - neuron_coords[i], axis = 1)
-             weights = np.exp(- (distances ** 2) / (2 * sigma ** 2))
-             weights = weights / weights.sum()
-             weighted_expr[i] = np.average(granule_adata.X[indices], axis = 0, weights = weights)
-
-         adata_neuron.obsm["weighted_gnl_expression"] = weighted_expr
-
-         # ---------- neighboring granule spatial feature ---------- #
-         features = []
-
-         for i, gnl_idx in enumerate(neighbor_indices):
-
-             feats = {}
-             feats["n_granules"] = len(gnl_idx)
-
-             if len(gnl_idx) == 0:
-                 feats.update({"mean_distance": np.nan, "std_distance": np.nan, "radius_max": np.nan, "radius_min": np.nan, "density": 0, "center_offset_norm": np.nan, "anisotropy_ratio": np.nan})
-             else:
-                 gnl_pos = gnl_coords[gnl_idx]
-                 neuron_pos = neuron_coords[i]
-                 dists = np.linalg.norm(gnl_pos - neuron_pos, axis = 1)
-                 feats["mean_distance"] = dists.mean()
-                 feats["std_distance"] = dists.std()
-                 feats["radius_max"] = dists.max()
-                 feats["radius_min"] = dists.min()
-                 feats["density"] = len(gnl_idx) / (np.pi * radius ** 2)
-                 centroid = gnl_pos.mean(axis = 0)
-                 offset = centroid - neuron_pos
-                 feats["center_offset_norm"] = np.linalg.norm(offset)
-                 cov = np.cov((gnl_pos - neuron_pos).T)
-                 eigvals = np.linalg.eigvalsh(cov)
-                 if np.min(eigvals) > 0:
-                     feats["anisotropy_ratio"] = np.max(eigvals) / np.min(eigvals)
-                 else:
-                     feats["anisotropy_ratio"] = np.nan
-
-             features.append(feats)
-
-         spatial_df = pd.DataFrame(features, index = adata_neuron.obs_names)
-         return adata_neuron, spatial_df
-
-
-     # [MAIN] numpy array, neuron embeddings based on neighboring granules
-     def neuron_embedding_one_hot(adata_neuron, granule_adata, k = 10, radius = 10, loc_key = ["global_x", "global_y"], gnl_subtype_key = "granule_subtype_kmeans", padding_value = "Others"):
-
-         adata_neuron = adata_neuron.copy()
-         granule_adata = granule_adata.copy()
-
-         # neuron and granule coordinates, granule subtypes
-         neuron_coords = adata_neuron.obs[loc_key].to_numpy()
-         granule_coords = granule_adata.obs[loc_key].to_numpy()
-         granule_subtypes = granule_adata.obs[gnl_subtype_key].astype(str).to_numpy()
-
-         # include padding category
-         unique_subtypes = np.unique(granule_subtypes).tolist()
-         if padding_value not in unique_subtypes:
-             unique_subtypes.append(padding_value)
-
-         encoder = OneHotEncoder(categories = [unique_subtypes], sparse = False, handle_unknown = "ignore")
-         encoder.fit(np.array(unique_subtypes).reshape(-1, 1))
-         S = len(unique_subtypes)
-
-         # k-d tree
-         tree = make_tree(d1 = granule_coords[:, 0], d2 = granule_coords[:, 1])
-         distances, indices = tree.query(neuron_coords, k = k, distance_upper_bound = radius)
-
-         # initialize output
-         n_neurons = neuron_coords.shape[0]
-         embeddings = np.zeros((n_neurons, k, S), dtype = float)
-
-         for i in range(n_neurons):
-             for k in range(k):
-                 idx = indices[i, k]
-                 dist = distances[i, k]
-                 if idx == granule_coords.shape[0] or np.isinf(dist):
-                     subtype = padding_value
-                 else:
-                     subtype = granule_subtypes[idx]
-                 onehot = encoder.transform([[subtype]])[0]
-                 embeddings[i, k, :] = onehot
-
-         return embeddings, encoder.categories_[0]
-
-
-     # [MAIN] numpy array, neuron embeddings based on neighboring granules
-     def neuron_embedding_spatial_weight(adata_neuron, granule_adata, radius = 10, sigma = 10, loc_key = ["global_x", "global_y"], gnl_subtype_key = "granule_subtype_kmeans", padding_value = "Others"):
-
-         adata_neuron = adata_neuron.copy()
-         granule_adata = granule_adata.copy()
-
-         # neuron and granule coordinates, granule subtypes
-         neuron_coords = adata_neuron.obs[loc_key].to_numpy()
-         granule_coords = granule_adata.obs[loc_key].to_numpy()
-         granule_subtypes = granule_adata.obs[gnl_subtype_key].astype(str).to_numpy()
-
-         # include padding category
-         unique_subtypes = np.unique(granule_subtypes).tolist()
-         if padding_value not in unique_subtypes:
-             unique_subtypes.append(padding_value)
-
-         encoder = OneHotEncoder(categories = [unique_subtypes], sparse = False, handle_unknown = "ignore")
-         encoder.fit(np.array(unique_subtypes).reshape(-1, 1))
-         S = len(unique_subtypes)
-
-         # k-d tree
-         tree = make_tree(d1 = granule_coords[:, 0], d2 = granule_coords[:, 1])
-         all_neighbors = tree.query_ball_point(neuron_coords, r = radius)
-
-         # initialize output
-         n_neurons = neuron_coords.shape[0]
-         embeddings = np.zeros((n_neurons, S), dtype = float)
-
-         for i, neighbor_indices in enumerate(all_neighbors):
-             if not neighbor_indices:
-                 # no neighbors, assign to padding subtype
-                 embeddings[i] = encoder.transform([[padding_value]])[0]
-                 continue
-
-             # get neighbor subtypes and distances
-             neighbor_coords = granule_coords[neighbor_indices]
-             dists = np.linalg.norm(neuron_coords[i] - neighbor_coords, axis = 1)
-             weights = np.exp(- dists / sigma)
-
-             # encode subtypes to one-hot and weight them
-             subtypes = granule_subtypes[neighbor_indices]
-             onehots = encoder.transform(subtypes.reshape(-1, 1))
-             weighted_sum = (weights[:, np.newaxis] * onehots).sum(axis = 0)
-
-             # normalize to make it a composition vector
-             embeddings[i] = weighted_sum / weights.sum()
-
-         return embeddings, encoder.categories_[0]
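For context on the parameter selection used by dbscan() above (the same logic appears in the retained model.py, whose modified hunk earlier in this diff matches this file): min_samples is set to a Poisson upper quantile of the background transcript count expected inside an eps-radius disc. A worked sketch with made-up numbers (not values from the package):

# Worked sketch of the Poisson-based min_samples selection (made-up numbers).
import numpy as np
from scipy.stats import poisson

eps = 1.5            # DBSCAN search radius, as in the class defaults
alpha = 5.0          # scaling factor
cutoff_prob = 0.95   # cutoff probability
low_bound = 3        # lower bound on min_samples

num_trans = 50_000   # hypothetical transcript count for one marker gene
tissue_area = 4.0e6  # hypothetical tissue area in squared units

bg_density = num_trans / tissue_area                 # background density
mu = alpha * bg_density * (np.pi * eps ** 2)         # expected count in the disc
optimal_m = int(max(poisson.ppf(cutoff_prob, mu = mu), low_bound))
print(optimal_m)                                     # min_samples handed to DBSCAN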