mcDETECT 2.0.6.tar.gz → 2.0.8.tar.gz

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

Potentially problematic release.

This version of mcDETECT might be problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mcDETECT
- Version: 2.0.6
+ Version: 2.0.8
  Summary: Uncovering the dark transcriptome in polarized neuronal compartments with mcDETECT
  Home-page: https://github.com/chen-yang-yuan/mcDETECT
  Author: Chenyang Yuan
@@ -0,0 +1,6 @@
+ __version__ = "2.0.8"
+
+ from . import model
+ from . import utils
+
+ __all__ = ["model", "utils"]
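
The new __init__.py above pins the package version and re-exports the model and utils submodules. A minimal sketch of the resulting import surface (assuming mcDETECT 2.0.8 is installed; the class name mcDETECT inside model.py is confirmed by the diff below):

    import mcDETECT
    print(mcDETECT.__version__)             # "2.0.8"

    from mcDETECT.model import mcDETECT     # detector class added in model.py
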
@@ -0,0 +1,622 @@
+ import anndata
+ import math
+ import miniball
+ import numpy as np
+ import pandas as pd
+ import scanpy as sc
+ from collections import Counter
+ from rtree import index
+ from scipy.sparse import csr_matrix
+ from scipy.spatial import cKDTree
+ from scipy.stats import poisson
+ from shapely.geometry import Point
+ from sklearn.cluster import DBSCAN
+ from sklearn.preprocessing import OneHotEncoder
+
+
+ from .utils import *
+
+
+ class mcDETECT:
+
+
+     def __init__(self, type, transcripts, gnl_genes, nc_genes = None, eps = 1.5, minspl = None, grid_len = 1.0, cutoff_prob = 0.95, alpha = 5.0, low_bound = 3,
+                  size_thr = 4.0, in_nucleus_thr = (0.5, 0.5), l = 1.0, rho = 0.2, s = 1.0, nc_top = 20, nc_thr = 0.1):
+
+         self.type = type                         # string, iST platform; currently supports MERSCOPE, Xenium, and CosMx
+         self.transcripts = transcripts           # dataframe, transcripts file
+         self.gnl_genes = gnl_genes               # list of strings, all granule markers
+         self.nc_genes = nc_genes                 # list of strings, all negative controls
+         self.eps = eps                           # numeric, searching radius epsilon
+         self.minspl = minspl                     # integer, manually selected min_samples (disables automatic parameter selection)
+         self.grid_len = grid_len                 # numeric, side length of grids for computing the tissue area
+         self.cutoff_prob = cutoff_prob           # numeric, cutoff probability in parameter selection for min_samples
+         self.alpha = alpha                       # numeric, scaling factor in parameter selection for min_samples
+         self.low_bound = low_bound               # integer, lower bound in parameter selection for min_samples
+         self.size_thr = size_thr                 # numeric, threshold on the maximum radius of an aggregation
+         self.in_nucleus_thr = in_nucleus_thr     # 2-d tuple, thresholds for low- and high-in-nucleus ratios
+         self.l = l                               # numeric, scaling factor for searching overlapped spheres
+         self.rho = rho                           # numeric, threshold for determining overlaps
+         self.s = s                               # numeric, scaling factor for merging overlapped spheres
+         self.nc_top = nc_top                     # integer, number of negative controls retained for filtering
+         self.nc_thr = nc_thr                     # numeric, threshold for negative control filtering
+
+
+     # [INNER] construct grids, input for tissue_area()
+     def construct_grid(self, grid_len = None):
+         if grid_len is None:
+             grid_len = self.grid_len
+         x_min, x_max = np.min(self.transcripts["global_x"]), np.max(self.transcripts["global_x"])
+         y_min, y_max = np.min(self.transcripts["global_y"]), np.max(self.transcripts["global_y"])
+         x_min = np.floor(x_min / grid_len) * grid_len
+         x_max = np.ceil(x_max / grid_len) * grid_len
+         y_min = np.floor(y_min / grid_len) * grid_len
+         y_max = np.ceil(y_max / grid_len) * grid_len
+         x_bins = np.arange(x_min, x_max + grid_len, grid_len)
+         y_bins = np.arange(y_min, y_max + grid_len, grid_len)
+         return x_bins, y_bins
+
+
+     # [INNER] calculate tissue area, input for poisson_select()
+     def tissue_area(self):
+         x_bins, y_bins = self.construct_grid(grid_len = None)
+         hist, _, _ = np.histogram2d(self.transcripts["global_x"], self.transcripts["global_y"], bins = [x_bins, y_bins])
+         area = np.count_nonzero(hist) * (self.grid_len ** 2)
+         return area
+
+
+     # [INNER] calculate optimal min_samples, input for dbscan()
+     def poisson_select(self, gene_name):
+         num_trans = np.sum(self.transcripts["target"] == gene_name)
+         bg_density = num_trans / self.tissue_area()
+         cutoff_density = poisson.ppf(self.cutoff_prob, mu = self.alpha * bg_density * (np.pi * self.eps ** 2))
+         optimal_m = int(max(cutoff_density, self.low_bound))
+         return optimal_m
+
+
+     # [INTERMEDIATE] dictionary, low- and high-in-nucleus spheres for each granule marker
+     def dbscan(self, target_names = None, record_cell_id = False, write_csv = False, write_path = "./"):
+
+         if self.type != "Xenium":
+             z_grid = list(self.transcripts["global_z"].unique())
+             z_grid.sort()
+
+         if target_names is None:
+             target_names = self.gnl_genes
+         transcripts = self.transcripts[self.transcripts["target"].isin(target_names)]
+
+         num_individual, data_low, data_high = [], {}, {}
+
+         for j in target_names:
+
+             # split transcripts
+             target = transcripts[transcripts["target"] == j]
+             others = transcripts[transcripts["target"] != j]
+             tree = make_tree(d1 = np.array(others["global_x"]), d2 = np.array(others["global_y"]), d3 = np.array(others["global_z"]))
+
+             # 3D DBSCAN
+             if self.minspl is None:
+                 min_spl = self.poisson_select(j)
+             else:
+                 min_spl = self.minspl
+             X = np.array(target[["global_x", "global_y", "global_z"]])
+             db = DBSCAN(eps = self.eps, min_samples = min_spl, algorithm = "kd_tree").fit(X)
+             labels = db.labels_
+             n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
+
+             # iterate over all aggregations
+             cell_id, sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score = [], [], [], [], [], [], [], [], []
+
+             for k in range(n_clusters):
+
+                 # record cell ids
+                 if record_cell_id:
+                     temp = target[labels == k]
+                     temp_cell_id_mode = temp["cell_id"].mode()[0]
+                     cell_id.append(temp_cell_id_mode)
+
+                 # find minimum enclosing spheres
+                 mask = (labels == k)
+                 coords = X[mask]
+                 if coords.shape[0] == 0:
+                     continue
+                 temp_in_nucleus = np.sum(target["overlaps_nucleus"].values[mask])
+                 temp_size = coords.shape[0]
+                 coords_unique = np.unique(coords, axis = 0)
+                 center, r2 = miniball.get_bounding_ball(coords_unique, epsilon = 1e-8)
+                 if self.type != "Xenium":
+                     closest_z = closest(z_grid, center[2])
+                 else:
+                     closest_z = center[2]
+
+                 # calculate size, composition, and in-nucleus score
+                 other_idx = tree.query_ball_point([center[0], center[1], center[2]], np.sqrt(r2))
+                 other_trans = others.iloc[other_idx]
+                 other_in_nucleus = np.sum(other_trans["overlaps_nucleus"])
+                 other_size = other_trans.shape[0]
+                 other_comp = len(other_trans["target"].unique())
+                 total_size = temp_size + other_size
+                 total_comp = 1 + other_comp
+                 local_score = (temp_in_nucleus + other_in_nucleus) / total_size
+
+                 # record coordinate, radius, size, composition, and in-nucleus score
+                 sphere_x.append(center[0])
+                 sphere_y.append(center[1])
+                 sphere_z.append(center[2])
+                 layer_z.append(closest_z)
+                 sphere_r.append(np.sqrt(r2))
+                 sphere_size.append(total_size)
+                 sphere_comp.append(total_comp)
+                 sphere_score.append(local_score)
+
+             # basic features for all spheres from each granule marker
+             sphere = pd.DataFrame(list(zip(sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score, [j] * len(sphere_x))),
+                                   columns = ["sphere_x", "sphere_y", "sphere_z", "layer_z", "sphere_r", "size", "comp", "in_nucleus", "gene"])
+             sphere = sphere.astype({"sphere_x": float, "sphere_y": float, "sphere_z": float, "layer_z": float, "sphere_r": float, "size": float, "comp": float, "in_nucleus": float, "gene": str})
+             if record_cell_id:
+                 sphere["cell_id"] = cell_id
+                 sphere = sphere.astype({"cell_id": str})
+
+             # split low- and high-in-nucleus spheres
+             sphere_low = sphere[(sphere["sphere_r"] < self.size_thr) & (sphere["in_nucleus"] < self.in_nucleus_thr[0])]
+             sphere_high = sphere[(sphere["sphere_r"] < self.size_thr) & (sphere["in_nucleus"] > self.in_nucleus_thr[1])]
+
+             if write_csv:
+                 sphere_low.to_csv(write_path + j + " sphere.csv", index = False)
+                 sphere_high.to_csv(write_path + j + " sphere_high.csv", index = False)
+
+             num_individual.append(sphere_low.shape[0])
+             data_low[target_names.index(j)] = sphere_low
+             data_high[target_names.index(j)] = sphere_high
+             print("{} out of {} genes processed!".format(target_names.index(j) + 1, len(target_names)))
+
+         return np.sum(num_individual), data_low, data_high
+
+
+     # [INNER] merge points from two overlapped spheres, input for remove_overlaps()
+     def find_points(self, sphere_a, sphere_b):
+         transcripts = self.transcripts[self.transcripts["target"].isin(self.gnl_genes)]
+         tree_temp = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))
+         idx_a = tree_temp.query_ball_point([sphere_a["sphere_x"], sphere_a["sphere_y"], sphere_a["sphere_z"]], sphere_a["sphere_r"])
+         points_a = transcripts.iloc[idx_a]
+         points_a = points_a[points_a["target"] == sphere_a["gene"]]
+         idx_b = tree_temp.query_ball_point([sphere_b["sphere_x"], sphere_b["sphere_y"], sphere_b["sphere_z"]], sphere_b["sphere_r"])
+         points_b = transcripts.iloc[idx_b]
+         points_b = points_b[points_b["target"] == sphere_b["gene"]]
+         points = pd.concat([points_a, points_b])
+         points = points[["global_x", "global_y", "global_z"]]
+         return points
+
+
+     def remove_overlaps(self, set_a, set_b):
+
+         set_a = set_a.copy()
+         set_b = set_b.copy()
+
+         # find possible overlaps on 2D by r-tree
+         idx_b = make_rtree(set_b)
+         for i, sphere_a in set_a.iterrows():
+             center_a_3D = np.array([sphere_a.sphere_x, sphere_a.sphere_y, sphere_a.sphere_z])
+             bounds_a = (sphere_a.sphere_x - sphere_a.sphere_r,
+                         sphere_a.sphere_y - sphere_a.sphere_r,
+                         sphere_a.sphere_x + sphere_a.sphere_r,
+                         sphere_a.sphere_y + sphere_a.sphere_r)
+             possible_overlaps = idx_b.intersection(bounds_a)
+
+             # search 3D overlaps within possible overlaps
+             for j in possible_overlaps:
+                 if j in set_b.index:
+                     sphere_b = set_b.loc[j]
+                     center_b_3D = np.array([sphere_b.sphere_x, sphere_b.sphere_y, sphere_b.sphere_z])
+                     dist = np.linalg.norm(center_a_3D - center_b_3D)
+                     radius_sum = sphere_a.sphere_r + sphere_b.sphere_r
+                     radius_diff = sphere_a.sphere_r - sphere_b.sphere_r
+
+                     # relative positions (0: internal & intersect, 1: internal, 2: intersect)
+                     c0 = (dist < self.l * radius_sum)
+                     c1 = (dist <= self.l * np.abs(radius_diff))
+                     c1_1 = (radius_diff > 0)
+                     c2_1 = (dist < self.rho * self.l * radius_sum)
+
+                     # operations on dataframes
+                     if c0:
+                         if c1 and c1_1:          # keep A and remove B
+                             set_b.drop(index = j, inplace = True)
+                         elif c1 and not c1_1:    # replace A with B and remove B
+                             set_a.loc[i] = set_b.loc[j]
+                             set_b.drop(index = j, inplace = True)
+                         elif not c1 and c2_1:    # replace A with new sphere and remove B
+                             points_union = np.array(self.find_points(sphere_a, sphere_b))
+                             new_center, new_r2 = miniball.get_bounding_ball(points_union, epsilon = 1e-8)
+                             set_a.loc[i, "sphere_x"] = new_center[0]
+                             set_a.loc[i, "sphere_y"] = new_center[1]
+                             set_a.loc[i, "sphere_z"] = new_center[2]
+                             set_a.loc[i, "sphere_r"] = self.s * np.sqrt(new_r2)   # get_bounding_ball returns the squared radius
+                             set_b.drop(index = j, inplace = True)
+
+         set_a = set_a.reset_index(drop = True)
+         set_b = set_b.reset_index(drop = True)
+         return set_a, set_b
+
+
+     # [INNER] merge spheres from different granule markers, input for detect()
+     def merge_sphere(self, sphere_dict):
+         sphere = sphere_dict[0].copy()
+         for j in range(1, len(self.gnl_genes)):
+             target_sphere = sphere_dict[j]
+             sphere, target_sphere_new = self.remove_overlaps(sphere, target_sphere)
+             sphere = pd.concat([sphere, target_sphere_new])
+             sphere = sphere.reset_index(drop = True)
+         return sphere
+
+
+     # [INNER] negative control filtering, input for detect()
+     def nc_filter(self, sphere_low, sphere_high):
+
+         # negative control gene profiling
+         adata_low = self.profile(sphere_low, self.nc_genes)
+         adata_high = self.profile(sphere_high, self.nc_genes)
+         adata = anndata.concat([adata_low, adata_high], axis = 0, merge = "same")
+         adata.var["genes"] = adata.var.index
+         adata.obs_names = [str(i) for i in range(adata.shape[0])]
+         adata.obs["type"] = ["low"] * adata_low.shape[0] + ["high"] * adata_high.shape[0]
+         adata.obs["type"] = pd.Categorical(adata.obs["type"], categories = ["low", "high"], ordered = True)
+
+         # DE analysis of negative control genes
+         sc.tl.rank_genes_groups(adata, "type", method = "t-test")
+         names = adata.uns["rank_genes_groups"]["names"]
+         names = pd.DataFrame(names)
+         logfc = adata.uns["rank_genes_groups"]["logfoldchanges"]
+         logfc = pd.DataFrame(logfc)
+         pvals = adata.uns["rank_genes_groups"]["pvals"]
+         pvals = pd.DataFrame(pvals)
+
+         # select top upregulated negative control genes
+         df = pd.DataFrame({"names": names["high"], "logfc": logfc["high"], "pvals": pvals["high"]})
+         df = df[df["logfc"] >= 0]
+         df = df.sort_values(by = ["pvals"], ascending = True)
+         nc_genes_final = list(df["names"].head(self.nc_top))
+
+         # negative control filtering
+         nc_transcripts_final = self.transcripts[self.transcripts["target"].isin(nc_genes_final)]
+         tree = make_tree(d1 = np.array(nc_transcripts_final["global_x"]), d2 = np.array(nc_transcripts_final["global_y"]), d3 = np.array(nc_transcripts_final["global_z"]))
+         centers = sphere_low[["sphere_x", "sphere_y", "sphere_z"]].to_numpy()
+         radii = sphere_low["sphere_r"].to_numpy()
+         sizes = sphere_low["size"].to_numpy()
+         counts = np.array([len(tree.query_ball_point(c, r)) for c, r in zip(centers, radii)])
+         pass_idx = (counts == 0) | (counts / sizes < self.nc_thr)
+         sphere = sphere_low[pass_idx].reset_index(drop = True)
+         return sphere
+
+
+     # [MAIN] dataframe, granule metadata
+     def detect(self):
+
+         _, data_low, data_high = self.dbscan()
+
+         print("Merging spheres...")
+         sphere_low, sphere_high = self.merge_sphere(data_low), self.merge_sphere(data_high)
+
+         if self.nc_genes is None:
+             return sphere_low
+         else:
+             print("Negative control filtering...")
+             return self.nc_filter(sphere_low, sphere_high)
+
+
+     # [MAIN] anndata, granule spatial transcriptome profile
+     def profile(self, granule, genes = None, print_itr = False):
+
+         if genes is None:
+             genes = list(self.transcripts["target"].unique())
+             transcripts = self.transcripts
+         else:
+             transcripts = self.transcripts[self.transcripts["target"].isin(genes)]
+
+         gene_to_idx = {g: i for i, g in enumerate(genes)}
+         gene_array = transcripts["target"].to_numpy()
+         tree = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))
+
+         n_gnl = granule.shape[0]
+         n_gene = len(genes)
+         data, row_idx, col_idx = [], [], []
+
+         # iterate over all granules to count nearby transcripts
+         for i in range(n_gnl):
+             temp = granule.iloc[i]
+             target_idx = tree.query_ball_point([temp["sphere_x"], temp["sphere_y"], temp["layer_z"]], temp["sphere_r"])
+             if not target_idx:
+                 continue
+             local_genes = gene_array[target_idx]   # extract genes for those nearby transcripts
+             counts = Counter(local_genes)          # count how many times each gene occurs
+             for g, cnt in counts.items():          # append nonzero entries to sparse matrix lists
+                 j = gene_to_idx[g]                 # get gene column index
+                 data.append(cnt)                   # nonzero count
+                 row_idx.append(i)                  # row index = granule index
+                 col_idx.append(j)                  # column index = gene index
+             if print_itr and (i % 5000 == 0):
+                 print(f"{i} out of {n_gnl} granules profiled!")
+
+         # construct sparse spatial transcriptome profile, (n_granules × n_genes)
+         X = csr_matrix((data, (row_idx, col_idx)), shape = (n_gnl, n_gene), dtype = np.float32)
+         adata = anndata.AnnData(X = X, obs = granule.copy())
+         adata.obs["granule_id"] = [f"gnl_{i}" for i in range(n_gnl)]
+         adata.obs = adata.obs.astype({"granule_id": str})
+         adata.obs.rename(columns = {"sphere_x": "global_x", "sphere_y": "global_y", "sphere_z": "global_z"}, inplace = True)
+         adata.var["genes"] = genes
+         adata.var_names = genes
+         return adata
+
+
+     # [MAIN] anndata, spot-level gene expression
+     def spot_expression(self, grid_len, genes = None):
+
+         if genes is None:
+             genes = list(self.transcripts["target"].unique())
+             transcripts = self.transcripts
+         else:
+             transcripts = self.transcripts[self.transcripts["target"].isin(genes)]
+
+         # construct bins
+         x_bins, y_bins = self.construct_grid(grid_len = grid_len)
+
+         # initialize data
+         X = np.zeros((len(genes), (len(x_bins) - 1) * (len(y_bins) - 1)))
+         global_x, global_y = [], []
+
+         # coordinates
+         for i in list(x_bins)[:-1]:
+             center_x = i + 0.5 * grid_len
+             for j in list(y_bins)[:-1]:
+                 center_y = j + 0.5 * grid_len
+                 global_x.append(center_x)
+                 global_y.append(center_y)
+
+         # count matrix
+         for k_idx, k in enumerate(genes):
+             target_gene = transcripts[transcripts["target"] == k]
+             count_gene, _, _ = np.histogram2d(target_gene["global_x"], target_gene["global_y"], bins = [x_bins, y_bins])
+             X[k_idx, :] = count_gene.flatten()
+             if k_idx % 100 == 0:
+                 print("{} out of {} genes profiled!".format(k_idx, len(genes)))
+
+         # spot id
+         spot_id = ["spot_" + str(i) for i in range(len(global_x))]
+
+         # assemble data
+         adata = anndata.AnnData(X = np.transpose(X))
+         adata.obs["spot_id"] = spot_id
+         adata.obs["global_x"] = global_x
+         adata.obs["global_y"] = global_y
+         adata.var["genes"] = genes
+         adata.var_names = genes
+         return adata
+
+
401
+ # [MAIN] anndata, spot-level neuron metadata
402
+ def spot_neuron(adata_neuron, spot, grid_len = 50, neuron_loc_key = ["global_x", "global_y"], spot_loc_key = ["global_x", "global_y"]):
403
+
404
+ adata_neuron = adata_neuron.copy()
405
+ neurons = adata_neuron.obs
406
+ spot = spot.copy()
407
+
408
+ half_len = grid_len / 2
409
+
410
+ indicator, neuron_count = [], []
411
+
412
+ for _, row in spot.obs.iterrows():
413
+
414
+ x = row[spot_loc_key[0]]
415
+ y = row[spot_loc_key[1]]
416
+ neuron_temp = neurons[(neurons[neuron_loc_key[0]] > x - half_len) & (neurons[neuron_loc_key[0]] < x + half_len) & (neurons[neuron_loc_key[1]] > y - half_len) & (neurons[neuron_loc_key[1]] < y + half_len)]
417
+ indicator.append(int(len(neuron_temp) > 0))
418
+ neuron_count.append(len(neuron_temp))
419
+
420
+ spot.obs["indicator"] = indicator
421
+ spot.obs["neuron_count"] = neuron_count
422
+ return spot
423
+
424
+
425
+ # [MAIN] anndata, spot-level granule metadata
426
+ def spot_granule(granule, spot, grid_len = 50, gnl_loc_key = ["sphere_x", "sphere_y"], spot_loc_key = ["global_x", "global_y"]):
427
+
428
+ granule = granule.copy()
429
+ spot = spot.copy()
430
+
431
+ half_len = grid_len / 2
432
+
433
+ indicator, granule_count, granule_radius, granule_size, granule_score = [], [], [], [], []
434
+
435
+ for _, row in spot.obs.iterrows():
436
+
437
+ x = row[spot_loc_key[0]]
438
+ y = row[spot_loc_key[1]]
439
+ gnl_temp = granule[(granule[gnl_loc_key[0]] >= x - half_len) & (granule[gnl_loc_key[0]] < x + half_len) & (granule[gnl_loc_key[1]] >= y - half_len) & (granule[gnl_loc_key[1]] < y + half_len)]
440
+ indicator.append(int(len(gnl_temp) > 0))
441
+ granule_count.append(len(gnl_temp))
442
+
443
+ if len(gnl_temp) == 0:
444
+ granule_radius.append(0)
445
+ granule_size.append(0)
446
+ granule_score.append(0)
447
+ else:
448
+ granule_radius.append(np.nanmean(gnl_temp["sphere_r"]))
449
+ granule_size.append(np.nanmean(gnl_temp["size"]))
450
+ granule_score.append(np.nanmean(gnl_temp["in_nucleus"]))
451
+
452
+ spot.obs["indicator"] = indicator
453
+ spot.obs["gnl_count"] = granule_count
454
+ spot.obs["gnl_radius"] = granule_radius
455
+ spot.obs["gnl_size"] = granule_size
456
+ spot.obs["gnl_score"] = granule_score
457
+ return spot
458
+
459
+
+     # [MAIN] anndata, neuron-granule colocalization
+     @staticmethod
+     def neighbor_granule(adata_neuron, granule_adata, radius = 10, sigma = None, loc_key = ["global_x", "global_y"]):
+
+         adata_neuron = adata_neuron.copy()
+         granule_adata = granule_adata.copy()
+
+         if sigma is None:
+             sigma = radius / 2
+
+         # neuron and granule coordinates
+         neuron_coords = adata_neuron.obs[loc_key].values
+         gnl_coords = granule_adata.obs[loc_key].values
+
+         # make tree
+         tree = make_tree(d1 = gnl_coords[:, 0], d2 = gnl_coords[:, 1])
+
+         # query neighboring granules for each neuron
+         neighbor_indices = tree.query_ball_point(neuron_coords, r = radius)
+
+         # record count and indices
+         granule_counts = np.array([len(indices) for indices in neighbor_indices])
+         adata_neuron.obs["neighbor_gnl_count"] = granule_counts
+         adata_neuron.uns["neighbor_gnl_indices"] = neighbor_indices
+
+         # ---------- neighboring granule expression matrix ---------- #
+         n_neurons, n_genes = adata_neuron.n_obs, adata_neuron.n_vars
+         weighted_expr = np.zeros((n_neurons, n_genes))
+
+         for i, indices in enumerate(neighbor_indices):
+             if len(indices) == 0:
+                 continue
+             distances = np.linalg.norm(gnl_coords[indices] - neuron_coords[i], axis = 1)
+             weights = np.exp(- (distances ** 2) / (2 * sigma ** 2))
+             weights = weights / weights.sum()
+             expr = granule_adata.X[indices]
+             if hasattr(expr, "toarray"):   # densify sparse rows before averaging
+                 expr = expr.toarray()
+             weighted_expr[i] = np.average(expr, axis = 0, weights = weights)
+
+         adata_neuron.obsm["weighted_gnl_expression"] = weighted_expr
+
+         # ---------- neighboring granule spatial feature ---------- #
+         features = []
+
+         for i, gnl_idx in enumerate(neighbor_indices):
+
+             feats = {}
+             feats["n_granules"] = len(gnl_idx)
+
+             if len(gnl_idx) == 0:
+                 feats.update({"mean_distance": np.nan, "std_distance": np.nan, "radius_max": np.nan, "radius_min": np.nan, "density": 0, "center_offset_norm": np.nan, "anisotropy_ratio": np.nan})
+             else:
+                 gnl_pos = gnl_coords[gnl_idx]
+                 neuron_pos = neuron_coords[i]
+                 dists = np.linalg.norm(gnl_pos - neuron_pos, axis = 1)
+                 feats["mean_distance"] = dists.mean()
+                 feats["std_distance"] = dists.std()
+                 feats["radius_max"] = dists.max()
+                 feats["radius_min"] = dists.min()
+                 feats["density"] = len(gnl_idx) / (np.pi * radius ** 2)
+                 centroid = gnl_pos.mean(axis = 0)
+                 offset = centroid - neuron_pos
+                 feats["center_offset_norm"] = np.linalg.norm(offset)
+                 cov = np.cov((gnl_pos - neuron_pos).T)
+                 eigvals = np.linalg.eigvalsh(cov)
+                 if np.min(eigvals) > 0:
+                     feats["anisotropy_ratio"] = np.max(eigvals) / np.min(eigvals)
+                 else:
+                     feats["anisotropy_ratio"] = np.nan
+
+             features.append(feats)
+
+         spatial_df = pd.DataFrame(features, index = adata_neuron.obs_names)
+         return adata_neuron, spatial_df
+
+
+     # [MAIN] numpy array, neuron embeddings based on neighboring granules
+     @staticmethod
+     def neuron_embedding_one_hot(adata_neuron, granule_adata, k = 10, radius = 10, loc_key = ["global_x", "global_y"], gnl_subtype_key = "granule_subtype_kmeans", padding_value = "Others"):
+
+         adata_neuron = adata_neuron.copy()
+         granule_adata = granule_adata.copy()
+
+         # neuron and granule coordinates, granule subtypes
+         neuron_coords = adata_neuron.obs[loc_key].to_numpy()
+         granule_coords = granule_adata.obs[loc_key].to_numpy()
+         granule_subtypes = granule_adata.obs[gnl_subtype_key].astype(str).to_numpy()
+
+         # include padding category
+         unique_subtypes = np.unique(granule_subtypes).tolist()
+         if padding_value not in unique_subtypes:
+             unique_subtypes.append(padding_value)
+
+         encoder = OneHotEncoder(categories = [unique_subtypes], sparse_output = False, handle_unknown = "ignore")   # sparse_output replaces the sparse argument removed in scikit-learn 1.4
+         encoder.fit(np.array(unique_subtypes).reshape(-1, 1))
+         S = len(unique_subtypes)
+
+         # k-d tree
+         tree = make_tree(d1 = granule_coords[:, 0], d2 = granule_coords[:, 1])
+         distances, indices = tree.query(neuron_coords, k = k, distance_upper_bound = radius)
+
+         # initialize output
+         n_neurons = neuron_coords.shape[0]
+         embeddings = np.zeros((n_neurons, k, S), dtype = float)
+
+         for i in range(n_neurons):
+             for rank in range(k):   # "rank" avoids shadowing the neighbor count k
+                 idx = indices[i, rank]
+                 dist = distances[i, rank]
+                 if idx == granule_coords.shape[0] or np.isinf(dist):
+                     subtype = padding_value
+                 else:
+                     subtype = granule_subtypes[idx]
+                 onehot = encoder.transform([[subtype]])[0]
+                 embeddings[i, rank, :] = onehot
+
+         return embeddings, encoder.categories_[0]
+
+
+     # [MAIN] numpy array, neuron embeddings based on neighboring granules
+     @staticmethod
+     def neuron_embedding_spatial_weight(adata_neuron, granule_adata, radius = 10, sigma = 10, loc_key = ["global_x", "global_y"], gnl_subtype_key = "granule_subtype_kmeans", padding_value = "Others"):
+
+         adata_neuron = adata_neuron.copy()
+         granule_adata = granule_adata.copy()
+
+         # neuron and granule coordinates, granule subtypes
+         neuron_coords = adata_neuron.obs[loc_key].to_numpy()
+         granule_coords = granule_adata.obs[loc_key].to_numpy()
+         granule_subtypes = granule_adata.obs[gnl_subtype_key].astype(str).to_numpy()
+
+         # include padding category
+         unique_subtypes = np.unique(granule_subtypes).tolist()
+         if padding_value not in unique_subtypes:
+             unique_subtypes.append(padding_value)
+
+         encoder = OneHotEncoder(categories = [unique_subtypes], sparse_output = False, handle_unknown = "ignore")   # sparse_output replaces the sparse argument removed in scikit-learn 1.4
+         encoder.fit(np.array(unique_subtypes).reshape(-1, 1))
+         S = len(unique_subtypes)
+
+         # k-d tree
+         tree = make_tree(d1 = granule_coords[:, 0], d2 = granule_coords[:, 1])
+         all_neighbors = tree.query_ball_point(neuron_coords, r = radius)
+
+         # initialize output
+         n_neurons = neuron_coords.shape[0]
+         embeddings = np.zeros((n_neurons, S), dtype = float)
+
+         for i, neighbor_indices in enumerate(all_neighbors):
+             if not neighbor_indices:
+                 # no neighbors, assign to padding subtype
+                 embeddings[i] = encoder.transform([[padding_value]])[0]
+                 continue
+
+             # get neighbor subtypes and distances
+             neighbor_coords = granule_coords[neighbor_indices]
+             dists = np.linalg.norm(neuron_coords[i] - neighbor_coords, axis = 1)
+             weights = np.exp(- dists / sigma)
+
+             # encode subtypes to one-hot and weight them
+             subtypes = granule_subtypes[neighbor_indices]
+             onehots = encoder.transform(subtypes.reshape(-1, 1))
+             weighted_sum = (weights[:, np.newaxis] * onehots).sum(axis = 0)
+
+             # normalize to make it a composition vector
+             embeddings[i] = weighted_sum / weights.sum()
+
+         return embeddings, encoder.categories_[0]
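
To make the new module concrete, here is a minimal usage sketch of the class added above. The constructor reads a transcripts dataframe with columns target, global_x, global_y, global_z, and overlaps_nucleus (plus cell_id when record_cell_id is used), as the code above shows; the file path and marker gene lists are hypothetical placeholders, not part of the release:

    import pandas as pd
    from mcDETECT.model import mcDETECT

    # hypothetical iST transcripts table with the columns model.py expects
    transcripts = pd.read_csv("detected_transcripts.csv")

    detector = mcDETECT(
        type = "MERSCOPE",                  # platform: MERSCOPE, Xenium, or CosMx
        transcripts = transcripts,
        gnl_genes = ["Camk2a", "Shank1"],   # hypothetical granule markers
        nc_genes = None,                    # None skips negative control filtering in detect()
    )

    m = detector.poisson_select("Camk2a")   # data-driven min_samples from the Poisson tail
    granules = detector.detect()            # dataframe of granule metadata (sphere_x, ..., in_nucleus, gene)
    adata = detector.profile(granules)      # AnnData, granule-by-gene spatial transcriptome profile
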
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mcDETECT
- Version: 2.0.6
+ Version: 2.0.8
  Summary: Uncovering the dark transcriptome in polarized neuronal compartments with mcDETECT
  Home-page: https://github.com/chen-yang-yuan/mcDETECT
  Author: Chenyang Yuan
@@ -3,6 +3,7 @@ README.md
  setup.py
  mcDETECT/__init__.py
  mcDETECT/model.py
+ mcDETECT/model_new_but_wrong.py
  mcDETECT/utils.py
  mcDETECT.egg-info/PKG-INFO
  mcDETECT.egg-info/SOURCES.txt
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
  setup(
      name = "mcDETECT",
-     version = "2.0.6",
+     version = "2.0.8",
      packages = find_packages(),
      install_requires = ["anndata", "miniball", "numpy", "pandas", "rtree", "scanpy", "scikit-learn", "scipy", "shapely"],
      author = "Chenyang Yuan",
@@ -1,6 +0,0 @@
- __version__ = "2.0.6"
-
- from . import model
- from . import utils
-
- __all__ = ["model", "utils"]