mcDETECT 2.0.6__py3-none-any.whl → 2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcDETECT might be problematic.
- mcDETECT/__init__.py +1 -1
- mcDETECT/model.py +19 -22
- mcDETECT/model_new_but_wrong.py +625 -0
- {mcdetect-2.0.6.dist-info → mcdetect-2.0.7.dist-info}/METADATA +1 -1
- mcdetect-2.0.7.dist-info/RECORD +9 -0
- mcdetect-2.0.6.dist-info/RECORD +0 -8
- {mcdetect-2.0.6.dist-info → mcdetect-2.0.7.dist-info}/WHEEL +0 -0
- {mcdetect-2.0.6.dist-info → mcdetect-2.0.7.dist-info}/licenses/LICENSE +0 -0
- {mcdetect-2.0.6.dist-info → mcdetect-2.0.7.dist-info}/top_level.txt +0 -0
mcDETECT/__init__.py
CHANGED
mcDETECT/model.py
CHANGED
@@ -20,12 +20,12 @@ from .utils import *
 class mcDETECT:
 
 
-    def __init__(self, type, transcripts, gnl_genes, nc_genes = None, eps = 1.5, minspl = None, grid_len = 1.0, cutoff_prob = 0.95, alpha = 5.0, low_bound = 3,
+    def __init__(self, type, transcripts, syn_genes, nc_genes = None, eps = 1.5, minspl = None, grid_len = 1.0, cutoff_prob = 0.95, alpha = 5.0, low_bound = 3,
                  size_thr = 4.0, in_nucleus_thr = (0.5, 0.5), l = 1.0, rho = 0.2, s = 1.0, nc_top = 20, nc_thr = 0.1):
 
         self.type = type # string, iST platform, now support MERSCOPE, Xenium, and CosMx
         self.transcripts = transcripts # dataframe, transcripts file
-        self.gnl_genes = gnl_genes # list, string, all granule markers
+        self.syn_genes = syn_genes # list, string, all synaptic markers
         self.nc_genes = nc_genes # list, string, all negative controls
         self.eps = eps # numeric, searching radius epsilon
         self.minspl = minspl # integer, manually select min_samples, i.e., no automatic parameter selection
@@ -59,11 +59,10 @@
 
     # [INNER] calculate tissue area, input for poisson_select()
     def tissue_area(self):
-        if not hasattr(self, "_cached_area"):
-            x_bins, y_bins = self.construct_grid(grid_len = None)
-            hist, _, _ = np.histogram2d(self.transcripts["global_x"], self.transcripts["global_y"], bins = [x_bins, y_bins])
-            self._cached_area = np.count_nonzero(hist) * (self.grid_len ** 2)
-        return self._cached_area
+        x_bins, y_bins = self.construct_grid(grid_len = None)
+        hist, _, _ = np.histogram2d(self.transcripts["global_x"], self.transcripts["global_y"], bins = [x_bins, y_bins])
+        area = np.count_nonzero(hist) * (self.grid_len ** 2)
+        return area
 
 
     # [INNER] calculate optimal min_samples, input for dbscan()
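Note on the hunk above: 2.0.6 memoized the computed area in a `_cached_area` attribute, while 2.0.7 recomputes it on every call. A minimal standalone sketch of the caching pattern that was removed (the hypothetical `compute()` stands in for the histogram step):

import numpy as np

class TissueAreaExample:
    def compute(self):
        # stand-in for the expensive np.histogram2d step
        return 42.0

    def tissue_area(self):
        # compute once, reuse the cached value on later calls
        if not hasattr(self, "_cached_area"):
            self._cached_area = self.compute()
        return self._cached_area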
@@ -75,7 +74,7 @@
         return optimal_m
 
 
-    # [INTERMEDIATE] dictionary, low- and high-in-nucleus spheres for each granule marker
+    # [INTERMEDIATE] dictionary, low- and high-in-nucleus spheres for each synaptic marker
     def dbscan(self, target_names = None, record_cell_id = False, write_csv = False, write_path = "./"):
 
         if self.type != "Xenium":
@@ -83,18 +82,16 @@
             z_grid.sort()
 
         if target_names is None:
-            target_names = self.gnl_genes
-
+            target_names = self.syn_genes
         transcripts = self.transcripts[self.transcripts["target"].isin(target_names)]
-        grouped = {g: df for g, df in transcripts.groupby("target")}
 
         num_individual, data_low, data_high = [], {}, {}
 
         for j in target_names:
 
             # split transcripts
-            target = grouped[j]
-            others = pd.concat([grouped[g] for g in target_names if g != j], ignore_index = True)
+            target = transcripts[transcripts["target"] == j]
+            others = transcripts[transcripts["target"] != j]
             tree = make_tree(d1 = np.array(others["global_x"]), d2 = np.array(others["global_y"]), d3 = np.array(others["global_z"]))
 
             # 3D DBSCAN
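The hunk above also swaps a precomputed groupby dictionary for per-gene boolean masks; both are standard pandas idioms. A toy sketch of the two equivalent approaches (hypothetical data, not from the package):

import pandas as pd

df = pd.DataFrame({"target": ["a", "b", "a"], "global_x": [0.0, 1.0, 2.0]})

# 2.0.6 style: group once, then look up each gene
grouped = {g: sub for g, sub in df.groupby("target")}
target = grouped["a"]
others = pd.concat([grouped[g] for g in ["b"]], ignore_index = True)

# 2.0.7 style: filter with boolean masks per gene
target = df[df["target"] == "a"]
others = df[df["target"] != "a"]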
@@ -171,14 +168,14 @@
             num_individual.append(sphere_low.shape[0])
             data_low[target_names.index(j)] = sphere_low
             data_high[target_names.index(j)] = sphere_high
-            print(f"{target_names.index(j) + 1} of {len(target_names)} genes processed!")
+            print("{} out of {} genes processed!".format(target_names.index(j) + 1, len(target_names)))
 
         return np.sum(num_individual), data_low, data_high
 
 
     # [INNER] merge points from two overlapped spheres, input for remove_overlaps()
     def find_points(self, sphere_a, sphere_b):
-        transcripts = self.transcripts[self.transcripts["target"].isin(self.gnl_genes)]
+        transcripts = self.transcripts[self.transcripts["target"].isin(self.syn_genes)]
         tree_temp = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))
         idx_a = tree_temp.query_ball_point([sphere_a["sphere_x"], sphere_a["sphere_y"], sphere_a["sphere_z"]], sphere_a["sphere_r"])
         points_a = transcripts.iloc[idx_a]
@@ -242,10 +239,10 @@
         return set_a, set_b
 
 
-    # [INNER] merge spheres from different granule markers, input for detect()
+    # [INNER] merge spheres from different synaptic markers, input for detect()
     def merge_sphere(self, sphere_dict):
         sphere = sphere_dict[0].copy()
-        for j in range(1, len(self.gnl_genes)):
+        for j in range(1, len(self.syn_genes)):
             target_sphere = sphere_dict[j]
             sphere, target_sphere_new = self.remove_overlaps(sphere, target_sphere)
             sphere = pd.concat([sphere, target_sphere_new])
@@ -292,10 +289,10 @@
         return sphere
 
 
-    # [MAIN] dataframe, granule metadata
-    def detect(self, record_cell_id = False):
+    # [MAIN] dataframe, synapse metadata
+    def detect(self):
 
-        _, data_low, data_high = self.dbscan(record_cell_id = record_cell_id)
+        _, data_low, data_high = self.dbscan()
 
         print("Merging spheres...")
         sphere_low, sphere_high = self.merge_sphere(data_low), self.merge_sphere(data_high)
@@ -307,7 +304,7 @@
             return self.nc_filter(sphere_low, sphere_high)
 
 
-    # [MAIN] anndata, granule spatial transcriptome profile
+    # [MAIN] anndata, synapse spatial transcriptome profile
     def profile(self, granule, genes = None, print_itr = False):
 
         if genes is None:
@@ -382,7 +379,7 @@
             count_gene, _, _ = np.histogram2d(target_gene["global_x"], target_gene["global_y"], bins = [x_bins, y_bins])
             X[k_idx, :] = count_gene.flatten()
             if k_idx % 100 == 0:
-                print(f"{k_idx} out of {len(genes)} genes profiled!")
+                print("{} out of {} genes profiled!".format(k_idx, len(genes)))
 
         # spot id
         spot_id = []
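Taken together, the model.py changes rename the marker-gene argument (`gnl_genes` → `syn_genes`, with "granule" → "synaptic"/"synapse" in comments) and drop `record_cell_id` from `detect()`. A hedged migration sketch (the import path and argument values are illustrative placeholders, not from the package docs):

from mcDETECT.model import mcDETECT

# transcripts: pandas DataFrame of detected transcripts; markers: list of marker genes

# 2.0.6:
# model = mcDETECT("Xenium", transcripts, gnl_genes = markers)
# spheres = model.detect(record_cell_id = False)

# 2.0.7:
model = mcDETECT("Xenium", transcripts, syn_genes = markers)
spheres = model.detect()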
mcDETECT/model_new_but_wrong.py
ADDED

@@ -0,0 +1,625 @@
+import anndata
+import math
+import miniball
+import numpy as np
+import pandas as pd
+import scanpy as sc
+from collections import Counter
+from rtree import index
+from scipy.sparse import csr_matrix
+from scipy.spatial import cKDTree
+from scipy.stats import poisson
+from shapely.geometry import Point
+from sklearn.cluster import DBSCAN
+from sklearn.preprocessing import OneHotEncoder
+
+
+from .utils import *
+
+
+class mcDETECT:
+
+
+    def __init__(self, type, transcripts, gnl_genes, nc_genes = None, eps = 1.5, minspl = None, grid_len = 1.0, cutoff_prob = 0.95, alpha = 5.0, low_bound = 3,
+                 size_thr = 4.0, in_nucleus_thr = (0.5, 0.5), l = 1.0, rho = 0.2, s = 1.0, nc_top = 20, nc_thr = 0.1):
+
+        self.type = type # string, iST platform, now support MERSCOPE, Xenium, and CosMx
+        self.transcripts = transcripts # dataframe, transcripts file
+        self.gnl_genes = gnl_genes # list, string, all granule markers
+        self.nc_genes = nc_genes # list, string, all negative controls
+        self.eps = eps # numeric, searching radius epsilon
+        self.minspl = minspl # integer, manually select min_samples, i.e., no automatic parameter selection
+        self.grid_len = grid_len # numeric, length of grids for computing the tissue area
+        self.cutoff_prob = cutoff_prob # numeric, cutoff probability in parameter selection for min_samples
+        self.alpha = alpha # numeric, scaling factor in parameter selection for min_samples
+        self.low_bound = low_bound # integer, lower bound in parameter selection for min_samples
+        self.size_thr = size_thr # numeric, threshold for maximum radius of an aggregation
+        self.in_nucleus_thr = in_nucleus_thr # 2-d tuple, threshold for low- and high-in-nucleus ratio
+        self.l = l # numeric, scaling factor for seaching overlapped spheres
+        self.rho = rho # numeric, threshold for determining overlaps
+        self.s = s # numeric, scaling factor for merging overlapped spheres
+        self.nc_top = nc_top # integer, number of negative controls retained for filtering
+        self.nc_thr = nc_thr # numeric, threshold for negative control filtering
+
+
+    # [INNER] construct grids, input for tissue_area()
+    def construct_grid(self, grid_len = None):
+        if grid_len is None:
+            grid_len = self.grid_len
+        x_min, x_max = np.min(self.transcripts["global_x"]), np.max(self.transcripts["global_x"])
+        y_min, y_max = np.min(self.transcripts["global_y"]), np.max(self.transcripts["global_y"])
+        x_min = np.floor(x_min / grid_len) * grid_len
+        x_max = np.ceil(x_max / grid_len) * grid_len
+        y_min = np.floor(y_min / grid_len) * grid_len
+        y_max = np.ceil(y_max / grid_len) * grid_len
+        x_bins = np.arange(x_min, x_max + grid_len, grid_len)
+        y_bins = np.arange(y_min, y_max + grid_len, grid_len)
+        return x_bins, y_bins
+
+
+    # [INNER] calculate tissue area, input for poisson_select()
+    def tissue_area(self):
+        if not hasattr(self, "_cached_area"):
+            x_bins, y_bins = self.construct_grid(grid_len = None)
+            hist, _, _ = np.histogram2d(self.transcripts["global_x"], self.transcripts["global_y"], bins = [x_bins, y_bins])
+            self._cached_area = np.count_nonzero(hist) * (self.grid_len ** 2)
+        return self._cached_area
+
+
+    # [INNER] calculate optimal min_samples, input for dbscan()
+    def poisson_select(self, gene_name):
+        num_trans = np.sum(self.transcripts["target"] == gene_name)
+        bg_density = num_trans / self.tissue_area()
+        cutoff_density = poisson.ppf(self.cutoff_prob, mu = self.alpha * bg_density * (np.pi * self.eps ** 2))
+        optimal_m = int(max(cutoff_density, self.low_bound))
+        return optimal_m
+
+
+    # [INTERMEDIATE] dictionary, low- and high-in-nucleus spheres for each granule marker
+    def dbscan(self, target_names = None, record_cell_id = False, write_csv = False, write_path = "./"):
+
+        if self.type != "Xenium":
+            z_grid = list(self.transcripts["global_z"].unique())
+            z_grid.sort()
+
+        if target_names is None:
+            target_names = self.gnl_genes
+
+        transcripts = self.transcripts[self.transcripts["target"].isin(target_names)]
+        grouped = {g: df for g, df in transcripts.groupby("target")}
+
+        num_individual, data_low, data_high = [], {}, {}
+
+        for j in target_names:
+
+            # split transcripts
+            target = grouped[j]
+            others = pd.concat([grouped[g] for g in target_names if g != j], ignore_index = True)
+            tree = make_tree(d1 = np.array(others["global_x"]), d2 = np.array(others["global_y"]), d3 = np.array(others["global_z"]))
+
+            # 3D DBSCAN
+            if self.minspl is None:
+                min_spl = self.poisson_select(j)
+            else:
+                min_spl = self.minspl
+            X = np.array(target[["global_x", "global_y", "global_z"]])
+            db = DBSCAN(eps = self.eps, min_samples = min_spl, algorithm = "kd_tree").fit(X)
+            labels = db.labels_
+            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
+
+            # iterate over all aggregations
+            cell_id, sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score = [], [], [], [], [], [], [], [], []
+
+            for k in range(n_clusters):
+
+                # record cell ids
+                if record_cell_id:
+                    temp = target[labels == k]
+                    temp_cell_id_mode = temp["cell_id"].mode()[0]
+                    cell_id.append(temp_cell_id_mode)
+
+                # find minimum enclosing spheres
+                mask = (labels == k)
+                coords = X[mask]
+                if coords.shape[0] == 0:
+                    continue
+                temp_in_nucleus = np.sum(target["overlaps_nucleus"].values[mask])
+                temp_size = coords.shape[0]
+                coords_unique = np.unique(coords, axis=0)
+                center, r2 = miniball.get_bounding_ball(coords_unique, epsilon=1e-8)
+                if self.type != "Xenium":
+                    closest_z = closest(z_grid, center[2])
+                else:
+                    closest_z = center[2]
+
+                # calculate size, composition, and in-nucleus score
+                other_idx = tree.query_ball_point([center[0], center[1], center[2]], np.sqrt(r2))
+                other_trans = others.iloc[other_idx]
+                other_in_nucleus = np.sum(other_trans["overlaps_nucleus"])
+                other_size = other_trans.shape[0]
+                other_comp = len(other_trans["target"].unique())
+                total_size = temp_size + other_size
+                total_comp = 1 + other_comp
+                local_score = (temp_in_nucleus + other_in_nucleus) / total_size
+
+                # record coordinate, radius, size, composition, and in-nucleus score
+                sphere_x.append(center[0])
+                sphere_y.append(center[1])
+                sphere_z.append(center[2])
+                layer_z.append(closest_z)
+                sphere_r.append(np.sqrt(r2))
+                sphere_size.append(total_size)
+                sphere_comp.append(total_comp)
+                sphere_score.append(local_score)
+
+            # basic features for all spheres from each granule marker
+            sphere = pd.DataFrame(list(zip(sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score, [j] * len(sphere_x))),
+                                  columns = ["sphere_x", "sphere_y", "sphere_z", "layer_z", "sphere_r", "size", "comp", "in_nucleus", "gene"])
+            sphere = sphere.astype({"sphere_x": float, "sphere_y": float, "sphere_z": float, "layer_z": float, "sphere_r": float, "size": float, "comp": float, "in_nucleus": float, "gene": str})
+            if record_cell_id:
+                sphere["cell_id"] = cell_id
+                sphere = sphere.astype({"cell_id": str})
+
+            # split low- and high-in-nucleus spheres
+            sphere_low = sphere[(sphere["sphere_r"] < self.size_thr) & (sphere["in_nucleus"] < self.in_nucleus_thr[0])]
+            sphere_high = sphere[(sphere["sphere_r"] < self.size_thr) & (sphere["in_nucleus"] > self.in_nucleus_thr[1])]
+
+            if write_csv:
+                sphere_low.to_csv(write_path + j + " sphere.csv", index=0)
+                sphere_high.to_csv(write_path + j + " sphere_high.csv", index=0)
+
+            num_individual.append(sphere_low.shape[0])
+            data_low[target_names.index(j)] = sphere_low
+            data_high[target_names.index(j)] = sphere_high
+            print(f"{target_names.index(j) + 1} of {len(target_names)} genes processed!")
+
+        return np.sum(num_individual), data_low, data_high
+
+
+    # [INNER] merge points from two overlapped spheres, input for remove_overlaps()
+    def find_points(self, sphere_a, sphere_b):
+        transcripts = self.transcripts[self.transcripts["target"].isin(self.gnl_genes)]
+        tree_temp = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))
+        idx_a = tree_temp.query_ball_point([sphere_a["sphere_x"], sphere_a["sphere_y"], sphere_a["sphere_z"]], sphere_a["sphere_r"])
+        points_a = transcripts.iloc[idx_a]
+        points_a = points_a[points_a["target"] == sphere_a["gene"]]
+        idx_b = tree_temp.query_ball_point([sphere_b["sphere_x"], sphere_b["sphere_y"], sphere_b["sphere_z"]], sphere_b["sphere_r"])
+        points_b = transcripts.iloc[idx_b]
+        points_b = points_b[points_b["target"] == sphere_b["gene"]]
+        points = pd.concat([points_a, points_b])
+        points = points[["global_x", "global_y", "global_z"]]
+        return points
+
+
+    def remove_overlaps(self, set_a, set_b):
+
+        set_a = set_a.copy()
+        set_b = set_b.copy()
+
+        # find possible overlaps on 2D by r-tree
+        idx_b = make_rtree(set_b)
+        for i, sphere_a in set_a.iterrows():
+            center_a_3D = np.array([sphere_a.sphere_x, sphere_a.sphere_y, sphere_a.sphere_z])
+            bounds_a = (sphere_a.sphere_x - sphere_a.sphere_r,
+                        sphere_a.sphere_y - sphere_a.sphere_r,
+                        sphere_a.sphere_x + sphere_a.sphere_r,
+                        sphere_a.sphere_y + sphere_a.sphere_r)
+            possible_overlaps = idx_b.intersection(bounds_a)
+
+            # search 3D overlaps within possible overlaps
+            for j in possible_overlaps:
+                if j in set_b.index:
+                    sphere_b = set_b.loc[j]
+                    center_b_3D = np.array([sphere_b.sphere_x, sphere_b.sphere_y, sphere_b.sphere_z])
+                    dist = np.linalg.norm(center_a_3D - center_b_3D)
+                    radius_sum = sphere_a.sphere_r + sphere_b.sphere_r
+                    radius_diff = sphere_a.sphere_r - sphere_b.sphere_r
+
+                    # relative positions (0: internal & intersect, 1: internal, 2: intersect)
+                    c0 = (dist < self.l * radius_sum)
+                    c1 = (dist <= self.l * np.abs(radius_diff))
+                    c1_1 = (radius_diff > 0)
+                    c2_1 = (dist < self.rho * self.l * radius_sum)
+
+                    # operations on dataframes
+                    if c0:
+                        if c1 and c1_1: # keep A and remove B
+                            set_b.drop(index = j, inplace = True)
+                        elif c1 and not c1_1: # replace A with B and remove B
+                            set_a.loc[i] = set_b.loc[j]
+                            set_b.drop(index = j, inplace = True)
+                        elif not c1 and c2_1: # replace A with new sphere and remove B
+                            points_union = np.array(self.find_points(sphere_a, sphere_b))
+                            new_center, new_radius = miniball.get_bounding_ball(points_union, epsilon=1e-8)
+                            set_a.loc[i, "sphere_x"] = new_center[0]
+                            set_a.loc[i, "sphere_y"] = new_center[1]
+                            set_a.loc[i, "sphere_z"] = new_center[2]
+                            set_a.loc[i, "sphere_r"] = self.s * new_radius
+                            set_b.drop(index = j, inplace = True)
+
+        set_a = set_a.reset_index(drop = True)
+        set_b = set_b.reset_index(drop = True)
+        return set_a, set_b
+
+
+    # [INNER] merge spheres from different granule markers, input for detect()
+    def merge_sphere(self, sphere_dict):
+        sphere = sphere_dict[0].copy()
+        for j in range(1, len(self.gnl_genes)):
+            target_sphere = sphere_dict[j]
+            sphere, target_sphere_new = self.remove_overlaps(sphere, target_sphere)
+            sphere = pd.concat([sphere, target_sphere_new])
+        sphere = sphere.reset_index(drop = True)
+        return sphere
+
+
+    # [INNER] negative control filtering, input for detect()
+    def nc_filter(self, sphere_low, sphere_high):
+
+        # negative control gene profiling
+        adata_low = self.profile(sphere_low, self.nc_genes)
+        adata_high = self.profile(sphere_high, self.nc_genes)
+        adata = anndata.concat([adata_low, adata_high], axis = 0, merge = "same")
+        adata.var["genes"] = adata.var.index
+        adata.obs_keys = list(np.arange(adata.shape[0]))
+        adata.obs["type"] = ["low"] * adata_low.shape[0] + ["high"] * adata_high.shape[0]
+        adata.obs["type"] = pd.Categorical(adata.obs["type"], categories = ["low", "high"], ordered = True)
+
+        # DE analysis of negative control genes
+        sc.tl.rank_genes_groups(adata, "type", method = "t-test")
+        names = adata.uns["rank_genes_groups"]["names"]
+        names = pd.DataFrame(names)
+        logfc = adata.uns["rank_genes_groups"]["logfoldchanges"]
+        logfc = pd.DataFrame(logfc)
+        pvals = adata.uns["rank_genes_groups"]["pvals"]
+        pvals = pd.DataFrame(pvals)
+
+        # select top upregulated negative control genes
+        df = pd.DataFrame({"names": names["high"], "logfc": logfc["high"], "pvals": pvals["high"]})
+        df = df[df["logfc"] >= 0]
+        df = df.sort_values(by = ["pvals"], ascending = True)
+        nc_genes_final = list(df["names"].head(self.nc_top))
+
+        # negative control filtering
+        nc_transcripts_final = self.transcripts[self.transcripts["target"].isin(nc_genes_final)]
+        tree = make_tree(d1 = np.array(nc_transcripts_final["global_x"]), d2 = np.array(nc_transcripts_final["global_y"]), d3 = np.array(nc_transcripts_final["global_z"]))
+        centers = sphere_low[["sphere_x", "sphere_y", "sphere_z"]].to_numpy()
+        radii = sphere_low["sphere_r"].to_numpy()
+        sizes = sphere_low["size"].to_numpy()
+        counts = np.array([len(tree.query_ball_point(c, r)) for c, r in zip(centers, radii)])
+        pass_idx = (counts == 0) | (counts / sizes < self.nc_thr)
+        sphere = sphere_low[pass_idx].reset_index(drop = True)
+        return sphere
+
+
+    # [MAIN] dataframe, granule metadata
+    def detect(self, record_cell_id = False):
+
+        _, data_low, data_high = self.dbscan(record_cell_id = record_cell_id)
+
+        print("Merging spheres...")
+        sphere_low, sphere_high = self.merge_sphere(data_low), self.merge_sphere(data_high)
+
+        if self.nc_genes is None:
+            return sphere_low
+        else:
+            print("Negative control filtering...")
+            return self.nc_filter(sphere_low, sphere_high)
+
+
+    # [MAIN] anndata, granule spatial transcriptome profile
+    def profile(self, granule, genes = None, print_itr = False):
+
+        if genes is None:
+            genes = list(self.transcripts["target"].unique())
+            transcripts = self.transcripts
+        else:
+            transcripts = self.transcripts[self.transcripts["target"].isin(genes)]
+
+        gene_to_idx = {g: i for i, g in enumerate(genes)}
+        gene_array = transcripts["target"].to_numpy()
+        tree = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))
+
+        n_gnl = granule.shape[0]
+        n_gene = len(genes)
+        data, row_idx, col_idx = [], [], []
+
+        # iterate over all granules to count nearby transcripts
+        for i in range(n_gnl):
+            temp = granule.iloc[i]
+            target_idx = tree.query_ball_point([temp["sphere_x"], temp["sphere_y"], temp["layer_z"]], temp["sphere_r"])
+            if not target_idx:
+                continue
+            local_genes = gene_array[target_idx] # extract genes for those nearby transcripts
+            counts = Counter(local_genes) # count how many times each gene occurs
+            for g, cnt in counts.items(): # append nonzero entries to sparse matrix lists
+                j = gene_to_idx[g] # get gene column index
+                data.append(cnt) # nonzero count
+                row_idx.append(i) # row index = granule index
+                col_idx.append(j) # column index = gene index
+            if print_itr and (i % 5000 == 0):
+                print(f"{i} out of {n_gnl} granules profiled!")
+
+        # construct sparse spatial transcriptome profile, (n_granules × n_genes)
+        X = csr_matrix((data, (row_idx, col_idx)), shape = (n_gnl, n_gene), dtype = np.float32)
+        adata = anndata.AnnData(X = X, obs = granule.copy())
+        adata.obs["granule_id"] = [f"gnl_{i}" for i in range(n_gnl)]
+        adata.obs = adata.obs.astype({"granule_id": str})
+        adata.obs.rename(columns = {"sphere_x": "global_x", "sphere_y": "global_y", "sphere_z": "global_z"}, inplace = True)
+        adata.var["genes"] = genes
+        adata.var_names = genes
+        adata.var_keys = genes
+        return adata
+
+
+    # [MAIN] anndata, spot-level gene expression
+    def spot_expression(self, grid_len, genes = None):
+
+        if genes is None:
+            genes = list(self.transcripts["target"].unique())
+            transcripts = self.transcripts
+        else:
+            transcripts = self.transcripts[self.transcripts["target"].isin(genes)]
+
+        # construct bins
+        x_bins, y_bins = self.construct_grid(grid_len = grid_len)
+
+        # initialize data
+        X = np.zeros((len(genes), (len(x_bins) - 1) * (len(y_bins) - 1)))
+        global_x, global_y = [], []
+
+        # coordinates
+        for i in list(x_bins)[:-1]:
+            center_x = i + 0.5 * grid_len
+            for j in list(y_bins)[:-1]:
+                center_y = j + 0.5 * grid_len
+                global_x.append(center_x)
+                global_y.append(center_y)
+
+        # count matrix
+        for k_idx, k in enumerate(genes):
+            target_gene = transcripts[transcripts["target"] == k]
+            count_gene, _, _ = np.histogram2d(target_gene["global_x"], target_gene["global_y"], bins = [x_bins, y_bins])
+            X[k_idx, :] = count_gene.flatten()
+            if k_idx % 100 == 0:
+                print(f"{k_idx} out of {len(genes)} genes profiled!")
+
+        # spot id
+        spot_id = []
+        for i in range(len(global_x)):
+            id = "spot_" + str(i)
+            spot_id.append(id)
+
+        # assemble data
+        adata = anndata.AnnData(X = np.transpose(X))
+        adata.obs["spot_id"] = spot_id
+        adata.obs["global_x"] = global_x
+        adata.obs["global_y"] = global_y
+        adata.var["genes"] = genes
+        adata.var_names = genes
+        adata.var_keys = genes
+        return adata
+
+
+# [MAIN] anndata, spot-level neuron metadata
+def spot_neuron(adata_neuron, spot, grid_len = 50, neuron_loc_key = ["global_x", "global_y"], spot_loc_key = ["global_x", "global_y"]):
+
+    adata_neuron = adata_neuron.copy()
+    neurons = adata_neuron.obs
+    spot = spot.copy()
+
+    half_len = grid_len / 2
+
+    indicator, neuron_count = [], []
+
+    for _, row in spot.obs.iterrows():
+
+        x = row[spot_loc_key[0]]
+        y = row[spot_loc_key[1]]
+        neuron_temp = neurons[(neurons[neuron_loc_key[0]] > x - half_len) & (neurons[neuron_loc_key[0]] < x + half_len) & (neurons[neuron_loc_key[1]] > y - half_len) & (neurons[neuron_loc_key[1]] < y + half_len)]
+        indicator.append(int(len(neuron_temp) > 0))
+        neuron_count.append(len(neuron_temp))
+
+    spot.obs["indicator"] = indicator
+    spot.obs["neuron_count"] = neuron_count
+    return spot
+
+
+# [MAIN] anndata, spot-level granule metadata
+def spot_granule(granule, spot, grid_len = 50, gnl_loc_key = ["sphere_x", "sphere_y"], spot_loc_key = ["global_x", "global_y"]):
+
+    granule = granule.copy()
+    spot = spot.copy()
+
+    half_len = grid_len / 2
+
+    indicator, granule_count, granule_radius, granule_size, granule_score = [], [], [], [], []
+
+    for _, row in spot.obs.iterrows():
+
+        x = row[spot_loc_key[0]]
+        y = row[spot_loc_key[1]]
+        gnl_temp = granule[(granule[gnl_loc_key[0]] >= x - half_len) & (granule[gnl_loc_key[0]] < x + half_len) & (granule[gnl_loc_key[1]] >= y - half_len) & (granule[gnl_loc_key[1]] < y + half_len)]
+        indicator.append(int(len(gnl_temp) > 0))
+        granule_count.append(len(gnl_temp))
+
+        if len(gnl_temp) == 0:
+            granule_radius.append(0)
+            granule_size.append(0)
+            granule_score.append(0)
+        else:
+            granule_radius.append(np.nanmean(gnl_temp["sphere_r"]))
+            granule_size.append(np.nanmean(gnl_temp["size"]))
+            granule_score.append(np.nanmean(gnl_temp["in_nucleus"]))
+
+    spot.obs["indicator"] = indicator
+    spot.obs["gnl_count"] = granule_count
+    spot.obs["gnl_radius"] = granule_radius
+    spot.obs["gnl_size"] = granule_size
+    spot.obs["gnl_score"] = granule_score
+    return spot
+
+
+# [Main] anndata, neuron-granule colocalization
+def neighbor_granule(adata_neuron, granule_adata, radius = 10, sigma = None, loc_key = ["global_x", "global_y"]):
+
+    adata_neuron = adata_neuron.copy()
+    granule_adata = granule_adata.copy()
+
+    if sigma is None:
+        sigma = radius / 2
+
+    # neuron and granule coordinates
+    neuron_coords = adata_neuron.obs[loc_key].values
+    gnl_coords = granule_adata.obs[loc_key].values
+
+    # make tree
+    tree = make_tree(d1 = gnl_coords[:, 0], d2 = gnl_coords[:, 1])
+
+    # query neighboring granules for each neuron
+    neighbor_indices = tree.query_ball_point(neuron_coords, r = radius)
+
+    # record count and indices
+    granule_counts = np.array([len(indices) for indices in neighbor_indices])
+    adata_neuron.obs["neighbor_gnl_count"] = granule_counts
+    adata_neuron.uns["neighbor_gnl_indices"] = neighbor_indices
+
+    # ---------- neighboring granule expression matrix ---------- #
+    n_neurons, n_genes = adata_neuron.n_obs, adata_neuron.n_vars
+    weighted_expr = np.zeros((n_neurons, n_genes))
+
+    for i, indices in enumerate(neighbor_indices):
+        if len(indices) == 0:
+            continue
+        distances = np.linalg.norm(gnl_coords[indices] - neuron_coords[i], axis = 1)
+        weights = np.exp(- (distances ** 2) / (2 * sigma ** 2))
+        weights = weights / weights.sum()
+        weighted_expr[i] = np.average(granule_adata.X[indices], axis = 0, weights = weights)
+
+    adata_neuron.obsm["weighted_gnl_expression"] = weighted_expr
+
+    # ---------- neighboring granule spatial feature ---------- #
+    features = []
+
+    for i, gnl_idx in enumerate(neighbor_indices):
+
+        feats = {}
+        feats["n_granules"] = len(gnl_idx)
+
+        if len(gnl_idx) == 0:
+            feats.update({"mean_distance": np.nan, "std_distance": np.nan, "radius_max": np.nan, "radius_min": np.nan, "density": 0, "center_offset_norm": np.nan, "anisotropy_ratio": np.nan})
+        else:
+            gnl_pos = gnl_coords[gnl_idx]
+            neuron_pos = neuron_coords[i]
+            dists = np.linalg.norm(gnl_pos - neuron_pos, axis = 1)
+            feats["mean_distance"] = dists.mean()
+            feats["std_distance"] = dists.std()
+            feats["radius_max"] = dists.max()
+            feats["radius_min"] = dists.min()
+            feats["density"] = len(gnl_idx) / (np.pi * radius ** 2)
+            centroid = gnl_pos.mean(axis = 0)
+            offset = centroid - neuron_pos
+            feats["center_offset_norm"] = np.linalg.norm(offset)
+            cov = np.cov((gnl_pos - neuron_pos).T)
+            eigvals = np.linalg.eigvalsh(cov)
+            if np.min(eigvals) > 0:
+                feats["anisotropy_ratio"] = np.max(eigvals) / np.min(eigvals)
+            else:
+                feats["anisotropy_ratio"] = np.nan
+
+        features.append(feats)
+
+    spatial_df = pd.DataFrame(features, index = adata_neuron.obs_names)
+    return adata_neuron, spatial_df
+
+
+# [MAIN] numpy array, neuron embeddings based on neighboring granules
+def neuron_embedding_one_hot(adata_neuron, granule_adata, k = 10, radius = 10, loc_key = ["global_x", "global_y"], gnl_subtype_key = "granule_subtype_kmeans", padding_value = "Others"):
+
+    adata_neuron = adata_neuron.copy()
+    granule_adata = granule_adata.copy()
+
+    # neuron and granule coordinates, granule subtypes
+    neuron_coords = adata_neuron.obs[loc_key].to_numpy()
+    granule_coords = granule_adata.obs[loc_key].to_numpy()
+    granule_subtypes = granule_adata.obs[gnl_subtype_key].astype(str).to_numpy()
+
+    # include padding category
+    unique_subtypes = np.unique(granule_subtypes).tolist()
+    if padding_value not in unique_subtypes:
+        unique_subtypes.append(padding_value)
+
+    encoder = OneHotEncoder(categories = [unique_subtypes], sparse = False, handle_unknown = "ignore")
+    encoder.fit(np.array(unique_subtypes).reshape(-1, 1))
+    S = len(unique_subtypes)
+
+    # k-d tree
+    tree = make_tree(d1 = granule_coords[:, 0], d2 = granule_coords[:, 1])
+    distances, indices = tree.query(neuron_coords, k = k, distance_upper_bound = radius)
+
+    # initialize output
+    n_neurons = neuron_coords.shape[0]
+    embeddings = np.zeros((n_neurons, k, S), dtype = float)
+
+    for i in range(n_neurons):
+        for k in range(k):
+            idx = indices[i, k]
+            dist = distances[i, k]
+            if idx == granule_coords.shape[0] or np.isinf(dist):
+                subtype = padding_value
+            else:
+                subtype = granule_subtypes[idx]
+            onehot = encoder.transform([[subtype]])[0]
+            embeddings[i, k, :] = onehot
+
+    return embeddings, encoder.categories_[0]
+
+
+# [MAIN] numpy array, neuron embeddings based on neighboring granules
+def neuron_embedding_spatial_weight(adata_neuron, granule_adata, radius = 10, sigma = 10, loc_key = ["global_x", "global_y"], gnl_subtype_key = "granule_subtype_kmeans", padding_value = "Others"):
+
+    adata_neuron = adata_neuron.copy()
+    granule_adata = granule_adata.copy()
+
+    # neuron and granule coordinates, granule subtypes
+    neuron_coords = adata_neuron.obs[loc_key].to_numpy()
+    granule_coords = granule_adata.obs[loc_key].to_numpy()
+    granule_subtypes = granule_adata.obs[gnl_subtype_key].astype(str).to_numpy()
+
+    # include padding category
+    unique_subtypes = np.unique(granule_subtypes).tolist()
+    if padding_value not in unique_subtypes:
+        unique_subtypes.append(padding_value)
+
+    encoder = OneHotEncoder(categories = [unique_subtypes], sparse = False, handle_unknown = "ignore")
+    encoder.fit(np.array(unique_subtypes).reshape(-1, 1))
+    S = len(unique_subtypes)
+
+    # k-d tree
+    tree = make_tree(d1 = granule_coords[:, 0], d2 = granule_coords[:, 1])
+    all_neighbors = tree.query_ball_point(neuron_coords, r = radius)
+
+    # initialize output
+    n_neurons = neuron_coords.shape[0]
+    embeddings = np.zeros((n_neurons, S), dtype = float)
+
+    for i, neighbor_indices in enumerate(all_neighbors):
+        if not neighbor_indices:
+            # no neighbors, assign to padding subtype
+            embeddings[i] = encoder.transform([[padding_value]])[0]
+            continue
+
+        # get neighbor subtypes and distances
+        neighbor_coords = granule_coords[neighbor_indices]
+        dists = np.linalg.norm(neuron_coords[i] - neighbor_coords, axis = 1)
+        weights = np.exp(- dists / sigma)
+
+        # encode subtypes to one-hot and weight them
+        subtypes = granule_subtypes[neighbor_indices]
+        onehots = encoder.transform(subtypes.reshape(-1, 1))
+        weighted_sum = (weights[:, np.newaxis] * onehots).sum(axis = 0)
+
+        # normalize to make it a composition vector
+        embeddings[i] = weighted_sum / weights.sum()
+
+    return embeddings, encoder.categories_[0]
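One detail in the added file, whose name already flags it as wrong: in neuron_embedding_one_hot, the inner loop `for k in range(k)` rebinds the neighbor count `k`, so each subsequent neuron iterates over ever fewer neighbors. A minimal sketch of the non-shadowing form, using a hypothetical index name `j` (toy data, not from the package):

import numpy as np

k = 3                                   # neighbor count stays fixed
indices = np.arange(12).reshape(4, 3)   # toy (n_neurons, k) neighbor table

for i in range(indices.shape[0]):
    for j in range(k):                  # distinct loop variable, so k remains 3
        idx = indices[i, j]             # look up the j-th neighbor of neuron i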
mcdetect-2.0.7.dist-info/RECORD
ADDED

@@ -0,0 +1,9 @@
+mcDETECT/__init__.py,sha256=_UFDBE5UfX_xzioVs7rshassZR-KO0yPDE71Uflgx-E,92
+mcDETECT/model.py,sha256=zDmfQjQwkSm8JRe2e45FN8siS0o50AUHZQoqCWvtvw4,29200
+mcDETECT/model_new_but_wrong.py,sha256=MqJMAC4cyjux3BWIYWmGLugr_gHPeYcQNH-O40xbPHE,29398
+mcDETECT/utils.py,sha256=0gvqZV7sGi8qvvdC5x9XScyiTXlSfqbUt1zks4t7Xd4,4545
+mcdetect-2.0.7.dist-info/licenses/LICENSE,sha256=uxq-shEWOGTIGVnQLmpElILmfCkuUhFZRAMnZUiKvtg,1070
+mcdetect-2.0.7.dist-info/METADATA,sha256=TlVlS7eK2SzHiyweSTEDfNTOFJeKh66epvKLS1pCC40,3016
+mcdetect-2.0.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mcdetect-2.0.7.dist-info/top_level.txt,sha256=WwzBojt5U-T2hZ8llO6XgpM9OFIBkWQQldQKu19O8EY,9
+mcdetect-2.0.7.dist-info/RECORD,,
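For readers verifying the RECORD entries above: wheel RECORD hashes are the URL-safe base64 encoding of the raw SHA-256 digest, without padding. A minimal sketch, assuming local copies of the listed files:

import base64, hashlib

def record_hash(path):
    # compute a "sha256=..." value in wheel RECORD format
    digest = hashlib.sha256(open(path, "rb").read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

# e.g. record_hash("mcDETECT/model.py") should reproduce the entry above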
mcdetect-2.0.6.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-mcDETECT/__init__.py,sha256=y6JBEdILAES1NQ3-UVEO6BJwAeNk7e7aTbBMKzefCn8,92
-mcDETECT/model.py,sha256=MqJMAC4cyjux3BWIYWmGLugr_gHPeYcQNH-O40xbPHE,29398
-mcDETECT/utils.py,sha256=0gvqZV7sGi8qvvdC5x9XScyiTXlSfqbUt1zks4t7Xd4,4545
-mcdetect-2.0.6.dist-info/licenses/LICENSE,sha256=uxq-shEWOGTIGVnQLmpElILmfCkuUhFZRAMnZUiKvtg,1070
-mcdetect-2.0.6.dist-info/METADATA,sha256=p5Xqkc_e8Q9KLOhL7SqiNY1LmdjVLL5jp-nXvm6Pg6U,3016
-mcdetect-2.0.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mcdetect-2.0.6.dist-info/top_level.txt,sha256=WwzBojt5U-T2hZ8llO6XgpM9OFIBkWQQldQKu19O8EY,9
-mcdetect-2.0.6.dist-info/RECORD,,
{mcdetect-2.0.6.dist-info → mcdetect-2.0.7.dist-info}/WHEEL
RENAMED
File without changes

{mcdetect-2.0.6.dist-info → mcdetect-2.0.7.dist-info}/licenses/LICENSE
RENAMED
File without changes

{mcdetect-2.0.6.dist-info → mcdetect-2.0.7.dist-info}/top_level.txt
RENAMED
File without changes