mcDETECT 2.0.9-py3-none-any.whl → 2.0.10-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcDETECT might be problematic.
- mcDETECT/__init__.py +1 -1
- mcDETECT/model.py +4 -2
- {mcdetect-2.0.9.dist-info → mcdetect-2.0.10.dist-info}/METADATA +1 -1
- mcdetect-2.0.10.dist-info/RECORD +8 -0
- mcDETECT/model_new_incorrect.py +0 -625
- mcdetect-2.0.9.dist-info/RECORD +0 -9
- {mcdetect-2.0.9.dist-info → mcdetect-2.0.10.dist-info}/WHEEL +0 -0
- {mcdetect-2.0.9.dist-info → mcdetect-2.0.10.dist-info}/licenses/LICENSE +0 -0
- {mcdetect-2.0.9.dist-info → mcdetect-2.0.10.dist-info}/top_level.txt +0 -0
mcDETECT/__init__.py
CHANGED
mcDETECT/model.py
CHANGED
@@ -122,8 +122,10 @@ class mcDETECT:
                     continue
                 temp_in_nucleus = np.sum(target["overlaps_nucleus"].values[mask])
                 temp_size = coords.shape[0]
-                coords_unique = np.unique(coords, axis=0)
-                center, r2 = miniball.get_bounding_ball(coords_unique, epsilon=1e-8)
+                # coords_unique = np.unique(coords, axis=0)
+                temp = pd.DataFrame(coords, columns=["global_x", "global_y", "global_z"])
+                temp = temp.drop_duplicates()
+                center, r2 = miniball.get_bounding_ball(np.array(temp), epsilon=1e-8)
                 if self.type != "Xenium":
                     closest_z = closest(z_grid, center[2])
                 else:
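The only functional change in model.py is how duplicate transcript coordinates are removed before the minimum-enclosing-sphere computation: `np.unique(coords, axis=0)` is swapped for a pandas `drop_duplicates()` pass. A minimal sketch of the two variants side by side (the sample coordinates are made up; the column names follow the diff):

import numpy as np
import pandas as pd

coords = np.array([[0.0, 0.0, 1.5],
                   [0.0, 0.0, 1.5],   # duplicate transcript position
                   [2.0, 1.0, 1.5]])

# 2.0.9 behavior: NumPy deduplication (also sorts the rows lexicographically)
dedup_old = np.unique(coords, axis=0)

# 2.0.10 behavior: pandas deduplication (keeps first-seen row order)
dedup_new = pd.DataFrame(coords, columns=["global_x", "global_y", "global_z"]).drop_duplicates()

# Both feed the same set of unique points to miniball.get_bounding_ball;
# the minimum enclosing ball does not depend on row order.
assert set(map(tuple, dedup_old)) == set(map(tuple, dedup_new.to_numpy()))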
mcdetect-2.0.10.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+mcDETECT/__init__.py,sha256=FoRKnIon2qyKVkAcetKJDkIw8YuDuM1LhpoCZv-AE38,93
+mcDETECT/model.py,sha256=oEkBpFPZWBTRScjsyfwes6oTu6hDsjsI-MCVdVxGiFk,29375
+mcDETECT/utils.py,sha256=0gvqZV7sGi8qvvdC5x9XScyiTXlSfqbUt1zks4t7Xd4,4545
+mcdetect-2.0.10.dist-info/licenses/LICENSE,sha256=uxq-shEWOGTIGVnQLmpElILmfCkuUhFZRAMnZUiKvtg,1070
+mcdetect-2.0.10.dist-info/METADATA,sha256=N8RO2SHnwLe-LZ1wEu1xFTumQ_u4mTfMEdF8exePRYM,3017
+mcdetect-2.0.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mcdetect-2.0.10.dist-info/top_level.txt,sha256=WwzBojt5U-T2hZ8llO6XgpM9OFIBkWQQldQKu19O8EY,9
+mcdetect-2.0.10.dist-info/RECORD,,
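Each RECORD row has the form path,sha256=<urlsafe-base64 digest without padding>,size-in-bytes; the final entry conventionally leaves its own digest and size empty. A quick illustrative check of one row against file contents (this helper is a sketch, not part of mcDETECT):

import base64, csv, hashlib, io

record_text = "mcDETECT/utils.py,sha256=0gvqZV7sGi8qvvdC5x9XScyiTXlSfqbUt1zks4t7Xd4,4545"

def verify_record_row(row, data: bytes) -> bool:
    # row: (path, "sha256=<digest>", size) as written in RECORD
    path, hash_field, size = row
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    return hash_field == f"sha256={digest}" and int(size) == len(data)

row = next(csv.reader(io.StringIO(record_text)))
# verify_record_row(row, open("mcDETECT/utils.py", "rb").read())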
mcDETECT/model_new_incorrect.py
DELETED
@@ -1,625 +0,0 @@
-import anndata
-import math
-import miniball
-import numpy as np
-import pandas as pd
-import scanpy as sc
-from collections import Counter
-from rtree import index
-from scipy.sparse import csr_matrix
-from scipy.spatial import cKDTree
-from scipy.stats import poisson
-from shapely.geometry import Point
-from sklearn.cluster import DBSCAN
-from sklearn.preprocessing import OneHotEncoder
-
-
-from .utils import *
-
-
-class mcDETECT:
-
-
-    def __init__(self, type, transcripts, gnl_genes, nc_genes = None, eps = 1.5, minspl = None, grid_len = 1.0, cutoff_prob = 0.95, alpha = 5.0, low_bound = 3,
-                 size_thr = 4.0, in_nucleus_thr = (0.5, 0.5), l = 1.0, rho = 0.2, s = 1.0, nc_top = 20, nc_thr = 0.1):
-
-        self.type = type # string, iST platform, now support MERSCOPE, Xenium, and CosMx
-        self.transcripts = transcripts # dataframe, transcripts file
-        self.gnl_genes = gnl_genes # list, string, all granule markers
-        self.nc_genes = nc_genes # list, string, all negative controls
-        self.eps = eps # numeric, searching radius epsilon
-        self.minspl = minspl # integer, manually select min_samples, i.e., no automatic parameter selection
-        self.grid_len = grid_len # numeric, length of grids for computing the tissue area
-        self.cutoff_prob = cutoff_prob # numeric, cutoff probability in parameter selection for min_samples
-        self.alpha = alpha # numeric, scaling factor in parameter selection for min_samples
-        self.low_bound = low_bound # integer, lower bound in parameter selection for min_samples
-        self.size_thr = size_thr # numeric, threshold for maximum radius of an aggregation
-        self.in_nucleus_thr = in_nucleus_thr # 2-d tuple, threshold for low- and high-in-nucleus ratio
-        self.l = l # numeric, scaling factor for seaching overlapped spheres
-        self.rho = rho # numeric, threshold for determining overlaps
-        self.s = s # numeric, scaling factor for merging overlapped spheres
-        self.nc_top = nc_top # integer, number of negative controls retained for filtering
-        self.nc_thr = nc_thr # numeric, threshold for negative control filtering
-
-
-    # [INNER] construct grids, input for tissue_area()
-    def construct_grid(self, grid_len = None):
-        if grid_len is None:
-            grid_len = self.grid_len
-        x_min, x_max = np.min(self.transcripts["global_x"]), np.max(self.transcripts["global_x"])
-        y_min, y_max = np.min(self.transcripts["global_y"]), np.max(self.transcripts["global_y"])
-        x_min = np.floor(x_min / grid_len) * grid_len
-        x_max = np.ceil(x_max / grid_len) * grid_len
-        y_min = np.floor(y_min / grid_len) * grid_len
-        y_max = np.ceil(y_max / grid_len) * grid_len
-        x_bins = np.arange(x_min, x_max + grid_len, grid_len)
-        y_bins = np.arange(y_min, y_max + grid_len, grid_len)
-        return x_bins, y_bins
-
-
-    # [INNER] calculate tissue area, input for poisson_select()
-    def tissue_area(self):
-        if not hasattr(self, "_cached_area"):
-            x_bins, y_bins = self.construct_grid(grid_len = None)
-            hist, _, _ = np.histogram2d(self.transcripts["global_x"], self.transcripts["global_y"], bins = [x_bins, y_bins])
-            self._cached_area = np.count_nonzero(hist) * (self.grid_len ** 2)
-        return self._cached_area
-
-
-    # [INNER] calculate optimal min_samples, input for dbscan()
-    def poisson_select(self, gene_name):
-        num_trans = np.sum(self.transcripts["target"] == gene_name)
-        bg_density = num_trans / self.tissue_area()
-        cutoff_density = poisson.ppf(self.cutoff_prob, mu = self.alpha * bg_density * (np.pi * self.eps ** 2))
-        optimal_m = int(max(cutoff_density, self.low_bound))
-        return optimal_m
-
-
-    # [INTERMEDIATE] dictionary, low- and high-in-nucleus spheres for each granule marker
-    def dbscan(self, target_names = None, record_cell_id = False, write_csv = False, write_path = "./"):
-
-        if self.type != "Xenium":
-            z_grid = list(self.transcripts["global_z"].unique())
-            z_grid.sort()
-
-        if target_names is None:
-            target_names = self.gnl_genes
-
-        transcripts = self.transcripts[self.transcripts["target"].isin(target_names)]
-        grouped = {g: df for g, df in transcripts.groupby("target")}
-
-        num_individual, data_low, data_high = [], {}, {}
-
-        for j in target_names:
-
-            # split transcripts
-            target = grouped[j]
-            others = pd.concat([grouped[g] for g in target_names if g != j], ignore_index = True)
-            tree = make_tree(d1 = np.array(others["global_x"]), d2 = np.array(others["global_y"]), d3 = np.array(others["global_z"]))
-
-            # 3D DBSCAN
-            if self.minspl is None:
-                min_spl = self.poisson_select(j)
-            else:
-                min_spl = self.minspl
-            X = np.array(target[["global_x", "global_y", "global_z"]])
-            db = DBSCAN(eps = self.eps, min_samples = min_spl, algorithm = "kd_tree").fit(X)
-            labels = db.labels_
-            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
-
-            # iterate over all aggregations
-            cell_id, sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score = [], [], [], [], [], [], [], [], []
-
-            for k in range(n_clusters):
-
-                # record cell ids
-                if record_cell_id:
-                    temp = target[labels == k]
-                    temp_cell_id_mode = temp["cell_id"].mode()[0]
-                    cell_id.append(temp_cell_id_mode)
-
-                # find minimum enclosing spheres
-                mask = (labels == k)
-                coords = X[mask]
-                if coords.shape[0] == 0:
-                    continue
-                temp_in_nucleus = np.sum(target["overlaps_nucleus"].values[mask])
-                temp_size = coords.shape[0]
-                coords_unique = np.unique(coords, axis=0)
-                center, r2 = miniball.get_bounding_ball(coords_unique, epsilon=1e-8)
-                if self.type != "Xenium":
-                    closest_z = closest(z_grid, center[2])
-                else:
-                    closest_z = center[2]
-
-                # calculate size, composition, and in-nucleus score
-                other_idx = tree.query_ball_point([center[0], center[1], center[2]], np.sqrt(r2))
-                other_trans = others.iloc[other_idx]
-                other_in_nucleus = np.sum(other_trans["overlaps_nucleus"])
-                other_size = other_trans.shape[0]
-                other_comp = len(other_trans["target"].unique())
-                total_size = temp_size + other_size
-                total_comp = 1 + other_comp
-                local_score = (temp_in_nucleus + other_in_nucleus) / total_size
-
-                # record coordinate, radius, size, composition, and in-nucleus score
-                sphere_x.append(center[0])
-                sphere_y.append(center[1])
-                sphere_z.append(center[2])
-                layer_z.append(closest_z)
-                sphere_r.append(np.sqrt(r2))
-                sphere_size.append(total_size)
-                sphere_comp.append(total_comp)
-                sphere_score.append(local_score)
-
-            # basic features for all spheres from each granule marker
-            sphere = pd.DataFrame(list(zip(sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score, [j] * len(sphere_x))),
-                                  columns = ["sphere_x", "sphere_y", "sphere_z", "layer_z", "sphere_r", "size", "comp", "in_nucleus", "gene"])
-            sphere = sphere.astype({"sphere_x": float, "sphere_y": float, "sphere_z": float, "layer_z": float, "sphere_r": float, "size": float, "comp": float, "in_nucleus": float, "gene": str})
-            if record_cell_id:
-                sphere["cell_id"] = cell_id
-                sphere = sphere.astype({"cell_id": str})
-
-            # split low- and high-in-nucleus spheres
-            sphere_low = sphere[(sphere["sphere_r"] < self.size_thr) & (sphere["in_nucleus"] < self.in_nucleus_thr[0])]
-            sphere_high = sphere[(sphere["sphere_r"] < self.size_thr) & (sphere["in_nucleus"] > self.in_nucleus_thr[1])]
-
-            if write_csv:
-                sphere_low.to_csv(write_path + j + " sphere.csv", index=0)
-                sphere_high.to_csv(write_path + j + " sphere_high.csv", index=0)
-
-            num_individual.append(sphere_low.shape[0])
-            data_low[target_names.index(j)] = sphere_low
-            data_high[target_names.index(j)] = sphere_high
-            print(f"{target_names.index(j) + 1} of {len(target_names)} genes processed!")
-
-        return np.sum(num_individual), data_low, data_high
-
-
-    # [INNER] merge points from two overlapped spheres, input for remove_overlaps()
-    def find_points(self, sphere_a, sphere_b):
-        transcripts = self.transcripts[self.transcripts["target"].isin(self.gnl_genes)]
-        tree_temp = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))
-        idx_a = tree_temp.query_ball_point([sphere_a["sphere_x"], sphere_a["sphere_y"], sphere_a["sphere_z"]], sphere_a["sphere_r"])
-        points_a = transcripts.iloc[idx_a]
-        points_a = points_a[points_a["target"] == sphere_a["gene"]]
-        idx_b = tree_temp.query_ball_point([sphere_b["sphere_x"], sphere_b["sphere_y"], sphere_b["sphere_z"]], sphere_b["sphere_r"])
-        points_b = transcripts.iloc[idx_b]
-        points_b = points_b[points_b["target"] == sphere_b["gene"]]
-        points = pd.concat([points_a, points_b])
-        points = points[["global_x", "global_y", "global_z"]]
-        return points
-
-
-    def remove_overlaps(self, set_a, set_b):
-
-        set_a = set_a.copy()
-        set_b = set_b.copy()
-
-        # find possible overlaps on 2D by r-tree
-        idx_b = make_rtree(set_b)
-        for i, sphere_a in set_a.iterrows():
-            center_a_3D = np.array([sphere_a.sphere_x, sphere_a.sphere_y, sphere_a.sphere_z])
-            bounds_a = (sphere_a.sphere_x - sphere_a.sphere_r,
-                        sphere_a.sphere_y - sphere_a.sphere_r,
-                        sphere_a.sphere_x + sphere_a.sphere_r,
-                        sphere_a.sphere_y + sphere_a.sphere_r)
-            possible_overlaps = idx_b.intersection(bounds_a)
-
-            # search 3D overlaps within possible overlaps
-            for j in possible_overlaps:
-                if j in set_b.index:
-                    sphere_b = set_b.loc[j]
-                    center_b_3D = np.array([sphere_b.sphere_x, sphere_b.sphere_y, sphere_b.sphere_z])
-                    dist = np.linalg.norm(center_a_3D - center_b_3D)
-                    radius_sum = sphere_a.sphere_r + sphere_b.sphere_r
-                    radius_diff = sphere_a.sphere_r - sphere_b.sphere_r
-
-                    # relative positions (0: internal & intersect, 1: internal, 2: intersect)
-                    c0 = (dist < self.l * radius_sum)
-                    c1 = (dist <= self.l * np.abs(radius_diff))
-                    c1_1 = (radius_diff > 0)
-                    c2_1 = (dist < self.rho * self.l * radius_sum)
-
-                    # operations on dataframes
-                    if c0:
-                        if c1 and c1_1: # keep A and remove B
-                            set_b.drop(index = j, inplace = True)
-                        elif c1 and not c1_1: # replace A with B and remove B
-                            set_a.loc[i] = set_b.loc[j]
-                            set_b.drop(index = j, inplace = True)
-                        elif not c1 and c2_1: # replace A with new sphere and remove B
-                            points_union = np.array(self.find_points(sphere_a, sphere_b))
-                            new_center, new_radius = miniball.get_bounding_ball(points_union, epsilon=1e-8)
-                            set_a.loc[i, "sphere_x"] = new_center[0]
-                            set_a.loc[i, "sphere_y"] = new_center[1]
-                            set_a.loc[i, "sphere_z"] = new_center[2]
-                            set_a.loc[i, "sphere_r"] = self.s * new_radius
-                            set_b.drop(index = j, inplace = True)
-
-        set_a = set_a.reset_index(drop = True)
-        set_b = set_b.reset_index(drop = True)
-        return set_a, set_b
-
-
-    # [INNER] merge spheres from different granule markers, input for detect()
-    def merge_sphere(self, sphere_dict):
-        sphere = sphere_dict[0].copy()
-        for j in range(1, len(self.gnl_genes)):
-            target_sphere = sphere_dict[j]
-            sphere, target_sphere_new = self.remove_overlaps(sphere, target_sphere)
-            sphere = pd.concat([sphere, target_sphere_new])
-        sphere = sphere.reset_index(drop = True)
-        return sphere
-
-
-    # [INNER] negative control filtering, input for detect()
-    def nc_filter(self, sphere_low, sphere_high):
-
-        # negative control gene profiling
-        adata_low = self.profile(sphere_low, self.nc_genes)
-        adata_high = self.profile(sphere_high, self.nc_genes)
-        adata = anndata.concat([adata_low, adata_high], axis = 0, merge = "same")
-        adata.var["genes"] = adata.var.index
-        adata.obs_keys = list(np.arange(adata.shape[0]))
-        adata.obs["type"] = ["low"] * adata_low.shape[0] + ["high"] * adata_high.shape[0]
-        adata.obs["type"] = pd.Categorical(adata.obs["type"], categories = ["low", "high"], ordered = True)
-
-        # DE analysis of negative control genes
-        sc.tl.rank_genes_groups(adata, "type", method = "t-test")
-        names = adata.uns["rank_genes_groups"]["names"]
-        names = pd.DataFrame(names)
-        logfc = adata.uns["rank_genes_groups"]["logfoldchanges"]
-        logfc = pd.DataFrame(logfc)
-        pvals = adata.uns["rank_genes_groups"]["pvals"]
-        pvals = pd.DataFrame(pvals)
-
-        # select top upregulated negative control genes
-        df = pd.DataFrame({"names": names["high"], "logfc": logfc["high"], "pvals": pvals["high"]})
-        df = df[df["logfc"] >= 0]
-        df = df.sort_values(by = ["pvals"], ascending = True)
-        nc_genes_final = list(df["names"].head(self.nc_top))
-
-        # negative control filtering
-        nc_transcripts_final = self.transcripts[self.transcripts["target"].isin(nc_genes_final)]
-        tree = make_tree(d1 = np.array(nc_transcripts_final["global_x"]), d2 = np.array(nc_transcripts_final["global_y"]), d3 = np.array(nc_transcripts_final["global_z"]))
-        centers = sphere_low[["sphere_x", "sphere_y", "sphere_z"]].to_numpy()
-        radii = sphere_low["sphere_r"].to_numpy()
-        sizes = sphere_low["size"].to_numpy()
-        counts = np.array([len(tree.query_ball_point(c, r)) for c, r in zip(centers, radii)])
-        pass_idx = (counts == 0) | (counts / sizes < self.nc_thr)
-        sphere = sphere_low[pass_idx].reset_index(drop = True)
-        return sphere
-
-
-    # [MAIN] dataframe, granule metadata
-    def detect(self, record_cell_id = False):
-
-        _, data_low, data_high = self.dbscan(record_cell_id = record_cell_id)
-
-        print("Merging spheres...")
-        sphere_low, sphere_high = self.merge_sphere(data_low), self.merge_sphere(data_high)
-
-        if self.nc_genes is None:
-            return sphere_low
-        else:
-            print("Negative control filtering...")
-            return self.nc_filter(sphere_low, sphere_high)
-
-
-    # [MAIN] anndata, granule spatial transcriptome profile
-    def profile(self, granule, genes = None, print_itr = False):
-
-        if genes is None:
-            genes = list(self.transcripts["target"].unique())
-            transcripts = self.transcripts
-        else:
-            transcripts = self.transcripts[self.transcripts["target"].isin(genes)]
-
-        gene_to_idx = {g: i for i, g in enumerate(genes)}
-        gene_array = transcripts["target"].to_numpy()
-        tree = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))
-
-        n_gnl = granule.shape[0]
-        n_gene = len(genes)
-        data, row_idx, col_idx = [], [], []
-
-        # iterate over all granules to count nearby transcripts
-        for i in range(n_gnl):
-            temp = granule.iloc[i]
-            target_idx = tree.query_ball_point([temp["sphere_x"], temp["sphere_y"], temp["layer_z"]], temp["sphere_r"])
-            if not target_idx:
-                continue
-            local_genes = gene_array[target_idx] # extract genes for those nearby transcripts
-            counts = Counter(local_genes) # count how many times each gene occurs
-            for g, cnt in counts.items(): # append nonzero entries to sparse matrix lists
-                j = gene_to_idx[g] # get gene column index
-                data.append(cnt) # nonzero count
-                row_idx.append(i) # row index = granule index
-                col_idx.append(j) # column index = gene index
-            if print_itr and (i % 5000 == 0):
-                print(f"{i} out of {n_gnl} granules profiled!")
-
-        # construct sparse spatial transcriptome profile, (n_granules × n_genes)
-        X = csr_matrix((data, (row_idx, col_idx)), shape = (n_gnl, n_gene), dtype = np.float32)
-        adata = anndata.AnnData(X = X, obs = granule.copy())
-        adata.obs["granule_id"] = [f"gnl_{i}" for i in range(n_gnl)]
-        adata.obs = adata.obs.astype({"granule_id": str})
-        adata.obs.rename(columns = {"sphere_x": "global_x", "sphere_y": "global_y", "sphere_z": "global_z"}, inplace = True)
-        adata.var["genes"] = genes
-        adata.var_names = genes
-        adata.var_keys = genes
-        return adata
-
-
-    # [MAIN] anndata, spot-level gene expression
-    def spot_expression(self, grid_len, genes = None):
-
-        if genes is None:
-            genes = list(self.transcripts["target"].unique())
-            transcripts = self.transcripts
-        else:
-            transcripts = self.transcripts[self.transcripts["target"].isin(genes)]
-
-        # construct bins
-        x_bins, y_bins = self.construct_grid(grid_len = grid_len)
-
-        # initialize data
-        X = np.zeros((len(genes), (len(x_bins) - 1) * (len(y_bins) - 1)))
-        global_x, global_y = [], []
-
-        # coordinates
-        for i in list(x_bins)[:-1]:
-            center_x = i + 0.5 * grid_len
-            for j in list(y_bins)[:-1]:
-                center_y = j + 0.5 * grid_len
-                global_x.append(center_x)
-                global_y.append(center_y)
-
-        # count matrix
-        for k_idx, k in enumerate(genes):
-            target_gene = transcripts[transcripts["target"] == k]
-            count_gene, _, _ = np.histogram2d(target_gene["global_x"], target_gene["global_y"], bins = [x_bins, y_bins])
-            X[k_idx, :] = count_gene.flatten()
-            if k_idx % 100 == 0:
-                print(f"{k_idx} out of {len(genes)} genes profiled!")
-
-        # spot id
-        spot_id = []
-        for i in range(len(global_x)):
-            id = "spot_" + str(i)
-            spot_id.append(id)
-
-        # assemble data
-        adata = anndata.AnnData(X = np.transpose(X))
-        adata.obs["spot_id"] = spot_id
-        adata.obs["global_x"] = global_x
-        adata.obs["global_y"] = global_y
-        adata.var["genes"] = genes
-        adata.var_names = genes
-        adata.var_keys = genes
-        return adata
-
-
-    # [MAIN] anndata, spot-level neuron metadata
-    def spot_neuron(adata_neuron, spot, grid_len = 50, neuron_loc_key = ["global_x", "global_y"], spot_loc_key = ["global_x", "global_y"]):
-
-        adata_neuron = adata_neuron.copy()
-        neurons = adata_neuron.obs
-        spot = spot.copy()
-
-        half_len = grid_len / 2
-
-        indicator, neuron_count = [], []
-
-        for _, row in spot.obs.iterrows():
-
-            x = row[spot_loc_key[0]]
-            y = row[spot_loc_key[1]]
-            neuron_temp = neurons[(neurons[neuron_loc_key[0]] > x - half_len) & (neurons[neuron_loc_key[0]] < x + half_len) & (neurons[neuron_loc_key[1]] > y - half_len) & (neurons[neuron_loc_key[1]] < y + half_len)]
-            indicator.append(int(len(neuron_temp) > 0))
-            neuron_count.append(len(neuron_temp))
-
-        spot.obs["indicator"] = indicator
-        spot.obs["neuron_count"] = neuron_count
-        return spot
-
-
-    # [MAIN] anndata, spot-level granule metadata
-    def spot_granule(granule, spot, grid_len = 50, gnl_loc_key = ["sphere_x", "sphere_y"], spot_loc_key = ["global_x", "global_y"]):
-
-        granule = granule.copy()
-        spot = spot.copy()
-
-        half_len = grid_len / 2
-
-        indicator, granule_count, granule_radius, granule_size, granule_score = [], [], [], [], []
-
-        for _, row in spot.obs.iterrows():
-
-            x = row[spot_loc_key[0]]
-            y = row[spot_loc_key[1]]
-            gnl_temp = granule[(granule[gnl_loc_key[0]] >= x - half_len) & (granule[gnl_loc_key[0]] < x + half_len) & (granule[gnl_loc_key[1]] >= y - half_len) & (granule[gnl_loc_key[1]] < y + half_len)]
-            indicator.append(int(len(gnl_temp) > 0))
-            granule_count.append(len(gnl_temp))
-
-            if len(gnl_temp) == 0:
-                granule_radius.append(0)
-                granule_size.append(0)
-                granule_score.append(0)
-            else:
-                granule_radius.append(np.nanmean(gnl_temp["sphere_r"]))
-                granule_size.append(np.nanmean(gnl_temp["size"]))
-                granule_score.append(np.nanmean(gnl_temp["in_nucleus"]))
-
-        spot.obs["indicator"] = indicator
-        spot.obs["gnl_count"] = granule_count
-        spot.obs["gnl_radius"] = granule_radius
-        spot.obs["gnl_size"] = granule_size
-        spot.obs["gnl_score"] = granule_score
-        return spot
-
-
-    # [Main] anndata, neuron-granule colocalization
-    def neighbor_granule(adata_neuron, granule_adata, radius = 10, sigma = None, loc_key = ["global_x", "global_y"]):
-
-        adata_neuron = adata_neuron.copy()
-        granule_adata = granule_adata.copy()
-
-        if sigma is None:
-            sigma = radius / 2
-
-        # neuron and granule coordinates
-        neuron_coords = adata_neuron.obs[loc_key].values
-        gnl_coords = granule_adata.obs[loc_key].values
-
-        # make tree
-        tree = make_tree(d1 = gnl_coords[:, 0], d2 = gnl_coords[:, 1])
-
-        # query neighboring granules for each neuron
-        neighbor_indices = tree.query_ball_point(neuron_coords, r = radius)
-
-        # record count and indices
-        granule_counts = np.array([len(indices) for indices in neighbor_indices])
-        adata_neuron.obs["neighbor_gnl_count"] = granule_counts
-        adata_neuron.uns["neighbor_gnl_indices"] = neighbor_indices
-
-        # ---------- neighboring granule expression matrix ---------- #
-        n_neurons, n_genes = adata_neuron.n_obs, adata_neuron.n_vars
-        weighted_expr = np.zeros((n_neurons, n_genes))
-
-        for i, indices in enumerate(neighbor_indices):
-            if len(indices) == 0:
-                continue
-            distances = np.linalg.norm(gnl_coords[indices] - neuron_coords[i], axis = 1)
-            weights = np.exp(- (distances ** 2) / (2 * sigma ** 2))
-            weights = weights / weights.sum()
-            weighted_expr[i] = np.average(granule_adata.X[indices], axis = 0, weights = weights)
-
-        adata_neuron.obsm["weighted_gnl_expression"] = weighted_expr
-
-        # ---------- neighboring granule spatial feature ---------- #
-        features = []
-
-        for i, gnl_idx in enumerate(neighbor_indices):
-
-            feats = {}
-            feats["n_granules"] = len(gnl_idx)
-
-            if len(gnl_idx) == 0:
-                feats.update({"mean_distance": np.nan, "std_distance": np.nan, "radius_max": np.nan, "radius_min": np.nan, "density": 0, "center_offset_norm": np.nan, "anisotropy_ratio": np.nan})
-            else:
-                gnl_pos = gnl_coords[gnl_idx]
-                neuron_pos = neuron_coords[i]
-                dists = np.linalg.norm(gnl_pos - neuron_pos, axis = 1)
-                feats["mean_distance"] = dists.mean()
-                feats["std_distance"] = dists.std()
-                feats["radius_max"] = dists.max()
-                feats["radius_min"] = dists.min()
-                feats["density"] = len(gnl_idx) / (np.pi * radius ** 2)
-                centroid = gnl_pos.mean(axis = 0)
-                offset = centroid - neuron_pos
-                feats["center_offset_norm"] = np.linalg.norm(offset)
-                cov = np.cov((gnl_pos - neuron_pos).T)
-                eigvals = np.linalg.eigvalsh(cov)
-                if np.min(eigvals) > 0:
-                    feats["anisotropy_ratio"] = np.max(eigvals) / np.min(eigvals)
-                else:
-                    feats["anisotropy_ratio"] = np.nan
-
-            features.append(feats)
-
-        spatial_df = pd.DataFrame(features, index = adata_neuron.obs_names)
-        return adata_neuron, spatial_df
-
-
-    # [MAIN] numpy array, neuron embeddings based on neighboring granules
-    def neuron_embedding_one_hot(adata_neuron, granule_adata, k = 10, radius = 10, loc_key = ["global_x", "global_y"], gnl_subtype_key = "granule_subtype_kmeans", padding_value = "Others"):
-
-        adata_neuron = adata_neuron.copy()
-        granule_adata = granule_adata.copy()
-
-        # neuron and granule coordinates, granule subtypes
-        neuron_coords = adata_neuron.obs[loc_key].to_numpy()
-        granule_coords = granule_adata.obs[loc_key].to_numpy()
-        granule_subtypes = granule_adata.obs[gnl_subtype_key].astype(str).to_numpy()
-
-        # include padding category
-        unique_subtypes = np.unique(granule_subtypes).tolist()
-        if padding_value not in unique_subtypes:
-            unique_subtypes.append(padding_value)
-
-        encoder = OneHotEncoder(categories = [unique_subtypes], sparse = False, handle_unknown = "ignore")
-        encoder.fit(np.array(unique_subtypes).reshape(-1, 1))
-        S = len(unique_subtypes)
-
-        # k-d tree
-        tree = make_tree(d1 = granule_coords[:, 0], d2 = granule_coords[:, 1])
-        distances, indices = tree.query(neuron_coords, k = k, distance_upper_bound = radius)
-
-        # initialize output
-        n_neurons = neuron_coords.shape[0]
-        embeddings = np.zeros((n_neurons, k, S), dtype = float)
-
-        for i in range(n_neurons):
-            for k in range(k):
-                idx = indices[i, k]
-                dist = distances[i, k]
-                if idx == granule_coords.shape[0] or np.isinf(dist):
-                    subtype = padding_value
-                else:
-                    subtype = granule_subtypes[idx]
-                onehot = encoder.transform([[subtype]])[0]
-                embeddings[i, k, :] = onehot
-
-        return embeddings, encoder.categories_[0]
-
-
-    # [MAIN] numpy array, neuron embeddings based on neighboring granules
-    def neuron_embedding_spatial_weight(adata_neuron, granule_adata, radius = 10, sigma = 10, loc_key = ["global_x", "global_y"], gnl_subtype_key = "granule_subtype_kmeans", padding_value = "Others"):
-
-        adata_neuron = adata_neuron.copy()
-        granule_adata = granule_adata.copy()
-
-        # neuron and granule coordinates, granule subtypes
-        neuron_coords = adata_neuron.obs[loc_key].to_numpy()
-        granule_coords = granule_adata.obs[loc_key].to_numpy()
-        granule_subtypes = granule_adata.obs[gnl_subtype_key].astype(str).to_numpy()
-
-        # include padding category
-        unique_subtypes = np.unique(granule_subtypes).tolist()
-        if padding_value not in unique_subtypes:
-            unique_subtypes.append(padding_value)
-
-        encoder = OneHotEncoder(categories = [unique_subtypes], sparse = False, handle_unknown = "ignore")
-        encoder.fit(np.array(unique_subtypes).reshape(-1, 1))
-        S = len(unique_subtypes)
-
-        # k-d tree
-        tree = make_tree(d1 = granule_coords[:, 0], d2 = granule_coords[:, 1])
-        all_neighbors = tree.query_ball_point(neuron_coords, r = radius)
-
-        # initialize output
-        n_neurons = neuron_coords.shape[0]
-        embeddings = np.zeros((n_neurons, S), dtype = float)
-
-        for i, neighbor_indices in enumerate(all_neighbors):
-            if not neighbor_indices:
-                # no neighbors, assign to padding subtype
-                embeddings[i] = encoder.transform([[padding_value]])[0]
-                continue
-
-            # get neighbor subtypes and distances
-            neighbor_coords = granule_coords[neighbor_indices]
-            dists = np.linalg.norm(neuron_coords[i] - neighbor_coords, axis = 1)
-            weights = np.exp(- dists / sigma)
-
-            # encode subtypes to one-hot and weight them
-            subtypes = granule_subtypes[neighbor_indices]
-            onehots = encoder.transform(subtypes.reshape(-1, 1))
-            weighted_sum = (weights[:, np.newaxis] * onehots).sum(axis = 0)
-
-            # normalize to make it a composition vector
-            embeddings[i] = weighted_sum / weights.sum()
-
-        return embeddings, encoder.categories_[0]
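The poisson_select method in the removed module (the same selection rule the package documents elsewhere) picks DBSCAN's min_samples as a Poisson quantile of the expected background transcript count in an eps-ball, floored at low_bound. A worked sketch with made-up transcript counts (all numbers below are illustrative, not taken from any dataset):

import numpy as np
from scipy.stats import poisson

# Hypothetical inputs: 50,000 transcripts of one marker over 5 mm^2 of tissue
num_trans, area = 50_000, 5_000_000.0          # area in um^2
eps, alpha, cutoff_prob, low_bound = 1.5, 5.0, 0.95, 3

bg_density = num_trans / area                  # transcripts per um^2
mu = alpha * bg_density * (np.pi * eps ** 2)   # scaled expected count in an eps-ball
min_samples = int(max(poisson.ppf(cutoff_prob, mu=mu), low_bound))
# Here mu ~= 0.35 and poisson.ppf(0.95, 0.35) = 1, so low_bound wins: min_samples = 3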
mcdetect-2.0.9.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-mcDETECT/__init__.py,sha256=AIqZz0ppjK5Erc3Po-5CXAnnZRxUi9Ny_0VNiqvt_0w,92
-mcDETECT/model.py,sha256=ZHNOWHdObat8SNZaZ47eMSKnHQ7NKEJxXCPBTurFRm0,29236
-mcDETECT/model_new_incorrect.py,sha256=MqJMAC4cyjux3BWIYWmGLugr_gHPeYcQNH-O40xbPHE,29398
-mcDETECT/utils.py,sha256=0gvqZV7sGi8qvvdC5x9XScyiTXlSfqbUt1zks4t7Xd4,4545
-mcdetect-2.0.9.dist-info/licenses/LICENSE,sha256=uxq-shEWOGTIGVnQLmpElILmfCkuUhFZRAMnZUiKvtg,1070
-mcdetect-2.0.9.dist-info/METADATA,sha256=rT8W7oUUD5m4G8sk7l6H2ItNTpia3x_TQah4rjLECl0,3016
-mcdetect-2.0.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mcdetect-2.0.9.dist-info/top_level.txt,sha256=WwzBojt5U-T2hZ8llO6XgpM9OFIBkWQQldQKu19O8EY,9
-mcdetect-2.0.9.dist-info/RECORD,,
{mcdetect-2.0.9.dist-info → mcdetect-2.0.10.dist-info}/WHEEL
File without changes
{mcdetect-2.0.9.dist-info → mcdetect-2.0.10.dist-info}/licenses/LICENSE
File without changes
{mcdetect-2.0.9.dist-info → mcdetect-2.0.10.dist-info}/top_level.txt
File without changes