mcDETECT 2.0.3__py3-none-any.whl → 2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcDETECT might be problematic.
- mcDETECT/__init__.py +1 -1
- mcDETECT/model.py +83 -60
- {mcdetect-2.0.3.dist-info → mcdetect-2.0.5.dist-info}/METADATA +1 -1
- mcdetect-2.0.5.dist-info/RECORD +8 -0
- mcdetect-2.0.3.dist-info/RECORD +0 -8
- {mcdetect-2.0.3.dist-info → mcdetect-2.0.5.dist-info}/WHEEL +0 -0
- {mcdetect-2.0.3.dist-info → mcdetect-2.0.5.dist-info}/licenses/LICENSE +0 -0
- {mcdetect-2.0.3.dist-info → mcdetect-2.0.5.dist-info}/top_level.txt +0 -0
mcDETECT/__init__.py
CHANGED
mcDETECT/model.py
CHANGED
@@ -4,7 +4,9 @@ import miniball
 import numpy as np
 import pandas as pd
 import scanpy as sc
+from collections import Counter
 from rtree import index
+from scipy.sparse import csr_matrix
 from scipy.spatial import cKDTree
 from scipy.stats import poisson
 from shapely.geometry import Point
@@ -18,12 +20,12 @@ from .utils import *
 class mcDETECT:


-    def __init__(self, type, transcripts,
+    def __init__(self, type, transcripts, gnl_genes, nc_genes = None, eps = 1.5, minspl = None, grid_len = 1.0, cutoff_prob = 0.95, alpha = 5.0, low_bound = 3,
                  size_thr = 4.0, in_nucleus_thr = (0.5, 0.5), l = 1.0, rho = 0.2, s = 1.0, nc_top = 20, nc_thr = 0.1):

         self.type = type # string, iST platform, now support MERSCOPE, Xenium, and CosMx
         self.transcripts = transcripts # dataframe, transcripts file
-        self.
+        self.gnl_genes = gnl_genes # list, string, all granule markers
         self.nc_genes = nc_genes # list, string, all negative controls
         self.eps = eps # numeric, searching radius epsilon
         self.minspl = minspl # integer, manually select min_samples, i.e., no automatic parameter selection
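For orientation, a minimal usage sketch of the new constructor signature. The import path is assumed from the package layout in the RECORD, the column names follow the transcripts fields referenced later in this diff, and the gene names and values are made-up placeholders, not real data:

# Hypothetical illustration of the 2.0.5 constructor; all values are placeholders.
import pandas as pd
from mcDETECT.model import mcDETECT   # import path assumed from the package layout

transcripts = pd.DataFrame({
    "global_x": [0.0, 1.2, 1.3],
    "global_y": [0.0, 0.8, 0.9],
    "global_z": [1.0, 1.0, 1.0],
    "target": ["GeneA", "GeneA", "GeneB"],
    "overlaps_nucleus": [0, 0, 1],    # used when splitting low/high in-nucleus spheres
    "cell_id": ["c1", "c1", "c2"],    # only needed when record_cell_id = True
})

model = mcDETECT(
    type = "Xenium",                  # iST platform string, per the comment in __init__
    transcripts = transcripts,
    gnl_genes = ["GeneA", "GeneB"],   # new in 2.0.5: list of granule marker genes
    nc_genes = None,                  # optional negative-control genes
)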
@@ -57,10 +59,11 @@ class mcDETECT:

     # [INNER] calculate tissue area, input for poisson_select()
     def tissue_area(self):
-
-
-
-
+        if not hasattr(self, "_cached_area"):
+            x_bins, y_bins = self.construct_grid(grid_len = None)
+            hist, _, _ = np.histogram2d(self.transcripts["global_x"], self.transcripts["global_y"], bins = [x_bins, y_bins])
+            self._cached_area = np.count_nonzero(hist) * (self.grid_len ** 2)
+        return self._cached_area


     # [INNER] calculate optimal min_samples, input for dbscan()
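The rewritten tissue_area() bins all transcript coordinates into a grid, counts the occupied cells, multiplies by the cell area, and caches the result on the instance so repeated calls are free. A standalone sketch of the same occupied-cell estimate on made-up points (grid length and coordinates are illustrative only):

# Illustrative occupied-grid-cell area estimate (toy coordinates, not package data).
import numpy as np

grid_len = 1.0
x = np.array([0.2, 0.4, 5.1, 5.3, 9.7])
y = np.array([0.1, 0.3, 2.2, 2.4, 8.8])

# grid edges spanning the data, one bin per grid_len
x_bins = np.arange(x.min(), x.max() + grid_len, grid_len)
y_bins = np.arange(y.min(), y.max() + grid_len, grid_len)

hist, _, _ = np.histogram2d(x, y, bins=[x_bins, y_bins])
area = np.count_nonzero(hist) * grid_len ** 2   # non-empty cells times cell area
print(area)                                     # 3 occupied cells -> 3.0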
@@ -72,24 +75,26 @@ class mcDETECT:
         return optimal_m


-    # [INTERMEDIATE] dictionary, low- and high-in-nucleus spheres for each
-    def dbscan(self, target_names = None, write_csv = False, write_path = "./"):
+    # [INTERMEDIATE] dictionary, low- and high-in-nucleus spheres for each granule marker
+    def dbscan(self, target_names = None, record_cell_id = False, write_csv = False, write_path = "./"):

         if self.type != "Xenium":
             z_grid = list(self.transcripts["global_z"].unique())
             z_grid.sort()

         if target_names is None:
-            target_names = self.
+            target_names = self.gnl_genes
+
         transcripts = self.transcripts[self.transcripts["target"].isin(target_names)]
+        grouped = {g: df for g, df in transcripts.groupby("target")}

         num_individual, data_low, data_high = [], {}, {}

         for j in target_names:

             # split transcripts
-            target =
-            others =
+            target = grouped[j]
+            others = pd.concat([grouped[g] for g in target_names if g != j], ignore_index = True)
             tree = make_tree(d1 = np.array(others["global_x"]), d2 = np.array(others["global_y"]), d3 = np.array(others["global_z"]))

             # 3D DBSCAN
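The new `grouped` dictionary splits the filtered transcripts by gene once, so each pass of the marker loop can pull `target` and assemble `others` from precomputed groups instead of re-filtering the full table. A toy illustration of that pattern (hypothetical gene names and coordinates):

# Toy illustration of the one-pass groupby split used in dbscan().
import pandas as pd

df = pd.DataFrame({
    "target": ["GeneA", "GeneB", "GeneA", "GeneC"],
    "global_x": [0.0, 1.0, 2.0, 3.0],
})
markers = ["GeneA", "GeneB", "GeneC"]

grouped = {g: sub for g, sub in df.groupby("target")}            # split once
for j in markers:
    target = grouped[j]                                          # rows for marker j
    others = pd.concat([grouped[g] for g in markers if g != j],
                       ignore_index=True)                        # rows for every other marker
    print(j, len(target), len(others))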
@@ -103,17 +108,25 @@ class mcDETECT:
             n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

             # iterate over all aggregations
-            sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score = [], [], [], [], [], [], [], []
+            cell_id, sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score = [], [], [], [], [], [], [], [], []

             for k in range(n_clusters):

+                # record cell ids
+                if record_cell_id:
+                    temp = target[labels == k]
+                    temp_cell_id_mode = temp["cell_id"].mode()[0]
+                    cell_id.append(temp_cell_id_mode)
+
                 # find minimum enclosing spheres
-
-
-
-
-
-
+                mask = (labels == k)
+                coords = X[mask]
+                if coords.shape[0] == 0:
+                    continue
+                temp_in_nucleus = np.sum(target["overlaps_nucleus"].values[mask])
+                temp_size = coords.shape[0]
+                coords_unique = np.unique(coords, axis=0)
+                center, r2 = miniball.get_bounding_ball(coords_unique, epsilon=1e-8)
                 if self.type != "Xenium":
                     closest_z = closest(z_grid, center[2])
                 else:
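Each DBSCAN cluster is now deduplicated and passed to `miniball.get_bounding_ball`, which returns the center and the squared radius of the minimum enclosing ball (hence the `r2` name). A toy sketch of that call on made-up points:

# Toy sketch of the minimum enclosing ball call applied to each cluster.
import numpy as np
import miniball

coords = np.array([[0.0, 0.0, 0.0],
                   [1.0, 0.0, 0.0],
                   [0.0, 1.0, 0.0],
                   [0.0, 0.0, 1.0]])
coords_unique = np.unique(coords, axis=0)    # deduplicate, as the updated model.py does

center, r2 = miniball.get_bounding_ball(coords_unique, epsilon=1e-8)
radius = np.sqrt(r2)                         # get_bounding_ball returns the squared radius
print(center, radius)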
@@ -139,11 +152,13 @@ class mcDETECT:
                 sphere_comp.append(total_comp)
                 sphere_score.append(local_score)

-            # basic features for all spheres from each
-            sphere = pd.DataFrame(list(zip(sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score)),
-
-            sphere
-
+            # basic features for all spheres from each granule marker
+            sphere = pd.DataFrame(list(zip(sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score, [j] * len(sphere_x))),
+                                  columns = ["sphere_x", "sphere_y", "sphere_z", "layer_z", "sphere_r", "size", "comp", "in_nucleus", "gene"])
+            sphere = sphere.astype({"sphere_x": float, "sphere_y": float, "sphere_z": float, "layer_z": float, "sphere_r": float, "size": float, "comp": float, "in_nucleus": float, "gene": str})
+            if record_cell_id:
+                sphere["cell_id"] = cell_id
+                sphere = sphere.astype({"cell_id": str})

             # split low- and high-in-nucleus spheres
             sphere_low = sphere[(sphere["sphere_r"] < self.size_thr) & (sphere["in_nucleus"] < self.in_nucleus_thr[0])]
@@ -156,14 +171,14 @@ class mcDETECT:
             num_individual.append(sphere_low.shape[0])
             data_low[target_names.index(j)] = sphere_low
             data_high[target_names.index(j)] = sphere_high
-            print("{
+            print(f"{target_names.index(j) + 1} / {len(target_names)} genes processed!")

         return np.sum(num_individual), data_low, data_high


     # [INNER] merge points from two overlapped spheres, input for remove_overlaps()
     def find_points(self, sphere_a, sphere_b):
-        transcripts = self.transcripts[self.transcripts["target"].isin(self.
+        transcripts = self.transcripts[self.transcripts["target"].isin(self.gnl_genes)]
         tree_temp = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))
         idx_a = tree_temp.query_ball_point([sphere_a["sphere_x"], sphere_a["sphere_y"], sphere_a["sphere_z"]], sphere_a["sphere_r"])
         points_a = transcripts.iloc[idx_a]
@@ -184,7 +199,7 @@ class mcDETECT:
         # find possible overlaps on 2D by r-tree
         idx_b = make_rtree(set_b)
         for i, sphere_a in set_a.iterrows():
-            center_a_3D = (sphere_a.sphere_x, sphere_a.sphere_y, sphere_a.sphere_z)
+            center_a_3D = np.array([sphere_a.sphere_x, sphere_a.sphere_y, sphere_a.sphere_z])
             bounds_a = (sphere_a.sphere_x - sphere_a.sphere_r,
                         sphere_a.sphere_y - sphere_a.sphere_r,
                         sphere_a.sphere_x + sphere_a.sphere_r,
@@ -195,8 +210,8 @@ class mcDETECT:
             for j in possible_overlaps:
                 if j in set_b.index:
                     sphere_b = set_b.loc[j]
-                    center_b_3D = (sphere_b.sphere_x, sphere_b.sphere_y, sphere_b.sphere_z)
-                    dist =
+                    center_b_3D = np.array([sphere_b.sphere_x, sphere_b.sphere_y, sphere_b.sphere_z])
+                    dist = np.linalg.norm(center_a_3D - center_b_3D)
                     radius_sum = sphere_a.sphere_r + sphere_b.sphere_r
                     radius_diff = sphere_a.sphere_r - sphere_b.sphere_r

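Storing the centers as NumPy arrays lets the center distance come from a single `np.linalg.norm` call, and the overlap test then compares that distance against the sum and difference of the radii. A worked toy check of that geometry; the exact branching inside remove_overlaps() is not shown in this hunk, so the conditions below are illustrative:

# Toy check of the distance-versus-radii geometry this hunk sets up
# (the package's actual branching is outside the hunk and may differ).
import numpy as np

center_a, r_a = np.array([0.0, 0.0, 0.0]), 2.0
center_b, r_b = np.array([3.0, 0.0, 0.0]), 1.5

dist = np.linalg.norm(center_a - center_b)   # 3.0
radius_sum = r_a + r_b                       # 3.5
radius_diff = r_a - r_b                      # 0.5

if dist >= radius_sum:
    print("disjoint")
elif dist <= abs(radius_diff):
    print("one sphere contains the other")
else:
    print("partial overlap")                 # printed here: 0.5 < 3.0 < 3.5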
@@ -227,10 +242,10 @@ class mcDETECT:
         return set_a, set_b


-    # [INNER] merge spheres from different
+    # [INNER] merge spheres from different granule markers, input for detect()
     def merge_sphere(self, sphere_dict):
         sphere = sphere_dict[0].copy()
-        for j in range(1, len(self.
+        for j in range(1, len(self.gnl_genes)):
             target_sphere = sphere_dict[j]
             sphere, target_sphere_new = self.remove_overlaps(sphere, target_sphere)
             sphere = pd.concat([sphere, target_sphere_new])
@@ -268,23 +283,19 @@ class mcDETECT:
         # negative control filtering
         nc_transcripts_final = self.transcripts[self.transcripts["target"].isin(nc_genes_final)]
         tree = make_tree(d1 = np.array(nc_transcripts_final["global_x"]), d2 = np.array(nc_transcripts_final["global_y"]), d3 = np.array(nc_transcripts_final["global_z"]))
-
-
-
-
-
-
-            elif len(nc_idx) / temp["size"] < self.nc_thr:
-                pass_idx[i] = 2
-        sphere = sphere_low[np.array(pass_idx) != 0]
-        sphere = sphere.reset_index(drop = True)
+        centers = sphere_low[["sphere_x", "sphere_y", "sphere_z"]].to_numpy()
+        radii = sphere_low["sphere_r"].to_numpy()
+        sizes = sphere_low["size"].to_numpy()
+        counts = np.array([len(tree.query_ball_point(c, r)) for c, r in zip(centers, radii)])
+        pass_idx = (counts == 0) | (counts / sizes < self.nc_thr)
+        sphere = sphere_low[pass_idx].reset_index(drop = True)
         return sphere


-    # [MAIN] dataframe,
-    def detect(self):
+    # [MAIN] dataframe, granule metadata
+    def detect(self, record_cell_id = False):

-        _, data_low, data_high = self.dbscan()
+        _, data_low, data_high = self.dbscan(record_cell_id = record_cell_id)

         print("Merging spheres...")
         sphere_low, sphere_high = self.merge_sphere(data_low), self.merge_sphere(data_high)
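detect() simply forwards the new flag to dbscan(). A hedged call sketch, assuming a `model` built as in the constructor example above; it is shown only for the call signature and is not meant to run on that toy table:

# Hypothetical call pattern for the new record_cell_id flag in 2.0.5.
granule = model.detect(record_cell_id=True)   # DataFrame of granule metadata
print(granule.columns.tolist())               # a "cell_id" column is expected when the flag is set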
@@ -296,32 +307,44 @@ class mcDETECT:
         return self.nc_filter(sphere_low, sphere_high)


-    # [MAIN] anndata,
-    def profile(self,
+    # [MAIN] anndata, granule spatial transcriptome profile
+    def profile(self, granule, genes = None, print_itr = False):

         if genes is None:
             genes = list(self.transcripts["target"].unique())
             transcripts = self.transcripts
         else:
             transcripts = self.transcripts[self.transcripts["target"].isin(genes)]
+
+        gene_to_idx = {g: i for i, g in enumerate(genes)}
+        gene_array = transcripts["target"].to_numpy()
         tree = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))

-
-
-
-
+        n_gnl = granule.shape[0]
+        n_gene = len(genes)
+        data, row_idx, col_idx = [], [], []
+
+        # iterate over all granules to count nearby transcripts
+        for i in range(n_gnl):
+            temp = granule.iloc[i]
             target_idx = tree.query_ball_point([temp["sphere_x"], temp["sphere_y"], temp["layer_z"]], temp["sphere_r"])
-
-
-            for
-
-
-
+            if not target_idx:
+                continue
+            local_genes = gene_array[target_idx] # extract genes for those nearby transcripts
+            counts = Counter(local_genes) # count how many times each gene occurs
+            for g, cnt in counts.items(): # append nonzero entries to sparse matrix lists
+                j = gene_to_idx[g] # get gene column index
+                data.append(cnt) # nonzero count
+                row_idx.append(i) # row index = granule index
+                col_idx.append(j) # column index = gene index
+            if print_itr and (i % 5000 == 0):
+                print(f"{i} out of {n_gnl} granules profiled!")

-        # construct spatial transcriptome profile
-
-        adata
-        adata.obs["
+        # construct sparse spatial transcriptome profile, (n_granules × n_genes)
+        X = csr_matrix((data, (row_idx, col_idx)), shape = (n_gnl, n_gene), dtype = np.float32)
+        adata = anndata.AnnData(X = X, obs = granule.copy())
+        adata.obs["granule_id"] = [f"gnl_{i}" for i in range(n_gnl)]
+        adata.obs = adata.obs.astype({"granule_id": str})
         adata.obs.rename(columns = {"sphere_x": "global_x", "sphere_y": "global_y", "sphere_z": "global_z"}, inplace = True)
         adata.var["genes"] = genes
         adata.var_names = genes
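The reworked profile() accumulates (granule, gene, count) triplets and builds the expression matrix with scipy.sparse.csr_matrix before wrapping it in an AnnData object, so zero counts are never materialized. A minimal standalone sketch of that construction with toy counts (two granules by three hypothetical genes):

# Toy sketch of the triplet -> csr_matrix -> AnnData construction used in profile().
import numpy as np
import pandas as pd
import anndata
from scipy.sparse import csr_matrix

genes = ["GeneA", "GeneB", "GeneC"]
# granule 0: GeneA = 2, GeneC = 1; granule 1: GeneB = 4
data, row_idx, col_idx = [2, 1, 4], [0, 0, 1], [0, 2, 1]

X = csr_matrix((data, (row_idx, col_idx)), shape=(2, len(genes)), dtype=np.float32)

obs = pd.DataFrame({"sphere_r": [1.0, 1.3]})   # stand-in for the granule metadata
adata = anndata.AnnData(X=X, obs=obs)
adata.var_names = genes
print(adata.X.toarray())                       # [[2. 0. 1.] [0. 4. 0.]]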
@@ -359,7 +382,7 @@ class mcDETECT:
             count_gene, _, _ = np.histogram2d(target_gene["global_x"], target_gene["global_y"], bins = [x_bins, y_bins])
             X[k_idx, :] = count_gene.flatten()
             if k_idx % 100 == 0:
-                print("{} out of {} genes profiled!"
+                print(f"{k_idx} out of {len(genes)} genes profiled!")

         # spot id
         spot_id = []
mcdetect-2.0.5.dist-info/RECORD
ADDED

@@ -0,0 +1,8 @@
+mcDETECT/__init__.py,sha256=GbRiy2Zt7JccZDK0rFa5ge7kE9r1L4bERDgQQ1e8QpQ,92
+mcDETECT/model.py,sha256=9V1uNag4tur-JW5MWIPEVyy9yrADxsFR-HpbgU1lkgk,29397
+mcDETECT/utils.py,sha256=0gvqZV7sGi8qvvdC5x9XScyiTXlSfqbUt1zks4t7Xd4,4545
+mcdetect-2.0.5.dist-info/licenses/LICENSE,sha256=uxq-shEWOGTIGVnQLmpElILmfCkuUhFZRAMnZUiKvtg,1070
+mcdetect-2.0.5.dist-info/METADATA,sha256=QE2OBc5Qu18c1iopwx13GkJTp3PEHxpVhX-vo5KccSw,3016
+mcdetect-2.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mcdetect-2.0.5.dist-info/top_level.txt,sha256=WwzBojt5U-T2hZ8llO6XgpM9OFIBkWQQldQKu19O8EY,9
+mcdetect-2.0.5.dist-info/RECORD,,
mcdetect-2.0.3.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-mcDETECT/__init__.py,sha256=SPCzZZOrSFKUNUYRrFbrBWF0FPN6OUzUpRP4zjlfQr0,92
-mcDETECT/model.py,sha256=zEdHqgwTjDi7HxdLW0aPG2j8uLMPiobNu-BcJraAG8g,28047
-mcDETECT/utils.py,sha256=0gvqZV7sGi8qvvdC5x9XScyiTXlSfqbUt1zks4t7Xd4,4545
-mcdetect-2.0.3.dist-info/licenses/LICENSE,sha256=uxq-shEWOGTIGVnQLmpElILmfCkuUhFZRAMnZUiKvtg,1070
-mcdetect-2.0.3.dist-info/METADATA,sha256=1ny7qrjmE9p1Ybgmw3k4QnVJSKlXVJR4nlBNPxj3RCU,3016
-mcdetect-2.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mcdetect-2.0.3.dist-info/top_level.txt,sha256=WwzBojt5U-T2hZ8llO6XgpM9OFIBkWQQldQKu19O8EY,9
-mcdetect-2.0.3.dist-info/RECORD,,
{mcdetect-2.0.3.dist-info → mcdetect-2.0.5.dist-info}/WHEEL
File without changes

{mcdetect-2.0.3.dist-info → mcdetect-2.0.5.dist-info}/licenses/LICENSE
File without changes

{mcdetect-2.0.3.dist-info → mcdetect-2.0.5.dist-info}/top_level.txt
File without changes