mcDETECT 2.0.3__py3-none-any.whl → 2.0.5__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of mcDETECT might be problematic. Click here for more details.

mcDETECT/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- __version__ = "2.0.3"
1
+ __version__ = "2.0.5"
2
2
 
3
3
  from . import model
4
4
  from . import utils
mcDETECT/model.py CHANGED
@@ -4,7 +4,9 @@ import miniball
4
4
  import numpy as np
5
5
  import pandas as pd
6
6
  import scanpy as sc
7
+ from collections import Counter
7
8
  from rtree import index
9
+ from scipy.sparse import csr_matrix
8
10
  from scipy.spatial import cKDTree
9
11
  from scipy.stats import poisson
10
12
  from shapely.geometry import Point
@@ -18,12 +20,12 @@ from .utils import *
18
20
  class mcDETECT:
19
21
 
20
22
 
21
- def __init__(self, type, transcripts, syn_genes, nc_genes = None, eps = 1.5, minspl = None, grid_len = 1.0, cutoff_prob = 0.95, alpha = 5.0, low_bound = 3,
23
+ def __init__(self, type, transcripts, gnl_genes, nc_genes = None, eps = 1.5, minspl = None, grid_len = 1.0, cutoff_prob = 0.95, alpha = 5.0, low_bound = 3,
22
24
  size_thr = 4.0, in_nucleus_thr = (0.5, 0.5), l = 1.0, rho = 0.2, s = 1.0, nc_top = 20, nc_thr = 0.1):
23
25
 
24
26
  self.type = type # string, iST platform, now support MERSCOPE, Xenium, and CosMx
25
27
  self.transcripts = transcripts # dataframe, transcripts file
26
- self.syn_genes = syn_genes # list, string, all synaptic markers
28
+ self.gnl_genes = gnl_genes # list, string, all granule markers
27
29
  self.nc_genes = nc_genes # list, string, all negative controls
28
30
  self.eps = eps # numeric, searching radius epsilon
29
31
  self.minspl = minspl # integer, manually select min_samples, i.e., no automatic parameter selection
@@ -57,10 +59,11 @@ class mcDETECT:
57
59
 
58
60
  # [INNER] calculate tissue area, input for poisson_select()
59
61
  def tissue_area(self):
60
- x_bins, y_bins = self.construct_grid(grid_len = None)
61
- hist, _, _ = np.histogram2d(self.transcripts["global_x"], self.transcripts["global_y"], bins = [x_bins, y_bins])
62
- area = np.count_nonzero(hist) * (self.grid_len ** 2)
63
- return area
62
+ if not hasattr(self, "_cached_area"):
63
+ x_bins, y_bins = self.construct_grid(grid_len = None)
64
+ hist, _, _ = np.histogram2d(self.transcripts["global_x"], self.transcripts["global_y"], bins = [x_bins, y_bins])
65
+ self._cached_area = np.count_nonzero(hist) * (self.grid_len ** 2)
66
+ return self._cached_area
64
67
 
65
68
 
66
69
  # [INNER] calculate optimal min_samples, input for dbscan()
@@ -72,24 +75,26 @@ class mcDETECT:
72
75
  return optimal_m
73
76
 
74
77
 
75
- # [INTERMEDIATE] dictionary, low- and high-in-nucleus spheres for each synaptic marker
76
- def dbscan(self, target_names = None, write_csv = False, write_path = "./"):
78
+ # [INTERMEDIATE] dictionary, low- and high-in-nucleus spheres for each granule marker
79
+ def dbscan(self, target_names = None, record_cell_id = False, write_csv = False, write_path = "./"):
77
80
 
78
81
  if self.type != "Xenium":
79
82
  z_grid = list(self.transcripts["global_z"].unique())
80
83
  z_grid.sort()
81
84
 
82
85
  if target_names is None:
83
- target_names = self.syn_genes
86
+ target_names = self.gnl_genes
87
+
84
88
  transcripts = self.transcripts[self.transcripts["target"].isin(target_names)]
89
+ grouped = {g: df for g, df in transcripts.groupby("target")}
85
90
 
86
91
  num_individual, data_low, data_high = [], {}, {}
87
92
 
88
93
  for j in target_names:
89
94
 
90
95
  # split transcripts
91
- target = transcripts[transcripts["target"] == j]
92
- others = transcripts[transcripts["target"] != j]
96
+ target = grouped[j]
97
+ others = pd.concat([grouped[g] for g in target_names if g != j], ignore_index = True)
93
98
  tree = make_tree(d1 = np.array(others["global_x"]), d2 = np.array(others["global_y"]), d3 = np.array(others["global_z"]))
94
99
 
95
100
  # 3D DBSCAN
@@ -103,17 +108,25 @@ class mcDETECT:
103
108
  n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
104
109
 
105
110
  # iterate over all aggregations
106
- sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score = [], [], [], [], [], [], [], []
111
+ cell_id, sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score = [], [], [], [], [], [], [], [], []
107
112
 
108
113
  for k in range(n_clusters):
109
114
 
115
+ # record cell ids
116
+ if record_cell_id:
117
+ temp = target[labels == k]
118
+ temp_cell_id_mode = temp["cell_id"].mode()[0]
119
+ cell_id.append(temp_cell_id_mode)
120
+
110
121
  # find minimum enclosing spheres
111
- temp = target[labels == k]
112
- temp_in_nucleus = np.sum(temp["overlaps_nucleus"])
113
- temp_size = temp.shape[0]
114
- temp = temp[["global_x", "global_y", "global_z"]]
115
- temp = temp.drop_duplicates()
116
- center, r2 = miniball.get_bounding_ball(np.array(temp), epsilon=1e-8)
122
+ mask = (labels == k)
123
+ coords = X[mask]
124
+ if coords.shape[0] == 0:
125
+ continue
126
+ temp_in_nucleus = np.sum(target["overlaps_nucleus"].values[mask])
127
+ temp_size = coords.shape[0]
128
+ coords_unique = np.unique(coords, axis=0)
129
+ center, r2 = miniball.get_bounding_ball(coords_unique, epsilon=1e-8)
117
130
  if self.type != "Xenium":
118
131
  closest_z = closest(z_grid, center[2])
119
132
  else:
@@ -139,11 +152,13 @@ class mcDETECT:
139
152
  sphere_comp.append(total_comp)
140
153
  sphere_score.append(local_score)
141
154
 
142
- # basic features for all spheres from each synaptic marker
143
- sphere = pd.DataFrame(list(zip(sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score)),
144
- columns = ["sphere_x", "sphere_y", "sphere_z", "layer_z", "sphere_r", "size", "comp", "in_nucleus"])
145
- sphere["gene"] = [j] * sphere.shape[0]
146
- sphere = sphere.astype({"sphere_x": float, "sphere_y": float, "sphere_z": float, "layer_z": int, "sphere_r": float, "size": float, "comp": float, "in_nucleus": int, "gene": str})
155
+ # basic features for all spheres from each granule marker
156
+ sphere = pd.DataFrame(list(zip(sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score, [j] * len(sphere_x))),
157
+ columns = ["sphere_x", "sphere_y", "sphere_z", "layer_z", "sphere_r", "size", "comp", "in_nucleus", "gene"])
158
+ sphere = sphere.astype({"sphere_x": float, "sphere_y": float, "sphere_z": float, "layer_z": float, "sphere_r": float, "size": float, "comp": float, "in_nucleus": float, "gene": str})
159
+ if record_cell_id:
160
+ sphere["cell_id"] = cell_id
161
+ sphere = sphere.astype({"cell_id": str})
147
162
 
148
163
  # split low- and high-in-nucleus spheres
149
164
  sphere_low = sphere[(sphere["sphere_r"] < self.size_thr) & (sphere["in_nucleus"] < self.in_nucleus_thr[0])]
@@ -156,14 +171,14 @@ class mcDETECT:
156
171
  num_individual.append(sphere_low.shape[0])
157
172
  data_low[target_names.index(j)] = sphere_low
158
173
  data_high[target_names.index(j)] = sphere_high
159
- print("{} out of {} genes processed!".format(target_names.index(j) + 1, len(target_names)))
174
+ print(f"{target_names.index(j) + 1} / {len(target_names)} genes processed!")
160
175
 
161
176
  return np.sum(num_individual), data_low, data_high
162
177
 
163
178
 
164
179
  # [INNER] merge points from two overlapped spheres, input for remove_overlaps()
165
180
  def find_points(self, sphere_a, sphere_b):
166
- transcripts = self.transcripts[self.transcripts["target"].isin(self.syn_genes)]
181
+ transcripts = self.transcripts[self.transcripts["target"].isin(self.gnl_genes)]
167
182
  tree_temp = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))
168
183
  idx_a = tree_temp.query_ball_point([sphere_a["sphere_x"], sphere_a["sphere_y"], sphere_a["sphere_z"]], sphere_a["sphere_r"])
169
184
  points_a = transcripts.iloc[idx_a]
@@ -184,7 +199,7 @@ class mcDETECT:
184
199
  # find possible overlaps on 2D by r-tree
185
200
  idx_b = make_rtree(set_b)
186
201
  for i, sphere_a in set_a.iterrows():
187
- center_a_3D = (sphere_a.sphere_x, sphere_a.sphere_y, sphere_a.sphere_z)
202
+ center_a_3D = np.array([sphere_a.sphere_x, sphere_a.sphere_y, sphere_a.sphere_z])
188
203
  bounds_a = (sphere_a.sphere_x - sphere_a.sphere_r,
189
204
  sphere_a.sphere_y - sphere_a.sphere_r,
190
205
  sphere_a.sphere_x + sphere_a.sphere_r,
@@ -195,8 +210,8 @@ class mcDETECT:
195
210
  for j in possible_overlaps:
196
211
  if j in set_b.index:
197
212
  sphere_b = set_b.loc[j]
198
- center_b_3D = (sphere_b.sphere_x, sphere_b.sphere_y, sphere_b.sphere_z)
199
- dist = math.dist(center_a_3D, center_b_3D)
213
+ center_b_3D = np.array([sphere_b.sphere_x, sphere_b.sphere_y, sphere_b.sphere_z])
214
+ dist = np.linalg.norm(center_a_3D - center_b_3D)
200
215
  radius_sum = sphere_a.sphere_r + sphere_b.sphere_r
201
216
  radius_diff = sphere_a.sphere_r - sphere_b.sphere_r
202
217
 
@@ -227,10 +242,10 @@ class mcDETECT:
227
242
  return set_a, set_b
228
243
 
229
244
 
230
- # [INNER] merge spheres from different synaptic markers, input for detect()
245
+ # [INNER] merge spheres from different granule markers, input for detect()
231
246
  def merge_sphere(self, sphere_dict):
232
247
  sphere = sphere_dict[0].copy()
233
- for j in range(1, len(self.syn_genes)):
248
+ for j in range(1, len(self.gnl_genes)):
234
249
  target_sphere = sphere_dict[j]
235
250
  sphere, target_sphere_new = self.remove_overlaps(sphere, target_sphere)
236
251
  sphere = pd.concat([sphere, target_sphere_new])
@@ -268,23 +283,19 @@ class mcDETECT:
268
283
  # negative control filtering
269
284
  nc_transcripts_final = self.transcripts[self.transcripts["target"].isin(nc_genes_final)]
270
285
  tree = make_tree(d1 = np.array(nc_transcripts_final["global_x"]), d2 = np.array(nc_transcripts_final["global_y"]), d3 = np.array(nc_transcripts_final["global_z"]))
271
- pass_idx = [0] * sphere_low.shape[0]
272
- for i in range(sphere_low.shape[0]):
273
- temp = sphere_low.iloc[i]
274
- nc_idx = tree.query_ball_point([temp["sphere_x"], temp["sphere_y"], temp["sphere_z"]], temp["sphere_r"])
275
- if len(nc_idx) == 0:
276
- pass_idx[i] = 1
277
- elif len(nc_idx) / temp["size"] < self.nc_thr:
278
- pass_idx[i] = 2
279
- sphere = sphere_low[np.array(pass_idx) != 0]
280
- sphere = sphere.reset_index(drop = True)
286
+ centers = sphere_low[["sphere_x", "sphere_y", "sphere_z"]].to_numpy()
287
+ radii = sphere_low["sphere_r"].to_numpy()
288
+ sizes = sphere_low["size"].to_numpy()
289
+ counts = np.array([len(tree.query_ball_point(c, r)) for c, r in zip(centers, radii)])
290
+ pass_idx = (counts == 0) | (counts / sizes < self.nc_thr)
291
+ sphere = sphere_low[pass_idx].reset_index(drop = True)
281
292
  return sphere
282
293
 
283
294
 
284
- # [MAIN] dataframe, synapse metadata
285
- def detect(self):
295
+ # [MAIN] dataframe, granule metadata
296
+ def detect(self, record_cell_id = False):
286
297
 
287
- _, data_low, data_high = self.dbscan()
298
+ _, data_low, data_high = self.dbscan(record_cell_id = record_cell_id)
288
299
 
289
300
  print("Merging spheres...")
290
301
  sphere_low, sphere_high = self.merge_sphere(data_low), self.merge_sphere(data_high)
@@ -296,32 +307,44 @@ class mcDETECT:
296
307
  return self.nc_filter(sphere_low, sphere_high)
297
308
 
298
309
 
299
- # [MAIN] anndata, synapse spatial transcriptome profile
300
- def profile(self, synapse, genes = None, print_itr = False):
310
+ # [MAIN] anndata, granule spatial transcriptome profile
311
+ def profile(self, granule, genes = None, print_itr = False):
301
312
 
302
313
  if genes is None:
303
314
  genes = list(self.transcripts["target"].unique())
304
315
  transcripts = self.transcripts
305
316
  else:
306
317
  transcripts = self.transcripts[self.transcripts["target"].isin(genes)]
318
+
319
+ gene_to_idx = {g: i for i, g in enumerate(genes)}
320
+ gene_array = transcripts["target"].to_numpy()
307
321
  tree = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))
308
322
 
309
- # construct gene count matrix
310
- X = np.zeros((len(genes), synapse.shape[0]))
311
- for i in range(synapse.shape[0]):
312
- temp = synapse.iloc[i]
323
+ n_gnl = granule.shape[0]
324
+ n_gene = len(genes)
325
+ data, row_idx, col_idx = [], [], []
326
+
327
+ # iterate over all granules to count nearby transcripts
328
+ for i in range(n_gnl):
329
+ temp = granule.iloc[i]
313
330
  target_idx = tree.query_ball_point([temp["sphere_x"], temp["sphere_y"], temp["layer_z"]], temp["sphere_r"])
314
- target_trans = transcripts.iloc[target_idx]
315
- target_gene = list(target_trans["target"])
316
- for j in np.unique(target_gene):
317
- X[genes.index(j), i] = target_gene.count(j)
318
- if (print_itr) & (i % 5000 == 0):
319
- print("{} out of {} synapses profiled!".format(i, synapse.shape[0]))
331
+ if not target_idx:
332
+ continue
333
+ local_genes = gene_array[target_idx] # extract genes for those nearby transcripts
334
+ counts = Counter(local_genes) # count how many times each gene occurs
335
+ for g, cnt in counts.items(): # append nonzero entries to sparse matrix lists
336
+ j = gene_to_idx[g] # get gene column index
337
+ data.append(cnt) # nonzero count
338
+ row_idx.append(i) # row index = granule index
339
+ col_idx.append(j) # column index = gene index
340
+ if print_itr and (i % 5000 == 0):
341
+ print(f"{i} out of {n_gnl} granules profiled!")
320
342
 
321
- # construct spatial transcriptome profile
322
- adata = anndata.AnnData(X = np.transpose(X), obs = synapse)
323
- adata.obs["synapse_id"] = ["syn_{}".format(i) for i in range(synapse.shape[0])]
324
- adata.obs["synapse_id"] = adata.obs["synapse_id"].astype(str)
343
+ # construct sparse spatial transcriptome profile, (n_granules × n_genes)
344
+ X = csr_matrix((data, (row_idx, col_idx)), shape = (n_gnl, n_gene), dtype = np.float32)
345
+ adata = anndata.AnnData(X = X, obs = granule.copy())
346
+ adata.obs["granule_id"] = [f"gnl_{i}" for i in range(n_gnl)]
347
+ adata.obs = adata.obs.astype({"granule_id": str})
325
348
  adata.obs.rename(columns = {"sphere_x": "global_x", "sphere_y": "global_y", "sphere_z": "global_z"}, inplace = True)
326
349
  adata.var["genes"] = genes
327
350
  adata.var_names = genes
@@ -359,7 +382,7 @@ class mcDETECT:
359
382
  count_gene, _, _ = np.histogram2d(target_gene["global_x"], target_gene["global_y"], bins = [x_bins, y_bins])
360
383
  X[k_idx, :] = count_gene.flatten()
361
384
  if k_idx % 100 == 0:
362
- print("{} out of {} genes profiled!".format(k_idx, len(genes)))
385
+ print(f"{k_idx} out of {len(genes)} genes profiled!")
363
386
 
364
387
  # spot id
365
388
  spot_id = []
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mcDETECT
3
- Version: 2.0.3
3
+ Version: 2.0.5
4
4
  Summary: Uncovering the dark transcriptome in polarized neuronal compartments with mcDETECT
5
5
  Home-page: https://github.com/chen-yang-yuan/mcDETECT
6
6
  Author: Chenyang Yuan
@@ -0,0 +1,8 @@
1
+ mcDETECT/__init__.py,sha256=GbRiy2Zt7JccZDK0rFa5ge7kE9r1L4bERDgQQ1e8QpQ,92
2
+ mcDETECT/model.py,sha256=9V1uNag4tur-JW5MWIPEVyy9yrADxsFR-HpbgU1lkgk,29397
3
+ mcDETECT/utils.py,sha256=0gvqZV7sGi8qvvdC5x9XScyiTXlSfqbUt1zks4t7Xd4,4545
4
+ mcdetect-2.0.5.dist-info/licenses/LICENSE,sha256=uxq-shEWOGTIGVnQLmpElILmfCkuUhFZRAMnZUiKvtg,1070
5
+ mcdetect-2.0.5.dist-info/METADATA,sha256=QE2OBc5Qu18c1iopwx13GkJTp3PEHxpVhX-vo5KccSw,3016
6
+ mcdetect-2.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
+ mcdetect-2.0.5.dist-info/top_level.txt,sha256=WwzBojt5U-T2hZ8llO6XgpM9OFIBkWQQldQKu19O8EY,9
8
+ mcdetect-2.0.5.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- mcDETECT/__init__.py,sha256=SPCzZZOrSFKUNUYRrFbrBWF0FPN6OUzUpRP4zjlfQr0,92
2
- mcDETECT/model.py,sha256=zEdHqgwTjDi7HxdLW0aPG2j8uLMPiobNu-BcJraAG8g,28047
3
- mcDETECT/utils.py,sha256=0gvqZV7sGi8qvvdC5x9XScyiTXlSfqbUt1zks4t7Xd4,4545
4
- mcdetect-2.0.3.dist-info/licenses/LICENSE,sha256=uxq-shEWOGTIGVnQLmpElILmfCkuUhFZRAMnZUiKvtg,1070
5
- mcdetect-2.0.3.dist-info/METADATA,sha256=1ny7qrjmE9p1Ybgmw3k4QnVJSKlXVJR4nlBNPxj3RCU,3016
6
- mcdetect-2.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
- mcdetect-2.0.3.dist-info/top_level.txt,sha256=WwzBojt5U-T2hZ8llO6XgpM9OFIBkWQQldQKu19O8EY,9
8
- mcdetect-2.0.3.dist-info/RECORD,,