mcDETECT 2.0.2__py3-none-any.whl → 2.0.4__py3-none-any.whl

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of mcDETECT might be problematic. Click here for more details.

mcDETECT/__init__.py CHANGED
@@ -1,12 +1,4 @@
1
- __version__ = "2.0.2"
2
-
3
- # from .utils import find_threshold_index, closest, make_tree, make_rtree, scale, weighted_corr, weighted_spearmanr, assign_palette_to_adata, p_val_to_star, top_columns_above_threshold
4
- # from .model import mcDETECT
5
- # from .model import spot_neuron, spot_granule, neighbor_granule, neuron_embedding_one_hot, neuron_embedding_spatial_weight
6
-
7
- # __all__ = ["mcDETECT",
8
- # "spot_neuron", "spot_granule", "neighbor_granule", "neuron_embedding_one_hot", "neuron_embedding_spatial_weight",
9
- # "find_threshold_index", "closest", "make_tree", "make_rtree", "scale", "weighted_corr", "weighted_spearmanr", "assign_palette_to_adata", "p_val_to_star", "top_columns_above_threshold"]
1
+ __version__ = "2.0.4"
10
2
 
11
3
  from . import model
12
4
  from . import utils
mcDETECT/model.py CHANGED
@@ -1,10 +1,11 @@
1
1
  import anndata
2
- import math
3
2
  import miniball
4
3
  import numpy as np
5
4
  import pandas as pd
6
5
  import scanpy as sc
6
+ from collections import Counter
7
7
  from rtree import index
8
+ from scipy.sparse import csr_matrix
8
9
  from scipy.spatial import cKDTree
9
10
  from scipy.stats import poisson
10
11
  from shapely.geometry import Point
@@ -18,12 +19,12 @@ from .utils import *
18
19
  class mcDETECT:
19
20
 
20
21
 
21
- def __init__(self, type, transcripts, syn_genes, nc_genes = None, eps = 1.5, minspl = None, grid_len = 1.0, cutoff_prob = 0.95, alpha = 5.0, low_bound = 3,
22
+ def __init__(self, type, transcripts, gnl_genes, nc_genes = None, eps = 1.5, minspl = None, grid_len = 1.0, cutoff_prob = 0.95, alpha = 5.0, low_bound = 3,
22
23
  size_thr = 4.0, in_nucleus_thr = (0.5, 0.5), l = 1.0, rho = 0.2, s = 1.0, nc_top = 20, nc_thr = 0.1):
23
24
 
24
25
  self.type = type # string, iST platform, now support MERSCOPE, Xenium, and CosMx
25
26
  self.transcripts = transcripts # dataframe, transcripts file
26
- self.syn_genes = syn_genes # list, string, all synaptic markers
27
+ self.gnl_genes = gnl_genes # list, string, all granule markers
27
28
  self.nc_genes = nc_genes # list, string, all negative controls
28
29
  self.eps = eps # numeric, searching radius epsilon
29
30
  self.minspl = minspl # integer, manually select min_samples, i.e., no automatic parameter selection
@@ -57,10 +58,11 @@ class mcDETECT:
57
58
 
58
59
  # [INNER] calculate tissue area, input for poisson_select()
59
60
  def tissue_area(self):
60
- x_bins, y_bins = self.construct_grid(grid_len = None)
61
- hist, _, _ = np.histogram2d(self.transcripts["global_x"], self.transcripts["global_y"], bins = [x_bins, y_bins])
62
- area = np.count_nonzero(hist) * (self.grid_len ** 2)
63
- return area
61
+ if not hasattr(self, "_cached_area"):
62
+ x_bins, y_bins = self.construct_grid(grid_len = None)
63
+ hist, _, _ = np.histogram2d(self.transcripts["global_x"], self.transcripts["global_y"], bins = [x_bins, y_bins])
64
+ self._cached_area = np.count_nonzero(hist) * (self.grid_len ** 2)
65
+ return self._cached_area
64
66
 
65
67
 
66
68
  # [INNER] calculate optimal min_samples, input for dbscan()
@@ -72,24 +74,26 @@ class mcDETECT:
72
74
  return optimal_m
73
75
 
74
76
 
75
- # [INTERMEDIATE] dictionary, low- and high-in-nucleus spheres for each synaptic marker
76
- def dbscan(self, target_names = None, write_csv = False, write_path = "./"):
77
+ # [INTERMEDIATE] dictionary, low- and high-in-nucleus spheres for each granule marker
78
+ def dbscan(self, target_names = None, record_cell_id = False, write_csv = False, write_path = "./"):
77
79
 
78
80
  if self.type != "Xenium":
79
81
  z_grid = list(self.transcripts["global_z"].unique())
80
82
  z_grid.sort()
81
83
 
82
84
  if target_names is None:
83
- target_names = self.syn_genes
85
+ target_names = self.gnl_genes
86
+
84
87
  transcripts = self.transcripts[self.transcripts["target"].isin(target_names)]
88
+ grouped = {g: df for g, df in transcripts.groupby("target")}
85
89
 
86
90
  num_individual, data_low, data_high = [], {}, {}
87
91
 
88
92
  for j in target_names:
89
93
 
90
94
  # split transcripts
91
- target = transcripts[transcripts["target"] == j]
92
- others = transcripts[transcripts["target"] != j]
95
+ target = grouped[j]
96
+ others = pd.concat([grouped[g] for g in target_names if g != j], ignore_index = True)
93
97
  tree = make_tree(d1 = np.array(others["global_x"]), d2 = np.array(others["global_y"]), d3 = np.array(others["global_z"]))
94
98
 
95
99
  # 3D DBSCAN
@@ -103,17 +107,25 @@ class mcDETECT:
103
107
  n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
104
108
 
105
109
  # iterate over all aggregations
106
- sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score = [], [], [], [], [], [], [], []
110
+ cell_id, sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score = [], [], [], [], [], [], [], [], []
107
111
 
108
112
  for k in range(n_clusters):
109
113
 
114
+ # record cell ids
115
+ if record_cell_id:
116
+ temp = target[labels == k]
117
+ temp_cell_id_mode = temp["cell_id"].mode()[0]
118
+ cell_id.append(temp_cell_id_mode)
119
+
110
120
  # find minimum enclosing spheres
111
- temp = target[labels == k]
112
- temp_in_nucleus = np.sum(temp["overlaps_nucleus"])
113
- temp_size = temp.shape[0]
114
- temp = temp[["global_x", "global_y", "global_z"]]
115
- temp = temp.drop_duplicates()
116
- center, r2 = miniball.get_bounding_ball(np.array(temp), epsilon=1e-8)
121
+ mask = (labels == k)
122
+ coords = X[mask]
123
+ if coords.shape[0] == 0:
124
+ continue
125
+ temp_in_nucleus = np.sum(target["overlaps_nucleus"].values[mask])
126
+ temp_size = coords.shape[0]
127
+ coords_unique = np.unique(coords, axis=0)
128
+ center, r2 = miniball.get_bounding_ball(coords_unique, epsilon=1e-8)
117
129
  if self.type != "Xenium":
118
130
  closest_z = closest(z_grid, center[2])
119
131
  else:
@@ -139,11 +151,13 @@ class mcDETECT:
139
151
  sphere_comp.append(total_comp)
140
152
  sphere_score.append(local_score)
141
153
 
142
- # basic features for all spheres from each synaptic marker
143
- sphere = pd.DataFrame(list(zip(sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score)),
144
- columns = ["sphere_x", "sphere_y", "sphere_z", "layer_z", "sphere_r", "size", "comp", "in_nucleus"])
145
- sphere["gene"] = [j] * sphere.shape[0]
146
- sphere = sphere.astype({"sphere_x": float, "sphere_y": float, "sphere_z": float, "layer_z": int, "sphere_r": float, "size": float, "comp": float, "in_nucleus": int, "gene": str})
154
+ # basic features for all spheres from each granule marker
155
+ sphere = pd.DataFrame(list(zip(sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score, [j] * len(sphere_x))),
156
+ columns = ["sphere_x", "sphere_y", "sphere_z", "layer_z", "sphere_r", "size", "comp", "in_nucleus", "gene"])
157
+ sphere = sphere.astype({"sphere_x": float, "sphere_y": float, "sphere_z": float, "layer_z": float, "sphere_r": float, "size": float, "comp": float, "in_nucleus": float, "gene": str})
158
+ if record_cell_id:
159
+ sphere["cell_id"] = cell_id
160
+ sphere = sphere.astype({"cell_id": str})
147
161
 
148
162
  # split low- and high-in-nucleus spheres
149
163
  sphere_low = sphere[(sphere["sphere_r"] < self.size_thr) & (sphere["in_nucleus"] < self.in_nucleus_thr[0])]
@@ -156,14 +170,14 @@ class mcDETECT:
156
170
  num_individual.append(sphere_low.shape[0])
157
171
  data_low[target_names.index(j)] = sphere_low
158
172
  data_high[target_names.index(j)] = sphere_high
159
- print("{} out of {} genes processed!".format(target_names.index(j) + 1, len(target_names)))
173
+ print(f"{target_names.index(j) + 1} / {len(target_names)} genes processed!")
160
174
 
161
175
  return np.sum(num_individual), data_low, data_high
162
176
 
163
177
 
164
178
  # [INNER] merge points from two overlapped spheres, input for remove_overlaps()
165
179
  def find_points(self, sphere_a, sphere_b):
166
- transcripts = self.transcripts[self.transcripts["target"].isin(self.syn_genes)]
180
+ transcripts = self.transcripts[self.transcripts["target"].isin(self.gnl_genes)]
167
181
  tree_temp = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))
168
182
  idx_a = tree_temp.query_ball_point([sphere_a["sphere_x"], sphere_a["sphere_y"], sphere_a["sphere_z"]], sphere_a["sphere_r"])
169
183
  points_a = transcripts.iloc[idx_a]
@@ -184,7 +198,7 @@ class mcDETECT:
184
198
  # find possible overlaps on 2D by r-tree
185
199
  idx_b = make_rtree(set_b)
186
200
  for i, sphere_a in set_a.iterrows():
187
- center_a_3D = (sphere_a.sphere_x, sphere_a.sphere_y, sphere_a.sphere_z)
201
+ center_a_3D = np.array([sphere_a.sphere_x, sphere_a.sphere_y, sphere_a.sphere_z])
188
202
  bounds_a = (sphere_a.sphere_x - sphere_a.sphere_r,
189
203
  sphere_a.sphere_y - sphere_a.sphere_r,
190
204
  sphere_a.sphere_x + sphere_a.sphere_r,
@@ -195,8 +209,8 @@ class mcDETECT:
195
209
  for j in possible_overlaps:
196
210
  if j in set_b.index:
197
211
  sphere_b = set_b.loc[j]
198
- center_b_3D = (sphere_b.sphere_x, sphere_b.sphere_y, sphere_b.sphere_z)
199
- dist = math.dist(center_a_3D, center_b_3D)
212
+ center_b_3D = np.array([sphere_b.sphere_x, sphere_b.sphere_y, sphere_b.sphere_z])
213
+ dist = np.linalg.norm(center_a_3D - center_b_3D)
200
214
  radius_sum = sphere_a.sphere_r + sphere_b.sphere_r
201
215
  radius_diff = sphere_a.sphere_r - sphere_b.sphere_r
202
216
 
@@ -227,10 +241,10 @@ class mcDETECT:
227
241
  return set_a, set_b
228
242
 
229
243
 
230
- # [INNER] merge spheres from different synaptic markers, input for detect()
244
+ # [INNER] merge spheres from different granule markers, input for detect()
231
245
  def merge_sphere(self, sphere_dict):
232
246
  sphere = sphere_dict[0].copy()
233
- for j in range(1, len(self.syn_genes)):
247
+ for j in range(1, len(self.gnl_genes)):
234
248
  target_sphere = sphere_dict[j]
235
249
  sphere, target_sphere_new = self.remove_overlaps(sphere, target_sphere)
236
250
  sphere = pd.concat([sphere, target_sphere_new])
@@ -268,23 +282,19 @@ class mcDETECT:
268
282
  # negative control filtering
269
283
  nc_transcripts_final = self.transcripts[self.transcripts["target"].isin(nc_genes_final)]
270
284
  tree = make_tree(d1 = np.array(nc_transcripts_final["global_x"]), d2 = np.array(nc_transcripts_final["global_y"]), d3 = np.array(nc_transcripts_final["global_z"]))
271
- pass_idx = [0] * sphere_low.shape[0]
272
- for i in range(sphere_low.shape[0]):
273
- temp = sphere_low.iloc[i]
274
- nc_idx = tree.query_ball_point([temp["sphere_x"], temp["sphere_y"], temp["sphere_z"]], temp["sphere_r"])
275
- if len(nc_idx) == 0:
276
- pass_idx[i] = 1
277
- elif len(nc_idx) / temp["size"] < self.nc_thr:
278
- pass_idx[i] = 2
279
- sphere = sphere_low[np.array(pass_idx) != 0]
280
- sphere = sphere.reset_index(drop = True)
285
+ centers = sphere_low[["sphere_x", "sphere_y", "sphere_z"]].to_numpy()
286
+ radii = sphere_low["sphere_r"].to_numpy()
287
+ sizes = sphere_low["size"].to_numpy()
288
+ counts = np.array([len(tree.query_ball_point(c, r)) for c, r in zip(centers, radii)])
289
+ pass_idx = (counts == 0) | (counts / sizes < self.nc_thr)
290
+ sphere = sphere_low[pass_idx].reset_index(drop = True)
281
291
  return sphere
282
292
 
283
293
 
284
- # [MAIN] dataframe, synapse metadata
285
- def detect(self):
294
+ # [MAIN] dataframe, granule metadata
295
+ def detect(self, record_cell_id = False):
286
296
 
287
- _, data_low, data_high = self.dbscan()
297
+ _, data_low, data_high = self.dbscan(record_cell_id = record_cell_id)
288
298
 
289
299
  print("Merging spheres...")
290
300
  sphere_low, sphere_high = self.merge_sphere(data_low), self.merge_sphere(data_high)
@@ -296,32 +306,44 @@ class mcDETECT:
296
306
  return self.nc_filter(sphere_low, sphere_high)
297
307
 
298
308
 
299
- # [MAIN] anndata, synapse spatial transcriptome profile
300
- def profile(self, synapse, genes = None, print_itr = False):
309
+ # [MAIN] anndata, granule spatial transcriptome profile
310
+ def profile(self, granule, genes = None, print_itr = False):
301
311
 
302
312
  if genes is None:
303
313
  genes = list(self.transcripts["target"].unique())
304
314
  transcripts = self.transcripts
305
315
  else:
306
316
  transcripts = self.transcripts[self.transcripts["target"].isin(genes)]
317
+
318
+ gene_to_idx = {g: i for i, g in enumerate(genes)}
319
+ gene_array = transcripts["target"].to_numpy()
307
320
  tree = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))
308
321
 
309
- # construct gene count matrix
310
- X = np.zeros((len(genes), synapse.shape[0]))
311
- for i in range(synapse.shape[0]):
312
- temp = synapse.iloc[i]
322
+ n_gnl = granule.shape[0]
323
+ n_gene = len(genes)
324
+ data, row_idx, col_idx = [], [], []
325
+
326
+ # iterate over all granules to count nearby transcripts
327
+ for i in range(n_gnl):
328
+ temp = granule.iloc[i]
313
329
  target_idx = tree.query_ball_point([temp["sphere_x"], temp["sphere_y"], temp["layer_z"]], temp["sphere_r"])
314
- target_trans = transcripts.iloc[target_idx]
315
- target_gene = list(target_trans["target"])
316
- for j in np.unique(target_gene):
317
- X[genes.index(j), i] = target_gene.count(j)
318
- if (print_itr) & (i % 5000 == 0):
319
- print("{} out of {} synapses profiled!".format(i, synapse.shape[0]))
330
+ if not target_idx:
331
+ continue
332
+ local_genes = gene_array[target_idx] # extract genes for those nearby transcripts
333
+ counts = Counter(local_genes) # count how many times each gene occurs
334
+ for g, cnt in counts.items(): # append nonzero entries to sparse matrix lists
335
+ j = gene_to_idx[g] # get gene column index
336
+ data.append(cnt) # nonzero count
337
+ row_idx.append(i) # row index = granule index
338
+ col_idx.append(j) # column index = gene index
339
+ if print_itr and (i % 5000 == 0):
340
+ print(f"{i} out of {n_gnl} granules profiled!")
320
341
 
321
- # construct spatial transcriptome profile
322
- adata = anndata.AnnData(X = np.transpose(X), obs = synapse)
323
- adata.obs["synapse_id"] = ["syn_{}".format(i) for i in range(synapse.shape[0])]
324
- adata.obs["synapse_id"] = adata.obs["synapse_id"].astype(str)
342
+ # construct sparse spatial transcriptome profile, (n_granules × n_genes)
343
+ X = csr_matrix((data, (row_idx, col_idx)), shape = (n_gnl, n_gene), dtype = np.float32)
344
+ adata = anndata.AnnData(X = X, obs = granule.copy())
345
+ adata.obs["granule_id"] = [f"gnl_{i}" for i in range(n_gnl)]
346
+ adata.obs = adata.obs.astype({"granule_id": str})
325
347
  adata.obs.rename(columns = {"sphere_x": "global_x", "sphere_y": "global_y", "sphere_z": "global_z"}, inplace = True)
326
348
  adata.var["genes"] = genes
327
349
  adata.var_names = genes
@@ -359,7 +381,7 @@ class mcDETECT:
359
381
  count_gene, _, _ = np.histogram2d(target_gene["global_x"], target_gene["global_y"], bins = [x_bins, y_bins])
360
382
  X[k_idx, :] = count_gene.flatten()
361
383
  if k_idx % 100 == 0:
362
- print("{} out of {} genes profiled!".format(k_idx, len(genes)))
384
+ print(f"{k_idx} out of {len(genes)} genes profiled!")
363
385
 
364
386
  # spot id
365
387
  spot_id = []
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mcDETECT
3
- Version: 2.0.2
3
+ Version: 2.0.4
4
4
  Summary: Uncovering the dark transcriptome in polarized neuronal compartments with mcDETECT
5
5
  Home-page: https://github.com/chen-yang-yuan/mcDETECT
6
6
  Author: Chenyang Yuan
@@ -0,0 +1,8 @@
1
+ mcDETECT/__init__.py,sha256=2gqfrrw4FNzDVBMDCXpfBwjDU_esM9r6VoW1_ru4rBs,92
2
+ mcDETECT/model.py,sha256=BJkarQR4wd6d0eb05wqhBTRT6ApJv9A8XwD5blv7c8k,29385
3
+ mcDETECT/utils.py,sha256=0gvqZV7sGi8qvvdC5x9XScyiTXlSfqbUt1zks4t7Xd4,4545
4
+ mcdetect-2.0.4.dist-info/licenses/LICENSE,sha256=uxq-shEWOGTIGVnQLmpElILmfCkuUhFZRAMnZUiKvtg,1070
5
+ mcdetect-2.0.4.dist-info/METADATA,sha256=thmYqmCQQ4AYQ2VULhMZjsJvAlg26ZrxoNvYvUK9_-c,3016
6
+ mcdetect-2.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
+ mcdetect-2.0.4.dist-info/top_level.txt,sha256=WwzBojt5U-T2hZ8llO6XgpM9OFIBkWQQldQKu19O8EY,9
8
+ mcdetect-2.0.4.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- mcDETECT/__init__.py,sha256=_kFD4ZyEYvyCNaOBJ1Wj3fOsRFigUdpaNkttzTr0TjY,783
2
- mcDETECT/model.py,sha256=zEdHqgwTjDi7HxdLW0aPG2j8uLMPiobNu-BcJraAG8g,28047
3
- mcDETECT/utils.py,sha256=0gvqZV7sGi8qvvdC5x9XScyiTXlSfqbUt1zks4t7Xd4,4545
4
- mcdetect-2.0.2.dist-info/licenses/LICENSE,sha256=uxq-shEWOGTIGVnQLmpElILmfCkuUhFZRAMnZUiKvtg,1070
5
- mcdetect-2.0.2.dist-info/METADATA,sha256=V0nxFJduH1coDW8F-Yv7vExY-3u7U6o3m7hWV1bCj0k,3016
6
- mcdetect-2.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
- mcdetect-2.0.2.dist-info/top_level.txt,sha256=WwzBojt5U-T2hZ8llO6XgpM9OFIBkWQQldQKu19O8EY,9
8
- mcdetect-2.0.2.dist-info/RECORD,,