PyPI - mcDETECT - Versions diffs - 2.0.3__py3-none-any.whl → 2.0.5__py3-none-any.whl - Mend

mcDETECT 2.0.3__py3-none-any.whl → 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mcDETECT might be problematic. Click here for more details.

Files changed (8) hide show

mcDETECT/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-__version__ = "2.0.3"
+__version__ = "2.0.5"
 from . import model
 from . import utils

mcDETECT/model.py CHANGED Viewed

@@ -4,7 +4,9 @@ import miniball
 import numpy as np
 import pandas as pd
 import scanpy as sc
+from collections import Counter
 from rtree import index
+from scipy.sparse import csr_matrix
 from scipy.spatial import cKDTree
 from scipy.stats import poisson
 from shapely.geometry import Point
@@ -18,12 +20,12 @@ from .utils import *
 class mcDETECT:
-    def __init__(self, type, transcripts, syn_genes, nc_genes = None, eps = 1.5, minspl = None, grid_len = 1.0, cutoff_prob = 0.95, alpha = 5.0, low_bound = 3,
+    def __init__(self, type, transcripts, gnl_genes, nc_genes = None, eps = 1.5, minspl = None, grid_len = 1.0, cutoff_prob = 0.95, alpha = 5.0, low_bound = 3,
                  size_thr = 4.0, in_nucleus_thr = (0.5, 0.5), l = 1.0, rho = 0.2, s = 1.0, nc_top = 20, nc_thr = 0.1):
         self.type = type                        # string, iST platform, now support MERSCOPE, Xenium, and CosMx
         self.transcripts = transcripts          # dataframe, transcripts file
-        self.syn_genes = syn_genes              # list, string, all synaptic markers
+        self.gnl_genes = gnl_genes              # list, string, all granule markers
         self.nc_genes = nc_genes                # list, string, all negative controls
         self.eps = eps                          # numeric, searching radius epsilon
         self.minspl = minspl                    # integer, manually select min_samples, i.e., no automatic parameter selection
@@ -57,10 +59,11 @@ class mcDETECT:
     # [INNER] calculate tissue area, input for poisson_select()
     def tissue_area(self):
-        x_bins, y_bins = self.construct_grid(grid_len = None)
-        hist, _, _ = np.histogram2d(self.transcripts["global_x"], self.transcripts["global_y"], bins = [x_bins, y_bins])
-        area = np.count_nonzero(hist) * (self.grid_len ** 2)
-        return area
+        if not hasattr(self, "_cached_area"):
+            x_bins, y_bins = self.construct_grid(grid_len = None)
+            hist, _, _ = np.histogram2d(self.transcripts["global_x"], self.transcripts["global_y"], bins = [x_bins, y_bins])
+            self._cached_area = np.count_nonzero(hist) * (self.grid_len ** 2)
+        return self._cached_area
     # [INNER] calculate optimal min_samples, input for dbscan()
@@ -72,24 +75,26 @@ class mcDETECT:
         return optimal_m
-    # [INTERMEDIATE] dictionary, low- and high-in-nucleus spheres for each synaptic marker
-    def dbscan(self, target_names = None, write_csv = False, write_path = "./"):
+    # [INTERMEDIATE] dictionary, low- and high-in-nucleus spheres for each granule marker
+    def dbscan(self, target_names = None, record_cell_id = False, write_csv = False, write_path = "./"):
         if self.type != "Xenium":
             z_grid = list(self.transcripts["global_z"].unique())
             z_grid.sort()
         if target_names is None:
-            target_names = self.syn_genes
+            target_names = self.gnl_genes
         transcripts = self.transcripts[self.transcripts["target"].isin(target_names)]
+        grouped = {g: df for g, df in transcripts.groupby("target")}
         num_individual, data_low, data_high = [], {}, {}
         for j in target_names:
             # split transcripts
-            target = transcripts[transcripts["target"] == j]
-            others = transcripts[transcripts["target"] != j]
+            target = grouped[j]
+            others = pd.concat([grouped[g] for g in target_names if g != j], ignore_index = True)
             tree = make_tree(d1 = np.array(others["global_x"]), d2 = np.array(others["global_y"]), d3 = np.array(others["global_z"]))
             # 3D DBSCAN
@@ -103,17 +108,25 @@ class mcDETECT:
             n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
             # iterate over all aggregations
-            sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score = [], [], [], [], [], [], [], []
+            cell_id, sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score = [], [], [], [], [], [], [], [], []
             for k in range(n_clusters):
+                # record cell ids
+                if record_cell_id:
+                    temp = target[labels == k]
+                    temp_cell_id_mode = temp["cell_id"].mode()[0]
+                    cell_id.append(temp_cell_id_mode)
                 # find minimum enclosing spheres
-                temp = target[labels == k]
-                temp_in_nucleus = np.sum(temp["overlaps_nucleus"])
-                temp_size = temp.shape[0]
-                temp = temp[["global_x", "global_y", "global_z"]]
-                temp = temp.drop_duplicates()
-                center, r2 = miniball.get_bounding_ball(np.array(temp), epsilon=1e-8)
+                mask = (labels == k)
+                coords = X[mask]
+                if coords.shape[0] == 0:
+                    continue
+                temp_in_nucleus = np.sum(target["overlaps_nucleus"].values[mask])
+                temp_size = coords.shape[0]
+                coords_unique = np.unique(coords, axis=0)
+                center, r2 = miniball.get_bounding_ball(coords_unique, epsilon=1e-8)
                 if self.type != "Xenium":
                     closest_z = closest(z_grid, center[2])
                 else:
@@ -139,11 +152,13 @@ class mcDETECT:
                 sphere_comp.append(total_comp)
                 sphere_score.append(local_score)
-            # basic features for all spheres from each synaptic marker
-            sphere = pd.DataFrame(list(zip(sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score)),
-                                  columns = ["sphere_x", "sphere_y", "sphere_z", "layer_z", "sphere_r", "size", "comp", "in_nucleus"])
-            sphere["gene"] = [j] * sphere.shape[0]
-            sphere = sphere.astype({"sphere_x": float, "sphere_y": float, "sphere_z": float, "layer_z": int, "sphere_r": float, "size": float, "comp": float, "in_nucleus": int, "gene": str})
+            # basic features for all spheres from each granule marker
+            sphere = pd.DataFrame(list(zip(sphere_x, sphere_y, sphere_z, layer_z, sphere_r, sphere_size, sphere_comp, sphere_score, [j] * len(sphere_x))),
+                                      columns = ["sphere_x", "sphere_y", "sphere_z", "layer_z", "sphere_r", "size", "comp", "in_nucleus", "gene"])
+            sphere = sphere.astype({"sphere_x": float, "sphere_y": float, "sphere_z": float, "layer_z": float, "sphere_r": float, "size": float, "comp": float, "in_nucleus": float, "gene": str})
+            if record_cell_id:
+                sphere["cell_id"] = cell_id
+                sphere = sphere.astype({"cell_id": str})
             # split low- and high-in-nucleus spheres
             sphere_low = sphere[(sphere["sphere_r"] < self.size_thr) & (sphere["in_nucleus"] < self.in_nucleus_thr[0])]
@@ -156,14 +171,14 @@ class mcDETECT:
             num_individual.append(sphere_low.shape[0])
             data_low[target_names.index(j)] = sphere_low
             data_high[target_names.index(j)] = sphere_high
-            print("{} out of {} genes processed!".format(target_names.index(j) + 1, len(target_names)))
+            print(f"{target_names.index(j) + 1} / {len(target_names)} genes processed!")
         return np.sum(num_individual), data_low, data_high
     # [INNER] merge points from two overlapped spheres, input for remove_overlaps()
     def find_points(self, sphere_a, sphere_b):
-        transcripts = self.transcripts[self.transcripts["target"].isin(self.syn_genes)]
+        transcripts = self.transcripts[self.transcripts["target"].isin(self.gnl_genes)]
         tree_temp = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))
         idx_a = tree_temp.query_ball_point([sphere_a["sphere_x"], sphere_a["sphere_y"], sphere_a["sphere_z"]], sphere_a["sphere_r"])
         points_a = transcripts.iloc[idx_a]
@@ -184,7 +199,7 @@ class mcDETECT:
         # find possible overlaps on 2D by r-tree
         idx_b = make_rtree(set_b)
         for i, sphere_a in set_a.iterrows():
-            center_a_3D = (sphere_a.sphere_x, sphere_a.sphere_y, sphere_a.sphere_z)
+            center_a_3D = np.array([sphere_a.sphere_x, sphere_a.sphere_y, sphere_a.sphere_z])
             bounds_a = (sphere_a.sphere_x - sphere_a.sphere_r,
                         sphere_a.sphere_y - sphere_a.sphere_r,
                         sphere_a.sphere_x + sphere_a.sphere_r,
@@ -195,8 +210,8 @@ class mcDETECT:
             for j in possible_overlaps:
                 if j in set_b.index:
                     sphere_b = set_b.loc[j]
-                    center_b_3D = (sphere_b.sphere_x, sphere_b.sphere_y, sphere_b.sphere_z)
-                    dist = math.dist(center_a_3D, center_b_3D)
+                    center_b_3D = np.array([sphere_b.sphere_x, sphere_b.sphere_y, sphere_b.sphere_z])
+                    dist = np.linalg.norm(center_a_3D - center_b_3D)
                     radius_sum = sphere_a.sphere_r + sphere_b.sphere_r
                     radius_diff = sphere_a.sphere_r - sphere_b.sphere_r
@@ -227,10 +242,10 @@ class mcDETECT:
         return set_a, set_b
-    # [INNER] merge spheres from different synaptic markers, input for detect()
+    # [INNER] merge spheres from different granule markers, input for detect()
     def merge_sphere(self, sphere_dict):
         sphere = sphere_dict[0].copy()
-        for j in range(1, len(self.syn_genes)):
+        for j in range(1, len(self.gnl_genes)):
             target_sphere = sphere_dict[j]
             sphere, target_sphere_new = self.remove_overlaps(sphere, target_sphere)
             sphere = pd.concat([sphere, target_sphere_new])
@@ -268,23 +283,19 @@ class mcDETECT:
         # negative control filtering
         nc_transcripts_final = self.transcripts[self.transcripts["target"].isin(nc_genes_final)]
         tree = make_tree(d1 = np.array(nc_transcripts_final["global_x"]), d2 = np.array(nc_transcripts_final["global_y"]), d3 = np.array(nc_transcripts_final["global_z"]))
-        pass_idx = [0] * sphere_low.shape[0]
-        for i in range(sphere_low.shape[0]):
-            temp = sphere_low.iloc[i]
-            nc_idx = tree.query_ball_point([temp["sphere_x"], temp["sphere_y"], temp["sphere_z"]], temp["sphere_r"])
-            if len(nc_idx) == 0:
-                pass_idx[i] = 1
-            elif len(nc_idx) / temp["size"] < self.nc_thr:
-                pass_idx[i] = 2
-        sphere = sphere_low[np.array(pass_idx) != 0]
-        sphere = sphere.reset_index(drop = True)
+        centers = sphere_low[["sphere_x", "sphere_y", "sphere_z"]].to_numpy()
+        radii = sphere_low["sphere_r"].to_numpy()
+        sizes = sphere_low["size"].to_numpy()
+        counts = np.array([len(tree.query_ball_point(c, r)) for c, r in zip(centers, radii)])
+        pass_idx = (counts == 0) | (counts / sizes < self.nc_thr)
+        sphere = sphere_low[pass_idx].reset_index(drop = True)
         return sphere
-    # [MAIN] dataframe, synapse metadata
-    def detect(self):
+    # [MAIN] dataframe, granule metadata
+    def detect(self, record_cell_id = False):
-        _, data_low, data_high = self.dbscan()
+        _, data_low, data_high = self.dbscan(record_cell_id = record_cell_id)
         print("Merging spheres...")
         sphere_low, sphere_high = self.merge_sphere(data_low), self.merge_sphere(data_high)
@@ -296,32 +307,44 @@ class mcDETECT:
             return self.nc_filter(sphere_low, sphere_high)
-    # [MAIN] anndata, synapse spatial transcriptome profile
-    def profile(self, synapse, genes = None, print_itr = False):
+    # [MAIN] anndata, granule spatial transcriptome profile
+    def profile(self, granule, genes = None, print_itr = False):
         if genes is None:
             genes = list(self.transcripts["target"].unique())
             transcripts = self.transcripts
         else:
             transcripts = self.transcripts[self.transcripts["target"].isin(genes)]
+        gene_to_idx = {g: i for i, g in enumerate(genes)}
+        gene_array = transcripts["target"].to_numpy()
         tree = make_tree(d1 = np.array(transcripts["global_x"]), d2 = np.array(transcripts["global_y"]), d3 = np.array(transcripts["global_z"]))
-        # construct gene count matrix
-        X = np.zeros((len(genes), synapse.shape[0]))
-        for i in range(synapse.shape[0]):
-            temp = synapse.iloc[i]
+        n_gnl = granule.shape[0]
+        n_gene = len(genes)
+        data, row_idx, col_idx = [], [], []
+        # iterate over all granules to count nearby transcripts
+        for i in range(n_gnl):
+            temp = granule.iloc[i]
             target_idx = tree.query_ball_point([temp["sphere_x"], temp["sphere_y"], temp["layer_z"]], temp["sphere_r"])
-            target_trans = transcripts.iloc[target_idx]
-            target_gene = list(target_trans["target"])
-            for j in np.unique(target_gene):
-                X[genes.index(j), i] = target_gene.count(j)
-            if (print_itr) & (i % 5000 == 0):
-                print("{} out of {} synapses profiled!".format(i, synapse.shape[0]))
+            if not target_idx:
+                continue
+            local_genes = gene_array[target_idx]    # extract genes for those nearby transcripts
+            counts = Counter(local_genes)           # count how many times each gene occurs
+            for g, cnt in counts.items():           # append nonzero entries to sparse matrix lists
+                j = gene_to_idx[g]                  # get gene column index
+                data.append(cnt)                    # nonzero count
+                row_idx.append(i)                   # row index = granule index
+                col_idx.append(j)                   # column index = gene index
+            if print_itr and (i % 5000 == 0):
+                print(f"{i} out of {n_gnl} granules profiled!")
-        # construct spatial transcriptome profile
-        adata = anndata.AnnData(X = np.transpose(X), obs = synapse)
-        adata.obs["synapse_id"] = ["syn_{}".format(i) for i in range(synapse.shape[0])]
-        adata.obs["synapse_id"] = adata.obs["synapse_id"].astype(str)
+        # construct sparse spatial transcriptome profile, (n_granules × n_genes)
+        X = csr_matrix((data, (row_idx, col_idx)), shape = (n_gnl, n_gene), dtype = np.float32)
+        adata = anndata.AnnData(X = X, obs = granule.copy())
+        adata.obs["granule_id"] = [f"gnl_{i}" for i in range(n_gnl)]
+        adata.obs = adata.obs.astype({"granule_id": str})
         adata.obs.rename(columns = {"sphere_x": "global_x", "sphere_y": "global_y", "sphere_z": "global_z"}, inplace = True)
         adata.var["genes"] = genes
         adata.var_names = genes
@@ -359,7 +382,7 @@ class mcDETECT:
             count_gene, _, _ = np.histogram2d(target_gene["global_x"], target_gene["global_y"], bins = [x_bins, y_bins])
             X[k_idx, :] = count_gene.flatten()
             if k_idx % 100 == 0:
-                print("{} out of {} genes profiled!".format(k_idx, len(genes)))
+                print(f"{k_idx} out of {len(genes)} genes profiled!")
         # spot id
         spot_id = []

{mcdetect-2.0.3.dist-info → mcdetect-2.0.5.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mcDETECT
-Version: 2.0.3
+Version: 2.0.5
 Summary: Uncovering the dark transcriptome in polarized neuronal compartments with mcDETECT
 Home-page: https://github.com/chen-yang-yuan/mcDETECT
 Author: Chenyang Yuan

mcdetect-2.0.5.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,8 @@
+mcDETECT/__init__.py,sha256=GbRiy2Zt7JccZDK0rFa5ge7kE9r1L4bERDgQQ1e8QpQ,92
+mcDETECT/model.py,sha256=9V1uNag4tur-JW5MWIPEVyy9yrADxsFR-HpbgU1lkgk,29397
+mcDETECT/utils.py,sha256=0gvqZV7sGi8qvvdC5x9XScyiTXlSfqbUt1zks4t7Xd4,4545
+mcdetect-2.0.5.dist-info/licenses/LICENSE,sha256=uxq-shEWOGTIGVnQLmpElILmfCkuUhFZRAMnZUiKvtg,1070
+mcdetect-2.0.5.dist-info/METADATA,sha256=QE2OBc5Qu18c1iopwx13GkJTp3PEHxpVhX-vo5KccSw,3016
+mcdetect-2.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mcdetect-2.0.5.dist-info/top_level.txt,sha256=WwzBojt5U-T2hZ8llO6XgpM9OFIBkWQQldQKu19O8EY,9
+mcdetect-2.0.5.dist-info/RECORD,,

mcdetect-2.0.3.dist-info/RECORD DELETED Viewed

@@ -1,8 +0,0 @@
-mcDETECT/__init__.py,sha256=SPCzZZOrSFKUNUYRrFbrBWF0FPN6OUzUpRP4zjlfQr0,92
-mcDETECT/model.py,sha256=zEdHqgwTjDi7HxdLW0aPG2j8uLMPiobNu-BcJraAG8g,28047
-mcDETECT/utils.py,sha256=0gvqZV7sGi8qvvdC5x9XScyiTXlSfqbUt1zks4t7Xd4,4545
-mcdetect-2.0.3.dist-info/licenses/LICENSE,sha256=uxq-shEWOGTIGVnQLmpElILmfCkuUhFZRAMnZUiKvtg,1070
-mcdetect-2.0.3.dist-info/METADATA,sha256=1ny7qrjmE9p1Ybgmw3k4QnVJSKlXVJR4nlBNPxj3RCU,3016
-mcdetect-2.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mcdetect-2.0.3.dist-info/top_level.txt,sha256=WwzBojt5U-T2hZ8llO6XgpM9OFIBkWQQldQKu19O8EY,9
-mcdetect-2.0.3.dist-info/RECORD,,

{mcdetect-2.0.3.dist-info → mcdetect-2.0.5.dist-info}/WHEEL RENAMED Viewed

File without changes

{mcdetect-2.0.3.dist-info → mcdetect-2.0.5.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{mcdetect-2.0.3.dist-info → mcdetect-2.0.5.dist-info}/top_level.txt RENAMED Viewed

File without changes

mcDETECT 2.0.3__py3-none-any.whl → 2.0.5__py3-none-any.whl

Potentially problematic release.

mcDETECT 2.0.3__py3-none-any.whl → 2.0.5__py3-none-any.whl