pycopro-0.1.0-py3-none-any.whl

copro/__init__.py ADDED
@@ -0,0 +1,23 @@
+ """CoPro Python — Spatial Kernel-based Reduced Rank CCA for spatial transcriptomics."""
+
+ from .core import CoProSingle, CoProMulti, subset_data
+ from .pca import compute_pca
+ from .distance import compute_distance
+ from .kernel import compute_kernel_matrix
+ from .skrcca import run_skr_cca
+ from .correlation import compute_normalized_correlation
+ from .scores import compute_gene_and_cell_scores
+
+ __all__ = [
+     "CoProSingle",
+     "CoProMulti",
+     "subset_data",
+     "compute_pca",
+     "compute_distance",
+     "compute_kernel_matrix",
+     "run_skr_cca",
+     "compute_normalized_correlation",
+     "compute_gene_and_cell_scores",
+ ]
+
+ __version__ = "0.1.0"
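
The exports above already outline the intended workflow. Below is a minimal end-to-end sketch on synthetic inputs; the pca, kernel, skrcca, and scores modules are not part of this diff, so the no-argument calls are assumptions about their defaults rather than documented signatures:

import numpy as np
import pandas as pd
from copro import (
    CoProSingle, subset_data, compute_pca, compute_distance,
    compute_kernel_matrix, run_skr_cca, compute_normalized_correlation,
    compute_gene_and_cell_scores,
)

rng = np.random.default_rng(0)
n_cells, n_genes = 200, 50
obj = CoProSingle(
    normalized_data=rng.normal(size=(n_cells, n_genes)),  # cells × genes
    location_data=pd.DataFrame(rng.uniform(0, 100, size=(n_cells, 2)), columns=["x", "y"]),
    meta_data=pd.DataFrame(index=range(n_cells)),
    cell_types=rng.choice(["A", "B"], size=n_cells),
)

obj = subset_data(obj, ["A", "B"])        # fields set by subset_data; see core.py
obj = compute_pca(obj)                    # default arguments assumed from here on
obj = compute_distance(obj)
obj = compute_kernel_matrix(obj)
obj = run_skr_cca(obj)
obj = compute_normalized_correlation(obj)
obj = compute_gene_and_cell_scores(obj)
print(obj.sigma_value_choice)             # sigma selected by the correlation step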
copro/core.py ADDED
@@ -0,0 +1,124 @@
+ """CoProSingle and CoProMulti dataclasses — state containers."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ import numpy as np
+ import pandas as pd
+
+
+ @dataclass
+ class CoProSingle:
+     # Input data
+     normalized_data: np.ndarray  # cells × genes
+     location_data: pd.DataFrame  # cells × {x, y, ...}
+     meta_data: pd.DataFrame
+     cell_types: np.ndarray  # per-cell label vector
+
+     # Set by subset_data
+     cell_types_of_interest: list = field(default_factory=list)
+     normalized_data_sub: Optional[np.ndarray] = None
+     location_data_sub: Optional[pd.DataFrame] = None
+     cell_types_sub: Optional[np.ndarray] = None
+
+     # Computed results (keyed dicts)
+     pca_global: dict = field(default_factory=dict)  # ct → dict with components/scores/sdev
+     distances: dict = field(default_factory=dict)  # flat keys: "dist|A|B"
+     kernel_matrices: dict = field(default_factory=dict)  # flat keys: "kernel|sigma0.1|A|B"
+     sigma_values: list = field(default_factory=list)
+     skr_cca_out: dict = field(default_factory=dict)  # "sigma_0.1" → {ct: w_matrix}
+     normalized_correlation: dict = field(default_factory=dict)
+     sigma_value_choice: Optional[float] = None
+     cell_scores: dict = field(default_factory=dict)
+     gene_scores: dict = field(default_factory=dict)
+     n_cc: int = 2
+     n_pca: int = 30
+     scale_pcs: bool = True
+
+
+ @dataclass
+ class CoProMulti:
+     """Multi-slide CoPro object. meta_data must have a 'slideID' column."""
+     # Input data
+     normalized_data: np.ndarray
+     location_data: pd.DataFrame
+     meta_data: pd.DataFrame
+     cell_types: np.ndarray
+
+     # Slide list
+     slide_list: list = field(default_factory=list)  # ordered list of slide IDs
+
+     # Set by subset_data
+     cell_types_of_interest: list = field(default_factory=list)
+     normalized_data_sub: Optional[np.ndarray] = None
+     location_data_sub: Optional[pd.DataFrame] = None
+     cell_types_sub: Optional[np.ndarray] = None
+     meta_data_sub: Optional[pd.DataFrame] = None  # subset of meta_data (with slideID)
+
+     # PCA
+     pca_global: dict = field(default_factory=dict)  # ct → global PCA dict (rotation, sdev)
+     pca_results: dict = field(default_factory=dict)  # slide → {ct → scores matrix}
+
+     # Computed results
+     distances: dict = field(default_factory=dict)  # flat keys: "dist|{slide}|A|B"
+     kernel_matrices: dict = field(default_factory=dict)  # flat keys: "kernel|sigma0.1|{slide}|A|B"
+     sigma_values: list = field(default_factory=list)
+     skr_cca_out: dict = field(default_factory=dict)  # "sigma_0.1" → {ct: w_matrix} (shared)
+     normalized_correlation: dict = field(default_factory=dict)
+     sigma_value_choice: Optional[float] = None
+     cell_scores: dict = field(default_factory=dict)  # "cellScores|sigma0.1|{slide}|{ct}"
+     gene_scores: dict = field(default_factory=dict)  # "geneScores|sigma0.1|{ct}" (shared)
+     n_cc: int = 2
+     n_pca: int = 30
+     scale_pcs: bool = True
+
+
+ def subset_data(obj, cell_types_of_interest: list, min_cells: int = 10):
+     """Filter data to listed cell types. Works for both CoProSingle and CoProMulti."""
+     if isinstance(obj, CoProMulti):
+         return _subset_data_multi(obj, cell_types_of_interest, min_cells)
+     else:
+         return _subset_data_single(obj, cell_types_of_interest, min_cells)
+
+
+ def _subset_data_single(obj: CoProSingle, cell_types_of_interest: list, min_cells: int) -> CoProSingle:
+     for ct in cell_types_of_interest:
+         n = np.sum(obj.cell_types == ct)
+         if n < min_cells:
+             raise ValueError(
+                 f"Cell type '{ct}' has only {n} cells (minimum {min_cells} required)."
+             )
+     mask = np.isin(obj.cell_types, cell_types_of_interest)
+     obj.cell_types_of_interest = list(cell_types_of_interest)
+     obj.normalized_data_sub = obj.normalized_data[mask]
+     obj.location_data_sub = obj.location_data.loc[mask].reset_index(drop=True)
+     obj.cell_types_sub = obj.cell_types[mask]
+     return obj
+
+
+ def _subset_data_multi(obj: CoProMulti, cell_types_of_interest: list, min_cells: int) -> CoProMulti:
+     """Subset multi-slide object. Checks total cell counts across all slides."""
+     if "slideID" not in obj.meta_data.columns:
+         raise ValueError("meta_data must have a 'slideID' column for CoProMulti.")
+
+     # Discover slide list from meta_data if not set
+     if not obj.slide_list:
+         obj.slide_list = sorted(obj.meta_data["slideID"].unique().tolist())
+
+     for ct in cell_types_of_interest:
+         # Check total across all slides
+         n_total = np.sum(obj.cell_types == ct)
+         if n_total < min_cells:
+             raise ValueError(
+                 f"Cell type '{ct}' has only {n_total} cells total (minimum {min_cells} required)."
+             )
+
+     mask = np.isin(obj.cell_types, cell_types_of_interest)
+     obj.cell_types_of_interest = list(cell_types_of_interest)
+     obj.normalized_data_sub = obj.normalized_data[mask]
+     obj.location_data_sub = obj.location_data.loc[mask].reset_index(drop=True)
+     obj.cell_types_sub = obj.cell_types[mask]
+     obj.meta_data_sub = obj.meta_data.loc[mask].reset_index(drop=True)
+     return obj
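
A minimal sketch of the multi-slide bookkeeping above, on synthetic data; it exercises only what this file defines (slide_list discovery from meta_data, the total-count check, and the *_sub fields):

import numpy as np
import pandas as pd
from copro.core import CoProMulti, subset_data

rng = np.random.default_rng(1)
n = 300
multi = CoProMulti(
    normalized_data=rng.normal(size=(n, 40)),
    location_data=pd.DataFrame(rng.uniform(0, 50, size=(n, 2)), columns=["x", "y"]),
    meta_data=pd.DataFrame({"slideID": rng.choice(["s1", "s2"], size=n)}),
    cell_types=rng.choice(["A", "B", "C"], size=n),
)

multi = subset_data(multi, ["A", "B"], min_cells=10)
print(multi.slide_list)                               # discovered from meta_data: ['s1', 's2']
print(multi.meta_data_sub["slideID"].value_counts())  # per-slide sizes of the subset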
copro/correlation.py ADDED
@@ -0,0 +1,248 @@
+ """compute_normalized_correlation() — spectral-norm normalized CCA correlation."""
+
+ from __future__ import annotations
+
+ from itertools import combinations
+
+ import numpy as np
+ import pandas as pd
+ from scipy.sparse.linalg import svds
+
+ from .core import CoProSingle
+ from .skrcca import _prepare_pc_matrices
+
+
+ def _spectral_norm(K: np.ndarray, tol: float = 1e-4) -> float:
+     """Largest singular value of K (spectral norm)."""
+     try:
+         s = svds(K.astype(float), k=1, tol=tol, return_singular_vectors=False)
+         return float(s[0])
+     except Exception:
+         return float(np.linalg.norm(K, ord=2))
+
+
+
+ def _get_kernel_for_pair(flat_kernels, sigma, ct_i, ct_j, slide=None):
+     """Retrieve kernel, optionally slide-aware, trying both orderings."""
+     if slide is None:
+         name = f"kernel|sigma{sigma}|{ct_i}|{ct_j}"
+         name_sym = f"kernel|sigma{sigma}|{ct_j}|{ct_i}"
+     else:
+         name = f"kernel|sigma{sigma}|{slide}|{ct_i}|{ct_j}"
+         name_sym = f"kernel|sigma{sigma}|{slide}|{ct_j}|{ct_i}"
+     if name in flat_kernels:
+         return flat_kernels[name]
+     if name_sym in flat_kernels:
+         return flat_kernels[name_sym].T
+     raise KeyError(f"Kernel not found for ({ct_i},{ct_j}) sigma={sigma} slide={slide}")
+
+
+ def compute_normalized_correlation(obj, tol: float = 1e-4):
+     """Compute normalized CCA correlation for each sigma × pair × CC.
+
+     Dispatches to multi-slide version for CoProMulti objects.
+
+     Formula:
+         numerator = (A @ w1)^T K (B @ w2)
+         denominator = ||A @ w1|| * ||B @ w2|| * ||K||_spec
+         norm_corr = numerator / denominator
+
+     Stores in obj.normalized_correlation[sigma_name] = DataFrame.
+     Chooses obj.sigma_value_choice as sigma maximizing mean CC1 correlation.
+     """
+     from .core import CoProMulti
+     if isinstance(obj, CoProMulti):
+         return _compute_normalized_correlation_multi(obj, tol)
+
+     # --- Single-slide path ---
+     cts = obj.cell_types_of_interest
+     if not cts:
+         raise ValueError("No cell types of interest.")
+     if not obj.skr_cca_out:
+         raise ValueError("CCA results missing. Run run_skr_cca() first.")
+
+     scale_pcs = getattr(obj, "scale_pcs", True)
+     n_cc = obj.n_cc
+
+     # Scaled PC matrices
+     X_dict = _prepare_pc_matrices(obj, scale_pcs, cts)
+
+     # Pairs
+     if len(cts) == 1:
+         pairs = [(cts[0], cts[0])]
+     else:
+         pairs = list(combinations(cts, 2))
+
+     print("Calculating spectral norms (may take a while)...")
+
+     # Precompute spectral norms for each sigma × pair
+     spec_norms = {}
+     for sigma in obj.sigma_values:
+         spec_norms[sigma] = {}
+         for ct_i, ct_j in pairs:
+             try:
+                 K = _get_kernel_for_pair(obj.kernel_matrices, sigma, ct_i, ct_j)
+                 spec_norms[sigma][(ct_i, ct_j)] = _spectral_norm(K, tol=tol)
+                 spec_norms[sigma][(ct_j, ct_i)] = spec_norms[sigma][(ct_i, ct_j)]
+             except KeyError:
+                 spec_norms[sigma][(ct_i, ct_j)] = np.nan
+
+     print("Finished calculating spectral norms.")
+
+     correlation_value = {}
+
+     for sigma in obj.sigma_values:
+         sigma_name = f"sigma_{sigma}"
+         w_sigma = obj.skr_cca_out.get(sigma_name)
+         if w_sigma is None:
+             continue
+
+         rows = []
+         for ct_i, ct_j in pairs:
+             A = X_dict[ct_i]
+             B = X_dict[ct_j]
+             try:
+                 K = _get_kernel_for_pair(obj.kernel_matrices, sigma, ct_i, ct_j)
+             except KeyError:
+                 continue
+             norm_K = spec_norms[sigma].get((ct_i, ct_j), np.nan)
+
+             for cc in range(n_cc):
+                 w1 = w_sigma[ct_i][:, cc : cc + 1]
+                 w2 = w_sigma[ct_j][:, cc : cc + 1]
+
+                 Aw1 = A @ w1
+                 Bw2 = B @ w2
+
+                 numerator = float((Aw1.T @ K @ Bw2).flat[0])
+                 denom = float(np.sqrt(np.sum(Aw1 ** 2))) * float(np.sqrt(np.sum(Bw2 ** 2))) * norm_K
+
+                 norm_corr = 0.0 if abs(denom) < 1e-9 else numerator / denom
+
+                 rows.append({
+                     "sigma": sigma,
+                     "cell_type_1": ct_i,
+                     "cell_type_2": ct_j,
+                     "CC_index": cc + 1,
+                     "normalized_correlation": norm_corr,
+                 })
+
+         correlation_value[sigma_name] = pd.DataFrame(rows)
+
+     obj.normalized_correlation = correlation_value
+
+     # Choose sigma maximizing mean CC1 correlation
+     all_cc1 = []
+     for sigma_name, df in correlation_value.items():
+         if df is not None and len(df) > 0:
+             cc1 = df[df["CC_index"] == 1]
+             mean_corr = cc1["normalized_correlation"].mean()
+             sigma_val = float(sigma_name.replace("sigma_", ""))
+             all_cc1.append((sigma_val, mean_corr))
+
+     if all_cc1:
+         obj.sigma_value_choice = max(all_cc1, key=lambda x: x[1])[0]
+
+     return obj
+
+
+ def _compute_normalized_correlation_multi(obj, tol=1e-4):
+     """Multi-slide normalized correlation: per-slide values matching R format.
+
+     R computes normalized correlation independently for each slide using the
+     raw (unscaled) per-slide PCA scores from pcaResults (not scaled by sdev).
+     We replicate this: for each (sigma, slide, pair, CC), compute norm_corr
+     using only that slide's raw PCA scores and per-slide spectral norm.
+     Sigma choice is based on the mean CC1 correlation across slides.
+     """
+     cts = obj.cell_types_of_interest
+     slides = obj.slide_list
+     n_cc = obj.n_cc
+
+     # Use raw (unscaled) per-slide PCA scores — matching R's pcaResults usage
+     X_list_all = {
+         slide: {ct: obj.pca_results[slide][ct].astype(float)
+                 for ct in cts if ct in obj.pca_results.get(slide, {})}
+         for slide in slides
+     }
+
+     if len(cts) == 1:
+         pairs = [(cts[0], cts[0])]
+     else:
+         pairs = list(combinations(cts, 2))
+
+     # Precompute per-slide spectral norms for each sigma × pair
+     print("Calculating spectral norms (multi-slide)...")
+     spec_norms = {}  # spec_norms[sigma][(ct_i, ct_j, slide)]
+     for sigma in obj.sigma_values:
+         spec_norms[sigma] = {}
+         for ct_i, ct_j in pairs:
+             for slide in slides:
+                 try:
+                     K = _get_kernel_for_pair(obj.kernel_matrices, sigma, ct_i, ct_j, slide)
+                     val = _spectral_norm(K, tol)
+                 except KeyError:
+                     val = np.nan
+                 spec_norms[sigma][(ct_i, ct_j, slide)] = val
+                 spec_norms[sigma][(ct_j, ct_i, slide)] = val
+     print("Finished spectral norms.")
+
+     correlation_value = {}
+
+     for sigma in obj.sigma_values:
+         sigma_name = f"sigma_{sigma}"
+         w_sigma = obj.skr_cca_out.get(sigma_name)
+         if w_sigma is None:
+             continue
+
+         rows = []
+         for ct_i, ct_j in pairs:
+             for cc in range(n_cc):
+                 w1 = w_sigma[ct_i][:, cc : cc + 1]
+                 w2 = w_sigma[ct_j][:, cc : cc + 1]
+
+                 # Per-slide correlation (matches R format)
+                 for slide in slides:
+                     A = X_list_all[slide].get(ct_i)
+                     B = X_list_all[slide].get(ct_j)
+                     if A is None or B is None:
+                         continue
+                     try:
+                         K = _get_kernel_for_pair(obj.kernel_matrices, sigma, ct_i, ct_j, slide)
+                     except KeyError:
+                         continue
+
+                     norm_K = spec_norms[sigma].get((ct_i, ct_j, slide), np.nan)
+                     Aw1 = A @ w1
+                     Bw2 = B @ w2
+                     numerator = float((Aw1.T @ K @ Bw2).flat[0])
+                     denom = (float(np.linalg.norm(Aw1)) *
+                              float(np.linalg.norm(Bw2)) *
+                              norm_K)
+                     norm_corr = 0.0 if abs(denom) < 1e-9 else numerator / denom
+
+                     rows.append({
+                         "sigma": sigma,
+                         "slideID": slide,
+                         "cell_type_1": ct_i,
+                         "cell_type_2": ct_j,
+                         "CC_index": cc + 1,
+                         "normalized_correlation": norm_corr,
+                     })
+
+         correlation_value[sigma_name] = pd.DataFrame(rows)
+
+     obj.normalized_correlation = correlation_value
+
+     # Choose sigma maximizing mean CC1 correlation across slides
+     all_cc1 = []
+     for sigma_name, df in correlation_value.items():
+         if df is not None and len(df) > 0:
+             cc1 = df[df["CC_index"] == 1]
+             mean_corr = cc1["normalized_correlation"].mean()
+             sigma_val = float(sigma_name.replace("sigma_", ""))
+             all_cc1.append((sigma_val, mean_corr))
+     if all_cc1:
+         obj.sigma_value_choice = max(all_cc1, key=lambda x: x[1])[0]
+
+     return obj
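
A self-contained check of the normalization used above. By the operator-norm inequality |x^T K y| <= ||x|| * ||K||_spec * ||y||, the normalized correlation always lies in [-1, 1], which is what makes values comparable across sigma values and pairs. All matrices below are random placeholders, not package data:

import numpy as np

rng = np.random.default_rng(2)
A = rng.normal(size=(80, 10))   # PC scores for cell type 1 (cells × PCs)
B = rng.normal(size=(60, 10))   # PC scores for cell type 2
K = rng.random(size=(80, 60))   # cross-type kernel matrix
w1 = rng.normal(size=(10, 1))   # weight vectors for one canonical component
w2 = rng.normal(size=(10, 1))

Aw1, Bw2 = A @ w1, B @ w2
num = (Aw1.T @ K @ Bw2).item()
den = np.linalg.norm(Aw1) * np.linalg.norm(Bw2) * np.linalg.norm(K, ord=2)
norm_corr = num / den
assert -1.0 <= norm_corr <= 1.0  # guaranteed by the operator-norm bound
print(norm_corr)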
copro/distance.py ADDED
@@ -0,0 +1,190 @@
+ """compute_distance() — pairwise Euclidean distances between cell types."""
+
+ from __future__ import annotations
+
+ import warnings
+ from itertools import combinations
+
+ import numpy as np
+ from scipy.spatial.distance import cdist
+
+ from .core import CoProSingle
+
+
+ def _dist_flat_name(ct_i: str, ct_j: str) -> str:
+     return f"dist|{ct_i}|{ct_j}"
+
+
+ def _process_distance_matrix(
+     dist_mat: np.ndarray,
+     truncate: bool,
+     percentile_choice: float | None = None,
+     set_diag_inf: bool = False,
+ ) -> tuple[np.ndarray, float]:
+     """Process distance matrix: handle zeros, compute percentile, optionally truncate.
+
+     Returns (processed_matrix, dist_percentile).
+     """
+     dist_mat = dist_mat.copy()
+
+     if set_diag_inf:
+         np.fill_diagonal(dist_mat, np.inf)
+
+     # Replace zeros (overlapping cells) with smallest non-zero
+     if np.any(dist_mat == 0):
+         min_nz = np.min(dist_mat[dist_mat > 0]) if np.any(dist_mat > 0) else 1.0
+         dist_mat[dist_mat == 0] = min_nz
+         warnings.warn(
+             "Zero distances detected; replaced with smallest non-zero distance."
+         )
+
+     # Choose percentile threshold
+     finite_vals = dist_mat[np.isfinite(dist_mat) & (dist_mat > 0)]
+     if len(finite_vals) == 0:
+         raise ValueError("No finite non-zero distances found.")
+
+     if percentile_choice is None:
+         percentile_choice = min(1e-3, 2.0 / max(dist_mat.shape))
+
+     dist_percentile = float(np.quantile(finite_vals, percentile_choice))
+
+     if truncate:
+         mask = (dist_mat < dist_percentile) & np.isfinite(dist_mat)
+         dist_mat[mask] = dist_percentile
+
+     return dist_mat, dist_percentile
+
+
+ def compute_distance(
+     obj,
+     dist_type: str = "Euclidean2D",
+     normalize: bool = True,
+     truncate: bool = True,
+ ):
+     """Compute pairwise Euclidean distance matrices between all cell-type pairs.
+
+     Dispatches to multi-slide version for CoProMulti objects.
+
+     For single-slide, 2+ types: pairs (ct_i, ct_j) stored under 'dist|ct_i|ct_j'.
+     For single-slide, 1 type: within-type (ct, ct) stored under 'dist|ct|ct'.
+     For multi-slide: keys include slide: 'dist|{slide}|ct_i|ct_j'.
+
+     Normalization: rescales all matrices so the smallest per-pair quantile distance (quantile 1e-3 by default, 1e-4 within-type) equals 0.01.
+     """
+     from .core import CoProMulti
+     if isinstance(obj, CoProMulti):
+         return _compute_distance_multi(obj, dist_type, normalize, truncate)
+
+     # --- Single-slide path ---
+     cts = obj.cell_types_of_interest
+     if not cts:
+         raise ValueError("No cell types of interest. Run subset_data() first.")
+
+     if dist_type != "Euclidean2D":
+         raise NotImplementedError(f"dist_type '{dist_type}' not implemented. Use 'Euclidean2D'.")
+
+     loc = obj.location_data_sub
+     if not {"x", "y"}.issubset(loc.columns):
+         raise ValueError("location_data_sub must have columns 'x' and 'y'.")
+
+     distances = {}
+
+     if len(cts) == 1:
+         # Within-type only
+         ct = cts[0]
+         mask = obj.cell_types_sub == ct
+         coords = loc.loc[mask, ["x", "y"]].values.astype(float)
+         dist_mat = cdist(coords, coords)
+         dist_mat, dist_percentile = _process_distance_matrix(
+             dist_mat, truncate, percentile_choice=1e-4, set_diag_inf=True
+         )
+         flat_name = _dist_flat_name(ct, ct)
+         distances[flat_name] = dist_mat
+
+         if normalize:
+             scaling_factor = 0.01 / dist_percentile
+             distances[flat_name] = dist_mat * scaling_factor
+
+     else:
+         # Between-type pairs
+         pairs = list(combinations(cts, 2))
+         dist_percentiles = []
+         raw_mats = {}
+
+         for ct_i, ct_j in pairs:
+             mask_i = obj.cell_types_sub == ct_i
+             mask_j = obj.cell_types_sub == ct_j
+             coords_i = loc.loc[mask_i, ["x", "y"]].values.astype(float)
+             coords_j = loc.loc[mask_j, ["x", "y"]].values.astype(float)
+
+             dist_mat = cdist(coords_i, coords_j)
+             dist_mat, dist_pct = _process_distance_matrix(dist_mat, truncate)
+             dist_percentiles.append(dist_pct)
+
+             flat_name = _dist_flat_name(ct_i, ct_j)
+             raw_mats[flat_name] = dist_mat
+
+         if normalize:
+             min_percentile = min(dist_percentiles)
+             scaling_factor = 0.01 / min_percentile
+             for flat_name, dist_mat in raw_mats.items():
+                 distances[flat_name] = dist_mat * scaling_factor
+         else:
+             distances = raw_mats
+
+     obj.distances = distances
+     return obj
+
+
+ def _compute_distance_multi(obj, dist_type="Euclidean2D", normalize=True, truncate=True):
+     """Multi-slide distance computation. Keys: 'dist|{slide}|ct_i|ct_j'."""
+     cts = obj.cell_types_of_interest
+     slides = obj.slide_list
+     slide_ids = obj.meta_data_sub["slideID"].values
+     loc = obj.location_data_sub
+
+     if not {"x", "y"}.issubset(loc.columns):
+         raise ValueError("location_data_sub must have columns 'x' and 'y'.")
+
+     distances = {}
+     all_percentiles = []
+     raw_mats = {}
+
+     if len(cts) == 1:
+         ct = cts[0]
+         for slide in slides:
+             slide_ct_mask = (obj.cell_types_sub == ct) & (slide_ids == slide)
+             if np.sum(slide_ct_mask) <= 5:
+                 continue
+             coords = loc.loc[slide_ct_mask, ["x", "y"]].values.astype(float)
+             dist_mat = cdist(coords, coords)
+             dist_mat, pct = _process_distance_matrix(dist_mat, truncate, percentile_choice=1e-4, set_diag_inf=True)
+             flat_name = f"dist|{slide}|{ct}|{ct}"
+             raw_mats[flat_name] = dist_mat
+             all_percentiles.append(pct)
+     else:
+         pairs = list(combinations(cts, 2))
+         for slide in slides:
+             for ct_i, ct_j in pairs:
+                 mask_i = (obj.cell_types_sub == ct_i) & (slide_ids == slide)
+                 mask_j = (obj.cell_types_sub == ct_j) & (slide_ids == slide)
+                 if np.sum(mask_i) <= 5 or np.sum(mask_j) <= 5:
+                     continue
+                 coords_i = loc.loc[mask_i, ["x", "y"]].values.astype(float)
+                 coords_j = loc.loc[mask_j, ["x", "y"]].values.astype(float)
+                 dist_mat = cdist(coords_i, coords_j)
+                 dist_mat, pct = _process_distance_matrix(dist_mat, truncate)
+                 flat_name = f"dist|{slide}|{ct_i}|{ct_j}"
+                 raw_mats[flat_name] = dist_mat
+                 all_percentiles.append(pct)
+
+     if normalize and all_percentiles:
+         global_min = min(all_percentiles)
+         scaling_factor = 0.01 / global_min
+         for k, v in raw_mats.items():
+             distances[k] = v * scaling_factor
+     else:
+         distances = raw_mats
+
+     obj.distances = distances
+     return obj
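
Finally, a short sketch of the single-slide distance bookkeeping with the defaults above (normalize=True, truncate=True): sub-quantile distances are floored at the quantile, then every matrix is rescaled so that floor lands at 0.01, making 0.01 the minimum of the stored matrix. The object here is synthetic:

import numpy as np
import pandas as pd
from copro.core import CoProSingle, subset_data
from copro.distance import compute_distance

rng = np.random.default_rng(3)
n = 120
obj = CoProSingle(
    normalized_data=rng.normal(size=(n, 20)),
    location_data=pd.DataFrame(rng.uniform(0, 10, size=(n, 2)), columns=["x", "y"]),
    meta_data=pd.DataFrame(index=range(n)),
    cell_types=np.array(["A"] * 60 + ["B"] * 60),
)
obj = subset_data(obj, ["A", "B"])
obj = compute_distance(obj)    # Euclidean2D, normalized, truncated

print(list(obj.distances))     # ['dist|A|B']
D = obj.distances["dist|A|B"]
print(D.shape, D.min())        # (60, 60) and a minimum of ~0.01 after scaling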