pycopro 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycopro-0.1.0/PKG-INFO +14 -0
- pycopro-0.1.0/README.md +161 -0
- pycopro-0.1.0/copro/__init__.py +23 -0
- pycopro-0.1.0/copro/core.py +124 -0
- pycopro-0.1.0/copro/correlation.py +248 -0
- pycopro-0.1.0/copro/distance.py +190 -0
- pycopro-0.1.0/copro/kernel.py +230 -0
- pycopro-0.1.0/copro/optimization.py +481 -0
- pycopro-0.1.0/copro/pca.py +173 -0
- pycopro-0.1.0/copro/scores.py +120 -0
- pycopro-0.1.0/copro/skrcca.py +150 -0
- pycopro-0.1.0/copro/utils.py +27 -0
- pycopro-0.1.0/pycopro.egg-info/PKG-INFO +14 -0
- pycopro-0.1.0/pycopro.egg-info/SOURCES.txt +20 -0
- pycopro-0.1.0/pycopro.egg-info/dependency_links.txt +1 -0
- pycopro-0.1.0/pycopro.egg-info/requires.txt +10 -0
- pycopro-0.1.0/pycopro.egg-info/top_level.txt +1 -0
- pycopro-0.1.0/pyproject.toml +24 -0
- pycopro-0.1.0/setup.cfg +4 -0
- pycopro-0.1.0/tests/test_alt_round1.py +166 -0
- pycopro-0.1.0/tests/test_multi_slide.py +278 -0
- pycopro-0.1.0/tests/test_null_round1.py +103 -0
pycopro-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pycopro
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Spatial Kernel-based Reduced Rank CCA for spatial transcriptomics
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: numpy>=1.24
|
|
7
|
+
Requires-Dist: scipy>=1.10
|
|
8
|
+
Requires-Dist: pandas>=2.0
|
|
9
|
+
Requires-Dist: scikit-learn>=1.3
|
|
10
|
+
Requires-Dist: pyarrow>=12
|
|
11
|
+
Requires-Dist: anndata>=0.10
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
14
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
pycopro-0.1.0/README.md
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# CoPro (Python)
|
|
2
|
+
|
|
3
|
+
**Unsupervised detection of coordinated spatial progressions in spatial transcriptomics**
|
|
4
|
+
|
|
5
|
+
[Python ≥ 3.10](https://www.python.org)
|
|
6
|
+
[PyPI: pycopro](https://pypi.org/project/pycopro/)
|
|
7
|
+
[GitHub: Zhen-Miao/copro-python](https://github.com/Zhen-Miao/copro-python)
|
|
8
|
+
[License: MIT](LICENSE)
|
|
9
|
+
|
|
10
|
+
CoPro detects **coordinated spatial progressions** between cell types in spatial transcriptomics data. Given the spatial positions and gene expression profiles of cells, CoPro finds a low-dimensional axis along which cells of one type are spatially co-organized with cells of another type — or within a single cell type — revealing continuous tissue structure that discrete clustering misses.
|
|
11
|
+
|
|
12
|
+
The method is built on **Spatial Kernel-based Reduced Rank CCA (SkrCCA)**: a power-method optimization that maximizes a spatially-weighted cross-covariance between cell type-specific PC score matrices.
|
|
13
|
+
|
|
14
|
+
> **R package:** The original R implementation is available at [github.com/Zhen-Miao/CoPro](https://github.com/Zhen-Miao/CoPro).
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install pycopro
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Or install from source:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
git clone https://github.com/Zhen-Miao/copro-python.git
|
|
28
|
+
cd copro-python
|
|
29
|
+
pip install -e .
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
**Requirements:** Python ≥ 3.10, NumPy, SciPy, pandas, scikit-learn, pyarrow, anndata.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Quick start
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
import numpy as np
|
|
40
|
+
import pandas as pd
|
|
41
|
+
import copro as cp
|
|
42
|
+
|
|
43
|
+
# Inputs
|
|
44
|
+
# expr_norm : np.ndarray (cells × genes, normalized)
|
|
45
|
+
# location : pd.DataFrame with columns "x", "y"
|
|
46
|
+
# cell_types: np.ndarray (per-cell type labels)
# meta      : pd.DataFrame (per-cell metadata)
|
|
47
|
+
|
|
48
|
+
obj = cp.CoProSingle(
|
|
49
|
+
normalized_data=expr_norm,
|
|
50
|
+
location_data=location,
|
|
51
|
+
meta_data=meta,
|
|
52
|
+
cell_types=cell_types,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
obj = cp.subset_data(obj, ["Cell type A", "Cell type B"])
|
|
56
|
+
obj = cp.compute_pca(obj, n_pca=30)
|
|
57
|
+
obj = cp.compute_distance(obj, normalize=False)
|
|
58
|
+
obj = cp.compute_kernel_matrix(obj, sigma_values=[0.1, 0.2, 0.5])
|
|
59
|
+
obj = cp.run_skr_cca(obj, scale_pcs=True, n_cc=2)
|
|
60
|
+
obj = cp.compute_normalized_correlation(obj)
|
|
61
|
+
obj = cp.compute_gene_and_cell_scores(obj)
|
|
62
|
+
|
|
63
|
+
print("Selected sigma:", obj.sigma_value_choice)
|
|
64
|
+
|
|
65
|
+
# Cell scores for the optimal sigma
|
|
66
|
+
sigma = obj.sigma_value_choice
|
|
67
|
+
scores_A = obj.cell_scores[f"cellScores|sigma{sigma}|Cell type A"][:, 0]
|
|
68
|
+
scores_B = obj.cell_scores[f"cellScores|sigma{sigma}|Cell type B"][:, 0]
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## How it works
|
|
74
|
+
|
|
75
|
+
CoPro runs a seven-step pipeline:
|
|
76
|
+
|
|
77
|
+
| Step | Function | Description |
|
|
78
|
+
|------|----------|-------------|
|
|
79
|
+
| 1 | `subset_data` | Filter to cell types of interest |
|
|
80
|
+
| 2 | `compute_pca` | Truncated PCA per cell type (ARPACK, matching R IRLBA) |
|
|
81
|
+
| 3 | `compute_distance` | Pairwise Euclidean distances (within and between types) |
|
|
82
|
+
| 4 | `compute_kernel_matrix` | Gaussian RBF kernel: K = exp(−d² / (2σ²)) |
|
|
83
|
+
| 5 | `run_skr_cca` | SkrCCA power-method optimization over σ values |
|
|
84
|
+
| 6 | `compute_normalized_correlation` | Spectral-norm normalized CCA correlation per σ and CC |
|
|
85
|
+
| 7 | `compute_gene_and_cell_scores` | Project CCA weights to cell and gene space |
|
|
86
|
+
|
|
87
|
+
Sigma selection is automatic: the σ that maximizes the mean CC1 normalized correlation is chosen as `obj.sigma_value_choice`.
|
|
88
|
+
|
|
89
|
+
**Multi-slide support:** Use `CoProMulti` for datasets spanning multiple tissue sections. CCA weights are learned jointly across slides; cell scores are computed per slide.
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## Tutorials
|
|
94
|
+
|
|
95
|
+
| Notebook | Dataset | Description |
|
|
96
|
+
|----------|---------|-------------|
|
|
97
|
+
| [`tutorials/tutorial_one_cell_type.ipynb`](tutorials/tutorial_one_cell_type.ipynb) | Intestinal organoid (seqFISH) | Within-type spatial self-organization of epithelial cells along the crypt–villus axis |
|
|
98
|
+
| [`tutorials/tutorial_two_cell_types.ipynb`](tutorials/tutorial_two_cell_types.ipynb) | Mouse brain MERFISH (Zhang et al. *Nature* 2023) | Cross-type co-progression between D1 and D2 striatal neurons |
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## API reference
|
|
103
|
+
|
|
104
|
+
### Data containers
|
|
105
|
+
|
|
106
|
+
**`CoProSingle(normalized_data, location_data, meta_data, cell_types)`**
|
|
107
|
+
Single-slide state container. All pipeline functions modify and return this object.
|
|
108
|
+
|
|
109
|
+
**`CoProMulti(normalized_data, location_data, meta_data, cell_types)`**
|
|
110
|
+
Multi-slide variant. Requires a `slideID` column in `meta_data`.
|
|
111
|
+
|
|
112
|
+
### Pipeline functions
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
cp.subset_data(obj, cell_types_of_interest)
|
|
116
|
+
cp.compute_pca(obj, n_pca=30, center=True, scale=True)
|
|
117
|
+
cp.compute_distance(obj, normalize=False)
|
|
118
|
+
cp.compute_kernel_matrix(obj, sigma_values, upper_quantile=0.85, lower_limit=5e-7)
|
|
119
|
+
cp.run_skr_cca(obj, scale_pcs=True, n_cc=2, max_iter=500, tol=1e-5)
|
|
120
|
+
cp.compute_normalized_correlation(obj, tol=1e-4)
|
|
121
|
+
cp.compute_gene_and_cell_scores(obj)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Key output slots
|
|
125
|
+
|
|
126
|
+
| Attribute | Type | Description |
|
|
127
|
+
|-----------|------|-------------|
|
|
128
|
+
| `obj.sigma_value_choice` | `float` | Automatically selected σ |
|
|
129
|
+
| `obj.normalized_correlation` | `dict[str, DataFrame]` | Normalized correlation per σ and CC |
|
|
130
|
+
| `obj.cell_scores` | `dict[str, ndarray]` | Cell scores, keyed `"cellScores\|sigma{s}\|{ct}"` |
|
|
131
|
+
| `obj.gene_scores` | `dict[str, ndarray]` | Gene scores, keyed `"geneScores\|sigma{s}\|{ct}"` |
|
|
132
|
+
| `obj.kernel_matrices` | `dict[str, ndarray]` | Kernel matrices, keyed `"kernel\|sigma{s}\|{ct_i}\|{ct_j}"` |
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## Numerical equivalence with R
|
|
137
|
+
|
|
138
|
+
The Python implementation is numerically validated against the R package on simulation and real datasets. Key design choices ensuring equivalence:
|
|
139
|
+
|
|
140
|
+
- PCA uses `scipy.sparse.linalg.svds` (ARPACK), the same Krylov-subspace family as R's `irlba::prcomp_irlba`
|
|
141
|
+
- Sign convention matches `prcomp_irlba` via `sklearn.utils.extmath.svd_flip`
|
|
142
|
+
- Distance and kernel computations are algebraically identical to R's `fields::rdist` + Gaussian RBF
|
|
143
|
+
- Kernel matrices agree to machine epsilon (~10⁻¹⁶)
|
|
144
|
+
- Cell score Pearson |r| vs R ≥ 0.9999 across all tested datasets and sigma values
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
## Citation
|
|
149
|
+
|
|
150
|
+
If you use CoPro in your research, please cite:
|
|
151
|
+
|
|
152
|
+
> Miao Z. et al. *CoPro: Unsupervised detection of coordinated spatial progressions in spatial transcriptomics* (in preparation).
|
|
153
|
+
|
|
154
|
+
**MERFISH tutorial data:**
|
|
155
|
+
> Zhang, M., Pan, X., Jung, W. et al. Molecularly defined and spatially resolved cell atlas of the whole mouse brain. *Nature* 624, 343–354 (2023). https://doi.org/10.1038/s41586-023-06808-9
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
159
|
+
## License
|
|
160
|
+
|
|
161
|
+
MIT
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""CoPro Python — Spatial Kernel-based Reduced Rank CCA for spatial transcriptomics."""
|
|
2
|
+
|
|
3
|
+
from .core import CoProSingle, CoProMulti, subset_data
|
|
4
|
+
from .pca import compute_pca
|
|
5
|
+
from .distance import compute_distance
|
|
6
|
+
from .kernel import compute_kernel_matrix
|
|
7
|
+
from .skrcca import run_skr_cca
|
|
8
|
+
from .correlation import compute_normalized_correlation
|
|
9
|
+
from .scores import compute_gene_and_cell_scores
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"CoProSingle",
|
|
13
|
+
"CoProMulti",
|
|
14
|
+
"subset_data",
|
|
15
|
+
"compute_pca",
|
|
16
|
+
"compute_distance",
|
|
17
|
+
"compute_kernel_matrix",
|
|
18
|
+
"run_skr_cca",
|
|
19
|
+
"compute_normalized_correlation",
|
|
20
|
+
"compute_gene_and_cell_scores",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""CoProSingle and CoProMulti dataclasses — state containers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class CoProSingle:
    """Single-slide CoPro state container.

    The first four fields are the required inputs. Every other field starts
    empty (or at its default value) and is meant to be filled in by the
    pipeline functions that operate on this object.
    """

    # --- Required inputs ---
    normalized_data: np.ndarray  # cells × genes
    location_data: pd.DataFrame  # cells × {x, y, ...}
    meta_data: pd.DataFrame
    cell_types: np.ndarray  # per-cell label vector

    # --- Populated by subset_data ---
    cell_types_of_interest: list = field(default_factory=list)
    normalized_data_sub: Optional[np.ndarray] = None
    location_data_sub: Optional[pd.DataFrame] = None
    cell_types_sub: Optional[np.ndarray] = None

    # --- Computed results, stored in flat keyed dicts ---
    # ct → dict with components/scores/sdev
    pca_global: dict = field(default_factory=dict)
    # flat keys: "dist|A|B"
    distances: dict = field(default_factory=dict)
    # flat keys: "kernel|sigma0.1|A|B"
    kernel_matrices: dict = field(default_factory=dict)
    sigma_values: list = field(default_factory=list)
    # "sigma_0.1" → {ct: w_matrix}
    skr_cca_out: dict = field(default_factory=dict)
    normalized_correlation: dict = field(default_factory=dict)
    sigma_value_choice: Optional[float] = None
    cell_scores: dict = field(default_factory=dict)
    gene_scores: dict = field(default_factory=dict)

    # --- Run settings ---
    n_cc: int = 2
    n_pca: int = 30
    scale_pcs: bool = True
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class CoProMulti:
    """Multi-slide CoPro state container.

    ``meta_data`` must contain a ``slideID`` column. The first four fields are
    the required inputs; all remaining fields start empty (or at their default
    value).
    """

    # --- Required inputs ---
    normalized_data: np.ndarray
    location_data: pd.DataFrame
    meta_data: pd.DataFrame
    cell_types: np.ndarray

    # --- Slides ---
    # ordered list of slide IDs
    slide_list: list = field(default_factory=list)

    # --- Populated by subset_data ---
    cell_types_of_interest: list = field(default_factory=list)
    normalized_data_sub: Optional[np.ndarray] = None
    location_data_sub: Optional[pd.DataFrame] = None
    cell_types_sub: Optional[np.ndarray] = None
    # subset of meta_data (with slideID)
    meta_data_sub: Optional[pd.DataFrame] = None

    # --- PCA ---
    # ct → global PCA dict (rotation, sdev)
    pca_global: dict = field(default_factory=dict)
    # slide → {ct → scores matrix}
    pca_results: dict = field(default_factory=dict)

    # --- Computed results ---
    # flat keys: "dist|{slide}|A|B"
    distances: dict = field(default_factory=dict)
    # flat keys: "kernel|sigma0.1|{slide}|A|B"
    kernel_matrices: dict = field(default_factory=dict)
    sigma_values: list = field(default_factory=list)
    # "sigma_0.1" → {ct: w_matrix} (shared)
    skr_cca_out: dict = field(default_factory=dict)
    normalized_correlation: dict = field(default_factory=dict)
    sigma_value_choice: Optional[float] = None
    # "cellScores|sigma0.1|{slide}|{ct}"
    cell_scores: dict = field(default_factory=dict)
    # "geneScores|sigma0.1|{ct}" (shared)
    gene_scores: dict = field(default_factory=dict)

    # --- Run settings ---
    n_cc: int = 2
    n_pca: int = 30
    scale_pcs: bool = True
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def subset_data(obj, cell_types_of_interest: list, min_cells: int = 10):
    """Restrict ``obj`` to the requested cell types.

    Routes to the multi-slide implementation when ``obj`` is a ``CoProMulti``
    and to the single-slide implementation for any other object. The chosen
    implementation's return value is passed straight back to the caller.
    """
    handler = (
        _subset_data_multi if isinstance(obj, CoProMulti) else _subset_data_single
    )
    return handler(obj, cell_types_of_interest, min_cells)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _subset_data_single(obj: CoProSingle, cell_types_of_interest: list, min_cells: int) -> CoProSingle:
    """Subset a single-slide object to the requested cell types.

    Every requested cell type must have at least ``min_cells`` cells; the
    check runs before anything on ``obj`` is modified.

    Parameters
    ----------
    obj : object whose ``cell_types``, ``normalized_data`` and
        ``location_data`` attributes are read.
    cell_types_of_interest : labels to keep.
    min_cells : minimum number of cells required per requested label.

    Returns
    -------
    The same ``obj``, with ``cell_types_of_interest``,
    ``normalized_data_sub``, ``location_data_sub`` (index reset) and
    ``cell_types_sub`` set.

    Raises
    ------
    ValueError
        If a requested cell type has fewer than ``min_cells`` cells.
    """
    # Coerce the labels once. Comparing a plain Python list with ``== ct``
    # yields a single bool rather than an element-wise mask, which made every
    # cell type look like it had zero cells.
    labels = np.asarray(obj.cell_types)

    for ct in cell_types_of_interest:
        n = int(np.sum(labels == ct))
        if n < min_cells:
            raise ValueError(
                f"Cell type '{ct}' has only {n} cells (minimum {min_cells} required)."
            )

    mask = np.isin(labels, cell_types_of_interest)
    obj.cell_types_of_interest = list(cell_types_of_interest)
    obj.normalized_data_sub = obj.normalized_data[mask]
    obj.location_data_sub = obj.location_data.loc[mask].reset_index(drop=True)
    obj.cell_types_sub = labels[mask]
    return obj
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _subset_data_multi(obj: CoProMulti, cell_types_of_interest: list, min_cells: int) -> CoProMulti:
    """Subset a multi-slide object to the requested cell types.

    The minimum-cell check uses the total count of each cell type across all
    slides; counts are not checked per slide.

    Parameters
    ----------
    obj : object whose ``meta_data`` (with a ``slideID`` column),
        ``cell_types``, ``normalized_data`` and ``location_data`` attributes
        are read.
    cell_types_of_interest : labels to keep.
    min_cells : minimum total number of cells required per requested label.

    Returns
    -------
    The same ``obj``, with ``cell_types_of_interest``,
    ``normalized_data_sub``, ``location_data_sub``, ``cell_types_sub`` and
    ``meta_data_sub`` set (DataFrame indices reset). ``slide_list`` is filled
    with the sorted unique slide IDs when it was empty.

    Raises
    ------
    ValueError
        If ``meta_data`` has no ``slideID`` column, or a requested cell type
        has fewer than ``min_cells`` cells in total.
    """
    if "slideID" not in obj.meta_data.columns:
        raise ValueError("meta_data must have a 'slideID' column for CoProMulti.")

    # Discover the slide list from meta_data if the caller did not set one.
    if not obj.slide_list:
        obj.slide_list = sorted(obj.meta_data["slideID"].unique().tolist())

    # Coerce the labels once. Comparing a plain Python list with ``== ct``
    # yields a single bool rather than an element-wise mask, which made every
    # cell type look like it had zero cells.
    labels = np.asarray(obj.cell_types)

    for ct in cell_types_of_interest:
        n_total = int(np.sum(labels == ct))
        if n_total < min_cells:
            raise ValueError(
                f"Cell type '{ct}' has only {n_total} cells total (minimum {min_cells} required)."
            )

    mask = np.isin(labels, cell_types_of_interest)
    obj.cell_types_of_interest = list(cell_types_of_interest)
    obj.normalized_data_sub = obj.normalized_data[mask]
    obj.location_data_sub = obj.location_data.loc[mask].reset_index(drop=True)
    obj.cell_types_sub = labels[mask]
    obj.meta_data_sub = obj.meta_data.loc[mask].reset_index(drop=True)
    return obj
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
"""compute_normalized_correlation() — spectral-norm normalized CCA correlation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from itertools import combinations
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from scipy.sparse.linalg import svds
|
|
10
|
+
|
|
11
|
+
from .core import CoProSingle
|
|
12
|
+
from .skrcca import _prepare_pc_matrices
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _spectral_norm(K: np.ndarray, tol: float = 1e-4) -> float:
    """Return the spectral norm (largest singular value) of ``K``.

    A rank-1 truncated SVD (``svds`` with ``k=1`` and tolerance ``tol``) is
    tried first. If that raises for any reason, the dense matrix 2-norm from
    ``np.linalg.norm`` is returned instead.
    """
    try:
        singular_values = svds(
            K.astype(float), k=1, tol=tol, return_singular_vectors=False
        )
        largest = float(singular_values[0])
    except Exception:
        # Best-effort fallback: the truncated solver could not be used on this
        # input, so compute the 2-norm densely.
        largest = float(np.linalg.norm(K, ord=2))
    return largest
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _get_kernel_for_pair(flat_kernels, sigma, ct_i, ct_j, slide=None):
    """Look up the kernel matrix for a cell-type pair in a flat keyed dict.

    The key is tried in the order ``(ct_i, ct_j)`` first; if only the
    ``(ct_j, ct_i)`` entry exists, its transpose is returned. When ``slide``
    is given, the slide ID is part of the key.

    Raises
    ------
    KeyError
        If neither ordering is present in ``flat_kernels``.
    """
    if slide is None:
        candidates = (
            (f"kernel|sigma{sigma}|{ct_i}|{ct_j}", False),
            (f"kernel|sigma{sigma}|{ct_j}|{ct_i}", True),
        )
    else:
        candidates = (
            (f"kernel|sigma{sigma}|{slide}|{ct_i}|{ct_j}", False),
            (f"kernel|sigma{sigma}|{slide}|{ct_j}|{ct_i}", True),
        )

    for key, needs_transpose in candidates:
        if key in flat_kernels:
            found = flat_kernels[key]
            return found.T if needs_transpose else found

    raise KeyError(f"Kernel not found for ({ct_i},{ct_j}) sigma={sigma} slide={slide}")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def compute_normalized_correlation(obj, tol: float = 1e-4):
    """Compute the normalized CCA correlation for each sigma × pair × CC.

    Dispatches to the multi-slide version for CoProMulti objects.

    Formula:
        numerator   = (A @ w1)^T K (B @ w2)
        denominator = ||A @ w1|| * ||B @ w2|| * ||K||_spec
        norm_corr   = numerator / denominator  (0.0 when |denominator| < 1e-9)

    Results are stored in ``obj.normalized_correlation[f"sigma_{sigma}"]`` as
    a DataFrame with columns ``sigma``, ``cell_type_1``, ``cell_type_2``,
    ``CC_index`` (1-based) and ``normalized_correlation``.

    ``obj.sigma_value_choice`` is set to the sigma with the highest mean CC1
    correlation. Sigmas whose mean is NaN are ignored for this choice unless
    every candidate is NaN.

    Parameters
    ----------
    obj : CoPro object with CCA weights in ``skr_cca_out`` and kernels in
        ``kernel_matrices``.
    tol : tolerance passed to the spectral-norm computation.

    Returns
    -------
    The same ``obj``, updated in place.

    Raises
    ------
    ValueError
        If no cell types of interest are set or CCA results are missing
        (single-slide path).
    """
    from .core import CoProMulti

    if isinstance(obj, CoProMulti):
        return _compute_normalized_correlation_multi(obj, tol)

    # --- Single-slide path ---
    cts = obj.cell_types_of_interest
    if not cts:
        raise ValueError("No cell types of interest.")
    if not obj.skr_cca_out:
        raise ValueError("CCA results missing. Run run_skr_cca() first.")

    scale_pcs = getattr(obj, "scale_pcs", True)
    n_cc = obj.n_cc

    # PC matrices per cell type, as prepared by the CCA module.
    X_dict = _prepare_pc_matrices(obj, scale_pcs, cts)

    # A single cell type is paired with itself; otherwise use all unordered pairs.
    if len(cts) == 1:
        pairs = [(cts[0], cts[0])]
    else:
        pairs = list(combinations(cts, 2))

    print("Calculating spectral norms (may take a while)...")

    # Precompute spectral norms for each sigma × pair (stored under both orderings).
    spec_norms = {}
    for sigma in obj.sigma_values:
        spec_norms[sigma] = {}
        for ct_i, ct_j in pairs:
            try:
                K = _get_kernel_for_pair(obj.kernel_matrices, sigma, ct_i, ct_j)
                value = _spectral_norm(K, tol=tol)
            except KeyError:
                value = np.nan
            spec_norms[sigma][(ct_i, ct_j)] = value
            spec_norms[sigma][(ct_j, ct_i)] = value

    print("Finished calculating spectral norms.")

    correlation_value = {}

    for sigma in obj.sigma_values:
        sigma_name = f"sigma_{sigma}"
        w_sigma = obj.skr_cca_out.get(sigma_name)
        if w_sigma is None:
            # No CCA weights were stored for this sigma.
            continue

        rows = []
        for ct_i, ct_j in pairs:
            A = X_dict[ct_i]
            B = X_dict[ct_j]
            try:
                K = _get_kernel_for_pair(obj.kernel_matrices, sigma, ct_i, ct_j)
            except KeyError:
                # No kernel for this pair at this sigma: nothing to report.
                continue
            norm_K = spec_norms[sigma].get((ct_i, ct_j), np.nan)

            for cc in range(n_cc):
                # Slice with cc:cc+1 to keep the weights as column vectors.
                w1 = w_sigma[ct_i][:, cc : cc + 1]
                w2 = w_sigma[ct_j][:, cc : cc + 1]

                Aw1 = A @ w1
                Bw2 = B @ w2

                numerator = float((Aw1.T @ K @ Bw2).flat[0])
                denom = float(np.sqrt(np.sum(Aw1 ** 2))) * float(np.sqrt(np.sum(Bw2 ** 2))) * norm_K

                norm_corr = 0.0 if abs(denom) < 1e-9 else numerator / denom

                rows.append({
                    "sigma": sigma,
                    "cell_type_1": ct_i,
                    "cell_type_2": ct_j,
                    "CC_index": cc + 1,
                    "normalized_correlation": norm_corr,
                })

        correlation_value[sigma_name] = pd.DataFrame(rows)

    obj.normalized_correlation = correlation_value

    # Choose the sigma maximizing the mean CC1 correlation.
    all_cc1 = []
    for sigma_name, df in correlation_value.items():
        if df is not None and len(df) > 0:
            cc1 = df[df["CC_index"] == 1]
            mean_corr = cc1["normalized_correlation"].mean()
            sigma_val = float(sigma_name.replace("sigma_", ""))
            all_cc1.append((sigma_val, mean_corr))

    if all_cc1:
        # max() with a NaN key is order-dependent (every comparison against NaN
        # is False), so a leading NaN would always win. Rank only the finite
        # means; fall back to the full list when every mean is NaN.
        ranked = [item for item in all_cc1 if not np.isnan(item[1])] or all_cc1
        obj.sigma_value_choice = max(ranked, key=lambda x: x[1])[0]

    return obj
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _compute_normalized_correlation_multi(obj, tol=1e-4):
    """Multi-slide normalized correlation, computed per slide.

    For each (sigma, pair, CC, slide) the normalized correlation is computed
    from that slide's PCA scores in ``obj.pca_results`` (used as stored, with
    no additional scaling here) and that slide's kernel spectral norm. Per
    the original author's note this mirrors the R implementation's per-slide
    output format.

    Results are stored in ``obj.normalized_correlation[f"sigma_{sigma}"]`` as
    a DataFrame with columns ``sigma``, ``slideID``, ``cell_type_1``,
    ``cell_type_2``, ``CC_index`` (1-based) and ``normalized_correlation``.

    ``obj.sigma_value_choice`` is set to the sigma with the highest mean CC1
    correlation across slides and pairs. Sigmas whose mean is NaN are ignored
    for this choice unless every candidate is NaN.

    Parameters
    ----------
    obj : multi-slide CoPro object.
    tol : tolerance passed to the spectral-norm computation.

    Returns
    -------
    The same ``obj``, updated in place.
    """
    cts = obj.cell_types_of_interest
    slides = obj.slide_list
    n_cc = obj.n_cc

    # Per-slide PCA scores; cell types missing from a slide are left out.
    X_list_all = {
        slide: {
            ct: obj.pca_results[slide][ct].astype(float)
            for ct in cts
            if ct in obj.pca_results.get(slide, {})
        }
        for slide in slides
    }

    # A single cell type is paired with itself; otherwise use all unordered pairs.
    if len(cts) == 1:
        pairs = [(cts[0], cts[0])]
    else:
        pairs = list(combinations(cts, 2))

    # Precompute per-slide spectral norms for each sigma × pair.
    print("Calculating spectral norms (multi-slide)...")
    spec_norms = {}  # spec_norms[sigma][(ct_i, ct_j, slide)]
    for sigma in obj.sigma_values:
        spec_norms[sigma] = {}
        for ct_i, ct_j in pairs:
            for slide in slides:
                try:
                    K = _get_kernel_for_pair(obj.kernel_matrices, sigma, ct_i, ct_j, slide)
                    val = _spectral_norm(K, tol)
                except KeyError:
                    val = np.nan
                spec_norms[sigma][(ct_i, ct_j, slide)] = val
                spec_norms[sigma][(ct_j, ct_i, slide)] = val
    print("Finished spectral norms.")

    correlation_value = {}

    for sigma in obj.sigma_values:
        sigma_name = f"sigma_{sigma}"
        w_sigma = obj.skr_cca_out.get(sigma_name)
        if w_sigma is None:
            # No CCA weights were stored for this sigma.
            continue

        rows = []
        for ct_i, ct_j in pairs:
            for cc in range(n_cc):
                # Slice with cc:cc+1 to keep the weights as column vectors.
                w1 = w_sigma[ct_i][:, cc:cc + 1]
                w2 = w_sigma[ct_j][:, cc:cc + 1]

                # One row per slide that has both cell types and a kernel.
                for slide in slides:
                    A = X_list_all[slide].get(ct_i)
                    B = X_list_all[slide].get(ct_j)
                    if A is None or B is None:
                        continue
                    try:
                        K = _get_kernel_for_pair(obj.kernel_matrices, sigma, ct_i, ct_j, slide)
                    except KeyError:
                        continue

                    norm_K = spec_norms[sigma].get((ct_i, ct_j, slide), np.nan)
                    Aw1 = A @ w1
                    Bw2 = B @ w2
                    numerator = float((Aw1.T @ K @ Bw2).flat[0])
                    denom = (float(np.linalg.norm(Aw1)) *
                             float(np.linalg.norm(Bw2)) *
                             norm_K)
                    norm_corr = 0.0 if abs(denom) < 1e-9 else numerator / denom

                    rows.append({
                        "sigma": sigma,
                        "slideID": slide,
                        "cell_type_1": ct_i,
                        "cell_type_2": ct_j,
                        "CC_index": cc + 1,
                        "normalized_correlation": norm_corr,
                    })

        correlation_value[sigma_name] = pd.DataFrame(rows)

    obj.normalized_correlation = correlation_value

    # Choose the sigma maximizing the mean CC1 correlation across slides.
    all_cc1 = []
    for sigma_name, df in correlation_value.items():
        if df is not None and len(df) > 0:
            cc1 = df[df["CC_index"] == 1]
            mean_corr = cc1["normalized_correlation"].mean()
            sigma_val = float(sigma_name.replace("sigma_", ""))
            all_cc1.append((sigma_val, mean_corr))

    if all_cc1:
        # max() with a NaN key is order-dependent (every comparison against NaN
        # is False), so a leading NaN would always win. Rank only the finite
        # means; fall back to the full list when every mean is NaN.
        ranked = [item for item in all_cc1 if not np.isnan(item[1])] or all_cc1
        obj.sigma_value_choice = max(ranked, key=lambda x: x[1])[0]

    return obj
|