py-TranspaceR 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py_transpacer-0.1.0.dist-info/METADATA +140 -0
- py_transpacer-0.1.0.dist-info/RECORD +15 -0
- py_transpacer-0.1.0.dist-info/WHEEL +5 -0
- py_transpacer-0.1.0.dist-info/top_level.txt +1 -0
- transspacer/__init__.py +19 -0
- transspacer/clustering.py +194 -0
- transspacer/fft_utils.py +97 -0
- transspacer/gene_selection.py +67 -0
- transspacer/normalization.py +24 -0
- transspacer/plotting.py +199 -0
- transspacer/qc.py +74 -0
- transspacer/sparse_utils.py +67 -0
- transspacer/spatial_stats.py +325 -0
- transspacer/utils.py +198 -0
- transspacer/variogram.py +330 -0
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: py-TranspaceR
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Statistical analysis of Spatial transcriptomic data (Python port of TranspaceR)
|
|
5
|
+
Author-email: Pierre Bost <pierre.bost@curie.fr>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/TranspaceR/TranspaceR
|
|
8
|
+
Keywords: spatial,transcriptomics,variogram,geary-c,bioinformatics
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
19
|
+
Requires-Python: >=3.8
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
Requires-Dist: numpy>=1.21
|
|
22
|
+
Requires-Dist: scipy>=1.7
|
|
23
|
+
Requires-Dist: scikit-learn>=1.0
|
|
24
|
+
Requires-Dist: pandas>=1.3
|
|
25
|
+
Requires-Dist: matplotlib>=3.4
|
|
26
|
+
Provides-Extra: umap
|
|
27
|
+
Requires-Dist: umap-learn>=0.5; extra == "umap"
|
|
28
|
+
Provides-Extra: leiden
|
|
29
|
+
Requires-Dist: python-igraph>=0.10; extra == "leiden"
|
|
30
|
+
Requires-Dist: leidenalg>=0.9; extra == "leiden"
|
|
31
|
+
Provides-Extra: stats
|
|
32
|
+
Requires-Dist: statsmodels>=0.13; extra == "stats"
|
|
33
|
+
Provides-Extra: all
|
|
34
|
+
Requires-Dist: py-TranspaceR[leiden,stats,umap]; extra == "all"
|
|
35
|
+
Provides-Extra: dev
|
|
36
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
37
|
+
Requires-Dist: py-TranspaceR[all]; extra == "dev"
|
|
38
|
+
|
|
39
|
+
# py-TranspaceR
|
|
40
|
+
|
|
41
|
+
[Python 3.8+](https://www.python.org/downloads/)
|
|
42
|
+
[License: MIT](LICENSE)
|
|
43
|
+
[Tests: 29 passed](#testing)
|
|
44
|
+
[Speedup: 17.7x vs R](#speed-benchmark)
|
|
45
|
+
|
|
46
|
+
Python port of [TranspaceR](https://github.com/TranspaceR/TranspaceR) — Statistical analysis of Spatial transcriptomic data.
|
|
47
|
+
|
|
48
|
+
## Correlation Benchmark (Python vs R)
|
|
49
|
+
|
|
50
|
+
| Function | Pearson r | Max Abs Error |
|
|
51
|
+
|---|---|---|
|
|
52
|
+
| `C_normalisation` | 1.000000 | 4.10e-05 |
|
|
53
|
+
| `Otsu_thresholding` | — | 3.54e-05 |
|
|
54
|
+
| `colvars_sparse` | 1.000000 | 4.40e-05 |
|
|
55
|
+
| `Get_variogram_map` | Deterministic match | 0 |
|
|
56
|
+
| `Get_isotropic_vario` | 1.000000 | 0 |
|
|
57
|
+
|
|
58
|
+
All outputs are highly consistent with the R references: every Pearson correlation is 1.000000 and the maximum absolute error never exceeds 4.4e-05.
|
|
59
|
+
|
|
60
|
+
## Speed Benchmark (39,047 cells x 539 genes)
|
|
61
|
+
|
|
62
|
+
| Function | R Time | Python Time | Speedup |
|
|
63
|
+
|---|---|---|---|
|
|
64
|
+
| `C_normalisation` | 1.47s | 0.157s | 9.4x |
|
|
65
|
+
| `Otsu_thresholding` | 0.31s | 0.031s | 10.0x |
|
|
66
|
+
| `colvars_sparse` | 1.72s | 0.011s | 156x |
|
|
67
|
+
| `Get_variogram_map` | 0.02s | 0.0004s | 50x |
|
|
68
|
+
| `Get_isotropic_vario` | 0.01s | 0.0004s | 25x |
|
|
69
|
+
| **Total** | **3.53s** | **0.20s** | **17.7x** |
|
|
70
|
+
|
|
71
|
+
### Why faster
|
|
72
|
+
|
|
73
|
+
- NumPy/SciPy compiled C backend vs R interpreted execution
|
|
74
|
+
- Direct CSC sparse matrix memory layout access
|
|
75
|
+
- Broadcasting replaces R's row-wise `apply` loops
|
|
76
|
+
|
|
77
|
+
## Installation
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install -e ".[all]"
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Quick Start
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
import transspacer as ts
|
|
87
|
+
import numpy as np
|
|
88
|
+
import pandas as pd
|
|
89
|
+
|
|
90
|
+
# Load data
|
|
91
|
+
expr = pd.read_csv("Expression_file.csv.gz", index_col=0)
|
|
92
|
+
meta = pd.read_csv("Meta_data.csv", index_col=0)
|
|
93
|
+
|
|
94
|
+
# Cell-size normalisation
|
|
95
|
+
normed = ts.c_normalisation(expr.values.astype(float), meta["Area"].values)
|
|
96
|
+
|
|
97
|
+
# Otsu thresholding
|
|
98
|
+
threshold = ts.otsu_thresholding(np.log10(expr.values.sum(axis=1) + 1))
|
|
99
|
+
|
|
100
|
+
# Variogram analysis
|
|
101
|
+
result = ts.compute_variogram(normed, meta["cell_centroid_x"].values,
|
|
102
|
+
meta["cell_centroid_y"].values)
|
|
103
|
+
|
|
104
|
+
# Geary's C spatial autocorrelation
|
|
105
|
+
gc = ts.geary_c_score(normed, coords, pvalue_threshold=0.01)
|
|
106
|
+
|
|
107
|
+
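# Clustering on a reduced embedding; pca_data is a (cells x components) array, e.g. from sklearn.decomposition.PCA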
|
|
108
|
+
labels = ts.cell_clustering_function(pca_data, K=10, resolution=1.0)
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Modules
|
|
112
|
+
|
|
113
|
+
| Module | Description |
|
|
114
|
+
|---|---|
|
|
115
|
+
| `fft_utils` | `fftshift`, `ifftshift`, `pad_definitor` |
|
|
116
|
+
| `normalization` | `c_normalisation` cell-size normalisation (port of R's `C_normalisation`) |
|
|
117
|
+
| `sparse_utils` | Sparse matrix column variance, group aggregation |
|
|
118
|
+
| `variogram` | FFT variogram map, variogram model fitting |
|
|
119
|
+
| `spatial_stats` | Geary's C, NB excess variance / excess zero score |
|
|
120
|
+
| `clustering` | KNN + Leiden/Louvain clustering, UMAP |
|
|
121
|
+
| `gene_selection` | `log2FC`, gene set union |
|
|
122
|
+
| `qc` | Otsu thresholding, QC gene filtering |
|
|
123
|
+
| `plotting` | Spatial visualization, heatmaps, UMAP plots |
|
|
124
|
+
|
|
125
|
+
## Testing
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
pytest tests/ -q
|
|
129
|
+
# 29 passed
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Dependencies
|
|
133
|
+
|
|
134
|
+
**Core:** `numpy`, `scipy`, `scikit-learn`, `pandas`, `matplotlib`
|
|
135
|
+
|
|
136
|
+
**Optional:** `umap-learn` (UMAP), `python-igraph` + `leidenalg` (Leiden clustering), `statsmodels` (FDR correction)
|
|
137
|
+
|
|
138
|
+
## License
|
|
139
|
+
|
|
140
|
+
MIT
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
transspacer/__init__.py,sha256=OozYDtA1-4NAZ-lNNqNwKutVxAgu6dAK2Fh4D5uRwD8,1049
|
|
2
|
+
transspacer/clustering.py,sha256=BR6evtfPvE6NZ5bdORyYlHFZz5OgHWJU-cNXQVrZHrA,6361
|
|
3
|
+
transspacer/fft_utils.py,sha256=UBK_cd_tDkkKIQe38LOTuxqvE-4iheRKkXtd--Jj85k,2562
|
|
4
|
+
transspacer/gene_selection.py,sha256=BVdXvcwnTsgSe4iaoxQPtoVdlmvbZ-rd-zLkuV3L24w,1933
|
|
5
|
+
transspacer/normalization.py,sha256=bytE0lT_E3JuUW-jB-1iwLbZ02QmRhN5n_sV-BPHrdw,651
|
|
6
|
+
transspacer/plotting.py,sha256=Zi9QbQdhX9zNxAJCMyJyB8NupBfw4TXxP7gMdlbc0CA,6906
|
|
7
|
+
transspacer/qc.py,sha256=NcwPAb8AakIikegGwZwtobH6Ograw1MK55o3hCbPsBs,2066
|
|
8
|
+
transspacer/sparse_utils.py,sha256=Ny_bV7tKDbWGbjVIHZA8wegPuqW10Q4AM5Kk5GRyDq8,2041
|
|
9
|
+
transspacer/spatial_stats.py,sha256=mHYO6Vw2IeFswd6rYZTKsgVKqU4ZaZhSKOrSio9oLYg,10840
|
|
10
|
+
transspacer/utils.py,sha256=PzRbtu-NCgrlExnx9wuEZPPcaQEPqd4q8j_ALpRsl6c,5267
|
|
11
|
+
transspacer/variogram.py,sha256=-F0rvtPB8TAsA8gJ1IRiL1ElBhlx3r-9heWTH3115ZA,10964
|
|
12
|
+
py_transpacer-0.1.0.dist-info/METADATA,sha256=NMx3nNk4jGwsGYjnVXemREuQJnkpGwGTho2Gdrphw6Y,4813
|
|
13
|
+
py_transpacer-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
14
|
+
py_transpacer-0.1.0.dist-info/top_level.txt,sha256=b0JQl_A-pWokxEVEyIx8va_gw0drZXrce3b7BUJlsBQ,12
|
|
15
|
+
py_transpacer-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
transspacer
|
transspacer/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""
|
|
2
|
+
TranspaceR: Statistical analysis of Spatial transcriptomic data (Python port)
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
__version__ = "0.1.0"
|
|
6
|
+
|
|
7
|
+
from .fft_utils import fftshift, ifftshift, pad_definitor
|
|
8
|
+
from .normalization import c_normalisation
|
|
9
|
+
from .qc import otsu_thresholding, qc_gene_threshold
|
|
10
|
+
from .sparse_utils import colvars_sparse, aggregate_sparse
|
|
11
|
+
from .gene_selection import calculate_log2fc, select_genes
|
|
12
|
+
from .variogram import get_variogram_map, get_isotropic_vario, process_gene, compute_variogram
|
|
13
|
+
from .spatial_stats import geary_c_score, excess_variance_ratio_nb, excess_zero_score_nb
|
|
14
|
+
from .clustering import cell_clustering_function, clusters_maker, umap_maker
|
|
15
|
+
from .utils import (color_convertion, string_to_colors, compute_radius,
|
|
16
|
+
curate_data, fraction_multiple_samples, load_scimilarity_results)
|
|
17
|
+
from .plotting import (plot_fov, plot_fov_gene, plot_variogram, save_umap,
|
|
18
|
+
save_heatmap_markers, save_annotation_plot, save_boxplot,
|
|
19
|
+
save_dendogram, save_geary_variance_plot, save_tissue_visualization)
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""Cell clustering, UMAP, and Clusters_maker pipeline."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from scipy.sparse import issparse, csc_matrix
|
|
5
|
+
from sklearn.neighbors import NearestNeighbors
|
|
6
|
+
from scipy.sparse.csgraph import connected_components
|
|
7
|
+
|
|
8
|
+
def _check_leiden():
|
|
9
|
+
try:
|
|
10
|
+
import igraph as ig
|
|
11
|
+
import leidenalg
|
|
12
|
+
return ig, leidenalg, True
|
|
13
|
+
except ImportError:
|
|
14
|
+
return None, None, False
|
|
15
|
+
|
|
16
|
+
def _check_umap():
|
|
17
|
+
try:
|
|
18
|
+
import umap as umap_lib
|
|
19
|
+
return umap_lib, True
|
|
20
|
+
except (ImportError, TypeError, Exception):
|
|
21
|
+
return None, False
|
|
22
|
+
|
|
23
|
+
from .sparse_utils import colvars_sparse, aggregate_sparse
|
|
24
|
+
from .gene_selection import calculate_log2fc
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def cell_clustering_function(data_correction: np.ndarray, K: int = 30,
                             metric: str = "euclidean", n_threads: int = 1,
                             resolution: float = 1.0) -> np.ndarray:
    """Cluster cells on a symmetrised KNN graph (Leiden, else spectral).

    Parameters
    ----------
    data_correction : np.ndarray
        Reduced data matrix (cells x components).
    K : int
        Number of neighbors per cell (the cell itself is not counted).
    metric : str
        Distance metric passed to ``NearestNeighbors``.
    n_threads : int
        Number of parallel jobs for the neighbor search.
    resolution : float
        Resolution parameter of the Leiden partition. It is ignored by the
        spectral-clustering fallback.

    Returns
    -------
    np.ndarray
        Cluster labels (string array, 1-indexed to match R).

    Notes
    -----
    When ``igraph``/``leidenalg`` are not importable, the function falls back
    to ``SpectralClustering`` on a dense adjacency matrix with
    ``max(2, n_cells // 100)`` clusters.
    """
    n = data_correction.shape[0]

    # Query the fitted data itself (no X argument): scikit-learn then leaves
    # each point out of its own neighbor list. Slicing off column 0 instead
    # is wrong when duplicated points exist, because the point itself is not
    # guaranteed to come first among zero-distance ties.
    nn = NearestNeighbors(n_neighbors=K, metric=metric, n_jobs=n_threads)
    nn.fit(data_correction)
    neighbors = nn.kneighbors(return_distance=False)

    # Directed KNN edges i -> j as two flat index arrays.
    rows = np.repeat(np.arange(n), K)
    cols = neighbors.ravel()

    ig, la, has_leiden = _check_leiden()
    if has_leiden:
        # Undirected graph: order each pair as (low, high) and drop the
        # duplicates produced by mutual neighbors. No n x n matrix is needed.
        pairs = np.unique(
            np.column_stack([np.minimum(rows, cols), np.maximum(rows, cols)]),
            axis=0,
        )
        edges = [tuple(pair) for pair in pairs.tolist()]
        g = ig.Graph(n=n, edges=edges, directed=False)
        g.es["weight"] = [1.0] * len(edges)

        partition = la.find_partition(g, la.RBConfigurationVertexPartition,
                                      resolution_parameter=resolution,
                                      n_iterations=-1)
        labels = np.array([str(m + 1) for m in partition.membership])  # 1-indexed
    else:
        # Fallback: spectral clustering on the symmetric 0/1 adjacency matrix.
        from sklearn.cluster import SpectralClustering
        adj = np.zeros((n, n))
        adj[rows, cols] = 1.0
        adj[cols, rows] = 1.0
        n_clusters = max(2, int(n / 100))  # rough estimate
        sc = SpectralClustering(n_clusters=n_clusters, affinity="precomputed",
                                assign_labels="discretize", random_state=42)
        labels = (sc.fit_predict(adj) + 1).astype(str)  # 1-indexed

    return labels
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def umap_maker(pca_data: np.ndarray, n_components: int = 2,
               random_state: int = 42) -> np.ndarray:
    """Embed a PCA matrix with UMAP, or with t-SNE when UMAP is unavailable.

    Parameters
    ----------
    pca_data : np.ndarray
        PCA-reduced data (cells x components).
    n_components : int
        Number of embedding dimensions.
    random_state : int
        Random seed forwarded to the embedding model.

    Returns
    -------
    np.ndarray
        Embedding coordinates (cells x n_components).
    """
    umap_lib, has_umap = _check_umap()

    if not has_umap:
        # Optional dependency missing: use scikit-learn's t-SNE instead.
        from sklearn.manifold import TSNE
        embedder = TSNE(n_components=n_components, random_state=random_state)
        return embedder.fit_transform(pca_data)

    embedder = umap_lib.UMAP(n_components=n_components, random_state=random_state)
    return embedder.fit_transform(pca_data)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def clusters_maker(expression, shared_genes=None, K: int = 30,
                   metric_used: str = "euclidean", n_threads: int = 1,
                   resolution: float = 1.0, nv: int = 50) -> dict:
    """End-to-end: PCA -> clustering -> mean expression -> log2FC.

    Parameters
    ----------
    expression : np.ndarray or sparse matrix
        Expression matrix (cells x genes).
    shared_genes : list, optional
        Column selector applied as ``expression[:, shared_genes]``. If None,
        all genes are used.
    K : int
        Number of neighbors for clustering.
    metric_used : str
        Distance metric.
    n_threads : int
        Number of threads.
    resolution : float
        Clustering resolution.
    nv : int
        Maximum number of PCA/SVD components.

    Returns
    -------
    dict
        ``PCA_data`` (cells x components embedding), ``Data_correction``
        (the matrix that was decomposed), ``Clustering`` (labels per cell),
        ``Mean_expression`` (per-cluster means of ``Data_correction``) and
        ``Log2FC_table`` (``{gene column index: {cluster: log2FC}}``, computed
        over every column of ``expression``, not only ``shared_genes``).
    """
    from sklearn.decomposition import PCA, TruncatedSVD

    # Column selection uses the same indexing for dense and sparse input.
    if shared_genes is None:
        data_correction = expression
    else:
        data_correction = expression[:, shared_genes]
    sparse_input = issparse(data_correction)

    # PCA via SVD
    if sparse_input:
        svd = TruncatedSVD(n_components=min(nv, data_correction.shape[1] - 1),
                           random_state=42)
        pca_u = svd.fit_transform(data_correction)
    else:
        # PCA cannot extract more components than min(n_cells, n_genes);
        # clamp on both axes so small inputs do not raise.
        pca = PCA(n_components=min(nv, *data_correction.shape), random_state=42)
        pca_u = pca.fit_transform(data_correction)

    # Clustering
    clustering = cell_clustering_function(pca_u, K, metric_used, n_threads, resolution)

    # Mean expression per cluster
    if sparse_input:
        # NOTE(review): assumes aggregate_sparse returns genes x groups, so the
        # transpose yields groups x genes -- confirm against sparse_utils.
        mean_expression = aggregate_sparse(data_correction, clustering).T
    else:
        groups = np.unique(clustering)
        mean_expression = np.zeros((len(groups), data_correction.shape[1]))
        for i, g in enumerate(groups):
            mean_expression[i, :] = data_correction[clustering == g, :].mean(axis=0)

    # Log2FC: densify once up front instead of once per gene inside the loop.
    dense_expression = expression.toarray() if issparse(expression) else expression
    log2fc_list = {
        g_idx: calculate_log2fc(dense_expression, g_idx, clustering)
        for g_idx in range(expression.shape[1])
    }

    return {
        "PCA_data": pca_u,
        "Data_correction": data_correction,
        "Clustering": clustering,
        "Mean_expression": mean_expression,
        "Log2FC_table": log2fc_list
    }
|
transspacer/fft_utils.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""FFT utility functions: fftshift, ifftshift, pad_definitor."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def fftshift(input_matrix: np.ndarray, dim: int = -1) -> np.ndarray:
    """Circularly shift a 2D array so the zero-frequency term is centred.

    Along each shifted axis the first ``ceil(n / 2)`` entries are moved
    behind the remaining ones.

    Parameters
    ----------
    input_matrix : np.ndarray
        2D array to shift.
    dim : int
        -1 for both dimensions, 1 for rows, 2 for columns.

    Returns
    -------
    np.ndarray
        Shifted array (a new array; the input is not modified).

    Raises
    ------
    ValueError
        If ``dim`` is not -1, 1 or 2.
    """
    n_rows, n_cols = input_matrix.shape
    # A negative roll by ceil(n / 2) sends the leading half to the end.
    row_shift = -((n_rows + 1) // 2)
    col_shift = -((n_cols + 1) // 2)

    if dim == -1:
        shifted_rows = np.roll(input_matrix, row_shift, axis=0)
        return np.roll(shifted_rows, col_shift, axis=1)
    if dim == 1:
        return np.roll(input_matrix, row_shift, axis=0)
    if dim == 2:
        return np.roll(input_matrix, col_shift, axis=1)
    raise ValueError("Invalid dimension parameter")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def ifftshift(input_matrix: np.ndarray, dim: int = -1) -> np.ndarray:
    """Undo ``fftshift`` on a 2D array.

    Along each shifted axis the first ``floor(n / 2)`` entries are moved
    behind the remaining ones.

    Parameters
    ----------
    input_matrix : np.ndarray
        2D array to shift.
    dim : int
        -1 for both dimensions, 1 for rows, 2 for columns.

    Returns
    -------
    np.ndarray
        Shifted array (a new array; the input is not modified).

    Raises
    ------
    ValueError
        If ``dim`` is not -1, 1 or 2.
    """
    n_rows, n_cols = input_matrix.shape
    # A negative roll by floor(n / 2) sends the leading half to the end.
    row_shift = -(n_rows // 2)
    col_shift = -(n_cols // 2)

    if dim == -1:
        shifted_cols = np.roll(input_matrix, col_shift, axis=1)
        return np.roll(shifted_cols, row_shift, axis=0)
    if dim == 1:
        return np.roll(input_matrix, row_shift, axis=0)
    if dim == 2:
        return np.roll(input_matrix, col_shift, axis=1)
    raise ValueError("Invalid dimension parameter")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def pad_definitor(meta_data_x: np.ndarray, meta_data_y: np.ndarray) -> int:
    """Derive a padding size from the spatial extent of the coordinates.

    The padding is the mean of the x and y coordinate ranges divided by 200,
    rounded with ``np.round``, and never smaller than 1.

    Parameters
    ----------
    meta_data_x : np.ndarray
        X coordinates of cell centroids.
    meta_data_y : np.ndarray
        Y coordinates of cell centroids.

    Returns
    -------
    int
        Padding size (minimum 1).
    """
    span_x = np.max(meta_data_x) - np.min(meta_data_x)
    span_y = np.max(meta_data_y) - np.min(meta_data_y)
    mean_span = np.mean([span_x, span_y])
    n_pad = int(np.round(mean_span / 200))
    # A rounded value of 0 is bumped to 1 so some padding is always applied.
    return n_pad if n_pad != 0 else 1
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Gene selection utilities: log2FC, shared gene selection."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from collections import Counter
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def calculate_log2fc(expression: np.ndarray, gene_idx: int, clustering: np.ndarray) -> dict:
    """Log2 fold change of one gene: each cluster versus all other clusters.

    For every cluster, the gene's mean inside the cluster is divided by the
    size-weighted mean of the other clusters' means.

    Parameters
    ----------
    expression : np.ndarray
        Expression matrix (cells x genes).
    gene_idx : int
        Column index of the gene.
    clustering : np.ndarray
        Cluster labels per cell.

    Returns
    -------
    dict
        {cluster_label: log2fc_value} for each cluster. The value is NaN when
        the weighted mean of the other clusters is not strictly positive
        (this includes the single-cluster case).
    """
    gene_values = expression[:, gene_idx]
    labels = np.unique(clustering)
    cluster_means = np.array([gene_values[clustering == label].mean() for label in labels])

    n_cells = len(clustering)
    cluster_sizes = Counter(clustering)
    fractions = {label: cluster_sizes[label] / n_cells for label in labels}

    result = {}
    for i, label in enumerate(labels):
        # Accumulate the contribution of every cluster except the current one.
        other_mean = 0.0
        other_weight = 0.0
        for j, other_label in enumerate(labels):
            if j == i:
                continue
            other_mean += fractions[other_label] * cluster_means[j]
            other_weight += fractions[other_label]
        if other_weight > 0:
            other_mean /= other_weight

        if other_mean > 0:
            result[label] = np.log2(cluster_means[i] / other_mean)
        else:
            result[label] = np.nan
    return result
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def select_genes(selected_objects: list, selected_names: list = None) -> list:
    """Return the sorted union of several gene collections.

    Parameters
    ----------
    selected_objects : list of list
        Each element is a list/set of gene names.
    selected_names : list of str, optional
        Names for each gene set. Accepted but not used by this function.

    Returns
    -------
    list
        Sorted list of the unique genes found in any of the sets.
    """
    # set().union(*iterables) merges everything in one call; no arguments
    # (empty input) yields an empty set.
    return sorted(set().union(*selected_objects))
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Cell-size normalization."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def c_normalisation(y: np.ndarray, scaling_factor: np.ndarray) -> np.ndarray:
    """Normalize expression by cell size: y / (area + 1/tau).

    ``tau`` is computed per gene as the mean over cells of ``y / area``.

    Parameters
    ----------
    y : np.ndarray
        Expression matrix (cells x genes).
    scaling_factor : np.ndarray
        Cell areas (length n_cells).

    Returns
    -------
    np.ndarray
        Normalized expression matrix.
    """
    # Areas as a column so they broadcast across the genes of each cell.
    area_column = scaling_factor.astype(float)[:, np.newaxis]
    tau = np.mean(y / area_column, axis=0)
    denominator = area_column + 1.0 / tau[np.newaxis, :]
    return y / denominator
|