pysimspec 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Zhisong He
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,97 @@
1
+ Metadata-Version: 2.4
2
+ Name: pysimspec
3
+ Version: 0.1.0
4
+ Summary: Python implementation of the simspec algorithm
5
+ Author-email: Zhisong He <zhisong.he@bsse.ethz.ch>
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: numpy
11
+ Requires-Dist: pandas
12
+ Requires-Dist: anndata
13
+ Requires-Dist: scanpy
14
+ Requires-Dist: scipy
15
+ Requires-Dist: scikit-learn
16
+ Requires-Dist: tqdm
17
+ Requires-Dist: rich
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=7.0; extra == "dev"
20
+ Dynamic: license-file
21
+
22
+ # pysimspec
23
+ Python implementation of simspec (for RSS/CSS). The paper detailing the method was published in [Genome Biology](https://link.springer.com/article/10.1186/s13059-020-02147-4) in 2020. The original R implementation is available at [GitHub](https://github.com/quadbio/simspec).
24
+
25
+ ## Installation
26
+
27
+ First, clone the codebase to your local environment
28
+ ```bash
29
+ git clone https://github.com/quadbio/pysimspec.git
30
+ ```
31
+
32
+ Next, install the package with `pip`
33
+ ```bash
34
+ cd pysimspec
35
+ pip install .
36
+ ```
37
+
38
+ Just to mention, this project uses [uv](https://github.com/astral-sh/uv) for fast Python package management.
39
+ ```bash
40
+ uv venv
41
+ uv pip install -e '.[dev]'
42
+ ```
43
+
44
+ ## Quick example
45
+
46
+ ```python
47
+ import scanpy as sc
48
+ import anndata
49
+ from pysimspec import Simspec, set_log_level, load
50
+
51
+ # Set up logging
52
+ set_log_level("INFO")
53
+
54
+ # Load and concatenate data
55
+ adata_DS1 = sc.read_h5ad('DS1_raw.h5ad')
56
+ adata_DS2 = sc.read_h5ad('DS2_raw.h5ad')
57
+ adata_DS1.obs['batch'] = 'DS1'
58
+ adata_DS2.obs['batch'] = 'DS2'
59
+ adata = anndata.concat([adata_DS1, adata_DS2], join='inner', keys=['DS1','DS2'], index_unique="_")
60
+
61
+ # Data preprocessing
62
+ adata.layers['counts'] = adata.X.copy()
63
+ sc.pp.normalize_total(adata, target_sum=1e4)
64
+ sc.pp.log1p(adata)
65
+ sc.pp.highly_variable_genes(adata, flavor='seurat_v3', layer='counts', n_top_genes=3000, batch_key='batch')
66
+ sc.pp.pca(adata, n_comps=20, mask_var='highly_variable')
67
+
68
+ # Run CSS
69
+ simspec = Simspec()
70
+ simspec.compute_references(adata, batch = 'batch', use_rep = 'X_pca')
71
+ simspec.compute_simspec(adata)
72
+ simspec.compute_PCA(n_pcs = 10)
73
+ adata.obsm['X_css'] = simspec.get_result()
74
+ adata.obsm['X_csspca'] = simspec.get_transformed_result()
75
+
76
+ # Use CSS representation for followup analysis
77
+ sc.pp.neighbors(adata, use_rep='X_css')
78
+ sc.tl.umap(adata)
79
+ sc.pl.umap(adata, color='batch')
80
+
81
+ # Save the Simspec object
82
+ simspec.save('simspec.pkl')
83
+
84
+ # Calculate projected CSS representation for the new data
85
+ adata_DS3 = sc.read_h5ad('DS3_raw.h5ad')
86
+ adata_DS3.layers['counts'] = adata_DS3.X.copy()
87
+ sc.pp.normalize_total(adata_DS3, target_sum=1e4)
88
+ sc.pp.log1p(adata_DS3)
89
+
90
+ simspec = load('simspec.pkl') # load the saved Simspec object
91
+ simspec.compute_simspec(adata_DS3)
92
+ adata_DS3.obsm['X_css_proj'] = simspec.get_result()
93
+ adata_DS3.obsm['X_csspca_proj'] = simspec.get_transformed_result()
94
+ ```
95
+
96
+ ## License
97
+ MIT
@@ -0,0 +1,76 @@
1
+ # pysimspec
2
+ Python implementation of simspec (for RSS/CSS). The paper detailing the method was published in [Genome Biology](https://link.springer.com/article/10.1186/s13059-020-02147-4) in 2020. The original R implementation is available at [GitHub](https://github.com/quadbio/simspec).
3
+
4
+ ## Installation
5
+
6
+ First, clone the codebase to your local environment
7
+ ```bash
8
+ git clone https://github.com/quadbio/pysimspec.git
9
+ ```
10
+
11
+ Next, install the package with `pip`
12
+ ```bash
13
+ cd pysimspec
14
+ pip install .
15
+ ```
16
+
17
+ Just to mention, this project uses [uv](https://github.com/astral-sh/uv) for fast Python package management.
18
+ ```bash
19
+ uv venv
20
+ uv pip install -e '.[dev]'
21
+ ```
22
+
23
+ ## Quick example
24
+
25
+ ```python
26
+ import scanpy as sc
27
+ import anndata
28
+ from pysimspec import Simspec, set_log_level, load
29
+
30
+ # Set up logging
31
+ set_log_level("INFO")
32
+
33
+ # Load and concatenate data
34
+ adata_DS1 = sc.read_h5ad('DS1_raw.h5ad')
35
+ adata_DS2 = sc.read_h5ad('DS2_raw.h5ad')
36
+ adata_DS1.obs['batch'] = 'DS1'
37
+ adata_DS2.obs['batch'] = 'DS2'
38
+ adata = anndata.concat([adata_DS1, adata_DS2], join='inner', keys=['DS1','DS2'], index_unique="_")
39
+
40
+ # Data preprocessing
41
+ adata.layers['counts'] = adata.X.copy()
42
+ sc.pp.normalize_total(adata, target_sum=1e4)
43
+ sc.pp.log1p(adata)
44
+ sc.pp.highly_variable_genes(adata, flavor='seurat_v3', layer='counts', n_top_genes=3000, batch_key='batch')
45
+ sc.pp.pca(adata, n_comps=20, mask_var='highly_variable')
46
+
47
+ # Run CSS
48
+ simspec = Simspec()
49
+ simspec.compute_references(adata, batch = 'batch', use_rep = 'X_pca')
50
+ simspec.compute_simspec(adata)
51
+ simspec.compute_PCA(n_pcs = 10)
52
+ adata.obsm['X_css'] = simspec.get_result()
53
+ adata.obsm['X_csspca'] = simspec.get_transformed_result()
54
+
55
+ # Use CSS representation for followup analysis
56
+ sc.pp.neighbors(adata, use_rep='X_css')
57
+ sc.tl.umap(adata)
58
+ sc.pl.umap(adata, color='batch')
59
+
60
+ # Save the Simspec object
61
+ simspec.save('simspec.pkl')
62
+
63
+ # Calculate projected CSS representation for the new data
64
+ adata_DS3 = sc.read_h5ad('DS3_raw.h5ad')
65
+ adata_DS3.layers['counts'] = adata_DS3.X.copy()
66
+ sc.pp.normalize_total(adata_DS3, target_sum=1e4)
67
+ sc.pp.log1p(adata_DS3)
68
+
69
+ simspec = load('simspec.pkl') # load the saved Simspec object
70
+ simspec.compute_simspec(adata_DS3)
71
+ adata_DS3.obsm['X_css_proj'] = simspec.get_result()
72
+ adata_DS3.obsm['X_csspca_proj'] = simspec.get_transformed_result()
73
+ ```
74
+
75
+ ## License
76
+ MIT
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "pysimspec"
7
+ version = "0.1.0"
8
+ description = "Python implementation of the simspec algorithm"
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = { text = "MIT" }
12
+ authors = [ { name = "Zhisong He", email = "zhisong.he@bsse.ethz.ch" } ]
13
+ dependencies = [
14
+ "numpy",
15
+ "pandas",
16
+ "anndata",
17
+ "scanpy",
18
+ "scipy",
19
+ "scikit-learn",
20
+ "tqdm",
21
+ "rich"
22
+ ]
23
+
24
+ [project.optional-dependencies]
25
+ dev = ["pytest>=7.0"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,8 @@
1
+ """pysimspec package for computing similarity spectra in single-cell data."""
2
+
3
+ __all__ = ["Simspec", "set_log_level", "load"]
4
+
5
+ __version__ = "0.1.0"
6
+
7
+ from .core import Simspec, load
8
+ from .logging import set_log_level
@@ -0,0 +1,232 @@
1
+ """Core module for pysimspec package.
2
+
3
+ This module contains the Simspec class for computing Cluster Similarity Spectra (CSS) and Reference Similarity Spectra (RSS)
4
+ from single-cell RNA-seq data. It provides methods to calculate reference profiles
5
+ from clustered data and compute similarity embeddings.
6
+ """
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import warnings
11
+ import anndata
12
+ import scanpy as sc
13
+ from sklearn.preprocessing import scale as scaledata
14
+ from sklearn.decomposition import PCA
15
+ from tqdm import tqdm
16
+
17
+ from .utils import corSparse, rankMatrix_nonzero, summarize_numeric_matrix
18
+ from .logging import logger
19
+
20
+
21
class Simspec:
    """Class to perform Cluster Similarity Spectrum (CSS) calculation for single-cell data analysis.

    This class computes similarity spectra by comparing single-cell data to reference profiles
    derived from clustered batches. It supports both Pearson and Spearman correlations,
    with optional scaling of results.
    """

    def __init__(
        self,
        ref: list | pd.DataFrame | anndata.AnnData | None = None,
        method: str = 'spearman',
        scale: bool = True
    ) -> None:
        """Initialize the Simspec object.

        Args:
            ref: Reference data for similarity calculation. Can be a list of AnnData objects,
                a pandas DataFrame (rows = reference profiles, columns = genes),
                a single AnnData object, or None.
            method: Correlation method to use. 'pearson' or 'spearman'. Default 'spearman'.
            scale: Whether to scale the correlation results per cell. Default True.
        """
        self.ref = ref
        self.method = method
        self.scale = scale

        # Normalize every accepted reference input into a list of AnnData objects.
        if isinstance(self.ref, pd.DataFrame):
            self.ref = anndata.AnnData(X=np.array(self.ref),
                                       var=pd.DataFrame(index=self.ref.columns),
                                       obs=pd.DataFrame(index=self.ref.index))
        if isinstance(self.ref, anndata.AnnData):
            # BUGFIX: the previous `list(self.ref)` iterated the AnnData via
            # __getitem__, yielding one single-row view per observation instead
            # of wrapping the whole object. Wrap it in a one-element list.
            self.ref = [self.ref]

        self.result = None     # cells x (total reference clusters) similarity matrix
        self.transform = None  # fitted sklearn PCA model, set by compute_PCA()

    def compute_references(
        self,
        adata: anndata.AnnData,
        batch: str = 'batch',
        use_rep: str = 'X_pca',
        layer: str | None = None,
        n_neighbors: int = 15,
        n_pcs: int | None = None,
        method_clustering: str = 'leiden',
        leiden_flavor: str = 'leidenalg',
        resolution_clustering: float = 1,
        highly_variable: bool = True
    ) -> None:
        """Calculate reference profiles by clustering cells within each batch.

        This method splits the data by batch, performs clustering on each batch,
        and computes average expression profiles for each cluster as references.

        Args:
            adata: AnnData object containing single-cell data.
            batch: Column name in adata.obs for batch information. Default 'batch'.
            use_rep: Representation to use for clustering. Default 'X_pca'.
            layer: Layer in adata to use for expression. If None (or absent), uses adata.X.
            n_neighbors: Number of neighbors for clustering. Default 15.
            n_pcs: Number of PCs to use. If None, uses all.
            method_clustering: Clustering method, 'louvain' or 'leiden'. Default 'leiden'.
            leiden_flavor: Backend flavor passed to scanpy.tl.leiden.
                Default 'leidenalg' to keep backward-compatible behavior.
            resolution_clustering: Resolution parameter for clustering. Default 1.
            highly_variable: Whether to subset to highly variable genes. Default True.

        Returns:
            None; stores the reference profiles in self.ref.
        """
        logger.info("Starting reference calculation")
        if highly_variable and 'highly_variable' in adata.var.columns:
            adata = adata[:, adata.var['highly_variable']]
            logger.info(f"Subset to {adata.shape[1]} highly variable genes")
        adata_batch = [adata[adata.obs[batch] == x, :].copy() for x in adata.obs[batch].unique()]
        logger.info(f"Split data into {len(adata_batch)} batches")

        for i, ad in enumerate(tqdm(adata_batch)):
            logger.debug(f"Clustering batch {i+1}/{len(adata_batch)}")
            sc.pp.neighbors(ad, use_rep=use_rep, n_neighbors=n_neighbors, n_pcs=n_pcs)
            if method_clustering == 'louvain':
                sc.tl.louvain(ad, resolution=resolution_clustering, key_added='cluster')
            if method_clustering == 'leiden':
                # Silence scanpy's backend-change FutureWarning; the flavor is
                # pinned explicitly below, so the warning is pure noise here.
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        message=r".*default backend for leiden will be igraph.*",
                        category=FutureWarning,
                    )
                    sc.tl.leiden(
                        ad,
                        resolution=resolution_clustering,
                        key_added='cluster',
                        flavor=leiden_flavor,
                    )

        avg_expr_cl = list()
        for i, ad in enumerate(tqdm(adata_batch)):
            logger.debug(f"Computing average expression for batch {i+1}")
            # Average expression per cluster; fall back to .X when the requested
            # layer is missing from this batch.
            avg_expr = summarize_numeric_matrix(
                ad.X if layer is None or layer not in ad.layers.keys() else ad.layers[layer],
                ad.obs['cluster']
            )
            avg_expr = anndata.AnnData(
                X=avg_expr,
                var=ad.var.copy(),
                obs=pd.DataFrame(index=ad.obs['cluster'].cat.categories)
            )
            avg_expr_cl.append(avg_expr)
        self.ref = avg_expr_cl
        logger.info(f"Calculated {len(self.ref)} reference profiles")

    def compute_simspec(
        self,
        adata,
        layer: str | None = None) -> None:
        """Compute the Cluster Similarity Spectrum (CSS) or Reference Similarity Spectrum (RSS) using the stored references.

        Calculates similarity between the input data and each reference profile,
        concatenates the results, and stores in self.result.

        Args:
            adata: AnnData object to compute similarities for.
            layer: Layer in adata to use. If None (or absent), uses adata.X.

        Raises:
            ValueError: If no references are available.

        Returns:
            None; results stored in self.result.
        """
        # Fail fast with a clear message instead of an opaque TypeError when
        # iterating over None below.
        if self.ref is None:
            raise ValueError(
                "No references available. Provide `ref` at construction or run compute_references() first."
            )
        logger.info("Starting similarity computation")
        sims = list()
        for i, ad_ref in enumerate(tqdm(self.ref)):
            logger.debug(f"Computing similarity for reference {i+1}/{len(self.ref)}")
            # Restrict both matrices to the genes shared with this reference.
            shared_genes = np.intersect1d(adata.var_names, ad_ref.var_names)
            X = adata[:, shared_genes].X.T if layer is None or layer not in adata.layers.keys() else adata[:,shared_genes].layers[layer].T
            refX = ad_ref[:, shared_genes].X.T

            if self.method == 'spearman':
                # Spearman correlation = Pearson correlation on (non-zero) ranks.
                X = rankMatrix_nonzero(X)
                refX = rankMatrix_nonzero(refX)
            corr = corSparse(X, refX)
            if self.scale:
                # z-scale each cell's correlation profile within this reference set.
                corr = scaledata(corr, axis=1)
            corr[np.isnan(corr)] = 0
            sims.append(corr)

        sims_concat = np.concatenate(sims, axis=1)
        self.result = sims_concat
        logger.info(f"Computed similarity spectrum with shape {self.result.shape}")

    def compute_PCA(self, n_pcs: int = 20, force_recompute: bool = False) -> None:
        """Run PCA on the resulted representation for further dimensionality reduction.

        Args:
            n_pcs: Number of principal components. Default 20.
            force_recompute: Whether to force recomputation if already computed. Default False.

        Raises:
            ValueError: If compute_simspec() has not been run yet.

        Returns:
            None; stores the PCA model in self.transform.
        """
        if self.result is None:
            raise ValueError("No similarity results found. Please run compute_simspec() first.")
        if self.transform is not None and not force_recompute:
            logger.info("PCA transformation already computed. Use force_recompute=True to recompute.")
            return
        model_pca = PCA(n_components=n_pcs)
        model_pca.fit(self.result)
        self.transform = model_pca

    def get_result(self) -> np.ndarray:
        """Get the computed similarity results.

        Returns:
            Numpy array of similarity results, or None if compute_simspec() has not run.
        """
        return self.result

    def get_transformed_result(self) -> np.ndarray:
        """Get the PCA transformed similarity results.

        Raises:
            ValueError: If compute_PCA() has not been run yet.

        Returns:
            Numpy array of PCA transformed similarity results.
        """
        if self.transform is None:
            raise ValueError("PCA transformation not computed. Please run compute_PCA() first.")
        return self.transform.transform(self.result)

    def save(self, filepath: str) -> None:
        """Save the Simspec object to a file via pickle.

        Args:
            filepath: Path to save the object.

        Returns:
            None.
        """
        import pickle
        with open(filepath, 'wb') as f:
            pickle.dump(self, f)
219
+
220
+
221
def load(filepath: str) -> Simspec:
    """Load a Simspec object from a file.

    Args:
        filepath: Path to the saved object.
    Returns:
        Loaded Simspec object.
    """
    import pickle
    # NOTE: unpickling executes arbitrary code; only load files you trust.
    with open(filepath, 'rb') as fh:
        return pickle.load(fh)
@@ -0,0 +1,53 @@
1
+ """Logging setup for the package."""
2
+
3
+ import logging
4
+ import os
5
+ from typing import Literal
6
+
7
+
8
def _setup_logger() -> logging.Logger:
    """Build the package logger with a rich handler attached."""
    from rich.console import Console
    from rich.logging import RichHandler

    log = logging.getLogger(__name__)
    # LOGLEVEL env var overrides the INFO default.
    log.setLevel(level=os.environ.get("LOGLEVEL", logging.INFO))

    console = Console(force_terminal=True)
    # Force terminal rendering even inside notebooks.
    if console.is_jupyter is True:
        console.is_jupyter = False

    show_time = log.level == logging.DEBUG
    log.addHandler(RichHandler(show_path=False, console=console, show_time=show_time))

    # this prevents double outputs
    log.propagate = False
    return log
24
+
25
+
26
def set_log_level(
    level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] | Literal[10, 20, 30, 40, 50],
) -> None:
    """Set the logging level for the pysimspec logger.

    Parameters
    ----------
    level
        Logging level. Can be a string ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL')
        or logging constants (logging.DEBUG=10, logging.INFO=20, logging.WARNING=30,
        logging.ERROR=40, logging.CRITICAL=50).

    Examples
    --------
    >>> import pysimspec.logging
    >>> pysimspec.logging.set_log_level("DEBUG")
    >>> pysimspec.logging.set_log_level(logging.INFO)
    """
    # Accept level names case-insensitively by mapping them to logging constants.
    if isinstance(level, str):
        level = getattr(logging, level.upper())

    logger.setLevel(level)
    # Update handlers to ensure they respect the new level
    for handler in logger.handlers:
        handler.setLevel(level)
51
# Module-level singleton logger, configured once at import time and shared
# by the whole package (core.py imports it from here).
logger = _setup_logger()
@@ -0,0 +1,160 @@
1
+ """Utility functions for simspec calculations.
2
+
3
+ This module contains helper functions for computing correlations, ranking matrices,
4
+ and summarizing data, optimized for sparse matrices and single-cell data analysis.
5
+ """
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from scipy import sparse
10
+ from scipy.stats import rankdata
11
+
12
+
13
def corSparse(X, Y=None):
    """Calculate correlation between columns of matrices X and Y.

    Supports both dense and sparse matrices. If Y is None, computes
    correlation of X with itself.

    Args:
        X: Input matrix (dense or sparse).
        Y: Second matrix. If None, uses X.

    Returns:
        Correlation matrix as numpy array.
    """
    if Y is None:
        Y = X

    n = X.shape[0]
    mean_x = np.ravel(X.mean(0))
    mean_y = np.ravel(Y.mean(0))

    # Sample covariance between every column of X and every column of Y.
    cov = (X.T.dot(Y) - (n * np.outer(mean_x, mean_y))) / (n - 1)

    def _col_sd(M, mu):
        # Column standard deviations without densifying sparse input.
        sq_sum = (M.power(2)).sum(0) if sparse.issparse(M) else (M ** 2).sum(0)
        return np.ravel(np.sqrt((sq_sum - n * (mu ** 2)) / (n - 1)))

    sd_x = _col_sd(X, mean_x)
    sd_y = _col_sd(Y, mean_y)

    # Normalize covariance by the outer product of column SDs.
    return np.array(cov / np.outer(sd_x, sd_y))
42
+
43
+
44
def rankMatrix(X):
    """Rank the values in each column of the matrix.

    For sparse matrices, ranks only non-zero values. For dense, ranks all values.

    Args:
        X: Input matrix (dense or sparse).

    Returns:
        Ranked matrix in the same format as input.
    """
    if not sparse.issparse(X):
        # Dense input: rank every value column-wise, averaging ties.
        return rankdata(X, method="average", axis=0)

    rows, cols, vals = sparse.find(X)
    entries = pd.DataFrame({'i': rows, 'j': cols, 'x': vals}).sort_values('j')
    # Start index of each column's run of non-zero entries (first column omitted).
    starts = np.unique(entries['j'].to_numpy(), return_index=True)[1][1:]
    per_col = np.split(entries['x'].to_numpy(), starts)
    total_nnz = entries.shape[0]
    # NOTE(review): the rank offset uses the TOTAL non-zero count across the
    # whole matrix (entries.shape[0]), not a per-column zero count -- confirm
    # this matches the reference R implementation.
    ranks = np.concatenate(
        [rankdata(col) + (total_nnz - len(col)) - (1 + (total_nnz - len(col))) / 2
         for col in per_col])
    return sparse.csr_matrix(
        (ranks, (entries['i'].to_numpy(), entries['j'].to_numpy())), shape=X.shape)
67
+
68
+
69
def rankMatrix_dense(X):
    """Rank the values in each column using dense ranking method.

    Dense ranking assigns the same rank to tied values without gaps.
    For sparse input only the non-zero values are ranked; zeros are preserved.

    Args:
        X: Input matrix (dense or sparse).

    Returns:
        Ranked matrix (sparse CSR for sparse input, ndarray for dense).
    """
    if sparse.issparse(X):
        idx_row, idx_col, dat = sparse.find(X)
        df = pd.DataFrame({'i': idx_row, 'j': idx_col, 'x': dat}).sort_values('j')
        # Start index of each column's run of non-zero entries.
        split_idx = np.unique(df['j'].to_numpy(), return_index=True)[1][1:]
        df['r'] = np.concatenate(
            [rankdata(x, method="dense") for x in np.split(df['x'].to_numpy(), split_idx)])
        ranked = sparse.csr_matrix((df['r'].to_numpy(), (df['i'].to_numpy(), df['j'].to_numpy())), shape=X.shape)
    else:
        # BUGFIX: the dense-input branch previously used method="average",
        # contradicting both the function's documented purpose and the sparse
        # branch above (an apparent copy-paste from rankMatrix).
        ranked = rankdata(X, method="dense", axis=0)

    return ranked
91
+
92
+
93
def rankMatrix_nonzero(X):
    """Rank only the non-zero values in each column.

    Zero values remain zero. Useful for sparse data.

    Args:
        X: Input matrix (dense or sparse).

    Returns:
        Ranked matrix with zeros preserved.
    """
    if not sparse.issparse(X):
        # Dense input: rank every value column-wise, averaging ties.
        return rankdata(X, method="average", axis=0)

    rows, cols, vals = sparse.find(X)
    entries = pd.DataFrame({'i': rows, 'j': cols, 'x': vals}).sort_values('j')
    # Start index of each column's run of non-zero entries (first column omitted).
    starts = np.unique(entries['j'].to_numpy(), return_index=True)[1][1:]
    # Rank each column's non-zero values independently (average ties).
    ranks = np.concatenate(
        [rankdata(col) for col in np.split(entries['x'].to_numpy(), starts)])
    return sparse.csr_matrix(
        (ranks, (entries['i'].to_numpy(), entries['j'].to_numpy())), shape=X.shape)
115
+
116
+
117
def group_vec_to_ident_mat(group, norm=True):
    """Convert a group vector to an identity matrix for aggregation.

    Creates a sparse matrix where each row corresponds to an item,
    and columns to groups. Values are 1 for membership.

    Args:
        group: Pandas Series or array with group labels (castable to int).
        norm: If True, normalize each column by group size.

    Returns:
        Sparse CSR matrix of shape (len(group), number of distinct groups).
    """
    # Drop missing labels; keep the positional indices of valid entries.
    if isinstance(group, pd.Series):
        keep = np.where(group.notnull().to_numpy())[0]
        labels = group.iloc[keep].astype(int).to_numpy()
    else:
        arr = np.asarray(group)
        keep = np.where(~pd.isnull(arr))[0]
        labels = arr[keep].astype(int)

    ones = np.repeat(1, len(keep))
    mat_ident = sparse.csr_matrix((ones, (keep, labels)),
                                  shape=(len(group), len(np.unique(labels))))
    if norm:
        # Divide each column by its group size so aggregation yields means.
        group_sizes = np.ravel(mat_ident.sum(axis=0))
        mat_ident = mat_ident @ sparse.diags(1 / group_sizes)

    return mat_ident
143
+
144
+
145
def summarize_numeric_matrix(mat, group, use_mean=True):
    """Summarize a numeric matrix by grouping rows.

    Aggregates the matrix by groups, optionally computing mean per group.

    Args:
        mat: Numeric matrix (cells x features).
        group: Group labels for each row.
        use_mean: If True, compute mean; else sum.

    Returns:
        Summarized matrix (groups x features).
    """
    # Build the (cells x groups) indicator; normalizing it turns the
    # matrix product below into a per-group mean instead of a sum.
    indicator = group_vec_to_ident_mat(group, norm=use_mean)
    return indicator.transpose() @ mat
@@ -0,0 +1,97 @@
1
+ Metadata-Version: 2.4
2
+ Name: pysimspec
3
+ Version: 0.1.0
4
+ Summary: Python implementation of the simspec algorithm
5
+ Author-email: Zhisong He <zhisong.he@bsse.ethz.ch>
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: numpy
11
+ Requires-Dist: pandas
12
+ Requires-Dist: anndata
13
+ Requires-Dist: scanpy
14
+ Requires-Dist: scipy
15
+ Requires-Dist: scikit-learn
16
+ Requires-Dist: tqdm
17
+ Requires-Dist: rich
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=7.0; extra == "dev"
20
+ Dynamic: license-file
21
+
22
+ # pysimspec
23
+ Python implementation of simspec (for RSS/CSS). The paper detailing the method was published in [Genome Biology](https://link.springer.com/article/10.1186/s13059-020-02147-4) in 2020. The original R implementation is available at [GitHub](https://github.com/quadbio/simspec).
24
+
25
+ ## Installation
26
+
27
+ First, clone the codebase to your local environment
28
+ ```bash
29
+ git clone https://github.com/quadbio/pysimspec.git
30
+ ```
31
+
32
+ Next, install the package with `pip`
33
+ ```bash
34
+ cd pysimspec
35
+ pip install .
36
+ ```
37
+
38
+ Just to mention, this project uses [uv](https://github.com/astral-sh/uv) for fast Python package management.
39
+ ```bash
40
+ uv venv
41
+ uv pip install -e '.[dev]'
42
+ ```
43
+
44
+ ## Quick example
45
+
46
+ ```python
47
+ import scanpy as sc
48
+ import anndata
49
+ from pysimspec import Simspec, set_log_level, load
50
+
51
+ # Set up logging
52
+ set_log_level("INFO")
53
+
54
+ # Load and concatenate data
55
+ adata_DS1 = sc.read_h5ad('DS1_raw.h5ad')
56
+ adata_DS2 = sc.read_h5ad('DS2_raw.h5ad')
57
+ adata_DS1.obs['batch'] = 'DS1'
58
+ adata_DS2.obs['batch'] = 'DS2'
59
+ adata = anndata.concat([adata_DS1, adata_DS2], join='inner', keys=['DS1','DS2'], index_unique="_")
60
+
61
+ # Data preprocessing
62
+ adata.layers['counts'] = adata.X.copy()
63
+ sc.pp.normalize_total(adata, target_sum=1e4)
64
+ sc.pp.log1p(adata)
65
+ sc.pp.highly_variable_genes(adata, flavor='seurat_v3', layer='counts', n_top_genes=3000, batch_key='batch')
66
+ sc.pp.pca(adata, n_comps=20, mask_var='highly_variable')
67
+
68
+ # Run CSS
69
+ simspec = Simspec()
70
+ simspec.compute_references(adata, batch = 'batch', use_rep = 'X_pca')
71
+ simspec.compute_simspec(adata)
72
+ simspec.compute_PCA(n_pcs = 10)
73
+ adata.obsm['X_css'] = simspec.get_result()
74
+ adata.obsm['X_csspca'] = simspec.get_transformed_result()
75
+
76
+ # Use CSS representation for followup analysis
77
+ sc.pp.neighbors(adata, use_rep='X_css')
78
+ sc.tl.umap(adata)
79
+ sc.pl.umap(adata, color='batch')
80
+
81
+ # Save the Simspec object
82
+ simspec.save('simspec.pkl')
83
+
84
+ # Calculate projected CSS representation for the new data
85
+ adata_DS3 = sc.read_h5ad('DS3_raw.h5ad')
86
+ adata_DS3.layers['counts'] = adata_DS3.X.copy()
87
+ sc.pp.normalize_total(adata_DS3, target_sum=1e4)
88
+ sc.pp.log1p(adata_DS3)
89
+
90
+ simspec = load('simspec.pkl') # load the saved Simspec object
91
+ simspec.compute_simspec(adata_DS3)
92
+ adata_DS3.obsm['X_css_proj'] = simspec.get_result()
93
+ adata_DS3.obsm['X_csspca_proj'] = simspec.get_transformed_result()
94
+ ```
95
+
96
+ ## License
97
+ MIT
@@ -0,0 +1,13 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/pysimspec/__init__.py
5
+ src/pysimspec/core.py
6
+ src/pysimspec/logging.py
7
+ src/pysimspec/utils.py
8
+ src/pysimspec.egg-info/PKG-INFO
9
+ src/pysimspec.egg-info/SOURCES.txt
10
+ src/pysimspec.egg-info/dependency_links.txt
11
+ src/pysimspec.egg-info/requires.txt
12
+ src/pysimspec.egg-info/top_level.txt
13
+ tests/test_core.py
@@ -0,0 +1,11 @@
1
+ numpy
2
+ pandas
3
+ anndata
4
+ scanpy
5
+ scipy
6
+ scikit-learn
7
+ tqdm
8
+ rich
9
+
10
+ [dev]
11
+ pytest>=7.0
@@ -0,0 +1 @@
1
+ pysimspec
@@ -0,0 +1,67 @@
1
+ from pysimspec.core import Simspec
2
+ import pysimspec.core as core_module
3
+ import warnings
4
+
5
+ import numpy as np
6
+ from scipy import sparse
7
+
8
+ from pysimspec.utils import rankMatrix_nonzero
9
+
10
+
11
def test_simspec_class():
    """Default construction uses Spearman correlation with scaling enabled."""
    alg = Simspec()
    assert alg.method == 'spearman'
    assert alg.scale
15
+
16
+
17
def test_rankmatrix_nonzero_no_series_swapaxes_warning():
    """rankMatrix_nonzero must not hit the deprecated Series.swapaxes code path."""
    mat = sparse.csr_matrix(np.array([[1.0, 0.0, 3.0],
                                      [0.0, 2.0, 0.0],
                                      [4.0, 0.0, 5.0]]))

    # Capture every FutureWarning raised during the call.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", FutureWarning)
        rankMatrix_nonzero(mat)

    future_msgs = [str(w.message) for w in caught if issubclass(w.category, FutureWarning)]
    assert all("Series.swapaxes" not in msg for msg in future_msgs)
28
+
29
+
30
def test_compute_references_leiden_default_flavor(monkeypatch):
    """compute_references must call scanpy's leiden with flavor='leidenalg' by default."""
    import anndata
    import pandas as pd

    # Two tiny batches of two cells each; the values themselves are arbitrary.
    X = np.array([
        [1.0, 0.0, 2.0],
        [0.5, 1.0, 1.5],
        [2.0, 0.5, 0.0],
        [1.5, 1.0, 0.5],
    ])
    obs = pd.DataFrame({'batch': ['b1', 'b1', 'b2', 'b2']})
    var = pd.DataFrame(index=['g1', 'g2', 'g3'])
    adata = anndata.AnnData(X=X, obs=obs, var=var)
    adata.obsm['X_pca'] = X.copy()

    calls = []

    # Stub out the expensive scanpy calls; record the kwargs leiden receives.
    def fake_neighbors(*args, **kwargs):
        return None

    def fake_leiden(ad, **kwargs):
        calls.append(kwargs)
        # Assign every cell to one cluster so the averaging step still works.
        ad.obs['cluster'] = pd.Categorical(['0'] * ad.n_obs)

    monkeypatch.setattr(core_module.sc.pp, 'neighbors', fake_neighbors)
    monkeypatch.setattr(core_module.sc.tl, 'leiden', fake_leiden)

    alg = Simspec()
    alg.compute_references(
        adata,
        batch='batch',
        use_rep='X_pca',
        method_clustering='leiden',
        highly_variable=False,
    )

    # leiden is invoked once per batch, always with the backward-compatible flavor.
    assert len(calls) == 2
    assert all(call.get('flavor') == 'leidenalg' for call in calls)