pysimspec 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Zhisong He
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,97 @@
1
+ Metadata-Version: 2.4
2
+ Name: pysimspec
3
+ Version: 0.1.0
4
+ Summary: Python implementation of the simspec algorithm
5
+ Author-email: Zhisong He <zhisong.he@bsse.ethz.ch>
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: numpy
11
+ Requires-Dist: pandas
12
+ Requires-Dist: anndata
13
+ Requires-Dist: scanpy
14
+ Requires-Dist: scipy
15
+ Requires-Dist: scikit-learn
16
+ Requires-Dist: tqdm
17
+ Requires-Dist: rich
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=7.0; extra == "dev"
20
+ Dynamic: license-file
21
+
22
+ # pysimspec
23
+ Python implementation of simspec (for RSS/CSS). The paper detailing the method was published in [Genome Biology](https://link.springer.com/article/10.1186/s13059-020-02147-4) in 2020. The original R implementation is available at [GitHub](https://github.com/quadbio/simspec).
24
+
25
+ ## Installation
26
+
27
+ First, clone the codebase to your local environment
28
+ ```bash
29
+ git clone https://github.com/quadbio/pysimspec.git
30
+ ```
31
+
32
+ Next, install the package with `pip`
33
+ ```bash
34
+ cd pysimspec
35
+ pip install .
36
+ ```
37
+
38
+ Just to mention, this project uses [uv](https://github.com/astral-sh/uv) for fast Python package management.
39
+ ```bash
40
+ uv venv
41
+ uv pip install -e '.[dev]'
42
+ ```
43
+
44
+ ## Quick example
45
+
46
+ ```python
47
+ import scanpy as sc
48
+ import anndata
49
+ from pysimspec import Simspec, set_log_level, load
50
+
51
+ # Set up logging
52
+ set_log_level("INFO")
53
+
54
+ # Load and concatenate data
55
+ adata_DS1 = sc.read_h5ad('DS1_raw.h5ad')
56
+ adata_DS2 = sc.read_h5ad('DS2_raw.h5ad')
57
+ adata_DS1.obs['batch'] = 'DS1'
58
+ adata_DS2.obs['batch'] = 'DS2'
59
+ adata = anndata.concat([adata_DS1, adata_DS2], join='inner', keys=['DS1','DS2'], index_unique="_")
60
+
61
+ # Data preprocessing
62
+ adata.layers['counts'] = adata.X.copy()
63
+ sc.pp.normalize_total(adata, target_sum=1e4)
64
+ sc.pp.log1p(adata)
65
+ sc.pp.highly_variable_genes(adata, flavor='seurat_v3', layer='counts', n_top_genes=3000, batch_key='batch')
66
+ sc.pp.pca(adata, n_comps=20, mask_var='highly_variable')
67
+
68
+ # Run CSS
69
+ simspec = Simspec()
70
+ simspec.compute_references(adata, batch = 'batch', use_rep = 'X_pca')
71
+ simspec.compute_simspec(adata)
72
+ simspec.compute_PCA(n_pcs = 10)
73
+ adata.obsm['X_css'] = simspec.get_result()
74
+ adata.obsm['X_csspca'] = simspec.get_transformed_result()
75
+
76
+ # Use CSS representation for followup analysis
77
+ sc.pp.neighbors(adata, use_rep='X_css')
78
+ sc.tl.umap(adata)
79
+ sc.pl.umap(adata, color='batch')
80
+
81
+ # Save the Simspec object
82
+ simspec.save('simspec.pkl')
83
+
84
+ # Calculate projected CSS representation for the new data
85
+ adata_DS3 = sc.read_h5ad('DS3_raw.h5ad')
86
+ adata_DS3.layers['counts'] = adata_DS3.X.copy()
87
+ sc.pp.normalize_total(adata_DS3, target_sum=1e4)
88
+ sc.pp.log1p(adata_DS3)
89
+
90
+ simspec = load('simspec.pkl') # load the saved Simspec object
91
+ simspec.compute_simspec(adata_DS3)
92
+ adata_DS3.obsm['X_css_proj'] = simspec.get_result()
93
+ adata_DS3.obsm['X_csspca_proj'] = simspec.get_transformed_result()
94
+ ```
95
+
96
+ ## License
97
+ MIT
@@ -0,0 +1,76 @@
1
+ # pysimspec
2
+ Python implementation of simspec (for RSS/CSS). The paper detailing the method was published in [Genome Biology](https://link.springer.com/article/10.1186/s13059-020-02147-4) in 2020. The original R implementation is available at [GitHub](https://github.com/quadbio/simspec).
3
+
4
+ ## Installation
5
+
6
+ First, clone the codebase to your local environment
7
+ ```bash
8
+ git clone https://github.com/quadbio/pysimspec.git
9
+ ```
10
+
11
+ Next, install the package with `pip`
12
+ ```bash
13
+ cd pysimspec
14
+ pip install .
15
+ ```
16
+
17
+ Just to mention, this project uses [uv](https://github.com/astral-sh/uv) for fast Python package management.
18
+ ```bash
19
+ uv venv
20
+ uv pip install -e '.[dev]'
21
+ ```
22
+
23
+ ## Quick example
24
+
25
+ ```python
26
+ import scanpy as sc
27
+ import anndata
28
+ from pysimspec import Simspec, set_log_level, load
29
+
30
+ # Set up logging
31
+ set_log_level("INFO")
32
+
33
+ # Load and concatenate data
34
+ adata_DS1 = sc.read_h5ad('DS1_raw.h5ad')
35
+ adata_DS2 = sc.read_h5ad('DS2_raw.h5ad')
36
+ adata_DS1.obs['batch'] = 'DS1'
37
+ adata_DS2.obs['batch'] = 'DS2'
38
+ adata = anndata.concat([adata_DS1, adata_DS2], join='inner', keys=['DS1','DS2'], index_unique="_")
39
+
40
+ # Data preprocessing
41
+ adata.layers['counts'] = adata.X.copy()
42
+ sc.pp.normalize_total(adata, target_sum=1e4)
43
+ sc.pp.log1p(adata)
44
+ sc.pp.highly_variable_genes(adata, flavor='seurat_v3', layer='counts', n_top_genes=3000, batch_key='batch')
45
+ sc.pp.pca(adata, n_comps=20, mask_var='highly_variable')
46
+
47
+ # Run CSS
48
+ simspec = Simspec()
49
+ simspec.compute_references(adata, batch = 'batch', use_rep = 'X_pca')
50
+ simspec.compute_simspec(adata)
51
+ simspec.compute_PCA(n_pcs = 10)
52
+ adata.obsm['X_css'] = simspec.get_result()
53
+ adata.obsm['X_csspca'] = simspec.get_transformed_result()
54
+
55
+ # Use CSS representation for followup analysis
56
+ sc.pp.neighbors(adata, use_rep='X_css')
57
+ sc.tl.umap(adata)
58
+ sc.pl.umap(adata, color='batch')
59
+
60
+ # Save the Simspec object
61
+ simspec.save('simspec.pkl')
62
+
63
+ # Calculate projected CSS representation for the new data
64
+ adata_DS3 = sc.read_h5ad('DS3_raw.h5ad')
65
+ adata_DS3.layers['counts'] = adata_DS3.X.copy()
66
+ sc.pp.normalize_total(adata_DS3, target_sum=1e4)
67
+ sc.pp.log1p(adata_DS3)
68
+
69
+ simspec = load('simspec.pkl') # load the saved Simspec object
70
+ simspec.compute_simspec(adata_DS3)
71
+ adata_DS3.obsm['X_css_proj'] = simspec.get_result()
72
+ adata_DS3.obsm['X_csspca_proj'] = simspec.get_transformed_result()
73
+ ```
74
+
75
+ ## License
76
+ MIT
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "pysimspec"
7
+ version = "0.1.0"
8
+ description = "Python implementation of the simspec algorithm"
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = { text = "MIT" }
12
+ authors = [ { name = "Zhisong He", email = "zhisong.he@bsse.ethz.ch" } ]
13
+ dependencies = [
14
+ "numpy",
15
+ "pandas",
16
+ "anndata",
17
+ "scanpy",
18
+ "scipy",
19
+ "scikit-learn",
20
+ "tqdm",
21
+ "rich"
22
+ ]
23
+
24
+ [project.optional-dependencies]
25
+ dev = ["pytest>=7.0"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,8 @@
1
+ """pysimspec package for computing similarity spectra in single-cell data."""
2
+
3
+ __all__ = ["Simspec", "set_log_level", "load"]
4
+
5
+ __version__ = "0.1.0"
6
+
7
+ from .core import Simspec, load
8
+ from .logging import set_log_level
@@ -0,0 +1,232 @@
1
+ """Core module for pysimspec package.
2
+
3
+ This module contains the Simspec class for computing Cluster Similarity Spectra (CSS) and Reference Similarity Spectra (RSS)
4
+ from single-cell RNA-seq data. It provides methods to calculate reference profiles
5
+ from clustered data and compute similarity embeddings.
6
+ """
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import warnings
11
+ import anndata
12
+ import scanpy as sc
13
+ from sklearn.preprocessing import scale as scaledata
14
+ from sklearn.decomposition import PCA
15
+ from tqdm import tqdm
16
+
17
+ from .utils import corSparse, rankMatrix_nonzero, summarize_numeric_matrix
18
+ from .logging import logger
19
+
20
+
21
class Simspec:
    """Class to perform Cluster Similarity Spectrum (CSS) calculation for single-cell data analysis.

    This class computes similarity spectra by comparing single-cell data to reference profiles
    derived from clustered batches. It supports both Pearson and Spearman correlations,
    with optional scaling of results.
    """

    def __init__(
        self,
        ref: list | pd.DataFrame | anndata.AnnData | None = None,
        method: str = 'spearman',
        scale: bool = True
    ) -> None:
        """Initialize the Simspec object.

        Args:
            ref: Reference data for similarity calculation. Can be a list of AnnData objects,
                a pandas DataFrame (rows = reference profiles, columns = genes),
                a single AnnData object, or None.
            method: Correlation method to use. 'pearson' or 'spearman'. Default 'spearman'.
            scale: Whether to scale the correlation results per cell. Default True.
        """
        self.ref = ref
        self.method = method
        self.scale = scale

        # Normalize every accepted reference input into a list of AnnData objects.
        if isinstance(self.ref, pd.DataFrame):
            self.ref = anndata.AnnData(X=np.array(self.ref),
                                       var=pd.DataFrame(index=self.ref.columns),
                                       obs=pd.DataFrame(index=self.ref.index))
        if isinstance(self.ref, anndata.AnnData):
            # BUGFIX: the previous `list(self.ref)` iterated the AnnData via
            # __getitem__, yielding one single-row view per observation instead
            # of wrapping the whole object. Wrap it in a one-element list.
            self.ref = [self.ref]

        self.result = None     # cells x (total reference clusters) similarity matrix
        self.transform = None  # fitted sklearn PCA model, set by compute_PCA()

    def compute_references(
        self,
        adata: anndata.AnnData,
        batch: str = 'batch',
        use_rep: str = 'X_pca',
        layer: str | None = None,
        n_neighbors: int = 15,
        n_pcs: int | None = None,
        method_clustering: str = 'leiden',
        leiden_flavor: str = 'leidenalg',
        resolution_clustering: float = 1,
        highly_variable: bool = True
    ) -> None:
        """Calculate reference profiles by clustering cells within each batch.

        This method splits the data by batch, performs clustering on each batch,
        and computes average expression profiles for each cluster as references.

        Args:
            adata: AnnData object containing single-cell data.
            batch: Column name in adata.obs for batch information. Default 'batch'.
            use_rep: Representation to use for clustering. Default 'X_pca'.
            layer: Layer in adata to use for expression. If None (or absent), uses adata.X.
            n_neighbors: Number of neighbors for clustering. Default 15.
            n_pcs: Number of PCs to use. If None, uses all.
            method_clustering: Clustering method, 'louvain' or 'leiden'. Default 'leiden'.
            leiden_flavor: Backend flavor passed to scanpy.tl.leiden.
                Default 'leidenalg' to keep backward-compatible behavior.
            resolution_clustering: Resolution parameter for clustering. Default 1.
            highly_variable: Whether to subset to highly variable genes. Default True.

        Returns:
            None; stores the reference profiles in self.ref.
        """
        logger.info("Starting reference calculation")
        if highly_variable and 'highly_variable' in adata.var.columns:
            adata = adata[:, adata.var['highly_variable']]
            logger.info(f"Subset to {adata.shape[1]} highly variable genes")
        adata_batch = [adata[adata.obs[batch] == x, :].copy() for x in adata.obs[batch].unique()]
        logger.info(f"Split data into {len(adata_batch)} batches")

        for i, ad in enumerate(tqdm(adata_batch)):
            logger.debug(f"Clustering batch {i+1}/{len(adata_batch)}")
            sc.pp.neighbors(ad, use_rep=use_rep, n_neighbors=n_neighbors, n_pcs=n_pcs)
            if method_clustering == 'louvain':
                sc.tl.louvain(ad, resolution=resolution_clustering, key_added='cluster')
            if method_clustering == 'leiden':
                # Silence scanpy's backend-change FutureWarning; the flavor is
                # pinned explicitly below, so the warning is pure noise here.
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        message=r".*default backend for leiden will be igraph.*",
                        category=FutureWarning,
                    )
                    sc.tl.leiden(
                        ad,
                        resolution=resolution_clustering,
                        key_added='cluster',
                        flavor=leiden_flavor,
                    )

        avg_expr_cl = list()
        for i, ad in enumerate(tqdm(adata_batch)):
            logger.debug(f"Computing average expression for batch {i+1}")
            # Average expression per cluster; fall back to .X when the requested
            # layer is missing from this batch.
            avg_expr = summarize_numeric_matrix(
                ad.X if layer is None or layer not in ad.layers.keys() else ad.layers[layer],
                ad.obs['cluster']
            )
            avg_expr = anndata.AnnData(
                X=avg_expr,
                var=ad.var.copy(),
                obs=pd.DataFrame(index=ad.obs['cluster'].cat.categories)
            )
            avg_expr_cl.append(avg_expr)
        self.ref = avg_expr_cl
        logger.info(f"Calculated {len(self.ref)} reference profiles")

    def compute_simspec(
        self,
        adata,
        layer: str | None = None) -> None:
        """Compute the Cluster Similarity Spectrum (CSS) or Reference Similarity Spectrum (RSS) using the stored references.

        Calculates similarity between the input data and each reference profile,
        concatenates the results, and stores in self.result.

        Args:
            adata: AnnData object to compute similarities for.
            layer: Layer in adata to use. If None (or absent), uses adata.X.

        Raises:
            ValueError: If no references are available.

        Returns:
            None; results stored in self.result.
        """
        # Fail fast with a clear message instead of an opaque TypeError when
        # iterating over None below.
        if self.ref is None:
            raise ValueError(
                "No references available. Provide `ref` at construction or run compute_references() first."
            )
        logger.info("Starting similarity computation")
        sims = list()
        for i, ad_ref in enumerate(tqdm(self.ref)):
            logger.debug(f"Computing similarity for reference {i+1}/{len(self.ref)}")
            # Restrict both matrices to the genes shared with this reference.
            shared_genes = np.intersect1d(adata.var_names, ad_ref.var_names)
            X = adata[:, shared_genes].X.T if layer is None or layer not in adata.layers.keys() else adata[:,shared_genes].layers[layer].T
            refX = ad_ref[:, shared_genes].X.T

            if self.method == 'spearman':
                # Spearman correlation = Pearson correlation on (non-zero) ranks.
                X = rankMatrix_nonzero(X)
                refX = rankMatrix_nonzero(refX)
            corr = corSparse(X, refX)
            if self.scale:
                # z-scale each cell's correlation profile within this reference set.
                corr = scaledata(corr, axis=1)
            corr[np.isnan(corr)] = 0
            sims.append(corr)

        sims_concat = np.concatenate(sims, axis=1)
        self.result = sims_concat
        logger.info(f"Computed similarity spectrum with shape {self.result.shape}")

    def compute_PCA(self, n_pcs: int = 20, force_recompute: bool = False) -> None:
        """Run PCA on the resulted representation for further dimensionality reduction.

        Args:
            n_pcs: Number of principal components. Default 20.
            force_recompute: Whether to force recomputation if already computed. Default False.

        Raises:
            ValueError: If compute_simspec() has not been run yet.

        Returns:
            None; stores the PCA model in self.transform.
        """
        if self.result is None:
            raise ValueError("No similarity results found. Please run compute_simspec() first.")
        if self.transform is not None and not force_recompute:
            logger.info("PCA transformation already computed. Use force_recompute=True to recompute.")
            return
        model_pca = PCA(n_components=n_pcs)
        model_pca.fit(self.result)
        self.transform = model_pca

    def get_result(self) -> np.ndarray:
        """Get the computed similarity results.

        Returns:
            Numpy array of similarity results, or None if compute_simspec() has not run.
        """
        return self.result

    def get_transformed_result(self) -> np.ndarray:
        """Get the PCA transformed similarity results.

        Raises:
            ValueError: If compute_PCA() has not been run yet.

        Returns:
            Numpy array of PCA transformed similarity results.
        """
        if self.transform is None:
            raise ValueError("PCA transformation not computed. Please run compute_PCA() first.")
        return self.transform.transform(self.result)

    def save(self, filepath: str) -> None:
        """Save the Simspec object to a file via pickle.

        Args:
            filepath: Path to save the object.

        Returns:
            None.
        """
        import pickle
        with open(filepath, 'wb') as f:
            pickle.dump(self, f)
219
+
220
+
221
def load(filepath: str) -> Simspec:
    """Load a Simspec object from a file.

    Args:
        filepath: Path to the saved object.
    Returns:
        Loaded Simspec object.
    """
    import pickle
    # NOTE: unpickling executes arbitrary code; only load files you trust.
    with open(filepath, 'rb') as fh:
        return pickle.load(fh)
@@ -0,0 +1,53 @@
1
+ """Logging setup for the package."""
2
+
3
+ import logging
4
+ import os
5
+ from typing import Literal
6
+
7
+
8
def _setup_logger() -> logging.Logger:
    """Build the package logger with a rich handler attached."""
    from rich.console import Console
    from rich.logging import RichHandler

    log = logging.getLogger(__name__)
    # LOGLEVEL env var overrides the INFO default.
    log.setLevel(level=os.environ.get("LOGLEVEL", logging.INFO))

    console = Console(force_terminal=True)
    # Force terminal rendering even inside notebooks.
    if console.is_jupyter is True:
        console.is_jupyter = False

    show_time = log.level == logging.DEBUG
    log.addHandler(RichHandler(show_path=False, console=console, show_time=show_time))

    # this prevents double outputs
    log.propagate = False
    return log
24
+
25
+
26
def set_log_level(
    level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] | Literal[10, 20, 30, 40, 50],
) -> None:
    """Set the logging level for the pysimspec logger.

    Parameters
    ----------
    level
        Logging level. Can be a string ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL')
        or logging constants (logging.DEBUG=10, logging.INFO=20, logging.WARNING=30,
        logging.ERROR=40, logging.CRITICAL=50).

    Examples
    --------
    >>> import pysimspec.logging
    >>> pysimspec.logging.set_log_level("DEBUG")
    >>> pysimspec.logging.set_log_level(logging.INFO)
    """
    # Accept level names case-insensitively by mapping them to logging constants.
    if isinstance(level, str):
        level = getattr(logging, level.upper())

    logger.setLevel(level)
    # Update handlers to ensure they respect the new level
    for handler in logger.handlers:
        handler.setLevel(level)
51
# Module-level singleton logger, configured once at import time and shared
# by the whole package (core.py imports it from here).
logger = _setup_logger()
@@ -0,0 +1,160 @@
1
+ """Utility functions for simspec calculations.
2
+
3
+ This module contains helper functions for computing correlations, ranking matrices,
4
+ and summarizing data, optimized for sparse matrices and single-cell data analysis.
5
+ """
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from scipy import sparse
10
+ from scipy.stats import rankdata
11
+
12
+
13
def corSparse(X, Y=None):
    """Calculate correlation between columns of matrices X and Y.

    Supports both dense and sparse matrices. If Y is None, computes
    correlation of X with itself.

    Args:
        X: Input matrix (dense or sparse).
        Y: Second matrix. If None, uses X.

    Returns:
        Correlation matrix as numpy array.
    """
    if Y is None:
        Y = X

    n = X.shape[0]
    mean_x = np.ravel(X.mean(0))
    mean_y = np.ravel(Y.mean(0))

    # Sample covariance between every column of X and every column of Y.
    cov = (X.T.dot(Y) - (n * np.outer(mean_x, mean_y))) / (n - 1)

    def _col_sd(M, mu):
        # Column standard deviations without densifying sparse input.
        sq_sum = (M.power(2)).sum(0) if sparse.issparse(M) else (M ** 2).sum(0)
        return np.ravel(np.sqrt((sq_sum - n * (mu ** 2)) / (n - 1)))

    sd_x = _col_sd(X, mean_x)
    sd_y = _col_sd(Y, mean_y)

    # Normalize covariance by the outer product of column SDs.
    return np.array(cov / np.outer(sd_x, sd_y))
42
+
43
+
44
def rankMatrix(X):
    """Rank the values in each column of the matrix.

    For sparse matrices, ranks only non-zero values. For dense, ranks all values.

    Args:
        X: Input matrix (dense or sparse).

    Returns:
        Ranked matrix in the same format as input.
    """
    if not sparse.issparse(X):
        # Dense input: rank every value column-wise, averaging ties.
        return rankdata(X, method="average", axis=0)

    rows, cols, vals = sparse.find(X)
    entries = pd.DataFrame({'i': rows, 'j': cols, 'x': vals}).sort_values('j')
    # Start index of each column's run of non-zero entries (first column omitted).
    starts = np.unique(entries['j'].to_numpy(), return_index=True)[1][1:]
    per_col = np.split(entries['x'].to_numpy(), starts)
    total_nnz = entries.shape[0]
    # NOTE(review): the rank offset uses the TOTAL non-zero count across the
    # whole matrix (entries.shape[0]), not a per-column zero count -- confirm
    # this matches the reference R implementation.
    ranks = np.concatenate(
        [rankdata(col) + (total_nnz - len(col)) - (1 + (total_nnz - len(col))) / 2
         for col in per_col])
    return sparse.csr_matrix(
        (ranks, (entries['i'].to_numpy(), entries['j'].to_numpy())), shape=X.shape)
67
+
68
+
69
def rankMatrix_dense(X):
    """Rank the values in each column using dense ranking method.

    Dense ranking assigns the same rank to tied values without gaps.
    For sparse input only the non-zero values are ranked; zeros are preserved.

    Args:
        X: Input matrix (dense or sparse).

    Returns:
        Ranked matrix (sparse CSR for sparse input, ndarray for dense).
    """
    if sparse.issparse(X):
        idx_row, idx_col, dat = sparse.find(X)
        df = pd.DataFrame({'i': idx_row, 'j': idx_col, 'x': dat}).sort_values('j')
        # Start index of each column's run of non-zero entries.
        split_idx = np.unique(df['j'].to_numpy(), return_index=True)[1][1:]
        df['r'] = np.concatenate(
            [rankdata(x, method="dense") for x in np.split(df['x'].to_numpy(), split_idx)])
        ranked = sparse.csr_matrix((df['r'].to_numpy(), (df['i'].to_numpy(), df['j'].to_numpy())), shape=X.shape)
    else:
        # BUGFIX: the dense-input branch previously used method="average",
        # contradicting both the function's documented purpose and the sparse
        # branch above (an apparent copy-paste from rankMatrix).
        ranked = rankdata(X, method="dense", axis=0)

    return ranked
91
+
92
+
93
def rankMatrix_nonzero(X):
    """Rank only the non-zero values in each column.

    Zero values remain zero. Useful for sparse data.

    Args:
        X: Input matrix (dense or sparse).

    Returns:
        Ranked matrix with zeros preserved.
    """
    if not sparse.issparse(X):
        # Dense input: rank every value column-wise, averaging ties.
        return rankdata(X, method="average", axis=0)

    rows, cols, vals = sparse.find(X)
    entries = pd.DataFrame({'i': rows, 'j': cols, 'x': vals}).sort_values('j')
    # Start index of each column's run of non-zero entries (first column omitted).
    starts = np.unique(entries['j'].to_numpy(), return_index=True)[1][1:]
    # Rank each column's non-zero values independently (average ties).
    ranks = np.concatenate(
        [rankdata(col) for col in np.split(entries['x'].to_numpy(), starts)])
    return sparse.csr_matrix(
        (ranks, (entries['i'].to_numpy(), entries['j'].to_numpy())), shape=X.shape)
115
+
116
+
117
def group_vec_to_ident_mat(group, norm=True):
    """Convert a group vector to an identity matrix for aggregation.

    Creates a sparse matrix where each row corresponds to an item,
    and columns to groups. Values are 1 for membership.

    Args:
        group: Pandas Series or array with group labels (castable to int).
        norm: If True, normalize each column by group size.

    Returns:
        Sparse CSR matrix of shape (len(group), number of distinct groups).
    """
    # Drop missing labels; keep the positional indices of valid entries.
    if isinstance(group, pd.Series):
        keep = np.where(group.notnull().to_numpy())[0]
        labels = group.iloc[keep].astype(int).to_numpy()
    else:
        arr = np.asarray(group)
        keep = np.where(~pd.isnull(arr))[0]
        labels = arr[keep].astype(int)

    ones = np.repeat(1, len(keep))
    mat_ident = sparse.csr_matrix((ones, (keep, labels)),
                                  shape=(len(group), len(np.unique(labels))))
    if norm:
        # Divide each column by its group size so aggregation yields means.
        group_sizes = np.ravel(mat_ident.sum(axis=0))
        mat_ident = mat_ident @ sparse.diags(1 / group_sizes)

    return mat_ident
143
+
144
+
145
def summarize_numeric_matrix(mat, group, use_mean=True):
    """Summarize a numeric matrix by grouping rows.

    Aggregates the matrix by groups, optionally computing mean per group.

    Args:
        mat: Numeric matrix (cells x features).
        group: Group labels for each row.
        use_mean: If True, compute mean; else sum.

    Returns:
        Summarized matrix (groups x features).
    """
    # Build the (cells x groups) indicator; normalizing it turns the
    # matrix product below into a per-group mean instead of a sum.
    indicator = group_vec_to_ident_mat(group, norm=use_mean)
    return indicator.transpose() @ mat
@@ -0,0 +1,97 @@
1
+ Metadata-Version: 2.4
2
+ Name: pysimspec
3
+ Version: 0.1.0
4
+ Summary: Python implementation of the simspec algorithm
5
+ Author-email: Zhisong He <zhisong.he@bsse.ethz.ch>
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: numpy
11
+ Requires-Dist: pandas
12
+ Requires-Dist: anndata
13
+ Requires-Dist: scanpy
14
+ Requires-Dist: scipy
15
+ Requires-Dist: scikit-learn
16
+ Requires-Dist: tqdm
17
+ Requires-Dist: rich
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=7.0; extra == "dev"
20
+ Dynamic: license-file
21
+
22
+ # pysimspec
23
+ Python implementation of simspec (for RSS/CSS). The paper detailing the method was published in [Genome Biology](https://link.springer.com/article/10.1186/s13059-020-02147-4) in 2020. The original R implementation is available at [GitHub](https://github.com/quadbio/simspec).
24
+
25
+ ## Installation
26
+
27
+ First, clone the codebase to your local environment
28
+ ```bash
29
+ git clone https://github.com/quadbio/pysimspec.git
30
+ ```
31
+
32
+ Next, install the package with `pip`
33
+ ```bash
34
+ cd pysimspec
35
+ pip install .
36
+ ```
37
+
38
+ Just to mention, this project uses [uv](https://github.com/astral-sh/uv) for fast Python package management.
39
+ ```bash
40
+ uv venv
41
+ uv pip install -e '.[dev]'
42
+ ```
43
+
44
+ ## Quick example
45
+
46
+ ```python
47
+ import scanpy as sc
48
+ import anndata
49
+ from pysimspec import Simspec, set_log_level, load
50
+
51
+ # Set up logging
52
+ set_log_level("INFO")
53
+
54
+ # Load and concatenate data
55
+ adata_DS1 = sc.read_h5ad('DS1_raw.h5ad')
56
+ adata_DS2 = sc.read_h5ad('DS2_raw.h5ad')
57
+ adata_DS1.obs['batch'] = 'DS1'
58
+ adata_DS2.obs['batch'] = 'DS2'
59
+ adata = anndata.concat([adata_DS1, adata_DS2], join='inner', keys=['DS1','DS2'], index_unique="_")
60
+
61
+ # Data preprocessing
62
+ adata.layers['counts'] = adata.X.copy()
63
+ sc.pp.normalize_total(adata, target_sum=1e4)
64
+ sc.pp.log1p(adata)
65
+ sc.pp.highly_variable_genes(adata, flavor='seurat_v3', layer='counts', n_top_genes=3000, batch_key='batch')
66
+ sc.pp.pca(adata, n_comps=20, mask_var='highly_variable')
67
+
68
+ # Run CSS
69
+ simspec = Simspec()
70
+ simspec.compute_references(adata, batch = 'batch', use_rep = 'X_pca')
71
+ simspec.compute_simspec(adata)
72
+ simspec.compute_PCA(n_pcs = 10)
73
+ adata.obsm['X_css'] = simspec.get_result()
74
+ adata.obsm['X_csspca'] = simspec.get_transformed_result()
75
+
76
+ # Use CSS representation for followup analysis
77
+ sc.pp.neighbors(adata, use_rep='X_css')
78
+ sc.tl.umap(adata)
79
+ sc.pl.umap(adata, color='batch')
80
+
81
+ # Save the Simspec object
82
+ simspec.save('simspec.pkl')
83
+
84
+ # Calculate projected CSS representation for the new data
85
+ adata_DS3 = sc.read_h5ad('DS3_raw.h5ad')
86
+ adata_DS3.layers['counts'] = adata_DS3.X.copy()
87
+ sc.pp.normalize_total(adata_DS3, target_sum=1e4)
88
+ sc.pp.log1p(adata_DS3)
89
+
90
+ simspec = load('simspec.pkl') # load the saved Simspec object
91
+ simspec.compute_simspec(adata_DS3)
92
+ adata_DS3.obsm['X_css_proj'] = simspec.get_result()
93
+ adata_DS3.obsm['X_csspca_proj'] = simspec.get_transformed_result()
94
+ ```
95
+
96
+ ## License
97
+ MIT
@@ -0,0 +1,13 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/pysimspec/__init__.py
5
+ src/pysimspec/core.py
6
+ src/pysimspec/logging.py
7
+ src/pysimspec/utils.py
8
+ src/pysimspec.egg-info/PKG-INFO
9
+ src/pysimspec.egg-info/SOURCES.txt
10
+ src/pysimspec.egg-info/dependency_links.txt
11
+ src/pysimspec.egg-info/requires.txt
12
+ src/pysimspec.egg-info/top_level.txt
13
+ tests/test_core.py
@@ -0,0 +1,11 @@
1
+ numpy
2
+ pandas
3
+ anndata
4
+ scanpy
5
+ scipy
6
+ scikit-learn
7
+ tqdm
8
+ rich
9
+
10
+ [dev]
11
+ pytest>=7.0
@@ -0,0 +1 @@
1
+ pysimspec
@@ -0,0 +1,67 @@
1
+ from pysimspec.core import Simspec
2
+ import pysimspec.core as core_module
3
+ import warnings
4
+
5
+ import numpy as np
6
+ from scipy import sparse
7
+
8
+ from pysimspec.utils import rankMatrix_nonzero
9
+
10
+
11
def test_simspec_class():
    """Default construction uses Spearman correlation with scaling enabled."""
    alg = Simspec()
    assert alg.method == 'spearman'
    assert alg.scale
15
+
16
+
17
def test_rankmatrix_nonzero_no_series_swapaxes_warning():
    """rankMatrix_nonzero must not hit the deprecated Series.swapaxes code path."""
    mat = sparse.csr_matrix(np.array([[1.0, 0.0, 3.0],
                                      [0.0, 2.0, 0.0],
                                      [4.0, 0.0, 5.0]]))

    # Capture every FutureWarning raised during the call.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", FutureWarning)
        rankMatrix_nonzero(mat)

    future_msgs = [str(w.message) for w in caught if issubclass(w.category, FutureWarning)]
    assert all("Series.swapaxes" not in msg for msg in future_msgs)
28
+
29
+
30
def test_compute_references_leiden_default_flavor(monkeypatch):
    """compute_references must call scanpy's leiden with flavor='leidenalg' by default."""
    import anndata
    import pandas as pd

    # Two tiny batches of two cells each; the values themselves are arbitrary.
    X = np.array([
        [1.0, 0.0, 2.0],
        [0.5, 1.0, 1.5],
        [2.0, 0.5, 0.0],
        [1.5, 1.0, 0.5],
    ])
    obs = pd.DataFrame({'batch': ['b1', 'b1', 'b2', 'b2']})
    var = pd.DataFrame(index=['g1', 'g2', 'g3'])
    adata = anndata.AnnData(X=X, obs=obs, var=var)
    adata.obsm['X_pca'] = X.copy()

    calls = []

    # Stub out the expensive scanpy calls; record the kwargs leiden receives.
    def fake_neighbors(*args, **kwargs):
        return None

    def fake_leiden(ad, **kwargs):
        calls.append(kwargs)
        # Assign every cell to one cluster so the averaging step still works.
        ad.obs['cluster'] = pd.Categorical(['0'] * ad.n_obs)

    monkeypatch.setattr(core_module.sc.pp, 'neighbors', fake_neighbors)
    monkeypatch.setattr(core_module.sc.tl, 'leiden', fake_leiden)

    alg = Simspec()
    alg.compute_references(
        adata,
        batch='batch',
        use_rep='X_pca',
        method_clustering='leiden',
        highly_variable=False,
    )

    # leiden is invoked once per batch, always with the backward-compatible flavor.
    assert len(calls) == 2
    assert all(call.get('flavor') == 'leidenalg' for call in calls)