pycopro-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- copro/__init__.py +23 -0
- copro/core.py +124 -0
- copro/correlation.py +248 -0
- copro/distance.py +190 -0
- copro/kernel.py +230 -0
- copro/optimization.py +481 -0
- copro/pca.py +173 -0
- copro/scores.py +120 -0
- copro/skrcca.py +150 -0
- copro/utils.py +27 -0
- pycopro-0.1.0.dist-info/METADATA +14 -0
- pycopro-0.1.0.dist-info/RECORD +14 -0
- pycopro-0.1.0.dist-info/WHEEL +5 -0
- pycopro-0.1.0.dist-info/top_level.txt +1 -0
copro/__init__.py
ADDED
@@ -0,0 +1,23 @@
"""CoPro Python — Spatial Kernel-based Reduced Rank CCA for spatial transcriptomics."""

from .core import CoProSingle, CoProMulti, subset_data
from .pca import compute_pca
from .distance import compute_distance
from .kernel import compute_kernel_matrix
from .skrcca import run_skr_cca
from .correlation import compute_normalized_correlation
from .scores import compute_gene_and_cell_scores

__all__ = [
    "CoProSingle",
    "CoProMulti",
    "subset_data",
    "compute_pca",
    "compute_distance",
    "compute_kernel_matrix",
    "run_skr_cca",
    "compute_normalized_correlation",
    "compute_gene_and_cell_scores",
]

__version__ = "0.1.0"
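
The import list above implies the pipeline order (subset → PCA → distance → kernel → SKR-CCA → correlation → scores). Below is a minimal single-slide sketch on invented synthetic data, restricted to the entry points whose full definitions appear in this diff; the remaining exports (compute_pca, compute_kernel_matrix, run_skr_cca, compute_gene_and_cell_scores) live in modules not shown in this section, so their call signatures are not assumed here.

# Usage sketch (editor illustration; not part of the package).
# Synthetic data; only CoProSingle, subset_data, and compute_distance are
# exercised, since their full definitions appear in this diff.
import numpy as np
import pandas as pd
from copro import CoProSingle, subset_data, compute_distance

rng = np.random.default_rng(0)
n_cells, n_genes = 60, 20

obj = CoProSingle(
    normalized_data=rng.normal(size=(n_cells, n_genes)),  # cells × genes
    location_data=pd.DataFrame({"x": rng.uniform(0, 100, size=n_cells),
                                "y": rng.uniform(0, 100, size=n_cells)}),
    meta_data=pd.DataFrame({"cell_id": np.arange(n_cells)}),
    cell_types=np.array(["A"] * 30 + ["B"] * 30),
)

obj = subset_data(obj, ["A", "B"], min_cells=10)
obj = compute_distance(obj)      # Euclidean2D, normalized, truncated
print(sorted(obj.distances))     # ['dist|A|B']
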
copro/core.py
ADDED
@@ -0,0 +1,124 @@
"""CoProSingle and CoProMulti dataclasses — state containers."""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Optional

import numpy as np
import pandas as pd


@dataclass
class CoProSingle:
    # Input data
    normalized_data: np.ndarray  # cells × genes
    location_data: pd.DataFrame  # cells × {x, y, ...}
    meta_data: pd.DataFrame
    cell_types: np.ndarray  # per-cell label vector

    # Set by subset_data
    cell_types_of_interest: list = field(default_factory=list)
    normalized_data_sub: Optional[np.ndarray] = None
    location_data_sub: Optional[pd.DataFrame] = None
    cell_types_sub: Optional[np.ndarray] = None

    # Computed results (keyed dicts)
    pca_global: dict = field(default_factory=dict)  # ct → dict with components/scores/sdev
    distances: dict = field(default_factory=dict)  # flat keys: "dist|A|B"
    kernel_matrices: dict = field(default_factory=dict)  # flat keys: "kernel|sigma0.1|A|B"
    sigma_values: list = field(default_factory=list)
    skr_cca_out: dict = field(default_factory=dict)  # "sigma_0.1" → {ct: w_matrix}
    normalized_correlation: dict = field(default_factory=dict)
    sigma_value_choice: Optional[float] = None
    cell_scores: dict = field(default_factory=dict)
    gene_scores: dict = field(default_factory=dict)
    n_cc: int = 2
    n_pca: int = 30
    scale_pcs: bool = True


@dataclass
class CoProMulti:
    """Multi-slide CoPro object. meta_data must have a 'slideID' column."""

    # Input data
    normalized_data: np.ndarray
    location_data: pd.DataFrame
    meta_data: pd.DataFrame
    cell_types: np.ndarray

    # Slide list
    slide_list: list = field(default_factory=list)  # ordered list of slide IDs

    # Set by subset_data
    cell_types_of_interest: list = field(default_factory=list)
    normalized_data_sub: Optional[np.ndarray] = None
    location_data_sub: Optional[pd.DataFrame] = None
    cell_types_sub: Optional[np.ndarray] = None
    meta_data_sub: Optional[pd.DataFrame] = None  # subset of meta_data (with slideID)

    # PCA
    pca_global: dict = field(default_factory=dict)  # ct → global PCA dict (rotation, sdev)
    pca_results: dict = field(default_factory=dict)  # slide → {ct → scores matrix}

    # Computed results
    distances: dict = field(default_factory=dict)  # flat keys: "dist|{slide}|A|B"
    kernel_matrices: dict = field(default_factory=dict)  # flat keys: "kernel|sigma0.1|{slide}|A|B"
    sigma_values: list = field(default_factory=list)
    skr_cca_out: dict = field(default_factory=dict)  # "sigma_0.1" → {ct: w_matrix} (shared)
    normalized_correlation: dict = field(default_factory=dict)
    sigma_value_choice: Optional[float] = None
    cell_scores: dict = field(default_factory=dict)  # "cellScores|sigma0.1|{slide}|{ct}"
    gene_scores: dict = field(default_factory=dict)  # "geneScores|sigma0.1|{ct}" (shared)
    n_cc: int = 2
    n_pca: int = 30
    scale_pcs: bool = True


def subset_data(obj, cell_types_of_interest: list, min_cells: int = 10):
    """Filter data to the listed cell types. Works for both CoProSingle and CoProMulti."""
    if isinstance(obj, CoProMulti):
        return _subset_data_multi(obj, cell_types_of_interest, min_cells)
    else:
        return _subset_data_single(obj, cell_types_of_interest, min_cells)


def _subset_data_single(obj: CoProSingle, cell_types_of_interest: list, min_cells: int) -> CoProSingle:
    for ct in cell_types_of_interest:
        n = np.sum(obj.cell_types == ct)
        if n < min_cells:
            raise ValueError(
                f"Cell type '{ct}' has only {n} cells (minimum {min_cells} required)."
            )
    mask = np.isin(obj.cell_types, cell_types_of_interest)
    obj.cell_types_of_interest = list(cell_types_of_interest)
    obj.normalized_data_sub = obj.normalized_data[mask]
    obj.location_data_sub = obj.location_data.loc[mask].reset_index(drop=True)
    obj.cell_types_sub = obj.cell_types[mask]
    return obj


def _subset_data_multi(obj: CoProMulti, cell_types_of_interest: list, min_cells: int) -> CoProMulti:
    """Subset a multi-slide object. Checks aggregate cell counts across all slides."""
    if "slideID" not in obj.meta_data.columns:
        raise ValueError("meta_data must have a 'slideID' column for CoProMulti.")

    # Discover slide list from meta_data if not set
    if not obj.slide_list:
        obj.slide_list = sorted(obj.meta_data["slideID"].unique().tolist())

    for ct in cell_types_of_interest:
        # Check the total across all slides
        n_total = np.sum(obj.cell_types == ct)
        if n_total < min_cells:
            raise ValueError(
                f"Cell type '{ct}' has only {n_total} cells total (minimum {min_cells} required)."
            )

    mask = np.isin(obj.cell_types, cell_types_of_interest)
    obj.cell_types_of_interest = list(cell_types_of_interest)
    obj.normalized_data_sub = obj.normalized_data[mask]
    obj.location_data_sub = obj.location_data.loc[mask].reset_index(drop=True)
    obj.cell_types_sub = obj.cell_types[mask]
    obj.meta_data_sub = obj.meta_data.loc[mask].reset_index(drop=True)
    return obj
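
A short sketch, on invented labels, of the two behaviors subset_data() enforces above: the min_cells check (aggregated across slides for CoProMulti) and automatic slide-list discovery from meta_data.

# Usage sketch (editor illustration; not part of the package).
import numpy as np
import pandas as pd
from copro import CoProMulti, subset_data

n = 40
obj = CoProMulti(
    normalized_data=np.zeros((n, 5)),
    location_data=pd.DataFrame({"x": np.zeros(n), "y": np.zeros(n)}),
    meta_data=pd.DataFrame({"slideID": ["s1"] * 20 + ["s2"] * 20}),
    cell_types=np.array(["T"] * 25 + ["B"] * 12 + ["NK"] * 3),
)

try:
    subset_data(obj, ["T", "NK"])   # NK has only 3 cells, below min_cells=10
except ValueError as err:
    print(err)

obj = subset_data(obj, ["T", "B"])
print(obj.slide_list)               # discovered from meta_data: ['s1', 's2']
print(obj.meta_data_sub.shape[0])   # 37 cells kept
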
copro/correlation.py
ADDED
@@ -0,0 +1,248 @@
"""compute_normalized_correlation() — spectral-norm normalized CCA correlation."""

from __future__ import annotations

from itertools import combinations

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds

from .core import CoProSingle
from .skrcca import _prepare_pc_matrices


def _spectral_norm(K: np.ndarray, tol: float = 1e-4) -> float:
    """Largest singular value of K (spectral norm)."""
    try:
        s = svds(K.astype(float), k=1, tol=tol, return_singular_vectors=False)
        return float(s[0])
    except Exception:
        return float(np.linalg.norm(K, ord=2))


def _get_kernel_for_pair(flat_kernels, sigma, ct_i, ct_j, slide=None):
    """Retrieve a kernel, optionally slide-aware, trying both orderings."""
    if slide is None:
        name = f"kernel|sigma{sigma}|{ct_i}|{ct_j}"
        name_sym = f"kernel|sigma{sigma}|{ct_j}|{ct_i}"
    else:
        name = f"kernel|sigma{sigma}|{slide}|{ct_i}|{ct_j}"
        name_sym = f"kernel|sigma{sigma}|{slide}|{ct_j}|{ct_i}"
    if name in flat_kernels:
        return flat_kernels[name]
    if name_sym in flat_kernels:
        return flat_kernels[name_sym].T
    raise KeyError(f"Kernel not found for ({ct_i},{ct_j}) sigma={sigma} slide={slide}")


def compute_normalized_correlation(obj, tol: float = 1e-4):
    """Compute the normalized CCA correlation for each sigma × pair × CC.

    Dispatches to the multi-slide version for CoProMulti objects.

    Formula:
        numerator   = (A @ w1)^T K (B @ w2)
        denominator = ||A @ w1|| * ||B @ w2|| * ||K||_spec
        norm_corr   = numerator / denominator

    Stores obj.normalized_correlation[sigma_name] = DataFrame and sets
    obj.sigma_value_choice to the sigma that maximizes the mean CC1 correlation.
    """
    from .core import CoProMulti
    if isinstance(obj, CoProMulti):
        return _compute_normalized_correlation_multi(obj, tol)

    # --- Single-slide path ---
    cts = obj.cell_types_of_interest
    if not cts:
        raise ValueError("No cell types of interest. Run subset_data() first.")
    if not obj.skr_cca_out:
        raise ValueError("CCA results missing. Run run_skr_cca() first.")

    scale_pcs = getattr(obj, "scale_pcs", True)
    n_cc = obj.n_cc

    # Scaled PC matrices
    X_dict = _prepare_pc_matrices(obj, scale_pcs, cts)

    # Pairs
    if len(cts) == 1:
        pairs = [(cts[0], cts[0])]
    else:
        pairs = list(combinations(cts, 2))

    print("Calculating spectral norms (may take a while)...")

    # Precompute spectral norms for each sigma × pair
    spec_norms = {}
    for sigma in obj.sigma_values:
        spec_norms[sigma] = {}
        for ct_i, ct_j in pairs:
            try:
                K = _get_kernel_for_pair(obj.kernel_matrices, sigma, ct_i, ct_j)
                spec_norms[sigma][(ct_i, ct_j)] = _spectral_norm(K, tol=tol)
                spec_norms[sigma][(ct_j, ct_i)] = spec_norms[sigma][(ct_i, ct_j)]
            except KeyError:
                spec_norms[sigma][(ct_i, ct_j)] = np.nan

    print("Finished calculating spectral norms.")

    correlation_value = {}

    for sigma in obj.sigma_values:
        sigma_name = f"sigma_{sigma}"
        w_sigma = obj.skr_cca_out.get(sigma_name)
        if w_sigma is None:
            continue

        rows = []
        for ct_i, ct_j in pairs:
            A = X_dict[ct_i]
            B = X_dict[ct_j]
            try:
                K = _get_kernel_for_pair(obj.kernel_matrices, sigma, ct_i, ct_j)
            except KeyError:
                continue
            norm_K = spec_norms[sigma].get((ct_i, ct_j), np.nan)

            for cc in range(n_cc):
                w1 = w_sigma[ct_i][:, cc : cc + 1]
                w2 = w_sigma[ct_j][:, cc : cc + 1]

                Aw1 = A @ w1
                Bw2 = B @ w2

                numerator = float((Aw1.T @ K @ Bw2).flat[0])
                denom = float(np.linalg.norm(Aw1)) * float(np.linalg.norm(Bw2)) * norm_K

                norm_corr = 0.0 if abs(denom) < 1e-9 else numerator / denom

                rows.append({
                    "sigma": sigma,
                    "cell_type_1": ct_i,
                    "cell_type_2": ct_j,
                    "CC_index": cc + 1,
                    "normalized_correlation": norm_corr,
                })

        correlation_value[sigma_name] = pd.DataFrame(rows)

    obj.normalized_correlation = correlation_value

    # Choose the sigma maximizing the mean CC1 correlation
    all_cc1 = []
    for sigma_name, df in correlation_value.items():
        if df is not None and len(df) > 0:
            cc1 = df[df["CC_index"] == 1]
            mean_corr = cc1["normalized_correlation"].mean()
            sigma_val = float(sigma_name.replace("sigma_", ""))
            all_cc1.append((sigma_val, mean_corr))

    if all_cc1:
        obj.sigma_value_choice = max(all_cc1, key=lambda x: x[1])[0]

    return obj


def _compute_normalized_correlation_multi(obj, tol=1e-4):
    """Multi-slide normalized correlation: per-slide values matching the R format.

    R computes the normalized correlation independently for each slide using
    the raw (unscaled) per-slide PCA scores from pcaResults (not scaled by
    sdev). We replicate this: for each (sigma, slide, pair, CC), compute
    norm_corr using only that slide's raw PCA scores and per-slide spectral
    norm. The sigma choice is based on the mean CC1 correlation across slides.
    """
    cts = obj.cell_types_of_interest
    slides = obj.slide_list
    n_cc = obj.n_cc

    # Use raw (unscaled) per-slide PCA scores — matching R's pcaResults usage
    X_list_all = {
        slide: {ct: obj.pca_results[slide][ct].astype(float)
                for ct in cts if ct in obj.pca_results.get(slide, {})}
        for slide in slides
    }

    if len(cts) == 1:
        pairs = [(cts[0], cts[0])]
    else:
        pairs = list(combinations(cts, 2))

    # Precompute per-slide spectral norms for each sigma × pair
    print("Calculating spectral norms (multi-slide)...")
    spec_norms = {}  # spec_norms[sigma][(ct_i, ct_j, slide)]
    for sigma in obj.sigma_values:
        spec_norms[sigma] = {}
        for ct_i, ct_j in pairs:
            for slide in slides:
                try:
                    K = _get_kernel_for_pair(obj.kernel_matrices, sigma, ct_i, ct_j, slide)
                    val = _spectral_norm(K, tol)
                except KeyError:
                    val = np.nan
                spec_norms[sigma][(ct_i, ct_j, slide)] = val
                spec_norms[sigma][(ct_j, ct_i, slide)] = val
    print("Finished spectral norms.")

    correlation_value = {}

    for sigma in obj.sigma_values:
        sigma_name = f"sigma_{sigma}"
        w_sigma = obj.skr_cca_out.get(sigma_name)
        if w_sigma is None:
            continue

        rows = []
        for ct_i, ct_j in pairs:
            for cc in range(n_cc):
                w1 = w_sigma[ct_i][:, cc : cc + 1]
                w2 = w_sigma[ct_j][:, cc : cc + 1]

                # Per-slide correlation (matches the R format)
                for slide in slides:
                    A = X_list_all[slide].get(ct_i)
                    B = X_list_all[slide].get(ct_j)
                    if A is None or B is None:
                        continue
                    try:
                        K = _get_kernel_for_pair(obj.kernel_matrices, sigma, ct_i, ct_j, slide)
                    except KeyError:
                        continue

                    norm_K = spec_norms[sigma].get((ct_i, ct_j, slide), np.nan)
                    Aw1 = A @ w1
                    Bw2 = B @ w2
                    numerator = float((Aw1.T @ K @ Bw2).flat[0])
                    denom = (float(np.linalg.norm(Aw1)) *
                             float(np.linalg.norm(Bw2)) *
                             norm_K)
                    norm_corr = 0.0 if abs(denom) < 1e-9 else numerator / denom

                    rows.append({
                        "sigma": sigma,
                        "slideID": slide,
                        "cell_type_1": ct_i,
                        "cell_type_2": ct_j,
                        "CC_index": cc + 1,
                        "normalized_correlation": norm_corr,
                    })

        correlation_value[sigma_name] = pd.DataFrame(rows)

    obj.normalized_correlation = correlation_value

    # Choose the sigma maximizing the mean CC1 correlation across slides
    all_cc1 = []
    for sigma_name, df in correlation_value.items():
        if df is not None and len(df) > 0:
            cc1 = df[df["CC_index"] == 1]
            mean_corr = cc1["normalized_correlation"].mean()
            sigma_val = float(sigma_name.replace("sigma_", ""))
            all_cc1.append((sigma_val, mean_corr))
    if all_cc1:
        obj.sigma_value_choice = max(all_cc1, key=lambda x: x[1])[0]

    return obj
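
To make the docstring formula concrete, here is a worked sketch on tiny invented matrices, independent of any CoPro object; it reproduces the numerator/denominator computed inside compute_normalized_correlation().

# Usage sketch (editor illustration; not part of the package).
import numpy as np

rng = np.random.default_rng(1)
A = rng.normal(size=(8, 3))    # PC scores for cell type A (cells × PCs)
B = rng.normal(size=(6, 3))    # PC scores for cell type B
K = rng.uniform(size=(8, 6))   # cross-type kernel: rows index A cells, cols B cells
w1 = rng.normal(size=(3, 1))   # CCA weight vector for A
w2 = rng.normal(size=(3, 1))   # CCA weight vector for B

Aw1, Bw2 = A @ w1, B @ w2
numerator = (Aw1.T @ K @ Bw2).item()
denominator = np.linalg.norm(Aw1) * np.linalg.norm(Bw2) * np.linalg.norm(K, ord=2)
print(numerator / denominator)

The bound |x^T K y| <= ||x|| ||K||_2 ||y|| guarantees the value lies in [-1, 1], which is what makes the per-sigma means comparable when selecting sigma_value_choice.
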
copro/distance.py
ADDED
@@ -0,0 +1,190 @@
"""compute_distance() — pairwise Euclidean distances between cell types."""

from __future__ import annotations

import warnings
from itertools import combinations

import numpy as np
from scipy.spatial.distance import cdist

from .core import CoProSingle


def _dist_flat_name(ct_i: str, ct_j: str) -> str:
    return f"dist|{ct_i}|{ct_j}"


def _process_distance_matrix(
    dist_mat: np.ndarray,
    truncate: bool,
    percentile_choice: float | None = None,
    set_diag_inf: bool = False,
) -> tuple[np.ndarray, float]:
    """Process a distance matrix: handle zeros, compute the quantile, optionally truncate.

    Returns (processed_matrix, dist_percentile).
    """
    dist_mat = dist_mat.copy()

    if set_diag_inf:
        np.fill_diagonal(dist_mat, np.inf)

    # Replace zeros (overlapping cells) with the smallest non-zero distance
    if np.any(dist_mat == 0):
        min_nz = np.min(dist_mat[dist_mat > 0]) if np.any(dist_mat > 0) else 1.0
        dist_mat[dist_mat == 0] = min_nz
        warnings.warn(
            "Zero distances detected; replaced with smallest non-zero distance."
        )

    # Choose the quantile threshold
    finite_vals = dist_mat[np.isfinite(dist_mat) & (dist_mat > 0)]
    if len(finite_vals) == 0:
        raise ValueError("No finite non-zero distances found.")

    if percentile_choice is None:
        percentile_choice = min(1e-3, 2.0 / max(dist_mat.shape))

    dist_percentile = float(np.quantile(finite_vals, percentile_choice))

    if truncate:
        mask = (dist_mat < dist_percentile) & np.isfinite(dist_mat)
        dist_mat[mask] = dist_percentile

    return dist_mat, dist_percentile


def compute_distance(
    obj,
    dist_type: str = "Euclidean2D",
    normalize: bool = True,
    truncate: bool = True,
):
    """Compute pairwise Euclidean distance matrices between all cell-type pairs.

    Dispatches to the multi-slide version for CoProMulti objects.

    For single-slide, 2+ types: pairs (ct_i, ct_j) stored under 'dist|ct_i|ct_j'.
    For single-slide, 1 type: within-type (ct, ct) stored under 'dist|ct|ct'.
    For multi-slide: keys include the slide: 'dist|{slide}|ct_i|ct_j'.

    Normalization: rescales so the smallest low-quantile distance across all
    pairs (quantile 1e-3 by default, 1e-4 for within-type) equals 0.01.
    """
    from .core import CoProMulti
    if isinstance(obj, CoProMulti):
        return _compute_distance_multi(obj, dist_type, normalize, truncate)

    # --- Single-slide path ---
    cts = obj.cell_types_of_interest
    if not cts:
        raise ValueError("No cell types of interest. Run subset_data() first.")

    if dist_type != "Euclidean2D":
        raise NotImplementedError(f"dist_type '{dist_type}' not implemented. Use 'Euclidean2D'.")

    loc = obj.location_data_sub
    if not {"x", "y"}.issubset(loc.columns):
        raise ValueError("location_data_sub must have columns 'x' and 'y'.")

    distances = {}

    if len(cts) == 1:
        # Within-type only
        ct = cts[0]
        mask = obj.cell_types_sub == ct
        coords = loc.loc[mask, ["x", "y"]].values.astype(float)
        dist_mat = cdist(coords, coords)
        dist_mat, dist_percentile = _process_distance_matrix(
            dist_mat, truncate, percentile_choice=1e-4, set_diag_inf=True
        )
        flat_name = _dist_flat_name(ct, ct)
        distances[flat_name] = dist_mat

        if normalize:
            scaling_factor = 0.01 / dist_percentile
            distances[flat_name] = dist_mat * scaling_factor

    else:
        # Between-type pairs
        pairs = list(combinations(cts, 2))
        dist_percentiles = []
        raw_mats = {}

        for ct_i, ct_j in pairs:
            mask_i = obj.cell_types_sub == ct_i
            mask_j = obj.cell_types_sub == ct_j
            coords_i = loc.loc[mask_i, ["x", "y"]].values.astype(float)
            coords_j = loc.loc[mask_j, ["x", "y"]].values.astype(float)

            dist_mat = cdist(coords_i, coords_j)
            dist_mat, dist_pct = _process_distance_matrix(dist_mat, truncate)
            dist_percentiles.append(dist_pct)

            flat_name = _dist_flat_name(ct_i, ct_j)
            raw_mats[flat_name] = dist_mat

        if normalize:
            min_percentile = min(dist_percentiles)
            scaling_factor = 0.01 / min_percentile
            for flat_name, dist_mat in raw_mats.items():
                distances[flat_name] = dist_mat * scaling_factor
        else:
            distances = raw_mats

    obj.distances = distances
    return obj


def _compute_distance_multi(obj, dist_type="Euclidean2D", normalize=True, truncate=True):
    """Multi-slide distance computation. Keys: 'dist|{slide}|ct_i|ct_j'."""
    cts = obj.cell_types_of_interest
    slides = obj.slide_list
    slide_ids = obj.meta_data_sub["slideID"].values
    loc = obj.location_data_sub

    if not {"x", "y"}.issubset(loc.columns):
        raise ValueError("location_data_sub must have columns 'x' and 'y'.")

    distances = {}
    all_percentiles = []
    raw_mats = {}

    if len(cts) == 1:
        ct = cts[0]
        for slide in slides:
            slide_ct_mask = (obj.cell_types_sub == ct) & (slide_ids == slide)
            if np.sum(slide_ct_mask) <= 5:
                continue
            coords = loc.loc[slide_ct_mask, ["x", "y"]].values.astype(float)
            dist_mat = cdist(coords, coords)
            dist_mat, pct = _process_distance_matrix(
                dist_mat, truncate, percentile_choice=1e-4, set_diag_inf=True
            )
            flat_name = f"dist|{slide}|{ct}|{ct}"
            raw_mats[flat_name] = dist_mat
            all_percentiles.append(pct)
    else:
        pairs = list(combinations(cts, 2))
        for slide in slides:
            for ct_i, ct_j in pairs:
                mask_i = (obj.cell_types_sub == ct_i) & (slide_ids == slide)
                mask_j = (obj.cell_types_sub == ct_j) & (slide_ids == slide)
                if np.sum(mask_i) <= 5 or np.sum(mask_j) <= 5:
                    continue
                coords_i = loc.loc[mask_i, ["x", "y"]].values.astype(float)
                coords_j = loc.loc[mask_j, ["x", "y"]].values.astype(float)
                dist_mat = cdist(coords_i, coords_j)
                dist_mat, pct = _process_distance_matrix(dist_mat, truncate)
                flat_name = f"dist|{slide}|{ct_i}|{ct_j}"
                raw_mats[flat_name] = dist_mat
                all_percentiles.append(pct)

    if normalize and all_percentiles:
        global_min = min(all_percentiles)
        scaling_factor = 0.01 / global_min
        for k, v in raw_mats.items():
            distances[k] = v * scaling_factor
    else:
        distances = raw_mats

    obj.distances = distances
    return obj
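
A small sketch of _process_distance_matrix() on a hand-made 3×3 matrix, illustrating the diagonal masking and low-quantile truncation described above (the matrix and the 0.5 quantile are invented to make the clamping visible on a tiny example).

# Usage sketch (editor illustration; not part of the package).
import numpy as np
from copro.distance import _process_distance_matrix

d = np.array([[0.0, 5.0, 1.0],
              [5.0, 0.0, 2.0],
              [1.0, 2.0, 0.0]])

out, pct = _process_distance_matrix(d, truncate=True,
                                    percentile_choice=0.5, set_diag_inf=True)
print(pct)   # 2.0: median of the finite off-diagonal distances {1, 1, 2, 2, 5, 5}
print(out)   # diagonal is inf; the two 1.0 entries are clamped up to 2.0

With normalize=True, compute_distance() then multiplies every matrix by 0.01 divided by the smallest returned quantile value, so the tightest cell-cell spacing maps to 0.01; presumably this puts the sigma values consumed by compute_kernel_matrix on a consistent scale, though kernel.py is not shown in this section.
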