PyPI - lotsofcells - Versions diffs - 0.3.0__py3-none-any.whl - Mend

lotsofcells 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

lotsofcells/__init__.py +39 -0
lotsofcells/_stats.py +279 -0
lotsofcells/_utils.py +211 -0
lotsofcells/entropy.py +354 -0
lotsofcells/lotsofcells.py +330 -0
lotsofcells/plots.py +681 -0
lotsofcells-0.3.0.dist-info/METADATA +21 -0
lotsofcells-0.3.0.dist-info/RECORD +10 -0
lotsofcells-0.3.0.dist-info/WHEEL +5 -0
lotsofcells-0.3.0.dist-info/top_level.txt +1 -0

lotsofcells/__init__.py ADDED Viewed

@@ -0,0 +1,39 @@
+"""lotsofcells: proportion-test statistics and visualization on single-cell metadata.
+Python port of the R package `lotsOfCells`, designed for the scanpy / AnnData
+framework. Compatible with single-cell (`AnnData`) and spatial transcriptomics
+(`SpatialData` / `MuData`) objects, since metadata is read from `.obs`.
+References
+----------
+Óscar González-Velasco; lotsOfCells: data visualization and statistics of
+single cell metadata. bioRxiv 2024.05.23.595582;
+https://doi.org/10.1101/2024.05.23.595582
+"""
+from ._utils import get_metadata, get_palette
+from .lotsofcells import lots_of_cells
+from .entropy import entropy_score
+from .plots import (
+    bar_chart,
+    waffle_chart,
+    polar_chart,
+    density_chart,
+    dynamics_chart,
+    plot_abundance_test,
+)
+# Public API of the package: every name listed here is imported at the top of this module.
+__all__ = [
+    "get_metadata",
+    "get_palette",
+    "lots_of_cells",
+    "entropy_score",
+    "bar_chart",
+    "waffle_chart",
+    "polar_chart",
+    "density_chart",
+    "dynamics_chart",
+    "plot_abundance_test",
+]
+# Package version string.
+__version__ = "0.3.0"

lotsofcells/_stats.py ADDED Viewed

@@ -0,0 +1,279 @@
+"""Internal statistical primitives.
+Direct ports of the R helpers `cellToGamma`, `cellToGammaOriginal` and
+`cellToMontecarlo`. Implementation choices (pseudocounts, transforms) match
+the R version exactly so results are comparable.
+"""
+from __future__ import annotations
+from typing import Dict, List, Sequence, Tuple, Union
+import numpy as np
+import pandas as pd
+# --- Transformations used everywhere ---------------------------------------------------
+def pseudo_count(counts: np.ndarray) -> np.ndarray:
+    """Add the fixed 0.5 pseudocount to ``counts``.
+    The original note states this matches the pseudocount in lotsOfCells.R.
+    """
+    shifted = counts + 0.5
+    return shifted
+def pseudo_count_arcsin(counts: np.ndarray) -> np.ndarray:
+    """Return ``counts + sqrt(counts**2 + 1)``.
+    The original note states this matches the pseudocount in entropyScore.R.
+    """
+    squared = counts * counts
+    return counts + np.sqrt(squared + 1)
+def asrt(p: np.ndarray) -> np.ndarray:
+    """Arcsin square-root transform; values are clipped to [0, 1] before the square root."""
+    bounded = np.clip(p, 0, 1)
+    return np.arcsin(np.sqrt(bounded))
+def logit(f: np.ndarray) -> np.ndarray:
+    """Log-odds ``log(f / (1 - f))``; ``f`` is used as given, with no clipping."""
+    odds = f / (1 - f)
+    return np.log(odds)
+def geom_mean(x: np.ndarray) -> float:
+    """Geometric mean of the strictly positive entries of ``x``.
+    Entries that are zero (or negative) are dropped before averaging the logs,
+    so a single zero does not force the result to 0 the way a literal
+    ``exp(mean(log(x)))`` would. Per the original design note this is a
+    deliberate departure from the R code: in the symmetric divergence used by
+    `entropyScore`, a zero term means the two distributions agree on that cell
+    type, and such a term should add nothing rather than wipe out the score.
+    The note singles out the 1-class test, where random partitions of a single
+    condition frequently produce such agreeing terms.
+    Returns ``0.0`` when no entry is positive (including empty input).
+    """
+    values = np.asarray(x, dtype=float)
+    positive = values[values > 0]
+    if positive.size:
+        return float(np.exp(np.log(positive).mean()))
+    return 0.0
+# --- Contingency tables --------------------------------------------------------------
+def _table(groups: Sequence[str], covariable: Sequence[str]) -> pd.DataFrame:
+    """Cross-tabulate ``groups`` (rows) against ``covariable`` (columns).
+    Described in the original as the equivalent of R
+    `table(data.frame(groups, covariable))`.
+    """
+    row_key = pd.Series(groups, name="groups")
+    col_key = pd.Series(covariable, name="covariable")
+    return pd.crosstab(row_key, col_key)
+def _ensure_rows(tab: pd.DataFrame, label_order: Sequence[str]) -> pd.DataFrame:
+    """Return ``tab`` with its rows in ``label_order``, zero-filling absent labels."""
+    absent = [label for label in label_order if label not in tab.index]
+    if not absent:
+        return tab.reindex(label_order)
+    # Append an all-zero row per absent label, then put rows in the requested order.
+    padding = pd.DataFrame(0, index=absent, columns=tab.columns)
+    return pd.concat([tab, padding]).reindex(label_order)
+def _ensure_cols(tab: pd.DataFrame, indexes: Sequence[str]) -> pd.DataFrame:
+    """Return ``tab`` restricted to the columns in ``indexes``, in that order.
+    Columns named in ``indexes`` but absent from ``tab`` are added as all-zero
+    columns. The input frame is left untouched: the result is built with
+    ``reindex`` instead of assigning new columns onto ``tab``.
+    """
+    return tab.reindex(columns=list(indexes), fill_value=0)
+# --- Goodman & Kruskal gamma rank correlation ----------------------------------------
+def _ranked_proportions(
+    tab: pd.DataFrame,
+    label_order: Sequence[str],
+    indexes: Sequence[str],
+) -> np.ndarray:
+    """Turn a label x covariable count table into per-row average ranks.
+    The table is first aligned to rows=``label_order`` and cols=``indexes``
+    (absent entries become zeros). Each column is divided by its column sum
+    plus 0.1, and each row (label) is then ranked across the covariable
+    columns with ties averaged.
+    The original note gives the R counterpart as
+    `t(apply(dftmp,2,function(row){row/(sum(row)+0.1)}))[labelOrder, indexes]`
+    followed by `t(apply(.,1,rank))`.
+    Returns an array of shape (n_labels, n_cov).
+    """
+    aligned = _ensure_cols(_ensure_rows(tab, label_order), indexes)
+    counts = aligned.values
+    # The 0.1 offset keeps the division defined for all-zero columns.
+    denominators = counts.sum(axis=0) + 0.1  # shape (n_cov,)
+    proportions = counts / denominators[np.newaxis, :]
+    # axis=1 -> `_rank_avg` receives one label row at a time.
+    return np.apply_along_axis(_rank_avg, 1, proportions)
+def _rank_avg(x: np.ndarray) -> np.ndarray:
+    """1-based ranks of ``x`` with tied values sharing their average rank.
+    Described in the original as the equivalent of R
+    ``base::rank(x, ties.method='average')``.
+    """
+    # Ordinal ranks first: a stable sort gives ties consecutive positions.
+    sort_idx = np.argsort(x, kind="mergesort")
+    ordinal = np.empty_like(sort_idx, dtype=float)
+    ordinal[sort_idx] = np.arange(1, len(x) + 1, dtype=float)
+    # Then replace each ordinal rank by the mean over its group of equal values.
+    _, group_of, group_size = np.unique(x, return_inverse=True, return_counts=True)
+    group_total = np.bincount(group_of, weights=ordinal, minlength=len(group_size))
+    return (group_total / group_size)[group_of]
+def _concordant_discordant(
+    ranks: np.ndarray, rank_index: np.ndarray, original: bool
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Count concordant and discordant label pairs for every covariable column.
+    Every pair of rows ``(i, k)`` with ``k > i`` is compared column by column
+    through ``sign(ranks[i] - ranks[k])``.
+    With ``original=False`` the expected sign is always -1 (the original note
+    ties this to the R `cellToGamma`, which assumes a monotonic 1..N order).
+    With ``original=True`` the expected sign is ``sign(rank_index[i] -
+    rank_index[k])``.
+    A pair is concordant when the observed sign equals the expected one, and
+    discordant when it differs and the two ranks are not tied.
+    Returns ``(n_concordant, n_discordant)``, each of length n_cov.
+    """
+    n_labels, n_cov = ranks.shape
+    concordant = np.zeros(n_cov, dtype=int)
+    discordant = np.zeros(n_cov, dtype=int)
+    for i in range(n_labels - 1):
+        observed = np.sign(ranks[i][np.newaxis, :] - ranks[i + 1 :])  # (rest, n_cov)
+        if original:
+            # One expected sign per remaining row, broadcast over columns.
+            expected = np.sign(rank_index[i] - rank_index[i + 1 :])[:, np.newaxis]
+        else:
+            expected = -1
+        agrees = observed == expected
+        not_tied = observed != 0
+        concordant += agrees.sum(axis=0)
+        discordant += (~agrees & not_tied).sum(axis=0)
+    return concordant, discordant
+def cell_to_gamma(
+    covariable: np.ndarray,
+    groups: np.ndarray,
+    label_order: Sequence[str],
+    indexes: Sequence[str],
+    cell_crowd: Dict[str, int],
+    rank_index: np.ndarray,
+    rng: np.random.Generator,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Null draw: sample every label's crowd from the pooled covariable.
+    For each label in ``label_order``, ``cell_crowd[label]`` values are drawn
+    with replacement from the whole ``covariable`` array (``groups`` is not
+    consulted), so any label/covariable association is broken.
+    Returns ``(n_concordant, n_discordant)``, one entry per covariable column.
+    """
+    sizes = [int(cell_crowd[label]) for label in label_order]
+    # Draws happen in `label_order`, one `rng.choice` call per label.
+    draws = [rng.choice(covariable, size=size, replace=True) for size in sizes]
+    tags = [np.repeat(label, size) for label, size in zip(label_order, sizes)]
+    tab = _table(np.concatenate(tags), np.concatenate(draws))
+    ranks = _ranked_proportions(tab, label_order, indexes)
+    return _concordant_discordant(ranks, rank_index, original=False)
+def cell_to_gamma_original(
+    covariable: np.ndarray,
+    groups: np.ndarray,
+    label_order: Sequence[str],
+    indexes: Sequence[str],
+    cell_crowd: Dict[str, int],
+    rank_index: np.ndarray,
+    rng: np.random.Generator,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Observed-data draw: subsample each label from its own cells.
+    For each label, ``cell_crowd[label]`` values are drawn from the covariable
+    entries whose group equals that label. Sampling is without replacement
+    unless more values are requested than the label has. Labels with no cells
+    are skipped.
+    Returns ``(n_concordant, n_discordant)``, one entry per covariable column.
+    """
+    draws, tags = [], []
+    for label in label_order:
+        size = int(cell_crowd[label])
+        members = covariable[groups == label]
+        if len(members) > 0:
+            # Fall back to replacement only when the pool is too small.
+            with_replacement = size > len(members)
+            draws.append(rng.choice(members, size=size, replace=with_replacement))
+            tags.append(np.repeat(label, size))
+    tab = _table(np.concatenate(tags), np.concatenate(draws))
+    ranks = _ranked_proportions(tab, label_order, indexes)
+    return _concordant_discordant(ranks, rank_index, original=True)
+# --- Monte Carlo for 2-condition fold-change -----------------------------------------
+def _proportions_from_table(
+    tab: pd.DataFrame,
+    label_order: Sequence[str],
+    indexes: Sequence[str],
+    pseudo: bool = True,
+) -> np.ndarray:
+    """Column-wise proportions of a count table aligned to ``label_order`` x ``indexes``.
+    When ``pseudo`` is true, `pseudo_count` is applied to the counts first.
+    Each column is then divided by its column sum plus 1.0.
+    Returns an array of shape (n_labels, n_cov).
+    """
+    aligned = _ensure_cols(_ensure_rows(tab, label_order), indexes)
+    counts = aligned.values.astype(float)
+    counts = pseudo_count(counts) if pseudo else counts
+    totals = counts.sum(axis=0) + 1.0
+    return counts / totals[np.newaxis, :]
+def cell_to_montecarlo(
+    covariable: np.ndarray,
+    groups: np.ndarray,
+    label_order: Sequence[str],
+    indexes: Sequence[str],
+    cell_crowd: Union[Dict[str, int], Dict[str, List[int]]],
+    rng: np.random.Generator,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Return (mixed-pool fold change, original-resampled fold change).
+    For every label two samples are drawn with replacement, in this order:
+    one from the whole ``covariable`` array (mixed pool) and one from the
+    entries belonging to that label (an empty array if the label has none).
+    ``cell_crowd[label]`` is either a single size or a list/array of sizes;
+    in the latter case one draw is made per size and the draws are joined.
+    Both returned arrays have length ``len(indexes)`` and hold
+    ``log2(asrt(p1) / asrt(p2))``, where p1 and p2 are the proportion rows of
+    the first and second label in ``label_order``.
+    """
+    def _draw(pool, crowd):
+        # A list/array crowd means several independent draws, concatenated.
+        if isinstance(crowd, (list, np.ndarray)):
+            return np.concatenate(
+                [
+                    rng.choice(pool, size=int(size), replace=True)
+                    for size in np.asarray(crowd, dtype=int)
+                ]
+            )
+        return rng.choice(pool, size=int(crowd), replace=True)
+    def _fold_change(cov_parts, grp_parts):
+        tab = _table(np.concatenate(grp_parts), np.concatenate(cov_parts))
+        props = _proportions_from_table(tab, label_order, indexes, pseudo=True)
+        return np.log2(asrt(props[0]) / asrt(props[1]))
+    mixed_cov, mixed_grp, orig_cov, orig_grp = [], [], [], []
+    for label in label_order:
+        crowd = cell_crowd[label]
+        # Mixed-pool draw comes first so the RNG call order stays fixed.
+        mixed = _draw(covariable, crowd)
+        own_pool = covariable[groups == label]
+        if len(own_pool) == 0:
+            own = np.array([], dtype=covariable.dtype)
+        else:
+            own = _draw(own_pool, crowd)
+        mixed_cov.append(mixed)
+        mixed_grp.append(np.repeat(label, len(mixed)))
+        orig_cov.append(own)
+        orig_grp.append(np.repeat(label, len(own)))
+    fc_mixed = _fold_change(mixed_cov, mixed_grp)
+    fc_orig = _fold_change(orig_cov, orig_grp)
+    return fc_mixed, fc_orig

lotsofcells/_utils.py ADDED Viewed

@@ -0,0 +1,211 @@
+"""Internal helpers: metadata extraction and color palette."""
+from __future__ import annotations
+from typing import Optional, Sequence, Union
+import numpy as np
+import pandas as pd
+# Default palette mirrors the R version (alpha 0.95 + slight desaturation).
+# Twenty hex colors; `get_palette` uses this list when no palette is supplied.
+_DEFAULT_PALETTE = [
+    "#8DA0CB", "#926F99", "#9FBE8F", "#E8D161", "#DD8080",
+    "#613269", "#B9E8F5", "#CBB8D0", "#F9BE8D", "#B25356",
+    "#519B84", "#B79C76", "#C1D63C", "#F28D35", "#CA4133",
+    "#F0DA88", "#7EAB6F", "#666666", "#3C7DA6", "#4AA147",
+]
+def _is_anndata(obj) -> bool:
+    """Return True for an AnnData instance or anything whose ``.obs`` is a DataFrame.
+    `anndata` is imported lazily; if the import fails the check falls back to
+    the ``.obs`` duck test alone.
+    """
+    try:
+        import anndata  # noqa: F401
+    except Exception:
+        anndata = None  # type: ignore
+    if anndata is not None and isinstance(obj, anndata.AnnData):
+        return True
+    # Duck-typed fallback: any object carrying a DataFrame under `.obs`.
+    return isinstance(getattr(obj, "obs", None), pd.DataFrame)
+def _is_spatialdata(obj) -> bool:
+    """Return True if ``obj`` is a ``spatialdata.SpatialData``.
+    Any failure (including `spatialdata` not being importable) yields False.
+    """
+    try:
+        import spatialdata  # type: ignore
+        matched = isinstance(obj, spatialdata.SpatialData)
+    except Exception:
+        matched = False
+    return matched
+def _is_mudata(obj) -> bool:
+    """Return True if ``obj`` is a ``mudata.MuData``.
+    Any failure (including `mudata` not being importable) yields False.
+    """
+    try:
+        import mudata  # type: ignore
+        matched = isinstance(obj, mudata.MuData)
+    except Exception:
+        matched = False
+    return matched
+def get_metadata(sc_object, table: Optional[str] = None) -> pd.DataFrame:
+    """Return a copy of the cell-level metadata held by ``sc_object``.
+    Parameters
+    ----------
+    sc_object
+        One of: ``pandas.DataFrame`` (copied as is), ``spatialdata.SpatialData``,
+        ``mudata.MuData``, or an AnnData-like object exposing a DataFrame under
+        ``.obs``. The original note describes ``.obs`` as the analogue of
+        ``Seurat[[]]`` / ``SingleCellExperiment::colData``.
+    table
+        For ``SpatialData``: name of the table whose ``.obs`` is returned; if
+        ``None`` the object must hold exactly one table, which is then used.
+        For ``MuData``: modality name; if ``None`` the ``.obs`` of the MuData
+        object itself is returned. Ignored for the other input types.
+    Raises
+    ------
+    ValueError
+        If ``sc_object`` is ``None``, or a ``SpatialData`` has no tables, or has
+        several tables and ``table`` is not given.
+    TypeError
+        If ``sc_object`` is none of the supported types.
+    """
+    if sc_object is None:
+        raise ValueError("At least an AnnData/SpatialData/DataFrame is required.")
+    if isinstance(sc_object, pd.DataFrame):
+        return sc_object.copy()
+    # SpatialData and MuData are tested before the generic `.obs` duck check.
+    if _is_spatialdata(sc_object):
+        tables = dict(sc_object.tables)
+        if not tables:
+            raise ValueError("SpatialData object has no tables.")
+        if table is None and len(tables) > 1:
+            raise ValueError(
+                f"SpatialData has multiple tables {list(tables)}; "
+                "specify `table=...`."
+            )
+        chosen = next(iter(tables)) if table is None else table
+        return tables[chosen].obs.copy()
+    if _is_mudata(sc_object):
+        source = sc_object if table is None else sc_object[table]
+        return source.obs.copy()
+    if _is_anndata(sc_object):
+        return sc_object.obs.copy()
+    raise TypeError(
+        "Unsupported object type for metadata extraction. "
+        "Pass a pandas.DataFrame or AnnData/MuData/SpatialData."
+    )
+def get_numerical_variable(
+    sc_object, numerical_variable: str, metadata: pd.DataFrame
+) -> np.ndarray:
+    """Resolve ``numerical_variable`` from metadata or, failing that, from features.
+    If the name is a column of ``metadata`` that column is returned. Otherwise,
+    for an AnnData-like ``sc_object`` whose ``var_names`` contains the name,
+    the matching column of ``X`` is returned, reordered to ``metadata.index``.
+    The original note says this mirrors the R behaviour of `density_chart`.
+    Raises
+    ------
+    ValueError
+        If the name is found in neither place.
+    """
+    if numerical_variable in metadata.columns:
+        return metadata[numerical_variable].to_numpy()
+    if _is_anndata(sc_object) and numerical_variable in sc_object.var_names:
+        position = sc_object.var_names.get_loc(numerical_variable)
+        column = sc_object.X[:, position]
+        # Slices exposing `toarray` (presumably sparse) are densified first.
+        if hasattr(column, "toarray"):
+            values = column.toarray().ravel()
+        else:
+            values = np.asarray(column).ravel()
+        # Label by cell name, then pick cells in the metadata row order.
+        by_cell = pd.Series(values, index=sc_object.obs_names)
+        return by_cell.loc[metadata.index].to_numpy()
+    raise ValueError(
+        f"Variable '{numerical_variable}' not found in metadata columns "
+        "or feature names."
+    )
+def get_palette(
+    use_palette: Optional[Sequence[str]] = None, n_colors: int = 20
+) -> list:
+    """Return a list of `n_colors` colors.
+    If `use_palette` is None, the default lotsOfCells palette is used.
+    If no more colors are requested than the palette holds, its first
+    `n_colors` entries are returned. Otherwise the palette is stretched to
+    `n_colors` by linear interpolation in RGB space (described in the original
+    as the analogue of `colorRampPalette`).
+    Raises
+    ------
+    ValueError
+        If `n_colors` is negative, or if colors are requested from an empty
+        palette.
+    """
+    # A negative count would otherwise slice from the end and silently return
+    # a list of the wrong length.
+    if n_colors < 0:
+        raise ValueError(f"`n_colors` must be non-negative, got {n_colors}.")
+    base = list(use_palette) if use_palette is not None else list(_DEFAULT_PALETTE)
+    if n_colors <= len(base):
+        return base[:n_colors]
+    # Interpolation needs at least one anchor color.
+    if not base:
+        raise ValueError("Cannot build a palette from an empty `use_palette`.")
+    return _ramp_palette(base, n_colors)
+def _hex_to_rgb(h: str) -> np.ndarray:
+    """Convert a hex color string to an array of three floats in [0, 1].
+    Leading ``#`` characters are optional. Six-digit strings are read as
+    ``RRGGBB``; any characters after the sixth are ignored. Three- and
+    four-digit shorthand (``#RGB`` / ``#RGBA``) is expanded by doubling each
+    digit before parsing.
+    Raises
+    ------
+    ValueError
+        If a channel cannot be parsed as hexadecimal.
+    """
+    digits = h.lstrip("#")
+    # CSS-style shorthand: "abc" -> "aabbcc".
+    if len(digits) in (3, 4):
+        digits = "".join(ch * 2 for ch in digits)
+    channels = [int(digits[i : i + 2], 16) for i in (0, 2, 4)]
+    return np.array(channels, dtype=float) / 255.0
+def _rgb_to_hex(rgb: Union[np.ndarray, Sequence[float]]) -> str:
+    """Format RGB channels in [0, 1] as an uppercase ``#RRGGBB`` string.
+    Channels are clipped to [0, 1], scaled by 255 and rounded to integers.
+    """
+    clipped = np.clip(np.asarray(rgb), 0, 1)
+    levels = [int(round(channel * 255)) for channel in clipped]
+    return "#{:02X}{:02X}{:02X}".format(*levels)
+def _ramp_palette(colors: Sequence[str], n: int) -> list:
+    """Stretch ``colors`` to ``n`` hex colors by per-channel linear interpolation.
+    The input colors are placed at evenly spaced positions on [0, 1] and each
+    RGB channel is interpolated at ``n`` evenly spaced positions. For
+    ``n == 1`` the first color is returned. Described in the original as the
+    equivalent of ``grDevices::colorRampPalette``.
+    """
+    anchors = np.stack([_hex_to_rgb(color) for color in colors])  # (k, 3)
+    if n == 1:
+        return [_rgb_to_hex(anchors[0])]
+    anchor_pos = np.linspace(0, 1, len(colors))
+    wanted_pos = np.linspace(0, 1, n)
+    # `anchors.T` yields the R, G and B channels in turn.
+    ramp = np.stack(
+        [np.interp(wanted_pos, anchor_pos, channel) for channel in anchors.T],
+        axis=1,
+    )
+    return [_rgb_to_hex(row) for row in ramp]
+def lighten(color: str, amount: float = 0.2) -> str:
+    """Lighten a hex color by moving its HLS lightness toward 1.
+    The lightness ``l`` becomes ``l + amount * (1 - l)``; hue and saturation
+    are kept. Described in the original as the analogue of
+    ``colorspace::lighten``.
+    """
+    import colorsys
+    hue, lightness, saturation = colorsys.rgb_to_hls(*_hex_to_rgb(color))
+    lightness = lightness + amount * (1 - lightness)
+    return _rgb_to_hex(colorsys.hls_to_rgb(hue, lightness, saturation))
+def darken(color: str, amount: float = 0.2) -> str:
+    """Darken a hex color by scaling its HLS lightness by ``1 - amount``.
+    Hue and saturation are kept. Described in the original as the analogue of
+    ``colorspace::darken``.
+    """
+    import colorsys
+    hue, lightness, saturation = colorsys.rgb_to_hls(*_hex_to_rgb(color))
+    lightness = lightness * (1 - amount)
+    return _rgb_to_hex(colorsys.hls_to_rgb(hue, lightness, saturation))
+def desaturate(color: str, amount: float = 0.16) -> str:
+    """Reduce the HLS saturation of a hex color by the factor ``1 - amount``.
+    The result is floored at 0; hue and lightness are kept. Described in the
+    original as the analogue of ``colorspace::desaturate``.
+    """
+    import colorsys
+    hue, lightness, saturation = colorsys.rgb_to_hls(*_hex_to_rgb(color))
+    saturation = max(0.0, saturation * (1 - amount))
+    return _rgb_to_hex(colorsys.hls_to_rgb(hue, lightness, saturation))
+def save_to_pdf(fig, pdf_file: Optional[str]) -> None:
+    """Write ``fig`` to ``pdf_file`` as a PDF; do nothing when ``pdf_file`` is None.
+    When ``fig`` is None, matplotlib's current figure is saved instead.
+    ``bbox_inches="tight"`` is passed so that legends placed outside the axes
+    are included rather than clipped (per the original note, which also says
+    every plotting function calls this when the user passes ``pdf_file=...``).
+    """
+    if pdf_file is None:
+        return
+    target = fig
+    if target is None:
+        # Imported lazily so matplotlib is only needed on this path.
+        import matplotlib.pyplot as plt
+        target = plt.gcf()
+    target.savefig(pdf_file, format="pdf", bbox_inches="tight")