lotsofcells 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,39 @@
1
+ """lotsofcells: proportion-test statistics and visualization on single-cell metadata.
2
+
3
+ Python port of the R package `lotsOfCells`, designed for the scanpy / AnnData
4
+ framework. Compatible with single-cell (`AnnData`) and spatial transcriptomics
5
+ (`SpatialData` / `MuData`) objects, since metadata is read from `.obs`.
6
+
7
+ References
8
+ ----------
9
+ Óscar González-Velasco; lotsOfCells: data visualization and statistics of
10
+ single cell metadata. bioRxiv 2024.05.23.595582;
11
+ https://doi.org/10.1101/2024.05.23.595582
12
+ """
13
+
14
+ from ._utils import get_metadata, get_palette
15
+ from .lotsofcells import lots_of_cells
16
+ from .entropy import entropy_score
17
+ from .plots import (
18
+ bar_chart,
19
+ waffle_chart,
20
+ polar_chart,
21
+ density_chart,
22
+ dynamics_chart,
23
+ plot_abundance_test,
24
+ )
25
+
26
+ __all__ = [
27
+ "get_metadata",
28
+ "get_palette",
29
+ "lots_of_cells",
30
+ "entropy_score",
31
+ "bar_chart",
32
+ "waffle_chart",
33
+ "polar_chart",
34
+ "density_chart",
35
+ "dynamics_chart",
36
+ "plot_abundance_test",
37
+ ]
38
+
39
+ __version__ = "0.3.0"
lotsofcells/_stats.py ADDED
@@ -0,0 +1,279 @@
1
+ """Internal statistical primitives.
2
+
3
+ Direct ports of the R helpers `cellToGamma`, `cellToGammaOriginal` and
4
+ `cellToMontecarlo`. Implementation choices (pseudocounts, transforms) match
5
+ the R version exactly so results are comparable.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from typing import Dict, List, Sequence, Tuple, Union
10
+
11
+ import numpy as np
12
+ import pandas as pd
13
+
14
+
15
+ # --- Transformations used everywhere ---------------------------------------------------
16
+
17
def pseudo_count(counts: np.ndarray) -> np.ndarray:
    """Shift raw counts by a fixed pseudocount of one half.

    Parameters
    ----------
    counts
        Count values; anything supporting ``+ 0.5`` (array or scalar).

    Returns
    -------
    np.ndarray
        ``counts + 0.5`` computed element-wise.

    Notes
    -----
    Kept equal to the pseudocount documented for the R file
    ``lotsOfCells.R`` so results stay comparable.
    """
    shifted = counts + 0.5
    return shifted
20
+
21
+
22
def pseudo_count_arcsin(counts: np.ndarray) -> np.ndarray:
    """Return ``counts + sqrt(counts**2 + 1)`` element-wise.

    Parameters
    ----------
    counts
        Count values (array or scalar).

    Returns
    -------
    np.ndarray
        The transformed counts; always strictly positive for real input
        because ``sqrt(c**2 + 1) > |c|``.

    Notes
    -----
    Kept equal to the pseudocount documented for the R file
    ``entropyScore.R`` so results stay comparable.
    """
    squared = counts * counts
    root = np.sqrt(squared + 1)
    return counts + root
25
+
26
+
27
def asrt(p: np.ndarray) -> np.ndarray:
    """Apply the arcsin square-root transform to proportions.

    Parameters
    ----------
    p
        Proportions. Values outside ``[0, 1]`` are clipped to the nearest
        bound first, so the square root and arcsine never see an
        out-of-domain argument.

    Returns
    -------
    np.ndarray
        ``arcsin(sqrt(p))`` in radians, i.e. values in ``[0, pi/2]``.
    """
    bounded = np.clip(p, 0, 1)
    roots = np.sqrt(bounded)
    return np.arcsin(roots)
30
+
31
+
32
def logit(f: np.ndarray) -> np.ndarray:
    """Return the log-odds ``log(f / (1 - f))`` element-wise.

    Parameters
    ----------
    f
        Fractions. No clipping is applied, so values must lie strictly
        inside ``(0, 1)`` for the result to be finite.

    Returns
    -------
    np.ndarray
        Natural-log odds of ``f``.
    """
    odds = f / (1 - f)
    return np.log(odds)
34
+
35
+
36
def geom_mean(x: np.ndarray) -> float:
    """Geometric mean of the strictly positive entries of ``x``.

    Entries that are not ``> 0`` (zeros, negatives, NaN) are dropped before
    averaging. If nothing remains, ``0.0`` is returned.

    Parameters
    ----------
    x
        Array-like of numbers; converted to a float array.

    Returns
    -------
    float
        ``exp(mean(log(x[x > 0])))``, or ``0.0`` when no entry is positive.

    Notes
    -----
    This deliberately differs from a literal ``exp(mean(log(x)))``, which
    collapses to 0 as soon as a single entry is 0. In the symmetric
    divergence used by the entropy score, a zero term ``|p * log2(p/q)|``
    means the two distributions agree on that cell type, so it should add
    nothing rather than wipe out the whole score. This matters most for the
    1-class test, where random partitions of one condition frequently agree
    exactly on cell types absent from both subsamples.
    """
    values = np.asarray(x, dtype=float)
    positive = values[values > 0]
    if positive.size:
        return float(np.exp(np.log(positive).mean()))
    # Every term vanished: the divergence really is zero.
    return 0.0
60
+
61
+
62
+ # --- Contingency tables --------------------------------------------------------------
63
+
64
def _table(groups: Sequence[str], covariable: Sequence[str]) -> pd.DataFrame:
    """Cross-tabulate group labels against covariable values.

    Python counterpart of R ``table(data.frame(groups, covariable))``.

    Parameters
    ----------
    groups
        One group label per observation; becomes the row axis (named
        ``"groups"``).
    covariable
        One covariable value per observation; becomes the column axis
        (named ``"covariable"``).

    Returns
    -------
    pd.DataFrame
        Contingency table of co-occurrence counts.
    """
    row_labels = pd.Series(groups, name="groups")
    col_labels = pd.Series(covariable, name="covariable")
    return pd.crosstab(row_labels, col_labels)
70
+
71
+
72
def _ensure_rows(tab: pd.DataFrame, label_order: Sequence[str]) -> pd.DataFrame:
    """Return ``tab`` with exactly the rows in ``label_order``, in that order.

    Labels missing from ``tab`` are added as all-zero rows; rows whose label
    is not listed in ``label_order`` are dropped by the final reindex.

    Parameters
    ----------
    tab
        Contingency table indexed by label.
    label_order
        Desired row labels, in output order.

    Returns
    -------
    pd.DataFrame
        A new frame; ``tab`` itself is not modified.
    """
    absent = [label for label in label_order if label not in tab.index]
    if not absent:
        return tab.reindex(label_order)
    zero_rows = pd.DataFrame(0, index=absent, columns=tab.columns)
    return pd.concat([tab, zero_rows]).reindex(label_order)
79
+
80
+
81
def _ensure_cols(tab: pd.DataFrame, indexes: Sequence[str]) -> pd.DataFrame:
    """Return ``tab`` with exactly the columns in ``indexes``, in that order.

    Columns missing from ``tab`` are added filled with zeros; columns not
    listed in ``indexes`` are dropped.

    Parameters
    ----------
    tab
        Contingency table whose columns are covariable values.
    indexes
        Desired column labels, in output order.

    Returns
    -------
    pd.DataFrame
        A new frame. The input is never modified: the earlier
        implementation assigned the missing columns directly onto ``tab``,
        which leaked extra columns into the caller's DataFrame.
    """
    # reindex builds a fresh frame, so the caller's table stays untouched.
    return tab.reindex(columns=list(indexes), fill_value=0)
87
+
88
+
89
+ # --- Goodman & Kruskal gamma rank correlation ----------------------------------------
90
+
91
def _ranked_proportions(
    tab: pd.DataFrame,
    label_order: Sequence[str],
    indexes: Sequence[str],
) -> np.ndarray:
    """Turn a contingency table into per-label ranks of covariable proportions.

    The table is first aligned to ``label_order`` (rows) and ``indexes``
    (columns). Each column is then divided by ``column_sum + 0.1``, and
    finally every row is ranked across its columns with average ties.

    Mirrors the R expression
    ``t(apply(dftmp,2,function(row){row/(sum(row)+0.1)}))[labelOrder, indexes]``
    followed by ``t(apply(.,1,rank))``.

    Parameters
    ----------
    tab
        Contingency table (labels x covariable values).
    label_order
        Row labels to keep, in order.
    indexes
        Column labels to keep, in order.

    Returns
    -------
    np.ndarray
        Rank matrix of shape ``(len(label_order), len(indexes))``.
    """
    aligned = _ensure_cols(_ensure_rows(tab, label_order), indexes)
    counts = aligned.values
    # The 0.1 offset keeps the denominator non-zero for empty columns.
    denominators = counts.sum(axis=0) + 0.1
    proportions = counts / denominators[np.newaxis, :]
    # Rank inside each label row, across covariable columns.
    return np.apply_along_axis(_rank_avg, 1, proportions)
111
+
112
+
113
def _rank_avg(x: np.ndarray) -> np.ndarray:
    """Rank a 1-D vector, giving tied values the mean of their ranks.

    Counterpart of R ``base::rank(x, ties.method='average')``.

    Parameters
    ----------
    x
        One-dimensional values to rank.

    Returns
    -------
    np.ndarray
        Float ranks starting at 1, same length as ``x``.
    """
    size = len(x)
    # Stable sort, then hand out ordinal ranks 1..n in sorted order.
    sorted_positions = np.argsort(x, kind="mergesort")
    ordinal = np.empty_like(sorted_positions, dtype=float)
    ordinal[sorted_positions] = np.arange(1, size + 1, dtype=float)
    # Group equal values and replace each rank by its group's mean rank.
    _, group_of, group_size = np.unique(x, return_inverse=True, return_counts=True)
    rank_totals = np.bincount(group_of, weights=ordinal, minlength=group_size.size)
    mean_rank = rank_totals / group_size
    return mean_rank[group_of]
124
+
125
+
126
def _concordant_discordant(
    ranks: np.ndarray, rank_index: np.ndarray, original: bool
) -> Tuple[np.ndarray, np.ndarray]:
    """Count concordant and discordant label pairs for every covariable column.

    Every pair of rows ``(i, k)`` with ``i < k`` is inspected. The sign of
    ``ranks[i] - ranks[k]`` is compared with an expected sign:

    * ``original=False`` (random/null): the expected sign is always ``-1``,
      matching the R ``cellToGamma`` assumption of a monotonic ``1..N``
      ordering.
    * ``original=True``: the expected sign is
      ``sign(rank_index[i] - rank_index[k])``.

    A pair is concordant when the signs match. It is discordant when they
    differ *and* the two ranks are not tied.

    Parameters
    ----------
    ranks
        Rank matrix of shape ``(n_labels, n_cov)``.
    rank_index
        Reference ordering of the labels; only read when ``original`` is
        true.
    original
        Selects which expected sign pattern is used (see above).

    Returns
    -------
    (np.ndarray, np.ndarray)
        Integer arrays of length ``n_cov``: concordant and discordant counts.
    """
    n_rows, n_cols = ranks.shape
    concordant = np.zeros(n_cols, dtype=int)
    discordant = np.zeros(n_cols, dtype=int)
    for top in range(n_rows - 1):
        # Sign of (this row - every later row), shape (rest, n_cov).
        observed = np.sign(ranks[top][np.newaxis, :] - ranks[top + 1 :])
        if original:
            reference_gap = rank_index[top] - rank_index[top + 1 :]
            expected = np.sign(reference_gap)[:, np.newaxis]  # (rest, 1)
        else:
            expected = -1
        agrees = observed == expected
        concordant += np.sum(agrees, axis=0)
        # Tied ranks (sign 0) never count as discordant.
        discordant += np.sum(~agrees & (observed != 0), axis=0)
    return concordant, discordant
156
+
157
+
158
def cell_to_gamma(
    covariable: np.ndarray,
    groups: np.ndarray,
    label_order: Sequence[str],
    indexes: Sequence[str],
    cell_crowd: Dict[str, int],
    rank_index: np.ndarray,
    rng: np.random.Generator,
) -> Tuple[np.ndarray, np.ndarray]:
    """Draw one sample from the random null and count rank agreements.

    For every label, ``cell_crowd[label]`` covariable values are drawn with
    replacement from the *whole* ``covariable`` vector, ignoring group
    membership. The pooled draws are tabulated, ranked, and scored against
    the fixed ``-1`` sign pattern (``original=False``).

    Parameters
    ----------
    covariable
        Covariable value of every cell.
    groups
        Group label of every cell. Accepted for signature parity with
        ``cell_to_gamma_original``; not read here.
    label_order
        Ordered group labels.
    indexes
        Covariable values to report, in column order.
    cell_crowd
        Number of cells to draw per label.
    rank_index
        Forwarded to the pair counter.
    rng
        NumPy random generator used for the draws.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Concordant and discordant pair counts, one entry per covariable
        column.
    """
    drawn_values, drawn_labels = [], []
    for label in label_order:
        size = int(cell_crowd[label])
        # Sampling from the full pool is what mixes the groups together.
        drawn_values.append(rng.choice(covariable, size=size, replace=True))
        drawn_labels.append(np.repeat(label, size))
    all_values = np.concatenate(drawn_values)
    all_labels = np.concatenate(drawn_labels)
    ranked = _ranked_proportions(_table(all_labels, all_values), label_order, indexes)
    return _concordant_discordant(ranked, rank_index, original=False)
182
+
183
+
184
def cell_to_gamma_original(
    covariable: np.ndarray,
    groups: np.ndarray,
    label_order: Sequence[str],
    indexes: Sequence[str],
    cell_crowd: Dict[str, int],
    rank_index: np.ndarray,
    rng: np.random.Generator,
) -> Tuple[np.ndarray, np.ndarray]:
    """Subsample the observed data within each group and count rank agreements.

    For every label, ``cell_crowd[label]`` covariable values are drawn from
    the cells that actually belong to that label. Sampling is without
    replacement unless more cells are requested than the group holds.
    Labels with no cells are skipped. The draws are tabulated, ranked, and
    scored against the sign pattern of ``rank_index`` (``original=True``).

    Parameters
    ----------
    covariable
        Covariable value of every cell.
    groups
        Group label of every cell, aligned with ``covariable``.
    label_order
        Ordered group labels.
    indexes
        Covariable values to report, in column order.
    cell_crowd
        Number of cells to draw per label.
    rank_index
        Reference ordering of the labels used to decide concordance.
    rng
        NumPy random generator used for the draws.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Concordant and discordant pair counts, one entry per covariable
        column.
    """
    drawn_values, drawn_labels = [], []
    for label in label_order:
        size = int(cell_crowd[label])
        candidates = covariable[groups == label]
        if len(candidates) > 0:
            # Only fall back to replacement when the group is too small.
            with_replacement = size > len(candidates)
            drawn_values.append(
                rng.choice(candidates, size=size, replace=with_replacement)
            )
            drawn_labels.append(np.repeat(label, size))
    all_values = np.concatenate(drawn_values)
    all_labels = np.concatenate(drawn_labels)
    ranked = _ranked_proportions(_table(all_labels, all_values), label_order, indexes)
    return _concordant_discordant(ranked, rank_index, original=True)
209
+
210
+
211
+ # --- Monte Carlo for 2-condition fold-change -----------------------------------------
212
+
213
def _proportions_from_table(
    tab: pd.DataFrame,
    label_order: Sequence[str],
    indexes: Sequence[str],
    pseudo: bool = True,
) -> np.ndarray:
    """Convert a contingency table into column-wise proportions.

    The table is aligned to ``label_order`` (rows) and ``indexes``
    (columns), optionally shifted by ``pseudo_count``, and each column is
    divided by ``column_sum + 1.0``.

    Parameters
    ----------
    tab
        Contingency table (labels x covariable values).
    label_order
        Row labels to keep, in order.
    indexes
        Column labels to keep, in order.
    pseudo
        When true, apply ``pseudo_count`` to the counts before normalising.

    Returns
    -------
    np.ndarray
        Float matrix of shape ``(len(label_order), len(indexes))``.
    """
    aligned = _ensure_cols(_ensure_rows(tab, label_order), indexes)
    counts = aligned.values.astype(float)
    counts = pseudo_count(counts) if pseudo else counts
    denominators = counts.sum(axis=0) + 1.0
    return counts / denominators[np.newaxis, :]
227
+
228
+
229
def cell_to_montecarlo(
    covariable: np.ndarray,
    groups: np.ndarray,
    label_order: Sequence[str],
    indexes: Sequence[str],
    cell_crowd: Union[Dict[str, int], Dict[str, List[int]]],
    rng: np.random.Generator,
) -> Tuple[np.ndarray, np.ndarray]:
    """Run one Monte Carlo draw of the two-condition fold change.

    Two samples are built per label, both drawn with replacement:

    * *mixed* - from the whole ``covariable`` vector (null hypothesis);
    * *original* - from the cells of that label only (empty if the label
      has no cells).

    Each sample set is tabulated and converted to pseudocounted
    proportions. The fold change is ``log2(asrt(p1) / asrt(p2))``, where
    ``p1`` and ``p2`` are the rows of the first and second label in
    ``label_order``.

    Parameters
    ----------
    covariable
        Covariable value of every cell.
    groups
        Group label of every cell, aligned with ``covariable``.
    label_order
        Ordered group labels; the first two define the fold change.
    indexes
        Covariable values to report, in output order.
    cell_crowd
        Per label, either one sample size or a list/array of sizes. With a
        list, one draw is made per size and the draws are concatenated.
    rng
        NumPy random generator used for the draws.

    Returns
    -------
    (np.ndarray, np.ndarray)
        ``(mixed fold change, original fold change)``, each of length
        ``len(indexes)``.
    """

    def _draw(pool, crowd):
        # Several sizes -> one draw per size, pooled together.
        if isinstance(crowd, (list, np.ndarray)):
            chunks = [
                rng.choice(pool, size=int(size), replace=True)
                for size in np.asarray(crowd, dtype=int)
            ]
            return np.concatenate(chunks)
        return rng.choice(pool, size=int(crowd), replace=True)

    def _log2_ratio(proportions):
        return np.log2(asrt(proportions[0]) / asrt(proportions[1]))

    mixed_values, mixed_labels = [], []
    own_values, own_labels = [], []
    for label in label_order:
        # The mixed draw must come first to keep the random stream order.
        shuffled = _draw(covariable, cell_crowd[label])
        members = covariable[groups == label]
        if len(members) == 0:
            resampled = np.array([], dtype=covariable.dtype)
        else:
            resampled = _draw(members, cell_crowd[label])
        mixed_values.append(shuffled)
        mixed_labels.append(np.repeat(label, len(shuffled)))
        own_values.append(resampled)
        own_labels.append(np.repeat(label, len(resampled)))

    mixed_table = _table(np.concatenate(mixed_labels), np.concatenate(mixed_values))
    own_table = _table(np.concatenate(own_labels), np.concatenate(own_values))

    mixed_props = _proportions_from_table(mixed_table, label_order, indexes, pseudo=True)
    own_props = _proportions_from_table(own_table, label_order, indexes, pseudo=True)

    return _log2_ratio(mixed_props), _log2_ratio(own_props)
lotsofcells/_utils.py ADDED
@@ -0,0 +1,213 @@
1
+ """Internal helpers: metadata extraction and color palette."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Optional, Sequence, Union
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ # Default categorical palette — scanpy `default_20` (Vega-20 with
10
+ # yellow/green hues swapped for higher contrast). The de facto standard in
11
+ # the single-cell community, so colours feel native to scanpy figures.
12
+ _DEFAULT_PALETTE = [
13
+ "#1F77B4", "#FF7F0E", "#279E68", "#D62728", "#AA40FC",
14
+ "#8C564B", "#E377C2", "#B5BD61", "#17BECF", "#AEC7E8",
15
+ "#FFBB78", "#98DF8A", "#FF9896", "#C5B0D5", "#C49C94",
16
+ "#F7B6D2", "#DBDB8D", "#9EDAE5", "#AD494A", "#8C6D31",
17
+ ]
18
+
19
+
20
def _is_anndata(obj) -> bool:
    """Tell whether ``obj`` is, or behaves like, an AnnData.

    Returns ``True`` for a real ``anndata.AnnData`` instance when the
    ``anndata`` package can be imported, and otherwise for any object whose
    ``obs`` attribute is a ``pandas.DataFrame``.
    """
    try:
        import anndata  # noqa: F401
    except Exception:
        # anndata is optional; fall through to the duck-typing check.
        pass
    else:
        if isinstance(obj, anndata.AnnData):
            return True
    return isinstance(getattr(obj, "obs", None), pd.DataFrame)
29
+
30
+
31
def _is_spatialdata(obj) -> bool:
    """Tell whether ``obj`` is a ``spatialdata.SpatialData`` instance.

    Any failure while importing ``spatialdata`` or performing the check is
    treated as "not a SpatialData" and yields ``False``.
    """
    verdict = False
    try:
        import spatialdata  # type: ignore

        verdict = isinstance(obj, spatialdata.SpatialData)
    except Exception:
        # Optional dependency missing or unusable.
        verdict = False
    return verdict
37
+
38
+
39
def _is_mudata(obj) -> bool:
    """Tell whether ``obj`` is a ``mudata.MuData`` instance.

    Any failure while importing ``mudata`` or performing the check is
    treated as "not a MuData" and yields ``False``.
    """
    verdict = False
    try:
        import mudata  # type: ignore

        verdict = isinstance(obj, mudata.MuData)
    except Exception:
        # Optional dependency missing or unusable.
        verdict = False
    return verdict
45
+
46
+
47
def get_metadata(sc_object, table: Optional[str] = None) -> pd.DataFrame:
    """Extract a copy of the cell-level metadata from a supported container.

    Parameters
    ----------
    sc_object
        One of ``pandas.DataFrame``, ``spatialdata.SpatialData``,
        ``mudata.MuData`` or ``anndata.AnnData`` (or any object exposing a
        DataFrame ``.obs``). The checks are made in that order.
    table
        * ``SpatialData``: name of the table whose ``.obs`` is returned.
          If ``None`` and there is exactly one table, that table is used.
        * ``MuData``: modality name. If ``None``, the top-level ``.obs`` of
          the MuData object is returned.
        * Ignored for DataFrame and AnnData input.

    Returns
    -------
    pd.DataFrame
        A copy of the metadata; the source object is left untouched.

    Raises
    ------
    ValueError
        If ``sc_object`` is ``None``, if a SpatialData has no tables, or if
        it has several tables and ``table`` was not given.
    TypeError
        If ``sc_object`` is none of the supported types.
    """
    if sc_object is None:
        raise ValueError("At least an AnnData/SpatialData/DataFrame is required.")

    # A plain DataFrame already is the metadata.
    if isinstance(sc_object, pd.DataFrame):
        return sc_object.copy()

    if _is_spatialdata(sc_object):
        tables = dict(sc_object.tables)
        if not tables:
            raise ValueError("SpatialData object has no tables.")
        if table is None and len(tables) > 1:
            raise ValueError(
                f"SpatialData has multiple tables {list(tables)}; "
                "specify `table=...`."
            )
        chosen = next(iter(tables)) if table is None else table
        return tables[chosen].obs.copy()

    if _is_mudata(sc_object):
        holder = sc_object if table is None else sc_object[table]
        return holder.obs.copy()

    if _is_anndata(sc_object):
        return sc_object.obs.copy()

    raise TypeError(
        "Unsupported object type for metadata extraction. "
        "Pass a pandas.DataFrame or AnnData/MuData/SpatialData."
    )
94
+
95
+
96
def get_numerical_variable(
    sc_object, numerical_variable: str, metadata: pd.DataFrame
) -> np.ndarray:
    """Look up a numeric variable in the metadata or among the features.

    Resolution order:

    1. If ``numerical_variable`` is a column of ``metadata``, that column is
       returned.
    2. Otherwise, if ``sc_object`` passes ``_is_anndata`` and the name is in
       its ``var_names``, the matching column of ``.X`` is returned,
       reordered to follow ``metadata.index``.

    Parameters
    ----------
    sc_object
        Source object; only consulted when the metadata lacks the column.
    numerical_variable
        Metadata column name or feature (e.g. gene) name.
    metadata
        Cell-level metadata whose index defines the output order.

    Returns
    -------
    np.ndarray
        One value per row of ``metadata``.

    Raises
    ------
    ValueError
        If the name is found neither in the metadata nor in the features.
    """
    if numerical_variable in metadata.columns:
        return metadata[numerical_variable].to_numpy()

    if _is_anndata(sc_object) and numerical_variable in sc_object.var_names:
        position = sc_object.var_names.get_loc(numerical_variable)
        column = sc_object.X[:, position]
        # Sparse slices expose `toarray`; dense ones go through asarray.
        if hasattr(column, "toarray"):
            dense = column.toarray()
        else:
            dense = np.asarray(column)
        per_cell = pd.Series(dense.ravel(), index=sc_object.obs_names)
        # Reorder to the metadata rows.
        return per_cell.loc[metadata.index].to_numpy()

    raise ValueError(
        f"Variable '{numerical_variable}' not found in metadata columns "
        "or feature names."
    )
127
+
128
+
129
def get_palette(
    use_palette: Optional[Sequence[str]] = None, n_colors: int = 20
) -> list:
    """Return a list of exactly ``n_colors`` colors.

    Parameters
    ----------
    use_palette
        Base colors as hex strings. If ``None``, the module's default
        palette is used.
    n_colors
        Number of colors wanted; must be non-negative.

    Returns
    -------
    list
        The first ``n_colors`` base colors when enough are available.
        Otherwise a ramp of ``n_colors`` colors obtained by linear
        interpolation between the base colors (analogue of R
        ``colorRampPalette``).

    Raises
    ------
    ValueError
        If ``n_colors`` is negative, or if colors are requested from an
        empty ``use_palette``.
    """
    # A negative count would otherwise slice `base[:n_colors]` from the end
    # and silently return the wrong number of colors.
    if n_colors < 0:
        raise ValueError(f"n_colors must be non-negative, got {n_colors}.")
    base = list(use_palette) if use_palette is not None else list(_DEFAULT_PALETTE)
    if n_colors <= len(base):
        return base[:n_colors]
    if not base:
        raise ValueError("Cannot build a palette from an empty `use_palette`.")
    return _ramp_palette(base, n_colors)
142
+
143
+
144
def _hex_to_rgb(h: str) -> np.ndarray:
    """Convert a hex color string to an RGB triple in ``[0, 1]``.

    Parameters
    ----------
    h
        Hex color, with or without a leading ``#``. Both the full form
        (``"#RRGGBB"``) and the shorthand forms (``"#RGB"``, ``"#RGBA"``)
        are accepted; shorthand digits are doubled (``"#f80"`` ->
        ``"#ff8800"``). Only the first three channels are read, so any
        alpha component is ignored.

    Returns
    -------
    np.ndarray
        Float array ``[r, g, b]`` with each channel scaled by ``1/255``.

    Raises
    ------
    ValueError
        If a channel cannot be parsed as hexadecimal.
    """
    digits = h.lstrip("#")
    # Shorthand used to fail with an opaque int() error; expand it instead.
    if len(digits) in (3, 4):
        digits = "".join(ch * 2 for ch in digits)
    channels = [int(digits[start : start + 2], 16) for start in (0, 2, 4)]
    return np.array(channels, dtype=float) / 255.0
147
+
148
+
149
def _rgb_to_hex(rgb: Union[np.ndarray, Sequence[float]]) -> str:
    """Format an RGB triple in ``[0, 1]`` as an upper-case ``#RRGGBB`` string.

    Parameters
    ----------
    rgb
        Red, green and blue channels. Values outside ``[0, 1]`` are clipped
        before scaling to ``0..255`` and rounding.

    Returns
    -------
    str
        Hex color such as ``"#1F77B4"``.
    """
    bounded = np.clip(np.asarray(rgb), 0, 1)
    scaled = [int(round(channel * 255)) for channel in bounded]
    return "#{:02X}{:02X}{:02X}".format(*scaled)
152
+
153
+
154
def _ramp_palette(colors: Sequence[str], n: int) -> list:
    """Interpolate ``n`` colors along a sequence of hex colors.

    Analogue of ``grDevices::colorRampPalette``: the input colors are
    spread evenly over ``[0, 1]`` and each RGB channel is linearly
    interpolated at ``n`` evenly spaced positions.

    Parameters
    ----------
    colors
        Base colors as hex strings.
    n
        Number of output colors.

    Returns
    -------
    list
        ``n`` hex color strings.
    """
    anchors = np.stack([_hex_to_rgb(color) for color in colors])  # (k, 3)
    if n == 1:
        return [_rgb_to_hex(anchors[0])]
    anchor_pos = np.linspace(0, 1, len(colors))
    sample_pos = np.linspace(0, 1, n)
    per_channel = [
        np.interp(sample_pos, anchor_pos, anchors[:, channel])
        for channel in range(3)
    ]
    ramp = np.stack(per_channel, axis=1)  # (n, 3)
    return [_rgb_to_hex(row) for row in ramp]
165
+
166
+
167
def lighten(color: str, amount: float = 0.2) -> str:
    """Lighten a hex color by moving its HLS lightness toward 1.

    Analogue of ``colorspace::lighten``.

    Parameters
    ----------
    color
        Hex color string.
    amount
        Fraction (0..1) of the remaining distance to full lightness to
        cover: ``l_new = l + amount * (1 - l)``.

    Returns
    -------
    str
        The lightened color as a hex string.
    """
    import colorsys

    hue, lightness, saturation = colorsys.rgb_to_hls(*_hex_to_rgb(color))
    lightness += amount * (1 - lightness)
    return _rgb_to_hex(colorsys.hls_to_rgb(hue, lightness, saturation))
176
+
177
+
178
def darken(color: str, amount: float = 0.2) -> str:
    """Darken a hex color by scaling down its HLS lightness.

    Analogue of ``colorspace::darken``.

    Parameters
    ----------
    color
        Hex color string.
    amount
        Fraction (0..1) of the lightness to remove:
        ``l_new = l * (1 - amount)``.

    Returns
    -------
    str
        The darkened color as a hex string.
    """
    import colorsys

    hue, lightness, saturation = colorsys.rgb_to_hls(*_hex_to_rgb(color))
    lightness *= 1 - amount
    return _rgb_to_hex(colorsys.hls_to_rgb(hue, lightness, saturation))
187
+
188
+
189
def desaturate(color: str, amount: float = 0.16) -> str:
    """Reduce the HLS saturation of a hex color.

    Analogue of ``colorspace::desaturate``.

    Parameters
    ----------
    color
        Hex color string.
    amount
        Fraction of the saturation to remove:
        ``s_new = max(0, s * (1 - amount))``.

    Returns
    -------
    str
        The desaturated color as a hex string.
    """
    import colorsys

    hue, lightness, saturation = colorsys.rgb_to_hls(*_hex_to_rgb(color))
    # Floor at zero so an amount above 1 cannot yield negative saturation.
    saturation = max(0.0, saturation * (1 - amount))
    return _rgb_to_hex(colorsys.hls_to_rgb(hue, lightness, saturation))
198
+
199
+
200
def save_to_pdf(fig, pdf_file: Optional[str]) -> None:
    """Write a matplotlib figure to a PDF file when a path is given.

    Parameters
    ----------
    fig
        Figure to save. If ``None``, matplotlib's current figure
        (``plt.gcf()``) is used.
    pdf_file
        Destination path. If ``None``, nothing happens.

    Notes
    -----
    The figure is saved with ``bbox_inches="tight"`` so that legends placed
    outside the axes are included rather than clipped.
    """
    if pdf_file is not None:
        target = fig
        if target is None:
            # Imported lazily so matplotlib is only needed when saving.
            import matplotlib.pyplot as plt

            target = plt.gcf()
        target.savefig(pdf_file, format="pdf", bbox_inches="tight")