lotsofcells 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lotsofcells/__init__.py +39 -0
- lotsofcells/_stats.py +279 -0
- lotsofcells/_utils.py +211 -0
- lotsofcells/entropy.py +354 -0
- lotsofcells/lotsofcells.py +330 -0
- lotsofcells/plots.py +681 -0
- lotsofcells-0.3.0.dist-info/METADATA +21 -0
- lotsofcells-0.3.0.dist-info/RECORD +10 -0
- lotsofcells-0.3.0.dist-info/WHEEL +5 -0
- lotsofcells-0.3.0.dist-info/top_level.txt +1 -0
lotsofcells/entropy.py
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
"""Symmetric divergence (KL-based) entropy score, plus the 1-class abundance test."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Optional, Sequence
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from ._stats import (
|
|
10
|
+
_ensure_cols,
|
|
11
|
+
_ensure_rows,
|
|
12
|
+
_table,
|
|
13
|
+
geom_mean,
|
|
14
|
+
pseudo_count_arcsin,
|
|
15
|
+
)
|
|
16
|
+
from ._utils import get_metadata
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _proportions_arcsin(
    tab: pd.DataFrame, label_order: Sequence[str], indexes: Sequence[str]
) -> np.ndarray:
    """Return per-group proportions across covariables (each row sums to 1).

    The table is aligned to ``label_order`` on the rows and ``indexes`` on the
    columns, its values are cast to float and passed through
    ``pseudo_count_arcsin``, and every row is then divided by its own total.

    Mirrors the R `entropyScore` normalisation. Note: in R the *random*
    contig table is built from `data.frame(covariable, groups)` (covariable
    first) so `table()` produces shape (ncov, ngroups) and the code applies
    `apply(., 2, row/sum(row))` followed by `t()` — which is mathematically
    equivalent to row-normalising on a (ngroups, ncov) matrix. Since
    `pd.crosstab(groups, covariable)` already returns (ngroups, ncov) here,
    a single function works for both observed and random tables.
    """
    aligned = _ensure_cols(_ensure_rows(tab, label_order), indexes)
    counts = pseudo_count_arcsin(aligned.values.astype(float))
    # keepdims keeps the totals as a column so the division broadcasts row-wise.
    return counts / counts.sum(axis=1, keepdims=True)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _distance_surprise(p: np.ndarray, q: np.ndarray) -> float:
    """Return the symmetric divergence between two proportion vectors.

    The score is the sum of two terms, one per direction: the ``geom_mean`` of
    ``|p * log2(p / q)|`` plus the ``geom_mean`` of ``|q * log2(q / p)|``.
    """
    p_vs_q = np.abs(p * np.log2(p / q))
    q_vs_p = np.abs(q * np.log2(q / p))
    return geom_mean(p_vs_q) + geom_mean(q_vs_p)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def entropy_score(
    sc_object,
    main_variable: str,
    subtype_variable: str,
    label_order: Sequence[str],
    sample_id: Optional[str] = None,
    permutations: int = 1000,
    seed: Optional[int] = None,
    table: Optional[str] = None,
    plot: bool = True,
    verbose: bool = True,
    pdf_file: Optional[str] = None,
):
    """Symmetric divergence score for global proportion dysregulation between 2 groups.

    Parameters
    ----------
    sc_object
        Object handed to ``get_metadata`` to obtain the per-cell metadata table.
    main_variable
        Metadata column holding the group labels.
    subtype_variable
        Metadata column holding the covariable whose proportions are compared.
    label_order
        One or two labels of ``main_variable``. Two labels run the 2-group
        score; one label runs the 1-class test.
    sample_id
        Optional metadata column with sample IDs. Required in 1-class mode.
        In 2-group mode it makes the null draw one batch per sample
        (size ``int(sqrt(n_cells))``) instead of one batch per group.
    permutations
        Number of Monte-Carlo iterations of the null.
    seed
        Seed for ``numpy.random.default_rng``.
    table
        Forwarded to ``get_metadata``.
    plot
        If True, try to draw the summary figure; a plotting failure is
        reported (when ``verbose``) and does not abort the computation.
    verbose
        If True, print progress messages.
    pdf_file
        Forwarded to the plotting helper.

    Returns
    -------
    A `pandas.Series` with per-covariable relative entropies plus the
    summary fields (``entropy_score``, ``p.val``, ``mean.random.entropy``,
    ``sd.random.entropy``).

    If ``len(label_order) == 1``, runs the 1-class permutation test on
    ``sample_id`` (analogue of the R `oneClassTest`) and returns a small
    summary dict instead.

    Raises
    ------
    ValueError
        If a label is absent from the data, ``label_order`` is empty or has
        more than 2 entries, or ``sample_id`` is missing in 1-class mode.
    """
    metadata = get_metadata(sc_object, table=table)

    main_vals = metadata[main_variable].astype(str).to_numpy()
    # Collect the observed labels once; re-running np.unique for every
    # requested label rescans (and re-sorts) the whole column each time.
    present = set(main_vals.tolist())
    missing = [l for l in label_order if l not in present]
    if missing:
        raise ValueError(f"Some groups in label_order not in data: {missing}")

    metadata = metadata.loc[np.isin(main_vals, list(label_order))].copy()
    groups = metadata[main_variable].astype(str).to_numpy()
    covariable = metadata[subtype_variable].astype(str).to_numpy()
    rng = np.random.default_rng(seed)

    if len(label_order) == 0:
        raise ValueError("label_order must be specified.")

    if len(label_order) == 1:
        if sample_id is None:
            raise ValueError("In 1-class mode you must specify `sample_id`.")
        return _one_class_test(
            metadata,
            sample_id,
            covariable,
            permutations,
            rng,
            plot=plot,
            verbose=verbose,
            pdf_file=pdf_file,
        )

    if len(label_order) > 2:
        raise ValueError(
            f"Only 2 labels are allowed for entropy estimation, got "
            f"{len(label_order)}: {label_order}"
        )

    if verbose:
        print(
            "Computing entropy proportion over covariables for groups: "
            f"{label_order[0]} vs {label_order[1]}"
        )
    obs_tab = _table(groups, covariable)
    indexes = list(obs_tab.columns)
    contig = _proportions_arcsin(obs_tab, label_order, indexes)

    # Per-covariable relative entropies. The R code evaluates
    # abs(log2((x[1]*log2(x[2]))/(x[1]*log2(x[1])))) for each covariable, with
    # x[1]/x[2] the first/second group's proportion; here that is one
    # vectorised expression over the columns of `contig`.
    first, second = contig[0], contig[1]
    with np.errstate(divide="ignore", invalid="ignore"):
        rel_entropies = np.abs(
            np.log2((first * np.log2(second)) / (first * np.log2(first)))
        )

    obs_score = _distance_surprise(contig[0], contig[1])

    # Build the cell-crowd for null sampling: for every label, the list of
    # batch sizes to draw. Always a list, so the loop below has one code path.
    if sample_id is not None:
        samples = metadata[sample_id].astype(str).to_numpy()
        n_per_sample = np.sqrt(
            pd.crosstab(pd.Series(groups), pd.Series(samples)).reindex(label_order)
        )
        cell_crowd = {}
        for cond in label_order:
            row = n_per_sample.loc[cond]
            # One batch per sample that has cells in this condition.
            cell_crowd[cond] = [int(v) for v in row[row != 0].astype(int).to_numpy()]
    else:
        counts = pd.Series(groups).value_counts().to_dict()
        cell_crowd = {
            l: [int(round(np.sqrt(counts.get(l, 0))))] for l in label_order
        }

    if verbose:
        print(f"Starting Monte-Carlo simulation with n. permutations: {permutations}")

    null_scores = np.empty(permutations)
    for i in range(permutations):
        pieces_cov, pieces_grp = [], []
        for label in label_order:
            for n in cell_crowd[label]:
                # Null hypothesis: every group draws from the pooled covariable.
                draw = rng.choice(covariable, size=n, replace=True)
                pieces_cov.append(draw)
                pieces_grp.append(np.repeat(label, len(draw)))
        rand_tab = _table(np.concatenate(pieces_grp), np.concatenate(pieces_cov))
        p = _proportions_arcsin(rand_tab, label_order, indexes)
        null_scores[i] = _distance_surprise(p[0], p[1])

    # Fraction of null scores at least as extreme as the observed one.
    p_val = float((null_scores >= obs_score).sum() / permutations)

    if plot:
        try:
            _plot_entropy(
                contig=contig,
                indexes=indexes,
                label_order=label_order,
                obs_score=obs_score,
                null_scores=null_scores,
                p_val=p_val,
                subtype_variable=subtype_variable,
                pdf_file=pdf_file,
            )
        except Exception as e:  # noqa: BLE001
            # Plotting is best-effort; the numeric result is still returned.
            if verbose:
                print(f"(Plot skipped: {e})")

    out = pd.Series(rel_entropies, index=indexes)
    out["entropy_score"] = obs_score
    out["p.val"] = p_val
    out["mean.random.entropy"] = float(null_scores.mean())
    out["sd.random.entropy"] = float(null_scores.std(ddof=1))
    return out
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _plot_entropy(
    contig, indexes, label_order, obs_score, null_scores, p_val,
    subtype_variable, pdf_file=None,
):
    """Draw the 2-group entropy figure and pass it to ``save_to_pdf``.

    Left panel: grouped bars of ``contig`` (one bar series per label in
    ``label_order``, one tick per entry of ``indexes``). Right panel: the null
    scores as a jittered strip with a line at their median and the observed
    score highlighted.
    """
    import matplotlib.pyplot as plt
    from ._utils import save_to_pdf

    fig, (bar_ax, null_ax) = plt.subplots(
        1, 2, figsize=(12, 5), gridspec_kw={"width_ratios": [3, 1]}
    )

    # --- left panel: proportions per covariable, side by side per label ---
    bar_width = 0.35
    positions = np.arange(len(indexes))
    colours = ["#9ECAE1", "#3182BD"]
    for row, label in enumerate(label_order):
        # (row - 0.5) * width places the two series either side of the tick.
        bar_ax.bar(
            positions + (row - 0.5) * bar_width,
            contig[row],
            bar_width,
            label=label,
            color=colours[row],
        )
    bar_ax.set_xticks(positions)
    bar_ax.set_xticklabels(indexes, rotation=45, ha="right")
    bar_ax.set_ylabel("proportion")
    bar_ax.set_title(
        f"Symmetric Divergence Score: {obs_score:.3f} | p.val.adj: {p_val:.3f}"
    )
    bar_ax.legend(title=f"Class: {subtype_variable}")

    # --- right panel: null distribution vs. observed score ---
    # Fixed seed: the horizontal jitter is cosmetic and should not vary between runs.
    jitter_rng = np.random.default_rng(0)
    offsets = jitter_rng.uniform(-0.1, 0.1, size=len(null_scores))
    null_ax.scatter(offsets, null_scores, color="#D5BADB", alpha=0.5, s=15)
    null_ax.axhline(np.median(null_scores), color="#86608E", lw=1)
    null_ax.scatter([0], [obs_score], color="#F08080", s=80, zorder=5)
    null_ax.set_xticks([])
    null_ax.set_ylabel("symmetric divergence")

    plt.tight_layout()
    save_to_pdf(fig, pdf_file)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _one_class_test(
    metadata,
    sample_id,
    covariable,
    permutations,
    rng,
    plot=True,
    verbose=True,
    pdf_file=None,
):
    """Permutation test for sample-level proportion variation in a single condition.

    Each iteration randomly splits the samples into two pseudo-groups, draws
    ``max(int(sqrt(n_cells)), 1)`` covariable values per sample (with
    replacement) and scores the two pseudo-groups with the symmetric
    divergence. The spread of those scores is summarised as a coefficient of
    variation, labelled "Low" (<= 35), "Medium" (<= 50) or "High".

    Departs from R's `oneClassTest` in one important way: the null draws each
    sample's cells from THAT SAMPLE'S own covariable distribution, not from
    the global pool. The R version sampled every cell from the global pool,
    which collapses both random pseudo-groups onto the same global
    distribution and produces a null that is essentially zero — so the user
    never observes any spread no matter how heterogeneous the real samples
    are. Drawing from per-sample pools preserves real per-sample structure
    and lets random partitions of those samples yield a null distribution
    whose spread reflects across-sample heterogeneity, which is what this
    test is meant to assess.

    Returns
    -------
    dict with keys ``cv``, ``variation``, ``relative_iqr``,
    ``mean.random.entropy``, ``sd.random.entropy``, ``median.random.entropy``.

    Raises
    ------
    ValueError
        If fewer than 2 distinct samples are present in ``sample_id``.
    """
    samples = metadata[sample_id].astype(str).to_numpy()
    obs_tab = _table(samples, covariable)
    indexes = list(obs_tab.columns)
    n_per_sample = pd.Series(samples).value_counts()

    unique_samples = list(n_per_sample.index)
    if len(unique_samples) < 2:
        raise ValueError(
            "1-class entropy test needs at least 2 samples in `sample_id`."
        )

    sqrt_n = np.sqrt(n_per_sample)
    sqrt_n[sqrt_n == 0] = 10
    # Draw size per sample, fixed for the whole simulation (at least 1 cell).
    draw_sizes = {s: max(int(v), 1) for s, v in sqrt_n.to_dict().items()}

    # Per-sample pool of covariable values (preserves the real cell
    # composition of each sample for the null draw).
    sample_pools = {s: covariable[samples == s] for s in unique_samples}
    n_g1 = max(1, round(len(unique_samples) / 2))

    # Mirror R's iteration count: seq(100) * seq(permutations/10) = 10*perms.
    n_iter = max(int(permutations) * 10, 100)
    null_scores = np.empty(n_iter)
    if verbose:
        print(f"Starting 1-class Monte-Carlo simulation: {n_iter} iterations")

    for i in range(n_iter):
        perm = rng.permutation(len(unique_samples))
        # The first n_g1 shuffled samples form group1, the remainder group2.
        split = (("group1", perm[:n_g1]), ("group2", perm[n_g1:]))
        pieces_cov, pieces_grp = [], []
        for tag, picks in split:
            for k in picks:
                s = unique_samples[k]
                pool = sample_pools[s]
                if len(pool) == 0:
                    continue
                n = draw_sizes[s]
                pieces_cov.append(rng.choice(pool, size=n, replace=True))
                pieces_grp.append(np.repeat(tag, n))
        rand_tab = _table(np.concatenate(pieces_grp), np.concatenate(pieces_cov))
        p = _proportions_arcsin(rand_tab, ["group1", "group2"], indexes)
        null_scores[i] = _distance_surprise(p[0], p[1])

    mean_null = float(null_scores.mean())
    sd_null = float(null_scores.std(ddof=1))
    median_null = float(np.median(null_scores))
    cv = float(sd_null / mean_null * 100) if mean_null > 0 else float("inf")
    if median_null > 0:
        q25, q75 = np.percentile(null_scores, 25), np.percentile(null_scores, 75)
        relative_iqr = float((q75 - q25) / median_null)
    else:
        relative_iqr = float("nan")

    if cv <= 35:
        variation = "Low"
    elif cv <= 50:
        variation = "Medium"
    else:
        variation = "High"

    if verbose:
        print(f"Coefficient of Variation: {cv:.2f} %")
        print(f"Variation across samples is considered: {variation}")
        print(f"Relative IQR: {relative_iqr:.3f}")

    if plot:
        try:
            import matplotlib.pyplot as plt
            from ._utils import save_to_pdf

            fig, ax = plt.subplots(figsize=(3.5, 5))
            jitter = rng.uniform(-0.1, 0.1, size=len(null_scores))
            ax.scatter(jitter, null_scores, color="#D5BADB", alpha=0.5, s=15)
            ax.axhline(median_null, color="#86608E", lw=1)
            ax.set_xlim(-0.5, 0.5)
            # Anchor the y-axis at 0 (or lower) and leave headroom above the max.
            lo = float(min(0.0, null_scores.min()))
            hi = float(null_scores.max())
            pad = max(1e-3, 0.1 * (hi - lo))
            ax.set_ylim(lo, hi + pad)
            ax.set_xticks([])
            ax.set_ylabel("symmetric divergence (null)")
            ax.set_title(
                f"1-class null distribution\n"
                f"median={median_null:.4f}  CV={cv:.1f}% ({variation})"
            )
            plt.tight_layout()
            save_to_pdf(fig, pdf_file)
        except Exception as e:  # noqa: BLE001
            # Plotting stays best-effort, but the failure is reported instead
            # of being silently discarded.
            if verbose:
                print(f"(Plot skipped: {e})")

    return {
        "cv": cv,
        "variation": variation,
        "relative_iqr": relative_iqr,
        "mean.random.entropy": mean_null,
        "sd.random.entropy": sd_null,
        "median.random.entropy": median_null,
    }
|
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
"""Main `lots_of_cells` function: 2-group Monte-Carlo and >2-group Goodman & Kruskal gamma."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Optional, Sequence
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from ._stats import (
|
|
10
|
+
_proportions_from_table,
|
|
11
|
+
_table,
|
|
12
|
+
asrt,
|
|
13
|
+
cell_to_gamma,
|
|
14
|
+
cell_to_gamma_original,
|
|
15
|
+
cell_to_montecarlo,
|
|
16
|
+
)
|
|
17
|
+
from ._utils import get_metadata
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _bh_fdr(pvals: np.ndarray) -> np.ndarray:
    """Benjamini-Hochberg FDR. Equivalent to R p.adjust(., 'fdr').

    Parameters
    ----------
    pvals
        1-D array-like of p-values. NaN entries are treated like R's ``NA``:
        they are excluded from the correction (and from the count of tests)
        and come back as NaN, instead of turning every adjusted value into NaN.

    Returns
    -------
    Float array of the same length with the adjusted p-values, clipped to
    [0, 1], in the input order.
    """
    p = np.asarray(pvals, dtype=float)
    out = np.full(p.shape, np.nan)
    valid = ~np.isnan(p)
    n = int(valid.sum())
    if n == 0:
        return out

    kept = p[valid]
    order = np.argsort(kept)
    # p_(i) * n / i for the i-th smallest p-value.
    ranked = kept[order] * n / (np.arange(n) + 1)
    # cummin from the right enforces monotonicity of the adjusted values
    adj = np.minimum.accumulate(ranked[::-1])[::-1]
    adjusted = np.empty(n)
    adjusted[order] = np.clip(adj, 0, 1)
    out[valid] = adjusted
    return out
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _gamma_godkrus(nc: np.ndarray, nd: np.ndarray, denom: float) -> np.ndarray:
    """Goodman-Kruskal gamma: (nc - nd) / exp(mean(log(N))) — N is denom (scalar here).

    For a scalar ``N`` the geometric mean ``exp(mean(log(N)))`` is just ``N``,
    so the division is done directly. The ``exp(log(.))`` round trip only adds
    floating-point error, which lets mathematically equal gammas computed with
    different denominators compare unequal in the ``>=`` / ``<=`` p-value counts.

    Parameters
    ----------
    nc, nd
        Concordant and discordant counts (arrays of equal shape).
    denom
        Scalar normaliser.

    Returns
    -------
    ``(nc - nd) / denom``, element-wise.
    """
    # np.float64 keeps NumPy division semantics (inf/nan, not an exception)
    # for a zero denominator, as the exp(log(0)) == 0.0 form did.
    return (nc - nd) / np.float64(denom)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def lots_of_cells(
    sc_object,
    main_variable: str,
    subtype_variable: str,
    label_order: Sequence[str],
    sample_id: Optional[str] = None,
    permutations: int = 1000,
    seed: Optional[int] = None,
    table: Optional[str] = None,
    plot: bool = True,
    verbose: bool = True,
    pdf_file: Optional[str] = None,
) -> pd.DataFrame:
    """Compute proportion tests on single-cell metadata.

    Parameters
    ----------
    sc_object
        AnnData / SpatialData / MuData / pandas.DataFrame.
    main_variable
        Column in ``.obs`` (or DataFrame) with the main grouping (e.g.
        ``"condition"``).
    subtype_variable
        Column with the covariable to test (e.g. ``"cell_type"``).
    label_order
        Order of labels in ``main_variable`` to compare.
        - 2 labels → log2 fold-change of arcsin-sqrt proportions, with
          Monte-Carlo null distribution.
        - >2 labels → Goodman & Kruskal's gamma rank correlation.
    sample_id
        Optional column with sample IDs. When set, the null distribution
        accounts for per-sample heterogeneity. Only used with 2 labels.
    permutations
        Number of Monte-Carlo permutations.
    seed
        Random seed for reproducibility.
    table
        For SpatialData/MuData with multiple tables/modalities.
    plot
        If True and 2 labels, show the abundance test plot.
    verbose
        If True, print progress messages.
    pdf_file
        Forwarded to the plotting helper (2-label path only).

    Returns
    -------
    pandas.DataFrame with one row per covariable level.

    Raises
    ------
    ValueError
        If a label in ``label_order`` is absent from the data, or fewer than
        2 labels are given.
    """
    metadata = get_metadata(sc_object, table=table)

    main_vals = metadata[main_variable].astype(str).to_numpy()
    # Collect the observed labels once; re-running np.unique for every
    # requested label rescans (and re-sorts) the whole column each time.
    present = set(main_vals.tolist())
    missing = [l for l in label_order if l not in present]
    if missing:
        raise ValueError(f"Some groups in label_order not found in data: {missing}")

    # Restrict the analysis to the cells belonging to the requested labels.
    metadata = metadata.loc[np.isin(main_vals, list(label_order))].copy()
    groups = metadata[main_variable].astype(str).to_numpy()
    covariable = metadata[subtype_variable].astype(str).to_numpy()

    rng = np.random.default_rng(seed)
    min_cells = 10  # lower bound on the number of cells drawn per batch

    if len(label_order) < 2:
        raise ValueError("label_order must have at least 2 entries.")

    if len(label_order) > 2:
        return _gamma_path(
            covariable, groups, label_order, permutations, min_cells, rng, verbose
        )

    return _montecarlo_path(
        metadata,
        covariable,
        groups,
        label_order,
        sample_id,
        main_variable,
        subtype_variable,
        permutations,
        min_cells,
        rng,
        verbose,
        plot,
        pdf_file,
    )
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# --- 2-condition Monte Carlo path ----------------------------------------------------
|
|
124
|
+
|
|
125
|
+
def _montecarlo_path(
    metadata,
    covariable,
    groups,
    label_order,
    sample_id,
    main_variable,
    subtype_variable,
    permutations,
    min_cells,
    rng,
    verbose,
    plot,
    pdf_file=None,
):
    """Two-group test: observed log2 fold-change against a Monte-Carlo null.

    With ``sample_id`` set, the data is first augmented with synthetic samples
    resampled within each condition, and the crowd sizes passed to
    ``cell_to_montecarlo`` hold one entry per real sample; otherwise a single
    crowd size per condition is used. Both are floored at ``min_cells``.

    Returns
    -------
    pandas.DataFrame indexed by covariable level with columns ``groupFC``,
    ``percent_in_<label>`` (one per label), ``p.adj`` (BH-adjusted),
    ``sd.montecarlo``, ``CI95low`` and ``CI95high``.
    """
    if verbose:
        print(f"Only 2 groups detected. Computing FC for {label_order[0]} vs {label_order[1]}")

    if sample_id is not None:
        if verbose:
            print(f"Additional sub-level for testing: {sample_id}")
        samples = metadata[sample_id].astype(str).to_numpy()
        n_per_sample = pd.crosstab(pd.Series(groups), pd.Series(samples)).reindex(label_order)

        # Per-condition invariants, computed once instead of once per
        # synthetic sample: non-zero per-sample cell counts and the pool of
        # covariable values belonging to the condition.
        nonzero_counts = {}
        pools = {}
        for cond in label_order:
            row = n_per_sample.loc[cond]
            nonzero_counts[cond] = row[row != 0]
            pools[cond] = covariable[groups == cond]

        # Synthetic samples (per-condition resampling) — mirrors R lotsOfCells.R
        synth_meta = metadata[[main_variable, subtype_variable, sample_id]].copy()
        mult_factor = 2
        new_samples = int(round((n_per_sample != 0).sum(axis=1).mean())) * mult_factor
        synth_rows = []
        for i in range(1, new_samples + 1):
            for cond in label_order:
                nonzero = nonzero_counts[cond]
                if len(nonzero) == 0:
                    continue
                # Synthetic sample size: uniform between the smallest and the
                # largest real sample of this condition (inclusive).
                n = int(rng.integers(int(nonzero.min()), int(nonzero.max()) + 1))
                synth_cov = rng.choice(pools[cond], size=n, replace=True)
                synth_rows.append(pd.DataFrame({
                    main_variable: cond,
                    subtype_variable: synth_cov,
                    sample_id: f"synthetic_sample_{cond}_{i}",
                }))
        if synth_rows:
            synth_meta = pd.concat([synth_meta, *synth_rows], ignore_index=True)

        groups_synth = synth_meta[main_variable].astype(str).to_numpy()
        covariable_synth = synth_meta[subtype_variable].astype(str).to_numpy()

        # One crowd size per real sample: sqrt of its cell count, floored at min_cells.
        cell_crowd = {
            cond: list(
                np.maximum(np.sqrt(nonzero_counts[cond].to_numpy()), min_cells).astype(int)
            )
            for cond in label_order
        }
    else:
        groups_synth = groups
        covariable_synth = covariable
        counts_per_group = pd.Series(groups).value_counts().to_dict()
        cell_crowd = {
            l: int(round(max(np.sqrt(counts_per_group.get(l, 0)), min_cells)))
            for l in label_order
        }

    # Observed fold-change
    obs_tab = _table(groups, covariable)
    indexes = list(obs_tab.columns)
    p_obs = _proportions_from_table(obs_tab, label_order, indexes, pseudo=True)
    obs_fc = np.log2(asrt(p_obs[0]) / asrt(p_obs[1]))

    if verbose:
        print("- Starting Monte-Carlo simulation of fold changes")

    null_fcs = np.empty((permutations, len(indexes)))
    real_fcs = np.empty((permutations, len(indexes)))
    for i in range(permutations):
        # First return value feeds the null, second the CI of the observed FC.
        null_fcs[i], real_fcs[i] = cell_to_montecarlo(
            covariable_synth, groups_synth, label_order, indexes, cell_crowd, rng
        )

    # One-sided Monte-Carlo p-values with the +1 correction, tail chosen by
    # the sign of the observed fold-change.
    higher = (np.sum(null_fcs >= obs_fc, axis=0) + 1) / (permutations + 1)
    lower = (np.sum(null_fcs <= obs_fc, axis=0) + 1) / (permutations + 1)
    p_vals = np.where(obs_fc > 0, higher, lower)
    p_adj = _bh_fdr(p_vals)
    sd_mc = null_fcs.std(axis=0, ddof=1)
    ci_low = np.quantile(real_fcs, 0.025, axis=0)
    ci_high = np.quantile(real_fcs, 0.975, axis=0)

    table_results = pd.DataFrame(
        {
            "groupFC": obs_fc,
            f"percent_in_{label_order[0]}": np.round(p_obs[0], 3),
            f"percent_in_{label_order[1]}": np.round(p_obs[1], 3),
            "p.adj": np.round(p_adj, 5),
            "sd.montecarlo": sd_mc,
            "CI95low": ci_low,
            "CI95high": ci_high,
        },
        index=indexes,
    )

    # Ensure CIs encompass observed: a bound on the wrong side of the
    # observed fold-change (or NaN) is replaced by the observed value.
    bad_low = ~(table_results["CI95low"] < table_results["groupFC"])
    table_results.loc[bad_low, "CI95low"] = table_results.loc[bad_low, "groupFC"]
    bad_high = ~(table_results["CI95high"] > table_results["groupFC"])
    table_results.loc[bad_high, "CI95high"] = table_results.loc[bad_high, "groupFC"]

    if plot:
        try:
            from .plots import plot_abundance_test
            plot_abundance_test(
                table_results,
                subtype_variable=subtype_variable,
                pdf_file=pdf_file,
            )
        except Exception as e:  # noqa: BLE001
            # Plotting is best-effort; the table is still returned.
            if verbose:
                print(f"(Plot skipped: {e})")

    return table_results
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
# --- >2-condition Goodman-Kruskal gamma path -----------------------------------------
|
|
251
|
+
|
|
252
|
+
def _gamma_path(covariable, groups, label_order, permutations, min_cells, rng, verbose):
    """Test for more than two ordered groups via Goodman-Kruskal gamma.

    The observed gamma pools ``permutations`` draws of
    ``cell_to_gamma_original``; its confidence interval comes from 10 repeats
    of 100 such draws; the null pools 10 draws of ``cell_to_gamma`` per
    permutation. P-values are one-sided (tail chosen by the sign of the
    observed gamma) and BH-adjusted.

    Returns
    -------
    pandas.DataFrame indexed by covariable level with columns
    ``groupGammaCor``, ``percent_in_<label>`` (one per label), ``p.adj``,
    ``CI95low`` and ``CI95high``.
    """
    if verbose:
        print(
            "More than 2 groups detected. Computing Goodman-Kruskal gamma rank "
            f"correlation in order: {' vs '.join(label_order)}"
        )

    group_sizes = pd.Series(groups).value_counts().to_dict()
    cell_crowd = {}
    for l in label_order:
        # sqrt of the group size, floored at min_cells.
        cell_crowd[l] = int(round(max(np.sqrt(group_sizes.get(l, 0)), min_cells)))
    n_labels = len(label_order)
    kendall_denom = (n_labels * (n_labels - 1)) / 2
    rank_index = np.arange(1, n_labels + 1)

    obs_tab = _table(groups, covariable)
    indexes = list(obs_tab.columns)
    n_cov = len(indexes)

    def _pooled_gamma(sampler, n_draws):
        # Sum concordant/discordant counts over n_draws calls of `sampler`,
        # then normalise by the pair count times the number of draws.
        nc_total = np.zeros(n_cov)
        nd_total = np.zeros(n_cov)
        for _ in range(n_draws):
            nc, nd = sampler(
                covariable, groups, label_order, indexes, cell_crowd, rank_index, rng
            )
            nc_total += nc
            nd_total += nd
        return _gamma_godkrus(nc_total, nd_total, kendall_denom * n_draws)

    # Observed gamma: aggregate over `permutations` subsamplings of the original data
    obs_gamma = _pooled_gamma(cell_to_gamma_original, permutations)

    # Confidence interval via 10 sub-samples of size 100
    sub_gammas = np.empty((10, n_cov))
    for s in range(10):
        sub_gammas[s] = _pooled_gamma(cell_to_gamma_original, 100)
    ci_low = np.nanquantile(sub_gammas, 0.025, axis=0)
    ci_high = np.nanquantile(sub_gammas, 0.975, axis=0)

    if verbose:
        print("- Starting gamma rank permutation analysis, this can take a while...")

    n_random_observations = 10
    null_gamma = np.empty((permutations, n_cov))
    for p in range(permutations):
        null_gamma[p] = _pooled_gamma(cell_to_gamma, n_random_observations)

    # NaN gammas compare as False on both sides; silence the warning here.
    with np.errstate(invalid="ignore"):
        higher = np.sum(null_gamma >= obs_gamma, axis=0) / permutations
        lower = np.sum(null_gamma <= obs_gamma, axis=0) / permutations
    p_vals = np.where(obs_gamma > 0, higher, lower)
    # NaN p-values are treated as non-significant before the BH adjustment.
    p_adj = _bh_fdr(np.nan_to_num(p_vals, nan=1.0))

    # Per-condition proportions (unnormalised contig table -> per-row proportions)
    contig_tab = _table(groups, covariable).reindex(label_order)
    row_totals = contig_tab.sum(axis=1)
    proportions = contig_tab.div(row_totals, axis=0).reindex(
        index=label_order, columns=indexes
    )

    df = pd.DataFrame({"groupGammaCor": np.round(obs_gamma, 4)}, index=indexes)
    for l in label_order:
        df[f"percent_in_{l}"] = np.round(proportions.loc[l].values, 3)
    df["p.adj"] = np.round(p_adj, 5)
    df["CI95low"] = np.round(ci_low, 4)
    df["CI95high"] = np.round(ci_high, 4)
    return df
|