lotsofcells 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lotsofcells/__init__.py +39 -0
- lotsofcells/__pycache__/__init__.cpython-39.pyc +0 -0
- lotsofcells/__pycache__/_stats.cpython-39.pyc +0 -0
- lotsofcells/__pycache__/_utils.cpython-39.pyc +0 -0
- lotsofcells/__pycache__/entropy.cpython-39.pyc +0 -0
- lotsofcells/__pycache__/lotsofcells.cpython-39.pyc +0 -0
- lotsofcells/__pycache__/plots.cpython-39.pyc +0 -0
- lotsofcells/_stats.py +279 -0
- lotsofcells/_utils.py +213 -0
- lotsofcells/entropy.py +354 -0
- lotsofcells/lotsofcells.py +360 -0
- lotsofcells/plots.py +681 -0
- lotsofcells-0.0.0.dist-info/METADATA +21 -0
- lotsofcells-0.0.0.dist-info/RECORD +16 -0
- lotsofcells-0.0.0.dist-info/WHEEL +5 -0
- lotsofcells-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
"""Main `lots_of_cells` function: 2-group Monte-Carlo and >2-group Goodman & Kruskal gamma."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Optional, Sequence
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from ._stats import (
|
|
10
|
+
_proportions_from_table,
|
|
11
|
+
_table,
|
|
12
|
+
asrt,
|
|
13
|
+
cell_to_gamma,
|
|
14
|
+
cell_to_gamma_original,
|
|
15
|
+
cell_to_montecarlo,
|
|
16
|
+
)
|
|
17
|
+
from ._utils import get_metadata
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _bh_fdr(pvals: np.ndarray) -> np.ndarray:
    """Benjamini-Hochberg FDR adjustment of a 1-D vector of p-values.

    Follows R's ``p.adjust(p, method="fdr")``: NaN entries stay NaN and are
    not counted in the number of tests, so a single missing p-value no longer
    turns every adjusted value into NaN.

    Parameters
    ----------
    pvals
        1-D array-like of raw p-values; may contain NaN.

    Returns
    -------
    numpy.ndarray
        Adjusted p-values in the input order, clipped to ``[0, 1]``.
    """
    p = np.asarray(pvals, dtype=float)
    out = np.full(p.shape, np.nan)
    valid = ~np.isnan(p)
    n = int(valid.sum())
    if n == 0:
        # Empty input or nothing but NaN: nothing to adjust.
        return out

    observed = p[valid]
    order = np.argsort(observed)
    ranked = observed[order] * n / np.arange(1, n + 1)
    # Running minimum from the right enforces monotonicity of adjusted values.
    adj = np.minimum.accumulate(ranked[::-1])[::-1]

    adjusted = np.empty(n)
    adjusted[order] = np.clip(adj, 0, 1)
    out[valid] = adjusted
    return out
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _gamma_godkrus(nc: np.ndarray, nd: np.ndarray, denom: float) -> np.ndarray:
    """Return the concordant-minus-discordant counts scaled by ``denom``.

    The scale is evaluated as ``exp(log(denom))``, i.e. the geometric-mean
    form ``exp(mean(log(N)))`` reduced to a single scalar ``N``.
    """
    scale = np.exp(np.log(denom))
    excess = nc - nd
    return excess / scale
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def lots_of_cells(
    sc_object,
    main_variable: str,
    subtype_variable: str,
    label_order: Sequence[str],
    sample_id: Optional[str] = None,
    permutations: int = 1000,
    seed: Optional[int] = None,
    table: Optional[str] = None,
    plot: bool = True,
    verbose: bool = True,
    pdf_file: Optional[str] = None,
) -> pd.DataFrame:
    """Compute proportion tests on single-cell metadata.

    Parameters
    ----------
    sc_object
        AnnData / SpatialData / MuData / pandas.DataFrame.
    main_variable
        Column in ``.obs`` (or DataFrame) with the main grouping (e.g.
        ``"condition"``).
    subtype_variable
        Column with the covariable to test (e.g. ``"cell_type"``).
    label_order
        Order of labels in ``main_variable`` to compare.
        - 2 labels → log2 fold-change of arcsin-sqrt proportions, with
          Monte-Carlo null distribution.
        - >2 labels → Goodman & Kruskal's gamma rank correlation.
        Either of the first two entries may itself be a list/tuple/array of
        labels; those labels are pooled into one synthetic group and the
        2-group test is run on the two pooled groups (entries beyond the
        second are ignored in that case).
    sample_id
        Optional column with sample IDs. When set, the null distribution
        accounts for per-sample heterogeneity.
    permutations
        Number of Monte-Carlo permutations.
    seed
        Random seed for reproducibility.
    table
        For SpatialData/MuData with multiple tables/modalities.
    plot
        If True and 2 labels, show the abundance test plot.
    verbose
        If True, print progress messages.
    pdf_file
        Forwarded to the plotting function in the 2-label case (presumably
        an output PDF path — confirm against ``plots.plot_abundance_test``).

    Returns
    -------
    pandas.DataFrame with one row per covariable level.

    Raises
    ------
    ValueError
        If ``label_order`` has fewer than 2 entries, or if a requested group
        is not present in the data.
    """
    # Validate first: the sub-group detection below indexes label_order[1].
    if len(label_order) < 2:
        raise ValueError("label_order must have at least 2 entries.")

    metadata = get_metadata(sc_object, table=table)
    main_vals = metadata[main_variable].astype(str).to_numpy()

    nested_types = (list, tuple, np.ndarray)
    if isinstance(label_order[0], nested_types) or isinstance(label_order[1], nested_types):
        if verbose:
            print("Multiple sub-groups detected.")
        # Normalise every entry to a 1-D string array so that a bare label
        # mixed with a pooled list is handled as a one-element group.
        pooled = [np.asarray(entry).astype(str).ravel() for entry in label_order]
        group_1, group_2 = pooled[0], pooled[1]

        # Drop cells whose label is not mentioned anywhere in label_order.
        keep = np.isin(main_vals, np.concatenate(pooled))
        metadata = metadata.loc[keep].copy()
        main_vals = metadata[main_variable].astype(str).to_numpy()

        label_1 = "loc_group_one [" + " ".join(group_1) + "]"
        label_2 = "loc_group_two [" + " ".join(group_2) + "]"

        # Synthetic grouping column; cells in neither group stay NaN and are
        # removed by the label filter further down.
        synthetic = np.full(main_vals.shape, np.nan, dtype=object)
        synthetic[np.isin(main_vals, group_1)] = label_1
        synthetic[np.isin(main_vals, group_2)] = label_2
        metadata["loc_tmp_group"] = synthetic

        main_variable = "loc_tmp_group"
        main_vals = metadata[main_variable].astype(str).to_numpy()
        label_order = [label_1, label_2]

    present = set(main_vals)
    missing = [label for label in label_order if label not in present]
    if missing:
        raise ValueError(f"Some groups in label_order not found in data: {missing}")

    mask = np.isin(main_vals, list(label_order))
    metadata = metadata.loc[mask].copy()
    groups = metadata[main_variable].astype(str).to_numpy()
    covariable = metadata[subtype_variable].astype(str).to_numpy()

    rng = np.random.default_rng(seed)
    min_cells = 10

    if len(label_order) > 2:
        return _gamma_path(
            covariable, groups, label_order, permutations, min_cells, rng, verbose
        )

    return _montecarlo_path(
        metadata,
        covariable,
        groups,
        label_order,
        sample_id,
        main_variable,
        subtype_variable,
        permutations,
        min_cells,
        rng,
        verbose,
        plot,
        pdf_file,
    )
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# --- 2-condition Monte Carlo path ----------------------------------------------------
|
|
154
|
+
|
|
155
|
+
def _montecarlo_path(
    metadata,
    covariable,
    groups,
    label_order,
    sample_id,
    main_variable,
    subtype_variable,
    permutations,
    min_cells,
    rng,
    verbose,
    plot,
    pdf_file=None,
):
    """Two-group test: observed log2 fold-change vs. a Monte-Carlo null.

    Parameters
    ----------
    metadata
        DataFrame already filtered to the cells of the two groups.
    covariable, groups
        String arrays (one entry per row of ``metadata``) with the covariable
        level and the group label of each cell.
    label_order
        The two group labels; the fold-change is ``label_order[0]`` over
        ``label_order[1]``.
    sample_id
        Optional column of ``metadata`` with sample IDs. When given, synthetic
        samples are resampled per condition and appended to the data handed
        to the simulation.
    main_variable, subtype_variable
        Column names of the grouping and of the covariable in ``metadata``.
    permutations
        Number of Monte-Carlo iterations.
    min_cells
        Lower bound applied to the per-group (or per-sample) draw sizes.
    rng
        ``numpy.random.Generator`` used for all random draws.
    verbose
        If True, print progress messages.
    plot
        If True, try to draw the abundance plot; failures never abort the
        computation.
    pdf_file
        Forwarded to ``plot_abundance_test``.

    Returns
    -------
    pandas.DataFrame
        Indexed by covariable level with columns ``groupFC``,
        ``percent_in_<label>`` (one per group), ``p.adj``, ``sd.montecarlo``,
        ``CI95low`` and ``CI95high``.
    """
    import warnings

    if verbose:
        print(f"Only 2 groups detected. Computing FC for {label_order[0]} vs {label_order[1]}")

    if sample_id is not None:
        if verbose:
            print(f"Additional sub-level for testing: {sample_id}")
        samples = metadata[sample_id].astype(str).to_numpy()
        n_per_sample = pd.crosstab(pd.Series(groups), pd.Series(samples)).reindex(label_order)

        # Per-condition cell counts of the samples that actually contain cells.
        nonzero_by_cond = {}
        for cond in label_order:
            row = n_per_sample.loc[cond]
            nonzero_by_cond[cond] = row[row != 0]

        # Loop-invariant resampling inputs: size bounds and the covariable pool.
        draw_plan = {}
        for cond, nonzero in nonzero_by_cond.items():
            if len(nonzero) == 0:
                continue
            draw_plan[cond] = (
                int(nonzero.min()),
                int(nonzero.max()) + 1,
                covariable[groups == cond],
            )

        # Synthetic samples (per-condition resampling) — mirrors R lotsOfCells.R
        synth_meta = metadata[[main_variable, subtype_variable, sample_id]].copy()
        mult_factor = 2
        new_samples = int(round((n_per_sample != 0).sum(axis=1).mean())) * mult_factor
        synth_rows = []
        for i in range(1, new_samples + 1):
            for cond in label_order:
                if cond not in draw_plan:
                    continue
                low, high, pool = draw_plan[cond]
                # Draw order (size, then cells) is kept so seeded runs reproduce.
                n = int(rng.integers(low, high))
                synth_cov = rng.choice(pool, size=n, replace=True)
                synth_rows.append(pd.DataFrame({
                    main_variable: cond,
                    subtype_variable: synth_cov,
                    sample_id: f"synthetic_sample_{cond}_{i}",
                }))
        if synth_rows:
            synth_meta = pd.concat([synth_meta, *synth_rows], ignore_index=True)

        groups_synth = synth_meta[main_variable].astype(str).to_numpy()
        covariable_synth = synth_meta[subtype_variable].astype(str).to_numpy()

        # One draw size per non-empty sample: sqrt of its cell count, floored
        # at min_cells and truncated to int.
        cell_crowd = {
            cond: list(
                np.maximum(np.sqrt(nonzero_by_cond[cond].to_numpy()), min_cells).astype(int)
            )
            for cond in label_order
        }
    else:
        groups_synth = groups
        covariable_synth = covariable
        counts_per_group = pd.Series(groups).value_counts().to_dict()
        # One draw size per group: sqrt of its cell count, floored at min_cells.
        cell_crowd = {
            label: int(round(max(np.sqrt(counts_per_group.get(label, 0)), min_cells)))
            for label in label_order
        }

    # Observed fold-change
    obs_tab = _table(groups, covariable)
    indexes = list(obs_tab.columns)
    p_obs = _proportions_from_table(obs_tab, label_order, indexes, pseudo=True)
    obs_fc = np.log2(asrt(p_obs[0]) / asrt(p_obs[1]))

    if verbose:
        print("- Starting Monte-Carlo simulation of fold changes")

    null_fcs = np.empty((permutations, len(indexes)))
    real_fcs = np.empty((permutations, len(indexes)))
    for i in range(permutations):
        # Each call yields one row for the null matrix and one for the matrix
        # used for the confidence interval.
        null_fcs[i], real_fcs[i] = cell_to_montecarlo(
            covariable_synth, groups_synth, label_order, indexes, cell_crowd, rng
        )

    # One-sided empirical p-values with the +1 correction (never exactly zero);
    # the tail is chosen by the sign of the observed fold-change.
    higher = (np.sum(null_fcs >= obs_fc, axis=0) + 1) / (permutations + 1)
    lower = (np.sum(null_fcs <= obs_fc, axis=0) + 1) / (permutations + 1)
    p_vals = np.where(obs_fc > 0, higher, lower)
    p_adj = _bh_fdr(p_vals)
    sd_mc = null_fcs.std(axis=0, ddof=1)
    ci_low = np.quantile(real_fcs, 0.025, axis=0)
    ci_high = np.quantile(real_fcs, 0.975, axis=0)

    table_results = pd.DataFrame(
        {
            "groupFC": obs_fc,
            f"percent_in_{label_order[0]}": np.round(p_obs[0], 3),
            f"percent_in_{label_order[1]}": np.round(p_obs[1], 3),
            "p.adj": np.round(p_adj, 5),
            "sd.montecarlo": sd_mc,
            "CI95low": ci_low,
            "CI95high": ci_high,
        },
        index=indexes,
    )

    # Ensure CIs encompass observed
    bad_low = ~(table_results["CI95low"] < table_results["groupFC"])
    table_results.loc[bad_low, "CI95low"] = table_results.loc[bad_low, "groupFC"]
    bad_high = ~(table_results["CI95high"] > table_results["groupFC"])
    table_results.loc[bad_high, "CI95high"] = table_results.loc[bad_high, "groupFC"]

    if plot:
        # Plotting is best-effort: a failure must not discard the results,
        # but it must not vanish silently either.
        try:
            from .plots import plot_abundance_test
            plot_abundance_test(
                table_results,
                subtype_variable=subtype_variable,
                pdf_file=pdf_file,
            )
        except Exception as e:  # noqa: BLE001
            if verbose:
                print(f"(Plot skipped: {e})")
            else:
                warnings.warn(f"Plot skipped: {e}", RuntimeWarning, stacklevel=2)

    return table_results
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
# --- >2-condition Goodman-Kruskal gamma path -----------------------------------------
|
|
281
|
+
|
|
282
|
+
def _gamma_path(
    covariable,
    groups,
    label_order,
    permutations,
    min_cells,
    rng,
    verbose,
    n_ci_subsamples=10,
    ci_subsample_size=100,
    n_random_observations=10,
):
    """Ordered test for more than two groups via a gamma-style rank statistic.

    Parameters
    ----------
    covariable, groups
        String arrays (one entry per cell) with the covariable level and the
        group label.
    label_order
        Group labels in the order that defines the ranks ``1..k``.
    permutations
        Number of draws pooled for the observed statistic and number of rows
        in the null distribution.
    min_cells
        Lower bound applied to the per-group draw size.
    rng
        ``numpy.random.Generator`` used for all random draws.
    verbose
        If True, print progress messages.
    n_ci_subsamples
        Number of independent sub-sample statistics used for the confidence
        interval (default 10).
    ci_subsample_size
        Number of draws pooled into each sub-sample statistic (default 100).
    n_random_observations
        Number of draws pooled into each null statistic (default 10).

    Returns
    -------
    pandas.DataFrame
        Indexed by covariable level with columns ``groupGammaCor``,
        ``percent_in_<label>`` (one per group), ``p.adj``, ``CI95low`` and
        ``CI95high``.
    """
    if verbose:
        print(
            "More than 2 groups detected. Computing Goodman-Kruskal gamma rank "
            f"correlation in order: {' vs '.join(label_order)}"
        )

    counts_per_group = pd.Series(groups).value_counts().to_dict()
    # One draw size per group: sqrt of its cell count, floored at min_cells.
    cell_crowd = {
        label: int(round(max(np.sqrt(counts_per_group.get(label, 0)), min_cells)))
        for label in label_order
    }
    # Number of group pairs, k * (k - 1) / 2.
    kendall_denom = (len(label_order) * (len(label_order) - 1)) / 2
    rank_index = np.arange(1, len(label_order) + 1)

    obs_tab = _table(groups, covariable)
    indexes = list(obs_tab.columns)
    n_levels = len(indexes)

    def _pooled_gamma(draw, n_draws):
        """Sum the (nc, nd) pairs of ``n_draws`` calls to ``draw`` and scale them."""
        nc_total = np.zeros(n_levels)
        nd_total = np.zeros(n_levels)
        for _ in range(n_draws):
            nc, nd = draw(
                covariable, groups, label_order, indexes, cell_crowd, rank_index, rng
            )
            nc_total += nc
            nd_total += nd
        return _gamma_godkrus(nc_total, nd_total, kendall_denom * n_draws)

    # Observed gamma: aggregate over `permutations` subsamplings of the original data
    obs_gamma = _pooled_gamma(cell_to_gamma_original, permutations)

    # Confidence interval from independent sub-sample statistics
    sub_gammas = np.empty((n_ci_subsamples, n_levels))
    for s in range(n_ci_subsamples):
        sub_gammas[s] = _pooled_gamma(cell_to_gamma_original, ci_subsample_size)
    ci_low = np.nanquantile(sub_gammas, 0.025, axis=0)
    ci_high = np.nanquantile(sub_gammas, 0.975, axis=0)

    if verbose:
        print("- Starting gamma rank permutation analysis, this can take a while...")

    null_gamma = np.empty((permutations, n_levels))
    for p in range(permutations):
        null_gamma[p] = _pooled_gamma(cell_to_gamma, n_random_observations)

    # One-sided empirical p-values; the tail follows the sign of the observed
    # statistic. NaN p-values are treated as 1.0 before the FDR adjustment.
    with np.errstate(invalid="ignore"):
        higher = np.sum(null_gamma >= obs_gamma, axis=0) / permutations
        lower = np.sum(null_gamma <= obs_gamma, axis=0) / permutations
        p_vals = np.where(obs_gamma > 0, higher, lower)
    p_adj = _bh_fdr(np.nan_to_num(p_vals, nan=1.0))

    # Per-condition proportions (unnormalised contig table -> per-row proportions)
    contig_tab = obs_tab.reindex(label_order)
    proportions = contig_tab.div(contig_tab.sum(axis=1), axis=0).reindex(
        index=label_order, columns=indexes
    )

    df = pd.DataFrame({"groupGammaCor": np.round(obs_gamma, 4)}, index=indexes)
    for label in label_order:
        df[f"percent_in_{label}"] = np.round(proportions.loc[label].values, 3)
    df["p.adj"] = np.round(p_adj, 5)
    df["CI95low"] = np.round(ci_low, 4)
    df["CI95high"] = np.round(ci_high, 4)
    return df
|