lotsofcells 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,360 @@
1
+ """Main `lots_of_cells` function: 2-group Monte-Carlo and >2-group Goodman & Kruskal gamma."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Optional, Sequence
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ from ._stats import (
10
+ _proportions_from_table,
11
+ _table,
12
+ asrt,
13
+ cell_to_gamma,
14
+ cell_to_gamma_original,
15
+ cell_to_montecarlo,
16
+ )
17
+ from ._utils import get_metadata
18
+
19
+
20
def _bh_fdr(pvals: np.ndarray) -> np.ndarray:
    """Benjamini-Hochberg FDR adjustment, matching R ``p.adjust(., 'fdr')``.

    Parameters
    ----------
    pvals
        Array-like of raw p-values. NaN entries are treated as missing.

    Returns
    -------
    numpy.ndarray of floats with the same shape as ``pvals``. Missing (NaN)
    inputs stay NaN and are not counted in the number of tests, which is how
    R's ``p.adjust`` treats ``NA`` by default. Without this, a single NaN
    would propagate through the cumulative minimum and turn every adjusted
    value into NaN.
    """
    p = np.asarray(pvals, dtype=float)
    out = np.full(p.shape, np.nan)

    valid = ~np.isnan(p)
    n = int(valid.sum())
    if n == 0:
        # Nothing to adjust (empty input or all-missing).
        return out

    p_valid = p[valid]
    order = np.argsort(p_valid)
    # p_(i) * n / i for the i-th smallest p-value.
    ranked = p_valid[order] * n / (np.arange(n) + 1)
    # Cumulative minimum from the right enforces monotonicity.
    adj = np.minimum.accumulate(ranked[::-1])[::-1]

    adjusted = np.empty(n)
    adjusted[order] = np.clip(adj, 0, 1)
    out[valid] = adjusted
    return out
31
+
32
+
33
def _gamma_godkrus(nc: np.ndarray, nd: np.ndarray, denom: float) -> np.ndarray:
    """Return the normalised concordance difference ``(nc - nd) / denom``.

    Parameters
    ----------
    nc, nd
        Concordant and discordant counts (one entry per covariable level).
    denom
        Scalar normaliser.

    Notes
    -----
    The reference formula is ``(nc - nd) / exp(mean(log(N)))``. For a scalar
    ``N`` that geometric mean is ``N`` itself, so the division is done
    directly. Going through ``exp(log(denom))`` only adds floating-point
    rounding (mathematically tied values could then compare unequal in the
    callers' ``>=`` / ``<=`` counts) and yields NaN for a negative ``denom``.
    """
    return (nc - nd) / denom
36
+
37
+
38
def lots_of_cells(
    sc_object,
    main_variable: str,
    subtype_variable: str,
    label_order: Sequence[str],
    sample_id: Optional[str] = None,
    permutations: int = 1000,
    seed: Optional[int] = None,
    table: Optional[str] = None,
    plot: bool = True,
    verbose: bool = True,
    pdf_file: Optional[str] = None,
) -> pd.DataFrame:
    """Compute proportion tests on single-cell metadata.

    Parameters
    ----------
    sc_object
        AnnData / SpatialData / MuData / pandas.DataFrame.
    main_variable
        Column in ``.obs`` (or DataFrame) with the main grouping (e.g.
        ``"condition"``).
    subtype_variable
        Column with the covariable to test (e.g. ``"cell_type"``).
    label_order
        Order of labels in ``main_variable`` to compare.
        - 2 labels → log2 fold-change of arcsin-sqrt proportions, with
          Monte-Carlo null distribution.
        - >2 labels → Goodman & Kruskal's gamma rank correlation.
        If either of the first two entries is itself a list/tuple of
        labels, those two entries are treated as two pooled groups and the
        2-group test is run on them; a bare label next to a list counts as a
        one-element group. Further entries are ignored in that mode.
    sample_id
        Optional column with sample IDs. When set, the null distribution
        accounts for per-sample heterogeneity.
    permutations
        Number of Monte-Carlo permutations (must be at least 1).
    seed
        Random seed for reproducibility.
    table
        For SpatialData/MuData with multiple tables/modalities.
    plot
        If True and 2 labels, show the abundance test plot.
    verbose
        If True, print progress messages.
    pdf_file
        Forwarded to the 2-group path, which hands it to the plotting helper.

    Returns
    -------
    pandas.DataFrame with one row per covariable level.

    Raises
    ------
    ValueError
        If ``label_order`` has fewer than 2 entries, ``permutations`` is
        smaller than 1, or a requested group is absent from the data.
    """
    # Validate the cheap arguments first, before anything is indexed or loaded.
    label_order = list(label_order)
    if len(label_order) < 2:
        raise ValueError("label_order must have at least 2 entries.")
    if permutations < 1:
        raise ValueError("permutations must be at least 1.")

    metadata = get_metadata(sc_object, table=table)
    main_vals = metadata[main_variable].astype(str).to_numpy()

    if any(isinstance(entry, (list, tuple)) for entry in label_order[:2]):
        if verbose:
            print("Multiple sub-groups detected.")
        # atleast_1d lets a bare label sit next to a list of labels.
        group_1 = np.atleast_1d(np.asarray(label_order[0])).astype(str)
        group_2 = np.atleast_1d(np.asarray(label_order[1])).astype(str)

        # Keep only the cells that belong to one of the two pooled groups.
        in_g1 = np.isin(main_vals, group_1)
        in_g2 = np.isin(main_vals, group_2)
        keep = in_g1 | in_g2
        metadata = metadata.loc[keep].copy()

        # Synthetic labels that replace the original levels.
        label_1 = "loc_group_one [" + " ".join(group_1) + "]"
        label_2 = "loc_group_two [" + " ".join(group_2) + "]"
        # A level listed in both groups ends up in the second one.
        metadata["loc_tmp_group"] = np.where(in_g2[keep], label_2, label_1)

        # From here on the synthetic column is the grouping variable.
        main_variable = "loc_tmp_group"
        main_vals = metadata[main_variable].astype(str).to_numpy()
        label_order = [label_1, label_2]

    # Compute the set of present levels once instead of once per label.
    present = set(main_vals.tolist())
    missing = [label for label in label_order if label not in present]
    if missing:
        raise ValueError(f"Some groups in label_order not found in data: {missing}")

    mask = np.isin(main_vals, label_order)
    metadata = metadata.loc[mask].copy()
    groups = metadata[main_variable].astype(str).to_numpy()
    covariable = metadata[subtype_variable].astype(str).to_numpy()

    rng = np.random.default_rng(seed)
    min_cells = 10

    if len(label_order) > 2:
        return _gamma_path(
            covariable, groups, label_order, permutations, min_cells, rng, verbose
        )

    return _montecarlo_path(
        metadata,
        covariable,
        groups,
        label_order,
        sample_id,
        main_variable,
        subtype_variable,
        permutations,
        min_cells,
        rng,
        verbose,
        plot,
        pdf_file,
    )
151
+
152
+
153
+ # --- 2-condition Monte Carlo path ----------------------------------------------------
154
+
155
def _montecarlo_path(
    metadata,
    covariable,
    groups,
    label_order,
    sample_id,
    main_variable,
    subtype_variable,
    permutations,
    min_cells,
    rng,
    verbose,
    plot,
    pdf_file=None,
):
    """Two-group test: observed log2 fold-change against Monte-Carlo draws.

    ``groups`` and ``covariable`` are the per-cell group and covariable
    labels. When ``sample_id`` is given, extra "synthetic" samples are
    resampled per condition and appended before the simulation. Returns a
    DataFrame indexed by the columns of ``_table(groups, covariable)`` with
    the fold-change, the two proportions, the BH-adjusted p-value, the
    standard deviation of the simulated fold-changes and a 95% interval.
    If ``plot`` is true the result is passed to ``plot_abundance_test``;
    any failure there is reported (when ``verbose``) and otherwise ignored.
    """
    first_label, second_label = label_order[0], label_order[1]
    if verbose:
        print(f"Only 2 groups detected. Computing FC for {first_label} vs {second_label}")

    if sample_id is None:
        # No sample structure: simulate from the cells as they are.
        groups_synth, covariable_synth = groups, covariable
        group_sizes = pd.Series(groups).value_counts().to_dict()
        cell_crowd = {}
        for label in label_order:
            root = np.sqrt(group_sizes.get(label, 0))
            cell_crowd[label] = int(round(max(root, min_cells)))
    else:
        if verbose:
            print(f"Additional sub-level for testing: {sample_id}")
        samples = metadata[sample_id].astype(str).to_numpy()
        n_per_sample = pd.crosstab(pd.Series(groups), pd.Series(samples)).reindex(label_order)

        # Synthetic samples (per-condition resampling) — mirrors R lotsOfCells.R
        synth_meta = metadata[[main_variable, subtype_variable, sample_id]].copy()
        mult_factor = 2
        samples_per_condition = (n_per_sample != 0).sum(axis=1)
        new_samples = int(round(samples_per_condition.mean())) * mult_factor

        synth_rows = []
        for i in range(1, new_samples + 1):
            for cond in label_order:
                counts = n_per_sample.loc[cond]
                observed = counts[counts != 0]
                if len(observed) == 0:
                    continue
                # Size drawn between the smallest and largest real sample.
                smallest = int(observed.min())
                largest = int(observed.max())
                n = int(rng.integers(smallest, largest + 1))
                pool = covariable[groups == cond]
                synth_cov = rng.choice(pool, size=n, replace=True)
                synth_frame = pd.DataFrame({
                    main_variable: cond,
                    subtype_variable: synth_cov,
                    sample_id: f"synthetic_sample_{cond}_{i}",
                })
                synth_rows.append(synth_frame)
        if synth_rows:
            synth_meta = pd.concat([synth_meta, *synth_rows], ignore_index=True)

        groups_synth = synth_meta[main_variable].astype(str).to_numpy()
        covariable_synth = synth_meta[subtype_variable].astype(str).to_numpy()

        # One sub-sample size per real sample: sqrt of its cell count,
        # floored at min_cells and truncated to int.
        cell_crowd = {}
        for cond in label_order:
            counts = n_per_sample.loc[cond]
            observed = counts[counts != 0].to_numpy()
            sizes = np.maximum(np.sqrt(observed), min_cells).astype(int)
            cell_crowd[cond] = list(sizes)

    # Observed fold-change
    obs_tab = _table(groups, covariable)
    indexes = list(obs_tab.columns)
    p_obs = _proportions_from_table(obs_tab, label_order, list(obs_tab.columns), pseudo=True)
    obs_fc = np.log2(asrt(p_obs[0]) / asrt(p_obs[1]))

    if verbose:
        print("- Starting Monte-Carlo simulation of fold changes")

    n_levels = len(indexes)
    null_fcs = np.empty((permutations, n_levels))
    real_fcs = np.empty((permutations, n_levels))
    for draw in range(permutations):
        null_fcs[draw], real_fcs[draw] = cell_to_montecarlo(
            covariable_synth, groups_synth, label_order, indexes, cell_crowd, rng
        )

    # One-sided p-value in the direction of the observed change, with the
    # +1 correction in numerator and denominator.
    total = permutations + 1
    higher = ((null_fcs >= obs_fc).sum(axis=0) + 1) / total
    lower = ((null_fcs <= obs_fc).sum(axis=0) + 1) / total
    p_vals = np.where(obs_fc > 0, higher, lower)
    p_adj = _bh_fdr(p_vals)
    sd_mc = np.std(null_fcs, axis=0, ddof=1)
    ci_low = np.quantile(real_fcs, 0.025, axis=0)
    ci_high = np.quantile(real_fcs, 0.975, axis=0)

    columns = {
        "groupFC": obs_fc,
        f"percent_in_{first_label}": np.round(p_obs[0], 3),
        f"percent_in_{second_label}": np.round(p_obs[1], 3),
        "p.adj": np.round(p_adj, 5),
        "sd.montecarlo": sd_mc,
        "CI95low": ci_low,
        "CI95high": ci_high,
    }
    table_results = pd.DataFrame(columns, index=indexes)

    # Ensure CIs encompass observed
    fold_change = table_results["groupFC"]
    low_outside = ~(table_results["CI95low"] < fold_change)
    table_results.loc[low_outside, "CI95low"] = fold_change[low_outside]
    high_outside = ~(table_results["CI95high"] > fold_change)
    table_results.loc[high_outside, "CI95high"] = fold_change[high_outside]

    if plot:
        try:
            from .plots import plot_abundance_test
            plot_abundance_test(
                table_results,
                subtype_variable=subtype_variable,
                pdf_file=pdf_file,
            )
        except Exception as e:  # noqa: BLE001
            if verbose:
                print(f"(Plot skipped: {e})")

    return table_results
278
+
279
+
280
+ # --- >2-condition Goodman-Kruskal gamma path -----------------------------------------
281
+
282
def _gamma_path(covariable, groups, label_order, permutations, min_cells, rng, verbose):
    """Rank-correlation test across more than two ordered groups.

    Parameters
    ----------
    covariable, groups
        Per-cell covariable and group labels.
    label_order
        Group labels in the order that defines the ranks ``1..k``.
    permutations
        Number of sub-samplings pooled for the observed statistic, and
        number of null statistics generated.
    min_cells
        Lower bound for the per-group sub-sample size.
    rng
        Random generator handed to the sampling helpers.
    verbose
        If True, print progress messages.

    Returns
    -------
    pandas.DataFrame indexed by the columns of ``_table(groups, covariable)``
    with ``groupGammaCor``, one ``percent_in_<label>`` column per label,
    ``p.adj``, ``CI95low`` and ``CI95high``.

    Notes
    -----
    Levels whose observed statistic is NaN get a raw p-value of 1 before the
    FDR adjustment. Previously the NaN comparisons produced a count of zero,
    i.e. a p-value of 0. NaN null draws are left out of both the count and
    its denominator.
    """
    if verbose:
        print(
            "More than 2 groups detected. Computing Goodman-Kruskal gamma rank "
            f"correlation in order: {' vs '.join(label_order)}"
        )

    # Sub-sample size per group: sqrt of its cell count, floored at min_cells.
    counts_per_group = pd.Series(groups).value_counts().to_dict()
    cell_crowd = {
        label: int(round(max(np.sqrt(counts_per_group.get(label, 0)), min_cells)))
        for label in label_order
    }
    n_labels = len(label_order)
    kendall_denom = (n_labels * (n_labels - 1)) / 2  # number of label pairs
    rank_index = np.arange(1, n_labels + 1)

    obs_tab = _table(groups, covariable)
    indexes = list(obs_tab.columns)
    n_levels = len(indexes)

    def _pooled_gamma(sampler, n_draws):
        # Sum concordant/discordant counts over n_draws calls, then normalise.
        nc_total = np.zeros(n_levels)
        nd_total = np.zeros(n_levels)
        for _ in range(n_draws):
            nc, nd = sampler(
                covariable, groups, label_order, indexes, cell_crowd, rank_index, rng
            )
            nc_total += nc
            nd_total += nd
        return _gamma_godkrus(nc_total, nd_total, kendall_denom * n_draws)

    # Observed gamma: aggregate over `permutations` subsamplings of the original data
    obs_gamma = _pooled_gamma(cell_to_gamma_original, permutations)

    # Confidence interval via 10 sub-samples of size 100
    n_ci_batches = 10
    ci_batch_size = 100
    sub_gammas = np.empty((n_ci_batches, n_levels))
    for batch in range(n_ci_batches):
        sub_gammas[batch] = _pooled_gamma(cell_to_gamma_original, ci_batch_size)
    ci_low = np.nanquantile(sub_gammas, 0.025, axis=0)
    ci_high = np.nanquantile(sub_gammas, 0.975, axis=0)

    if verbose:
        print("- Starting gamma rank permutation analysis, this can take a while...")

    n_random_observations = 10
    null_gamma = np.empty((permutations, n_levels))
    for perm in range(permutations):
        null_gamma[perm] = _pooled_gamma(cell_to_gamma, n_random_observations)

    # NOTE(review): unlike the 2-group path, no +1 correction is applied
    # here, so a raw p-value of exactly 0 is possible — confirm whether
    # that is intended.
    with np.errstate(invalid="ignore", divide="ignore"):
        # Only non-NaN null draws are informative; equals `permutations`
        # whenever the null matrix has no NaN.
        n_valid = np.sum(~np.isnan(null_gamma), axis=0)
        higher = np.sum(null_gamma >= obs_gamma, axis=0) / n_valid
        lower = np.sum(null_gamma <= obs_gamma, axis=0) / n_valid
        p_vals = np.where(obs_gamma > 0, higher, lower)
    # An undefined observed statistic must not look significant.
    p_vals = np.where(np.isnan(obs_gamma), np.nan, p_vals)
    p_adj = _bh_fdr(np.nan_to_num(p_vals, nan=1.0))

    # Per-condition proportions (unnormalised contig table -> per-row proportions)
    contig_tab = obs_tab.reindex(label_order)
    proportions = contig_tab.div(contig_tab.sum(axis=1), axis=0).reindex(
        index=label_order, columns=indexes
    )

    df = pd.DataFrame({"groupGammaCor": np.round(obs_gamma, 4)}, index=indexes)
    for label in label_order:
        df[f"percent_in_{label}"] = np.round(proportions.loc[label].values, 3)
    df["p.adj"] = np.round(p_adj, 5)
    df["CI95low"] = np.round(ci_low, 4)
    df["CI95high"] = np.round(ci_high, 4)
    return df