lotsofcells 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lotsofcells/entropy.py ADDED
@@ -0,0 +1,354 @@
1
+ """Symmetric divergence (KL-based) entropy score, plus the 1-class abundance test."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Optional, Sequence
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ from ._stats import (
10
+ _ensure_cols,
11
+ _ensure_rows,
12
+ _table,
13
+ geom_mean,
14
+ pseudo_count_arcsin,
15
+ )
16
+ from ._utils import get_metadata
17
+
18
+
19
def _proportions_arcsin(
    tab: pd.DataFrame, label_order: Sequence[str], indexes: Sequence[str]
) -> np.ndarray:
    """Return the pseudo-counted table with every row rescaled to sum to 1.

    The table is passed through ``_ensure_rows`` (with ``label_order``) and
    ``_ensure_cols`` (with ``indexes``), its values are cast to float and fed
    to ``pseudo_count_arcsin``, and each row is finally divided by its own
    sum.

    This follows the normalisation of the R `entropyScore`. In R the random
    contingency table is built covariable-first, giving (ncov, ngroups), and
    is normalised column-wise before being transposed; that is the same as
    normalising the rows of a (ngroups, ncov) matrix, so one function serves
    both the observed and the random tables here.
    """
    aligned = _ensure_cols(_ensure_rows(tab, label_order), indexes)
    transformed = pseudo_count_arcsin(aligned.values.astype(float))
    # keepdims keeps the sums as a column vector so the division broadcasts
    # row by row.
    return transformed / transformed.sum(axis=1, keepdims=True)
37
+
38
+
39
def _distance_surprise(p: np.ndarray, q: np.ndarray) -> float:
    """Symmetric divergence between two proportion vectors.

    Adds ``geom_mean(|p * log2(p / q)|)`` and ``geom_mean(|q * log2(q / p)|)``
    so that swapping ``p`` and ``q`` gives the same score.
    """
    forward = np.abs(p * np.log2(p / q))
    backward = np.abs(q * np.log2(q / p))
    return geom_mean(forward) + geom_mean(backward)
41
+
42
+
43
def entropy_score(
    sc_object,
    main_variable: str,
    subtype_variable: str,
    label_order: Sequence[str],
    sample_id: Optional[str] = None,
    permutations: int = 1000,
    seed: Optional[int] = None,
    table: Optional[str] = None,
    plot: bool = True,
    verbose: bool = True,
    pdf_file: Optional[str] = None,
):
    """Symmetric divergence score for global proportion dysregulation between 2 groups.

    Parameters
    ----------
    sc_object
        Object handed to ``get_metadata`` (together with ``table``) to obtain
        the per-cell metadata table.
    main_variable
        Metadata column holding the group labels (compared as strings).
    subtype_variable
        Metadata column holding the covariable (compared as strings).
    label_order
        One or two labels of ``main_variable``. Two labels run the 2-group
        score; one label runs the 1-class test.
    sample_id
        Optional metadata column with sample identifiers. Mandatory in
        1-class mode. In 2-group mode it makes the null draw one batch per
        (group, sample) of size ``int(sqrt(n_cells))`` instead of a single
        batch per group of size ``round(sqrt(n_cells))``.
    permutations
        Number of Monte-Carlo draws for the null distribution. Must be at
        least 1 in 2-group mode.
    seed
        Seed passed to ``numpy.random.default_rng``.
    table
        Forwarded to ``get_metadata``.
    plot
        If True, draw the result; a plotting failure is caught and, when
        ``verbose`` is set, reported instead of raised.
    verbose
        If True, print progress messages.
    pdf_file
        Forwarded to the plotting code.

    Returns
    -------
    In 2-group mode, a `pandas.Series` with per-covariable relative entropies
    plus the summary fields (``entropy_score``, ``p.val``,
    ``mean.random.entropy``, ``sd.random.entropy``).

    If ``len(label_order) == 1``, runs the 1-class permutation test on
    ``sample_id`` (analogue of the R `oneClassTest`) and returns a small
    summary dict instead.

    Raises
    ------
    ValueError
        If a label is absent from the data, ``label_order`` is empty or has
        more than two entries, ``sample_id`` is missing in 1-class mode, or
        ``permutations`` is below 1 in 2-group mode.
    """
    metadata = get_metadata(sc_object, table=table)

    main_vals = metadata[main_variable].astype(str).to_numpy()
    # Deduplicate the observed labels once and reuse the result for every
    # requested label (previously recomputed per label).
    known_labels = np.unique(main_vals)
    missing = [label for label in label_order if label not in known_labels]
    if missing:
        raise ValueError(f"Some groups in label_order not in data: {missing}")

    metadata = metadata.loc[np.isin(main_vals, list(label_order))].copy()
    groups = metadata[main_variable].astype(str).to_numpy()
    covariable = metadata[subtype_variable].astype(str).to_numpy()
    rng = np.random.default_rng(seed)

    if len(label_order) == 0:
        raise ValueError("label_order must be specified.")

    if len(label_order) == 1:
        if sample_id is None:
            raise ValueError("In 1-class mode you must specify `sample_id`.")
        return _one_class_test(
            metadata,
            sample_id,
            covariable,
            permutations,
            rng,
            plot=plot,
            verbose=verbose,
            pdf_file=pdf_file,
        )

    if len(label_order) > 2:
        raise ValueError(
            f"Only 2 labels are allowed for entropy estimation, got "
            f"{len(label_order)}: {label_order}"
        )

    # With zero draws the p-value below would be 0/0; fail loudly instead of
    # returning NaN summaries.
    if permutations < 1:
        raise ValueError(
            f"permutations must be a positive integer, got {permutations}"
        )

    if verbose:
        print(
            "Computing entropy proportion over covariables for groups: "
            f"{label_order[0]} vs {label_order[1]}"
        )
    obs_tab = _table(groups, covariable)
    indexes = list(obs_tab.columns)
    contig = _proportions_arcsin(obs_tab, label_order, indexes)

    # Per-covariable relative entropies. The R code applies
    # abs(log2((x[1]*log2(x[2])) / (x[1]*log2(x[1])))) to each covariable's
    # pair of group proportions; contig has one row per group, so the same
    # expression is evaluated element-wise on the two rows.
    first, second = contig[0], contig[1]
    with np.errstate(divide="ignore", invalid="ignore"):
        rel_entropies = np.abs(
            np.log2((first * np.log2(second)) / (first * np.log2(first)))
        )

    obs_score = _distance_surprise(contig[0], contig[1])

    # Build cell-crowd for null sampling: for every label, the list of batch
    # sizes to draw. Both modes use a list so the null loop has one code path.
    if sample_id is not None:
        samples = metadata[sample_id].astype(str).to_numpy()
        n_per_sample = np.sqrt(
            pd.crosstab(pd.Series(groups), pd.Series(samples)).reindex(label_order)
        )
        cell_crowd = {}
        for cond in label_order:
            row = n_per_sample.loc[cond]
            # Samples with no cells in this group are skipped; sizes truncate.
            cell_crowd[cond] = [int(n) for n in row[row != 0].astype(int).to_numpy()]
    else:
        counts = pd.Series(groups).value_counts().to_dict()
        cell_crowd = {
            label: [int(round(np.sqrt(counts.get(label, 0))))]
            for label in label_order
        }

    if verbose:
        print(f"Starting Monte-Carlo simulation with n. permutations: {permutations}")

    null_scores = np.empty(permutations)
    for i in range(permutations):
        pieces_cov, pieces_grp = [], []
        for label in label_order:
            for n in cell_crowd[label]:
                # Null hypothesis: every batch is drawn from the pooled
                # covariable values, irrespective of the group it is
                # assigned to.
                draw = rng.choice(covariable, size=n, replace=True)
                pieces_cov.append(draw)
                pieces_grp.append(np.repeat(label, len(draw)))
        rand_tab = _table(np.concatenate(pieces_grp), np.concatenate(pieces_cov))
        p = _proportions_arcsin(rand_tab, label_order, indexes)
        null_scores[i] = _distance_surprise(p[0], p[1])

    p_val = float((null_scores >= obs_score).sum() / permutations)

    if plot:
        try:
            _plot_entropy(
                contig=contig,
                indexes=indexes,
                label_order=label_order,
                obs_score=obs_score,
                null_scores=null_scores,
                p_val=p_val,
                subtype_variable=subtype_variable,
                pdf_file=pdf_file,
            )
        except Exception as e:  # noqa: BLE001
            # Plotting is best-effort: never let it discard computed results.
            if verbose:
                print(f"(Plot skipped: {e})")

    out = pd.Series(rel_entropies, index=indexes)
    out["entropy_score"] = obs_score
    out["p.val"] = p_val
    out["mean.random.entropy"] = float(null_scores.mean())
    out["sd.random.entropy"] = float(null_scores.std(ddof=1))
    return out
186
+
187
+
188
def _plot_entropy(
    contig, indexes, label_order, obs_score, null_scores, p_val,
    subtype_variable, pdf_file=None,
):
    """Draw the 2-group entropy figure and hand it to ``save_to_pdf``.

    Left panel: grouped bars of ``contig`` (one bar series per label, one
    tick per entry of ``indexes``). Right panel: the null scores as a
    jittered strip with their median as a horizontal line and the observed
    score as a highlighted point.
    """
    import matplotlib.pyplot as plt
    from ._utils import save_to_pdf

    fig, panels = plt.subplots(
        1, 2, figsize=(12, 5), gridspec_kw={"width_ratios": [3, 1]}
    )

    # --- left panel: per-covariable proportions ---
    bars = panels[0]
    bar_width = 0.35
    positions = np.arange(len(indexes))
    colours = ["#9ECAE1", "#3182BD"]
    for rank, group_label in enumerate(label_order):
        # Shift each series half a bar left/right of the tick.
        shifted = positions + (rank - 0.5) * bar_width
        bars.bar(
            shifted, contig[rank], bar_width, label=group_label, color=colours[rank]
        )
    bars.set_xticks(positions)
    bars.set_xticklabels(indexes, rotation=45, ha="right")
    bars.set_ylabel("proportion")
    bars.set_title(
        f"Symmetric Divergence Score: {obs_score:.3f} | p.val.adj: {p_val:.3f}"
    )
    bars.legend(title=f"Class: {subtype_variable}")

    # --- right panel: null distribution vs observed score ---
    strip = panels[1]
    # Fixed seed: the horizontal jitter is cosmetic and kept reproducible.
    spread = np.random.default_rng(0).uniform(-0.1, 0.1, size=len(null_scores))
    strip.scatter(spread, null_scores, color="#D5BADB", alpha=0.5, s=15)
    strip.axhline(np.median(null_scores), color="#86608E", lw=1)
    strip.scatter([0], [obs_score], color="#F08080", s=80, zorder=5)
    strip.set_xticks([])
    strip.set_ylabel("symmetric divergence")

    plt.tight_layout()
    save_to_pdf(fig, pdf_file)
221
+
222
+
223
def _one_class_test(
    metadata,
    sample_id,
    covariable,
    permutations,
    rng,
    plot=True,
    verbose=True,
    pdf_file=None,
):
    """Permutation test for sample-level proportion variation in a single condition.

    Samples are repeatedly split at random into two pseudo-groups; each
    sample contributes a bootstrap draw from its own cells, and the symmetric
    divergence between the two pseudo-groups is recorded. The spread of those
    scores is summarised and returned.

    Departs from R's `oneClassTest` in one important way: the null draws each
    sample's cells from THAT SAMPLE'S own covariable distribution, not from
    the global pool. The R version sampled every cell from the global pool,
    which collapses both random pseudo-groups onto the same global
    distribution and produces a null that is essentially zero — so the user
    never observes any spread no matter how heterogeneous the real samples
    are. Drawing from per-sample pools preserves real per-sample structure
    and lets random partitions of those samples yield a null distribution
    whose spread reflects across-sample heterogeneity, which is what this
    test is meant to assess.

    Parameters
    ----------
    metadata
        Table providing the ``sample_id`` column (read as strings).
    sample_id
        Name of the sample column in ``metadata``.
    covariable
        Array of covariable values, positionally aligned with ``metadata``.
    permutations
        The number of iterations is ``max(permutations * 10, 100)``.
    rng
        ``numpy.random.Generator`` used for all draws (and the plot jitter).
    plot, verbose, pdf_file
        Plot the null distribution / print progress / target passed to
        ``save_to_pdf``. A plotting failure is caught and reported when
        ``verbose`` is set.

    Returns
    -------
    dict with keys ``cv``, ``variation``, ``relative_iqr``,
    ``mean.random.entropy``, ``sd.random.entropy``, ``median.random.entropy``.

    Raises
    ------
    ValueError
        If fewer than two distinct samples are present.
    """
    samples = metadata[sample_id].astype(str).to_numpy()
    obs_tab = _table(samples, covariable)
    indexes = list(obs_tab.columns)
    n_per_sample = pd.Series(samples).value_counts()
    unique_samples = list(n_per_sample.index)
    if len(unique_samples) < 2:
        raise ValueError(
            "1-class entropy test needs at least 2 samples in `sample_id`."
        )

    sqrt_n = np.sqrt(n_per_sample)
    sqrt_n[sqrt_n == 0] = 10
    cell_crowd = sqrt_n.to_dict()
    # Draw size per sample (truncated sqrt of its cell count, at least 1);
    # constant across iterations, so computed once.
    draw_sizes = {s: max(int(size), 1) for s, size in cell_crowd.items()}

    # Build a per-sample pool of covariable values (preserves the real cell
    # composition of each sample for the null draw).
    sample_pools = {s: covariable[samples == s] for s in unique_samples}
    n_g1 = max(1, round(len(unique_samples) / 2))

    # Mirror R's iteration count: seq(100) * seq(permutations/10) = 10*perms.
    n_iter = max(int(permutations) * 10, 100)
    null_scores = np.empty(n_iter)
    if verbose:
        print(f"Starting 1-class Monte-Carlo simulation: {n_iter} iterations")

    for i in range(n_iter):
        perm = rng.permutation(len(unique_samples))
        # First n_g1 shuffled samples form group1, the remainder group2.
        partition = (("group1", perm[:n_g1]), ("group2", perm[n_g1:]))
        pieces_cov, pieces_grp = [], []
        for group_name, members in partition:
            for k in members:
                s = unique_samples[k]
                pool = sample_pools[s]
                if len(pool) == 0:
                    continue
                n = draw_sizes[s]
                pieces_cov.append(rng.choice(pool, size=n, replace=True))
                pieces_grp.append(np.repeat(group_name, n))
        rand_tab = _table(np.concatenate(pieces_grp), np.concatenate(pieces_cov))
        p = _proportions_arcsin(rand_tab, ["group1", "group2"], indexes)
        null_scores[i] = _distance_surprise(p[0], p[1])

    mean_null = float(null_scores.mean())
    sd_null = float(null_scores.std(ddof=1))
    median_null = float(np.median(null_scores))
    cv = float(sd_null / mean_null * 100) if mean_null > 0 else float("inf")
    if median_null > 0:
        q25, q75 = np.percentile(null_scores, 25), np.percentile(null_scores, 75)
        relative_iqr = float((q75 - q25) / median_null)
    else:
        relative_iqr = float("nan")

    # Qualitative label from the coefficient of variation (in percent).
    if cv <= 35:
        variation = "Low"
    elif cv <= 50:
        variation = "Medium"
    else:
        variation = "High"

    if verbose:
        print(f"Coefficient of Variation: {cv:.2f} %")
        print(f"Variation across samples is considered: {variation}")
        print(f"Relative IQR: {relative_iqr:.3f}")

    if plot:
        try:
            import matplotlib.pyplot as plt
            from ._utils import save_to_pdf

            fig, ax = plt.subplots(figsize=(3.5, 5))
            jitter = rng.uniform(-0.1, 0.1, size=len(null_scores))
            ax.scatter(jitter, null_scores, color="#D5BADB", alpha=0.5, s=15)
            ax.axhline(median_null, color="#86608E", lw=1)
            ax.set_xlim(-0.5, 0.5)
            lo = float(min(0.0, null_scores.min()))
            hi = float(null_scores.max())
            pad = max(1e-3, 0.1 * (hi - lo))
            ax.set_ylim(lo, hi + pad)
            ax.set_xticks([])
            ax.set_ylabel("symmetric divergence (null)")
            ax.set_title(
                f"1-class null distribution\n"
                f"median={median_null:.4f}  CV={cv:.1f}% ({variation})"
            )
            plt.tight_layout()
            save_to_pdf(fig, pdf_file)
        except Exception as e:  # noqa: BLE001
            # Plotting stays best-effort, but the reason is no longer hidden
            # (same reporting as the 2-group path of entropy_score).
            if verbose:
                print(f"(Plot skipped: {e})")

    return {
        "cv": cv,
        "variation": variation,
        "relative_iqr": relative_iqr,
        "mean.random.entropy": mean_null,
        "sd.random.entropy": sd_null,
        "median.random.entropy": median_null,
    }
@@ -0,0 +1,330 @@
1
+ """Main `lots_of_cells` function: 2-group Monte-Carlo and >2-group Goodman & Kruskal gamma."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Optional, Sequence
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ from ._stats import (
10
+ _proportions_from_table,
11
+ _table,
12
+ asrt,
13
+ cell_to_gamma,
14
+ cell_to_gamma_original,
15
+ cell_to_montecarlo,
16
+ )
17
+ from ._utils import get_metadata
18
+
19
+
20
def _bh_fdr(pvals: np.ndarray) -> np.ndarray:
    """Benjamini-Hochberg FDR. Equivalent to R p.adjust(., 'fdr').

    Parameters
    ----------
    pvals
        1-D array-like of p-values; NaN marks a missing value.

    Returns
    -------
    Float array of the same length with the adjusted p-values in the input
    order, clipped to [0, 1]. As in R, missing values are excluded from the
    number of tests and stay NaN in the output; without this, a single NaN
    would propagate through the running minimum and blank every result.
    """
    p = np.asarray(pvals, dtype=float)
    out = np.full(p.shape, np.nan)
    valid = ~np.isnan(p)
    n = int(valid.sum())
    if n == 0:
        return out

    observed = p[valid]
    order = np.argsort(observed)
    ranked = observed[order] * n / np.arange(1, n + 1)
    # cummin from the right enforces monotonicity of the adjusted values
    adj = np.minimum.accumulate(ranked[::-1])[::-1]
    adjusted = np.empty(n)
    adjusted[order] = np.clip(adj, 0, 1)
    out[valid] = adjusted
    return out
31
+
32
+
33
def _gamma_godkrus(nc: np.ndarray, nd: np.ndarray, denom: float) -> np.ndarray:
    """Gamma-style statistic: ``(nc - nd) / exp(log(denom))``.

    ``nc`` and ``nd`` are the two count arrays being contrasted (presumably
    concordant / discordant pair counts — confirm against the ``_stats``
    helpers). The original formulation divides by ``exp(mean(log(N)))``;
    ``denom`` is a scalar here, so the mean is over a single value.
    """
    scale = np.exp(np.log(denom))
    difference = nc - nd
    return difference / scale
36
+
37
+
38
def lots_of_cells(
    sc_object,
    main_variable: str,
    subtype_variable: str,
    label_order: Sequence[str],
    sample_id: Optional[str] = None,
    permutations: int = 1000,
    seed: Optional[int] = None,
    table: Optional[str] = None,
    plot: bool = True,
    verbose: bool = True,
    pdf_file: Optional[str] = None,
) -> pd.DataFrame:
    """Compute proportion tests on single-cell metadata.

    Parameters
    ----------
    sc_object
        AnnData / SpatialData / MuData / pandas.DataFrame.
    main_variable
        Column in ``.obs`` (or DataFrame) with the main grouping (e.g.
        ``"condition"``).
    subtype_variable
        Column with the covariable to test (e.g. ``"cell_type"``).
    label_order
        Order of labels in ``main_variable`` to compare.
        - 2 labels  → log2 fold-change of arcsin-sqrt proportions, with
          Monte-Carlo null distribution.
        - >2 labels → Goodman & Kruskal's gamma rank correlation.
    sample_id
        Optional column with sample IDs. When set, the null distribution
        accounts for per-sample heterogeneity. Only used with 2 labels.
    permutations
        Number of Monte-Carlo permutations.
    seed
        Random seed for reproducibility.
    table
        For SpatialData/MuData with multiple tables/modalities.
    plot
        If True and 2 labels, show the abundance test plot.
    verbose
        If True, print progress messages.
    pdf_file
        Forwarded to the abundance plot of the 2-label path.

    Returns
    -------
    pandas.DataFrame with one row per covariable level.

    Raises
    ------
    ValueError
        If a label of ``label_order`` is absent from ``main_variable`` or
        fewer than 2 labels are given.
    """
    metadata = get_metadata(sc_object, table=table)

    main_vals = metadata[main_variable].astype(str).to_numpy()
    # Deduplicate the observed labels once instead of once per requested
    # label (and once more per label when building the error message).
    known_labels = np.unique(main_vals)
    missing = [label for label in label_order if label not in known_labels]
    if missing:
        raise ValueError(f"Some groups in label_order not found in data: {missing}")

    # Keep only the cells belonging to the requested groups.
    mask = np.isin(main_vals, list(label_order))
    metadata = metadata.loc[mask].copy()
    groups = metadata[main_variable].astype(str).to_numpy()
    covariable = metadata[subtype_variable].astype(str).to_numpy()

    rng = np.random.default_rng(seed)
    # Lower bound on the per-group draw size used by both paths.
    min_cells = 10

    if len(label_order) < 2:
        raise ValueError("label_order must have at least 2 entries.")

    if len(label_order) > 2:
        return _gamma_path(
            covariable, groups, label_order, permutations, min_cells, rng, verbose
        )

    return _montecarlo_path(
        metadata,
        covariable,
        groups,
        label_order,
        sample_id,
        main_variable,
        subtype_variable,
        permutations,
        min_cells,
        rng,
        verbose,
        plot,
        pdf_file,
    )
121
+
122
+
123
+ # --- 2-condition Monte Carlo path ----------------------------------------------------
124
+
125
def _montecarlo_path(
    metadata,
    covariable,
    groups,
    label_order,
    sample_id,
    main_variable,
    subtype_variable,
    permutations,
    min_cells,
    rng,
    verbose,
    plot,
    pdf_file=None,
):
    """Two-group test: observed log2 fold-change against a Monte-Carlo null.

    The observed statistic is ``log2(asrt(p_first) / asrt(p_second))`` per
    covariable level. ``cell_to_montecarlo`` is called ``permutations`` times;
    its first return value feeds the null distribution (p-values, standard
    deviation) and its second the 95 % interval.

    When ``sample_id`` is given, the resampling pool is first extended with
    synthetic samples bootstrapped per condition, and one draw size per
    (condition, sample) is used; otherwise one draw size per condition.
    Draw sizes are ``sqrt(n_cells)`` floored at ``min_cells``.

    Returns
    -------
    pandas.DataFrame indexed by covariable level with columns ``groupFC``,
    ``percent_in_<label>`` (x2), ``p.adj``, ``sd.montecarlo``, ``CI95low``
    and ``CI95high``.
    """
    if verbose:
        print(f"Only 2 groups detected. Computing FC for {label_order[0]} vs {label_order[1]}")

    if sample_id is not None:
        if verbose:
            print(f"Additional sub-level for testing: {sample_id}")
        samples = metadata[sample_id].astype(str).to_numpy()
        n_per_sample = pd.crosstab(pd.Series(groups), pd.Series(samples)).reindex(label_order)

        # Synthetic samples (per-condition resampling) — mirrors R lotsOfCells.R
        synth_meta = metadata[[main_variable, subtype_variable, sample_id]].copy()
        mult_factor = 2
        new_samples = int(round((n_per_sample != 0).sum(axis=1).mean())) * mult_factor

        # Size bounds and resampling pool depend only on the condition, so
        # they are prepared once rather than for every synthetic sample.
        sampling_plan = {}
        for cond in label_order:
            row = n_per_sample.loc[cond]
            nonzero = row[row != 0]
            if len(nonzero) == 0:
                continue
            sampling_plan[cond] = (
                int(nonzero.min()),
                int(nonzero.max()) + 1,  # exclusive upper bound for rng.integers
                covariable[groups == cond],
            )

        synth_rows = []
        for i in range(1, new_samples + 1):
            for cond in label_order:
                if cond not in sampling_plan:
                    continue
                low, high, pool = sampling_plan[cond]
                n = int(rng.integers(low, high))
                synth_cov = rng.choice(pool, size=n, replace=True)
                synth_rows.append(pd.DataFrame({
                    main_variable: cond,
                    subtype_variable: synth_cov,
                    sample_id: f"synthetic_sample_{cond}_{i}",
                }))
        if synth_rows:
            synth_meta = pd.concat([synth_meta, *synth_rows], ignore_index=True)

        groups_synth = synth_meta[main_variable].astype(str).to_numpy()
        covariable_synth = synth_meta[subtype_variable].astype(str).to_numpy()

        cell_crowd = {}
        for cond in label_order:
            row = n_per_sample.loc[cond]
            nonzero = row[row != 0].to_numpy()
            cell_crowd[cond] = list(np.maximum(np.sqrt(nonzero), min_cells).astype(int))
    else:
        groups_synth = groups
        covariable_synth = covariable
        counts_per_group = pd.Series(groups).value_counts().to_dict()
        cell_crowd = {
            label: int(round(max(np.sqrt(counts_per_group.get(label, 0)), min_cells)))
            for label in label_order
        }

    # Observed fold-change
    obs_tab = _table(groups, covariable)
    indexes = list(obs_tab.columns)
    p_obs = _proportions_from_table(obs_tab, label_order, list(obs_tab.columns), pseudo=True)
    obs_fc = np.log2(asrt(p_obs[0]) / asrt(p_obs[1]))

    if verbose:
        print("- Starting Monte-Carlo simulation of fold changes")

    null_fcs = np.empty((permutations, len(indexes)))
    real_fcs = np.empty((permutations, len(indexes)))
    for i in range(permutations):
        null_fcs[i], real_fcs[i] = cell_to_montecarlo(
            covariable_synth, groups_synth, label_order, indexes, cell_crowd, rng
        )

    # One-sided empirical p-values (with the +1 correction), taken on the
    # side the observed fold-change points to.
    higher = (np.sum(null_fcs >= obs_fc, axis=0) + 1) / (permutations + 1)
    lower = (np.sum(null_fcs <= obs_fc, axis=0) + 1) / (permutations + 1)
    p_vals = np.where(obs_fc > 0, higher, lower)
    p_adj = _bh_fdr(p_vals)
    sd_mc = null_fcs.std(axis=0, ddof=1)
    ci_low = np.quantile(real_fcs, 0.025, axis=0)
    ci_high = np.quantile(real_fcs, 0.975, axis=0)

    pct1 = np.round(p_obs[0], 3)
    pct2 = np.round(p_obs[1], 3)
    table_results = pd.DataFrame(
        {
            "groupFC": obs_fc,
            f"percent_in_{label_order[0]}": pct1,
            f"percent_in_{label_order[1]}": pct2,
            "p.adj": np.round(p_adj, 5),
            "sd.montecarlo": sd_mc,
            "CI95low": ci_low,
            "CI95high": ci_high,
        },
        index=indexes,
    )

    # Ensure CIs encompass observed
    bad_low = ~(table_results["CI95low"] < table_results["groupFC"])
    table_results.loc[bad_low, "CI95low"] = table_results.loc[bad_low, "groupFC"]
    bad_high = ~(table_results["CI95high"] > table_results["groupFC"])
    table_results.loc[bad_high, "CI95high"] = table_results.loc[bad_high, "groupFC"]

    if plot:
        try:
            from .plots import plot_abundance_test
            plot_abundance_test(
                table_results,
                subtype_variable=subtype_variable,
                pdf_file=pdf_file,
            )
        except Exception as e:  # noqa: BLE001
            # Plotting is best-effort: the table is returned regardless.
            if verbose:
                print(f"(Plot skipped: {e})")

    return table_results
248
+
249
+
250
+ # --- >2-condition Goodman-Kruskal gamma path -----------------------------------------
251
+
252
def _gamma_path(covariable, groups, label_order, permutations, min_cells, rng, verbose):
    """More-than-two-group test based on a gamma rank statistic.

    Three rounds of resampling share one accumulation scheme (sum the two
    count vectors returned by the sampler, then apply ``_gamma_godkrus``):

    * observed gamma — ``permutations`` calls to ``cell_to_gamma_original``;
    * 95 % interval — 10 batches of 100 calls to ``cell_to_gamma_original``;
    * null gammas — ``permutations`` batches of 10 calls to ``cell_to_gamma``.

    Returns a DataFrame indexed by covariable level with ``groupGammaCor``,
    one ``percent_in_<label>`` column per label, ``p.adj``, ``CI95low`` and
    ``CI95high``.
    """
    if verbose:
        print(
            "More than 2 groups detected. Computing Goodman-Kruskal gamma rank "
            f"correlation in order: {' vs '.join(label_order)}"
        )

    n_labels = len(label_order)
    group_sizes = pd.Series(groups).value_counts().to_dict()
    # Draw size per group: sqrt of its cell count, floored at min_cells.
    cell_crowd = {}
    for lab in label_order:
        cell_crowd[lab] = int(round(max(np.sqrt(group_sizes.get(lab, 0)), min_cells)))
    kendall_denom = (n_labels * (n_labels - 1)) / 2
    rank_index = np.arange(1, n_labels + 1)

    obs_tab = _table(groups, covariable)
    indexes = list(obs_tab.columns)
    n_cov = len(indexes)

    def _batch_gamma(sampler, repeats):
        # Sum the sampler's two count vectors over `repeats` draws and turn
        # the totals into a gamma value per covariable level.
        total_c = np.zeros(n_cov)
        total_d = np.zeros(n_cov)
        for _ in range(repeats):
            nc, nd = sampler(
                covariable, groups, label_order, indexes, cell_crowd, rank_index, rng
            )
            total_c += nc
            total_d += nd
        return _gamma_godkrus(total_c, total_d, kendall_denom * repeats)

    # Observed gamma: aggregate over `permutations` subsamplings of the original data
    obs_gamma = _batch_gamma(cell_to_gamma_original, permutations)

    # Confidence interval via 10 sub-samples of size 100
    sub_gammas = np.empty((10, n_cov))
    for batch in range(10):
        sub_gammas[batch] = _batch_gamma(cell_to_gamma_original, 100)
    ci_low = np.nanquantile(sub_gammas, 0.025, axis=0)
    ci_high = np.nanquantile(sub_gammas, 0.975, axis=0)

    if verbose:
        print("- Starting gamma rank permutation analysis, this can take a while...")

    n_random_observations = 10
    null_gamma = np.empty((permutations, n_cov))
    for draw in range(permutations):
        null_gamma[draw] = _batch_gamma(cell_to_gamma, n_random_observations)

    # One-sided empirical p-values on the side the observed gamma points to;
    # NaN comparisons are silenced and NaN p-values become 1 before the FDR.
    with np.errstate(invalid="ignore"):
        frac_above = np.sum(null_gamma >= obs_gamma, axis=0) / permutations
        frac_below = np.sum(null_gamma <= obs_gamma, axis=0) / permutations
        p_vals = np.where(obs_gamma > 0, frac_above, frac_below)
    p_adj = _bh_fdr(np.nan_to_num(p_vals, nan=1.0))

    # Per-condition proportions (unnormalised contig table -> per-row proportions)
    contig_tab = _table(groups, covariable).reindex(label_order)
    row_totals = contig_tab.sum(axis=1)
    proportions = contig_tab.div(row_totals, axis=0).reindex(
        index=label_order, columns=indexes
    )

    columns = {"groupGammaCor": np.round(obs_gamma, 4)}
    for lab in label_order:
        columns[f"percent_in_{lab}"] = np.round(proportions.loc[lab].values, 3)
    columns["p.adj"] = np.round(p_adj, 5)
    columns["CI95low"] = np.round(ci_low, 4)
    columns["CI95high"] = np.round(ci_high, 4)
    return pd.DataFrame(columns, index=indexes)