lotsofcells 0.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lotsofcells-0.0.0/.github/workflows/publish.yml +127 -0
- lotsofcells-0.0.0/.gitignore +8 -0
- lotsofcells-0.0.0/PKG-INFO +21 -0
- lotsofcells-0.0.0/example.py +104 -0
- lotsofcells-0.0.0/lotsofcells/__init__.py +39 -0
- lotsofcells-0.0.0/lotsofcells/__pycache__/__init__.cpython-39.pyc +0 -0
- lotsofcells-0.0.0/lotsofcells/__pycache__/_stats.cpython-39.pyc +0 -0
- lotsofcells-0.0.0/lotsofcells/__pycache__/_utils.cpython-39.pyc +0 -0
- lotsofcells-0.0.0/lotsofcells/__pycache__/entropy.cpython-39.pyc +0 -0
- lotsofcells-0.0.0/lotsofcells/__pycache__/lotsofcells.cpython-39.pyc +0 -0
- lotsofcells-0.0.0/lotsofcells/__pycache__/plots.cpython-39.pyc +0 -0
- lotsofcells-0.0.0/lotsofcells/_stats.py +279 -0
- lotsofcells-0.0.0/lotsofcells/_utils.py +213 -0
- lotsofcells-0.0.0/lotsofcells/entropy.py +354 -0
- lotsofcells-0.0.0/lotsofcells/lotsofcells.py +360 -0
- lotsofcells-0.0.0/lotsofcells/plots.py +681 -0
- lotsofcells-0.0.0/lotsofcells.egg-info/PKG-INFO +21 -0
- lotsofcells-0.0.0/lotsofcells.egg-info/SOURCES.txt +24 -0
- lotsofcells-0.0.0/lotsofcells.egg-info/dependency_links.txt +1 -0
- lotsofcells-0.0.0/lotsofcells.egg-info/requires.txt +17 -0
- lotsofcells-0.0.0/lotsofcells.egg-info/top_level.txt +1 -0
- lotsofcells-0.0.0/pyproject.toml +31 -0
- lotsofcells-0.0.0/readme.md +535 -0
- lotsofcells-0.0.0/setup.cfg +4 -0
- lotsofcells-0.0.0/test_pdf_and_oneclass.py +124 -0
- lotsofcells-0.0.0/test_plots.py +97 -0
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
# Push a tag like `v0.4.0` to trigger a build + PyPI publish + GitHub release.
|
|
4
|
+
# Can also be run manually (workflow_dispatch) for dry-runs without publishing.
|
|
5
|
+
|
|
6
|
+
on:
|
|
7
|
+
push:
|
|
8
|
+
tags:
|
|
9
|
+
- "v*"
|
|
10
|
+
workflow_dispatch:
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
# ----------------------------------------------------------------------
|
|
14
|
+
# 1. Sanity tests on every supported Python version.
|
|
15
|
+
# ----------------------------------------------------------------------
|
|
16
|
+
test:
|
|
17
|
+
name: Test (py${{ matrix.python-version }})
|
|
18
|
+
runs-on: ubuntu-latest
|
|
19
|
+
strategy:
|
|
20
|
+
fail-fast: false
|
|
21
|
+
matrix:
|
|
22
|
+
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
|
23
|
+
steps:
|
|
24
|
+
- uses: actions/checkout@v4
|
|
25
|
+
|
|
26
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
27
|
+
uses: actions/setup-python@v5
|
|
28
|
+
with:
|
|
29
|
+
python-version: ${{ matrix.python-version }}
|
|
30
|
+
cache: pip
|
|
31
|
+
|
|
32
|
+
- name: Install package
|
|
33
|
+
run: |
|
|
34
|
+
python -m pip install --upgrade pip
|
|
35
|
+
pip install -e .
|
|
36
|
+
|
|
37
|
+
- name: Import smoke test
|
|
38
|
+
run: |
|
|
39
|
+
python -c "import lotsofcells; print('lotsofcells', lotsofcells.__version__)"
|
|
40
|
+
python -c "from lotsofcells import lots_of_cells, entropy_score, bar_chart, waffle_chart, polar_chart, density_chart, dynamics_chart, plot_abundance_test; print('imports OK')"
|
|
41
|
+
|
|
42
|
+
# ----------------------------------------------------------------------
|
|
43
|
+
# 2. Build sdist + wheel.
|
|
44
|
+
# ----------------------------------------------------------------------
|
|
45
|
+
build:
|
|
46
|
+
name: Build distribution
|
|
47
|
+
needs: test
|
|
48
|
+
runs-on: ubuntu-latest
|
|
49
|
+
steps:
|
|
50
|
+
- uses: actions/checkout@v4
|
|
51
|
+
|
|
52
|
+
- uses: actions/setup-python@v5
|
|
53
|
+
with:
|
|
54
|
+
python-version: "3.11"
|
|
55
|
+
|
|
56
|
+
- name: Install build backend
|
|
57
|
+
run: |
|
|
58
|
+
python -m pip install --upgrade pip
|
|
59
|
+
pip install build
|
|
60
|
+
|
|
61
|
+
- name: Build sdist + wheel
|
|
62
|
+
run: python -m build
|
|
63
|
+
|
|
64
|
+
- name: Check distributions
|
|
65
|
+
run: |
|
|
66
|
+
pip install twine
|
|
67
|
+
twine check dist/*
|
|
68
|
+
|
|
69
|
+
- name: Upload distributions
|
|
70
|
+
uses: actions/upload-artifact@v4
|
|
71
|
+
with:
|
|
72
|
+
name: python-package-distributions
|
|
73
|
+
path: dist/
|
|
74
|
+
retention-days: 7
|
|
75
|
+
|
|
76
|
+
# ----------------------------------------------------------------------
|
|
77
|
+
# 3. Publish to PyPI using OIDC trusted publishing.
|
|
78
|
+
# Only runs on real tag pushes — not on workflow_dispatch dry runs.
|
|
79
|
+
# Requires a one-time trusted-publisher configuration for this project on PyPI.
|
|
80
|
+
# ----------------------------------------------------------------------
|
|
81
|
+
publish-pypi:
|
|
82
|
+
name: Publish to PyPI
|
|
83
|
+
needs: build
|
|
84
|
+
runs-on: ubuntu-latest
|
|
85
|
+
if: startsWith(github.ref, 'refs/tags/v')
|
|
86
|
+
environment:
|
|
87
|
+
name: pypi
|
|
88
|
+
url: https://pypi.org/p/lotsofcells
|
|
89
|
+
permissions:
|
|
90
|
+
id-token: write # required for OIDC trusted publishing
|
|
91
|
+
steps:
|
|
92
|
+
- name: Download distributions
|
|
93
|
+
uses: actions/download-artifact@v4
|
|
94
|
+
with:
|
|
95
|
+
name: python-package-distributions
|
|
96
|
+
path: dist/
|
|
97
|
+
|
|
98
|
+
- name: Publish to PyPI
|
|
99
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
100
|
+
|
|
101
|
+
# ----------------------------------------------------------------------
|
|
102
|
+
# 4. Create a GitHub Release with the built sdist + wheel attached.
|
|
103
|
+
# ----------------------------------------------------------------------
|
|
104
|
+
github-release:
|
|
105
|
+
name: GitHub Release
|
|
106
|
+
needs: publish-pypi
|
|
107
|
+
runs-on: ubuntu-latest
|
|
108
|
+
permissions:
|
|
109
|
+
contents: write
|
|
110
|
+
steps:
|
|
111
|
+
- uses: actions/checkout@v4
|
|
112
|
+
|
|
113
|
+
- name: Download distributions
|
|
114
|
+
uses: actions/download-artifact@v4
|
|
115
|
+
with:
|
|
116
|
+
name: python-package-distributions
|
|
117
|
+
path: dist/
|
|
118
|
+
|
|
119
|
+
- name: Create release
|
|
120
|
+
env:
|
|
121
|
+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
122
|
+
run: |
|
|
123
|
+
gh release create "${GITHUB_REF_NAME}" \
|
|
124
|
+
dist/* \
|
|
125
|
+
--repo "${GITHUB_REPOSITORY}" \
|
|
126
|
+
--title "${GITHUB_REF_NAME}" \
|
|
127
|
+
--generate-notes
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lotsofcells
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Summary: Python port of lotsOfCells: proportion-test statistics and visualization on single-cell metadata. Compatible with scanpy/AnnData and spatial transcriptomics.
|
|
5
|
+
Author: Oscar Gonzalez-Velasco
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: numpy>=1.23
|
|
10
|
+
Requires-Dist: pandas>=1.5
|
|
11
|
+
Requires-Dist: scipy>=1.9
|
|
12
|
+
Requires-Dist: matplotlib>=3.6
|
|
13
|
+
Requires-Dist: anndata>=0.9
|
|
14
|
+
Provides-Extra: scanpy
|
|
15
|
+
Requires-Dist: scanpy>=1.9; extra == "scanpy"
|
|
16
|
+
Provides-Extra: spatial
|
|
17
|
+
Requires-Dist: spatialdata>=0.1; extra == "spatial"
|
|
18
|
+
Provides-Extra: mudata
|
|
19
|
+
Requires-Dist: mudata>=0.2; extra == "mudata"
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""End-to-end example mirroring the R README.
|
|
2
|
+
|
|
3
|
+
Run from the `python/` folder:
|
|
4
|
+
python -m pip install -e .
|
|
5
|
+
python example.py
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
import lotsofcells as loc
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def simulated_metadata() -> pd.DataFrame:
    """Build the simulated per-cell metadata table (the R README dataset).

    Every sample is described by its id, time point, condition and the number
    of cells of each cell type. The returned frame has one row per simulated
    cell with the columns ``sample``, ``cell_type``, ``times`` and
    ``condition``.
    """
    # sample id -> (time point, condition, {cell type: number of cells})
    design = {
        "A": ("time 0h", "wt", {"CellTypeA": 700, "CellTypeB": 300, "CellTypeC": 500, "CellTypeD": 1000}),
        "B": ("time 0h", "mut", {"CellTypeA": 1700, "CellTypeB": 350, "CellTypeC": 550, "CellTypeD": 800}),
        "C": ("time 2h", "wt", {"CellTypeA": 1200, "CellTypeB": 200, "CellTypeC": 420, "CellTypeD": 800}),
        "D": ("time 2h", "mut", {"CellTypeA": 500, "CellTypeB": 1000, "CellTypeC": 10, "CellTypeD": 1200}),
        "E": ("time 4h", "wt", {"CellTypeA": 550, "CellTypeB": 990, "CellTypeC": 10, "CellTypeD": 1100}),
        "F": ("time 4h", "mut", {"CellTypeA": 1350, "CellTypeB": 590, "CellTypeC": 300, "CellTypeD": 600}),
    }
    records = []
    for sample, (time_point, condition, cells_per_type) in design.items():
        for cell_type, n_cells in cells_per_type.items():
            # One identical record per simulated cell of this type.
            records.extend([(sample, cell_type, time_point, condition)] * n_cells)
    return pd.DataFrame(records, columns=["sample", "cell_type", "times", "condition"])
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def main():
    """Run every example analysis on the simulated metadata and print results.

    In order: a two-condition test (mut vs wt), a three-level gamma rank test
    over the time points, the symmetric divergence score, and finally the same
    two-condition test driven from an ``AnnData`` object instead of a plain
    data frame. The AnnData step is skipped when ``anndata`` cannot be
    imported.
    """
    rng = np.random.default_rng(0)
    meta = simulated_metadata()
    print("Metadata head:")
    print(meta.head())
    print("Shape:", meta.shape)

    print("\n--- 2-condition test (mut vs wt) ---")
    res = loc.lots_of_cells(
        meta,
        main_variable="condition",
        subtype_variable="cell_type",
        sample_id="sample",
        label_order=["mut", "wt"],
        permutations=300,
        seed=0,
        plot=False,
    )
    print(res)

    print("\n--- >2-condition gamma rank (time 0h, 2h, 4h) ---")
    gamma = loc.lots_of_cells(
        meta,
        main_variable="times",
        subtype_variable="cell_type",
        sample_id="sample",
        label_order=["time 0h", "time 2h", "time 4h"],
        permutations=100,
        seed=0,
        plot=False,
    )
    print(gamma)

    print("\n--- Symmetric divergence score (mut vs wt) ---")
    ent = loc.entropy_score(
        meta,
        main_variable="condition",
        subtype_variable="cell_type",
        label_order=["mut", "wt"],
        permutations=200,
        seed=0,
        plot=False,
    )
    print(ent)

    print("\n--- AnnData round-trip ---")
    # Guard only the import: an ImportError raised while running the analysis
    # itself must propagate instead of being reported as "anndata not
    # installed".
    try:
        import anndata as ad
    except ImportError:
        print("(anndata not installed — skipping AnnData round-trip)")
    else:
        adata = ad.AnnData(np.zeros((len(meta), 1), dtype=float), obs=meta.copy())
        adata.obs_names = adata.obs_names.astype(str)
        # Simulate a numerical feature so density_chart works on .obs
        adata.obs["n_features_RNA"] = np.abs(
            rng.normal(loc=2500, scale=600, size=adata.n_obs)
        )
        res2 = loc.lots_of_cells(
            adata,
            main_variable="condition",
            subtype_variable="cell_type",
            sample_id="sample",
            label_order=["mut", "wt"],
            permutations=100,
            seed=0,
            plot=False,
        )
        # Sanity check: both input types must yield the same number of rows.
        assert res2.shape[0] == res.shape[0]
        print("AnnData path OK. Same #covariables in result.")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""lotsofcells: proportion-test statistics and visualization on single-cell metadata.
|
|
2
|
+
|
|
3
|
+
Python port of the R package `lotsOfCells`, designed for the scanpy / AnnData
|
|
4
|
+
framework. Compatible with single-cell (`AnnData`) and spatial transcriptomics
|
|
5
|
+
(`SpatialData` / `MuData`) objects, since metadata is read from `.obs`.
|
|
6
|
+
|
|
7
|
+
References
|
|
8
|
+
----------
|
|
9
|
+
Óscar González-Velasco; lotsOfCells: data visualization and statistics of
|
|
10
|
+
single cell metadata. bioRxiv 2024.05.23.595582;
|
|
11
|
+
https://doi.org/10.1101/2024.05.23.595582
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from ._utils import get_metadata, get_palette
|
|
15
|
+
from .lotsofcells import lots_of_cells
|
|
16
|
+
from .entropy import entropy_score
|
|
17
|
+
from .plots import (
|
|
18
|
+
bar_chart,
|
|
19
|
+
waffle_chart,
|
|
20
|
+
polar_chart,
|
|
21
|
+
density_chart,
|
|
22
|
+
dynamics_chart,
|
|
23
|
+
plot_abundance_test,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
"get_metadata",
|
|
28
|
+
"get_palette",
|
|
29
|
+
"lots_of_cells",
|
|
30
|
+
"entropy_score",
|
|
31
|
+
"bar_chart",
|
|
32
|
+
"waffle_chart",
|
|
33
|
+
"polar_chart",
|
|
34
|
+
"density_chart",
|
|
35
|
+
"dynamics_chart",
|
|
36
|
+
"plot_abundance_test",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
# NOTE(review): the distribution metadata shipped with this release (PKG-INFO)
# declares version 0.0.0 — confirm which value is intended and keep them in sync.
__version__ = "0.3.0"
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
"""Internal statistical primitives.
|
|
2
|
+
|
|
3
|
+
Direct ports of the R helpers `cellToGamma`, `cellToGammaOriginal` and
|
|
4
|
+
`cellToMontecarlo`. Implementation choices (pseudocounts, transforms) match
|
|
5
|
+
the R version so results are comparable (`geom_mean` deliberately differs; see its docstring).
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Dict, List, Sequence, Tuple, Union
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# --- Transformations used everywhere ---------------------------------------------------
|
|
16
|
+
|
|
17
|
+
def pseudo_count(counts: np.ndarray) -> np.ndarray:
    """Shift every count by one half.

    This is the `counts + 0.5` pseudocount of the R implementation
    (lotsOfCells.R).
    """
    half = 0.5
    return counts + half
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def pseudo_count_arcsin(counts: np.ndarray) -> np.ndarray:
    """Return `counts + sqrt(counts^2 + 1)`.

    This is the pseudocount used by the R implementation in entropyScore.R.
    """
    squared = counts * counts
    return counts + np.sqrt(squared + 1)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def asrt(p: np.ndarray) -> np.ndarray:
    """Apply the arcsin square-root transform (Anscombe-style).

    Inputs are clipped to the closed interval [0, 1] before the square root
    is taken, so out-of-range values map to the transform's end points.
    """
    bounded = np.clip(p, 0, 1)
    root = np.sqrt(bounded)
    return np.arcsin(root)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def logit(f: np.ndarray) -> np.ndarray:
    """Return the natural-log odds `log(f / (1 - f))` of ``f``."""
    odds = f / (1 - f)
    return np.log(odds)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def geom_mean(x: np.ndarray) -> float:
    """Return the geometric mean of the strictly positive entries of ``x``.

    Entries that are not greater than zero are left out of the mean; when no
    entry is positive the result is ``0.0``.

    This is a deliberate departure from R's literal ``exp(mean(log(x)))``,
    which becomes 0 as soon as a single entry is 0. In the symmetric
    divergence used by `entropyScore`, a zero term ``|p * log2(p/q)|`` means
    ``p[i] == q[i]``, i.e. both distributions agree on cell type ``i``; that
    term should add nothing to the divergence rather than wipe out the whole
    score.

    It matters most for the 1-class test: random partitions of one condition
    frequently end up with equal integer totals after the
    ``int(sqrt(count_s))`` crowd sizing, which forces ``p[i] == q[i]`` for
    every cell type absent from both subsamples. Strict R semantics would
    send every iteration to 0; here the mean is taken over the cell types
    that actually differ.
    """
    values = np.asarray(x, dtype=float)
    positive = values[values > 0]
    if positive.size:
        return float(np.exp(np.mean(np.log(positive))))
    # Nothing positive: the divergence really is 0.
    return 0.0
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# --- Contingency tables --------------------------------------------------------------
|
|
63
|
+
|
|
64
|
+
def _table(groups: Sequence[str], covariable: Sequence[str]) -> pd.DataFrame:
|
|
65
|
+
"""Equivalent of R `table(data.frame(groups, covariable))`."""
|
|
66
|
+
return (
|
|
67
|
+
pd.crosstab(pd.Series(groups, name="groups"),
|
|
68
|
+
pd.Series(covariable, name="covariable"))
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _ensure_rows(tab: pd.DataFrame, label_order: Sequence[str]) -> pd.DataFrame:
|
|
73
|
+
"""Add zero rows for any missing labels and reindex."""
|
|
74
|
+
missing = [l for l in label_order if l not in tab.index]
|
|
75
|
+
if missing:
|
|
76
|
+
z = pd.DataFrame(0, index=missing, columns=tab.columns)
|
|
77
|
+
tab = pd.concat([tab, z])
|
|
78
|
+
return tab.reindex(label_order)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _ensure_cols(tab: pd.DataFrame, indexes: Sequence[str]) -> pd.DataFrame:
|
|
82
|
+
missing = [c for c in indexes if c not in tab.columns]
|
|
83
|
+
if missing:
|
|
84
|
+
for m in missing:
|
|
85
|
+
tab[m] = 0
|
|
86
|
+
return tab[list(indexes)]
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# --- Goodman & Kruskal gamma rank correlation ----------------------------------------
|
|
90
|
+
|
|
91
|
+
def _ranked_proportions(
    tab: pd.DataFrame,
    label_order: Sequence[str],
    indexes: Sequence[str],
) -> np.ndarray:
    """Turn a contingency table into per-label ranks of column proportions.

    The table is first aligned so that its rows follow ``label_order`` and its
    columns follow ``indexes``. Each column is then divided by its column
    total plus 0.1, and finally the values of every row are ranked against
    each other (ties receive the average rank).

    Mirrors `t(apply(dftmp,2,function(row){row/(sum(row)+0.1)}))[labelOrder, indexes]`
    followed by `t(apply(.,1,rank))`.

    Returns an array of shape ``(len(label_order), len(indexes))``.
    """
    aligned = _ensure_cols(_ensure_rows(tab, label_order), indexes)
    counts = aligned.values
    # Column totals carry a 0.1 offset, as in the R expression above.
    denominators = counts.sum(axis=0) + 0.1
    proportions = counts / denominators[np.newaxis, :]
    # Rank along axis 1, i.e. within each label row; average ties to mirror
    # the default of base::rank.
    return np.apply_along_axis(_rank_avg, 1, proportions)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _rank_avg(x: np.ndarray) -> np.ndarray:
|
|
114
|
+
"""Equivalent of R base::rank(x, ties.method='average')."""
|
|
115
|
+
order = np.argsort(x, kind="mergesort")
|
|
116
|
+
ranks = np.empty_like(order, dtype=float)
|
|
117
|
+
ranks[order] = np.arange(1, len(x) + 1, dtype=float)
|
|
118
|
+
# average over ties
|
|
119
|
+
_, inv, counts = np.unique(x, return_inverse=True, return_counts=True)
|
|
120
|
+
sums = np.zeros_like(counts, dtype=float)
|
|
121
|
+
np.add.at(sums, inv, ranks)
|
|
122
|
+
avg = sums / counts
|
|
123
|
+
return avg[inv]
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _concordant_discordant(
|
|
127
|
+
ranks: np.ndarray, rank_index: np.ndarray, original: bool
|
|
128
|
+
) -> Tuple[np.ndarray, np.ndarray]:
|
|
129
|
+
"""For each covariable column, count concordant and discordant pairs.
|
|
130
|
+
|
|
131
|
+
If `original=False` (random/null): concordant means
|
|
132
|
+
sign(ranks[i]-ranks[i+1:]) == -1 (matches the R cellToGamma which assumes
|
|
133
|
+
monotonic 1..N and sign always = -1). Discordant counts where
|
|
134
|
+
`ranks[i] != ranks[k]` and sign != -1.
|
|
135
|
+
|
|
136
|
+
If `original=True`: compare against the actual rank_index sign pattern.
|
|
137
|
+
"""
|
|
138
|
+
n_labels, n_cov = ranks.shape
|
|
139
|
+
nconc = np.zeros(n_cov, dtype=int)
|
|
140
|
+
ndisc = np.zeros(n_cov, dtype=int)
|
|
141
|
+
for i in range(n_labels - 1):
|
|
142
|
+
ri = ranks[i]
|
|
143
|
+
rj = ranks[i + 1 :] # (rest, n_cov)
|
|
144
|
+
diff_r = ri[np.newaxis, :] - rj # (rest, n_cov)
|
|
145
|
+
if original:
|
|
146
|
+
idx_diff = rank_index[i] - rank_index[i + 1 :]
|
|
147
|
+
target_sign = np.sign(idx_diff)[:, np.newaxis] # (rest, 1)
|
|
148
|
+
nconc += np.sum(np.sign(diff_r) == target_sign, axis=0)
|
|
149
|
+
mask_neq = diff_r != 0
|
|
150
|
+
ndisc += np.sum((np.sign(diff_r) != target_sign) & mask_neq, axis=0)
|
|
151
|
+
else:
|
|
152
|
+
nconc += np.sum(np.sign(diff_r) == -1, axis=0)
|
|
153
|
+
mask_neq = diff_r != 0
|
|
154
|
+
ndisc += np.sum((np.sign(diff_r) != -1) & mask_neq, axis=0)
|
|
155
|
+
return nconc, ndisc
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def cell_to_gamma(
    covariable: np.ndarray,
    groups: np.ndarray,
    label_order: Sequence[str],
    indexes: Sequence[str],
    cell_crowd: Dict[str, int],
    rank_index: np.ndarray,
    rng: np.random.Generator,
) -> Tuple[np.ndarray, np.ndarray]:
    """Draw one sample of the random (null) concordance counts.

    For every label in ``label_order``, ``cell_crowd[label]`` values are drawn
    with replacement from the whole ``covariable`` pool, ignoring which group
    each cell came from, and are attributed to that label. The resulting
    contingency table is ranked and scored with the null sign convention.
    ``groups`` is accepted for signature parity with
    `cell_to_gamma_original` but is not read here.

    Returns (n_concordant, n_discordant) per covariable column (length n_cov).
    """
    drawn_values, drawn_labels = [], []
    for label in label_order:
        size = int(cell_crowd[label])
        drawn_values.append(rng.choice(covariable, size=size, replace=True))
        drawn_labels.append(np.repeat(label, size))
    contingency = _table(np.concatenate(drawn_labels), np.concatenate(drawn_values))
    ranked = _ranked_proportions(contingency, label_order, indexes)
    return _concordant_discordant(ranked, rank_index, original=False)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def cell_to_gamma_original(
    covariable: np.ndarray,
    groups: np.ndarray,
    label_order: Sequence[str],
    indexes: Sequence[str],
    cell_crowd: Dict[str, int],
    rank_index: np.ndarray,
    rng: np.random.Generator,
) -> Tuple[np.ndarray, np.ndarray]:
    """Draw one label-preserving subsample and return its concordance counts.

    For every label in ``label_order``, ``cell_crowd[label]`` values are drawn
    from the covariable values of the cells belonging to that label. Drawing
    is without replacement unless more values are requested than the label
    has. Labels with no cells are skipped. The resulting contingency table is
    ranked and scored against the sign pattern of ``rank_index``.

    Returns (n_concordant, n_discordant) per covariable column.
    """
    drawn_values, drawn_labels = [], []
    for label in label_order:
        size = int(cell_crowd[label])
        own_cells = covariable[groups == label]
        if len(own_cells) > 0:
            # Fall back to sampling with replacement only when the label has
            # fewer cells than requested.
            with_replacement = size > len(own_cells)
            drawn_values.append(rng.choice(own_cells, size=size, replace=with_replacement))
            drawn_labels.append(np.repeat(label, size))
    contingency = _table(np.concatenate(drawn_labels), np.concatenate(drawn_values))
    ranked = _ranked_proportions(contingency, label_order, indexes)
    return _concordant_discordant(ranked, rank_index, original=True)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
# --- Monte Carlo for 2-condition fold-change -----------------------------------------
|
|
212
|
+
|
|
213
|
+
def _proportions_from_table(
    tab: pd.DataFrame,
    label_order: Sequence[str],
    indexes: Sequence[str],
    pseudo: bool = True,
) -> np.ndarray:
    """Convert a contingency table into column-wise proportions.

    The table is aligned to ``label_order`` (rows) and ``indexes`` (columns)
    and cast to float. When ``pseudo`` is true, `pseudo_count` is applied to
    every cell. Each column is then divided by its column total plus 1.

    Returns an array of shape ``(len(label_order), len(indexes))``.
    """
    aligned = _ensure_cols(_ensure_rows(tab, label_order), indexes)
    counts = aligned.values.astype(float)
    if pseudo:
        counts = pseudo_count(counts)
    denominators = counts.sum(axis=0) + 1.0
    return counts / denominators[np.newaxis, :]
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def cell_to_montecarlo(
    covariable: np.ndarray,
    groups: np.ndarray,
    label_order: Sequence[str],
    indexes: Sequence[str],
    cell_crowd: Union[Dict[str, int], Dict[str, List[int]]],
    rng: np.random.Generator,
) -> Tuple[np.ndarray, np.ndarray]:
    """Return (mixed-pool fold change, original-resampled fold change).

    For each label two samples are drawn with replacement: a "mixed" one from
    the whole ``covariable`` pool and an "original" one restricted to the
    cells of that label (empty when the label has no cells).
    ``cell_crowd[label]`` is either one sample size or a list/array of sizes
    whose draws are concatenated.

    Both results are arrays of length len(indexes), holding
    log2( asrt(p1) / asrt(p2) ), where p1 and p2 are the pseudocounted
    proportions of the first and second label in ``label_order``.
    """

    def _draw(pool, crowd):
        # Several sizes -> one draw per size, concatenated in order.
        if isinstance(crowd, (list, np.ndarray)):
            chunks = [
                rng.choice(pool, size=int(size), replace=True)
                for size in np.asarray(crowd, dtype=int)
            ]
            return np.concatenate(chunks)
        return rng.choice(pool, size=int(crowd), replace=True)

    mixed_values, mixed_labels = [], []
    own_values, own_labels = [], []
    for label in label_order:
        crowd = cell_crowd[label]
        # The mixed draw comes first so the generator is consumed in the
        # same order for every label.
        mixed_sample = _draw(covariable, crowd)
        label_cells = covariable[groups == label]
        if len(label_cells) == 0:
            own_sample = np.array([], dtype=covariable.dtype)
        else:
            own_sample = _draw(label_cells, crowd)
        mixed_values.append(mixed_sample)
        mixed_labels.append(np.repeat(label, len(mixed_sample)))
        own_values.append(own_sample)
        own_labels.append(np.repeat(label, len(own_sample)))

    mixed_table = _table(np.concatenate(mixed_labels), np.concatenate(mixed_values))
    own_table = _table(np.concatenate(own_labels), np.concatenate(own_values))

    mixed_props = _proportions_from_table(mixed_table, label_order, indexes, pseudo=True)
    own_props = _proportions_from_table(own_table, label_order, indexes, pseudo=True)

    fold_mixed = np.log2(asrt(mixed_props[0]) / asrt(mixed_props[1]))
    fold_own = np.log2(asrt(own_props[0]) / asrt(own_props[1]))
    return fold_mixed, fold_own
|