equibin 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- equibin/__init__.py +3 -0
- equibin/binning.py +233 -0
- equibin/py.typed +0 -0
- equibin-0.1.0.dist-info/METADATA +19 -0
- equibin-0.1.0.dist-info/RECORD +6 -0
- equibin-0.1.0.dist-info/WHEEL +4 -0
equibin/__init__.py
ADDED
equibin/binning.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
"""
|
|
2
|
+
2D equal-probability binning (multivariate probability binning).
|
|
3
|
+
|
|
4
|
+
Algorithm from:
|
|
5
|
+
Roederer et al. (2001). Cytometry 45:47-55.
|
|
6
|
+
https://doi.org/10.1002/1097-0320(20010901)45:1<47::AID-CYTO1143>3.0.CO;2-A
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import tempfile
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import numpy.typing as npt
|
|
17
|
+
|
|
18
|
+
Bin = tuple[float, float, float, float]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class BinningResult:
    """
    Container for the output of 2D equal-probability binning.

    The two fields are parallel: entry ``i`` of `counts` is the number of
    data points assigned to entry ``i`` of `bins`, and every bin is stored as
    an ``(xmin, xmax, ymin, ymax)`` tuple. ``len(result)`` is the number of
    bins.
    """

    # Number of data points per bin, aligned index-for-index with `bins`.
    counts: npt.NDArray[np.intp]
    # One `(xmin, xmax, ymin, ymax)` tuple per bin.
    bins: list[Bin]

    def __len__(self) -> int:
        """Return how many bins this result holds."""
        n_rectangles = len(self.bins)
        return n_rectangles
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def bin_2d(
    x: npt.ArrayLike,
    y: npt.ArrayLike,
    n_bins: int = 128,
    *,
    xmin: float | None = None,
    xmax: float | None = None,
    ymin: float | None = None,
    ymax: float | None = None,
) -> BinningResult:
    """
    Partition 2D data into `n_bins` equal-count bins using multivariate
    probability binning.

    At each recursive step the dimension with the highest variance is split
    and the requested bin count ``n`` is divided into ``n // 2`` bins for the
    lower side and ``n - n // 2`` for the upper side. The cut is placed at the
    ``(n // 2) / n`` quantile of the data, which is the median whenever ``n``
    is even, so each side receives data in proportion to the number of bins
    it still has to produce. `n_bins` therefore need not be a power of two.
    Algorithm from:

    Roederer et al. (2001). Cytometry 45:47-55.
    https://doi.org/10.1002/1097-0320(20010901)45:1<47::AID-CYTO1143>3.0.CO;2-A

    Parameters
    ----------
    x, y : array-like
        1-D coordinate arrays of equal length.
    n_bins : int
        Target number of output bins.
    xmin, xmax, ymin, ymax : float, optional
        Points outside these (inclusive) bounds are excluded before binning.

    Returns
    -------
    BinningResult
        Per-bin counts and ``(xmin, xmax, ymin, ymax)`` extents. The outer
        extents are the min/max of the retained data; if no data is retained
        they fall back to the supplied bounds, or to 0.0 / 1.0 where a bound
        was not given.

    Raises
    ------
    ValueError
        If `n_bins` is less than 1, or if `x` and `y` are not 1-D arrays of
        the same length.

    Notes
    -----
    Points equal to a cut value go to the lower side. A branch that ends up
    with no points is not split further, so fewer than `n_bins` bins can be
    returned (for example with heavily tied values or fewer points than
    bins).
    """
    if n_bins < 1:
        raise ValueError(f"n_bins must be >= 1, got {n_bins}")

    xa = np.asarray(x, dtype=float)
    ya = np.asarray(y, dtype=float)
    # Fail early with a clear message instead of an indexing/broadcast error
    # further down.
    if xa.ndim != 1 or ya.ndim != 1 or xa.shape != ya.shape:
        raise ValueError(
            f"x and y must be 1-D arrays of equal length, got shapes {xa.shape} and {ya.shape}"
        )

    mask = np.ones(xa.shape, dtype=bool)
    for values, lower, upper in ((xa, xmin, xmax), (ya, ymin, ymax)):
        if lower is not None:
            mask &= values >= lower
        if upper is not None:
            mask &= values <= upper

    data = np.column_stack((xa[mask], ya[mask]))

    def _split(
        d: npt.NDArray[np.float64],
        n: int,
        bounds: list[tuple[float, float]],
    ) -> list[tuple[list[tuple[float, float]], int]]:
        # Leaf: either a single bin was requested or nothing is left to split.
        if n == 1 or len(d) == 0:
            return [(list(bounds), len(d))]
        split_dim = int(np.argmax(np.var(d, axis=0)))
        values = d[:, split_dim]
        n_left = n // 2
        if 2 * n_left == n:
            # Even split: use np.median so results match a plain median split
            # exactly.
            cut = float(np.median(values))
        else:
            # Odd split: the lower side gets n_left of the n bins, so it must
            # also get that share of the points for counts to stay equal.
            cut = float(np.quantile(values, n_left / n))
        left_mask = values <= cut
        left_bounds = list(bounds)
        right_bounds = list(bounds)
        left_bounds[split_dim] = (bounds[split_dim][0], cut)
        right_bounds[split_dim] = (cut, bounds[split_dim][1])
        return _split(d[left_mask], n_left, left_bounds) + _split(
            d[~left_mask], n - n_left, right_bounds
        )

    if len(data) == 0:
        init: list[tuple[float, float]] = [
            (xmin if xmin is not None else 0.0, xmax if xmax is not None else 1.0),
            (ymin if ymin is not None else 0.0, ymax if ymax is not None else 1.0),
        ]
    else:
        init = [
            (float(data[:, 0].min()), float(data[:, 0].max())),
            (float(data[:, 1].min()), float(data[:, 1].max())),
        ]

    raw = _split(data, n_bins, init)
    counts = np.array([count for _, count in raw], dtype=np.intp)
    bins: list[Bin] = [
        (float(xb[0]), float(xb[1]), float(yb[0]), float(yb[1])) for (xb, yb), _ in raw
    ]
    return BinningResult(counts=counts, bins=bins)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def plot_bins(
    result: BinningResult,
    x: npt.ArrayLike | None = None,
    y: npt.ArrayLike | None = None,
    *,
    title: str | None = None,
    xlabel: str = "X",
    ylabel: str = "Y",
    xlim: tuple[float, float] | None = None,
    ylim: tuple[float, float] | None = None,
) -> None:
    """
    Plot bin rectangles, optionally overlaid on a scatter of the source data.

    Parameters
    ----------
    result : BinningResult
        Binning whose `bins` are drawn as red, unfilled rectangles.
    x, y : array-like, optional
        Source points; the scatter is drawn only when both are given.
    title : str, optional
        Axes title; no title is set when omitted.
    xlabel, ylabel : str
        Axis labels.
    xlim, ylim : tuple of float, optional
        Explicit axis limits. When omitted the view is autoscaled to the
        drawn rectangles (and scatter, if any).

    Notes
    -----
    A new 8x8 inch figure is created and shown with ``plt.show()``.
    matplotlib is imported inside the function, not at module import time.
    """
    import matplotlib.pyplot as plt
    from matplotlib.patches import Rectangle

    _fig, ax = plt.subplots(figsize=(8, 8))

    if x is not None and y is not None:
        ax.scatter(np.asarray(x), np.asarray(y), s=5, alpha=0.5)

    for xlo, xhi, ylo, yhi in result.bins:
        ax.add_patch(
            Rectangle((xlo, ylo), xhi - xlo, yhi - ylo, edgecolor="red", facecolor="none", lw=1)
        )

    # add_patch() records the patch extents but does not rescale the view, so
    # without a scatter the axes would stay at the default 0..1 range and bins
    # elsewhere would be invisible. Explicit limits below still take priority.
    ax.autoscale_view()

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if title is not None:
        ax.set_title(title)
    if xlim is not None:
        ax.set_xlim(*xlim)
    if ylim is not None:
        ax.set_ylim(*ylim)

    plt.show()
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def save_bins(
    result: BinningResult,
    output_file: str | Path,
    label_prefix: str = "",
) -> None:
    """
    Write bin boundaries to a whitespace-delimited text file.

    Each line: ``xlo xhi ylo yhi <label_prefix><index>``, where ``index`` is
    the bin's position in ``result.bins`` starting at 0.

    Parameters
    ----------
    result : BinningResult
        Binning whose `bins` are written, one per line.
    output_file : str or Path
        Destination file; an existing file is overwritten.
    label_prefix : str
        Text placed directly before the bin index in the last column.

    Notes
    -----
    The file is always written as UTF-8 so that a non-ASCII `label_prefix`
    does not depend on the platform's locale encoding. A result with no bins
    produces an empty file.
    """
    # Terminate each record itself so the file holds exactly one line per bin,
    # including the zero-bin case.
    lines = [
        f"{xlo} {xhi} {ylo} {yhi} {label_prefix}{i}\n"
        for i, (xlo, xhi, ylo, yhi) in enumerate(result.bins)
    ]
    Path(output_file).write_text("".join(lines), encoding="utf-8")
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
## Tests
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def test_bin_2d_returns_correct_count() -> None:
    """Check that 32 bins come back and together hold all 1000 points."""
    n_points = 1000
    n_requested = 32
    gen = np.random.default_rng(0)
    xs = gen.uniform(0, 10, n_points)
    ys = gen.uniform(0, 10, n_points)

    binned = bin_2d(xs, ys, n_bins=n_requested)

    assert len(binned) == n_requested
    assert binned.counts.sum() == n_points
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def test_bin_2d_bounds_filtering() -> None:
    """Check that restricting both axes to [2, 8] reduces the total count."""
    gen = np.random.default_rng(1)
    xs = gen.uniform(0, 10, 2000)
    ys = gen.uniform(0, 10, 2000)
    window = {"xmin": 2, "xmax": 8, "ymin": 2, "ymax": 8}

    unbounded_total = bin_2d(xs, ys, n_bins=16).counts.sum()
    bounded_total = bin_2d(xs, ys, n_bins=16, **window).counts.sum()

    assert bounded_total < unbounded_total
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def test_bin_2d_roughly_equal_counts() -> None:
    """Check that uniform data yields per-bin counts close to the mean count."""
    gen = np.random.default_rng(2)
    n_points = 10_000
    n_requested = 128
    xs = gen.uniform(0, 1, n_points)
    ys = gen.uniform(0, 1, n_points)

    binned = bin_2d(xs, ys, n_bins=n_requested)

    target = n_points / n_requested
    deviation = np.abs(binned.counts - target)
    # Every bin must land within 30% of the ideal per-bin count.
    tolerance = 0.3 * target
    assert np.all(deviation < tolerance)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def test_save_bins() -> None:
    """Check that save_bins writes one labelled line per bin."""
    gen = np.random.default_rng(3)
    xs = gen.uniform(0, 1, 100)
    ys = gen.uniform(0, 1, 100)
    binned = bin_2d(xs, ys, n_bins=4)

    with tempfile.TemporaryDirectory() as scratch:
        target = Path(scratch) / "bins.txt"
        save_bins(binned, target, label_prefix="run_")
        # Read back before the temporary directory is removed.
        written = target.read_text().splitlines()

    assert len(written) == 4
    assert written[0].endswith("run_0")
    assert written[3].endswith("run_3")
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def test_plot_bins_runs() -> None:
    """Smoke-test plot_bins on the non-interactive Agg backend."""
    import matplotlib
    import matplotlib.pyplot as plt

    # Select a backend that needs no display before any figure is created.
    matplotlib.use("Agg")

    gen = np.random.default_rng(4)
    xs = gen.uniform(0, 1, 200)
    ys = gen.uniform(0, 1, 200)
    binned = bin_2d(xs, ys, n_bins=8)

    unit_range = (0, 1)
    plot_bins(binned, xs, ys, title="Test", xlim=unit_range, ylim=unit_range)

    # Release the figure that plot_bins created.
    plt.close("all")
|
equibin/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: equibin
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: 2D equal-probability binning (multivariate probability binning)
|
|
5
|
+
Project-URL: Repository, https://github.com/det-lab/equibin
|
|
6
|
+
Author-email: Amy Roberts <cantor.duster@gmail.com>, Lekhraj Pandey <lekhraj.pandey@coyotes.usd.edu>, Anthony Villano <anthony.villano@ucdenver.edu>
|
|
7
|
+
License: GPL-2.0-only
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Typing :: Typed
|
|
17
|
+
Requires-Python: <4.0,>=3.11
|
|
18
|
+
Requires-Dist: matplotlib
|
|
19
|
+
Requires-Dist: numpy
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
equibin/__init__.py,sha256=9GMwucoMtNZzq1RjqXwV3PEA-inAbfboT9uaht1-hgY,137
|
|
2
|
+
equibin/binning.py,sha256=gCu2ESETuF7963xfRH67frYJuMmAk0kw5jRc1ohRsN8,6703
|
|
3
|
+
equibin/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
equibin-0.1.0.dist-info/METADATA,sha256=llo5EzAvoS6rznWkeNiI02gl-yicIYSeEi17b8Cn860,844
|
|
5
|
+
equibin-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
6
|
+
equibin-0.1.0.dist-info/RECORD,,
|