equibin 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
equibin/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from equibin.binning import BinningResult, bin_2d, plot_bins, save_bins
2
+
3
+ __all__ = ["bin_2d", "BinningResult", "plot_bins", "save_bins"]
equibin/binning.py ADDED
@@ -0,0 +1,233 @@
1
+ """
2
+ 2D equal-probability binning (multivariate probability binning).
3
+
4
+ Algorithm from:
5
+ Roederer et al. (2001). Cytometry 45:47-55.
6
+ https://doi.org/10.1002/1097-0320(20010901)45:1<47::AID-CYTO1143>3.0.CO;2-A
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import tempfile
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+
15
+ import numpy as np
16
+ import numpy.typing as npt
17
+
18
+ Bin = tuple[float, float, float, float]
19
+
20
+
21
@dataclass
class BinningResult:
    """
    Container for the output of 2D equal-probability binning.

    The two attributes are parallel sequences: entry ``i`` of `counts` is the
    number of data points that fell into entry ``i`` of `bins`, and every
    entry of `bins` is a rectangle given as ``(xmin, xmax, ymin, ymax)``.
    ``len(result)`` is the number of bins.
    """

    # Number of data points assigned to each bin, in bin order.
    counts: npt.NDArray[np.intp]
    # Bin rectangles as (xmin, xmax, ymin, ymax), in the same order as `counts`.
    bins: list[Bin]

    def __len__(self) -> int:
        """Return how many bins the result holds."""
        n_rectangles = len(self.bins)
        return n_rectangles
35
+
36
+
37
def bin_2d(
    x: npt.ArrayLike,
    y: npt.ArrayLike,
    n_bins: int = 128,
    *,
    xmin: float | None = None,
    xmax: float | None = None,
    ymin: float | None = None,
    ymax: float | None = None,
) -> BinningResult:
    """
    Partition 2D data into `n_bins` equal-count bins using multivariate
    probability binning.

    At each recursive step the dimension with the highest variance is split
    and the bin budget is divided between the two halves. The cut is placed
    so that each half receives a share of the points proportional to the
    number of bins it still has to produce: the median for an even budget,
    the matching quantile for an odd one. This keeps the final bins equally
    populated when `n_bins` is not a power of two. Points lying exactly on a
    cut go to the lower side. Algorithm from:

        Roederer et al. (2001). Cytometry 45:47-55.
        https://doi.org/10.1002/1097-0320(20010901)45:1<47::AID-CYTO1143>3.0.CO;2-A

    Parameters
    ----------
    x, y : array-like
        Coordinate arrays with the same number of elements (normally 1-D;
        other shapes are flattened). Pairs containing NaN are ignored.
    n_bins : int
        Target number of output bins.
    xmin, xmax, ymin, ymax : float, optional
        Points outside these (inclusive) bounds are excluded before binning.

    Returns
    -------
    BinningResult
        Bin rectangles and their point counts. The outer edges are the
        min/max of the retained data; if no data is retained, a single
        empty bin spanning the supplied bounds (0.0 and 1.0 where a bound is
        missing) is returned. Fewer than `n_bins` bins are produced when a
        region runs out of points before its bin budget is used up.

    Raises
    ------
    ValueError
        If `n_bins` is smaller than 1 or `x` and `y` differ in size.
    """
    if n_bins < 1:
        raise ValueError(f"n_bins must be >= 1, got {n_bins}")

    xa = np.asarray(x, dtype=float).ravel()
    ya = np.asarray(y, dtype=float).ravel()
    if xa.size != ya.size:
        raise ValueError(
            f"x and y must have the same number of elements, got {xa.size} and {ya.size}"
        )

    # NaN never satisfies a bound comparison, so NaN points were already
    # dropped on any axis that had a bound. Drop them unconditionally so they
    # cannot poison the variance, the cut position, or the outer bounds.
    mask = ~(np.isnan(xa) | np.isnan(ya))
    if xmin is not None:
        mask &= xa >= xmin
    if xmax is not None:
        mask &= xa <= xmax
    if ymin is not None:
        mask &= ya >= ymin
    if ymax is not None:
        mask &= ya <= ymax

    data = np.column_stack((xa[mask], ya[mask]))

    def _split(
        d: npt.NDArray[np.float64],
        n: int,
        bounds: list[tuple[float, float]],
    ) -> list[tuple[list[tuple[float, float]], int]]:
        # Recursion ends when the budget is a single bin or no points remain.
        if n == 1 or len(d) == 0:
            return [(list(bounds), len(d))]

        n_left = n // 2
        split_dim = int(np.argmax(np.var(d, axis=0)))
        column = d[:, split_dim]
        if 2 * n_left == n:
            # Even budget: both halves produce the same number of bins.
            cut = float(np.median(column))
        else:
            # Odd budget: the lower half produces n_left of the n bins, so it
            # must receive that fraction of the points, not half of them.
            cut = float(np.quantile(column, n_left / n))

        left_mask = column <= cut
        left_bounds = list(bounds)
        right_bounds = list(bounds)
        left_bounds[split_dim] = (bounds[split_dim][0], cut)
        right_bounds[split_dim] = (cut, bounds[split_dim][1])
        return _split(d[left_mask], n_left, left_bounds) + _split(
            d[~left_mask], n - n_left, right_bounds
        )

    if len(data) == 0:
        # Nothing to measure: fall back to the requested bounds (or 0..1).
        init: list[tuple[float, float]] = [
            (xmin if xmin is not None else 0.0, xmax if xmax is not None else 1.0),
            (ymin if ymin is not None else 0.0, ymax if ymax is not None else 1.0),
        ]
    else:
        init = [
            (float(data[:, 0].min()), float(data[:, 0].max())),
            (float(data[:, 1].min()), float(data[:, 1].max())),
        ]

    raw = _split(data, n_bins, init)
    counts = np.array([count for _, count in raw], dtype=np.intp)
    bins: list[Bin] = [
        (float(xb[0]), float(xb[1]), float(yb[0]), float(yb[1])) for (xb, yb), _ in raw
    ]
    return BinningResult(counts=counts, bins=bins)
120
+
121
+
122
def plot_bins(
    result: BinningResult,
    x: npt.ArrayLike | None = None,
    y: npt.ArrayLike | None = None,
    *,
    title: str | None = None,
    xlabel: str = "X",
    ylabel: str = "Y",
    xlim: tuple[float, float] | None = None,
    ylim: tuple[float, float] | None = None,
) -> None:
    """
    Plot bin rectangles, optionally overlaid on a scatter of the source data.

    Parameters
    ----------
    result : BinningResult
        Binning whose `bins` rectangles are drawn as red outlines.
    x, y : array-like, optional
        Source coordinates. The scatter is drawn only when both are given.
    title : str, optional
        Axes title; omitted when None.
    xlabel, ylabel : str
        Axis labels.
    xlim, ylim : tuple of float, optional
        Explicit axis limits. When omitted, the view is fitted to the drawn
        bins (and scatter, if any).

    Notes
    -----
    The figure is created with `matplotlib.pyplot` and displayed with
    ``plt.show()``; it is not returned.
    """
    import matplotlib.pyplot as plt
    from matplotlib.patches import Rectangle

    _fig, ax = plt.subplots(figsize=(8, 8))

    if x is not None and y is not None:
        ax.scatter(np.asarray(x), np.asarray(y), s=5, alpha=0.5)

    for xlo, xhi, ylo, yhi in result.bins:
        ax.add_patch(
            Rectangle((xlo, ylo), xhi - xlo, yhi - ylo, edgecolor="red", facecolor="none", lw=1)
        )

    # add_patch extends the data limits but does not request autoscaling, so
    # without a scatter the view would stay at the default 0..1 range and the
    # bins could be partly or wholly off-screen. Fit the view explicitly; any
    # limits passed by the caller are applied afterwards and take precedence.
    ax.autoscale_view()

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if title is not None:
        ax.set_title(title)
    if xlim is not None:
        ax.set_xlim(*xlim)
    if ylim is not None:
        ax.set_ylim(*ylim)

    plt.show()
159
+
160
+
161
def save_bins(
    result: BinningResult,
    output_file: str | Path,
    label_prefix: str = "",
) -> None:
    """
    Dump the bin rectangles of `result` to a whitespace-delimited text file.

    One line is written per bin, in bin order, formatted as
    ``xlo xhi ylo yhi <label_prefix><index>`` where ``index`` counts from 0.
    The file always ends with a newline, and an existing file at
    `output_file` is overwritten.
    """
    destination = Path(output_file)

    rows: list[str] = []
    for index, rectangle in enumerate(result.bins):
        xlo, xhi, ylo, yhi = rectangle
        rows.append(f"{xlo} {xhi} {ylo} {yhi} {label_prefix}{index}")

    text = "\n".join(rows) + "\n"
    destination.write_text(text)
177
+
178
+
179
+ ## Tests
180
+
181
+
182
def test_bin_2d_returns_correct_count() -> None:
    """Check that the requested number of bins is produced and no point is lost."""
    n_points = 1000
    n_requested = 32

    generator = np.random.default_rng(0)
    xs = generator.uniform(0, 10, n_points)
    ys = generator.uniform(0, 10, n_points)

    binned = bin_2d(xs, ys, n_bins=n_requested)

    assert len(binned) == n_requested
    assert binned.counts.sum() == n_points
189
+
190
+
191
def test_bin_2d_bounds_filtering() -> None:
    """Check that supplying bounds reduces the number of binned points."""
    generator = np.random.default_rng(1)
    xs = generator.uniform(0, 10, 2000)
    ys = generator.uniform(0, 10, 2000)

    # Restrict both axes to the central [2, 8] window.
    window = {"xmin": 2, "xmax": 8, "ymin": 2, "ymax": 8}
    unrestricted = bin_2d(xs, ys, n_bins=16)
    restricted = bin_2d(xs, ys, n_bins=16, **window)

    total_all = unrestricted.counts.sum()
    total_inside = restricted.counts.sum()
    assert total_inside < total_all
198
+
199
+
200
def test_bin_2d_roughly_equal_counts() -> None:
    """Check that uniform data yields bins with near-identical populations."""
    n_points = 10_000
    generator = np.random.default_rng(2)
    xs = generator.uniform(0, 1, n_points)
    ys = generator.uniform(0, 1, n_points)

    binned = bin_2d(xs, ys, n_bins=128)

    # Every bin of uniform data must stay within 30% of the ideal population.
    ideal = n_points / 128
    deviation = np.abs(binned.counts - ideal)
    assert np.all(deviation < 0.3 * ideal)
209
+
210
+
211
def test_save_bins() -> None:
    """Check that save_bins writes one labelled line per bin."""
    generator = np.random.default_rng(3)
    xs = generator.uniform(0, 1, 100)
    ys = generator.uniform(0, 1, 100)
    binned = bin_2d(xs, ys, n_bins=4)

    with tempfile.TemporaryDirectory() as scratch_dir:
        target = Path(scratch_dir) / "bins.txt"
        save_bins(binned, target, label_prefix="run_")
        # Read the file back before the temporary directory is removed.
        written = target.read_text().splitlines()

    assert len(written) == 4
    assert written[0].endswith("run_0")
    assert written[3].endswith("run_3")
221
+
222
+
223
def test_plot_bins_runs() -> None:
    """Smoke test: plot_bins completes with the non-interactive Agg backend."""
    import matplotlib
    import matplotlib.pyplot as plt

    # Agg renders off-screen, so plt.show() inside plot_bins does not block.
    matplotlib.use("Agg")

    generator = np.random.default_rng(4)
    xs = generator.uniform(0, 1, 200)
    ys = generator.uniform(0, 1, 200)
    binned = bin_2d(xs, ys, n_bins=8)

    unit_range = (0, 1)
    plot_bins(binned, xs, ys, title="Test", xlim=unit_range, ylim=unit_range)
    plt.close("all")
equibin/py.typed ADDED
File without changes
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: equibin
3
+ Version: 0.1.0
4
+ Summary: 2D equal-probability binning (multivariate probability binning)
5
+ Project-URL: Repository, https://github.com/det-lab/equibin
6
+ Author-email: Amy Roberts <cantor.duster@gmail.com>, Lekhraj Pandey <lekhraj.pandey@coyotes.usd.edu>, Anthony Villano <anthony.villano@ucdenver.edu>
7
+ License: GPL-2.0-only
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Typing :: Typed
17
+ Requires-Python: <4.0,>=3.11
18
+ Requires-Dist: matplotlib
19
+ Requires-Dist: numpy
@@ -0,0 +1,6 @@
1
+ equibin/__init__.py,sha256=9GMwucoMtNZzq1RjqXwV3PEA-inAbfboT9uaht1-hgY,137
2
+ equibin/binning.py,sha256=gCu2ESETuF7963xfRH67frYJuMmAk0kw5jRc1ohRsN8,6703
3
+ equibin/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ equibin-0.1.0.dist-info/METADATA,sha256=llo5EzAvoS6rznWkeNiI02gl-yicIYSeEi17b8Cn860,844
5
+ equibin-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
6
+ equibin-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any