pycatdap 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pycatdap/__init__.py ADDED
@@ -0,0 +1,30 @@
1
+ """pycatdap: Python implementation of CATDAP (CATegorical Data Analysis Program).
2
+
3
+ CATDAP applies Akaike's Information Criterion (AIC) to categorical data analysis.
4
+ Originally developed by Sakamoto & Katsura (1980) at the Institute of Statistical
5
+ Mathematics, Japan.
6
+
7
+ Main functions:
8
+ catdap1 -- Pairwise AIC evaluation of categorical variable
9
+ associations
10
+ catdap2 -- Optimal explanatory variable subset search
11
+ with continuous variable binning
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from pycatdap._version import (
17
+ __version__,
18
+ __version_tuple__,
19
+ )
20
+ from pycatdap.catdap1 import Catdap1Result, catdap1
21
+ from pycatdap.catdap2 import Catdap2Result, catdap2
22
+
23
+ __all__ = [
24
+ "__version__",
25
+ "__version_tuple__",
26
+ "Catdap1Result",
27
+ "Catdap2Result",
28
+ "catdap1",
29
+ "catdap2",
30
+ ]
pycatdap/_aic.py ADDED
@@ -0,0 +1,194 @@
1
+ """Core AIC computation for contingency tables.
2
+
3
+ Implements the AIC statistics for two-way contingency tables as defined by
4
+ Sakamoto & Katsura (1980). Zero-frequency cells use the convention
5
+ ``0 * ln(0) = 0``.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import numpy as np
11
+ import numpy.typing as npt
12
+
13
+ # ---------------------------------------------------------------------------
14
+ # Safe x*log(y) with 0*log(0) = 0
15
+ # ---------------------------------------------------------------------------
16
+
17
+ try:
18
+ from scipy.special import xlogy as _scipy_xlogy # type: ignore[import-untyped]
19
+
20
+ def _safe_xlogy(
21
+ x: npt.NDArray[np.float64],
22
+ y: npt.NDArray[np.float64],
23
+ ) -> npt.NDArray[np.float64]:
24
+ """Compute ``x * ln(y)`` with the convention ``0 * ln(0) = 0``.
25
+
26
+ Uses :func:`scipy.special.xlogy` when available.
27
+
28
+ Parameters
29
+ ----------
30
+ x : ndarray
31
+ Frequencies (non-negative).
32
+ y : ndarray
33
+ Denominators (non-negative).
34
+
35
+ Returns
36
+ -------
37
+ ndarray
38
+ Element-wise ``x * ln(y)``, with 0 where *x* is zero.
39
+ """
40
+ return np.asarray(_scipy_xlogy(x, y), dtype=np.float64)
41
+
42
+ except ImportError: # pragma: no cover – scipy optional
43
+
44
+ def _safe_xlogy(
45
+ x: npt.NDArray[np.float64],
46
+ y: npt.NDArray[np.float64],
47
+ ) -> npt.NDArray[np.float64]:
48
+ """Compute ``x * ln(y)`` with the convention ``0 * ln(0) = 0``.
49
+
50
+ Pure-numpy fallback when scipy is not installed.
51
+ """
52
+ x = np.asarray(x, dtype=np.float64)
53
+ y = np.asarray(y, dtype=np.float64)
54
+ # Guard: negative y with non-zero x indicates a bug upstream
55
+ if np.any((y < 0) & (x != 0)):
56
+ msg = "negative y with non-zero x: frequencies must be non-negative"
57
+ raise ValueError(msg)
58
+ return np.where(x == 0, 0.0, x * np.log(np.where(y > 0, y, 1.0)))
59
+
60
+
61
+ # ---------------------------------------------------------------------------
62
+ # AIC(E; F) — two-way contingency table model
63
+ # ---------------------------------------------------------------------------
64
+
65
+
66
def compute_aic_twoway(
    cross_freq: npt.NDArray[np.float64],
    marginal_f: npt.NDArray[np.float64],
) -> float:
    r"""Compute AIC for the two-way table model.

    .. math::

        AIC(E; F) = -2 \sum_{i,j} n_{EF}(i,j) \ln \frac{n_{EF}(i,j)}{n_F(j)}
        + 2 (C_E - 1) C_F

    Parameters
    ----------
    cross_freq : ndarray, shape (C_E, C_F)
        Cross-frequency table.
    marginal_f : ndarray, shape (C_F,)
        Marginal frequencies of the explanatory variable.

    Returns
    -------
    float
        AIC value for the two-way model.

    Raises
    ------
    ValueError
        If *cross_freq* contains no observations, or if the length of
        *marginal_f* does not match the number of columns of *cross_freq*.
    """
    if cross_freq.sum() == 0:
        msg = "cross_freq must contain at least one observation"
        raise ValueError(msg)

    c_e: int = cross_freq.shape[0]
    c_f: int = cross_freq.shape[1]

    if marginal_f.shape[0] != c_f:
        msg = (
            f"marginal_f length ({marginal_f.shape[0]}) must equal "
            f"cross_freq columns ({c_f})"
        )
        raise ValueError(msg)

    # n_EF(i,j) / n_F(j) — broadcast marginal_f across rows.  Divide only
    # where the marginal is positive: a zero marginal implies that column
    # is all zeros anyway, and the previous eager ``cross_freq / marginal_f``
    # inside np.where emitted a spurious divide-by-zero RuntimeWarning.
    ratio = np.divide(
        cross_freq,
        marginal_f,
        out=np.zeros_like(cross_freq, dtype=np.float64),
        where=marginal_f > 0,
    )
    log_likelihood = float(np.sum(_safe_xlogy(cross_freq, ratio)))

    penalty = 2.0 * (c_e - 1) * c_f
    return float(-2.0 * log_likelihood + penalty)
109
+
110
+
111
+ # ---------------------------------------------------------------------------
112
+ # AIC(E; φ) — null (base) model
113
+ # ---------------------------------------------------------------------------
114
+
115
+
116
def compute_base_aic(
    marginal_e: npt.NDArray[np.float64],
    n: int,
) -> float:
    r"""Compute AIC for the null model (no explanatory variable).

    .. math::

        AIC(E; \phi) = -2 \sum_i n_E(i) \ln \frac{n_E(i)}{n} + 2 (C_E - 1)

    Parameters
    ----------
    marginal_e : ndarray, shape (C_E,)
        Marginal frequencies of the response variable.
    n : int
        Total number of observations.

    Returns
    -------
    float
        AIC value for the null model.

    Raises
    ------
    ValueError
        If *n* is not positive.
    """
    if n <= 0:
        raise ValueError("n must be positive; received an empty dataset")

    n_categories = len(marginal_e)

    # Maximum-likelihood cell probabilities under the null model.
    proportions = marginal_e / n
    ll = float(_safe_xlogy(marginal_e, proportions).sum())

    # One free parameter per category, minus one for the sum constraint.
    return -2.0 * ll + 2.0 * (n_categories - 1)
154
+
155
+
156
+ # ---------------------------------------------------------------------------
157
+ # ΔAIC = AIC(E; F) − AIC(E; φ)
158
+ # ---------------------------------------------------------------------------
159
+
160
+
161
def compute_delta_aic(
    cross_freq: npt.NDArray[np.float64],
    marginal_e: npt.NDArray[np.float64],
    marginal_f: npt.NDArray[np.float64],
    n: int,
) -> float:
    r"""Compute the delta AIC between two-way and null models.

    .. math::

        \Delta AIC = AIC(E; F) - AIC(E; \phi)

    A negative value indicates that the explanatory variable *F* is
    informative about the response *E*.

    Parameters
    ----------
    cross_freq : ndarray, shape (C_E, C_F)
        Cross-frequency table.
    marginal_e : ndarray, shape (C_E,)
        Marginal frequencies of the response variable.
    marginal_f : ndarray, shape (C_F,)
        Marginal frequencies of the explanatory variable.
    n : int
        Total number of observations.

    Returns
    -------
    float
        ΔAIC value (negative = explanatory variable is useful).
    """
    # Fitted two-way model compared against the no-predictor baseline.
    fitted = compute_aic_twoway(cross_freq, marginal_f)
    baseline = compute_base_aic(marginal_e, n)
    return fitted - baseline
@@ -0,0 +1,130 @@
1
+ """Contingency table construction utilities.
2
+
3
+ Converts pandas DataFrames into numpy arrays suitable for AIC computation.
4
+ Input DataFrames are never mutated.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ from numpy.typing import NDArray
12
+
13
+
14
def build_crosstab(
    data: pd.DataFrame,
    response: str,
    explanatory: str,
) -> tuple[NDArray[np.float64], NDArray[np.float64], NDArray[np.float64], int]:
    """Build a two-way frequency table from a DataFrame.

    Rows with ``NaN`` in either column are dropped before tabulation.

    Parameters
    ----------
    data : DataFrame
        Input data (not modified).
    response : str
        Column name of the response variable (rows of cross-table).
    explanatory : str
        Column name of the explanatory variable (columns of cross-table).

    Returns
    -------
    cross_freq : ndarray, shape (C_E, C_F)
        Cross-frequency table.
    marginal_e : ndarray, shape (C_E,)
        Marginal frequencies of the response variable.
    marginal_f : ndarray, shape (C_F,)
        Marginal frequencies of the explanatory variable.
    n : int
        Total number of valid observations.

    Raises
    ------
    KeyError
        If *response* or *explanatory* is not in the DataFrame.
    """
    # Touch both columns up front so a missing name raises KeyError early.
    _ = data[response]
    _ = data[explanatory]

    # Work on a two-column copy with NaN rows removed; the input DataFrame
    # is never mutated.
    valid = data[[response, explanatory]].dropna()

    table = pd.crosstab(valid[response], valid[explanatory])
    freq = table.to_numpy(dtype=np.float64)
    row_totals = freq.sum(axis=1).astype(np.float64)
    col_totals = freq.sum(axis=0).astype(np.float64)
    total = int(freq.sum())

    return freq, row_totals, col_totals, total
61
+
62
+
63
def build_multidim_crosstab(
    data: pd.DataFrame,
    response: str,
    explanatory_set: list[str],
) -> tuple[NDArray[np.float64], NDArray[np.float64], NDArray[np.float64], int]:
    """Build a cross-frequency table with a composite explanatory variable.

    The explanatory variables are combined into a single composite variable
    (joined category labels) before tabulation.

    Parameters
    ----------
    data : DataFrame
        Input data (not modified).
    response : str
        Column name of the response variable.
    explanatory_set : list[str]
        Column names of the explanatory variables to combine.

    Returns
    -------
    cross_freq : ndarray, shape (C_E, C_F_combined)
        Cross-frequency table.
    marginal_e : ndarray, shape (C_E,)
        Marginal frequencies of the response variable.
    marginal_f : ndarray, shape (C_F_combined,)
        Marginal frequencies of the composite explanatory variable.
    n : int
        Total number of valid observations.

    Raises
    ------
    ValueError
        If *explanatory_set* is empty.
    KeyError
        If *response* or any column in *explanatory_set* is not in the DataFrame.
    """
    if not explanatory_set:
        raise ValueError("explanatory_set must not be empty")

    # Fail fast with KeyError on any unknown column name.
    for name in (response, *explanatory_set):
        _ = data[name]

    # A single explanatory column needs no compositing.
    if len(explanatory_set) == 1:
        return build_crosstab(data, response, explanatory_set[0])

    valid = data[[response, *explanatory_set]].dropna()

    # Join labels with the unit separator (U+001F) so distinct category
    # tuples never collide after joining.
    joined = valid[explanatory_set].astype(str).agg("\x1f".join, axis=1)
    frame = pd.DataFrame(
        {
            "_response_": valid[response].to_numpy(),
            "_composite_": joined.to_numpy(),
        }
    )

    table = pd.crosstab(frame["_response_"], frame["_composite_"])
    freq = table.to_numpy(dtype=np.float64)
    row_totals = freq.sum(axis=1).astype(np.float64)
    col_totals = freq.sum(axis=0).astype(np.float64)

    return freq, row_totals, col_totals, int(freq.sum())
pycatdap/_pooling.py ADDED
@@ -0,0 +1,349 @@
1
+ """Continuous variable categorization (pooling) via AIC minimization.
2
+
3
+ Converts continuous variables into categorical ones by finding AIC-optimal
4
+ bin boundaries. Two methods are provided:
5
+
6
+ * **Equal pooling** (``pool=0``): top-down, equal-interval bins merged greedily.
7
+ * **Unequal pooling** (``pool=1``, default): bottom-up, fine bins merged until
8
+ no merge improves AIC.
9
+
10
+ Input arrays are never mutated.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from dataclasses import dataclass
16
+
17
+ import numpy as np
18
+ import numpy.typing as npt
19
+
20
+ from pycatdap._aic import _safe_xlogy
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Result container
24
+ # ---------------------------------------------------------------------------
25
+
26
+
27
@dataclass(frozen=True)
class PoolingResult:
    """Result of AIC-optimal binning.

    Immutable (frozen dataclass) so results can be cached or shared safely.

    Attributes
    ----------
    codes : ndarray of int
        Bin assignment for each observation (0-indexed).
    boundaries : list[float]
        Sorted internal bin boundary values. ``len(boundaries) == n_bins - 1``
        where *n_bins* is the number of distinct bins.
    """

    # Per-observation bin index, aligned with the input values array.
    codes: npt.NDArray[np.intp]
    # Internal cut points only; the outer min/max edges are excluded.
    boundaries: list[float]
42
+
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # Internal helpers
46
+ # ---------------------------------------------------------------------------
47
+
48
+
49
+ def _auto_accuracy(values: npt.NDArray[np.float64]) -> float:
50
+ """Guess accuracy from the smallest non-zero gap between sorted values."""
51
+ sorted_vals = np.sort(np.unique(values))
52
+ if len(sorted_vals) <= 1:
53
+ return 1.0
54
+ diffs = np.diff(sorted_vals)
55
+ positive_diffs = diffs[diffs > 0]
56
+ if len(positive_diffs) == 0:
57
+ return 1.0
58
+ return float(np.min(positive_diffs))
59
+
60
+
61
+ def _initial_bins(
62
+ values: npt.NDArray[np.float64],
63
+ accuracy: float,
64
+ ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.float64]]:
65
+ """Divide the value range into fine-grained bins of width *accuracy*.
66
+
67
+ Returns
68
+ -------
69
+ bin_codes : ndarray of int
70
+ Bin index for each observation.
71
+ edges : ndarray of float
72
+ Bin edge values (length = n_bins + 1).
73
+ """
74
+ vmin, vmax = float(values.min()), float(values.max())
75
+
76
+ if vmax - vmin < accuracy:
77
+ # All values in a single bin
78
+ return np.zeros(len(values), dtype=np.intp), np.array([vmin, vmax + accuracy])
79
+
80
+ n_bins = max(1, int(np.ceil((vmax - vmin) / accuracy)))
81
+ edges: npt.NDArray[np.float64] = np.linspace(
82
+ vmin, vmax + accuracy * 0.01, n_bins + 1
83
+ ).astype(np.float64)
84
+ # np.digitize returns 1-based indices; subtract 1, clip to valid range
85
+ codes = np.clip(np.digitize(values, edges[1:-1]), 0, n_bins - 1).astype(np.intp)
86
+ return codes, edges
87
+
88
+
89
+ def _build_bin_freq_table(
90
+ bin_codes: npt.NDArray[np.intp],
91
+ response_codes: npt.NDArray[np.intp],
92
+ n_bins: int,
93
+ n_resp_cats: int,
94
+ ) -> npt.NDArray[np.float64]:
95
+ """Build a (n_resp_cats, n_bins) frequency table from bin and response codes."""
96
+ freq = np.zeros((n_resp_cats, n_bins), dtype=np.float64)
97
+ np.add.at(freq, (response_codes, bin_codes), 1)
98
+ return freq
99
+
100
+
101
+ def _bin_aic(freq: npt.NDArray[np.float64]) -> float:
102
+ """Compute AIC(E; F) for a frequency table (response x bins).
103
+
104
+ AIC = -2 * sum(n_ij * ln(n_ij / n_j)) + 2 * (C_E - 1) * C_F
105
+ """
106
+ c_e, c_f = freq.shape
107
+ marg_f = freq.sum(axis=0)
108
+ ratio = np.where(marg_f > 0, freq / marg_f, 0.0)
109
+ loglik = float(np.sum(_safe_xlogy(freq, ratio)))
110
+ penalty = 2.0 * (c_e - 1) * c_f
111
+ return float(-2.0 * loglik + penalty)
112
+
113
+
114
+ def _merge_bins(
115
+ freq: npt.NDArray[np.float64],
116
+ i: int,
117
+ ) -> npt.NDArray[np.float64]:
118
+ """Return a new frequency table with bins *i* and *i+1* merged."""
119
+ new_freq: npt.NDArray[np.float64] = np.delete(freq, i + 1, axis=1).copy()
120
+ new_freq[:, i] = freq[:, i] + freq[:, i + 1]
121
+ return new_freq
122
+
123
+
124
+ def _encode_response(
125
+ response: npt.NDArray[np.object_],
126
+ ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.object_]]:
127
+ """Encode response labels to integer codes."""
128
+ uniq, codes = np.unique(response, return_inverse=True)
129
+ return codes.astype(np.intp), uniq
130
+
131
+
132
+ def _codes_from_boundaries(
133
+ values: npt.NDArray[np.float64],
134
+ boundaries: list[float],
135
+ ) -> npt.NDArray[np.intp]:
136
+ """Assign bin codes from a sorted boundary list."""
137
+ if not boundaries:
138
+ return np.zeros(len(values), dtype=np.intp)
139
+ return np.digitize(values, boundaries).astype(np.intp)
140
+
141
+
142
+ # ---------------------------------------------------------------------------
143
+ # Equal pooling (pool=0, top-down)
144
+ # ---------------------------------------------------------------------------
145
+
146
+
147
+ def equal_pooling(
148
+ values: npt.NDArray[np.float64],
149
+ response: npt.NDArray[np.object_],
150
+ accuracy: float,
151
+ ) -> PoolingResult:
152
+ """Equal-interval pooling (top-down greedy merge).
153
+
154
+ Start with bins of width *accuracy*, then greedily merge adjacent
155
+ bins when merging reduces AIC.
156
+
157
+ Parameters
158
+ ----------
159
+ values : ndarray
160
+ Continuous variable values.
161
+ response : ndarray
162
+ Response variable labels (categorical).
163
+ accuracy : float
164
+ Minimum bin width (observation precision).
165
+
166
+ Returns
167
+ -------
168
+ PoolingResult
169
+ """
170
+ values = np.asarray(values, dtype=np.float64)
171
+ if len(values) == 0:
172
+ return PoolingResult(codes=np.array([], dtype=np.intp), boundaries=[])
173
+
174
+ resp_codes, _ = _encode_response(np.asarray(response))
175
+
176
+ bin_codes, edges = _initial_bins(values, accuracy)
177
+ n_bins = len(edges) - 1
178
+ n_resp_cats = int(resp_codes.max()) + 1
179
+
180
+ freq = _build_bin_freq_table(bin_codes, resp_codes, n_bins, n_resp_cats)
181
+
182
+ # Remove empty bins
183
+ nonempty = freq.sum(axis=0) > 0
184
+ if not np.all(nonempty):
185
+ freq = freq[:, nonempty]
186
+ edges = np.concatenate(
187
+ # Keep first edge, then non-empty right edges
188
+ [edges[:1], edges[1:][nonempty]]
189
+ )
190
+ # Pad last edge if needed
191
+ if len(edges) <= freq.shape[1]:
192
+ edges = np.append(edges, edges[-1] + accuracy)
193
+
194
+ # Greedy merge: merge the pair that most decreases AIC
195
+ current_aic = _bin_aic(freq)
196
+ changed = True
197
+ while changed and freq.shape[1] > 1:
198
+ changed = False
199
+ best_delta = 0.0
200
+ best_idx = -1
201
+ best_freq: npt.NDArray[np.float64] | None = None
202
+
203
+ for i in range(freq.shape[1] - 1):
204
+ merged = _merge_bins(freq, i)
205
+ new_aic = _bin_aic(merged)
206
+ delta = new_aic - current_aic
207
+ if delta < best_delta:
208
+ best_delta = delta
209
+ best_idx = i
210
+ best_freq = merged
211
+
212
+ if best_freq is not None:
213
+ freq = best_freq
214
+ # Remove the merged edge
215
+ edges = np.delete(edges, best_idx + 1)
216
+ current_aic += best_delta
217
+ changed = True
218
+
219
+ # Build boundaries (internal edges, excluding first and last)
220
+ boundaries = sorted(float(e) for e in edges[1:-1])
221
+ codes = _codes_from_boundaries(values, boundaries)
222
+ return PoolingResult(codes=codes, boundaries=boundaries)
223
+
224
+
225
+ # ---------------------------------------------------------------------------
226
+ # Unequal pooling (pool=1, bottom-up) — default
227
+ # ---------------------------------------------------------------------------
228
+
229
+
230
+ def unequal_pooling(
231
+ values: npt.NDArray[np.float64],
232
+ response: npt.NDArray[np.object_],
233
+ accuracy: float,
234
+ ) -> PoolingResult:
235
+ """Unequal-interval pooling (bottom-up greedy merge).
236
+
237
+ Start with fine bins of width *accuracy*, then iteratively merge the
238
+ adjacent pair that yields the largest AIC decrease. Stop when no
239
+ merge improves AIC.
240
+
241
+ Parameters
242
+ ----------
243
+ values : ndarray
244
+ Continuous variable values.
245
+ response : ndarray
246
+ Response variable labels (categorical).
247
+ accuracy : float
248
+ Minimum bin width.
249
+
250
+ Returns
251
+ -------
252
+ PoolingResult
253
+ """
254
+ values = np.asarray(values, dtype=np.float64)
255
+ if len(values) == 0:
256
+ return PoolingResult(codes=np.array([], dtype=np.intp), boundaries=[])
257
+
258
+ resp_codes, _ = _encode_response(np.asarray(response))
259
+
260
+ bin_codes, edges = _initial_bins(values, accuracy)
261
+ n_bins = len(edges) - 1
262
+ n_resp_cats = int(resp_codes.max()) + 1
263
+
264
+ freq = _build_bin_freq_table(bin_codes, resp_codes, n_bins, n_resp_cats)
265
+
266
+ # Remove empty bins
267
+ nonempty = freq.sum(axis=0) > 0
268
+ if not np.all(nonempty):
269
+ freq = freq[:, nonempty]
270
+ edges = np.concatenate([edges[:1], edges[1:][nonempty]])
271
+ if len(edges) <= freq.shape[1]:
272
+ edges = np.append(edges, edges[-1] + accuracy)
273
+
274
+ # Bottom-up: iteratively merge best adjacent pair
275
+ current_aic = _bin_aic(freq)
276
+ while freq.shape[1] > 1:
277
+ best_delta = 0.0
278
+ best_idx = -1
279
+ best_freq: npt.NDArray[np.float64] | None = None
280
+
281
+ for i in range(freq.shape[1] - 1):
282
+ merged = _merge_bins(freq, i)
283
+ new_aic = _bin_aic(merged)
284
+ delta = new_aic - current_aic
285
+ if delta < best_delta:
286
+ best_delta = delta
287
+ best_idx = i
288
+ best_freq = merged
289
+
290
+ if best_freq is None:
291
+ break # No merge improves AIC
292
+
293
+ freq = best_freq
294
+ edges = np.delete(edges, best_idx + 1)
295
+ current_aic += best_delta
296
+
297
+ boundaries = sorted(float(e) for e in edges[1:-1])
298
+ codes = _codes_from_boundaries(values, boundaries)
299
+ return PoolingResult(codes=codes, boundaries=boundaries)
300
+
301
+
302
+ # ---------------------------------------------------------------------------
303
+ # Public dispatch
304
+ # ---------------------------------------------------------------------------
305
+
306
+
307
+ def optimal_binning(
308
+ values: npt.NDArray[np.float64],
309
+ response: npt.NDArray[np.object_],
310
+ method: str = "bottom_up",
311
+ accuracy: float | None = None,
312
+ ) -> PoolingResult:
313
+ """Categorize a continuous variable via AIC-optimal binning.
314
+
315
+ Parameters
316
+ ----------
317
+ values : ndarray
318
+ Continuous variable values.
319
+ response : ndarray
320
+ Response variable labels (categorical).
321
+ method : {'bottom_up', 'top_down'}
322
+ ``'bottom_up'`` (default) uses unequal pooling;
323
+ ``'top_down'`` uses equal pooling.
324
+ accuracy : float or None
325
+ Minimum bin width. If ``None``, auto-detected from data.
326
+
327
+ Returns
328
+ -------
329
+ PoolingResult
330
+ Bin codes and boundary values.
331
+
332
+ Raises
333
+ ------
334
+ ValueError
335
+ If *method* is not ``'bottom_up'`` or ``'top_down'``.
336
+ """
337
+ values = np.asarray(values, dtype=np.float64)
338
+ response = np.asarray(response)
339
+
340
+ if accuracy is None:
341
+ accuracy = _auto_accuracy(values)
342
+
343
+ if method == "bottom_up":
344
+ return unequal_pooling(values, response, accuracy)
345
+ if method == "top_down":
346
+ return equal_pooling(values, response, accuracy)
347
+
348
+ msg = f"method must be 'bottom_up' or 'top_down', got '{method}'"
349
+ raise ValueError(msg)