pycatdap 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycatdap/__init__.py +30 -0
- pycatdap/_aic.py +194 -0
- pycatdap/_contingency.py +130 -0
- pycatdap/_pooling.py +349 -0
- pycatdap/_subset_search.py +136 -0
- pycatdap/_version.py +34 -0
- pycatdap/catdap1.py +143 -0
- pycatdap/catdap2.py +232 -0
- pycatdap/data/__init__.py +0 -0
- pycatdap/data/health_data.csv +53 -0
- pycatdap/data/hello_goodbye.csv.gz +0 -0
- pycatdap/datasets.py +91 -0
- pycatdap/plotting.py +202 -0
- pycatdap/py.typed +0 -0
- pycatdap-0.2.0.dist-info/METADATA +72 -0
- pycatdap-0.2.0.dist-info/RECORD +18 -0
- pycatdap-0.2.0.dist-info/WHEEL +4 -0
- pycatdap-0.2.0.dist-info/licenses/LICENSE +21 -0
pycatdap/__init__.py
ADDED
@@ -0,0 +1,30 @@
"""pycatdap: Python implementation of CATDAP (CATegorical Data Analysis Program).

CATDAP applies Akaike's Information Criterion (AIC) to categorical data analysis.
Originally developed by Sakamoto & Katsura (1980) at the Institute of Statistical
Mathematics, Japan.

Main functions:
    catdap1 -- Pairwise AIC evaluation of categorical variable
               associations
    catdap2 -- Optimal explanatory variable subset search
               with continuous variable binning
"""

from __future__ import annotations

from pycatdap._version import (
    __version__,
    __version_tuple__,
)
from pycatdap.catdap1 import Catdap1Result, catdap1
from pycatdap.catdap2 import Catdap2Result, catdap2

__all__ = [
    "__version__",
    "__version_tuple__",
    "Catdap1Result",
    "Catdap2Result",
    "catdap1",
    "catdap2",
]
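A minimal import sketch (not part of the packaged files), grounded only in the names re-exported above; the call signatures of catdap1 and catdap2 live in pycatdap/catdap1.py and pycatdap/catdap2.py, which are not reproduced in this excerpt:

import pycatdap
from pycatdap import catdap1, catdap2  # result containers: Catdap1Result, Catdap2Result

# Version metadata is re-exported from pycatdap._version
print(pycatdap.__version__)  # e.g. "0.2.0"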
pycatdap/_aic.py
ADDED
@@ -0,0 +1,194 @@
"""Core AIC computation for contingency tables.

Implements the AIC statistics for two-way contingency tables as defined by
Sakamoto & Katsura (1980). Zero-frequency cells use the convention
``0 * ln(0) = 0``.
"""

from __future__ import annotations

import numpy as np
import numpy.typing as npt

# ---------------------------------------------------------------------------
# Safe x*log(y) with 0*log(0) = 0
# ---------------------------------------------------------------------------

try:
    from scipy.special import xlogy as _scipy_xlogy  # type: ignore[import-untyped]

    def _safe_xlogy(
        x: npt.NDArray[np.float64],
        y: npt.NDArray[np.float64],
    ) -> npt.NDArray[np.float64]:
        """Compute ``x * ln(y)`` with the convention ``0 * ln(0) = 0``.

        Uses :func:`scipy.special.xlogy` when available.

        Parameters
        ----------
        x : ndarray
            Frequencies (non-negative).
        y : ndarray
            Denominators (non-negative).

        Returns
        -------
        ndarray
            Element-wise ``x * ln(y)``, with 0 where *x* is zero.
        """
        return np.asarray(_scipy_xlogy(x, y), dtype=np.float64)

except ImportError:  # pragma: no cover – scipy optional

    def _safe_xlogy(
        x: npt.NDArray[np.float64],
        y: npt.NDArray[np.float64],
    ) -> npt.NDArray[np.float64]:
        """Compute ``x * ln(y)`` with the convention ``0 * ln(0) = 0``.

        Pure-numpy fallback when scipy is not installed.
        """
        x = np.asarray(x, dtype=np.float64)
        y = np.asarray(y, dtype=np.float64)
        # Guard: negative y with non-zero x indicates a bug upstream
        if np.any((y < 0) & (x != 0)):
            msg = "negative y with non-zero x: frequencies must be non-negative"
            raise ValueError(msg)
        return np.where(x == 0, 0.0, x * np.log(np.where(y > 0, y, 1.0)))


# ---------------------------------------------------------------------------
# AIC(E; F) — two-way contingency table model
# ---------------------------------------------------------------------------


def compute_aic_twoway(
    cross_freq: npt.NDArray[np.float64],
    marginal_f: npt.NDArray[np.float64],
) -> float:
    r"""Compute AIC for the two-way table model.

    .. math::

        AIC(E; F) = -2 \sum_{i,j} n_{EF}(i,j) \ln \frac{n_{EF}(i,j)}{n_F(j)}
                    + 2 (C_E - 1) C_F

    Parameters
    ----------
    cross_freq : ndarray, shape (C_E, C_F)
        Cross-frequency table.
    marginal_f : ndarray, shape (C_F,)
        Marginal frequencies of the explanatory variable.

    Returns
    -------
    float
        AIC value for the two-way model.
    """
    if cross_freq.sum() == 0:
        msg = "cross_freq must contain at least one observation"
        raise ValueError(msg)

    c_e: int = cross_freq.shape[0]
    c_f: int = cross_freq.shape[1]

    if marginal_f.shape[0] != c_f:
        msg = (
            f"marginal_f length ({marginal_f.shape[0]}) must equal "
            f"cross_freq columns ({c_f})"
        )
        raise ValueError(msg)

    # n_EF(i,j) / n_F(j) — broadcast marginal_f across rows
    ratio = np.where(marginal_f > 0, cross_freq / marginal_f, 0.0)
    log_likelihood = float(np.sum(_safe_xlogy(cross_freq, ratio)))

    penalty = 2.0 * (c_e - 1) * c_f
    return float(-2.0 * log_likelihood + penalty)


# ---------------------------------------------------------------------------
# AIC(E; φ) — null (base) model
# ---------------------------------------------------------------------------


def compute_base_aic(
    marginal_e: npt.NDArray[np.float64],
    n: int,
) -> float:
    r"""Compute AIC for the null model (no explanatory variable).

    .. math::

        AIC(E; \phi) = -2 \sum_i n_E(i) \ln \frac{n_E(i)}{n} + 2 (C_E - 1)

    Parameters
    ----------
    marginal_e : ndarray, shape (C_E,)
        Marginal frequencies of the response variable.
    n : int
        Total number of observations.

    Returns
    -------
    float
        AIC value for the null model.

    Raises
    ------
    ValueError
        If *n* is not positive.
    """
    if n <= 0:
        msg = "n must be positive; received an empty dataset"
        raise ValueError(msg)

    c_e = len(marginal_e)

    ratio = marginal_e / n
    log_likelihood: float = float(np.sum(_safe_xlogy(marginal_e, ratio)))

    penalty = 2.0 * (c_e - 1)
    return -2.0 * log_likelihood + penalty


# ---------------------------------------------------------------------------
# ΔAIC = AIC(E; F) − AIC(E; φ)
# ---------------------------------------------------------------------------


def compute_delta_aic(
    cross_freq: npt.NDArray[np.float64],
    marginal_e: npt.NDArray[np.float64],
    marginal_f: npt.NDArray[np.float64],
    n: int,
) -> float:
    r"""Compute the delta AIC between two-way and null models.

    .. math::

        \Delta AIC = AIC(E; F) - AIC(E; \phi)

    A negative value indicates that the explanatory variable *F* is
    informative about the response *E*.

    Parameters
    ----------
    cross_freq : ndarray, shape (C_E, C_F)
        Cross-frequency table.
    marginal_e : ndarray, shape (C_E,)
        Marginal frequencies of the response variable.
    marginal_f : ndarray, shape (C_F,)
        Marginal frequencies of the explanatory variable.
    n : int
        Total number of observations.

    Returns
    -------
    float
        ΔAIC value (negative = explanatory variable is useful).
    """
    aic_twoway = compute_aic_twoway(cross_freq, marginal_f)
    aic_base = compute_base_aic(marginal_e, n)
    return aic_twoway - aic_base
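A short worked sketch of the two AIC statistics and their difference, using a made-up 2x2 table (the numbers are illustrative, not from the package's test data):

import numpy as np

from pycatdap._aic import compute_aic_twoway, compute_base_aic, compute_delta_aic

# Rows = response categories (C_E = 2), columns = explanatory categories (C_F = 2)
cross_freq = np.array([[30.0, 10.0],
                       [10.0, 30.0]])
marginal_e = cross_freq.sum(axis=1)  # n_E(i)
marginal_f = cross_freq.sum(axis=0)  # n_F(j)
n = int(cross_freq.sum())

aic_ef = compute_aic_twoway(cross_freq, marginal_f)   # AIC(E; F)
aic_null = compute_base_aic(marginal_e, n)            # AIC(E; phi)
delta = compute_delta_aic(cross_freq, marginal_e, marginal_f, n)

assert np.isclose(delta, aic_ef - aic_null)
print(round(delta, 2))  # about -18.9: negative, so F is informative about E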
pycatdap/_contingency.py
ADDED
@@ -0,0 +1,130 @@
"""Contingency table construction utilities.

Converts pandas DataFrames into numpy arrays suitable for AIC computation.
Input DataFrames are never mutated.
"""

from __future__ import annotations

import numpy as np
import pandas as pd
from numpy.typing import NDArray


def build_crosstab(
    data: pd.DataFrame,
    response: str,
    explanatory: str,
) -> tuple[NDArray[np.float64], NDArray[np.float64], NDArray[np.float64], int]:
    """Build a two-way frequency table from a DataFrame.

    Rows with ``NaN`` in either column are dropped before tabulation.

    Parameters
    ----------
    data : DataFrame
        Input data (not modified).
    response : str
        Column name of the response variable (rows of cross-table).
    explanatory : str
        Column name of the explanatory variable (columns of cross-table).

    Returns
    -------
    cross_freq : ndarray, shape (C_E, C_F)
        Cross-frequency table.
    marginal_e : ndarray, shape (C_E,)
        Marginal frequencies of the response variable.
    marginal_f : ndarray, shape (C_F,)
        Marginal frequencies of the explanatory variable.
    n : int
        Total number of valid observations.

    Raises
    ------
    KeyError
        If *response* or *explanatory* is not in the DataFrame.
    """
    # Validate columns exist (raises KeyError if not)
    _ = data[response], data[explanatory]

    # Drop NaN in the two relevant columns only — do not modify input
    subset = data[[response, explanatory]].dropna()

    ct = pd.crosstab(subset[response], subset[explanatory])
    cross_freq = ct.to_numpy(dtype=np.float64)
    marginal_e = cross_freq.sum(axis=1).astype(np.float64)
    marginal_f = cross_freq.sum(axis=0).astype(np.float64)
    n = int(cross_freq.sum())

    return cross_freq, marginal_e, marginal_f, n


def build_multidim_crosstab(
    data: pd.DataFrame,
    response: str,
    explanatory_set: list[str],
) -> tuple[NDArray[np.float64], NDArray[np.float64], NDArray[np.float64], int]:
    """Build a cross-frequency table with a composite explanatory variable.

    The explanatory variables are combined into a single composite variable
    (tupled categories) before tabulation.

    Parameters
    ----------
    data : DataFrame
        Input data (not modified).
    response : str
        Column name of the response variable.
    explanatory_set : list[str]
        Column names of the explanatory variables to combine.

    Returns
    -------
    cross_freq : ndarray, shape (C_E, C_F_combined)
        Cross-frequency table.
    marginal_e : ndarray, shape (C_E,)
        Marginal frequencies of the response variable.
    marginal_f : ndarray, shape (C_F_combined,)
        Marginal frequencies of the composite explanatory variable.
    n : int
        Total number of valid observations.

    Raises
    ------
    ValueError
        If *explanatory_set* is empty.
    KeyError
        If *response* or any column in *explanatory_set* is not in the DataFrame.
    """
    if not explanatory_set:
        msg = "explanatory_set must not be empty"
        raise ValueError(msg)

    # Validate all columns exist
    _ = data[response]
    for col in explanatory_set:
        _ = data[col]

    if len(explanatory_set) == 1:
        return build_crosstab(data, response, explanatory_set[0])

    cols = [response, *explanatory_set]
    subset = data[cols].dropna()

    # Use unit separator (U+001F) to avoid collisions with category labels
    composite = subset[explanatory_set].astype(str).agg("\x1f".join, axis=1)
    temp = pd.DataFrame(
        {
            "_response_": subset[response].to_numpy(),
            "_composite_": composite.to_numpy(),
        }
    )

    ct = pd.crosstab(temp["_response_"], temp["_composite_"])
    cross_freq = ct.to_numpy(dtype=np.float64)
    marginal_e = cross_freq.sum(axis=1).astype(np.float64)
    marginal_f = cross_freq.sum(axis=0).astype(np.float64)
    n = int(cross_freq.sum())

    return cross_freq, marginal_e, marginal_f, n
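A minimal sketch wiring this module to pycatdap._aic; the toy DataFrame is invented for illustration:

import pandas as pd

from pycatdap._aic import compute_delta_aic
from pycatdap._contingency import build_crosstab, build_multidim_crosstab

# Toy data; the row with None is dropped by dropna() before tabulation
df = pd.DataFrame(
    {
        "outcome": ["good", "bad", "good", "bad", "good", None],
        "treatment": ["A", "A", "B", "B", "B", "A"],
        "sex": ["m", "f", "m", "f", "f", "m"],
    }
)

cross_freq, marginal_e, marginal_f, n = build_crosstab(df, "outcome", "treatment")
print(n)  # 5 valid observations
print(compute_delta_aic(cross_freq, marginal_e, marginal_f, n))

# Composite explanatory variable built from (treatment, sex)
cf, me, mf, n2 = build_multidim_crosstab(df, "outcome", ["treatment", "sex"])
print(cf.shape)  # (response categories, observed (treatment, sex) combinations)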
pycatdap/_pooling.py
ADDED
@@ -0,0 +1,349 @@
"""Continuous variable categorization (pooling) via AIC minimization.

Converts continuous variables into categorical ones by finding AIC-optimal
bin boundaries. Two methods are provided:

* **Equal pooling** (``pool=0``): top-down, equal-interval bins merged greedily.
* **Unequal pooling** (``pool=1``, default): bottom-up, fine bins merged until
  no merge improves AIC.

Input arrays are never mutated.
"""

from __future__ import annotations

from dataclasses import dataclass

import numpy as np
import numpy.typing as npt

from pycatdap._aic import _safe_xlogy

# ---------------------------------------------------------------------------
# Result container
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class PoolingResult:
    """Result of AIC-optimal binning.

    Attributes
    ----------
    codes : ndarray of int
        Bin assignment for each observation (0-indexed).
    boundaries : list[float]
        Sorted internal bin boundary values. ``len(boundaries) == n_bins - 1``
        where *n_bins* is the number of distinct bins.
    """

    codes: npt.NDArray[np.intp]
    boundaries: list[float]


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _auto_accuracy(values: npt.NDArray[np.float64]) -> float:
    """Guess accuracy from the smallest non-zero gap between sorted values."""
    sorted_vals = np.sort(np.unique(values))
    if len(sorted_vals) <= 1:
        return 1.0
    diffs = np.diff(sorted_vals)
    positive_diffs = diffs[diffs > 0]
    if len(positive_diffs) == 0:
        return 1.0
    return float(np.min(positive_diffs))


def _initial_bins(
    values: npt.NDArray[np.float64],
    accuracy: float,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.float64]]:
    """Divide the value range into fine-grained bins of width *accuracy*.

    Returns
    -------
    bin_codes : ndarray of int
        Bin index for each observation.
    edges : ndarray of float
        Bin edge values (length = n_bins + 1).
    """
    vmin, vmax = float(values.min()), float(values.max())

    if vmax - vmin < accuracy:
        # All values in a single bin
        return np.zeros(len(values), dtype=np.intp), np.array([vmin, vmax + accuracy])

    n_bins = max(1, int(np.ceil((vmax - vmin) / accuracy)))
    edges: npt.NDArray[np.float64] = np.linspace(
        vmin, vmax + accuracy * 0.01, n_bins + 1
    ).astype(np.float64)
    # np.digitize against the internal edges yields 0-based bin indices; clip to the valid range
    codes = np.clip(np.digitize(values, edges[1:-1]), 0, n_bins - 1).astype(np.intp)
    return codes, edges


def _build_bin_freq_table(
    bin_codes: npt.NDArray[np.intp],
    response_codes: npt.NDArray[np.intp],
    n_bins: int,
    n_resp_cats: int,
) -> npt.NDArray[np.float64]:
    """Build a (n_resp_cats, n_bins) frequency table from bin and response codes."""
    freq = np.zeros((n_resp_cats, n_bins), dtype=np.float64)
    np.add.at(freq, (response_codes, bin_codes), 1)
    return freq


def _bin_aic(freq: npt.NDArray[np.float64]) -> float:
    """Compute AIC(E; F) for a frequency table (response x bins).

    AIC = -2 * sum(n_ij * ln(n_ij / n_j)) + 2 * (C_E - 1) * C_F
    """
    c_e, c_f = freq.shape
    marg_f = freq.sum(axis=0)
    ratio = np.where(marg_f > 0, freq / marg_f, 0.0)
    loglik = float(np.sum(_safe_xlogy(freq, ratio)))
    penalty = 2.0 * (c_e - 1) * c_f
    return float(-2.0 * loglik + penalty)


def _merge_bins(
    freq: npt.NDArray[np.float64],
    i: int,
) -> npt.NDArray[np.float64]:
    """Return a new frequency table with bins *i* and *i+1* merged."""
    new_freq: npt.NDArray[np.float64] = np.delete(freq, i + 1, axis=1).copy()
    new_freq[:, i] = freq[:, i] + freq[:, i + 1]
    return new_freq


def _encode_response(
    response: npt.NDArray[np.object_],
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.object_]]:
    """Encode response labels to integer codes."""
    uniq, codes = np.unique(response, return_inverse=True)
    return codes.astype(np.intp), uniq


def _codes_from_boundaries(
    values: npt.NDArray[np.float64],
    boundaries: list[float],
) -> npt.NDArray[np.intp]:
    """Assign bin codes from a sorted boundary list."""
    if not boundaries:
        return np.zeros(len(values), dtype=np.intp)
    return np.digitize(values, boundaries).astype(np.intp)


# ---------------------------------------------------------------------------
# Equal pooling (pool=0, top-down)
# ---------------------------------------------------------------------------


def equal_pooling(
    values: npt.NDArray[np.float64],
    response: npt.NDArray[np.object_],
    accuracy: float,
) -> PoolingResult:
    """Equal-interval pooling (top-down greedy merge).

    Start with bins of width *accuracy*, then greedily merge adjacent
    bins when merging reduces AIC.

    Parameters
    ----------
    values : ndarray
        Continuous variable values.
    response : ndarray
        Response variable labels (categorical).
    accuracy : float
        Minimum bin width (observation precision).

    Returns
    -------
    PoolingResult
    """
    values = np.asarray(values, dtype=np.float64)
    if len(values) == 0:
        return PoolingResult(codes=np.array([], dtype=np.intp), boundaries=[])

    resp_codes, _ = _encode_response(np.asarray(response))

    bin_codes, edges = _initial_bins(values, accuracy)
    n_bins = len(edges) - 1
    n_resp_cats = int(resp_codes.max()) + 1

    freq = _build_bin_freq_table(bin_codes, resp_codes, n_bins, n_resp_cats)

    # Remove empty bins
    nonempty = freq.sum(axis=0) > 0
    if not np.all(nonempty):
        freq = freq[:, nonempty]
        edges = np.concatenate(
            # Keep first edge, then non-empty right edges
            [edges[:1], edges[1:][nonempty]]
        )
        # Pad last edge if needed
        if len(edges) <= freq.shape[1]:
            edges = np.append(edges, edges[-1] + accuracy)

    # Greedy merge: merge the pair that most decreases AIC
    current_aic = _bin_aic(freq)
    changed = True
    while changed and freq.shape[1] > 1:
        changed = False
        best_delta = 0.0
        best_idx = -1
        best_freq: npt.NDArray[np.float64] | None = None

        for i in range(freq.shape[1] - 1):
            merged = _merge_bins(freq, i)
            new_aic = _bin_aic(merged)
            delta = new_aic - current_aic
            if delta < best_delta:
                best_delta = delta
                best_idx = i
                best_freq = merged

        if best_freq is not None:
            freq = best_freq
            # Remove the merged edge
            edges = np.delete(edges, best_idx + 1)
            current_aic += best_delta
            changed = True

    # Build boundaries (internal edges, excluding first and last)
    boundaries = sorted(float(e) for e in edges[1:-1])
    codes = _codes_from_boundaries(values, boundaries)
    return PoolingResult(codes=codes, boundaries=boundaries)


# ---------------------------------------------------------------------------
# Unequal pooling (pool=1, bottom-up) — default
# ---------------------------------------------------------------------------


def unequal_pooling(
    values: npt.NDArray[np.float64],
    response: npt.NDArray[np.object_],
    accuracy: float,
) -> PoolingResult:
    """Unequal-interval pooling (bottom-up greedy merge).

    Start with fine bins of width *accuracy*, then iteratively merge the
    adjacent pair that yields the largest AIC decrease. Stop when no
    merge improves AIC.

    Parameters
    ----------
    values : ndarray
        Continuous variable values.
    response : ndarray
        Response variable labels (categorical).
    accuracy : float
        Minimum bin width.

    Returns
    -------
    PoolingResult
    """
    values = np.asarray(values, dtype=np.float64)
    if len(values) == 0:
        return PoolingResult(codes=np.array([], dtype=np.intp), boundaries=[])

    resp_codes, _ = _encode_response(np.asarray(response))

    bin_codes, edges = _initial_bins(values, accuracy)
    n_bins = len(edges) - 1
    n_resp_cats = int(resp_codes.max()) + 1

    freq = _build_bin_freq_table(bin_codes, resp_codes, n_bins, n_resp_cats)

    # Remove empty bins
    nonempty = freq.sum(axis=0) > 0
    if not np.all(nonempty):
        freq = freq[:, nonempty]
        edges = np.concatenate([edges[:1], edges[1:][nonempty]])
        if len(edges) <= freq.shape[1]:
            edges = np.append(edges, edges[-1] + accuracy)

    # Bottom-up: iteratively merge best adjacent pair
    current_aic = _bin_aic(freq)
    while freq.shape[1] > 1:
        best_delta = 0.0
        best_idx = -1
        best_freq: npt.NDArray[np.float64] | None = None

        for i in range(freq.shape[1] - 1):
            merged = _merge_bins(freq, i)
            new_aic = _bin_aic(merged)
            delta = new_aic - current_aic
            if delta < best_delta:
                best_delta = delta
                best_idx = i
                best_freq = merged

        if best_freq is None:
            break  # No merge improves AIC

        freq = best_freq
        edges = np.delete(edges, best_idx + 1)
        current_aic += best_delta

    boundaries = sorted(float(e) for e in edges[1:-1])
    codes = _codes_from_boundaries(values, boundaries)
    return PoolingResult(codes=codes, boundaries=boundaries)


# ---------------------------------------------------------------------------
# Public dispatch
# ---------------------------------------------------------------------------


def optimal_binning(
    values: npt.NDArray[np.float64],
    response: npt.NDArray[np.object_],
    method: str = "bottom_up",
    accuracy: float | None = None,
) -> PoolingResult:
    """Categorize a continuous variable via AIC-optimal binning.

    Parameters
    ----------
    values : ndarray
        Continuous variable values.
    response : ndarray
        Response variable labels (categorical).
    method : {'bottom_up', 'top_down'}
        ``'bottom_up'`` (default) uses unequal pooling;
        ``'top_down'`` uses equal pooling.
    accuracy : float or None
        Minimum bin width. If ``None``, auto-detected from data.

    Returns
    -------
    PoolingResult
        Bin codes and boundary values.

    Raises
    ------
    ValueError
        If *method* is not ``'bottom_up'`` or ``'top_down'``.
    """
    values = np.asarray(values, dtype=np.float64)
    response = np.asarray(response)

    if accuracy is None:
        accuracy = _auto_accuracy(values)

    if method == "bottom_up":
        return unequal_pooling(values, response, accuracy)
    if method == "top_down":
        return equal_pooling(values, response, accuracy)

    msg = f"method must be 'bottom_up' or 'top_down', got '{method}'"
    raise ValueError(msg)
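A small usage sketch of the public dispatcher above; the synthetic data and random seed are for illustration only:

import numpy as np

from pycatdap._pooling import optimal_binning

rng = np.random.default_rng(0)

# Continuous variable whose distribution differs by response class
age = np.concatenate([rng.normal(40.0, 5.0, 200), rng.normal(60.0, 5.0, 200)])
label = np.array(["healthy"] * 200 + ["sick"] * 200)

# Bottom-up (unequal) pooling is the default; accuracy sets the finest bin width
result = optimal_binning(age, label, method="bottom_up", accuracy=1.0)

print(len(result.boundaries) + 1)  # number of bins retained after merging
print(result.boundaries)           # internal cut points, sorted
print(result.codes[:5])            # 0-indexed bin assignment per observation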