catstat 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- catstat/__init__.py +15 -0
- catstat/_aggregations.py +57 -0
- catstat/_base.py +387 -0
- catstat/_cross_fit.py +83 -0
- catstat/_feature_names.py +49 -0
- catstat/_smoothing.py +56 -0
- catstat/_stats.py +101 -0
- catstat/_validation.py +128 -0
- catstat/backends/__init__.py +5 -0
- catstat/backends/_cpu.py +66 -0
- catstat/backends/_dispatch.py +54 -0
- catstat/backends/_gpu.py +120 -0
- catstat/count_encoder.py +36 -0
- catstat/frequency_encoder.py +26 -0
- catstat/py.typed +0 -0
- catstat/target_encoder.py +59 -0
- catstat-0.1.1.dist-info/METADATA +138 -0
- catstat-0.1.1.dist-info/RECORD +20 -0
- catstat-0.1.1.dist-info/WHEEL +4 -0
- catstat-0.1.1.dist-info/licenses/LICENSE +21 -0
catstat/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""catstat -- unified CPU/GPU statistical categorical encoding.
|
|
2
|
+
|
|
3
|
+
Leakage-safe target encoding generalized to arbitrary statistics, with one sklearn-compatible API.
|
|
4
|
+
Runs on CPU (pandas/numpy) today; the GPU path (cuDF/CuPy) is parity-validated but auto-selection
|
|
5
|
+
stays on CPU until it is faster (see docs/roadmap.md and docs/known_issues.md, KI-020).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from .count_encoder import CountEncoder
|
|
11
|
+
from .frequency_encoder import FrequencyEncoder
|
|
12
|
+
from .target_encoder import TargetEncoder
|
|
13
|
+
|
|
14
|
+
__all__ = ["TargetEncoder", "CountEncoder", "FrequencyEncoder", "__version__"]
|
|
15
|
+
__version__ = "0.1.1"
|
catstat/_aggregations.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Non-mean target statistics: var / std / median / min / max / skew, plus custom aggregations.
|
|
2
|
+
|
|
3
|
+
These have **no principled smoothing** (the smoothing honesty rule): order statistics never blend,
|
|
4
|
+
and var/std default to no shrinkage. Each falls back to the **global** statistic for unseen
|
|
5
|
+
categories and for categories below ``min_samples_category`` (or where the statistic is undefined,
|
|
6
|
+
e.g. the sample variance of a singleton). Continuous targets only -- the encoders reject these for
|
|
7
|
+
classification.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pandas as pd
|
|
14
|
+
|
|
15
|
+
from .backends import _cpu
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def global_stat(y, stat: str) -> float:
    """Compute the whole-sample value of a non-mean statistic.

    ``y`` is coerced to a float array first. ``var`` and ``std`` use the sample
    (``ddof=1``) definition and return ``0.0`` when there are fewer than two
    observations; ``skew`` returns ``0.0`` where pandas reports it as undefined.

    Raises:
        ValueError: if ``stat`` is not one of var/std/median/min/max/skew.
    """
    values = np.asarray(y, dtype=float)

    # Dispersion statistics: undefined for fewer than two points -> 0.0.
    if stat == "var" or stat == "std":
        if len(values) <= 1:
            return 0.0
        spread = np.var if stat == "var" else np.std
        return float(spread(values, ddof=1))

    # Order statistics map straight onto a numpy reducer.
    for label, reducer in (("median", np.median), ("min", np.min), ("max", np.max)):
        if stat == label:
            return float(reducer(values))

    if stat == "skew":
        skewness = pd.Series(values).skew()  # NaN for n < 3
        if pd.notna(skewness):
            return float(skewness)
        return 0.0

    raise ValueError(f"Unknown non-mean stat {stat!r}.")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def fit_custom_encoding(keys, y, fn, min_samples: int) -> tuple[pd.Series, float]:
    """Fit the per-category table for a user-supplied aggregation ``fn`` (CPU only).

    Returns ``(encoding_by_category, global_fallback)``. The fallback is ``fn``
    applied to the whole target; it replaces the per-category value wherever that
    value is NaN or the category has fewer than ``max(min_samples, 1)`` rows.
    """
    table = _cpu.category_agg_custom(keys, y, fn)
    # Category sizes aligned to the table's index so the two can be compared row by row.
    support = pd.Series(keys).value_counts().reindex(table.index)
    global_value = float(fn(np.asarray(y, dtype=float)))

    threshold = max(int(min_samples), 1)
    needs_fallback = table.isna() | (support < threshold)
    encoding = table.where(~needs_fallback, global_value).astype(float)
    return encoding, global_value
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def fit_stat_encoding(
    keys, y, stat: str, min_samples: int, backend=None
) -> tuple[pd.Series, float]:
    """Fit the per-category table for a dispersion/order statistic.

    Returns ``(encoding_by_category, global_fallback)``. The group-by runs on
    ``backend`` (the CPU backend when ``None``). Categories whose statistic is NaN,
    or that have fewer than ``max(min_samples, 1)`` rows, receive the global
    statistic instead.
    """
    engine = _cpu if backend is None else backend
    # Series indexed by category; NaN where the statistic is undefined (e.g. var of n=1).
    table = engine.category_agg(keys, y, stat)
    support = pd.Series(keys).value_counts().reindex(table.index)
    global_value = global_stat(y, stat)

    threshold = max(int(min_samples), 1)
    needs_fallback = table.isna() | (support < threshold)
    return table.where(~needs_fallback, global_value).astype(float), global_value
|
catstat/_base.py
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
1
|
+
"""``_BaseStatEncoder`` -- the shared fit/transform/fit_transform skeleton.
|
|
2
|
+
|
|
3
|
+
All statistics/leakage logic lives here and in the small helper modules; only ``backends/``
|
|
4
|
+
knows pandas vs cuDF. Subclasses define the sklearn ``__init__`` params and two hooks:
|
|
5
|
+
``_is_supervised`` and ``_resolve_stat_specs``.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
import pandas as pd
|
|
12
|
+
from sklearn.base import BaseEstimator, TransformerMixin
|
|
13
|
+
from sklearn.utils import check_random_state
|
|
14
|
+
from sklearn.utils.validation import check_is_fitted
|
|
15
|
+
|
|
16
|
+
from ._aggregations import fit_custom_encoding, fit_stat_encoding
|
|
17
|
+
from ._cross_fit import loo_encode, make_folds, ordered_encode, resolve_cv
|
|
18
|
+
from ._feature_names import build_columns
|
|
19
|
+
from ._smoothing import fit_mean_encoding
|
|
20
|
+
from ._validation import (
|
|
21
|
+
check_handle,
|
|
22
|
+
infer_target_type,
|
|
23
|
+
normalize_keys,
|
|
24
|
+
prepare_X,
|
|
25
|
+
select_cols,
|
|
26
|
+
)
|
|
27
|
+
from .backends import _cpu
|
|
28
|
+
from .backends._dispatch import backend_module, select_backend
|
|
29
|
+
|
|
30
|
+
_VALID_OUTPUT = ("auto", "numpy", "pandas", "polars")
|
|
31
|
+
_DEFERRED_OUTPUT = ("cudf", "cupy")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class _BaseStatEncoder(TransformerMixin, BaseEstimator):
    """Shared fit / transform / fit_transform skeleton for the statistical encoders.

    Subclasses define the constructor parameters that are read here (``cols``,
    ``output``, ``backend``, ``handle_unknown``, ``handle_missing``, ``target_type``,
    ``smooth``, ``cv``, ``shuffle``, ``random_state``; ``multi_feature_mode``,
    ``scheme`` and ``min_samples_category`` are optional and read with defaults) and
    implement the two hooks ``_is_supervised`` and ``_resolve_stat_specs``.
    """

    # ---- hooks the subclasses must implement -------------------------------------------------
    def _is_supervised(self) -> bool:
        """Return True when the encoder needs ``y`` in ``fit``."""
        raise NotImplementedError

    def _resolve_stat_specs(self):
        """Return the list of stat specs (objects with ``name``, ``func``, ... attributes)."""
        raise NotImplementedError

    # ---- scikit-learn estimator tags ---------------------------------------------------------
    # catstat encoders are categorical encoders: they accept string/categorical columns and learn
    # NaN as its own level when handle_missing="value"; supervised encoders additionally require y.
    # Both tag APIs are provided -- __sklearn_tags__ for scikit-learn >= 1.6, and _more_tags for
    # < 1.6 (newer versions ignore it; older ones ignore __sklearn_tags__).
    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.target_tags.required = self._is_supervised()
        tags.input_tags.categorical = True
        tags.input_tags.string = True
        tags.input_tags.allow_nan = True
        return tags

    def _more_tags(self):
        return {
            "requires_y": self._is_supervised(),
            "X_types": ["categorical", "string", "2darray"],
            "allow_nan": True,
        }

    # ---- key helper --------------------------------------------------------------------------
    @staticmethod
    def _key(meta):
        """Return the ``(feature, stat, class_label)`` key used to index the fitted tables."""
        return (meta.feature, meta.stat, meta.class_label)

    # ---- fit ---------------------------------------------------------------------------------
    def fit(self, X, y=None):
        """Validate parameters, learn the full-data encoding tables and return ``self``.

        Raises:
            ValueError: for an invalid ``multi_feature_mode`` / ``output`` / ``scheme``,
                a missing or mis-sized ``y`` on a supervised encoder, or a
                continuous-only stat requested for a non-continuous target.
            NotImplementedError: for the deferred ``output`` containers.
        """
        check_handle("handle_unknown", self.handle_unknown)
        check_handle("handle_missing", self.handle_missing)
        mode = getattr(self, "multi_feature_mode", "independent")
        if mode not in ("independent", "combination"):
            raise ValueError(
                f"multi_feature_mode={mode!r} must be 'independent' or 'combination'."
            )
        if self.output in _DEFERRED_OUTPUT:
            raise NotImplementedError(f"output={self.output!r} is not supported in M0 (CPU).")
        if self.output not in _VALID_OUTPUT:
            raise ValueError(f"output={self.output!r} must be one of {_VALID_OUTPUT}.")

        Xdf, was_df, all_cols = prepare_X(X)
        self.n_features_in_ = Xdf.shape[1]
        if was_df:
            self.feature_names_in_ = np.asarray(all_cols, dtype=object)
        self._cat_cols = select_cols(Xdf, self.cols)
        # Encoding units: independent -> one per column; combination -> one joint unit.
        if mode == "combination" and len(self._cat_cols) > 1:
            joint = "+".join(str(c) for c in self._cat_cols)
            self._units = [(joint, list(self._cat_cols))]
        else:
            self._units = [(c, [c]) for c in self._cat_cols]
        self._unit_cols = dict(self._units)
        self._specs = self._resolve_stat_specs()
        self.stats_ = [s.name for s in self._specs]

        supervised = self._is_supervised()
        if supervised:
            if y is None:
                raise ValueError(f"{type(self).__name__} requires y to be supplied to fit().")
            y_arr = np.asarray(y)
            if len(y_arr) != Xdf.shape[0]:
                raise ValueError("X and y have inconsistent lengths.")
            self.target_type_ = infer_target_type(y_arr, self.target_type)
            self.classes_ = (
                np.unique(y_arr) if self.target_type_ in ("binary", "multiclass") else None
            )
        else:
            y_arr = None
            self.target_type_ = None
            self.classes_ = None

        for spec in self._specs:
            if spec.continuous_only and self.target_type_ != "continuous":
                raise ValueError(
                    f"stat={spec.name!r} requires a continuous target; got "
                    f"target_type={self.target_type_!r}. Dispersion/order statistics on "
                    "classification targets are not supported."
                )

        scheme = getattr(self, "scheme", "kfold")
        if scheme not in ("kfold", "loo", "ordered"):
            raise ValueError(f"scheme={scheme!r} must be 'kfold', 'loo', or 'ordered'.")
        if scheme != "kfold":
            bad = [s.name for s in self._specs if s.target_dependent and s.name != "mean"]
            if bad:
                raise ValueError(
                    f"scheme={scheme!r} cross-fits the mean only (count/frequency are allowed "
                    f"too); got target-dependent stats {bad}. Use scheme='kfold' for those."
                )

        self._columns_meta = build_columns(
            [name for name, _ in self._units], self._specs, self.target_type_, self.classes_
        )
        self.feature_names_out_ = np.asarray([m.name for m in self._columns_meta], dtype=object)

        all_gpu = all(s.gpu_supported for s in self._specs)
        self._backend_mod, self.backend_ = select_backend(
            self.backend, Xdf.shape[0], len(self._cat_cols), all_gpu
        )
        # GPU can't run tuple keys (combination) or CPU-only stats (skew/custom) -> host only.
        host_only = (not all_gpu) or any(len(cols) > 1 for _, cols in self._units)
        if self.backend_ == "gpu" and host_only:
            self._backend_mod, self.backend_ = _cpu, _cpu.NAME

        self._fit_tables = self._fit_all(Xdf, y_arr)

        # public fitted attributes derived from the full-data tables
        self.categories_ = {}
        self.global_stats_ = {}
        for meta in self._columns_meta:
            enc, fb = self._fit_tables[self._key(meta)]
            self.global_stats_[meta.name] = fb
            self.categories_.setdefault(meta.feature, np.asarray(list(enc.index), dtype=object))
        self._set_target_mean()
        return self

    def _set_target_mean(self):
        """Publish ``target_mean_`` from the first unit's fitted ``mean`` table(s).

        The attribute is only defined for supervised encoders that fitted a ``mean``
        stat on at least one unit; in every other case it is left unset.
        """
        # Drop any value left over from a previous fit so a refit without a "mean"
        # stat cannot expose a stale target mean.
        vars(self).pop("target_mean_", None)
        if not self._is_supervised() or "mean" not in self.stats_:
            return
        if not self._units:
            # No encoding unit -> no fitted table to read the mean from.
            return
        f0 = self._units[0][0]
        if self.target_type_ == "multiclass":
            self.target_mean_ = np.asarray(
                [self._fit_tables[(f0, "mean", c)][1] for c in self.classes_], dtype=float
            )
        else:
            self.target_mean_ = float(self._fit_tables[(f0, "mean", None)][1])

    # ---- per-statistic fitting ---------------------------------------------------------------
    def _fit_all(self, Xdf, y_arr):
        """Return the full encoding tables: ``{(feature, stat, class): (Series, fallback)}``."""
        tables = {}
        hm = self.handle_missing
        for feat, cols in self._units:
            keys_full, missing_mask = self._unit_keys(Xdf, cols)
            if hm == "error" and missing_mask.any():
                raise ValueError(
                    f"Missing values in unit {feat!r} with handle_missing='error'."
                )
            # handle_missing="value" learns from every row; otherwise missing rows are excluded.
            sel = np.ones(len(keys_full), dtype=bool) if hm == "value" else ~missing_mask
            keys = keys_full[sel]
            n_total = int(sel.sum())
            for spec in self._specs:
                if spec.name == "count":
                    tables[(feat, "count", None)] = self._fit_count(keys, False, n_total)
                elif spec.name == "frequency":
                    tables[(feat, "frequency", None)] = self._fit_count(keys, True, n_total)
                elif spec.name == "mean":
                    y_sel = y_arr[sel]
                    bk = self._backend_mod
                    if self.target_type_ == "continuous":
                        tables[(feat, "mean", None)] = fit_mean_encoding(
                            keys, y_sel.astype(float), self.smooth, bk
                        )
                    elif self.target_type_ == "binary":
                        # probability of the second (sorted) class
                        yb = (y_sel == self.classes_[1]).astype(float)
                        tables[(feat, "mean", None)] = fit_mean_encoding(keys, yb, self.smooth, bk)
                    else:  # multiclass: one-vs-rest per global class
                        for c in self.classes_:
                            yc = (y_sel == c).astype(float)
                            tables[(feat, "mean", c)] = fit_mean_encoding(keys, yc, self.smooth, bk)
                else:  # var/std/median/min/max/skew or custom (continuous-only, target-dependent)
                    min_samples = getattr(self, "min_samples_category", 1)
                    y_sel_f = y_arr[sel].astype(float)
                    if spec.func is not None:
                        tables[(feat, spec.name, None)] = fit_custom_encoding(
                            keys, y_sel_f, spec.func, min_samples
                        )
                    else:
                        tables[(feat, spec.name, None)] = fit_stat_encoding(
                            keys, y_sel_f, spec.name, min_samples, self._backend_mod
                        )
        return tables

    @staticmethod
    def _fit_count(keys, normalize, n_total):
        """Return ``(counts_by_category, 0.0)``; counts are divided by ``n_total`` if ``normalize``."""
        vc = pd.Series(keys).value_counts().astype(float)
        if normalize:
            vc = vc / float(max(n_total, 1))  # max(...) guards the empty-input division
        return vc, 0.0

    def _unit_keys(self, Xdf, cols):
        """Return ``(keys, missing_mask)`` for an encoding unit.

        A single-column unit uses the column's normalized keys directly. A combination unit uses
        the tuple of its components' keys as one joint category; the row counts as missing if any
        component is missing.
        """
        if len(cols) == 1:
            return normalize_keys(Xdf[cols[0]].to_numpy())
        comp_keys = []
        missing = np.zeros(Xdf.shape[0], dtype=bool)
        for c in cols:
            k, m = normalize_keys(Xdf[c].to_numpy())
            comp_keys.append(k)
            missing = missing | m
        # Fill an object array element by element: assigning a list of tuples in one go
        # would make numpy try to build a 2-D array instead of storing tuples.
        joint = np.empty(Xdf.shape[0], dtype=object)
        for i, combo in enumerate(zip(*comp_keys)):
            joint[i] = combo
        return joint, missing

    # ---- transform ---------------------------------------------------------------------------
    def _transform_array(self, Xdf, tables) -> np.ndarray:
        """Map every output column of ``Xdf`` through ``tables`` into a float array.

        Rows whose key is absent from a table are handled per ``handle_unknown``;
        missing rows are handled per ``handle_missing``.
        """
        n = Xdf.shape[0]
        out = np.full((n, len(self._columns_meta)), np.nan, dtype=float)
        hm, hu = self.handle_missing, self.handle_unknown
        cache: dict = {}  # unit name -> (keys, missing_mask); shared by that unit's columns
        for j, meta in enumerate(self._columns_meta):
            feat = meta.feature
            if feat not in cache:
                keys, missing_mask = self._unit_keys(Xdf, self._unit_cols[feat])
                if hm == "error" and missing_mask.any():
                    raise ValueError(
                        f"Missing values in unit {feat!r} with handle_missing='error'."
                    )
                cache[feat] = (keys, missing_mask)
            keys, missing_mask = cache[feat]

            enc_series, fallback = tables[self._key(meta)]
            mapped = pd.Series(keys).map(enc_series).to_numpy(dtype=float)
            col = mapped.copy()
            notfound = np.isnan(mapped)

            if hm == "return_nan":
                col[missing_mask] = np.nan
            if hm == "value":
                # rows not found are either unseen real categories OR an unseen missing level
                unknown = notfound
            else:
                # "return_nan" / "error": missing rows are never treated as unknown categories
                unknown = notfound & ~missing_mask
            self._apply_unknown(col, unknown, fallback, hu, feat)
            out[:, j] = col
        return out

    @staticmethod
    def _apply_unknown(col, mask, fallback, hu, feat):
        """Apply ``handle_unknown`` to the ``mask`` rows of ``col`` (modified in place).

        Raises:
            ValueError: if any row is masked and ``hu == "error"``.
        """
        if not mask.any():
            return col
        if hu == "error":
            raise ValueError(
                f"Found unknown categories in column {feat!r} with handle_unknown='error'."
            )
        if hu == "value":
            col[mask] = fallback
        # "return_nan": leave the NaN in place
        return col

    # ---- pickle support ----------------------------------------------------------------------
    # A fitted estimator caches its backend *module* in `_backend_mod`; modules aren't picklable,
    # so drop it on pickle and re-resolve it from the recorded backend name (`backend_`) on load.
    def __getstate__(self):
        state = dict(super().__getstate__())
        state.pop("_backend_mod", None)
        return state

    def __setstate__(self, state):
        super().__setstate__(state)
        if "backend_" in state:
            self._backend_mod = backend_module(state["backend_"])

    def transform(self, X):
        """Encode ``X`` with the tables learned on the full training data."""
        check_is_fitted(self, "_fit_tables")
        Xdf, was_df, _ = prepare_X(X)
        arr = self._transform_array(Xdf, self._fit_tables)
        return self._wrap_output(arr, was_df, Xdf)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``(X, y)`` and return the training encoding.

        Target-dependent columns of a supervised encoder are replaced by their
        cross-fitted values (k-fold, leave-one-out or ordered, per ``scheme``); the other
        columns come from the full-data tables. ``fit_params`` is accepted but not used.
        """
        self.fit(X, y)
        Xdf, was_df, _ = prepare_X(X)
        full = self._transform_array(Xdf, self._fit_tables)

        if self._is_supervised() and any(m.target_dependent for m in self._columns_meta):
            y_arr = np.asarray(y)
            scheme = getattr(self, "scheme", "kfold")
            if scheme == "kfold":
                oof = self._kfold_oof(Xdf, y_arr, full.shape)
            else:
                oof = self._loo_ordered_oof(Xdf, y_arr, scheme, full.shape)
            for j, meta in enumerate(self._columns_meta):
                if meta.target_dependent:
                    full[:, j] = oof[:, j]
        return self._wrap_output(full, was_df, Xdf)

    def _kfold_oof(self, Xdf, y_arr, shape):
        """Return out-of-fold encodings: each test fold is encoded by tables fit on its train fold.

        Rows that appear in no test fold keep NaN.
        """
        splitter = resolve_cv(self.cv, self.target_type_, self.shuffle, self.random_state)
        folds = make_folds(Xdf.shape[0], y_arr, splitter)
        oof = np.full(shape, np.nan)
        for tr, te in folds:
            tbl = self._fit_all(Xdf.iloc[tr], y_arr[tr])
            oof[te, :] = self._transform_array(Xdf.iloc[te], tbl)
        return oof

    def _mean_y_vector(self, y_arr, meta):
        """Return the float target used for ``meta``'s mean column (binarized for classification)."""
        if self.target_type_ == "continuous":
            return y_arr.astype(float)
        if self.target_type_ == "binary":
            return (y_arr == self.classes_[1]).astype(float)
        return (y_arr == meta.class_label).astype(float)  # multiclass one-vs-rest

    def _loo_ordered_oof(self, Xdf, y_arr, scheme, shape):
        """Leave-one-out / ordered encodings for the mean columns (validated: mean-only)."""
        oof = np.full(shape, np.nan)
        m = 0.0 if isinstance(self.smooth, str) else float(self.smooth)  # loo pseudo-count
        # ordered prior weight a (CatBoost) must be > 0; default 1 for "auto"/non-positive smooth.
        smooth_pos = (not isinstance(self.smooth, str)) and float(self.smooth) > 0
        a = float(self.smooth) if smooth_pos else 1.0
        perm = (
            check_random_state(self.random_state).permutation(len(y_arr))
            if scheme == "ordered"
            else None
        )
        for j, meta in enumerate(self._columns_meta):
            if not (meta.target_dependent and meta.stat == "mean"):
                continue
            keys, missing_mask = self._unit_keys(Xdf, self._unit_cols[meta.feature])
            yv = self._mean_y_vector(y_arr, meta)
            # NOTE(review): this prior averages over every row, whereas _fit_all excludes
            # missing rows unless handle_missing="value" -- confirm the difference is intended.
            prior = float(yv.mean())
            if scheme == "loo":
                vals = loo_encode(keys, yv, m, prior)
            else:
                vals = ordered_encode(keys, yv, a, prior, perm)
            if self.handle_missing == "return_nan":
                vals = vals.copy()
                vals[missing_mask] = np.nan
            oof[:, j] = vals
        return oof

    # ---- output container --------------------------------------------------------------------
    def _wrap_output(self, arr, was_df, Xdf):
        """Return ``arr`` in the container selected by ``self.output``.

        Raises:
            ImportError: if ``output == "polars"`` and polars is not installed.
        """
        if self.output == "numpy":
            return arr
        if self.output == "pandas":
            idx = Xdf.index if was_df else None
            return pd.DataFrame(arr, columns=self.feature_names_out_, index=idx)
        if self.output == "polars":
            try:
                import polars as pl
            except ImportError as e:  # pragma: no cover - exercised only without polars
                raise ImportError("output='polars' requires polars (pip install polars).") from e
            return pl.from_numpy(arr, schema=list(self.feature_names_out_))
        # "auto": mirror the input container
        if was_df:
            return pd.DataFrame(arr, columns=self.feature_names_out_, index=Xdf.index)
        return arr

    def get_feature_names_out(self, input_features=None):
        """Return the fitted output column names; ``input_features`` is not consulted."""
        check_is_fitted(self, "_fit_tables")
        return np.asarray(self.feature_names_out_, dtype=object)
|
catstat/_cross_fit.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Cross-fitting: deterministic fold assignment for leakage-safe ``fit_transform``.
|
|
2
|
+
|
|
3
|
+
``catstat`` owns fold assignment so CPU and (future) GPU produce identical out-of-fold encodings.
|
|
4
|
+
``random_state`` flows only through the resolved splitter; the global numpy RNG is never touched.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from sklearn.model_selection import KFold, StratifiedKFold
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class _PrecomputedSplitter:
    """Adapter that gives a fixed sequence of ``(train_idx, test_idx)`` pairs a ``split`` method."""

    def __init__(self, splits):
        # Materialize eagerly so that a one-shot iterable can be replayed by every split() call.
        stored = []
        for train_idx, test_idx in splits:
            stored.append((np.asarray(train_idx), np.asarray(test_idx)))
        self._splits = stored

    def split(self, X, y=None, groups=None):
        """Iterate over the stored pairs; ``X``, ``y`` and ``groups`` are ignored."""
        return iter(self._splits)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def resolve_cv(cv, target_type: str, shuffle: bool, random_state):
    """Turn the ``cv`` argument into an object with a ``split`` method.

    * anything that already has ``split`` is returned unchanged;
    * an integer builds ``KFold`` for a continuous target and ``StratifiedKFold``
      otherwise, with ``random_state`` forwarded only when ``shuffle`` is true;
    * any other value is treated as an iterable of ``(train, test)`` index pairs
      and wrapped.
    """
    if hasattr(cv, "split"):
        return cv

    if not isinstance(cv, (int, np.integer)):
        # assume an iterable of (train, test) index arrays
        return _PrecomputedSplitter(cv)

    seed = random_state if shuffle else None
    splitter_cls = KFold if target_type == "continuous" else StratifiedKFold
    return splitter_cls(n_splits=int(cv), shuffle=shuffle, random_state=seed)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def make_folds(n_rows: int, y, splitter) -> list[tuple[np.ndarray, np.ndarray]]:
    """Run ``splitter.split`` once and return its ``(train_idx, test_idx)`` folds as a list.

    The splitter only needs the number of rows from ``X``, so an ``(n_rows, 1)`` array
    of zeros stands in for the feature matrix; ``y`` is passed through for
    stratified splitters.
    """
    placeholder = np.zeros((n_rows, 1))
    fold_iter = splitter.split(placeholder, y)
    return list(fold_iter)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def loo_encode(keys, y, m: float, prior: float) -> np.ndarray:
    """Leave-one-out mean encoding of the training rows.

    Row ``i`` receives ``(cat_sum - y_i + m*prior) / (cat_count - 1 + m)``, i.e. its
    category mean over all *other* rows, shrunk toward ``prior`` with pseudo-count
    ``m``. Where that denominator is not positive (a singleton with ``m=0``) the row
    gets ``prior`` itself.
    """
    target = np.asarray(y, dtype=float)
    frame = pd.DataFrame({"k": pd.Series(keys), "y": target})
    per_key = frame.groupby("k", sort=False)["y"]

    # Per-row category totals, broadcast back to row order.
    totals = per_key.transform("sum").to_numpy()
    sizes = per_key.transform("count").to_numpy()

    numerator = totals - target + m * prior
    denominator = sizes - 1.0 + m
    usable = denominator > 0
    # Substitute 1.0 where the denominator is unusable so the division never emits a
    # warning; those positions are replaced by the prior below.
    quotient = numerator / np.where(usable, denominator, 1.0)
    return np.where(usable, quotient, prior)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def ordered_encode(keys, y, a: float, prior: float, perm: np.ndarray) -> np.ndarray:
    """CatBoost-style ordered target statistics.

    Rows are visited in the order given by ``perm``. Each row is encoded from the rows
    of the same category that were visited before it:
    ``(earlier_sum + a*prior) / (earlier_count + a)``, so the first occurrence of a
    category receives ``prior``. The result is returned in the original row order.
    """
    target = np.asarray(y, dtype=float)
    walked_keys = np.asarray(keys, dtype=object)[perm]
    walked_y = target[perm]

    by_key = pd.Series(walked_y).groupby(walked_keys, sort=False)
    # cumcount is 0-based, so it already equals the number of earlier rows in the group.
    earlier_count = by_key.cumcount().to_numpy()
    # cumsum includes the current row; remove it to keep only the earlier rows.
    earlier_sum = by_key.cumsum().to_numpy() - walked_y

    encoded = np.empty(len(target), dtype=float)
    # Scatter back from visiting order to the original row positions.
    encoded[perm] = (earlier_sum + a * prior) / (earlier_count + a)
    return encoded
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Output column metadata and feature-name construction.
|
|
2
|
+
|
|
3
|
+
Column order is feature-major, then stat order, then (for class-expanded multiclass stats) class
|
|
4
|
+
order. The same metadata list drives both ``transform`` assembly and ``get_feature_names_out`` so
|
|
5
|
+
they can never disagree.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
from ._stats import StatSpec
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
class ColumnMeta:
    """Immutable description of one output column, as produced by ``build_columns``."""

    # Name of the encoding unit (input feature) this column is derived from.
    feature: object
    # Name of the statistic (``spec.name``) that fills the column.
    stat: str
    class_label: object  # None unless a class-expanded multiclass column
    # Copied from the stat spec's ``target_dependent`` flag.
    target_dependent: bool
    # Output column name: "<feature>__<infix>" or "<feature>__<infix>__class_<label>".
    name: str
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def build_columns(cat_cols, specs: list[StatSpec], target_type, classes) -> list[ColumnMeta]:
    """Build the ordered list of output-column descriptors.

    Order is feature-major, then ``specs`` order, then class order. A spec flagged
    ``class_expanded`` yields one column per entry of ``classes`` when ``target_type``
    is ``"multiclass"``; every other case yields a single column with
    ``class_label=None``.
    """
    columns: list[ColumnMeta] = []
    for feature in cat_cols:
        for spec in specs:
            # Decide the (label, column name) pairs this spec contributes for the feature.
            if spec.class_expanded and target_type == "multiclass":
                labelled = [
                    (label, f"{feature}__{spec.name_infix}__class_{label}") for label in classes
                ]
            else:
                labelled = [(None, f"{feature}__{spec.name_infix}")]
            columns.extend(
                ColumnMeta(
                    feature=feature,
                    stat=spec.name,
                    class_label=label,
                    target_dependent=spec.target_dependent,
                    name=column_name,
                )
                for label, column_name in labelled
            )
    return columns
|
catstat/_smoothing.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Smoothing for mean/probability statistics.
|
|
2
|
+
|
|
3
|
+
Only mean/probability admit principled smoothing (see docs: the "smoothing honesty rule").
|
|
4
|
+
This module implements the fixed m-estimate and the ``smooth='auto'`` empirical-Bayes estimate.
|
|
5
|
+
|
|
6
|
+
For ``smooth='auto'`` we use the documented empirical-Bayes form ``m_i = sigma_i^2 / tau^2`` with
|
|
7
|
+
population (ddof=0) variances, blending ``lambda_i = n_i / (n_i + m_i)`` toward the global mean.
|
|
8
|
+
The exact parity with scikit-learn's auto formula is a known follow-up (docs/known_issues KI-010);
|
|
9
|
+
the leakage/determinism guarantees do not depend on the smoothing constant.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
from .backends import _cpu
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def fit_mean_encoding(
    keys: np.ndarray, y: np.ndarray, smooth, backend=None
) -> tuple[pd.Series, float]:
    """Return ``(encoding_by_category, global_mean)`` for a mean/probability target statistic.

    ``y`` is the (possibly binarized, for classification) target aligned with ``keys``. The heavy
    group-by runs on ``backend`` (CPU by default); the rest is host arithmetic, so CPU and GPU
    produce the same table (to ``allclose``).

    ``smooth`` is either ``"auto"`` (empirical-Bayes pseudo-count ``sigma_i^2 / tau^2`` per
    category, with population variances) or a finite number ``>= 0`` used as a fixed m-estimate
    pseudo-count (``0`` returns the plain category means).

    Raises:
        ValueError: if ``smooth`` is a string other than ``"auto"``, is negative, or is not
            finite (NaN/inf would otherwise silently yield an all-NaN encoding table).
    """
    # Validate ``smooth`` before the group-by so bad input fails fast.
    if isinstance(smooth, str):
        if smooth != "auto":
            raise ValueError(f"smooth={smooth!r}: only 'auto' or a float >= 0 is allowed.")
        fixed_m = None
    else:
        fixed_m = float(smooth)
        if np.isnan(fixed_m) or np.isinf(fixed_m):
            raise ValueError(f"smooth={smooth!r} must be a finite number >= 0.")
        if fixed_m < 0:
            raise ValueError("smooth must be >= 0.")

    if backend is None:
        backend = _cpu
    stats = backend.category_reduce(keys, y)
    count = stats["count"]
    mean = stats["mean"]
    y_float = np.asarray(y, dtype=float)
    global_mean = float(np.mean(y_float))

    if fixed_m is None:
        # Within-category population variance; clip guards tiny negative round-off.
        var_pop = (stats["sumsq"] / count - mean**2).clip(lower=0.0)
        tau2 = float(np.var(y_float))  # population variance
        if tau2 > 0:
            m = var_pop / tau2
        else:  # constant target -> every category mean equals the global mean
            m = pd.Series(0.0, index=count.index)
        lam = count / (count + m)
        enc = lam * mean + (1.0 - lam) * global_mean
    elif fixed_m == 0.0:
        enc = mean.copy()
    else:
        enc = (count * mean + fixed_m * global_mean) / (count + fixed_m)

    return enc.astype(float), global_mean
|