catstat 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
catstat/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ """catstat -- unified CPU/GPU statistical categorical encoding.
2
+
3
+ Leakage-safe target encoding generalized to arbitrary statistics, with one sklearn-compatible API.
4
+ Runs on CPU (pandas/numpy) today; the GPU path (cuDF/CuPy) is parity-validated but auto-selection
5
+ stays on CPU until it is faster (see docs/roadmap.md and docs/known_issues.md, KI-020).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from .count_encoder import CountEncoder
11
+ from .frequency_encoder import FrequencyEncoder
12
+ from .target_encoder import TargetEncoder
13
+
14
+ __all__ = ["TargetEncoder", "CountEncoder", "FrequencyEncoder", "__version__"]
15
+ __version__ = "0.1.1"
@@ -0,0 +1,57 @@
1
+ """Non-mean target statistics: var / std / median / min / max / skew.
2
+
3
+ These have **no principled smoothing** (the smoothing honesty rule): order statistics never blend,
4
+ and var/std default to no shrinkage. Each falls back to the **global** statistic for unseen
5
+ categories and for categories below ``min_samples_category`` (or where the statistic is undefined,
6
+ e.g. the sample variance of a singleton). Continuous targets only -- the encoders reject these for
7
+ classification.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+
15
+ from .backends import _cpu
16
+
17
+
18
def global_stat(y, stat: str) -> float:
    """Compute one non-mean statistic over the whole target vector.

    ``var`` and ``std`` are sample statistics (``ddof=1``) and are reported as ``0.0`` when
    fewer than two values are available; ``skew`` is reported as ``0.0`` when pandas returns NaN
    for it. ``median``, ``min`` and ``max`` are passed straight to numpy.

    Raises ``ValueError`` when ``stat`` is not one of the names above.
    """
    values = np.asarray(y, dtype=float)

    # Dispersion statistics share the same "needs at least two points" guard.
    for label, spread in (("var", np.var), ("std", np.std)):
        if stat == label:
            if len(values) > 1:
                return float(spread(values, ddof=1))
            return 0.0

    # Order statistics have no guard of their own.
    for label, reducer in (("median", np.median), ("min", np.min), ("max", np.max)):
        if stat == label:
            return float(reducer(values))

    if stat == "skew":
        skewness = pd.Series(values).skew()  # NaN for n < 3
        if pd.notna(skewness):
            return float(skewness)
        return 0.0

    raise ValueError(f"Unknown non-mean stat {stat!r}.")
34
+
35
+
36
def fit_custom_encoding(keys, y, fn, min_samples: int) -> tuple[pd.Series, float]:
    """Fit a user-supplied aggregation ``fn`` per category (CPU only).

    Returns ``(encoding_by_category, global_fallback)``. The fallback is ``fn`` applied to the
    whole target; it replaces the per-category value wherever that value is NaN or the category
    has fewer than ``max(min_samples, 1)`` rows.
    """
    raw = _cpu.category_agg_custom(keys, y, fn)
    support = pd.Series(keys).value_counts().reindex(raw.index)
    overall = float(fn(np.asarray(y, dtype=float)))
    threshold = max(int(min_samples), 1)
    needs_fallback = raw.isna() | (support < threshold)
    encoding = raw.where(~needs_fallback, overall)
    return encoding.astype(float), overall
43
+
44
+
45
def fit_stat_encoding(
    keys, y, stat: str, min_samples: int, backend=None
) -> tuple[pd.Series, float]:
    """Fit a built-in dispersion/order statistic per category.

    Returns ``(encoding_by_category, global_fallback)``. The group-by runs on ``backend``
    (the CPU backend when ``None``); the fallback comes from ``global_stat`` over the whole
    target and replaces any category whose statistic is NaN or whose row count is below
    ``max(min_samples, 1)``.
    """
    engine = _cpu if backend is None else backend
    # Series indexed by category; NaN where the statistic is undefined (e.g. var of n=1)
    raw = engine.category_agg(keys, y, stat)
    support = pd.Series(keys).value_counts().reindex(raw.index)
    overall = global_stat(y, stat)
    threshold = max(int(min_samples), 1)
    needs_fallback = raw.isna() | (support < threshold)
    encoding = raw.where(~needs_fallback, overall).astype(float)
    return encoding, overall
catstat/_base.py ADDED
@@ -0,0 +1,387 @@
1
+ """``_BaseStatEncoder`` -- the shared fit/transform/fit_transform skeleton.
2
+
3
+ All statistics/leakage logic lives here and in the small helper modules; only ``backends/``
4
+ knows pandas vs cuDF. Subclasses define the sklearn ``__init__`` params and two hooks:
5
+ ``_is_supervised`` and ``_resolve_stat_specs``.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ from sklearn.base import BaseEstimator, TransformerMixin
13
+ from sklearn.utils import check_random_state
14
+ from sklearn.utils.validation import check_is_fitted
15
+
16
+ from ._aggregations import fit_custom_encoding, fit_stat_encoding
17
+ from ._cross_fit import loo_encode, make_folds, ordered_encode, resolve_cv
18
+ from ._feature_names import build_columns
19
+ from ._smoothing import fit_mean_encoding
20
+ from ._validation import (
21
+ check_handle,
22
+ infer_target_type,
23
+ normalize_keys,
24
+ prepare_X,
25
+ select_cols,
26
+ )
27
+ from .backends import _cpu
28
+ from .backends._dispatch import backend_module, select_backend
29
+
30
+ _VALID_OUTPUT = ("auto", "numpy", "pandas", "polars")
31
+ _DEFERRED_OUTPUT = ("cudf", "cupy")
32
+
33
+
34
class _BaseStatEncoder(TransformerMixin, BaseEstimator):
    """Shared fit / transform / fit_transform skeleton for the statistical encoders.

    The class reads these constructor parameters from the subclass: ``cols``, ``output``,
    ``backend``, ``handle_unknown``, ``handle_missing`` and, on the supervised paths,
    ``target_type``, ``smooth``, ``cv``, ``shuffle`` and ``random_state``. ``multi_feature_mode``,
    ``scheme`` and ``min_samples_category`` are optional and default to ``"independent"``,
    ``"kfold"`` and ``1`` when the subclass does not define them.
    """

    # ---- hooks the subclasses must implement -------------------------------------------------
    def _is_supervised(self) -> bool:
        """Return whether ``fit`` requires ``y`` (subclass hook)."""
        raise NotImplementedError

    def _resolve_stat_specs(self):
        """Return the list of statistic specs to fit (subclass hook)."""
        raise NotImplementedError

    # ---- scikit-learn estimator tags ---------------------------------------------------------
    # catstat encoders are categorical encoders: they accept string/categorical columns and learn
    # NaN as its own level when handle_missing="value"; supervised encoders additionally require y.
    # Both tag APIs are provided -- __sklearn_tags__ for scikit-learn >= 1.6, and _more_tags for
    # < 1.6 (newer versions ignore it; older ones ignore __sklearn_tags__).
    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.target_tags.required = self._is_supervised()
        tags.input_tags.categorical = True
        tags.input_tags.string = True
        tags.input_tags.allow_nan = True
        return tags

    def _more_tags(self):
        return {
            "requires_y": self._is_supervised(),
            "X_types": ["categorical", "string", "2darray"],
            "allow_nan": True,
        }

    # ---- key helper --------------------------------------------------------------------------
    @staticmethod
    def _key(meta):
        """Return the ``(feature, stat, class_label)`` key used to index the encoding tables."""
        return (meta.feature, meta.stat, meta.class_label)

    # ---- fit ---------------------------------------------------------------------------------
    def fit(self, X, y=None):
        """Learn the full-data encoding tables and the public fitted attributes.

        Raises ``ValueError`` for invalid options, a missing or mis-sized ``y`` on supervised
        encoders, continuous-only statistics on a non-continuous target, or a non-kfold scheme
        combined with target-dependent statistics other than the mean. Raises
        ``NotImplementedError`` for the deferred ``cudf`` / ``cupy`` outputs. Returns ``self``.
        """
        check_handle("handle_unknown", self.handle_unknown)
        check_handle("handle_missing", self.handle_missing)
        mode = getattr(self, "multi_feature_mode", "independent")
        if mode not in ("independent", "combination"):
            raise ValueError(
                f"multi_feature_mode={mode!r} must be 'independent' or 'combination'."
            )
        if self.output in _DEFERRED_OUTPUT:
            raise NotImplementedError(f"output={self.output!r} is not supported in M0 (CPU).")
        if self.output not in _VALID_OUTPUT:
            raise ValueError(f"output={self.output!r} must be one of {_VALID_OUTPUT}.")

        Xdf, was_df, all_cols = prepare_X(X)
        self.n_features_in_ = Xdf.shape[1]
        if was_df:
            self.feature_names_in_ = np.asarray(all_cols, dtype=object)
        self._cat_cols = select_cols(Xdf, self.cols)
        # Encoding units: independent -> one per column; combination -> one joint unit.
        if mode == "combination" and len(self._cat_cols) > 1:
            joint = "+".join(str(c) for c in self._cat_cols)
            self._units = [(joint, list(self._cat_cols))]
        else:
            self._units = [(c, [c]) for c in self._cat_cols]
        self._unit_cols = dict(self._units)
        self._specs = self._resolve_stat_specs()
        self.stats_ = [s.name for s in self._specs]

        supervised = self._is_supervised()
        if supervised:
            if y is None:
                raise ValueError(f"{type(self).__name__} requires y to be supplied to fit().")
            y_arr = np.asarray(y)
            if len(y_arr) != Xdf.shape[0]:
                raise ValueError("X and y have inconsistent lengths.")
            self.target_type_ = infer_target_type(y_arr, self.target_type)
            self.classes_ = (
                np.unique(y_arr) if self.target_type_ in ("binary", "multiclass") else None
            )
        else:
            y_arr = None
            self.target_type_ = None
            self.classes_ = None

        for spec in self._specs:
            if spec.continuous_only and self.target_type_ != "continuous":
                raise ValueError(
                    f"stat={spec.name!r} requires a continuous target; got "
                    f"target_type={self.target_type_!r}. Dispersion/order statistics on "
                    "classification targets are not supported."
                )

        scheme = getattr(self, "scheme", "kfold")
        if scheme not in ("kfold", "loo", "ordered"):
            raise ValueError(f"scheme={scheme!r} must be 'kfold', 'loo', or 'ordered'.")
        if scheme != "kfold":
            bad = [s.name for s in self._specs if s.target_dependent and s.name != "mean"]
            if bad:
                raise ValueError(
                    f"scheme={scheme!r} cross-fits the mean only (count/frequency are allowed "
                    f"too); got target-dependent stats {bad}. Use scheme='kfold' for those."
                )

        self._columns_meta = build_columns(
            [name for name, _ in self._units], self._specs, self.target_type_, self.classes_
        )
        self.feature_names_out_ = np.asarray([m.name for m in self._columns_meta], dtype=object)

        all_gpu = all(s.gpu_supported for s in self._specs)
        self._backend_mod, self.backend_ = select_backend(
            self.backend, Xdf.shape[0], len(self._cat_cols), all_gpu
        )
        # GPU can't run tuple keys (combination) or CPU-only stats (skew/custom) -> host only.
        host_only = (not all_gpu) or any(len(cols) > 1 for _, cols in self._units)
        if self.backend_ == "gpu" and host_only:
            self._backend_mod, self.backend_ = _cpu, _cpu.NAME

        self._fit_tables = self._fit_all(Xdf, y_arr)

        # public fitted attributes derived from the full-data tables
        self.categories_ = {}
        self.global_stats_ = {}
        for meta in self._columns_meta:
            enc, fb = self._fit_tables[self._key(meta)]
            self.global_stats_[meta.name] = fb
            self.categories_.setdefault(meta.feature, np.asarray(list(enc.index), dtype=object))
        self._set_target_mean()
        return self

    def _set_target_mean(self):
        """Expose the global mean of the first unit as ``target_mean_`` when it exists.

        The attribute is a float for continuous/binary targets and an array with one entry per
        class for multiclass targets. It is only defined for supervised encoders that fit the
        ``mean`` statistic on at least one encoding unit.
        """
        # `target_mean_` is the only fitted attribute set conditionally, so a re-fit must clear
        # the value of the previous fit instead of leaving it behind.
        if hasattr(self, "target_mean_"):
            del self.target_mean_
        if not self._is_supervised() or "mean" not in self.stats_:
            return
        if not self._units:  # nothing was encoded -> no table to read the mean from
            return
        f0 = self._units[0][0]
        if self.target_type_ == "multiclass":
            self.target_mean_ = np.asarray(
                [self._fit_tables[(f0, "mean", c)][1] for c in self.classes_], dtype=float
            )
        else:
            self.target_mean_ = float(self._fit_tables[(f0, "mean", None)][1])

    # ---- per-statistic fitting ---------------------------------------------------------------
    def _fit_all(self, Xdf, y_arr):
        """Return the full encoding tables: ``{(feature, stat, class): (Series, fallback)}``."""
        tables = {}
        hm = self.handle_missing
        for feat, cols in self._units:
            keys_full, missing_mask = self._unit_keys(Xdf, cols)
            if hm == "error" and missing_mask.any():
                raise ValueError(
                    f"Missing values in unit {feat!r} with handle_missing='error'."
                )
            # handle_missing="value" keeps missing rows (as their own level); otherwise drop them.
            sel = np.ones(len(keys_full), dtype=bool) if hm == "value" else ~missing_mask
            keys = keys_full[sel]
            n_total = int(sel.sum())
            for spec in self._specs:
                if spec.name == "count":
                    tables[(feat, "count", None)] = self._fit_count(keys, False, n_total)
                elif spec.name == "frequency":
                    tables[(feat, "frequency", None)] = self._fit_count(keys, True, n_total)
                elif spec.name == "mean":
                    y_sel = y_arr[sel]
                    bk = self._backend_mod
                    if self.target_type_ == "continuous":
                        tables[(feat, "mean", None)] = fit_mean_encoding(
                            keys, y_sel.astype(float), self.smooth, bk
                        )
                    elif self.target_type_ == "binary":
                        yb = (y_sel == self.classes_[1]).astype(float)
                        tables[(feat, "mean", None)] = fit_mean_encoding(keys, yb, self.smooth, bk)
                    else:  # multiclass: one-vs-rest per global class
                        for c in self.classes_:
                            yc = (y_sel == c).astype(float)
                            tables[(feat, "mean", c)] = fit_mean_encoding(keys, yc, self.smooth, bk)
                else:  # var/std/median/min/max/skew or custom (continuous-only, target-dependent)
                    min_samples = getattr(self, "min_samples_category", 1)
                    y_sel_f = y_arr[sel].astype(float)
                    if spec.func is not None:
                        tables[(feat, spec.name, None)] = fit_custom_encoding(
                            keys, y_sel_f, spec.func, min_samples
                        )
                    else:
                        tables[(feat, spec.name, None)] = fit_stat_encoding(
                            keys, y_sel_f, spec.name, min_samples, self._backend_mod
                        )
        return tables

    @staticmethod
    def _fit_count(keys, normalize, n_total):
        """Return ``(counts_by_category, 0.0)``; counts are divided by ``n_total`` if normalized."""
        vc = pd.Series(keys).value_counts().astype(float)
        if normalize:
            vc = vc / float(max(n_total, 1))
        return vc, 0.0

    def _unit_keys(self, Xdf, cols):
        """Return ``(keys, missing_mask)`` for an encoding unit.

        A single-column unit uses the column's normalized keys directly. A combination unit uses
        the tuple of its components' keys as one joint category; the row counts as missing if any
        component is missing.
        """
        if len(cols) == 1:
            return normalize_keys(Xdf[cols[0]].to_numpy())
        comp_keys = []
        missing = np.zeros(Xdf.shape[0], dtype=bool)
        for c in cols:
            k, m = normalize_keys(Xdf[c].to_numpy())
            comp_keys.append(k)
            missing = missing | m
        joint = np.empty(Xdf.shape[0], dtype=object)
        # Assign element by element: a bulk assignment of tuples would be broadcast as a 2-D array.
        for i, combo in enumerate(zip(*comp_keys)):
            joint[i] = combo
        return joint, missing

    # ---- transform ---------------------------------------------------------------------------
    def _transform_array(self, Xdf, tables, handle_unknown=None) -> np.ndarray:
        """Encode ``Xdf`` with ``tables`` into an ``(n_rows, n_output_columns)`` float array.

        ``handle_unknown`` overrides ``self.handle_unknown`` for this call only; ``None`` (the
        default) keeps the estimator's own setting.
        """
        n = Xdf.shape[0]
        out = np.full((n, len(self._columns_meta)), np.nan, dtype=float)
        hm = self.handle_missing
        hu = self.handle_unknown if handle_unknown is None else handle_unknown
        cache: dict = {}  # unit name -> (keys, missing_mask), shared by all stats of the unit
        for j, meta in enumerate(self._columns_meta):
            feat = meta.feature
            if feat not in cache:
                keys, missing_mask = self._unit_keys(Xdf, self._unit_cols[feat])
                if hm == "error" and missing_mask.any():
                    raise ValueError(
                        f"Missing values in unit {feat!r} with handle_missing='error'."
                    )
                cache[feat] = (keys, missing_mask)
            keys, missing_mask = cache[feat]

            enc_series, fallback = tables[self._key(meta)]
            mapped = pd.Series(keys).map(enc_series).to_numpy(dtype=float)
            col = mapped.copy()
            notfound = np.isnan(mapped)

            if hm == "return_nan":
                col[missing_mask] = np.nan
                self._apply_unknown(col, notfound & ~missing_mask, fallback, hu, feat)
            elif hm == "value":
                # rows not found are either unseen real categories OR an unseen missing level
                self._apply_unknown(col, notfound, fallback, hu, feat)
            else:  # "error" already raised above if any missing present
                self._apply_unknown(col, notfound & ~missing_mask, fallback, hu, feat)
            out[:, j] = col
        return out

    @staticmethod
    def _apply_unknown(col, mask, fallback, hu, feat):
        """Apply the ``handle_unknown`` policy ``hu`` to the rows in ``mask`` (in place)."""
        if not mask.any():
            return col
        if hu == "error":
            raise ValueError(
                f"Found unknown categories in column {feat!r} with handle_unknown='error'."
            )
        if hu == "value":
            col[mask] = fallback
        # "return_nan": leave the NaN in place
        return col

    # ---- pickle support ----------------------------------------------------------------------
    # A fitted estimator caches its backend *module* in `_backend_mod`; modules aren't picklable,
    # so drop it on pickle and re-resolve it from the recorded backend name (`backend_`) on load.
    def __getstate__(self):
        state = dict(super().__getstate__())
        state.pop("_backend_mod", None)
        return state

    def __setstate__(self, state):
        super().__setstate__(state)
        if "backend_" in state:
            self._backend_mod = backend_module(state["backend_"])

    def transform(self, X):
        """Encode ``X`` with the tables learned on the full training data."""
        check_is_fitted(self, "_fit_tables")
        Xdf, was_df, _ = prepare_X(X)
        arr = self._transform_array(Xdf, self._fit_tables)
        return self._wrap_output(arr, was_df, Xdf)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``(X, y)`` and return the training encoding.

        Target-dependent columns of a supervised encoder are replaced by their cross-fitted
        values (k-fold, leave-one-out or ordered, per ``scheme``); every other column is the
        plain full-data encoding. ``fit_params`` is accepted for API compatibility and unused.
        """
        self.fit(X, y)
        Xdf, was_df, _ = prepare_X(X)
        full = self._transform_array(Xdf, self._fit_tables)

        if self._is_supervised() and any(m.target_dependent for m in self._columns_meta):
            y_arr = np.asarray(y)
            scheme = getattr(self, "scheme", "kfold")
            if scheme == "kfold":
                oof = self._kfold_oof(Xdf, y_arr, full.shape)
            else:
                oof = self._loo_ordered_oof(Xdf, y_arr, scheme, full.shape)
            for j, meta in enumerate(self._columns_meta):
                if meta.target_dependent:
                    full[:, j] = oof[:, j]
        return self._wrap_output(full, was_df, Xdf)

    def _kfold_oof(self, Xdf, y_arr, shape):
        """Return out-of-fold encodings: each test fold is encoded by tables fit on its train fold."""
        splitter = resolve_cv(self.cv, self.target_type_, self.shuffle, self.random_state)
        folds = make_folds(Xdf.shape[0], y_arr, splitter)
        oof = np.full(shape, np.nan)
        # A category that is merely absent from one training fold is still a known category of
        # the training data, so handle_unknown="error" must not fire during cross-fitting; such
        # rows take the fold's global fallback instead. Other policies are applied unchanged.
        fold_hu = "value" if self.handle_unknown == "error" else self.handle_unknown
        for tr, te in folds:
            tbl = self._fit_all(Xdf.iloc[tr], y_arr[tr])
            oof[te, :] = self._transform_array(Xdf.iloc[te], tbl, handle_unknown=fold_hu)
        return oof

    def _mean_y_vector(self, y_arr, meta):
        """Return the float target used for the mean of ``meta``'s column (binarized if needed)."""
        if self.target_type_ == "continuous":
            return y_arr.astype(float)
        if self.target_type_ == "binary":
            return (y_arr == self.classes_[1]).astype(float)
        return (y_arr == meta.class_label).astype(float)  # multiclass one-vs-rest

    def _loo_ordered_oof(self, Xdf, y_arr, scheme, shape):
        """Leave-one-out / ordered encodings for the mean columns (validated: mean-only)."""
        oof = np.full(shape, np.nan)
        m = 0.0 if isinstance(self.smooth, str) else float(self.smooth)  # loo pseudo-count
        # ordered prior weight a (CatBoost) must be > 0; default 1 for "auto"/non-positive smooth.
        smooth_pos = (not isinstance(self.smooth, str)) and float(self.smooth) > 0
        a = float(self.smooth) if smooth_pos else 1.0
        perm = (
            check_random_state(self.random_state).permutation(len(y_arr))
            if scheme == "ordered"
            else None
        )
        for j, meta in enumerate(self._columns_meta):
            if not (meta.target_dependent and meta.stat == "mean"):
                continue
            keys, missing_mask = self._unit_keys(Xdf, self._unit_cols[meta.feature])
            yv = self._mean_y_vector(y_arr, meta)
            prior = float(yv.mean())
            if scheme == "loo":
                vals = loo_encode(keys, yv, m, prior)
            else:
                vals = ordered_encode(keys, yv, a, prior, perm)
            if self.handle_missing == "return_nan":
                vals = vals.copy()
                vals[missing_mask] = np.nan
            oof[:, j] = vals
        return oof

    # ---- output container --------------------------------------------------------------------
    def _wrap_output(self, arr, was_df, Xdf):
        """Wrap the encoded array in the container requested by ``self.output``."""
        if self.output == "numpy":
            return arr
        if self.output == "pandas":
            idx = Xdf.index if was_df else None
            return pd.DataFrame(arr, columns=self.feature_names_out_, index=idx)
        if self.output == "polars":
            try:
                import polars as pl
            except ImportError as e:  # pragma: no cover - exercised only without polars
                raise ImportError("output='polars' requires polars (pip install polars).") from e
            return pl.from_numpy(arr, schema=list(self.feature_names_out_))
        # "auto": mirror the input container
        if was_df:
            return pd.DataFrame(arr, columns=self.feature_names_out_, index=Xdf.index)
        return arr

    def get_feature_names_out(self, input_features=None):
        """Return the output column names fixed at fit time (``input_features`` is ignored)."""
        check_is_fitted(self, "_fit_tables")
        return np.asarray(self.feature_names_out_, dtype=object)
catstat/_cross_fit.py ADDED
@@ -0,0 +1,83 @@
1
+ """Cross-fitting: deterministic fold assignment for leakage-safe ``fit_transform``.
2
+
3
+ ``catstat`` owns fold assignment so CPU and (future) GPU produce identical out-of-fold encodings.
4
+ ``random_state`` flows only through the resolved splitter; the global numpy RNG is never touched.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ from sklearn.model_selection import KFold, StratifiedKFold
12
+
13
+
14
class _PrecomputedSplitter:
    """Adapter giving a fixed list of ``(train_idx, test_idx)`` pairs a ``split`` method."""

    def __init__(self, splits):
        # Materialize once so the pairs survive repeated ``split`` calls, even from a generator.
        pairs = []
        for train_idx, test_idx in splits:
            pairs.append((np.asarray(train_idx), np.asarray(test_idx)))
        self._splits = pairs

    def split(self, X, y=None, groups=None):
        """Iterate over the stored pairs; ``X``, ``y`` and ``groups`` are ignored."""
        return iter(self._splits)
22
+
23
+
24
def resolve_cv(cv, target_type: str, shuffle: bool, random_state):
    """Turn the ``cv`` argument into an object with a ``split`` method.

    Anything that already has ``split`` is handed back untouched. An integer builds a ``KFold``
    for a continuous target and a ``StratifiedKFold`` otherwise, seeded with ``random_state``
    only when ``shuffle`` is on. Any other value is treated as an iterable of
    ``(train, test)`` index pairs and wrapped.
    """
    if hasattr(cv, "split"):
        return cv
    if not isinstance(cv, (int, np.integer)):
        # assume an iterable of (train, test) index arrays
        return _PrecomputedSplitter(cv)
    seed = random_state if shuffle else None
    splitter_cls = KFold if target_type == "continuous" else StratifiedKFold
    return splitter_cls(n_splits=int(cv), shuffle=shuffle, random_state=seed)
39
+
40
+
41
def make_folds(n_rows: int, y, splitter) -> list[tuple[np.ndarray, np.ndarray]]:
    """Run ``splitter`` and collect every ``(train_idx, test_idx)`` pair into a list.

    The splitter receives an all-zero ``(n_rows, 1)`` placeholder as its feature matrix,
    together with ``y``.
    """
    placeholder = np.zeros((n_rows, 1))
    folds = []
    for fold in splitter.split(placeholder, y):
        folds.append(fold)
    return folds
48
+
49
+
50
def loo_encode(keys, y, m: float, prior: float) -> np.ndarray:
    """Leave-one-out mean encoding of the training rows.

    Row ``i`` receives ``(cat_sum - y_i + m*prior) / (cat_count - 1 + m)``, i.e. the smoothed
    mean of the other rows in its category. Wherever that denominator is not positive (a
    singleton with ``m=0``) the row receives ``prior`` instead.
    """
    yv = np.asarray(y, dtype=float)
    frame = pd.DataFrame({"k": pd.Series(keys), "y": yv})
    per_category = frame.groupby("k", sort=False)["y"]
    totals = per_category.transform("sum").to_numpy()
    sizes = per_category.transform("count").to_numpy()

    numerator = totals - yv + m * prior
    denominator = sizes - 1.0 + m

    # Start from the prior everywhere, then overwrite only the rows with a usable denominator;
    # this never evaluates a 0/0.
    encoded = np.full(len(yv), prior, dtype=float)
    usable = denominator > 0
    encoded[usable] = numerator[usable] / denominator[usable]
    return encoded
65
+
66
+
67
def ordered_encode(keys, y, a: float, prior: float, perm: np.ndarray) -> np.ndarray:
    """CatBoost-style ordered target statistics.

    Walk the rows in the order given by ``perm``; each row is encoded from only the **earlier**
    rows of its category in that order: ``(prior_sum + a*prior) / (prior_count + a)``.

    A row whose denominator is not positive receives ``prior``. This covers the first occurrence
    of a category when ``a == 0``, which would otherwise evaluate ``0 / 0`` and yield NaN.

    Parameters: ``keys`` and ``y`` are aligned per row, ``a`` is the prior weight, ``prior`` the
    global target mean, and ``perm`` a permutation of the row positions.
    Returns a float array in the original row order.
    """
    yv = np.asarray(y, dtype=float)
    ks = np.asarray(keys, dtype=object)[perm]
    ys = yv[perm]
    g = pd.Series(ys).groupby(ks, sort=False)
    prior_sum = g.cumsum().to_numpy() - ys  # cumsum includes current -> subtract it
    prior_cnt = g.cumcount().to_numpy()  # 0-based position == count of earlier rows
    den = prior_cnt + a
    den_safe = np.where(den > 0, den, 1.0)  # avoid 0/0; those rows are overwritten by prior
    enc_perm = np.where(den > 0, (prior_sum + a * prior) / den_safe, prior)
    # Scatter back from permuted order to original row order.
    out = np.empty(len(yv), dtype=float)
    out[perm] = enc_perm
    return out
@@ -0,0 +1,49 @@
1
+ """Output column metadata and feature-name construction.
2
+
3
+ Column order is feature-major, then stat order, then (for class-expanded multiclass stats) class
4
+ order. The same metadata list drives both ``transform`` assembly and ``get_feature_names_out`` so
5
+ they can never disagree.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+
12
+ from ._stats import StatSpec
13
+
14
+
15
@dataclass(frozen=True)
class ColumnMeta:
    """Immutable description of one output column, as produced by ``build_columns``."""

    # entry of ``cat_cols`` this column was built for
    feature: object
    # statistic name, taken from the spec's ``name``
    stat: str
    class_label: object  # None unless a class-expanded multiclass column
    # copied from the spec's ``target_dependent`` flag
    target_dependent: bool
    # output column name: ``{feature}__{name_infix}`` plus ``__class_{label}`` when expanded
    name: str
22
+
23
+
24
def build_columns(cat_cols, specs: list[StatSpec], target_type, classes) -> list[ColumnMeta]:
    """Build the ordered output-column metadata.

    Columns are emitted feature by feature, then spec by spec. A spec flagged
    ``class_expanded`` yields one column per entry of ``classes`` when ``target_type`` is
    ``"multiclass"``; every other case yields a single column with ``class_label=None``.
    """
    columns = []
    for feature in cat_cols:
        for spec in specs:
            expand = spec.class_expanded and target_type == "multiclass"
            if expand:
                labelled = [
                    (label, f"{feature}__{spec.name_infix}__class_{label}") for label in classes
                ]
            else:
                labelled = [(None, f"{feature}__{spec.name_infix}")]
            columns.extend(
                ColumnMeta(
                    feature=feature,
                    stat=spec.name,
                    class_label=label,
                    target_dependent=spec.target_dependent,
                    name=column_name,
                )
                for label, column_name in labelled
            )
    return columns
catstat/_smoothing.py ADDED
@@ -0,0 +1,56 @@
1
+ """Smoothing for mean/probability statistics.
2
+
3
+ Only mean/probability admit principled smoothing (see docs: the "smoothing honesty rule").
4
+ This module implements the fixed m-estimate and the ``smooth='auto'`` empirical-Bayes estimate.
5
+
6
+ For ``smooth='auto'`` we use the documented empirical-Bayes form ``m_i = sigma_i^2 / tau^2`` with
7
+ population (ddof=0) variances, blending ``lambda_i = n_i / (n_i + m_i)`` toward the global mean.
8
+ The exact parity with scikit-learn's auto formula is a known follow-up (docs/known_issues KI-010);
9
+ the leakage/determinism guarantees do not depend on the smoothing constant.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+
17
+ from .backends import _cpu
18
+
19
+
20
def fit_mean_encoding(
    keys: np.ndarray, y: np.ndarray, smooth, backend=None
) -> tuple[pd.Series, float]:
    """Fit the (optionally smoothed) per-category mean of ``y``.

    Returns ``(encoding_by_category, global_mean)``. ``backend.category_reduce`` (the CPU
    backend when ``backend`` is ``None``) supplies per-category ``count`` and ``mean``, plus
    ``sumsq`` for ``smooth='auto'``; the blending itself is done here.

    ``smooth`` is either ``'auto'`` (per-category strength = within-category population variance
    divided by the population variance of ``y``) or a number ``>= 0`` used as a fixed m-estimate
    pseudo-count. Any other string, or a negative number, raises ``ValueError``.
    """
    engine = _cpu if backend is None else backend
    reduced = engine.category_reduce(keys, y)
    n_per_cat = reduced["count"]
    mean_per_cat = reduced["mean"]
    y_float = np.asarray(y, dtype=float)
    prior = float(np.mean(y_float))

    if not isinstance(smooth, str):
        # Fixed m-estimate.
        pseudo = float(smooth)
        if pseudo < 0:
            raise ValueError("smooth must be >= 0.")
        if pseudo == 0.0:
            table = mean_per_cat.copy()
        else:
            table = (n_per_cat * mean_per_cat + pseudo * prior) / (n_per_cat + pseudo)
        return table.astype(float), prior

    if smooth != "auto":
        raise ValueError(f"smooth={smooth!r}: only 'auto' or a float >= 0 is allowed.")

    # Empirical-Bayes strength; the clip guards against tiny negative round-off variances.
    within = (reduced["sumsq"] / n_per_cat - mean_per_cat**2).clip(lower=0.0)
    between = float(np.var(y_float))  # population variance
    if between > 0:
        strength = within / between
    else:  # constant target -> every category mean equals the global mean
        strength = pd.Series(0.0, index=n_per_cat.index)
    weight = n_per_cat / (n_per_cat + strength)
    table = weight * mean_per_cat + (1.0 - weight) * prior
    return table.astype(float), prior
+ return enc.astype(float), global_mean