chimeraboost 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
+ """ChimeraBoost: a CatBoost-inspired gradient boosting library in pure Python.
2
+
3
+ Key ingredients borrowed from CatBoost:
4
+ * Ordered target statistics for categorical features (anti-leakage encoding)
5
+ * Oblivious / symmetric trees (fast, strongly regularized -> good defaults)
6
+ * Histogram-based quantized splitting (numba accelerated)
7
+
8
+ Public API:
9
+ >>> from chimeraboost import ChimeraBoostRegressor, ChimeraBoostClassifier
10
+ >>> model = ChimeraBoostClassifier().fit(X, y, cat_features=[0, 3])
11
+ >>> proba = model.predict_proba(X_test)
12
+ """
13
+
14
+ from .sklearn_api import (
15
+ ChimeraBoostRegressor,
16
+ ChimeraBoostClassifier,
17
+ )
18
+
19
+ __all__ = [
20
+ "ChimeraBoostRegressor",
21
+ "ChimeraBoostClassifier",
22
+ ]
23
# Keep in sync with the distribution metadata: this file ships in the
# chimeraboost 0.5.2 wheel, so the runtime version must report the same.
__version__ = "0.5.2"
@@ -0,0 +1,69 @@
1
+ """Quantization of numeric features into integer bins.
2
+
3
+ Borders are learned once on the training data (quantile based). Every feature
4
+ is mapped to a small integer bin index, which is what the tree builder consumes.
5
+ NaNs are routed to a dedicated bin so a split can isolate missing values, the
6
+ way CatBoost/LightGBM do.
7
+
8
+ Bin layout per feature:
9
+ real values -> 0 .. n_borders (via searchsorted on borders)
10
+ NaN -> n_borders + 1 (the highest bin, "missing")
11
+ The histogram width for a feature is therefore (n_borders + 2).
12
+ """
13
+
14
+ import numpy as np
15
+
16
+ BIN_DTYPE = np.uint16
17
+
18
+
19
+ def _feature_borders(col, max_bins):
20
+ """Quantile borders for one numeric column, ignoring NaNs."""
21
+ finite = col[np.isfinite(col)]
22
+ if finite.size == 0:
23
+ return np.array([], dtype=np.float64)
24
+ uniq = np.unique(finite)
25
+ if uniq.size <= max_bins:
26
+ # Few distinct values: put a border between each pair.
27
+ return ((uniq[:-1] + uniq[1:]) / 2.0).astype(np.float64)
28
+ qs = np.linspace(0.0, 1.0, max_bins + 1)[1:-1]
29
+ borders = np.quantile(finite, qs)
30
+ return np.unique(borders).astype(np.float64)
31
+
32
+
33
class Binner:
    """Learns per-feature borders and maps a float matrix to bins.

    Attributes set by `fit`:
        borders_: list with one sorted float64 border array per feature.
        n_bins_:  int64 array, histogram width per feature
                  (`len(borders) + 2`: one extra bucket above the last
                  border plus one bucket for non-finite values).
    """

    def __init__(self, max_bins=128):
        self.max_bins = int(max_bins)
        self.borders_ = None   # list of np.ndarray, one per feature
        self.n_bins_ = None    # np.ndarray int, width per feature

    @staticmethod
    def _as_2d_float(X):
        """Coerce X to a float64 array and insist that it is 2-D."""
        X = np.asarray(X, dtype=np.float64)
        if X.ndim != 2:
            raise ValueError(
                f"Expected a 2-D array (n_samples, n_features), "
                f"got an array with {X.ndim} dimension(s)."
            )
        return X

    def fit(self, X):
        """Learn quantile borders for each column from training data.

        Raises ValueError if X is not 2-D, or if a feature would need a bin
        index that does not fit in BIN_DTYPE (the index would otherwise wrap
        around silently in `transform`). Returns self.
        """
        X = self._as_2d_float(X)
        borders = [
            _feature_borders(X[:, f], self.max_bins) for f in range(X.shape[1])
        ]
        # +1 for the searchsorted upper bucket, +1 for the NaN bucket.
        n_bins = np.array([len(b) + 2 for b in borders], dtype=np.int64)

        # The largest index ever written is the missing-value bin,
        # n_bins - 1; make sure the storage dtype can represent it.
        if n_bins.size and int(n_bins.max()) - 1 > np.iinfo(BIN_DTYPE).max:
            raise ValueError(
                f"max_bins={self.max_bins} produces bin indices that do not "
                f"fit in {np.dtype(BIN_DTYPE).name}."
            )

        # Only publish the fitted state once everything has validated.
        self.borders_ = borders
        self.n_bins_ = n_bins
        return self

    def transform(self, X):
        """Map a float matrix to integer bin indices; NaNs go to the top bin.

        Finite values get `searchsorted(borders, value, side="right")`, i.e.
        0 .. len(borders). Every non-finite value (NaN, +inf, -inf) is sent to
        the dedicated bin `len(borders) + 1`.

        Raises RuntimeError when called before `fit`, and ValueError when X is
        not 2-D or its column count differs from the fitted one.
        """
        if self.borders_ is None:
            raise RuntimeError("Binner is not fitted yet; call fit() first.")
        X = self._as_2d_float(X)
        n_samples, n_features = X.shape
        if n_features != len(self.borders_):
            raise ValueError(
                f"X has {n_features} feature(s) but the Binner was fitted "
                f"with {len(self.borders_)}."
            )

        out = np.empty((n_samples, n_features), dtype=BIN_DTYPE)
        for f, borders in enumerate(self.borders_):
            col = X[:, f]
            binned = np.searchsorted(borders, col, side="right").astype(BIN_DTYPE)
            # Overwrite whatever searchsorted produced for NaN / +-inf.
            binned[~np.isfinite(col)] = len(borders) + 1
            out[:, f] = binned
        return out

    def fit_transform(self, X):
        """Fit on X, then return the binned version of the same X."""
        return self.fit(X).transform(X)
@@ -0,0 +1,408 @@
1
+ """The gradient boosting core: builds the full additive model.
2
+
3
+ Two boosters share the same machinery (FeaturePreprocessor, oblivious trees):
4
+ * GradientBoosting -> scalar output (regression, binary classification)
5
+ * MulticlassBoosting -> K simultaneous outputs (softmax multiclass)
6
+ """
7
+
8
+ import time
9
+ import numpy as np
10
+
11
+ from .losses import LOSSES, MultiSoftmax
12
+ from .preprocessing import FeaturePreprocessor
13
+ from .tree import build_oblivious_tree
14
+
15
+
16
def _apply_thread_count(thread_count):
    """Configure how many threads numba may use and report the result.

    `None` or any negative value selects numba's configured maximum
    (`numba.config.NUMBA_NUM_THREADS`). Any other value is truncated to an
    int and clamped to the range 1 .. maximum.

    Returns the thread count that was actually applied.
    """
    import numba

    ceiling = numba.config.NUMBA_NUM_THREADS
    use_all = thread_count is None or thread_count < 0
    effective = ceiling if use_all else max(1, min(int(thread_count), ceiling))
    numba.set_num_threads(effective)
    return effective
29
+
30
+
31
+ def _auto_learning_rate(n_samples, iterations, early_stopping):
32
+ """Pick a default learning rate when the user did not specify one.
33
+
34
+ With early stopping, we default to 0.05 (down from 0.1). This forces the
35
+ model to take smaller steps and build a larger, smoother ensemble. We trade
36
+ a bit of our massive speed advantage for better test generalization.
37
+ Without early stopping, the rate scales inversely with the iteration budget.
38
+ """
39
+ if early_stopping:
40
+ return 0.1
41
+ lr = 20.0 / max(iterations, 1)
42
+ return float(np.clip(lr, 0.03, 0.2))
43
+
44
+
45
+ class _BaseBooster:
46
+ """Shared machinery for the scalar and multiclass boosters.
47
+
48
+ Holds the common hyperparameters and the helpers both subclasses use:
49
+ histogram-buffer allocation, column subsampling, row subsampling, feature
50
+ preprocessing, and split-gain feature importances. Subclasses implement
51
+ `fit` and `predict_raw`.
52
+ """
53
+
54
+ def __init__(self, iterations=500, learning_rate=None, depth=6,
55
+ l2_leaf_reg=3.0, max_bins=128, subsample=1.0,
56
+ colsample=1.0, cat_smoothing=1.0, early_stopping_rounds=None,
57
+ min_child_weight=1.0, thread_count=None, random_state=None,
58
+ verbose=False, ordered_boosting=True):
59
+ self.iterations = int(iterations)
60
+ self.learning_rate = learning_rate
61
+ self.depth = int(depth)
62
+ self.l2_leaf_reg = float(l2_leaf_reg)
63
+ self.max_bins = int(max_bins)
64
+ self.subsample = float(subsample)
65
+ self.colsample = float(colsample)
66
+ self.cat_smoothing = float(cat_smoothing)
67
+ self.early_stopping_rounds = early_stopping_rounds
68
+ self.min_child_weight = float(min_child_weight)
69
+ self.thread_count = thread_count
70
+ self.random_state = random_state
71
+ self.verbose = verbose
72
+ self.ordered_boosting = bool(ordered_boosting)
73
+
74
+ def _alloc_hist_buffers(self, n_features, n_bins):
75
+ """Allocate reusable histogram buffers once per fit.
76
+
77
+ Shape (n_features, 2**depth, max_bins). Reused for every tree and level
78
+ via _build_histograms_into, which zeroes the active slice each call.
79
+ This avoids reallocating these (potentially large) arrays thousands of
80
+ times over a long boosting run.
81
+ """
82
+ max_leaves = 1 << self.depth
83
+ max_bins = int(n_bins.max()) if len(n_bins) else 1
84
+ hg = np.zeros((n_features, max_leaves, max_bins))
85
+ hh = np.zeros((n_features, max_leaves, max_bins))
86
+ return (hg, hh)
87
+
88
+ def _feature_mask(self, n_cols, rng):
89
+ """0/1 mask selecting a random subset of columns for one tree."""
90
+ if self.colsample >= 1.0:
91
+ return None
92
+ k = max(1, int(round(self.colsample * n_cols)))
93
+ mask = np.zeros(n_cols, dtype=np.int64)
94
+ mask[rng.choice(n_cols, size=k, replace=False)] = 1
95
+ return mask
96
+
97
+ def _new_preprocessor(self):
98
+ """Build a FeaturePreprocessor configured from this booster's params."""
99
+ return FeaturePreprocessor(self.max_bins, self.cat_smoothing,
100
+ self.random_state)
101
+
102
+ def _maybe_subsample(self, grad, hess, rng):
103
+ """Stochastic row subsampling: zero out the gradient/hessian of rows not
104
+ in this tree's sample. Zeroed rows contribute nothing to the histograms
105
+ but are still routed to leaves, as in standard stochastic GBDT."""
106
+ if self.subsample >= 1.0:
107
+ return grad, hess
108
+ mask = rng.random(grad.shape[0]) < self.subsample
109
+ return np.where(mask, grad, 0.0), np.where(mask, hess, 0.0)
110
+
111
+ def _accumulate_importance(self, tree):
112
+ """Add this tree's per-split gains to the running importance totals,
113
+ mapped from internal columns back to original input features."""
114
+ for f, g in zip(tree.splits_feat, tree.gains):
115
+ orig = self.prep_.feature_map_[f]
116
+ self._importance[orig] += g
117
+
118
+ @property
119
+ def feature_importances_(self):
120
+ """Total split gain per ORIGINAL input column, normalized to sum 1."""
121
+ imp = self._importance.copy()
122
+ s = imp.sum()
123
+ return imp / s if s > 0 else imp
124
+
125
+
126
class GradientBoosting(_BaseBooster):
    """Scalar booster: regression and binary classification.

    Attributes set by `fit`:
        n_threads_       thread count returned by _apply_thread_count
        loss_            loss object built from LOSSES[loss_name]
        lr_              learning rate actually used
        prep_            fitted FeaturePreprocessor
        init_            constant starting score from `loss_.init`
        trees_           list of fitted trees (truncated on early stop)
        train_history_   training loss after every round that was run
        valid_history_   validation loss after every round that was run
        fit_time_        seconds spent in the boosting loop
        best_iteration_  number of trees kept
    """

    def __init__(self, loss="RMSE", loss_kwargs=None, **kw):
        """`loss` is a key into LOSSES and `loss_kwargs` are forwarded to that
        loss's constructor; all other keywords go to _BaseBooster."""
        super().__init__(**kw)
        self.loss_name = loss
        self.loss_kwargs = loss_kwargs or {}

    @staticmethod
    def _has_cat_features(cat_features):
        """True when `cat_features` names at least one column.

        Uses the length rather than truthiness, so a NumPy index array is
        accepted (its truth value is ambiguous and would raise).
        """
        if cat_features is None:
            return False
        try:
            return len(cat_features) > 0
        except TypeError:
            # Not a sized container: fall back to plain truthiness.
            return bool(cat_features)

    @staticmethod
    def _as_feature_matrix(X, has_cats):
        """Object array when categorical columns are present (values must be
        kept verbatim), float64 array otherwise."""
        if has_cats:
            return np.asarray(X, dtype=object)
        return np.asarray(X, dtype=np.float64)

    @staticmethod
    def _normalized_weights(sample_weight, n_samples):
        """Validate `sample_weight` and rescale it to mean 1 (None stays None).

        Raises ValueError when the weights are not a 1-D array of length
        `n_samples`, or when their sum is not a finite positive number
        (rescaling would otherwise yield NaN/inf gradients).
        """
        if sample_weight is None:
            return None
        w = np.asarray(sample_weight, dtype=np.float64)
        if w.shape != (n_samples,):
            raise ValueError(
                f"sample_weight must have shape ({n_samples},), got {w.shape}."
            )
        total = w.sum()
        if not np.isfinite(total) or total <= 0:
            raise ValueError(
                "sample_weight must sum to a finite, positive value."
            )
        return w * (n_samples / total)

    def fit(self, X, y, cat_features=None, eval_set=None, sample_weight=None):
        """Fit the additive model. Optionally pass `cat_features` (column indices
        to target-encode) and `eval_set=(X_val, y_val)` for early stopping.
        `sample_weight` is a 1-D array of per-sample weights; None means uniform.
        Weights are normalized to mean 1 internally so the gradient scale stays
        comparable to the no-weight case.

        Early stopping needs both `early_stopping_rounds` and `eval_set`. When
        it triggers, `trees_` is cut back to the best validation round and the
        feature importances are rebuilt from the trees that were kept. The
        history lists still cover every round that was run.

        Raises ValueError for an invalid `sample_weight`. Returns self.
        """
        has_cats = self._has_cat_features(cat_features)
        X = self._as_feature_matrix(X, has_cats)
        y = np.asarray(y, dtype=np.float64)
        n_samples = X.shape[0]

        # Normalize weights to mean=1. np.ones(n) stays np.ones(n), so
        # sample_weight=np.ones(n) is bitwise-equivalent to sample_weight=None
        # for all losses except MAE/Quantile (which use a different quantile
        # algorithm when weights are present).
        w = self._normalized_weights(sample_weight, n_samples)

        self.n_threads_ = _apply_thread_count(self.thread_count)
        self.loss_ = LOSSES[self.loss_name](**self.loss_kwargs)
        _es = self.early_stopping_rounds is not None and eval_set is not None
        self.lr_ = (self.learning_rate if self.learning_rate is not None
                    else _auto_learning_rate(n_samples, self.iterations, _es))

        self.prep_ = self._new_preprocessor()
        X_binned = self.prep_.fit_transform(X, [y], cat_features)
        n_bins = self.prep_.n_bins_
        hist_buffers = self._alloc_hist_buffers(X_binned.shape[1], n_bins)
        self._importance = np.zeros(self.prep_.n_input_features_)

        Xv_binned = yv = Fv = None
        if eval_set is not None:
            Xv, yv = eval_set
            Xv = self._as_feature_matrix(Xv, has_cats)
            yv = np.asarray(yv, dtype=np.float64)
            Xv_binned = self.prep_.transform(Xv)

        self.init_ = self.loss_.init(y, w)
        F = np.full(n_samples, self.init_, dtype=np.float64)
        if yv is not None:
            Fv = np.full(len(yv), self.init_)

        rng = np.random.default_rng(self.random_state)
        self.trees_ = []
        self.train_history_, self.valid_history_ = [], []
        best_score, best_iter = np.inf, 0
        # Losses flagged `adjusts_leaves` get their leaf values replaced by a
        # residual statistic; the flag does not change during a fit.
        adjusts_leaves = getattr(self.loss_, "adjusts_leaves", False)
        # Monotonic clock: unaffected by system clock adjustments.
        t0 = time.perf_counter()

        for m in range(self.iterations):
            grad, hess = self.loss_.grad_hess(y, F)
            if w is not None:
                grad = grad * w
                hess = hess * w
            g, h = self._maybe_subsample(grad, hess, rng)
            fmask = self._feature_mask(X_binned.shape[1], rng)
            tree = build_oblivious_tree(X_binned, g, h, n_bins, self.depth,
                                        self.l2_leaf_reg, self.lr_,
                                        feature_mask=fmask,
                                        min_child_weight=self.min_child_weight,
                                        hist_buffers=hist_buffers)
            # A depth-0 tree found no legal split; subsequent rounds on the same
            # gradients would too, so stop rather than append empty trees.
            if tree.depth == 0:
                if self.verbose:
                    print(f"No further splits at iteration {m}; stopping.")
                break
            if adjusts_leaves:
                self._correct_leaves(tree, X_binned, y - F, w)
            self.trees_.append(tree)
            self._accumulate_importance(tree)
            if self.ordered_boosting and not adjusts_leaves:
                # Leave-one-out leaf step: each row's update uses its leaf's
                # gradient/hessian totals with that row's own contribution
                # removed, reducing the self-reinforcement of plain boosting.
                # tree.values keeps the standard Newton values for inference;
                # only the training F uses this corrected update. Subsampled-out
                # rows (g=h=0) fall back to the standard leaf value.
                leaf = tree.apply(X_binned)
                n_lv = tree.values.shape[0]
                leaf_G = np.bincount(leaf, weights=g, minlength=n_lv)
                leaf_H = np.bincount(leaf, weights=h, minlength=n_lv)
                F += -self.lr_ * (leaf_G[leaf] - g) / (
                    np.maximum(leaf_H[leaf] - h, 0.0) + self.l2_leaf_reg)
            else:
                F += tree.predict(X_binned)
            self.train_history_.append(self.loss_.eval(y, F, w))

            if Fv is not None:
                Fv += tree.predict(Xv_binned)
                val = self.loss_.eval(yv, Fv)  # validation is always unweighted
                self.valid_history_.append(val)
                if val < best_score - 1e-9:
                    best_score, best_iter = val, m
                elif (self.early_stopping_rounds and
                      m - best_iter >= self.early_stopping_rounds):
                    if self.verbose:
                        print(f"Early stop at {m} (best {best_iter}, "
                              f"val {best_score:.5f})")
                    self.trees_ = self.trees_[: best_iter + 1]
                    # Importances must describe the model that is kept, not
                    # the trees that were just discarded.
                    self._importance = np.zeros(self.prep_.n_input_features_)
                    for kept in self.trees_:
                        self._accumulate_importance(kept)
                    break

            if self.verbose and (m % max(1, self.iterations // 10) == 0):
                msg = f"[{m}] train {self.train_history_[-1]:.5f}"
                if Fv is not None:
                    msg += f"  val {self.valid_history_[-1]:.5f}"
                print(msg)

        self.fit_time_ = time.perf_counter() - t0
        self.best_iteration_ = len(self.trees_)
        return self

    def _correct_leaves(self, tree, X_binned, residuals, sample_weight=None):
        """Override Newton leaf values with the loss-appropriate residual
        statistic (median for MAE, alpha-quantile for Quantile). The tree
        structure was chosen by the gradient; this fixes the step size.

        Leaves that received no training row keep the value the tree builder
        assigned: there are no residuals to take a statistic of.
        """
        leaf = tree.apply(X_binned)
        n_leaves = tree.values.shape[0]
        for leaf_id in range(n_leaves):
            mask = leaf == leaf_id
            r = residuals[mask]
            if r.size == 0:
                # NOTE(review): assumes loss_.leaf_value is not meaningful on
                # an empty residual set — confirm against the loss classes.
                continue
            w = sample_weight[mask] if sample_weight is not None else None
            tree.values[leaf_id] = self.lr_ * self.loss_.leaf_value(r, w)

    def _binned_input(self, X):
        """Coerce X the same way `fit` did and run it through the fitted
        preprocessor."""
        has_cats = self._has_cat_features(self.prep_.cat_features_)
        return self.prep_.transform(self._as_feature_matrix(X, has_cats))

    def predict_raw(self, X):
        """Return raw additive scores (pre-link): the regression prediction, or
        the log-odds for binary classification."""
        X_binned = self._binned_input(X)
        F = np.full(X_binned.shape[0], self.init_, dtype=np.float64)
        for tree in self.trees_:
            F += tree.predict(X_binned)
        return F

    def staged_predict_raw(self, X):
        """Yield the cumulative raw prediction after each tree (1..n_trees).

        Every yielded array is an independent copy.
        """
        X_binned = self._binned_input(X)
        F = np.full(X_binned.shape[0], self.init_, dtype=np.float64)
        for tree in self.trees_:
            F += tree.predict(X_binned)
            yield F.copy()
280
+
281
+
282
class MulticlassBoosting(_BaseBooster):
    """Softmax multiclass booster: fits K trees per round (one per class).

    Attributes set by `fit`:
        classes_         sorted unique training labels
        n_classes_       K, the number of classes
        n_threads_       thread count returned by _apply_thread_count
        loss_            MultiSoftmax(K)
        lr_              learning rate actually used
        prep_            fitted FeaturePreprocessor
        init_            per-class starting scores from `loss_.init`
        trees_           list of rounds, each a list of K trees
        train_history_   training loss after every round that was run
        valid_history_   validation loss after every round that was run
        fit_time_        seconds spent in the boosting loop
        best_iteration_  number of rounds kept
    """

    @staticmethod
    def _has_cat_features(cat_features):
        """True when `cat_features` names at least one column.

        Uses the length rather than truthiness, so a NumPy index array is
        accepted (its truth value is ambiguous and would raise).
        """
        if cat_features is None:
            return False
        try:
            return len(cat_features) > 0
        except TypeError:
            # Not a sized container: fall back to plain truthiness.
            return bool(cat_features)

    @staticmethod
    def _as_feature_matrix(X, has_cats):
        """Object array when categorical columns are present (values must be
        kept verbatim), float64 array otherwise."""
        if has_cats:
            return np.asarray(X, dtype=object)
        return np.asarray(X, dtype=np.float64)

    @staticmethod
    def _normalized_weights(sample_weight, n_samples):
        """Validate `sample_weight` and rescale it to mean 1 (None stays None).

        Raises ValueError when the weights are not a 1-D array of length
        `n_samples`, or when their sum is not a finite positive number
        (rescaling would otherwise yield NaN/inf gradients).
        """
        if sample_weight is None:
            return None
        w = np.asarray(sample_weight, dtype=np.float64)
        if w.shape != (n_samples,):
            raise ValueError(
                f"sample_weight must have shape ({n_samples},), got {w.shape}."
            )
        total = w.sum()
        if not np.isfinite(total) or total <= 0:
            raise ValueError(
                "sample_weight must sum to a finite, positive value."
            )
        return w * (n_samples / total)

    def fit(self, X, y, cat_features=None, eval_set=None, sample_weight=None):
        """Fit K trees per boosting round (one per class) under softmax loss.

        `cat_features` lists the columns to target-encode, `eval_set` is an
        optional `(X_val, y_val)` pair used for early stopping, and
        `sample_weight` is an optional 1-D array that is rescaled to mean 1.

        Early stopping needs both `early_stopping_rounds` and `eval_set`. When
        it triggers, `trees_` is cut back to the best validation round and the
        feature importances are rebuilt from the rounds that were kept. The
        history lists still cover every round that was run.

        Raises ValueError for an invalid `sample_weight`, or when `eval_set`
        contains a label that does not occur in `y`. Returns self.
        """
        has_cats = self._has_cat_features(cat_features)
        X = self._as_feature_matrix(X, has_cats)
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        K = self.classes_.size
        self.n_classes_ = K
        y_idx = np.searchsorted(self.classes_, y)
        Y = np.eye(K)[y_idx]                     # one-hot (n, K)
        n_samples = X.shape[0]

        w = self._normalized_weights(sample_weight, n_samples)

        self.n_threads_ = _apply_thread_count(self.thread_count)
        self.loss_ = MultiSoftmax(K)
        _es = self.early_stopping_rounds is not None and eval_set is not None
        self.lr_ = (self.learning_rate if self.learning_rate is not None
                    else _auto_learning_rate(n_samples, self.iterations, _es))

        # One ordered-TS target per class (CatBoost-style per-class statistics).
        self.prep_ = self._new_preprocessor()
        X_binned = self.prep_.fit_transform(X, [Y[:, k] for k in range(K)],
                                            cat_features)
        n_bins = self.prep_.n_bins_
        hist_buffers = self._alloc_hist_buffers(X_binned.shape[1], n_bins)
        self._importance = np.zeros(self.prep_.n_input_features_)

        Xv_binned = Yv = Fv = yv_idx = None
        if eval_set is not None:
            Xv, yv = eval_set
            Xv = self._as_feature_matrix(Xv, has_cats)
            yv = np.asarray(yv)
            # searchsorted would silently map an unseen label to a neighbouring
            # class (or to the out-of-range index K), so reject it up front.
            known = np.isin(yv, self.classes_)
            if not known.all():
                unseen = np.unique(yv[~known]).tolist()
                raise ValueError(
                    f"eval_set contains labels not present in y: {unseen}"
                )
            yv_idx = np.searchsorted(self.classes_, yv)
            Yv = np.eye(K)[yv_idx]
            Xv_binned = self.prep_.transform(Xv)

        self.init_ = self.loss_.init(Y, w)        # (K,)
        F = np.tile(self.init_, (n_samples, 1))  # (n, K)
        if Yv is not None:
            Fv = np.tile(self.init_, (len(yv_idx), 1))

        rng = np.random.default_rng(self.random_state)
        self.trees_ = []   # list of rounds; each = K trees
        self.train_history_, self.valid_history_ = [], []
        best_score, best_iter = np.inf, 0
        # Monotonic clock: unaffected by system clock adjustments.
        t0 = time.perf_counter()

        for m in range(self.iterations):
            grad, hess = self.loss_.grad_hess(Y, F)   # (n, K) each
            if w is not None:
                grad = grad * w[:, None]
                hess = hess * w[:, None]
            # One column mask per round, shared by all K class trees.
            fmask = self._feature_mask(X_binned.shape[1], rng)
            round_trees = []
            for k in range(K):
                # Row sampling is redrawn for every class tree.
                g, h = self._maybe_subsample(np.ascontiguousarray(grad[:, k]),
                                             np.ascontiguousarray(hess[:, k]), rng)
                tree = build_oblivious_tree(X_binned, g, h, n_bins, self.depth,
                                            self.l2_leaf_reg, self.lr_,
                                            feature_mask=fmask,
                                            min_child_weight=self.min_child_weight,
                                            hist_buffers=hist_buffers)
                round_trees.append(tree)
                self._accumulate_importance(tree)
                if self.ordered_boosting and tree.depth > 0:
                    # Leave-one-out leaf step: each row's update uses its
                    # leaf's gradient/hessian totals with the row's own
                    # contribution removed. tree.values is left untouched
                    # and is what prediction uses.
                    leaf = tree.apply(X_binned)
                    n_lv = tree.values.shape[0]
                    leaf_G = np.bincount(leaf, weights=g, minlength=n_lv)
                    leaf_H = np.bincount(leaf, weights=h, minlength=n_lv)
                    F[:, k] += -self.lr_ * (leaf_G[leaf] - g) / (
                        np.maximum(leaf_H[leaf] - h, 0.0) + self.l2_leaf_reg)
                else:
                    F[:, k] += tree.predict(X_binned)
            # Stop only if EVERY class exhausted its splits this round; if even
            # one class is still learning, the round was productive.
            if all(t.depth == 0 for t in round_trees):
                if self.verbose:
                    print(f"No further splits for any class at iteration {m}; "
                          f"stopping.")
                break
            self.trees_.append(round_trees)
            self.train_history_.append(self.loss_.eval(Y, F, w))

            if Fv is not None:
                for k in range(K):
                    Fv[:, k] += round_trees[k].predict(Xv_binned)
                val = self.loss_.eval(Yv, Fv)  # validation is always unweighted
                self.valid_history_.append(val)
                if val < best_score - 1e-9:
                    best_score, best_iter = val, m
                elif (self.early_stopping_rounds and
                      m - best_iter >= self.early_stopping_rounds):
                    if self.verbose:
                        print(f"Early stop at {m} (best {best_iter})")
                    self.trees_ = self.trees_[: best_iter + 1]
                    # Importances must describe the model that is kept, not
                    # the rounds that were just discarded.
                    self._importance = np.zeros(self.prep_.n_input_features_)
                    for kept_round in self.trees_:
                        for kept in kept_round:
                            self._accumulate_importance(kept)
                    break

            if self.verbose and (m % max(1, self.iterations // 10) == 0):
                msg = f"[{m}] train {self.train_history_[-1]:.5f}"
                if Fv is not None:
                    msg += f"  val {self.valid_history_[-1]:.5f}"
                print(msg)

        self.fit_time_ = time.perf_counter() - t0
        self.best_iteration_ = len(self.trees_)
        return self

    def predict_raw(self, X):
        """Return the (n_samples, n_classes) matrix of raw per-class scores
        (pre-softmax)."""
        has_cats = self._has_cat_features(self.prep_.cat_features_)
        X_binned = self.prep_.transform(self._as_feature_matrix(X, has_cats))
        F = np.tile(self.init_, (X_binned.shape[0], 1))
        for round_trees in self.trees_:
            for k in range(self.n_classes_):
                F[:, k] += round_trees[k].predict(X_binned)
        return F