chimeraboost 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chimeraboost/__init__.py +23 -0
- chimeraboost/binning.py +69 -0
- chimeraboost/booster.py +408 -0
- chimeraboost/losses.py +207 -0
- chimeraboost/preprocessing.py +135 -0
- chimeraboost/sklearn_api.py +306 -0
- chimeraboost/target_encoding.py +117 -0
- chimeraboost/tree.py +239 -0
- chimeraboost-0.5.2.dist-info/METADATA +36 -0
- chimeraboost-0.5.2.dist-info/RECORD +13 -0
- chimeraboost-0.5.2.dist-info/WHEEL +5 -0
- chimeraboost-0.5.2.dist-info/licenses/LICENSE +201 -0
- chimeraboost-0.5.2.dist-info/top_level.txt +1 -0
chimeraboost/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""ChimeraBoost: a CatBoost-inspired gradient boosting library in pure Python.
|
|
2
|
+
|
|
3
|
+
Key ingredients borrowed from CatBoost:
|
|
4
|
+
* Ordered target statistics for categorical features (anti-leakage encoding)
|
|
5
|
+
* Oblivious / symmetric trees (fast, strongly regularized -> good defaults)
|
|
6
|
+
* Histogram-based quantized splitting (numba accelerated)
|
|
7
|
+
|
|
8
|
+
Public API:
|
|
9
|
+
>>> from chimeraboost import ChimeraBoostRegressor, ChimeraBoostClassifier
|
|
10
|
+
>>> model = ChimeraBoostClassifier().fit(X, y, cat_features=[0, 3])
|
|
11
|
+
>>> proba = model.predict_proba(X_test)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from .sklearn_api import (
|
|
15
|
+
ChimeraBoostRegressor,
|
|
16
|
+
ChimeraBoostClassifier,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"ChimeraBoostRegressor",
|
|
21
|
+
"ChimeraBoostClassifier",
|
|
22
|
+
]
|
|
23
|
+
# Keep in sync with the distribution metadata (this wheel is chimeraboost 0.5.2).
__version__ = "0.5.2"
|
chimeraboost/binning.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Quantization of numeric features into integer bins.
|
|
2
|
+
|
|
3
|
+
Borders are learned once on the training data (quantile based). Every feature
|
|
4
|
+
is mapped to a small integer bin index, which is what the tree builder consumes.
|
|
5
|
+
NaNs are routed to a dedicated bin so a split can isolate missing values, the
|
|
6
|
+
way CatBoost/LightGBM do.
|
|
7
|
+
|
|
8
|
+
Bin layout per feature:
|
|
9
|
+
real values -> 0 .. n_borders (via searchsorted on borders)
|
|
10
|
+
NaN -> n_borders + 1 (the highest bin, "missing")
|
|
11
|
+
The histogram width for a feature is therefore (n_borders + 2).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
BIN_DTYPE = np.uint16
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _feature_borders(col, max_bins):
    """Compute the sorted split borders for a single numeric column.

    Non-finite entries (NaN and +/-inf) are dropped before any border is
    learned.

    Parameters
    ----------
    col : np.ndarray
        1-D float column.
    max_bins : int
        Upper bound on the number of bins for finite values.

    Returns
    -------
    np.ndarray
        float64 array of strictly increasing borders. It is empty when the
        column has no finite value or only one distinct finite value.
    """
    usable = col[np.isfinite(col)]
    if not usable.size:
        return np.array([], dtype=np.float64)

    distinct = np.unique(usable)
    if distinct.size > max_bins:
        # Too many distinct values: cut at the interior quantiles and drop
        # duplicate cut points produced by heavily repeated values.
        levels = np.linspace(0.0, 1.0, max_bins + 1)[1:-1]
        cuts = np.unique(np.quantile(usable, levels))
        return cuts.astype(np.float64)

    # Few distinct values: one border halfway between each adjacent pair.
    midpoints = (distinct[:-1] + distinct[1:]) / 2.0
    return midpoints.astype(np.float64)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class Binner:
    """Quantizer that learns per-column borders and encodes floats as bins.

    After `fit`, `borders_` holds one sorted border array per column and
    `n_bins_` holds the histogram width of each column, which is
    ``len(borders) + 2``: one extra slot for the top searchsorted bucket and
    one for non-finite values.
    """

    def __init__(self, max_bins=128):
        self.max_bins = int(max_bins)
        # Populated by fit():
        self.borders_ = None  # list of np.ndarray, one per feature
        self.n_bins_ = None   # np.ndarray of int64, histogram width per feature

    def fit(self, X):
        """Learn the borders of every column of `X` and return self."""
        X = np.asarray(X, dtype=np.float64)
        learned = []
        for j in range(X.shape[1]):
            learned.append(_feature_borders(X[:, j], self.max_bins))
        self.borders_ = learned
        # Width = borders + 1 (upper searchsorted bucket) + 1 (missing bucket).
        widths = [len(cuts) + 2 for cuts in learned]
        self.n_bins_ = np.array(widths, dtype=np.int64)
        return self

    def transform(self, X):
        """Encode `X` as a BIN_DTYPE matrix of bin indices.

        Finite values get ``searchsorted(borders, value, side="right")``.
        Non-finite values (NaN, +/-inf) are sent to the dedicated top bin
        ``len(borders) + 1``.
        """
        X = np.asarray(X, dtype=np.float64)
        n_rows, n_cols = X.shape
        encoded = np.empty((n_rows, n_cols), dtype=BIN_DTYPE)
        for j in range(n_cols):
            cuts = self.borders_[j]
            column = X[:, j]
            codes = np.searchsorted(cuts, column, side="right").astype(BIN_DTYPE)
            # Overwrite whatever searchsorted produced for NaN/inf entries.
            codes[~np.isfinite(column)] = len(cuts) + 1
            encoded[:, j] = codes
        return encoded

    def fit_transform(self, X):
        """Fit on `X`, then return its binned encoding."""
        self.fit(X)
        return self.transform(X)
|
chimeraboost/booster.py
ADDED
|
@@ -0,0 +1,408 @@
|
|
|
1
|
+
"""The gradient boosting core: builds the full additive model.
|
|
2
|
+
|
|
3
|
+
Two boosters share the same machinery (FeaturePreprocessor, oblivious trees):
|
|
4
|
+
* GradientBoosting -> scalar output (regression, binary classification)
|
|
5
|
+
* MulticlassBoosting -> K simultaneous outputs (softmax multiclass)
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import time
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
from .losses import LOSSES, MultiSoftmax
|
|
12
|
+
from .preprocessing import FeaturePreprocessor
|
|
13
|
+
from .tree import build_oblivious_tree
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _apply_thread_count(thread_count):
    """Configure numba's thread pool and report the size actually used.

    `None` or any negative value selects numba's configured maximum
    (``numba.config.NUMBA_NUM_THREADS``). Any other value is converted to int
    and clamped into ``[1, maximum]``, so ``0`` yields a single thread.

    Returns
    -------
    int
        The thread count passed to ``numba.set_num_threads``.
    """
    import numba

    available = numba.config.NUMBA_NUM_THREADS
    use_all = thread_count is None or thread_count < 0
    effective = available if use_all else max(1, min(int(thread_count), available))
    numba.set_num_threads(effective)
    return effective
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _auto_learning_rate(n_samples, iterations, early_stopping):
    """Pick a default learning rate when the user did not specify one.

    With early stopping the rate is a fixed 0.1, because the validation set
    decides how many trees are kept.

    Without early stopping the rate scales inversely with the iteration
    budget: ``20 / iterations`` clipped to ``[0.03, 0.2]``. A budget of zero
    or less is treated as a single iteration.

    NOTE(review): an earlier version of this docstring claimed the
    early-stopping default was 0.05; the code returns 0.1. The docstring now
    describes the actual behaviour -- confirm which value is intended.

    Parameters
    ----------
    n_samples : int
        Number of training rows. Currently unused; kept so existing callers
        do not break.
    iterations : int
        Boosting iteration budget.
    early_stopping : bool
        Whether early stopping is active for this fit.

    Returns
    -------
    float
        The learning rate to use.
    """
    if early_stopping:
        return 0.1
    lr = 20.0 / max(iterations, 1)
    return float(np.clip(lr, 0.03, 0.2))
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class _BaseBooster:
    """Common base for the scalar and multiclass boosters.

    Stores the shared hyperparameters and provides the helpers both
    subclasses rely on: histogram-buffer allocation, per-tree column
    sampling, per-tree row sampling, preprocessor construction and split-gain
    feature importances. Subclasses supply `fit` and `predict_raw`.
    """

    def __init__(self, iterations=500, learning_rate=None, depth=6,
                 l2_leaf_reg=3.0, max_bins=128, subsample=1.0,
                 colsample=1.0, cat_smoothing=1.0, early_stopping_rounds=None,
                 min_child_weight=1.0, thread_count=None, random_state=None,
                 verbose=False, ordered_boosting=True):
        # Numeric knobs are coerced eagerly so bad values fail at construction.
        self.iterations = int(iterations)
        self.learning_rate = learning_rate  # None -> chosen automatically in fit
        self.depth = int(depth)
        self.l2_leaf_reg = float(l2_leaf_reg)
        self.max_bins = int(max_bins)
        self.subsample = float(subsample)
        self.colsample = float(colsample)
        self.cat_smoothing = float(cat_smoothing)
        self.early_stopping_rounds = early_stopping_rounds
        self.min_child_weight = float(min_child_weight)
        self.thread_count = thread_count
        self.random_state = random_state
        self.verbose = verbose
        self.ordered_boosting = bool(ordered_boosting)

    def _alloc_hist_buffers(self, n_features, n_bins):
        """Create the (gradient, hessian) histogram buffers for one fit.

        Both arrays are zero-filled with shape
        ``(n_features, 2**depth, widest_bin_count)``. They are allocated once
        and handed to the tree builder for every tree, so these potentially
        large arrays are not reallocated on each round.
        """
        leaf_cap = 1 << self.depth
        width = int(n_bins.max()) if len(n_bins) else 1
        shape = (n_features, leaf_cap, width)
        grad_hist = np.zeros(shape)
        hess_hist = np.zeros(shape)
        return (grad_hist, hess_hist)

    def _feature_mask(self, n_cols, rng):
        """Draw the 0/1 column mask for one tree, or None to use all columns."""
        if self.colsample >= 1.0:
            return None
        n_keep = max(1, int(round(self.colsample * n_cols)))
        selected = np.zeros(n_cols, dtype=np.int64)
        chosen = rng.choice(n_cols, size=n_keep, replace=False)
        selected[chosen] = 1
        return selected

    def _new_preprocessor(self):
        """Create a FeaturePreprocessor from this booster's settings."""
        return FeaturePreprocessor(
            self.max_bins, self.cat_smoothing, self.random_state
        )

    def _maybe_subsample(self, grad, hess, rng):
        """Apply stochastic row sampling to the gradient and hessian.

        Rows left out of this tree's sample get a zero gradient and hessian.
        When `subsample` is 1.0 or more, the inputs are returned untouched.
        """
        if self.subsample >= 1.0:
            return grad, hess
        keep = rng.random(grad.shape[0]) < self.subsample
        sampled_grad = np.where(keep, grad, 0.0)
        sampled_hess = np.where(keep, hess, 0.0)
        return sampled_grad, sampled_hess

    def _accumulate_importance(self, tree):
        """Credit each split gain of `tree` to its original input feature.

        Internal column indices are translated to input features through
        ``self.prep_.feature_map_``.
        """
        for col, gain in zip(tree.splits_feat, tree.gains):
            self._importance[self.prep_.feature_map_[col]] += gain

    @property
    def feature_importances_(self):
        """Split gain per ORIGINAL input column, scaled to sum to 1.

        If no gain was recorded, the raw all-zero totals are returned.
        """
        totals = self._importance.copy()
        denom = totals.sum()
        if denom > 0:
            return totals / denom
        return totals
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class GradientBoosting(_BaseBooster):
    """Scalar booster: regression and binary classification.

    Fits one oblivious tree per boosting round on the gradient/hessian of the
    configured loss. The loss is looked up by name in `LOSSES` and built with
    `loss_kwargs`.
    """

    def __init__(self, loss="RMSE", loss_kwargs=None, **kw):
        super().__init__(**kw)
        self.loss_name = loss
        self.loss_kwargs = loss_kwargs or {}

    def fit(self, X, y, cat_features=None, eval_set=None, sample_weight=None):
        """Fit the additive model.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training features. Converted to an object array when
            `cat_features` is given, otherwise to float64.
        y : array-like, shape (n_samples,)
            Training targets, converted to float64.
        cat_features : sequence of int, optional
            Column indices to target-encode.
        eval_set : tuple (X_val, y_val), optional
            Validation data. It is scored every round and drives early
            stopping when `early_stopping_rounds` is set. Validation is
            always unweighted.
        sample_weight : array-like, shape (n_samples,), optional
            Per-sample weights; None means uniform. Weights are normalized to
            mean 1 internally so the gradient scale stays comparable to the
            unweighted case.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If `sample_weight` does not have a strictly positive sum, since
            normalizing it would produce inf/NaN weights.
        """
        X = (np.asarray(X, dtype=object) if cat_features
             else np.asarray(X, dtype=np.float64))
        y = np.asarray(y, dtype=np.float64)
        n_samples = X.shape[0]

        # Normalize weights to mean=1. np.ones(n) stays np.ones(n), so
        # sample_weight=np.ones(n) is bitwise-equivalent to sample_weight=None
        # for all losses except MAE/Quantile (which use a different quantile
        # algorithm when weights are present).
        w = None
        if sample_weight is not None:
            w = np.asarray(sample_weight, dtype=np.float64)
            weight_total = w.sum()
            # `not > 0` also rejects a NaN total.
            if not weight_total > 0:
                raise ValueError("sample_weight must have a positive sum")
            w = w * (n_samples / weight_total)

        self.n_threads_ = _apply_thread_count(self.thread_count)
        self.loss_ = LOSSES[self.loss_name](**self.loss_kwargs)
        _es = self.early_stopping_rounds is not None and eval_set is not None
        self.lr_ = (self.learning_rate if self.learning_rate is not None
                    else _auto_learning_rate(n_samples, self.iterations, _es))

        self.prep_ = self._new_preprocessor()
        X_binned = self.prep_.fit_transform(X, [y], cat_features)
        n_bins = self.prep_.n_bins_
        hist_buffers = self._alloc_hist_buffers(X_binned.shape[1], n_bins)
        self._importance = np.zeros(self.prep_.n_input_features_)

        Xv_binned = yv = Fv = None
        if eval_set is not None:
            Xv, yv = eval_set
            Xv = (np.asarray(Xv, dtype=object) if cat_features
                  else np.asarray(Xv, dtype=np.float64))
            yv = np.asarray(yv, dtype=np.float64)
            Xv_binned = self.prep_.transform(Xv)

        self.init_ = self.loss_.init(y, w)
        F = np.full(n_samples, self.init_, dtype=np.float64)
        if yv is not None:
            # Explicit float64 so in-place `+=` of tree outputs cannot hit a
            # casting error, matching how the training scores F are built.
            Fv = np.full(len(yv), self.init_, dtype=np.float64)

        rng = np.random.default_rng(self.random_state)
        self.trees_ = []
        self.train_history_, self.valid_history_ = [], []
        best_score, best_iter = np.inf, 0
        t0 = time.time()

        # The loss is fixed for the whole fit, so resolve these flags once.
        adjusts_leaves = getattr(self.loss_, "adjusts_leaves", False)
        use_ordered_update = self.ordered_boosting and not adjusts_leaves

        for m in range(self.iterations):
            grad, hess = self.loss_.grad_hess(y, F)
            if w is not None:
                grad = grad * w
                hess = hess * w
            # RNG draw order (rows, then columns) is part of reproducibility.
            g, h = self._maybe_subsample(grad, hess, rng)
            fmask = self._feature_mask(X_binned.shape[1], rng)
            tree = build_oblivious_tree(X_binned, g, h, n_bins, self.depth,
                                        self.l2_leaf_reg, self.lr_,
                                        feature_mask=fmask,
                                        min_child_weight=self.min_child_weight,
                                        hist_buffers=hist_buffers)
            # A depth-0 tree found no legal split; subsequent rounds on the same
            # gradients would too, so stop rather than append empty trees.
            if tree.depth == 0:
                if self.verbose:
                    print(f"No further splits at iteration {m}; stopping.")
                break
            if adjusts_leaves:
                self._correct_leaves(tree, X_binned, y - F, w)
            self.trees_.append(tree)
            self._accumulate_importance(tree)
            if use_ordered_update:
                # Leave-one-out leaf step: each row's update uses its leaf's
                # gradient/hessian totals with that row's own contribution
                # removed, reducing the self-reinforcement of plain boosting.
                # tree.values keeps the standard Newton values for inference;
                # only the training F uses this corrected update. Subsampled-out
                # rows (g=h=0) fall back to the standard leaf value.
                leaf = tree.apply(X_binned)
                n_lv = tree.values.shape[0]
                leaf_G = np.bincount(leaf, weights=g, minlength=n_lv)
                leaf_H = np.bincount(leaf, weights=h, minlength=n_lv)
                F += -self.lr_ * (leaf_G[leaf] - g) / (
                    np.maximum(leaf_H[leaf] - h, 0.0) + self.l2_leaf_reg)
            else:
                F += tree.predict(X_binned)
            self.train_history_.append(self.loss_.eval(y, F, w))

            if Fv is not None:
                Fv += tree.predict(Xv_binned)
                val = self.loss_.eval(yv, Fv)  # validation is always unweighted
                self.valid_history_.append(val)
                if val < best_score - 1e-9:
                    best_score, best_iter = val, m
                elif (self.early_stopping_rounds and
                        m - best_iter >= self.early_stopping_rounds):
                    if self.verbose:
                        print(f"Early stop at {m} (best {best_iter}, "
                              f"val {best_score:.5f})")
                    self.trees_ = self.trees_[: best_iter + 1]
                    # Importances must describe the model that is kept, so
                    # rebuild them without the discarded trees' gains.
                    self._importance[:] = 0.0
                    for kept in self.trees_:
                        self._accumulate_importance(kept)
                    break

            if self.verbose and (m % max(1, self.iterations // 10) == 0):
                msg = f"[{m}] train {self.train_history_[-1]:.5f}"
                if Fv is not None:
                    msg += f" val {self.valid_history_[-1]:.5f}"
                print(msg)

        self.fit_time_ = time.time() - t0
        # Number of trees retained in the final model.
        self.best_iteration_ = len(self.trees_)
        return self

    def _correct_leaves(self, tree, X_binned, residuals, sample_weight=None):
        """Override Newton leaf values with the loss-appropriate residual
        statistic (median for MAE, alpha-quantile for Quantile). The tree
        structure was chosen by the gradient; this fixes the step size.

        Leaves that receive no training row get a value of 0.0. There are no
        residuals to summarize for them, and a statistic of an empty array
        can be NaN, which would poison predictions for unseen rows routed to
        that leaf. (Assumes `leaf_value` has no special empty-input handling
        -- TODO confirm against losses.py.)
        """
        leaf = tree.apply(X_binned)
        n_leaves = tree.values.shape[0]
        for leaf_id in range(n_leaves):
            mask = leaf == leaf_id
            if not mask.any():
                tree.values[leaf_id] = 0.0
                continue
            r = residuals[mask]
            w = sample_weight[mask] if sample_weight is not None else None
            tree.values[leaf_id] = self.lr_ * self.loss_.leaf_value(r, w)

    def _init_raw(self, X):
        """Preprocess `X` and return ``(X_binned, F)`` where F is the raw
        score vector initialised to the fitted constant `init_`."""
        X = (np.asarray(X, dtype=object) if self.prep_.cat_features_
             else np.asarray(X, dtype=np.float64))
        X_binned = self.prep_.transform(X)
        F = np.full(X_binned.shape[0], self.init_, dtype=np.float64)
        return X_binned, F

    def predict_raw(self, X):
        """Return raw additive scores (pre-link): the regression prediction, or
        the log-odds for binary classification."""
        X_binned, F = self._init_raw(X)
        for tree in self.trees_:
            F += tree.predict(X_binned)
        return F

    def staged_predict_raw(self, X):
        """Yield the cumulative raw prediction after each tree (1..n_trees).

        Each yielded array is an independent copy."""
        X_binned, F = self._init_raw(X)
        for tree in self.trees_:
            F += tree.predict(X_binned)
            yield F.copy()
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
class MulticlassBoosting(_BaseBooster):
    """Softmax multiclass booster: fits K trees per round (one per class).

    `trees_` is a list of rounds; each round is a list of K trees, where
    tree k updates the raw score of class ``classes_[k]``.
    """

    def fit(self, X, y, cat_features=None, eval_set=None, sample_weight=None):
        """Fit K trees per boosting round (one per class) under softmax loss.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training features. Converted to an object array when
            `cat_features` is given, otherwise to float64.
        y : array-like, shape (n_samples,)
            Class labels. The sorted unique labels become `classes_`.
        cat_features : sequence of int, optional
            Column indices to target-encode (one target statistic per class).
        eval_set : tuple (X_val, y_val), optional
            Validation data. It is scored every round and drives early
            stopping when `early_stopping_rounds` is set. Validation is
            always unweighted.
        sample_weight : array-like, shape (n_samples,), optional
            Per-sample weights, normalized to mean 1 internally.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If `sample_weight` does not have a strictly positive sum, or if
            `eval_set` contains a label that never appears in `y`.
        """
        X = (np.asarray(X, dtype=object) if cat_features
             else np.asarray(X, dtype=np.float64))
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        K = self.classes_.size
        self.n_classes_ = K
        y_idx = np.searchsorted(self.classes_, y)
        Y = np.eye(K)[y_idx]  # one-hot (n, K)
        n_samples = X.shape[0]

        w = None
        if sample_weight is not None:
            w = np.asarray(sample_weight, dtype=np.float64)
            weight_total = w.sum()
            # `not > 0` also rejects a NaN total.
            if not weight_total > 0:
                raise ValueError("sample_weight must have a positive sum")
            w = w * (n_samples / weight_total)

        self.n_threads_ = _apply_thread_count(self.thread_count)
        self.loss_ = MultiSoftmax(K)
        _es = self.early_stopping_rounds is not None and eval_set is not None
        self.lr_ = (self.learning_rate if self.learning_rate is not None
                    else _auto_learning_rate(n_samples, self.iterations, _es))

        # One ordered-TS target per class (CatBoost-style per-class statistics).
        self.prep_ = self._new_preprocessor()
        X_binned = self.prep_.fit_transform(X, [Y[:, k] for k in range(K)],
                                            cat_features)
        n_bins = self.prep_.n_bins_
        hist_buffers = self._alloc_hist_buffers(X_binned.shape[1], n_bins)
        self._importance = np.zeros(self.prep_.n_input_features_)

        Xv_binned = Yv = Fv = yv_idx = None
        if eval_set is not None:
            Xv, yv = eval_set
            Xv = (np.asarray(Xv, dtype=object) if cat_features
                  else np.asarray(Xv, dtype=np.float64))
            yv = np.asarray(yv)
            # searchsorted would silently map an unseen label onto a
            # neighbouring class (or index past the one-hot table), so reject
            # unseen labels explicitly.
            if not np.all(np.isin(yv, self.classes_)):
                raise ValueError(
                    "eval_set contains labels not present in the training "
                    "targets")
            yv_idx = np.searchsorted(self.classes_, yv)
            Yv = np.eye(K)[yv_idx]
            Xv_binned = self.prep_.transform(Xv)

        self.init_ = self.loss_.init(Y, w)  # (K,)
        F = np.tile(self.init_, (n_samples, 1))  # (n, K)
        if Yv is not None:
            Fv = np.tile(self.init_, (len(yv_idx), 1))

        rng = np.random.default_rng(self.random_state)
        self.trees_ = []  # list of rounds; each = K trees
        self.train_history_, self.valid_history_ = [], []
        best_score, best_iter = np.inf, 0
        t0 = time.time()

        for m in range(self.iterations):
            grad, hess = self.loss_.grad_hess(Y, F)  # (n, K) each
            if w is not None:
                grad = grad * w[:, None]
                hess = hess * w[:, None]
            # One column mask per round, shared by all K trees; row sampling
            # is drawn separately for each class. RNG draw order matters for
            # reproducibility.
            fmask = self._feature_mask(X_binned.shape[1], rng)
            round_trees = []
            for k in range(K):
                g, h = self._maybe_subsample(np.ascontiguousarray(grad[:, k]),
                                             np.ascontiguousarray(hess[:, k]), rng)
                tree = build_oblivious_tree(X_binned, g, h, n_bins, self.depth,
                                            self.l2_leaf_reg, self.lr_,
                                            feature_mask=fmask,
                                            min_child_weight=self.min_child_weight,
                                            hist_buffers=hist_buffers)
                round_trees.append(tree)
                self._accumulate_importance(tree)
                if self.ordered_boosting and tree.depth > 0:
                    # Leave-one-out leaf step: remove each row's own
                    # gradient/hessian from its leaf totals before taking the
                    # step. tree.values (used at inference) is left untouched.
                    leaf = tree.apply(X_binned)
                    n_lv = tree.values.shape[0]
                    leaf_G = np.bincount(leaf, weights=g, minlength=n_lv)
                    leaf_H = np.bincount(leaf, weights=h, minlength=n_lv)
                    F[:, k] += -self.lr_ * (leaf_G[leaf] - g) / (
                        np.maximum(leaf_H[leaf] - h, 0.0) + self.l2_leaf_reg)
                else:
                    F[:, k] += tree.predict(X_binned)
            # Stop only if EVERY class exhausted its splits this round; if even
            # one class is still learning, the round was productive.
            if all(t.depth == 0 for t in round_trees):
                if self.verbose:
                    print(f"No further splits for any class at iteration {m}; "
                          f"stopping.")
                break
            self.trees_.append(round_trees)
            self.train_history_.append(self.loss_.eval(Y, F, w))

            if Fv is not None:
                for k in range(K):
                    Fv[:, k] += round_trees[k].predict(Xv_binned)
                val = self.loss_.eval(Yv, Fv)  # validation is always unweighted
                self.valid_history_.append(val)
                if val < best_score - 1e-9:
                    best_score, best_iter = val, m
                elif (self.early_stopping_rounds and
                        m - best_iter >= self.early_stopping_rounds):
                    if self.verbose:
                        print(f"Early stop at {m} (best {best_iter})")
                    self.trees_ = self.trees_[: best_iter + 1]
                    # Importances must describe the model that is kept, so
                    # rebuild them without the discarded rounds' gains.
                    self._importance[:] = 0.0
                    for kept_round in self.trees_:
                        for kept in kept_round:
                            self._accumulate_importance(kept)
                    break

            if self.verbose and (m % max(1, self.iterations // 10) == 0):
                msg = f"[{m}] train {self.train_history_[-1]:.5f}"
                if Fv is not None:
                    msg += f" val {self.valid_history_[-1]:.5f}"
                print(msg)

        self.fit_time_ = time.time() - t0
        # Number of boosting rounds retained in the final model.
        self.best_iteration_ = len(self.trees_)
        return self

    def predict_raw(self, X):
        """Return the (n_samples, n_classes) matrix of raw per-class scores
        (pre-softmax)."""
        X = (np.asarray(X, dtype=object) if self.prep_.cat_features_
             else np.asarray(X, dtype=np.float64))
        X_binned = self.prep_.transform(X)
        F = np.tile(self.init_, (X_binned.shape[0], 1))
        for round_trees in self.trees_:
            for k in range(self.n_classes_):
                F[:, k] += round_trees[k].predict(X_binned)
        return F
|