path-boost 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,394 @@
1
+ import pandas as pd
2
+ import copy
3
+ import numpy as np
4
+ import warnings
5
+ import matplotlib.pyplot as plt
6
+ from sklearn.metrics import log_loss, accuracy_score
7
+ from .extended_boosting_matrix import ExtendedBoostingMatrix
8
+ from typing import Iterable
9
+ from .interfaces.interface_base_learner import BaseLearnerClassInterface
10
+ from sklearn.tree import DecisionTreeRegressor
11
+
12
+
13
class AdditiveModelWrapperClassifier:
    """Additive (boosted) model for binary classification with labels in {0, 1}.

    The model keeps a running score ``F(x)`` that is turned into a
    probability with a sigmoid. The first call to :meth:`fit_one_step` fits a
    ``FirstConstantBaseLearner`` whose output is used unscaled; each later
    call fits a new base learner on the centred negative gradient
    ``y - sigmoid(F(x))`` and adds ``mean + learning_rate * h(x)`` to ``F``.

    Args:
        BaseModelClass: Class used for every base learner after the first
            one. Must be a subclass of ``BaseLearnerClassInterface``. Ignored
            (replaced by ``BoostedTreeBaselearner``) when ``use_tree_boost``
            is true.
        base_model_class_kwargs: Keyword arguments forwarded to the base
            learner constructor, or ``None`` to call it without arguments.
        learning_rate: Multiplier applied to every base learner's output
            except the first (constant) one.
        use_tree_boost: If true, use ``BoostedTreeBaselearner`` and pass it
            the current scores and true labels when fitting.

    Raises:
        TypeError: If the selected base learner class does not subclass
            ``BaseLearnerClassInterface``.
    """

    def __init__(
        self,
        BaseModelClass,
        base_model_class_kwargs,
        learning_rate: float,
        use_tree_boost: bool = False,
    ):
        self.use_tree_boost = use_tree_boost
        if self.use_tree_boost:
            BaseModelClass = BoostedTreeBaselearner

        # Ensure BaseModelClass respects BaseLearnerClassInterface
        if not issubclass(BaseModelClass, BaseLearnerClassInterface):
            raise TypeError(
                f"{BaseModelClass.__name__} must implement BaseLearnerClassInterface"
            )

        self._last_train_prediction: pd.Series | None = None

        # Per-iteration training metrics.
        self.train_logloss = []
        self.train_accuracy = []
        # One history list per eval set; entries are None for missing sets.
        self.eval_sets_logloss: list[list[float]] = []
        self.eval_sets_accuracy: list[list[float]] = []
        self.learning_rate = learning_rate
        self.base_learners_list: list[BaseLearnerClassInterface] = []
        # Columns handed to the base learner of the same position.
        self.considered_columns = []
        self.BaseModelClass = BaseModelClass
        self.base_model_class_kwargs = base_model_class_kwargs

    def fit_one_step(self, X: pd.DataFrame, y, best_path, eval_set=None):
        """Fit one boosting step and record train/eval metrics.

        Args:
            X: Training features. Only the columns that
                ``ExtendedBoostingMatrix.get_columns_related_to_path`` returns
                for ``best_path`` are given to the new base learner.
            y: Binary labels (0/1), positionally matched with the rows of
                ``X``.
            best_path: Path used to select the columns for this step.
            eval_set: Optional list whose items are ``(DataFrame, labels)``
                tuples or ``None``. Log-loss and accuracy are tracked for
                each non-``None`` item.

        Returns:
            ``self``.
        """
        columns_to_keep = ExtendedBoostingMatrix.get_columns_related_to_path(
            best_path, X.columns
        )
        restricted_df = X[columns_to_keep]
        # Work positionally: the running train score carries a default
        # RangeIndex, so label-based pandas alignment against ``y`` would
        # silently produce NaN / misordered residuals for any other index.
        y_array = np.asarray(y)

        self.trained_ = True
        if eval_set is not None and not hasattr(self, "_last_eval_set_prediction_"):
            # Running score F for each eval set, starting from zero.
            self._last_eval_set_prediction_ = [
                None
                if eval_tuple is None
                else pd.Series(
                    np.zeros(len(eval_tuple[0])), index=eval_tuple[0].index
                )
                for eval_tuple in eval_set
            ]

        is_first_step = len(self.base_learners_list) == 0
        if is_first_step:
            # First step: constant learner, no negative gradient needed.
            new_base_learner = FirstConstantBaseLearner()
            self._target_variable_mean_ = [0.0]
            new_base_learner.fit(restricted_df, y_array)
            # The constant learner is never shrunk by the learning rate
            # (same convention as predict_step_by_step).
            step_scale = 1.0
        else:
            if self.base_model_class_kwargs is not None:
                new_base_learner = self.BaseModelClass(**self.base_model_class_kwargs)
            else:
                new_base_learner = self.BaseModelClass()

            negative_gradient = self._neg_gradient(
                y=y_array,
                y_hat=self._last_train_prediction_probability.to_numpy(),
            )
            new_y = np.array(negative_gradient)

            # The target is centred; its mean is stored and added back as a
            # constant offset whenever this learner's output is used.
            self._target_variable_mean_.append(new_y.mean())
            new_y = new_y - self._target_variable_mean_[-1]

            if self.use_tree_boost:
                new_base_learner.fit(
                    restricted_df,
                    neg_gradient=new_y,
                    current_f=self._last_train_model_output_F.values,
                    y_true=y_array,
                )
            else:
                new_base_learner.fit(restricted_df, new_y)
            step_scale = self.learning_rate

        self.base_learners_list.append(new_base_learner)
        self.considered_columns.append(columns_to_keep)

        # Contribution of the new learner to the train score F(x).
        train_increment = self._target_variable_mean_[-1] + step_scale * np.asarray(
            new_base_learner.predict(restricted_df)
        )
        if is_first_step:
            self._last_train_model_output_F = pd.Series(train_increment)
        else:
            self._last_train_model_output_F += train_increment

        # Transform the score F into a probability, then into a class.
        self._last_train_prediction_probability = self._sigmoid(
            self._last_train_model_output_F
        )
        self._last_train_prediction = (
            self._last_train_prediction_probability >= 0.5
        ).astype(int)

        # labels=[0, 1] matches the eval-set call below and keeps log_loss
        # defined when the labels contain a single class.
        self.train_logloss.append(
            log_loss(
                y_true=y_array,
                y_pred=self._last_train_prediction_probability,
                labels=[0, 1],
            )
        )
        self.train_accuracy.append(
            accuracy_score(y_true=y_array, y_pred=self._last_train_prediction)
        )

        if eval_set is not None:
            this_iter_eval_set_logloss: list[float | None] = [None] * len(eval_set)
            this_iter_eval_set_accuracy: list[float | None] = [None] * len(eval_set)

            for i, eval_tuple in enumerate(eval_set):
                if eval_tuple is None:
                    self._last_eval_set_prediction_[i] = None
                    continue
                ebm_df_eval, y_eval = eval_tuple
                assert isinstance(ebm_df_eval, pd.DataFrame)

                # Same scale as the train update: 1.0 for the constant first
                # learner, learning_rate afterwards. Using learning_rate on
                # the first step would make eval metrics disagree with
                # predict_step_by_step.
                eval_increment = self._target_variable_mean_[
                    -1
                ] + step_scale * np.asarray(
                    new_base_learner.predict(ebm_df_eval[columns_to_keep])
                )
                self._last_eval_set_prediction_[i] += eval_increment

                last_eval_set_prediction_probability = self._sigmoid(
                    self._last_eval_set_prediction_[i]
                )
                # Kept as an attribute for backward compatibility; it only
                # holds the classes of the last processed eval set.
                self._last_eval_set_prediction = (
                    last_eval_set_prediction_probability >= 0.5
                ).astype(int)

                this_iter_eval_set_logloss[i] = log_loss(
                    y_true=y_eval,
                    y_pred=last_eval_set_prediction_probability,
                    labels=[0, 1],
                )
                this_iter_eval_set_accuracy[i] = accuracy_score(
                    y_true=y_eval, y_pred=self._last_eval_set_prediction
                )

            self._append_history(self.eval_sets_logloss, this_iter_eval_set_logloss)
            self._append_history(self.eval_sets_accuracy, this_iter_eval_set_accuracy)

        return self

    @staticmethod
    def _append_history(history, values):
        """Append one value per eval set to ``history`` (created on first use)."""
        if len(history) == 0:
            history.extend([value] for value in values)
        else:
            for i, value in enumerate(values):
                history[i].append(value)

    def predict(self, X: pd.DataFrame, class_probability: bool = False, **kwargs):
        """Return the prediction of the full model (last boosting step).

        Args:
            X: Input features.
            class_probability: If true return probabilities of class 1,
                otherwise 0/1 class predictions.
            **kwargs: Forwarded to every base learner's ``predict``.
        """
        predictions = self.predict_step_by_step(
            X, return_class_probability=class_probability, **kwargs
        )
        return predictions[-1]

    def predict_step_by_step(
        self, X: pd.DataFrame, return_class_probability=False, **kwargs
    ) -> list[np.ndarray]:
        """
        Generates predictions for each boosting iteration step.

        Args:
            X (pd.DataFrame): Input features for prediction.
            return_class_probability (bool, optional): If True, returns class probabilities for each step.
                If False, returns binary class predictions. Defaults to False.
            **kwargs: Additional keyword arguments passed to the base learner's predict method.

        Returns:
            list[np.ndarray]: List of predictions for each boosting step. Each element is either
                an array of class probabilities or binary predictions, depending on `return_class_probability`.
        """
        prediction = []
        last_prediction_model_F = np.zeros(len(X))
        for i, base_learner in enumerate(self.base_learners_list):
            chosen_columns = self.considered_columns[i]
            # The first base learner is a constant and is not scaled by the
            # learning rate.
            learning_rate = 1.0 if i == 0 else self.learning_rate
            last_prediction_model_F += self._target_variable_mean_[
                i
            ] + learning_rate * np.array(
                base_learner.predict(X[chosen_columns], **kwargs)
            )

            # _sigmoid allocates a new array, so earlier steps are not
            # affected by the in-place update of the running score.
            last_prediction_probability = self._sigmoid(last_prediction_model_F)
            if return_class_probability:
                prediction.append(last_prediction_probability)
            else:
                prediction.append((last_prediction_probability >= 0.5).astype(int))

        return prediction

    def evaluate(self, X: pd.DataFrame, y: Iterable, **kwargs) -> list[float]:
        """Return the log-loss on ``(X, y)`` after each boosting step."""
        predictions = self.predict_step_by_step(
            X, return_class_probability=True, **kwargs
        )
        # labels=[0, 1] keeps the metric defined when ``y`` holds one class,
        # as done for the eval sets in fit_one_step.
        return [
            log_loss(y_true=y, y_pred=prediction, labels=[0, 1])
            for prediction in predictions
        ]

    def get_model(self):
        """Return the list of fitted base learners, in fitting order."""
        return self.base_learners_list

    def _sigmoid(self, x):
        """Map scores ``x`` to probabilities with the logistic function."""
        return 1 / (1 + np.exp(-x))

    @staticmethod
    def _neg_gradient(y, y_hat):
        """Return ``y - y_hat``, the residual between labels and probabilities."""
        return y - y_hat
259
+
260
+
261
class BoostedTreeBaselearner(BaseLearnerClassInterface):
    """Regression-tree base learner whose leaf outputs can be re-optimised.

    A ``DecisionTreeRegressor`` is fitted to the pseudo-residuals. The value
    returned for each leaf is stored in ``leaf_gammas`` (leaf id -> output):
    either the tree's own leaf prediction, or, when the current scores and
    true labels are supplied, that prediction rescaled by the multiplier
    that minimises the logistic loss on the leaf.

    Args:
        **kwargs: Forwarded to ``DecisionTreeRegressor``.
    """

    def __init__(self, **kwargs):
        self.model = DecisionTreeRegressor(**kwargs)
        self.fitted_ = False
        self.leaf_gammas = {}  # leaf id -> value returned by predict()

    def fit(
        self,
        X: pd.DataFrame,
        neg_gradient: Iterable,
        current_f: np.ndarray = None,
        y_true: np.ndarray = None,
        **kwargs,
    ):
        """
        Parameters:
        -----------
        X : pd.DataFrame
            Features
        neg_gradient : Iterable
            Pseudo-residuals (negative gradient)
        current_f : np.ndarray
            Current model predictions (in log-odds space)
        y_true : np.ndarray
            True labels in {0, 1} format

        Returns:
        --------
        self
        """
        # First fit tree to pseudo-residuals
        self.model.fit(X, neg_gradient)

        # Drop leaf values left over from a previous fit: leaf ids of the old
        # tree are meaningless for the new one.
        self.leaf_gammas = {}

        if current_f is not None and y_true is not None:
            self._optimize_leaf_gammas(X, y_true, current_f)
        else:
            # Fallback: use the tree's own leaf predictions as-is.
            leaf_indices = self.model.apply(X)
            tree_predictions = self.model.predict(X)
            # first_rows[k] is the first sample that falls in unique_leaves[k];
            # every sample of a leaf shares the same tree prediction.
            unique_leaves, first_rows = np.unique(leaf_indices, return_index=True)
            for leaf_id, first_row in zip(unique_leaves, first_rows):
                self.leaf_gammas[leaf_id] = tree_predictions[first_row]

        self.fitted_ = True
        return self

    def _optimize_leaf_gammas(
        self, X: pd.DataFrame, y_true: np.ndarray, current_f: np.ndarray
    ):
        """
        For each leaf, find the multiplier ``gamma`` of the tree's leaf
        prediction ``h`` that minimises the logistic loss of
        ``current_f + gamma * h`` on the samples of that leaf, and store the
        resulting leaf output ``gamma * h``.
        Works with y_true in {0, 1}
        """
        y_true = np.asarray(y_true)
        current_f = np.asarray(current_f)

        # Leaf assignment and raw tree output for every sample (one pass).
        leaf_indices = self.model.apply(X)
        tree_predictions = self.model.predict(X)
        unique_leaves, first_rows = np.unique(leaf_indices, return_index=True)

        # Candidate multipliers for the line search.
        gamma_grid = np.linspace(-10, 10, 100)

        for leaf_id, first_row in zip(unique_leaves, first_rows):
            mask = leaf_indices == leaf_id

            # Tree's prediction for this leaf (identical for all its samples)
            h_pred_leaf = tree_predictions[first_row]
            y_leaf = y_true[mask]  # values in {0, 1}
            f_leaf = current_f[mask]  # log-odds

            def loss(gamma):
                # Binary cross-entropy of sigmoid(f + gamma * h) on the leaf;
                # clipping / epsilon keep exp and log finite.
                f_new = f_leaf + gamma * h_pred_leaf
                p = 1 / (1 + np.exp(-np.clip(f_new, -500, 500)))
                return -np.sum(
                    y_leaf * np.log(p + 1e-15) + (1 - y_leaf) * np.log(1 - p + 1e-15)
                )

            # Simple line search; gamma = 0 (no update) is the baseline and
            # only a strictly smaller loss replaces it.
            best_gamma = 0.0
            best_loss = loss(0.0)
            for gamma in gamma_grid:
                current_loss = loss(gamma)
                if current_loss < best_loss:
                    best_loss = current_loss
                    best_gamma = gamma

            # Store the step that was actually optimised (gamma * h), not the
            # bare multiplier: predict() returns this value directly, and the
            # fallback path in fit() stores leaf outputs on the same scale.
            self.leaf_gammas[leaf_id] = best_gamma * h_pred_leaf

    def predict(self, X: pd.DataFrame, **kwargs):
        """
        Predict the stored (possibly re-optimised) output of the leaf each
        sample falls into.

        Raises:
            ValueError: If the model has not been fitted.
        """
        if not self.fitted_:
            raise ValueError("Model not fitted yet")

        # Get leaf assignments
        leaf_indices = self.model.apply(X)

        # Map each sample to its leaf's stored output
        predictions = np.array(
            [self.leaf_gammas.get(leaf_id, 0.0) for leaf_id in leaf_indices]
        )

        return predictions
375
+
376
+
377
class FirstConstantBaseLearner(BaseLearnerClassInterface):
    """Constant first base learner for binary classification.

    After fitting on labels in {0, 1}, it predicts for every row the log-odds
    ``log(p / (1 - p))`` of the mean label ``p`` of the training set.
    """

    # The mean label is clipped to [_EPS, 1 - _EPS] before taking the logit
    # so that a single-class training set gives a large finite score instead
    # of +/-inf (which would make every later boosting update a no-op).
    _EPS = 1e-15

    def __init__(self):
        self.fitted_ = False

    def fit(self, X: pd.DataFrame, y: Iterable, **kwargs):
        """Store the log-odds of the mean of ``y``; ``X`` is not used.

        Raises:
            ValueError: If ``y`` is empty.
        """
        y_array = np.asarray(y)
        if y_array.size == 0:
            raise ValueError("Cannot fit FirstConstantBaseLearner on empty y")
        self.unique_classes_ = np.unique(y_array)
        self._predict_value = self._log_odds(np.mean(y_array))
        self.fitted_ = True
        return self

    def predict(self, X: pd.DataFrame, **kwargs):
        """Return the fitted constant, repeated once per row of ``X``."""
        return np.full(len(X), self._predict_value)

    def _log_odds(self, mean_y):
        """Return ``log(p / (1 - p))`` with ``p`` clipped away from 0 and 1."""
        clipped = np.clip(mean_y, self._EPS, 1 - self._EPS)
        return np.log(clipped / (1 - clipped))