path-boost 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- path_boost/__init__.py +18 -0
- path_boost/_path_boost.py +1096 -0
- path_boost/_version.py +24 -0
- path_boost/utils/__init__.py +2 -0
- path_boost/utils/classes/__init__.py +0 -0
- path_boost/utils/classes/additive_model_wrapper.py +301 -0
- path_boost/utils/classes/additive_model_wrapper_classifier.py +394 -0
- path_boost/utils/classes/extended_boosting_matrix.py +596 -0
- path_boost/utils/classes/interfaces/__init__.py +0 -0
- path_boost/utils/classes/interfaces/interface_base_learner.py +30 -0
- path_boost/utils/classes/interfaces/interface_selector.py +27 -0
- path_boost/utils/classes/sequential_path_boost.py +1023 -0
- path_boost/utils/classes/sequential_path_boost_classifier.py +840 -0
- path_boost/utils/cross_validation.py +49 -0
- path_boost/utils/cyclic_path_boost_utils.py +76 -0
- path_boost/utils/datasets_for_examples/__init__.py +2 -0
- path_boost/utils/datasets_for_examples/generate_example_dataset.py +304 -0
- path_boost/utils/discovery.py +217 -0
- path_boost/utils/plots_functions.py +153 -0
- path_boost/utils/validate_data.py +223 -0
- path_boost/utils/variable_importance_according_to_path_boost.py +341 -0
- path_boost-2.1.0.dist-info/METADATA +174 -0
- path_boost-2.1.0.dist-info/RECORD +26 -0
- path_boost-2.1.0.dist-info/WHEEL +5 -0
- path_boost-2.1.0.dist-info/licenses/LICENSE +21 -0
- path_boost-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import copy
|
|
3
|
+
import numpy as np
|
|
4
|
+
import warnings
|
|
5
|
+
import matplotlib.pyplot as plt
|
|
6
|
+
from sklearn.metrics import log_loss, accuracy_score
|
|
7
|
+
from .extended_boosting_matrix import ExtendedBoostingMatrix
|
|
8
|
+
from typing import Iterable
|
|
9
|
+
from .interfaces.interface_base_learner import BaseLearnerClassInterface
|
|
10
|
+
from sklearn.tree import DecisionTreeRegressor
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class AdditiveModelWrapperClassifier:
    """Additive binary classifier trained one boosting step at a time.

    The model output ``F(x)`` is accumulated in log-odds space and turned
    into a probability with a sigmoid. The first learner is a
    ``FirstConstantBaseLearner``; every later learner is an instance of
    ``BaseModelClass`` fitted on the centred negative gradient ``y - p``.

    Args:
        BaseModelClass: Class of the base learners; must be a subclass of
            ``BaseLearnerClassInterface``. Ignored (replaced by
            ``BoostedTreeBaselearner``) when ``use_tree_boost`` is True.
        base_model_class_kwargs: Keyword arguments used to instantiate each
            base learner, or None to call the class with no arguments.
        learning_rate: Shrinkage applied to every learner except the first.
        use_tree_boost: If True, use ``BoostedTreeBaselearner`` and pass it
            the current model output and the true labels at fit time.

    Raises:
        TypeError: If the base learner class does not subclass
            ``BaseLearnerClassInterface``.
    """

    def __init__(
        self,
        BaseModelClass,
        base_model_class_kwargs,
        learning_rate: float,
        use_tree_boost: bool = False,
    ):
        self.use_tree_boost = use_tree_boost
        if self.use_tree_boost:
            # Tree-boost mode overrides whatever class the caller passed.
            BaseModelClass = BoostedTreeBaselearner

        # Ensure BaseModelClass respects BaseLearnerClassInterface
        if not issubclass(BaseModelClass, BaseLearnerClassInterface):
            raise TypeError(
                f"{BaseModelClass.__name__} must implement BaseLearnerClassInterface"
            )

        self._last_train_prediction: pd.Series | None = None

        # Per-step training metrics.
        self.train_logloss = []
        self.train_accuracy = []
        # One inner list per evaluation set, one entry per boosting step.
        self.eval_sets_logloss: list[list[float]] = []
        self.eval_sets_accuracy: list[list[float]] = []
        self.learning_rate = learning_rate
        self.base_learners_list: list[BaseLearnerClassInterface] = []
        # Columns used by each learner, aligned with ``base_learners_list``.
        self.considered_columns = []
        self.BaseModelClass = BaseModelClass
        self.base_model_class_kwargs = base_model_class_kwargs

    def fit_one_step(self, X: pd.DataFrame, y, best_path, eval_set=None):
        """Fit one boosting step and update the train/eval metrics.

        Args:
            X: Training features.
            y: Training labels (the log-odds initialisation and the
                ``labels=[0, 1]`` log-loss assume values in {0, 1}).
            best_path: Path forwarded to
                ``ExtendedBoostingMatrix.get_columns_related_to_path`` to
                select the columns the new learner is fitted on.
            eval_set: Optional sequence whose entries are either None or a
                ``(pd.DataFrame, y)`` tuple.

        Returns:
            self

        Raises:
            TypeError: If an evaluation entry's features are not a DataFrame.
        """
        columns_to_keep = ExtendedBoostingMatrix.get_columns_related_to_path(
            best_path, X.columns
        )
        restricted_df = X[columns_to_keep]
        # Work on a plain array so that arithmetic with the stored model
        # output is positional and never re-aligned on the index of ``y``.
        y_array = np.asarray(y)

        self.trained_ = True
        if eval_set is not None and not hasattr(self, "_last_eval_set_prediction_"):
            # Running model output F for every evaluation set, starting at 0.
            self._last_eval_set_prediction_ = [
                None
                if eval_tuple is None
                else pd.Series(
                    np.zeros(len(eval_tuple[0])), index=eval_tuple[0].index
                )
                for eval_tuple in eval_set
            ]

        is_first_step = len(self.base_learners_list) == 0
        if is_first_step:
            # First step: constant log-odds, no gradient to compute.
            new_base_learner = FirstConstantBaseLearner()
            self._target_variable_mean_ = [0.0]
            new_base_learner.fit(restricted_df, y_array)
            # The initial constant is never shrunk (see predict_step_by_step).
            step_learning_rate = 1.0
        else:
            if self.base_model_class_kwargs is not None:
                new_base_learner = self.BaseModelClass(**self.base_model_class_kwargs)
            else:
                new_base_learner = self.BaseModelClass()

            new_y = np.asarray(
                self._neg_gradient(
                    y=y_array,
                    y_hat=np.asarray(self._last_train_prediction_probability),
                )
            )
            # The learner is fitted on the centred gradient; the removed mean
            # is stored and added back whenever the learner is applied.
            self._target_variable_mean_.append(new_y.mean())
            new_y = new_y - self._target_variable_mean_[-1]

            if self.use_tree_boost:
                new_base_learner.fit(
                    restricted_df,
                    neg_gradient=new_y,
                    current_f=self._last_train_model_output_F.values,
                    y_true=y_array,
                )
            else:
                new_base_learner.fit(restricted_df, new_y)
            step_learning_rate = self.learning_rate

        self.base_learners_list.append(new_base_learner)
        self.considered_columns.append(columns_to_keep)

        # Contribution of the new learner to the model output F(x).
        train_increment = self._target_variable_mean_[
            -1
        ] + step_learning_rate * np.asarray(new_base_learner.predict(restricted_df))
        if is_first_step:
            self._last_train_model_output_F = pd.Series(train_increment)
        else:
            self._last_train_model_output_F += train_increment

        # we transform the model prediction F into probability (p(x) or y_hat)
        self._last_train_prediction_probability = self._sigmoid(
            self._last_train_model_output_F
        )

        # this gives us the predicted class
        self._last_train_prediction = (
            self._last_train_prediction_probability >= 0.5
        ).astype(int)

        self.train_logloss.append(
            log_loss(
                y_true=y,
                y_pred=self._last_train_prediction_probability,
                labels=[0, 1],
            )
        )
        self.train_accuracy.append(
            accuracy_score(y_true=y, y_pred=self._last_train_prediction)
        )

        if eval_set is not None:
            self._update_eval_sets(
                eval_set, new_base_learner, columns_to_keep, step_learning_rate
            )

        return self

    def _update_eval_sets(
        self, eval_set, new_base_learner, columns_to_keep, step_learning_rate
    ):
        """Add the newest learner to every evaluation set and log its metrics."""
        step_offset = self._target_variable_mean_[-1]
        step_logloss = [None] * len(eval_set)
        step_accuracy = [None] * len(eval_set)

        for i, eval_tuple in enumerate(eval_set):
            if eval_tuple is None:
                self._last_eval_set_prediction_[i] = None
                continue
            ebm_df_eval, y_eval = eval_tuple
            if not isinstance(ebm_df_eval, pd.DataFrame):
                raise TypeError(
                    "each eval_set entry must be None or a (pd.DataFrame, y) tuple"
                )

            # Same offset and shrinkage as the training update, so eval
            # metrics match what predict_step_by_step would produce.
            self._last_eval_set_prediction_[i] += (
                step_offset
                + step_learning_rate
                * np.asarray(new_base_learner.predict(ebm_df_eval[columns_to_keep]))
            )

            # we transform the model prediction F into probability (p(x) or y_hat)
            eval_probability = self._sigmoid(self._last_eval_set_prediction_[i])

            # this gives us the predicted class
            self._last_eval_set_prediction = (eval_probability >= 0.5).astype(int)

            step_logloss[i] = log_loss(
                y_true=y_eval,
                y_pred=eval_probability,
                labels=[0, 1],
            )
            step_accuracy[i] = accuracy_score(
                y_true=y_eval, y_pred=self._last_eval_set_prediction
            )

        self._append_step_metrics(self.eval_sets_logloss, step_logloss)
        self._append_step_metrics(self.eval_sets_accuracy, step_accuracy)

    @staticmethod
    def _append_step_metrics(history, step_values):
        """Append one value per evaluation set to ``history`` (list of lists)."""
        if len(history) == 0:
            history.extend([value] for value in step_values)
        else:
            for i, value in enumerate(step_values):
                history[i].append(value)

    def predict(self, X: pd.DataFrame, class_probability: bool = False, **kwargs):
        """Return the prediction of the full model (last boosting step).

        Args:
            X: Input features.
            class_probability: If True return probabilities, otherwise the
                0/1 class obtained by thresholding the probability at 0.5.
            **kwargs: Forwarded to each base learner's ``predict``.
        """
        predictions = self.predict_step_by_step(
            X, return_class_probability=class_probability, **kwargs
        )
        return predictions[-1]

    def predict_step_by_step(
        self, X: pd.DataFrame, return_class_probability=False, **kwargs
    ) -> list[np.ndarray]:
        """
        Generates predictions for each boosting iteration step.

        Args:
            X (pd.DataFrame): Input features for prediction.
            return_class_probability (bool, optional): If True, returns class probabilities for each step.
                If False, returns binary class predictions. Defaults to False.
            **kwargs: Additional keyword arguments passed to the base learner's predict method.

        Returns:
            list[np.ndarray]: List of predictions for each boosting step. Each element is either
                an array of class probabilities or binary predictions, depending on `return_class_probability`.
        """
        prediction = []
        last_prediction_model_F = np.zeros(len(X))
        for i, base_learner in enumerate(self.base_learners_list):
            chosen_columns = self.considered_columns[i]
            # the first base learner is not scaled by the learning rate
            # because it is the constant initial log-odds
            learning_rate = 1.0 if i == 0 else self.learning_rate
            last_prediction_model_F += self._target_variable_mean_[
                i
            ] + learning_rate * np.array(
                base_learner.predict(X[chosen_columns], **kwargs)
            )

            last_prediction_probability = self._sigmoid(last_prediction_model_F)
            if return_class_probability:
                prediction.append(last_prediction_probability)
            else:
                prediction.append((last_prediction_probability >= 0.5).astype(int))

        return prediction

    def evaluate(self, X: pd.DataFrame, y: Iterable, **kwargs) -> list[float]:
        """Return the log-loss on ``(X, y)`` after each boosting step.

        ``labels=[0, 1]`` is passed to ``log_loss`` (as for the evaluation
        sets tracked during fitting) so that a sample containing a single
        class can still be scored.
        """
        predictions = self.predict_step_by_step(
            X, return_class_probability=True, **kwargs
        )
        return [
            log_loss(y_true=y, y_pred=prediction, labels=[0, 1])
            for prediction in predictions
        ]

    def get_model(self):
        """Return the list of fitted base learners, in fitting order."""
        return self.base_learners_list

    def _sigmoid(self, x):
        """Map log-odds to probabilities."""
        return 1 / (1 + np.exp(-x))

    @staticmethod
    def _neg_gradient(y, y_hat):
        """Negative gradient used as boosting target: label minus probability."""
        return y - y_hat
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
class BoostedTreeBaselearner(BaseLearnerClassInterface):
    """Regression-tree base learner with per-leaf re-optimised outputs.

    A ``DecisionTreeRegressor`` is fitted on the pseudo-residuals to define
    the leaves. When the current model output and the true labels are
    available, the output of each leaf is rescaled by a grid line search on
    the logistic loss; otherwise the tree's own leaf value is used.

    Args:
        **kwargs: Forwarded to ``DecisionTreeRegressor``.
    """

    def __init__(self, **kwargs):
        self.model = DecisionTreeRegressor(**kwargs)
        self.fitted_ = False
        # leaf id -> value returned by ``predict`` for samples in that leaf
        self.leaf_gammas = {}

    def fit(
        self,
        X: pd.DataFrame,
        neg_gradient: Iterable,
        current_f: np.ndarray = None,
        y_true: np.ndarray = None,
        **kwargs,
    ):
        """
        Parameters:
        -----------
        X : pd.DataFrame
            Features
        neg_gradient : Iterable
            Pseudo-residuals (negative gradient)
        current_f : np.ndarray
            Current model predictions (in log-odds space)
        y_true : np.ndarray
            True labels in {0, 1} format

        Returns:
        --------
        self
        """
        # First fit tree to pseudo-residuals
        self.model.fit(X, neg_gradient)

        # Leaf ids belong to the tree just fitted; drop any left over from a
        # previous call so they cannot leak into predict().
        self.leaf_gammas = {}

        # If we have current_f and y_true, optimize the output of each leaf
        if current_f is not None and y_true is not None:
            self._optimize_leaf_gammas(X, y_true, current_f)
        else:
            # Fallback: use tree's predictions as-is. All samples of a leaf
            # share one prediction, so the first sample of each leaf suffices.
            unique_leaves, first_rows = np.unique(
                self.model.apply(X), return_index=True
            )
            tree_predictions = self.model.predict(X)
            for leaf_id, row in zip(unique_leaves, first_rows):
                self.leaf_gammas[leaf_id] = tree_predictions[row]

        self.fitted_ = True
        return self

    def _optimize_leaf_gammas(
        self, X: pd.DataFrame, y_true: np.ndarray, current_f: np.ndarray
    ):
        """
        For each leaf, find the multiplier ``gamma`` of the tree's leaf
        prediction ``h`` that minimizes the logistic loss of
        ``current_f + gamma * h`` over a fixed grid, and store the resulting
        step ``gamma * h`` as the leaf output.

        Works with y_true in {0, 1}
        """
        # Get leaf assignments and the tree's own prediction for all samples
        leaf_indices = self.model.apply(X)
        tree_predictions = self.model.predict(X)
        # Positional arrays, so boolean masks index them row by row
        y_true = np.asarray(y_true)
        current_f = np.asarray(current_f)

        # Search in reasonable range
        candidate_gammas = np.linspace(-10, 10, 100)

        for leaf_id in np.unique(leaf_indices):
            # Samples in this leaf (never empty: ids come from leaf_indices)
            mask = leaf_indices == leaf_id

            # Tree's base prediction for this leaf (all same value)
            h_pred_leaf = tree_predictions[mask][0]

            y_leaf = y_true[mask]  # Shape: (n_samples_in_leaf,), values in {0, 1}
            f_leaf = current_f[mask]  # Shape: (n_samples_in_leaf,), log-odds

            def loss(gamma):
                f_new = f_leaf + gamma * h_pred_leaf
                # Binary cross-entropy (logistic loss) for y in {0, 1}:
                # -[y*log(p) + (1-y)*log(1-p)] where p = sigmoid(f)
                p = 1 / (1 + np.exp(-np.clip(f_new, -500, 500)))  # sigmoid
                return -np.sum(
                    y_leaf * np.log(p + 1e-15) + (1 - y_leaf) * np.log(1 - p + 1e-15)
                )

            # Simple line search, starting from "no update"
            best_gamma = 0.0
            best_loss = loss(0.0)
            for gamma in candidate_gammas:
                current_loss = loss(gamma)
                if current_loss < best_loss:
                    best_loss = current_loss
                    best_gamma = gamma

            # predict() returns the stored value unchanged, so store the step
            # whose loss was actually minimised (gamma * h), not the bare
            # multiplier, which has the wrong sign whenever h is negative.
            self.leaf_gammas[leaf_id] = best_gamma * h_pred_leaf

    def predict(self, X: pd.DataFrame, **kwargs):
        """
        Predict using the stored per-leaf outputs instead of the tree's leaf values.

        Raises:
            ValueError: If called before ``fit``.
        """
        if not self.fitted_:
            raise ValueError("Model not fitted yet")

        # Get leaf assignments
        leaf_indices = self.model.apply(X)

        # Map each sample to its leaf's stored output (0.0 for unknown leaves)
        predictions = np.array(
            [self.leaf_gammas.get(leaf_id, 0.0) for leaf_id in leaf_indices]
        )

        return predictions
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
class FirstConstantBaseLearner(BaseLearnerClassInterface):
    """Constant base learner used as the first term in classification tasks.

    For every row it predicts the same value: the log-odds of the mean of
    the training labels (i.e. of the positive-class frequency when the
    labels are in {0, 1}).
    """

    # The mean label is kept this far from 0 and 1 so that the log-odds stay
    # finite when the training labels all belong to one class.
    _EPS = 1e-15

    def __init__(self):
        self.fitted_ = False

    def fit(self, X: pd.DataFrame, y: Iterable, **kwargs):
        """Store the log-odds of the mean of ``y``; ``X`` is not used.

        Returns:
            self
        """
        self.unique_classes_ = np.unique(y)
        self._predict_value = self._log_odds(np.mean(y))
        self.fitted_ = True
        return self

    def predict(self, X: pd.DataFrame, **kwargs):
        """Return an array of length ``len(X)`` filled with the stored log-odds.

        Raises:
            ValueError: If called before ``fit``.
        """
        if not self.fitted_:
            raise ValueError("Model not fitted yet")
        return np.full(len(X), self._predict_value)

    def _log_odds(self, mean_y):
        """Return ``log(p / (1 - p))`` with ``p`` clipped to ``[_EPS, 1 - _EPS]``."""
        mean_y = np.clip(mean_y, self._EPS, 1 - self._EPS)
        return np.log(mean_y / (1 - mean_y))
|