path-boost 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- path_boost/__init__.py +18 -0
- path_boost/_path_boost.py +1096 -0
- path_boost/_version.py +24 -0
- path_boost/utils/__init__.py +2 -0
- path_boost/utils/classes/__init__.py +0 -0
- path_boost/utils/classes/additive_model_wrapper.py +301 -0
- path_boost/utils/classes/additive_model_wrapper_classifier.py +394 -0
- path_boost/utils/classes/extended_boosting_matrix.py +596 -0
- path_boost/utils/classes/interfaces/__init__.py +0 -0
- path_boost/utils/classes/interfaces/interface_base_learner.py +30 -0
- path_boost/utils/classes/interfaces/interface_selector.py +27 -0
- path_boost/utils/classes/sequential_path_boost.py +1023 -0
- path_boost/utils/classes/sequential_path_boost_classifier.py +840 -0
- path_boost/utils/cross_validation.py +49 -0
- path_boost/utils/cyclic_path_boost_utils.py +76 -0
- path_boost/utils/datasets_for_examples/__init__.py +2 -0
- path_boost/utils/datasets_for_examples/generate_example_dataset.py +304 -0
- path_boost/utils/discovery.py +217 -0
- path_boost/utils/plots_functions.py +153 -0
- path_boost/utils/validate_data.py +223 -0
- path_boost/utils/variable_importance_according_to_path_boost.py +341 -0
- path_boost-2.1.0.dist-info/METADATA +174 -0
- path_boost-2.1.0.dist-info/RECORD +26 -0
- path_boost-2.1.0.dist-info/WHEEL +5 -0
- path_boost-2.1.0.dist-info/licenses/LICENSE +21 -0
- path_boost-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,840 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import numbers
|
|
3
|
+
|
|
4
|
+
import matplotlib.pyplot as plt
|
|
5
|
+
import networkx as nx
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger("path_boost")
|
|
10
|
+
|
|
11
|
+
from .interfaces.interface_base_learner import BaseLearnerClassInterface
|
|
12
|
+
from .interfaces.interface_selector import SelectorClassInterface
|
|
13
|
+
from ..validate_data import util_validate_data
|
|
14
|
+
from ..variable_importance_according_to_path_boost import (
|
|
15
|
+
VariableImportance_ForSequentialPathBoost,
|
|
16
|
+
)
|
|
17
|
+
from ..plots_functions import (
|
|
18
|
+
plot_training_and_eval_errors,
|
|
19
|
+
plot_variable_importance_utils,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
from sklearn.base import BaseEstimator
|
|
23
|
+
from sklearn.base import RegressorMixin
|
|
24
|
+
from .extended_boosting_matrix import ExtendedBoostingMatrix
|
|
25
|
+
from typing import Iterable
|
|
26
|
+
from sklearn.tree import DecisionTreeRegressor, plot_tree
|
|
27
|
+
from .additive_model_wrapper_classifier import AdditiveModelWrapperClassifier
|
|
28
|
+
from sklearn.metrics import mean_squared_error
|
|
29
|
+
from matplotlib.ticker import MaxNLocator
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class SequentialPathBoostClassifier(BaseEstimator, RegressorMixin):
|
|
33
|
+
def __init__(
    self,
    n_iter=100,
    max_path_length=10,
    learning_rate=0.1,
    patience=None,
    target_error=None,
    BaseLearnerClass=DecisionTreeRegressor,
    kwargs_for_base_learner=None,
    SelectorClass=DecisionTreeRegressor,
    kwargs_for_selector=None,
    parameters_variable_importance=None,
    replace_nan_with=np.nan,
    verbose=False,
    use_tree_boost=False,
):
    """
    Store the hyper-parameters of the sequential path-boosting classifier.

    The constructor only records its arguments; all work happens in `fit`.

    Parameters
    ----------
    n_iter : int, default=100
        Number of boosting iterations that `fit` will attempt.
    max_path_length : int, default=10
        Paths whose length reaches this value are still recorded as selected
        but are not used to generate new columns of the Extended Boosting
        Matrix (EBM).
    learning_rate : float, default=0.1
        Forwarded to the `AdditiveModelWrapperClassifier` that fits each step.
    patience : int, optional, default=None
        Forwarded to the early-stopping check, which is only run when an
        `eval_set` is given to `fit`. The check looks at the loss history of
        the first evaluation set (`eval_sets_logloss[0]` of the wrapper).
    target_error : float, optional, default=None
        Forwarded to the early-stopping check: training stops once the latest
        loss on the first evaluation set is at or below this value.
    BaseLearnerClass : type, default=sklearn.tree.DecisionTreeRegressor
        Class of the base learner handed to the additive model wrapper.
    kwargs_for_base_learner : dict, default=None
        Keyword arguments for `BaseLearnerClass`.
    SelectorClass : type, default=sklearn.tree.DecisionTreeRegressor
        Class of the model used to pick the best path at each iteration; it
        must expose `feature_importances_` after `fit`.
    kwargs_for_selector : dict, default=None
        Keyword arguments for `SelectorClass`.
    parameters_variable_importance : dict, default=None
        Keyword arguments for `VariableImportance_ForSequentialPathBoost`.
        When None, variable importance is not computed.
    replace_nan_with : any, default=np.nan
        Value passed to the EBM helpers to stand in for missing entries.
    verbose : bool, default=False
        When True, `fit` logs the iteration number and the selected path.
    use_tree_boost : bool, default=False
        Forwarded to the additive model wrapper.
    """
    # Boosting schedule
    self.n_iter = n_iter
    self.learning_rate = learning_rate
    self.max_path_length = max_path_length

    # Early stopping
    self.patience = patience
    self.target_error = target_error

    # Base learner and path selector
    self.BaseLearnerClass = BaseLearnerClass
    self.kwargs_for_base_learner = kwargs_for_base_learner
    self.SelectorClass = SelectorClass
    self.kwargs_for_selector = kwargs_for_selector
    self.use_tree_boost = use_tree_boost

    # Everything else
    self.parameters_variable_importance = parameters_variable_importance
    self.replace_nan_with = replace_nan_with
    self.verbose = verbose
|
|
106
|
+
|
|
107
|
+
def fit(
    self,
    X: list[nx.Graph],
    y: np.array,
    list_anchor_nodes_labels: list[tuple],
    anchor_nodes_label_name,
    eval_set: list[tuple[list[nx.Graph], Iterable]] = None,
):
    """
    Fits the SequentialPathBoost classifier to the training data.

    At each iteration the method:
    1. Selects the 'best path' among the columns of the training Extended
       Boosting Matrix (EBM) with the selector model. The selector target is
       `y` on the first iteration and the negative gradient computed by
       `AdditiveModelWrapperClassifier._neg_gradient` afterwards.
    2. Updates the variable-importance bookkeeping (when enabled). This must
       happen after the path selection and before the training EBM is
       expanded.
    3. Adds the columns related to the best path to every evaluation EBM.
    4. Fits one more step of the additive model on the best path.
    5. Checks the early-stopping condition (only when `eval_set` is given).
    6. Expands the training EBM with the columns generated from the best path.

    After the loop the training logloss/accuracy histories, the evaluation
    ones (when `eval_set` is given), the variable importance (when enabled)
    and the final training column names are stored on the estimator.

    Parameters
    ----------
    X : list[nx.Graph]
        Training graphs.
    y : np.array
        Targets corresponding to `X`.
    list_anchor_nodes_labels : list[tuple]
        Labels identifying the anchor nodes used to initialise the EBM.
    anchor_nodes_label_name : str
        Name of the node attribute holding the labels.
    eval_set : list[tuple[list[nx.Graph], Iterable]], optional, default=None
        (X_eval, y_eval) tuples monitored during training. Entries may be None.

    Returns
    -------
    self : object
        The fitted estimator.
    """
    # Configure logging based on verbose flag
    if self.verbose:
        package_logger = logging.getLogger("path_boost")
        package_logger.setLevel(logging.INFO)
        if not package_logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter("%(message)s"))
            package_logger.addHandler(handler)

    self._default_kwargs_for_base_learner = {
        "max_depth": 3,
        "random_state": 0,
        "splitter": "best",
    }

    self._default_kwargs_for_selector = {
        "max_depth": 1,
        "random_state": 0,
        "splitter": "best",
        "criterion": "squared_error",
    }

    self._validate_data(
        X=X,
        y=y,
        list_anchor_nodes_labels=list_anchor_nodes_labels,
        name_of_label_attribute=anchor_nodes_label_name,
        eval_set=eval_set,
        parameters_variable_importance=self.parameters_variable_importance,
        patience=self.patience,
    )

    self.is_fitted_ = True

    self.name_of_label_attribute_ = anchor_nodes_label_name

    self.paths_selected_by_epb_ = set()
    self._initialize_path_boosting(
        X=X,
        list_anchor_nodes_labels=list_anchor_nodes_labels,
        main_label_name=anchor_nodes_label_name,
        eval_set=eval_set,
    )

    if self.parameters_variable_importance is not None:
        self.class_variable_importance_: VariableImportance_ForSequentialPathBoost = VariableImportance_ForSequentialPathBoost(
            **self.parameters_variable_importance
        )

    for n_iteration in range(self.n_iter):
        if self.verbose:
            logger.info(f"iteration number: {n_iteration + 1}")

        # Flag read by the variable-importance code to make sure it works on
        # the EBM of the current iteration (i.e. before it is expanded).
        self._ebm_has_been_expanded_in_this_iteration = False

        # First iteration: select against the raw target. Later iterations:
        # select against the negative gradient of the current predictions.
        if n_iteration == 0:
            gradient = y
            selector_target = y
        else:
            gradient = AdditiveModelWrapperClassifier._neg_gradient(
                y=y,
                y_hat=np.array(
                    self.base_learner_._last_train_prediction.to_numpy()
                ),
            )
            selector_target = pd.Series(gradient)

        best_path = self._find_best_path(
            train_ebm_dataframe=self.train_ebm_dataframe_,
            y=selector_target,
            SelectorClass=self.SelectorClass,
            kwargs_for_selector=self.kwargs_for_selector,
        )

        if self.verbose:
            logger.info(f"Best path: {best_path}")

        # Collect values for variable importance. This has to sit between the
        # selection of the best path and the expansion of the EBM dataframe.
        if self.parameters_variable_importance is not None:
            self.class_variable_importance_._update(
                path_boost=self,
                selected_path=best_path,
                iteration_number=n_iteration,
                gradient=gradient,
            )

        # expand the EVAL set in order to contain the selected columns path
        self._expand_eval_ebm_dataframe_with_best_path(
            best_path=best_path,
            main_label_name=anchor_nodes_label_name,
            eval_set=eval_set,
        )

        self.base_learner_.fit_one_step(
            X=self.train_ebm_dataframe_,
            y=y,
            best_path=best_path,
            eval_set=self.eval_set_ebm_df_and_target_,
        )

        if eval_set is not None:
            if self._check_if_stop_early(
                mse_eval_set=self.base_learner_.eval_sets_logloss[0],
                patience=self.patience,
                target_error=self.target_error,
            ):
                if self.verbose:
                    logger.info(
                        f"Early stopping at iteration {n_iteration + 1} due to no"
                        " improvement in evaluation set logloss."
                    )
                # A base learner has just been fitted on `best_path`, but the
                # break below skips `_expand_ebm_dataframe`, which is the only
                # other place where a path is recorded as selected. Record it
                # here so that `generate_ebm_for_dataset` also builds the
                # columns this last step relies on.
                self.paths_selected_by_epb_.add(best_path)
                # NOTE(review): this overwrites the `n_iter` hyper-parameter
                # and stores the 0-based index of the stopping iteration
                # (n_iteration + 1 steps were fitted) - confirm downstream
                # consumers expect this value.
                self.n_iter = n_iteration
                break

        # expand the ebm dataframe with the new columns starting from the selected path
        self._expand_ebm_dataframe(
            X=X, selected_path=best_path, main_label_name=anchor_nodes_label_name
        )

    self.train_logloss_ = self.base_learner_.train_logloss
    self.train_accuracy_ = self.base_learner_.train_accuracy

    if self.parameters_variable_importance is not None:
        self.variable_importance_: dict = (
            self.class_variable_importance_.compute_variable_importance(
                path_boost=self
            )
        )

    if eval_set is not None:
        self.eval_sets_logloss_ = self.base_learner_.eval_sets_logloss
        self.eval_sets_accuracy_ = self.base_learner_.eval_sets_accuracy

    self.columns_names_ = self.train_ebm_dataframe_.columns

    return self
|
|
307
|
+
|
|
308
|
+
def _check_if_stop_early(
|
|
309
|
+
self,
|
|
310
|
+
mse_eval_set: list[float],
|
|
311
|
+
patience: int | None = None,
|
|
312
|
+
target_error: float | None = None,
|
|
313
|
+
) -> bool:
|
|
314
|
+
"""
|
|
315
|
+
Determines whether to stop the training process early based on evaluation metrics.
|
|
316
|
+
|
|
317
|
+
Early stopping can be triggered under two conditions:
|
|
318
|
+
1. If a `target_error` is specified: Training stops if the Mean Squared Error (MSE)
|
|
319
|
+
on the (first) evaluation set falls at or below this target.
|
|
320
|
+
2. If `patience` is specified: Training stops if the MSE on the (first) evaluation
|
|
321
|
+
set has not improved (i.e., decreased) for a consecutive number of iterations
|
|
322
|
+
equal to `patience`. An "improvement" is defined as the current MSE being strictly
|
|
323
|
+
less than the MSE `patience` iterations ago. If the MSE remains the same or
|
|
324
|
+
increases for `patience` iterations, training stops.
|
|
325
|
+
|
|
326
|
+
Parameters
|
|
327
|
+
----------
|
|
328
|
+
mse_eval_set : list[float]
|
|
329
|
+
A list of Mean Squared Errors (MSE) recorded for the first evaluation set
|
|
330
|
+
at each iteration so far.
|
|
331
|
+
patience : int or None, optional
|
|
332
|
+
The number of iterations to wait for an improvement before stopping.
|
|
333
|
+
If None, this condition for early stopping is disabled.
|
|
334
|
+
target_error : float or None, optional
|
|
335
|
+
A specific MSE value. If the evaluation MSE reaches this value or lower,
|
|
336
|
+
training stops. If None, this condition is disabled.
|
|
337
|
+
|
|
338
|
+
Returns
|
|
339
|
+
-------
|
|
340
|
+
bool
|
|
341
|
+
True if the conditions for early stopping are met, False otherwise.
|
|
342
|
+
Returns False if `patience` is None and `target_error` is None, or if
|
|
343
|
+
insufficient iterations have passed to evaluate the patience condition.
|
|
344
|
+
"""
|
|
345
|
+
|
|
346
|
+
if target_error is not None:
|
|
347
|
+
# If a target error is specified, check if the last MSE is less than or equal to the target error
|
|
348
|
+
if mse_eval_set and mse_eval_set[-1] <= target_error:
|
|
349
|
+
return True
|
|
350
|
+
else:
|
|
351
|
+
return False
|
|
352
|
+
|
|
353
|
+
if patience is None:
|
|
354
|
+
return False
|
|
355
|
+
|
|
356
|
+
if len(mse_eval_set) < patience:
|
|
357
|
+
return False
|
|
358
|
+
|
|
359
|
+
# Check if the last `patience` MSE values are all greater than or equal to the last MSE value
|
|
360
|
+
return all(mse >= mse_eval_set[-1] for mse in mse_eval_set[-patience:])
|
|
361
|
+
|
|
362
|
+
def _expand_eval_ebm_dataframe_with_best_path(
|
|
363
|
+
self, best_path, main_label_name, eval_set=None
|
|
364
|
+
):
|
|
365
|
+
# we expand the ebm dataframe ONLY by adding the new columns related to the best path, we are not exploring new paths
|
|
366
|
+
if eval_set is not None:
|
|
367
|
+
columns_names = ExtendedBoostingMatrix.get_columns_related_to_path(
|
|
368
|
+
best_path, self.train_ebm_dataframe_.columns
|
|
369
|
+
)
|
|
370
|
+
for eval_set_number, eval_set_tuple in enumerate(eval_set):
|
|
371
|
+
if eval_set_tuple is None:
|
|
372
|
+
continue
|
|
373
|
+
eval_set_dataset, y_eval_set = eval_set_tuple
|
|
374
|
+
# find the new columns in the eval set
|
|
375
|
+
missing_columns = [
|
|
376
|
+
col
|
|
377
|
+
for col in columns_names
|
|
378
|
+
if col
|
|
379
|
+
not in self.eval_set_ebm_df_and_target_[eval_set_number][0].columns
|
|
380
|
+
]
|
|
381
|
+
new_columns_for_eval_set = (
|
|
382
|
+
ExtendedBoostingMatrix.generate_new_columns_from_columns_names(
|
|
383
|
+
dataset=eval_set_dataset,
|
|
384
|
+
ebm_to_be_expanded=self.eval_set_ebm_df_and_target_[
|
|
385
|
+
eval_set_number
|
|
386
|
+
][0],
|
|
387
|
+
columns_names=missing_columns,
|
|
388
|
+
main_label_name=main_label_name,
|
|
389
|
+
replace_nan_with=self.replace_nan_with,
|
|
390
|
+
)
|
|
391
|
+
)
|
|
392
|
+
self.eval_set_ebm_df_and_target_[eval_set_number][0] = pd.concat(
|
|
393
|
+
[
|
|
394
|
+
self.eval_set_ebm_df_and_target_[eval_set_number][0],
|
|
395
|
+
new_columns_for_eval_set,
|
|
396
|
+
],
|
|
397
|
+
axis=1,
|
|
398
|
+
)
|
|
399
|
+
|
|
400
|
+
def generate_ebm_for_dataset(self, dataset: list[nx.Graph], columns_names=None):
    """
    Generates an Extended Boosting Matrix (EBM) for a given dataset of graphs.

    If `columns_names` is provided, exactly those columns are requested.
    Otherwise the columns related to every path stored in
    `self.paths_selected_by_epb_` are looked up in `self.columns_names_`,
    de-duplicated, and requested in the order in which they appear in
    `self.columns_names_`, so the result does not depend on set/hash
    iteration order.

    Parameters
    ----------
    dataset : list[nx.Graph]
        Graphs for which to generate the EBM.
    columns_names : list[str], optional
        Column names to include. Defaults to None (derive them from the
        paths selected during fitting).

    Returns
    -------
    pd.DataFrame
        The result of
        `ExtendedBoostingMatrix.generate_new_columns_from_columns_names`.

    Raises
    ------
    AssertionError
        If `self.is_fitted_` is falsy.
    """
    assert self.is_fitted_

    if columns_names is None:
        # dict keys give an insertion-ordered de-duplication
        related_columns = {}
        for path in self.paths_selected_by_epb_:
            related_columns.update(
                dict.fromkeys(
                    ExtendedBoostingMatrix.get_columns_related_to_path(
                        path=path, columns_names=self.columns_names_
                    )
                )
            )

        # Order by position in the training EBM. Names that are not training
        # columns (not expected) keep their first-seen order and go last,
        # because `sorted` is stable.
        training_position = {
            name: index for index, name in enumerate(self.columns_names_)
        }
        fallback_position = len(training_position)
        columns_names = sorted(
            related_columns,
            key=lambda name: training_position.get(name, fallback_position),
        )

    ebm_dataframe = ExtendedBoostingMatrix.generate_new_columns_from_columns_names(
        dataset=dataset,
        columns_names=columns_names,
        main_label_name=self.name_of_label_attribute_,
        replace_nan_with=self.replace_nan_with,
    )

    return ebm_dataframe
|
|
447
|
+
|
|
448
|
+
def predict(
    self,
    X: list[nx.Graph] | nx.Graph | None = None,
    ebm_dataframe: pd.DataFrame | None = None,
    class_probability: bool = False,
) -> list[numbers.Number]:
    """
    Predict with the fitted additive model.

    Accepts either graphs (`X`) or an already built Extended Boosting Matrix
    (`ebm_dataframe`). When `ebm_dataframe` is given it is used directly and
    `X` is ignored.

    Parameters
    ----------
    X : list[nx.Graph] | nx.Graph | None, default=None
        Graphs to predict on; a single graph is wrapped into a one-element
        list. Required if `ebm_dataframe` is not provided.
    ebm_dataframe : pd.DataFrame | None, default=None
        Pre-computed Extended Boosting Matrix. Required if `X` is not provided.
    class_probability : bool, default=False
        Forwarded unchanged to `AdditiveModelWrapperClassifier.predict`.

    Returns
    -------
    list[numbers.Number]
        Whatever `self.base_learner_.predict` returns for the matrix.

    Raises
    ------
    AssertionError
        If both `X` and `ebm_dataframe` are None, or if `self.is_fitted_` is
        falsy.
    """
    assert X is not None or ebm_dataframe is not None
    assert self.is_fitted_

    if ebm_dataframe is None:
        graphs = [X] if isinstance(X, nx.Graph) else X
        ebm_dataframe = self.generate_ebm_for_dataset(dataset=graphs)

    prediction = self.base_learner_.predict(
        X=ebm_dataframe, class_probability=class_probability
    )
    return prediction
|
|
490
|
+
|
|
491
|
+
def predict_step_by_step(
    self,
    X: list[nx.Graph] | nx.Graph | None = None,
    ebm_dataframe: pd.DataFrame | None = None,
    class_probability: bool = False,
) -> list[np.array]:
    """
    Return the predictions of the additive model at each boosting step.

    Accepts either graphs (`X`) or an already built Extended Boosting Matrix
    (`ebm_dataframe`). When `ebm_dataframe` is given it is used directly and
    `X` is ignored.

    Parameters
    ----------
    X : list[nx.Graph] | nx.Graph | None, default=None
        Graphs to predict on; a single graph is wrapped into a one-element
        list. Either `X` or `ebm_dataframe` must be provided.
    ebm_dataframe : pd.DataFrame | None, default=None
        Pre-computed Extended Boosting Matrix.
    class_probability : bool, default=False
        Forwarded unchanged to
        `AdditiveModelWrapperClassifier.predict_step_by_step`.

    Returns
    -------
    list[np.array]
        Whatever `self.base_learner_.predict_step_by_step` returns for the
        matrix.

    Raises
    ------
    AssertionError
        If both `X` and `ebm_dataframe` are None, or if `self.is_fitted_` is
        falsy.
    """
    assert X is not None or ebm_dataframe is not None
    assert self.is_fitted_

    if ebm_dataframe is None:
        graphs = [X] if isinstance(X, nx.Graph) else X
        ebm_dataframe = self.generate_ebm_for_dataset(dataset=graphs)

    per_step_predictions = self.base_learner_.predict_step_by_step(
        X=ebm_dataframe, class_probability=class_probability
    )
    return per_step_predictions
|
|
538
|
+
|
|
539
|
+
def evaluate(
    self,
    X: list[nx.Graph] | nx.Graph | None = None,
    y=None,
    ebm_dataframe: pd.DataFrame | None = None,
):
    """
    Evaluate the fitted additive model against known targets.

    Accepts either graphs (`X`) or an already built Extended Boosting Matrix
    (`ebm_dataframe`). When `ebm_dataframe` is given it is used directly and
    `X` is ignored.

    Parameters
    ----------
    X : list[nx.Graph] | nx.Graph | None, default=None
        Graphs to evaluate on; a single graph is wrapped into a one-element
        list.
    y : array-like
        True targets corresponding to `X` or `ebm_dataframe`.
    ebm_dataframe : pd.DataFrame | None, default=None
        Pre-computed Extended Boosting Matrix.

    Returns
    -------
    object
        Whatever `self.base_learner_.evaluate(ebm_dataframe, y)` returns;
        presumably the per-iteration evolution of the evaluation metric -
        confirm against `AdditiveModelWrapperClassifier.evaluate`.

    Raises
    ------
    AssertionError
        If `y` is None, if both `X` and `ebm_dataframe` are None, or if
        `self.is_fitted_` is falsy.
    """
    assert y is not None
    assert X is not None or ebm_dataframe is not None
    assert self.is_fitted_

    if ebm_dataframe is None:
        graphs = [X] if isinstance(X, nx.Graph) else X
        ebm_dataframe = self.generate_ebm_for_dataset(dataset=graphs)

    metric_history = self.base_learner_.evaluate(ebm_dataframe, y)
    return metric_history
|
|
581
|
+
|
|
582
|
+
def _expand_ebm_dataframe(
    self, X: list[nx.Graph], selected_path, main_label_name: str
):
    """
    Record `selected_path` and grow the training EBM from it when allowed.

    The per-iteration expansion flag is always raised. A path that was
    already recorded is left alone. A new path is recorded, and unless its
    length has reached `self.max_path_length`, the columns generated from it
    are appended to `self.train_ebm_dataframe_`.

    Parameters
    ----------
    X : list[nx.Graph]
        Training graphs.
    selected_path : tuple
        Path chosen in the current iteration.
    main_label_name : str
        Name of the node attribute holding the labels.
    """
    self._ebm_has_been_expanded_in_this_iteration = True

    # Nothing to do for a path that has been handled before.
    if selected_path in self.paths_selected_by_epb_:
        return

    self.paths_selected_by_epb_.add(selected_path)

    # Paths at the length limit are remembered but never extended.
    if len(selected_path) >= self.max_path_length:
        return

    extension_columns = (
        ExtendedBoostingMatrix.new_columns_to_expand_ebm_dataframe_with_path(
            dataset=X,
            selected_path=selected_path,
            main_label_name=main_label_name,
            df_to_be_expanded=self.train_ebm_dataframe_,
            replace_nan_with=self.replace_nan_with,
        )
    )
    self.train_ebm_dataframe_ = pd.concat(
        [self.train_ebm_dataframe_, extension_columns], axis=1
    )
|
|
604
|
+
|
|
605
|
+
def _initialize_path_boosting(
    self,
    X,
    list_anchor_nodes_labels: list,
    main_label_name: str,
    eval_set: list[tuple[list[nx.Graph], Iterable]] = None,
):
    """
    Build the initial state needed by the boosting loop.

    Creates the training Extended Boosting Matrix (EBM) from the anchor-node
    attributes, one `[ebm_dataframe, target]` entry per evaluation set (None
    entries of `eval_set` are kept as None placeholders so positions stay
    aligned), and a fresh `AdditiveModelWrapperClassifier`.

    Parameters
    ----------
    X : list[nx.Graph]
        Training graphs.
    list_anchor_nodes_labels : list
        Labels identifying the anchor nodes.
    main_label_name : str
        Name of the node attribute holding the labels.
    eval_set : list[tuple[list[nx.Graph], Iterable]], optional
        The evaluation sets passed to `fit`.
    """
    self.name_of_label_attribute = main_label_name

    def build_initial_ebm(graphs):
        # Same construction for the training and the evaluation datasets.
        return ExtendedBoostingMatrix.initialize_boosting_matrix_with_anchor_nodes_attributes(
            dataset=graphs,
            list_anchor_nodes_labels=list_anchor_nodes_labels,
            id_label_name=main_label_name,
            replace_nan_with=self.replace_nan_with,
        )

    # create the extended boosting matrix for the train dataset
    self.train_ebm_dataframe_ = build_initial_ebm(X)

    # create the extended boosting matrices for the eval datasets
    self.eval_set_ebm_df_and_target_ = []
    if eval_set is not None:
        for eval_tuple in eval_set:
            if eval_tuple is None:
                self.eval_set_ebm_df_and_target_.append(None)
                continue
            eval_graphs, eval_target = eval_tuple
            self.eval_set_ebm_df_and_target_.append(
                [build_initial_ebm(eval_graphs), eval_target]
            )

    # initialize base learner wrapper
    self.base_learner_: AdditiveModelWrapperClassifier = (
        AdditiveModelWrapperClassifier(
            BaseModelClass=self.BaseLearnerClass,
            base_model_class_kwargs=self.kwargs_for_base_learner,
            learning_rate=self.learning_rate,
            use_tree_boost=self.use_tree_boost,
        )
    )
|
|
653
|
+
|
|
654
|
+
@staticmethod
def _find_best_path(
    train_ebm_dataframe: pd.DataFrame, y, SelectorClass, kwargs_for_selector
) -> tuple[int]:
    """
    Return the path behind the most important column of the frequency matrix.

    A selector is fitted on the frequency view of the EBM; the column with
    the largest `feature_importances_` value is mapped back to its path.

    Note: this must remain a static method, because the variable importance
    class also calls it to compute importance by comparison.

    Parameters:
        train_ebm_dataframe (pd.DataFrame): Extended boosting matrix of the training set.
        y (array-like): Target values or negative gradient used for the selection.
        SelectorClass: Class of the selector; its fitted instance must expose `feature_importances_`.
        kwargs_for_selector (dict): Keyword arguments used to build `SelectorClass`.

    Returns:
        tuple[int]: The path corresponding to the most important column.
    """
    selector = SelectorClass(**kwargs_for_selector)
    frequency_matrix = ExtendedBoostingMatrix.get_frequency_boosting_matrix(
        train_ebm_dataframe
    )

    # `fit` is expected to return the fitted estimator; its return value is used.
    selector = selector.fit(X=frequency_matrix, y=y)

    importances = np.array(selector.feature_importances_)
    winning_column = frequency_matrix.columns[importances.argmax()]

    return ExtendedBoostingMatrix.get_path_from_column_name(winning_column)
|
|
689
|
+
|
|
690
|
+
def _validate_data(
    self,
    X: list[nx.Graph] = "no_validation",
    y="no_validation",
    **check_params,
):
    """
    Validate ``X`` and/or ``y`` by delegating to ``util_validate_data``.

    Parameters
    ----------
    X : list[nx.Graph] or "no_validation", default="no_validation"
        Input graphs, forwarded unchanged. The string default is a sentinel
        passed through as-is (presumably meaning "skip validation of X" --
        confirm against ``util_validate_data``).
    y : array-like or "no_validation", default="no_validation"
        Targets, forwarded unchanged; same sentinel convention as ``X``.
    **check_params
        Extra keyword arguments forwarded verbatim to ``util_validate_data``.

    Returns
    -------
    None
        The result of ``util_validate_data`` is intentionally not returned.
    """
    # The estimator itself is passed as ``model`` alongside the data.
    util_validate_data(
        model=self,
        X=X,
        y=y,
        **check_params,
    )
|
|
697
|
+
|
|
698
|
+
def plot_training_and_eval_errors(
|
|
699
|
+
self,
|
|
700
|
+
skip_first_n_iterations=False,
|
|
701
|
+
show=True,
|
|
702
|
+
save=False,
|
|
703
|
+
save_path: str | None = None,
|
|
704
|
+
):
|
|
705
|
+
"""
|
|
706
|
+
Plots the training and evaluation set errors (Mean Squared Error) over iterations.
|
|
707
|
+
|
|
708
|
+
This method visualizes the progression of the training error and, if
|
|
709
|
+
evaluation sets were provided during fitting, their respective errors
|
|
710
|
+
across the boosting iterations.
|
|
711
|
+
|
|
712
|
+
Parameters
|
|
713
|
+
----------
|
|
714
|
+
skip_first_n_iterations : int or bool, default=False
|
|
715
|
+
If True, a default number of initial iterations (calculated based on
|
|
716
|
+
learning rate) are skipped in the plot, as early iterations can sometimes
|
|
717
|
+
be outliers.
|
|
718
|
+
If an integer, that specific number of initial iterations' errors are skipped.
|
|
719
|
+
If False or 0, all iterations' errors are plotted.
|
|
720
|
+
The actual skipping logic is handled by the underlying
|
|
721
|
+
`plot_training_and_eval_errors` utility function.
|
|
722
|
+
show : bool, default=True
|
|
723
|
+
If True, the plot is displayed.
|
|
724
|
+
save : bool, default=False
|
|
725
|
+
If True, the plot is saved to a file.
|
|
726
|
+
save_path : str | None, default=None
|
|
727
|
+
The directory where the plot will be saved. If None, the current
|
|
728
|
+
working directory is used.
|
|
729
|
+
|
|
730
|
+
"""
|
|
731
|
+
if hasattr(self, "fitted_"):
|
|
732
|
+
if not self.fitted_:
|
|
733
|
+
raise ValueError(
|
|
734
|
+
"The model has not been fitted yet. Please call fit() before"
|
|
735
|
+
" plotting."
|
|
736
|
+
)
|
|
737
|
+
|
|
738
|
+
if hasattr(self, "eval_sets_logloss_"):
|
|
739
|
+
eval_sets_logloss = self.eval_sets_logloss_
|
|
740
|
+
else:
|
|
741
|
+
eval_sets_logloss = None
|
|
742
|
+
plot_training_and_eval_errors(
|
|
743
|
+
learning_rate=self.learning_rate,
|
|
744
|
+
train_mse=self.train_logloss_,
|
|
745
|
+
mse_eval_set=eval_sets_logloss,
|
|
746
|
+
skip_first_n_iterations=skip_first_n_iterations,
|
|
747
|
+
show=show,
|
|
748
|
+
save=save,
|
|
749
|
+
save_path=save_path,
|
|
750
|
+
)
|
|
751
|
+
|
|
752
|
+
def plot_variable_importance(
|
|
753
|
+
self, top_n_features: int | None = None, show: bool = True
|
|
754
|
+
):
|
|
755
|
+
"""
|
|
756
|
+
Plots the computed variable importance scores.
|
|
757
|
+
|
|
758
|
+
This method visualizes the importance of features (paths) as determined
|
|
759
|
+
by the SequentialPathBoost model. It uses the `variable_importance_`
|
|
760
|
+
attribute, which is populated during the `fit` method if
|
|
761
|
+
`parameters_variable_importance` was provided at initialization.
|
|
762
|
+
The visual characteristics of the plot are guided by the settings
|
|
763
|
+
contained within `self.parameters_variable_importance`.
|
|
764
|
+
show : bool, default=True
|
|
765
|
+
If True, the plot is displayed.
|
|
766
|
+
"""
|
|
767
|
+
if hasattr(self, "fitted_"):
|
|
768
|
+
if not self.fitted_:
|
|
769
|
+
raise ValueError(
|
|
770
|
+
"The model has not been fitted yet. Please call fit() before"
|
|
771
|
+
" plotting."
|
|
772
|
+
)
|
|
773
|
+
|
|
774
|
+
if self.parameters_variable_importance is None:
|
|
775
|
+
raise ValueError(
|
|
776
|
+
"Variable importance is not computed. Please set"
|
|
777
|
+
" parameters_variable_importance in the constructor."
|
|
778
|
+
)
|
|
779
|
+
plot_variable_importance_utils(
|
|
780
|
+
variable_importance=self.variable_importance_,
|
|
781
|
+
parameters_variable_importance=self.parameters_variable_importance,
|
|
782
|
+
top_n=top_n_features,
|
|
783
|
+
show=show,
|
|
784
|
+
)
|
|
785
|
+
|
|
786
|
+
def get_mse_for_patience(self, patience: int, eval_set_index: int = 0) -> float:
    """
    Returns the evaluation error that we would obtain if we stopped training
    at the specified patience.

    The recorded values are the log-losses stored in ``eval_sets_logloss_``
    (the method name still says "mse"). The history of the selected evaluation
    set is scanned in order; as soon as ``patience`` consecutive iterations fail
    to improve on their predecessor (loss greater than or equal to the previous
    loss), the loss at that iteration is returned. If that never happens, the
    last recorded loss is returned.

    By default the value returned is relative to the first eval_set.

    Parameters
    ----------
    patience : int
        Number of consecutive non-improving iterations that triggers the stop.
        Must be at least 1 and at most the number of recorded iterations.
    eval_set_index : int, default=0
        Index of the evaluation set whose loss history is inspected.

    Returns
    -------
    float
        The loss at the iteration where training would have stopped, or the
        final loss if the patience condition is never met.

    Raises
    ------
    ValueError
        If the model is not fitted, no evaluation set was recorded,
        ``eval_set_index`` is out of bounds, or ``patience`` is smaller than 1
        or larger than the number of recorded iterations.
    """
    # Treat a missing ``fitted_`` attribute and ``fitted_ = False`` alike.
    if not getattr(self, "fitted_", False):
        raise ValueError(
            "The model has not been fitted yet. Please call fit() before getting"
            " MSE for patience."
        )

    if not hasattr(self, "eval_sets_logloss_"):
        raise ValueError(
            "The model has not been evaluated on any evaluation set. Please provide"
            " an eval_set during fitting."
        )

    if len(self.eval_sets_logloss_) <= eval_set_index:
        raise ValueError(
            f"Eval set index {eval_set_index} is out of bounds for the number of"
            f" evaluation sets: {len(self.eval_sets_logloss_)}."
        )

    losses = self.eval_sets_logloss_[eval_set_index]

    # A patience below 1 would "trigger" before any comparison has been made.
    if patience < 1:
        raise ValueError(f"Patience must be a positive integer, got {patience}.")
    if len(losses) < patience:
        raise ValueError(
            f"Patience {patience} exceeds the number of training iterations."
        )

    consecutive_increases = 0
    last_logloss_value = losses[0]
    # Start at the second entry: the first loss has no predecessor, and comparing
    # it with itself would wrongly count as one non-improving iteration.
    for error in losses[1:]:
        if error >= last_logloss_value:
            consecutive_increases += 1
        else:
            consecutive_increases = 0
        last_logloss_value = error
        if consecutive_increases >= patience:
            return last_logloss_value

    # If we never hit the patience condition, return the last recorded value
    return losses[-1]
|
|
826
|
+
|
|
827
|
+
def get_final_eval_set_mse(self):
    """
    Return the last recorded evaluation error of every evaluation set.

    The values are read from ``eval_sets_logloss_``: for each evaluation set
    the final entry of its loss history is collected, in order.

    Returns
    -------
    list
        One final loss per evaluation set.

    Raises
    ------
    AttributeError
        If no evaluation-set losses were recorded during fitting.
    """
    # Guard clause: without recorded histories there is nothing to report.
    if not hasattr(self, "eval_sets_logloss_"):
        raise AttributeError(
            "Evaluation set MSE is not available. Please fit the model with"
            " eval_set."
        )

    return [history[-1] for history in self.eval_sets_logloss_]
|