path-boost 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,840 @@
1
+ import logging
2
+ import numbers
3
+
4
+ import matplotlib.pyplot as plt
5
+ import networkx as nx
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ logger = logging.getLogger("path_boost")
10
+
11
+ from .interfaces.interface_base_learner import BaseLearnerClassInterface
12
+ from .interfaces.interface_selector import SelectorClassInterface
13
+ from ..validate_data import util_validate_data
14
+ from ..variable_importance_according_to_path_boost import (
15
+ VariableImportance_ForSequentialPathBoost,
16
+ )
17
+ from ..plots_functions import (
18
+ plot_training_and_eval_errors,
19
+ plot_variable_importance_utils,
20
+ )
21
+
22
+ from sklearn.base import BaseEstimator
23
+ from sklearn.base import RegressorMixin
24
+ from .extended_boosting_matrix import ExtendedBoostingMatrix
25
+ from typing import Iterable
26
+ from sklearn.tree import DecisionTreeRegressor, plot_tree
27
+ from .additive_model_wrapper_classifier import AdditiveModelWrapperClassifier
28
+ from sklearn.metrics import mean_squared_error
29
+ from matplotlib.ticker import MaxNLocator
30
+
31
+
32
+ class SequentialPathBoostClassifier(BaseEstimator, RegressorMixin):
33
def __init__(
    self,
    n_iter=100,
    max_path_length=10,
    learning_rate=0.1,
    patience=None,
    target_error=None,
    BaseLearnerClass=DecisionTreeRegressor,
    kwargs_for_base_learner=None,
    SelectorClass=DecisionTreeRegressor,
    kwargs_for_selector=None,
    parameters_variable_importance=None,
    replace_nan_with=np.nan,
    verbose=False,
    use_tree_boost=False,
):
    """
    Store the hyper-parameters of the sequential path-boosting classifier.

    The constructor only records its arguments; all work happens in `fit`.

    Parameters
    ----------
    n_iter : int, default=100
        Upper bound on the number of boosting iterations run by `fit`.
    max_path_length : int, default=10
        Paths whose length reaches this value are still recorded as selected,
        but are no longer used to add new columns to the training
        Extended Boosting Matrix (EBM).
    learning_rate : float, default=0.1
        Forwarded to `AdditiveModelWrapperClassifier`.
    patience : int, optional, default=None
        Early-stopping patience, evaluated on the metric history of the first
        evaluation set. Only consulted when `fit` receives an `eval_set`.
    target_error : float, optional, default=None
        Early-stopping threshold: training stops once the latest value of the
        first evaluation set's metric history is at or below this value.
        Only consulted when `fit` receives an `eval_set`.
    BaseLearnerClass : type, default=sklearn.tree.DecisionTreeRegressor
        Class of the learner fitted at each boosting step (forwarded to
        `AdditiveModelWrapperClassifier` as `BaseModelClass`).
    kwargs_for_base_learner : dict, default=None
        Constructor keyword arguments for `BaseLearnerClass`.
        NOTE(review): `fit` prepares a default dict (max_depth=3, ...);
        presumably it is substituted during validation when this is None.
    SelectorClass : type, default=sklearn.tree.DecisionTreeRegressor
        Class of the model used to pick the best path; it must expose
        `fit` and `feature_importances_`.
    kwargs_for_selector : dict, default=None
        Constructor keyword arguments for `SelectorClass`.
        NOTE(review): `fit` prepares a default dict (max_depth=1, ...);
        presumably it is substituted during validation when this is None.
    parameters_variable_importance : dict, default=None
        Keyword arguments for `VariableImportance_ForSequentialPathBoost`.
        When None, variable importance is not computed.
    replace_nan_with : any, default=np.nan
        Forwarded to every EBM-building helper as the NaN replacement value.
    verbose : bool, default=False
        When True, `fit` logs progress on the "path_boost" logger.
    use_tree_boost : bool, default=False
        Forwarded to `AdditiveModelWrapperClassifier`.
    """
    # Boosting schedule and stopping criteria.
    self.n_iter = n_iter
    self.learning_rate = learning_rate
    self.patience = patience
    self.target_error = target_error

    # Path exploration and feature generation.
    self.max_path_length = max_path_length
    self.replace_nan_with = replace_nan_with

    # Pluggable models and their constructor arguments.
    self.BaseLearnerClass = BaseLearnerClass
    self.kwargs_for_base_learner = kwargs_for_base_learner
    self.SelectorClass = SelectorClass
    self.kwargs_for_selector = kwargs_for_selector
    self.use_tree_boost = use_tree_boost

    # Diagnostics.
    self.parameters_variable_importance = parameters_variable_importance
    self.verbose = verbose
106
+
107
def fit(
    self,
    X: list[nx.Graph],
    y: np.array,
    list_anchor_nodes_labels: list[tuple],
    anchor_nodes_label_name,
    eval_set: list[tuple[list[nx.Graph], Iterable]] = None,
):
    """
    Fit the boosted ensemble on a list of graphs.

    Each iteration performs, in this order:

    1. Select the best path with the selector model. The selector target is
       `y` on the first iteration and the negative gradient (computed from the
       wrapper's last training prediction) afterwards.
    2. If variable importance is enabled, update it with the selected path.
       This must happen after path selection and before the training EBM
       is expanded.
    3. Add the columns related to the selected path to every evaluation EBM.
    4. Fit one additive step of the base-learner wrapper.
    5. If `eval_set` was given, check the early-stopping criteria on the
       first evaluation set's logloss history and possibly stop.
    6. Expand the training EBM starting from the selected path.

    After the loop, `train_logloss_`, `train_accuracy_`, `columns_names_` and,
    when applicable, `eval_sets_logloss_`, `eval_sets_accuracy_` and
    `variable_importance_` are stored on the estimator.

    Parameters
    ----------
    X : list[nx.Graph]
        Training graphs.
    y : np.array
        Targets aligned with `X`.
    list_anchor_nodes_labels : list[tuple]
        Labels of the anchor nodes used to initialize the EBM.
    anchor_nodes_label_name : str
        Name of the node attribute holding those labels.
    eval_set : list[tuple[list[nx.Graph], Iterable]], optional
        `(X_eval, y_eval)` pairs monitored during training; entries may be None.

    Returns
    -------
    self : object
        The fitted estimator.
    """
    # `verbose` raises the package logger to INFO and attaches one stream
    # handler; an already-configured logger is not given a second handler.
    if self.verbose:
        path_boost_logger = logging.getLogger("path_boost")
        path_boost_logger.setLevel(logging.INFO)
        if not path_boost_logger.handlers:
            stream_handler = logging.StreamHandler()
            stream_handler.setFormatter(logging.Formatter("%(message)s"))
            path_boost_logger.addHandler(stream_handler)

    self._default_kwargs_for_base_learner = dict(
        max_depth=3,
        random_state=0,
        splitter="best",
    )
    self._default_kwargs_for_selector = dict(
        max_depth=1,
        random_state=0,
        splitter="best",
        criterion="squared_error",
    )

    self._validate_data(
        X=X,
        y=y,
        list_anchor_nodes_labels=list_anchor_nodes_labels,
        name_of_label_attribute=anchor_nodes_label_name,
        eval_set=eval_set,
        parameters_variable_importance=self.parameters_variable_importance,
        patience=self.patience,
    )

    # The fitted flag is raised before training starts (kept as in the
    # original; helpers invoked during training may rely on it).
    self.is_fitted_ = True
    self.name_of_label_attribute_ = anchor_nodes_label_name
    self.paths_selected_by_epb_ = set()

    self._initialize_path_boosting(
        X=X,
        list_anchor_nodes_labels=list_anchor_nodes_labels,
        main_label_name=anchor_nodes_label_name,
        eval_set=eval_set,
    )

    if self.parameters_variable_importance is not None:
        self.class_variable_importance_ = VariableImportance_ForSequentialPathBoost(
            **self.parameters_variable_importance
        )

    for n_iteration in range(self.n_iter):
        if self.verbose:
            logger.info(f"iteration number: {n_iteration + 1}")

        # Flag checked by the variable-importance code to make sure it works
        # on the EBM of the current iteration, before expansion.
        self._ebm_has_been_expanded_in_this_iteration = False

        # First iteration learns from the raw target, later ones from the
        # negative gradient of the current ensemble.
        if n_iteration == 0:
            gradient = y
            selector_target = y
        else:
            gradient = AdditiveModelWrapperClassifier._neg_gradient(
                y=y,
                y_hat=np.array(
                    self.base_learner_._last_train_prediction.to_numpy()
                ),
            )
            selector_target = pd.Series(gradient)

        best_path = self._find_best_path(
            train_ebm_dataframe=self.train_ebm_dataframe_,
            y=selector_target,
            SelectorClass=self.SelectorClass,
            kwargs_for_selector=self.kwargs_for_selector,
        )

        if self.verbose:
            logger.info(f"Best path: {best_path}")

        # Must run between path selection and the training-EBM expansion.
        if self.parameters_variable_importance is not None:
            self.class_variable_importance_._update(
                path_boost=self,
                selected_path=best_path,
                iteration_number=n_iteration,
                gradient=gradient,
            )

        # Evaluation EBMs only receive the columns of the selected path.
        self._expand_eval_ebm_dataframe_with_best_path(
            best_path=best_path,
            main_label_name=anchor_nodes_label_name,
            eval_set=eval_set,
        )

        self.base_learner_.fit_one_step(
            X=self.train_ebm_dataframe_,
            y=y,
            best_path=best_path,
            eval_set=self.eval_set_ebm_df_and_target_,
        )

        if eval_set is not None and self._check_if_stop_early(
            mse_eval_set=self.base_learner_.eval_sets_logloss[0],
            patience=self.patience,
            target_error=self.target_error,
        ):
            if self.verbose:
                logger.info(
                    f"Early stopping at iteration {n_iteration + 1} due to no"
                    " improvement in evaluation set logloss."
                )
            # NOTE(review): this overwrites the constructor hyper-parameter
            # with the zero-based index of the last iteration, so a later
            # refit of the same object runs fewer iterations. Left unchanged
            # because other project code may read `n_iter` after fitting.
            self.n_iter = n_iteration
            break

        # Explore new paths that extend the selected one.
        self._expand_ebm_dataframe(
            X=X, selected_path=best_path, main_label_name=anchor_nodes_label_name
        )

    self.train_logloss_ = self.base_learner_.train_logloss
    self.train_accuracy_ = self.base_learner_.train_accuracy

    if self.parameters_variable_importance is not None:
        self.variable_importance_ = (
            self.class_variable_importance_.compute_variable_importance(
                path_boost=self
            )
        )

    if eval_set is not None:
        self.eval_sets_logloss_ = self.base_learner_.eval_sets_logloss
        self.eval_sets_accuracy_ = self.base_learner_.eval_sets_accuracy

    self.columns_names_ = self.train_ebm_dataframe_.columns

    return self
307
+
308
+ def _check_if_stop_early(
309
+ self,
310
+ mse_eval_set: list[float],
311
+ patience: int | None = None,
312
+ target_error: float | None = None,
313
+ ) -> bool:
314
+ """
315
+ Determines whether to stop the training process early based on evaluation metrics.
316
+
317
+ Early stopping can be triggered under two conditions:
318
+ 1. If a `target_error` is specified: Training stops if the Mean Squared Error (MSE)
319
+ on the (first) evaluation set falls at or below this target.
320
+ 2. If `patience` is specified: Training stops if the MSE on the (first) evaluation
321
+ set has not improved (i.e., decreased) for a consecutive number of iterations
322
+ equal to `patience`. An "improvement" is defined as the current MSE being strictly
323
+ less than the MSE `patience` iterations ago. If the MSE remains the same or
324
+ increases for `patience` iterations, training stops.
325
+
326
+ Parameters
327
+ ----------
328
+ mse_eval_set : list[float]
329
+ A list of Mean Squared Errors (MSE) recorded for the first evaluation set
330
+ at each iteration so far.
331
+ patience : int or None, optional
332
+ The number of iterations to wait for an improvement before stopping.
333
+ If None, this condition for early stopping is disabled.
334
+ target_error : float or None, optional
335
+ A specific MSE value. If the evaluation MSE reaches this value or lower,
336
+ training stops. If None, this condition is disabled.
337
+
338
+ Returns
339
+ -------
340
+ bool
341
+ True if the conditions for early stopping are met, False otherwise.
342
+ Returns False if `patience` is None and `target_error` is None, or if
343
+ insufficient iterations have passed to evaluate the patience condition.
344
+ """
345
+
346
+ if target_error is not None:
347
+ # If a target error is specified, check if the last MSE is less than or equal to the target error
348
+ if mse_eval_set and mse_eval_set[-1] <= target_error:
349
+ return True
350
+ else:
351
+ return False
352
+
353
+ if patience is None:
354
+ return False
355
+
356
+ if len(mse_eval_set) < patience:
357
+ return False
358
+
359
+ # Check if the last `patience` MSE values are all greater than or equal to the last MSE value
360
+ return all(mse >= mse_eval_set[-1] for mse in mse_eval_set[-patience:])
361
+
362
def _expand_eval_ebm_dataframe_with_best_path(
    self, best_path, main_label_name, eval_set=None
):
    """
    Add the columns related to `best_path` to every evaluation EBM.

    Only columns already present in the training EBM for `best_path` are
    added; no new paths are explored on the evaluation data. Entries of
    `eval_set` that are None are skipped. Does nothing when `eval_set` is None.

    Parameters
    ----------
    best_path : tuple
        Path selected in the current boosting iteration.
    main_label_name : str
        Name of the node attribute holding the labels.
    eval_set : list of (graphs, targets) tuples, optional
        The evaluation sets originally passed to `fit`.
    """
    if eval_set is None:
        return

    path_columns = ExtendedBoostingMatrix.get_columns_related_to_path(
        best_path, self.train_ebm_dataframe_.columns
    )

    for position, entry in enumerate(eval_set):
        if entry is None:
            continue

        graphs, _ = entry
        # Each slot is a mutable [ebm_dataframe, targets] pair.
        slot = self.eval_set_ebm_df_and_target_[position]
        current_ebm = slot[0]

        # Columns of the selected path that this evaluation EBM still lacks.
        absent_columns = [
            name for name in path_columns if name not in current_ebm.columns
        ]
        generated = ExtendedBoostingMatrix.generate_new_columns_from_columns_names(
            dataset=graphs,
            ebm_to_be_expanded=current_ebm,
            columns_names=absent_columns,
            main_label_name=main_label_name,
            replace_nan_with=self.replace_nan_with,
        )
        slot[0] = pd.concat([current_ebm, generated], axis=1)
399
+
400
def generate_ebm_for_dataset(self, dataset: list[nx.Graph], columns_names=None):
    """
    Build the Extended Boosting Matrix (EBM) of a dataset of graphs.

    When `columns_names` is omitted, the columns are those of the fitted
    training EBM (`self.columns_names_`) that relate to any path in
    `self.paths_selected_by_epb_`, with duplicates removed.

    Parameters
    ----------
    dataset : list[nx.Graph]
        Graphs for which the EBM is generated.
    columns_names : list[str], optional
        Explicit list of columns to generate. Defaults to None.

    Returns
    -------
    pd.DataFrame
        The matrix returned by
        `ExtendedBoostingMatrix.generate_new_columns_from_columns_names`.

    Raises
    ------
    AssertionError
        If `self.is_fitted_` is falsy.
    """
    assert self.is_fitted_

    if columns_names is None:
        collected = []
        for chosen_path in list(self.paths_selected_by_epb_):
            collected.extend(
                ExtendedBoostingMatrix.get_columns_related_to_path(
                    path=chosen_path, columns_names=self.columns_names_
                )
            )
        # Several paths can share columns; keep each column once.
        columns_names = list(set(collected))

    return ExtendedBoostingMatrix.generate_new_columns_from_columns_names(
        dataset=dataset,
        columns_names=columns_names,
        main_label_name=self.name_of_label_attribute_,
        replace_nan_with=self.replace_nan_with,
    )
447
+
448
def predict(
    self,
    X: list[nx.Graph] | nx.Graph | None = None,
    ebm_dataframe: pd.DataFrame | None = None,
    class_probability: bool = False,
) -> list[numbers.Number]:
    """
    Predict with the fitted ensemble.

    Input is either graphs (`X`) or an already computed Extended Boosting
    Matrix (`ebm_dataframe`). When both are given, `ebm_dataframe` is used.

    Parameters
    ----------
    X : list[nx.Graph] | nx.Graph | None, default=None
        Graphs to predict on; a single graph is wrapped into a list.
    ebm_dataframe : pd.DataFrame | None, default=None
        Pre-computed EBM; when given, no EBM is generated from `X`.
    class_probability : bool, default=False
        Forwarded unchanged to the base-learner wrapper's `predict`.

    Returns
    -------
    list[numbers.Number]
        Whatever `self.base_learner_.predict` returns for the samples.

    Raises
    ------
    AssertionError
        If both `X` and `ebm_dataframe` are None, or if `self.is_fitted_`
        is falsy.
    """
    assert not (X is None and ebm_dataframe is None)
    assert self.is_fitted_

    features = ebm_dataframe
    if features is None:
        graphs = [X] if isinstance(X, nx.Graph) else X
        features = self.generate_ebm_for_dataset(dataset=graphs)

    return self.base_learner_.predict(
        X=features, class_probability=class_probability
    )
490
+
491
def predict_step_by_step(
    self,
    X: list[nx.Graph] | nx.Graph | None = None,
    ebm_dataframe: pd.DataFrame | None = None,
    class_probability: bool = False,
) -> list[np.array]:
    """
    Return the ensemble predictions after each boosting step.

    Input is either graphs (`X`) or an already computed Extended Boosting
    Matrix (`ebm_dataframe`). When both are given, `ebm_dataframe` is used.

    Parameters
    ----------
    X : list[nx.Graph] | nx.Graph | None, default=None
        Graphs to predict on; a single graph is wrapped into a list.
    ebm_dataframe : pd.DataFrame | None, default=None
        Pre-computed EBM; when given, no EBM is generated from `X`.
    class_probability : bool, default=False
        Forwarded unchanged to the wrapper's `predict_step_by_step`.

    Returns
    -------
    list[np.array]
        Whatever `self.base_learner_.predict_step_by_step` returns
        (per the annotation, one array of predictions per boosting step).

    Raises
    ------
    AssertionError
        If both `X` and `ebm_dataframe` are None, or if `self.is_fitted_`
        is falsy.
    """
    assert not (X is None and ebm_dataframe is None)
    assert self.is_fitted_

    features = ebm_dataframe
    if features is None:
        graphs = [X] if isinstance(X, nx.Graph) else X
        features = self.generate_ebm_for_dataset(dataset=graphs)

    return self.base_learner_.predict_step_by_step(
        X=features, class_probability=class_probability
    )
538
+
539
def evaluate(
    self,
    X: list[nx.Graph] | nx.Graph | None = None,
    y=None,
    ebm_dataframe: pd.DataFrame | None = None,
):
    """
    Evaluate the fitted ensemble against known targets.

    Parameters
    ----------
    X : list[nx.Graph] | nx.Graph | None, default=None
        Graphs to evaluate on; a single graph is wrapped into a list.
    y : array-like
        True targets aligned with `X` or `ebm_dataframe`. Required.
    ebm_dataframe : pd.DataFrame | None, default=None
        Pre-computed EBM; when given, no EBM is generated from `X`.

    Returns
    -------
    object
        Whatever `self.base_learner_.evaluate(ebm_dataframe, y)` returns.
        NOTE(review): presumably the per-iteration evolution of the error;
        confirm against `AdditiveModelWrapperClassifier.evaluate`.

    Raises
    ------
    AssertionError
        If `y` is None, if both `X` and `ebm_dataframe` are None, or if
        `self.is_fitted_` is falsy.
    """
    assert y is not None
    assert not (X is None and ebm_dataframe is None)
    assert self.is_fitted_

    features = ebm_dataframe
    if features is None:
        graphs = [X] if isinstance(X, nx.Graph) else X
        features = self.generate_ebm_for_dataset(dataset=graphs)

    return self.base_learner_.evaluate(features, y)
581
+
582
def _expand_ebm_dataframe(
    self, X: list[nx.Graph], selected_path, main_label_name: str
):
    """
    Record `selected_path` and, if allowed, grow the training EBM from it.

    The per-iteration expansion flag is always raised. A path that was
    already selected is ignored. A new path is always recorded in
    `self.paths_selected_by_epb_`, but new columns are generated only when
    the path is shorter than `self.max_path_length`.

    Parameters
    ----------
    X : list[nx.Graph]
        Training graphs.
    selected_path : tuple
        Path chosen in the current iteration.
    main_label_name : str
        Name of the node attribute holding the labels.
    """
    self._ebm_has_been_expanded_in_this_iteration = True

    if selected_path in self.paths_selected_by_epb_:
        return

    reached_length_limit = len(selected_path) >= self.max_path_length
    self.paths_selected_by_epb_.add(selected_path)
    if reached_length_limit:
        # Paths at the length limit are remembered but not extended.
        return

    extension = ExtendedBoostingMatrix.new_columns_to_expand_ebm_dataframe_with_path(
        dataset=X,
        selected_path=selected_path,
        main_label_name=main_label_name,
        df_to_be_expanded=self.train_ebm_dataframe_,
        replace_nan_with=self.replace_nan_with,
    )
    self.train_ebm_dataframe_ = pd.concat(
        [self.train_ebm_dataframe_, extension], axis=1
    )
604
+
605
def _initialize_path_boosting(
    self,
    X,
    list_anchor_nodes_labels: list,
    main_label_name: str,
    eval_set: list[tuple[list[nx.Graph], Iterable]] = None,
):
    """
    Create the initial EBMs and the additive base-learner wrapper.

    Sets `self.train_ebm_dataframe_`, `self.eval_set_ebm_df_and_target_`
    (one `[ebm_dataframe, targets]` list per evaluation set, or None for a
    None entry) and `self.base_learner_`.

    Parameters
    ----------
    X : list[nx.Graph]
        Training graphs.
    list_anchor_nodes_labels : list
        Labels of the anchor nodes used to seed the EBMs.
    main_label_name : str
        Name of the node attribute holding the labels.
    eval_set : list of (graphs, targets) tuples, optional
        Evaluation sets; entries may be None.
    """
    self.name_of_label_attribute = main_label_name

    def build_initial_ebm(graphs):
        # Same seeding procedure for training and evaluation data.
        return ExtendedBoostingMatrix.initialize_boosting_matrix_with_anchor_nodes_attributes(
            dataset=graphs,
            list_anchor_nodes_labels=list_anchor_nodes_labels,
            id_label_name=main_label_name,
            replace_nan_with=self.replace_nan_with,
        )

    self.train_ebm_dataframe_ = build_initial_ebm(X)

    self.eval_set_ebm_df_and_target_ = []
    if eval_set is not None:
        for entry in eval_set:
            if entry is None:
                # Keep positions aligned with the caller's eval_set.
                self.eval_set_ebm_df_and_target_.append(None)
                continue
            eval_graphs, eval_targets = entry
            self.eval_set_ebm_df_and_target_.append(
                [build_initial_ebm(eval_graphs), eval_targets]
            )

    self.base_learner_ = AdditiveModelWrapperClassifier(
        BaseModelClass=self.BaseLearnerClass,
        base_model_class_kwargs=self.kwargs_for_base_learner,
        learning_rate=self.learning_rate,
        use_tree_boost=self.use_tree_boost,
    )
653
+
654
@staticmethod
def _find_best_path(
    train_ebm_dataframe: pd.DataFrame, y, SelectorClass, kwargs_for_selector
) -> tuple[int]:
    """
    Return the path behind the most important frequency column.

    A selector is fitted on the frequency part of the EBM against `y`; the
    column with the largest `feature_importances_` entry is mapped back to
    its path.

    Note: this must remain a static method, because the variable-importance
    class also calls it to select paths by comparison.

    Parameters
    ----------
    train_ebm_dataframe : pd.DataFrame
        Extended boosting matrix of the training data.
    y : array-like
        Target values or negative gradient used for the selection.
    SelectorClass : type
        Model class exposing `fit` and `feature_importances_`.
    kwargs_for_selector : dict
        Keyword arguments used to instantiate `SelectorClass`.

    Returns
    -------
    tuple[int]
        The path corresponding to the most important column.
    """
    selector = SelectorClass(**kwargs_for_selector)
    frequency_matrix = ExtendedBoostingMatrix.get_frequency_boosting_matrix(
        train_ebm_dataframe
    )

    fitted_selector = selector.fit(X=frequency_matrix, y=y)
    importances = np.array(fitted_selector.feature_importances_)
    # argmax returns the first index on ties.
    winning_column = frequency_matrix.columns[importances.argmax()]

    return ExtendedBoostingMatrix.get_path_from_column_name(winning_column)
689
+
690
def _validate_data(
    self,
    X: list[nx.Graph] = "no_validation",
    y="no_validation",
    **check_params,
):
    """
    Delegate input validation to the project's `util_validate_data` helper.

    `self` is passed as `model`; `X`, `y` and every extra keyword argument
    are forwarded unchanged. The helper's return value is discarded.
    """
    util_validate_data(
        model=self,
        X=X,
        y=y,
        **check_params,
    )
697
+
698
+ def plot_training_and_eval_errors(
699
+ self,
700
+ skip_first_n_iterations=False,
701
+ show=True,
702
+ save=False,
703
+ save_path: str | None = None,
704
+ ):
705
+ """
706
+ Plots the training and evaluation set errors (Mean Squared Error) over iterations.
707
+
708
+ This method visualizes the progression of the training error and, if
709
+ evaluation sets were provided during fitting, their respective errors
710
+ across the boosting iterations.
711
+
712
+ Parameters
713
+ ----------
714
+ skip_first_n_iterations : int or bool, default=False
715
+ If True, a default number of initial iterations (calculated based on
716
+ learning rate) are skipped in the plot, as early iterations can sometimes
717
+ be outliers.
718
+ If an integer, that specific number of initial iterations' errors are skipped.
719
+ If False or 0, all iterations' errors are plotted.
720
+ The actual skipping logic is handled by the underlying
721
+ `plot_training_and_eval_errors` utility function.
722
+ show : bool, default=True
723
+ If True, the plot is displayed.
724
+ save : bool, default=False
725
+ If True, the plot is saved to a file.
726
+ save_path : str | None, default=None
727
+ The directory where the plot will be saved. If None, the current
728
+ working directory is used.
729
+
730
+ """
731
+ if hasattr(self, "fitted_"):
732
+ if not self.fitted_:
733
+ raise ValueError(
734
+ "The model has not been fitted yet. Please call fit() before"
735
+ " plotting."
736
+ )
737
+
738
+ if hasattr(self, "eval_sets_logloss_"):
739
+ eval_sets_logloss = self.eval_sets_logloss_
740
+ else:
741
+ eval_sets_logloss = None
742
+ plot_training_and_eval_errors(
743
+ learning_rate=self.learning_rate,
744
+ train_mse=self.train_logloss_,
745
+ mse_eval_set=eval_sets_logloss,
746
+ skip_first_n_iterations=skip_first_n_iterations,
747
+ show=show,
748
+ save=save,
749
+ save_path=save_path,
750
+ )
751
+
752
+ def plot_variable_importance(
753
+ self, top_n_features: int | None = None, show: bool = True
754
+ ):
755
+ """
756
+ Plots the computed variable importance scores.
757
+
758
+ This method visualizes the importance of features (paths) as determined
759
+ by the SequentialPathBoost model. It uses the `variable_importance_`
760
+ attribute, which is populated during the `fit` method if
761
+ `parameters_variable_importance` was provided at initialization.
762
+ The visual characteristics of the plot are guided by the settings
763
+ contained within `self.parameters_variable_importance`.
764
+ show : bool, default=True
765
+ If True, the plot is displayed.
766
+ """
767
+ if hasattr(self, "fitted_"):
768
+ if not self.fitted_:
769
+ raise ValueError(
770
+ "The model has not been fitted yet. Please call fit() before"
771
+ " plotting."
772
+ )
773
+
774
+ if self.parameters_variable_importance is None:
775
+ raise ValueError(
776
+ "Variable importance is not computed. Please set"
777
+ " parameters_variable_importance in the constructor."
778
+ )
779
+ plot_variable_importance_utils(
780
+ variable_importance=self.variable_importance_,
781
+ parameters_variable_importance=self.parameters_variable_importance,
782
+ top_n=top_n_features,
783
+ show=show,
784
+ )
785
+
786
def get_mse_for_patience(self, patience: int, eval_set_index: int = 0) -> float:
    """
    Return the evaluation error training would have ended with under `patience`.

    The recorded error history of one evaluation set
    (`self.eval_sets_logloss_[eval_set_index]`) is replayed. As soon as
    `patience` consecutive iterations fail to improve on the iteration
    immediately before them, the error at that point is returned. If that
    never happens, the last recorded error is returned.

    Parameters
    ----------
    patience : int
        Number of consecutive non-improving iterations to tolerate (>= 1).
    eval_set_index : int, default=0
        Which evaluation set's history to replay.

    Returns
    -------
    float
        The error at the simulated stopping point.

    Raises
    ------
    ValueError
        If the model is not fitted, no evaluation set was recorded,
        `eval_set_index` is out of bounds, `patience` is below 1, or
        `patience` exceeds the number of recorded iterations.
    """
    # `fit` sets `is_fitted_`; the previous check required a `fitted_`
    # attribute that `fit` never creates, so this method always raised.
    if not getattr(self, "is_fitted_", False):
        raise ValueError(
            "The model has not been fitted yet. Please call fit() before getting"
            " MSE for patience."
        )

    if not hasattr(self, "eval_sets_logloss_"):
        raise ValueError(
            "The model has not been evaluated on any evaluation set. Please provide"
            " an eval_set during fitting."
        )

    if len(self.eval_sets_logloss_) <= eval_set_index:
        raise ValueError(
            f"Eval set index {eval_set_index} is out of bounds for the number of"
            f" evaluation sets: {len(self.eval_sets_logloss_)}."
        )

    if patience < 1:
        raise ValueError(f"Patience must be a positive integer, got {patience}.")

    history = self.eval_sets_logloss_[eval_set_index]
    if len(history) < patience:
        raise ValueError(
            f"Patience {patience} exceeds the number of training iterations."
        )

    consecutive_non_improvements = 0
    previous_error = history[0]
    # Start from the second value: the first one has nothing to be compared
    # with and must not count as a non-improvement.
    for error in history[1:]:
        if error >= previous_error:
            consecutive_non_improvements += 1
        else:
            consecutive_non_improvements = 0
        previous_error = error
        if consecutive_non_improvements >= patience:
            return previous_error

    # The patience condition was never met: training ran to the end.
    return history[-1]
826
+
827
def get_final_eval_set_mse(self):
    """
    Return the last recorded error of every evaluation set.

    Returns
    -------
    list
        One value per evaluation set: the final entry of each history in
        `self.eval_sets_logloss_`.

    Raises
    ------
    AttributeError
        If `self.eval_sets_logloss_` does not exist, i.e. the model was not
        fitted with an `eval_set`.
    """
    if not hasattr(self, "eval_sets_logloss_"):
        raise AttributeError(
            "Evaluation set MSE is not available. Please fit the model with"
            " eval_set."
        )
    return [history[-1] for history in self.eval_sets_logloss_]