path-boost 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1023 @@
1
+ import networkx as nx
2
+ import pandas as pd
3
+ import numbers
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ import logging
7
+
8
+ from .interfaces.interface_base_learner import BaseLearnerClassInterface
9
+ from .interfaces.interface_selector import SelectorClassInterface
10
+ from ..validate_data import util_validate_data
11
+ from ..variable_importance_according_to_path_boost import (
12
+ VariableImportance_ForSequentialPathBoost,
13
+ )
14
+ from ..plots_functions import (
15
+ plot_training_and_eval_errors,
16
+ plot_variable_importance_utils,
17
+ )
18
+
19
+ from sklearn.base import BaseEstimator
20
+ from sklearn.base import RegressorMixin
21
+ from .extended_boosting_matrix import ExtendedBoostingMatrix
22
+ from typing import Iterable, Union, Optional, List, Tuple, Dict, Any, Type, Callable
23
+ from sklearn.tree import DecisionTreeRegressor, plot_tree
24
+ from .additive_model_wrapper import AdditiveModelWrapper
25
+ from sklearn.metrics import mean_squared_error
26
+ from matplotlib.ticker import MaxNLocator
27
+
28
+ try:
29
+ from tqdm import tqdm
30
+
31
+ TQDM_AVAILABLE = True
32
+ except ImportError:
33
+ TQDM_AVAILABLE = False
34
+
35
+ # Set up logger for the module
36
+ logger = logging.getLogger("path_boost")
37
+
38
+ # Type aliases
39
+ GraphList = List[nx.Graph]
40
+ PathTuple = Tuple[Union[int, str], ...]
41
+ EvalSet = List[Tuple[GraphList, Iterable]]
42
+
43
+
44
+ class SequentialPathBoost(BaseEstimator, RegressorMixin):
45
+ """
46
+ Gradient boosting for graph-structured data using path-based features.
47
+
48
+ SequentialPathBoost iteratively discovers labeled paths in graphs that
49
+ are predictive of the target variable. At each iteration, it:
50
+
51
+ 1. Selects the most informative path using a selector model
52
+ 2. Expands the Extended Boosting Matrix (EBM) with path extensions
53
+ 3. Fits a base learner on the selected path's features
54
+ 4. Updates predictions using gradient boosting
55
+
56
+ This class is typically used through PathBoost, which handles multiple
57
+ anchor node types in parallel.
58
+
59
+ Attributes
60
+ ----------
61
+ train_ebm_dataframe_ : pd.DataFrame
62
+ The Extended Boosting Matrix built during training.
63
+ train_mse_ : List[float]
64
+ Training MSE at each iteration.
65
+ paths_selected_by_epb_ : set
66
+ Set of paths selected during training.
67
+ variable_importance_ : Dict[str, float]
68
+ Path importance scores (if parameters_variable_importance provided).
69
+ is_fitted_ : bool
70
+ Whether the model has been fitted.
71
+
72
+ See Also
73
+ --------
74
+ PathBoost : Ensemble variant with multiple anchor node types.
75
+ """
76
+
77
+ # Fitted attributes (declared for type checking)
78
+ train_ebm_dataframe_: pd.DataFrame
79
+ train_mse_: List[float]
80
+ train_mae_: List[float]
81
+ paths_selected_by_epb_: set
82
+ variable_importance_: Optional[Dict[str, float]]
83
+ is_fitted_: bool
84
+ columns_names_: List[str]
85
+
86
def __init__(
    self,
    n_iter: int = 100,
    max_path_length: int = 10,
    learning_rate: float = 0.1,
    patience: Optional[int] = None,
    target_error: Optional[float] = None,
    tol: float = 1e-4,
    restore_best_model: bool = True,
    learning_rate_scheduler: Optional[Callable[[float, int], float]] = None,
    BaseLearnerClass: type = DecisionTreeRegressor,
    kwargs_for_base_learner: Optional[Dict[str, Any]] = None,
    SelectorClass: type = DecisionTreeRegressor,
    kwargs_for_selector: Optional[Dict[str, Any]] = None,
    parameters_variable_importance: Optional[Dict[str, Any]] = None,
    replace_nan_with: float = np.nan,
    verbose: bool = False,
) -> None:
    """
    Store the hyper-parameters of a SequentialPathBoost model.

    The constructor only records its arguments as attributes of the same
    name; nothing is validated or computed here.

    Parameters
    ----------
    n_iter : int, default=100
        Number of boosting iterations to run.
    max_path_length : int, default=10
        Longest path that is still extended with new columns in the
        Extended Boosting Matrix (EBM). Paths of this length or longer
        are recorded as selected but not expanded further.
    learning_rate : float, default=0.1
        Shrinkage handed to the `AdditiveModelWrapper`.
    patience : int, optional, default=None
        Early-stopping window, evaluated on the MSE of the first entry of
        `eval_set`. None disables patience-based early stopping.
    target_error : float, optional, default=None
        Early-stopping threshold: training stops once the MSE of the
        first evaluation set is at or below this value. None disables it.
    tol : float, default=1e-4
        Minimum decrease of the evaluation MSE that counts as an
        improvement for the patience check.
    restore_best_model : bool, default=True
        When True and an `eval_set` is given, the fitted ensemble is cut
        back to the iteration with the lowest evaluation MSE.
    learning_rate_scheduler : callable, optional, default=None
        Function of `(initial_lr, iteration)` returning the learning rate
        for that iteration. It is forwarded to the
        `AdditiveModelWrapper`.
    BaseLearnerClass : type, default=sklearn.tree.DecisionTreeRegressor
        Class of the base learner fitted at every boosting iteration.
    kwargs_for_base_learner : dict, optional, default=None
        Keyword arguments for the `BaseLearnerClass` constructor.
    SelectorClass : type, default=sklearn.tree.DecisionTreeRegressor
        Class of the model used to pick the best path at each iteration.
    kwargs_for_selector : dict, optional, default=None
        Keyword arguments for the `SelectorClass` constructor.
    parameters_variable_importance : dict, optional, default=None
        Keyword arguments for the variable-importance helper. When None,
        variable importance is not computed.
    replace_nan_with : any, default=np.nan
        Replacement value forwarded to the EBM construction routines.
    verbose : bool, default=False
        When True, `fit` emits progress information (a tqdm bar when
        tqdm is installed, log messages otherwise).
    """
    # Boosting schedule
    self.n_iter = n_iter
    self.max_path_length = max_path_length
    self.learning_rate = learning_rate
    self.learning_rate_scheduler = learning_rate_scheduler

    # Early stopping / best-model restoration
    self.patience = patience
    self.target_error = target_error
    self.tol = tol
    self.restore_best_model = restore_best_model

    # Pluggable models and their constructor arguments
    self.BaseLearnerClass = BaseLearnerClass
    self.kwargs_for_base_learner = kwargs_for_base_learner
    self.SelectorClass = SelectorClass
    self.kwargs_for_selector = kwargs_for_selector

    # Miscellaneous options
    self.parameters_variable_importance = parameters_variable_importance
    self.replace_nan_with = replace_nan_with
    self.verbose = verbose
171
+
172
def fit(
    self,
    X: list[nx.Graph],
    y: np.array,
    list_anchor_nodes_labels: list[tuple],
    anchor_nodes_label_name,
    eval_set: list[tuple[list[nx.Graph], Iterable]] = None,
):
    """
    Fits the SequentialPathBoost model to the training data.

    For each of the `n_iter` iterations the method:

    1. Selects the best path with `_find_best_path`. The selector target
       is `y` in the first iteration and the negative gradient (computed
       from the wrapper's last training prediction) afterwards.
    2. Updates the variable-importance helper, if
       `parameters_variable_importance` was given. This must happen
       between path selection and the expansion of the training EBM.
    3. Adds the columns related to the best path to every evaluation EBM.
    4. Fits one more base learner through
       `AdditiveModelWrapper.fit_one_step`.
    5. If `eval_set` was given, tracks the iteration with the lowest MSE
       on the first evaluation set and checks the early-stopping rule.
    6. Expands the training EBM with the extensions of the best path.

    After the loop the training (and evaluation) MSE/MAE histories are
    stored, variable importance is computed if requested, and, when
    `restore_best_model` is True, the ensemble is truncated to the best
    iteration found in step 5.

    Parameters
    ----------
    X : list[nx.Graph]
        Training graphs.
    y : np.array
        Target values corresponding to `X`.
    list_anchor_nodes_labels : list[tuple]
        Labels identifying the anchor nodes used to initialise the EBM.
    anchor_nodes_label_name : str
        Name of the node attribute that holds the labels.
    eval_set : list[tuple[list[nx.Graph], Iterable]], optional, default=None
        `(X_eval, y_eval)` tuples monitored during training. Early
        stopping and best-model restoration use the first one.

    Returns
    -------
    self : object
        The fitted SequentialPathBoost estimator.
    """

    # Configure logging based on verbose flag
    if self.verbose:
        package_logger = logging.getLogger("path_boost")
        package_logger.setLevel(logging.INFO)
        if not package_logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter("%(message)s"))
            package_logger.addHandler(handler)

    self._default_kwargs_for_base_learner = {
        "max_depth": 3,
        "random_state": 0,
        "splitter": "best",
    }

    self._default_kwargs_for_selector = {
        "max_depth": 1,
        "random_state": 0,
        "splitter": "best",
        "criterion": "squared_error",
    }

    self._validate_data(
        X=X,
        y=y,
        list_anchor_nodes_labels=list_anchor_nodes_labels,
        name_of_label_attribute=anchor_nodes_label_name,
        eval_set=eval_set,
        parameters_variable_importance=self.parameters_variable_importance,
        patience=self.patience,
    )

    # Best-model bookkeeping is detected with hasattr() below, so values
    # left over from a previous fit() on this instance must be dropped;
    # otherwise the new run would compare against (and possibly truncate
    # to) the old run's best iteration.
    for stale_attribute in (
        "_best_eval_mse_",
        "_best_iteration_",
        "_best_base_learners_count_",
    ):
        if hasattr(self, stale_attribute):
            delattr(self, stale_attribute)

    # NOTE(review): the flag is raised before training finishes; it is
    # left here because helpers invoked during fitting may rely on it.
    self.is_fitted_ = True

    self.name_of_label_attribute_ = anchor_nodes_label_name

    self.paths_selected_by_epb_ = set()
    self._initialize_path_boosting(
        X=X,
        list_anchor_nodes_labels=list_anchor_nodes_labels,
        main_label_name=anchor_nodes_label_name,
        eval_set=eval_set,
    )

    if self.parameters_variable_importance is not None:
        self.class_variable_importance_: VariableImportance_ForSequentialPathBoost = VariableImportance_ForSequentialPathBoost(
            **self.parameters_variable_importance
        )

    # Log training start
    logger.info(
        f"Starting SequentialPathBoost training with {len(X)} samples,"
        f" {self.n_iter} max iterations"
    )
    logger.debug(
        f"Training parameters: learning_rate={self.learning_rate},"
        f" max_path_length={self.max_path_length}"
    )

    # Set up iterator with optional progress bar
    iterator = range(self.n_iter)
    if self.verbose and TQDM_AVAILABLE:
        iterator = tqdm(
            iterator,
            desc="SequentialPathBoost",
            unit="iter",
            bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
        )

    for n_iteration in iterator:
        if self.verbose and not TQDM_AVAILABLE:
            logger.info(f"iteration number: {n_iteration + 1}")
        logger.debug(f"Starting iteration {n_iteration + 1}")

        # Flag consulted when computing variable importance, to make sure
        # it is computed on the right iteration, with the right EBM.
        self._ebm_has_been_expanded_in_this_iteration = False

        if n_iteration == 0:
            best_path = self._find_best_path(
                train_ebm_dataframe=self.train_ebm_dataframe_,
                y=y,
                SelectorClass=self.SelectorClass,
                kwargs_for_selector=self.kwargs_for_selector,
            )
        else:
            negative_gradient = AdditiveModelWrapper._neg_gradient(
                y=y,
                y_hat=np.array(
                    self.base_learner_._last_train_prediction.to_numpy()
                ),
            )
            best_path = self._find_best_path(
                train_ebm_dataframe=self.train_ebm_dataframe_,
                y=pd.Series(negative_gradient),
                SelectorClass=self.SelectorClass,
                kwargs_for_selector=self.kwargs_for_selector,
            )

        # Log selected path
        logger.debug(f"Iteration {n_iteration + 1}: selected path {best_path}")

        if self.verbose:
            if TQDM_AVAILABLE:
                # Update tqdm postfix with best path info
                if hasattr(iterator, "set_postfix"):
                    iterator.set_postfix({"path": str(best_path)[:30]})
            else:
                logger.info(f"Best path: {best_path}")

        # Collect values for variable importance. This must happen between
        # the selection of the best path and the expansion of the EBM.
        if self.parameters_variable_importance is not None:
            self.class_variable_importance_._update(
                path_boost=self,
                selected_path=best_path,
                iteration_number=n_iteration,
                # The first iteration works on the raw target, later ones
                # on the negative gradient computed above.
                gradient=y if n_iteration == 0 else negative_gradient,
            )

        # expand the EVAL set in order to contain the selected columns path
        self._expand_eval_ebm_dataframe_with_best_path(
            best_path=best_path,
            anchor_node_label_name=anchor_nodes_label_name,
            eval_set=eval_set,
        )

        self.base_learner_.fit_one_step(
            X=self.train_ebm_dataframe_,
            y=y,
            best_path=best_path,
            eval_set=self.eval_set_ebm_df_and_target_,
        )

        if eval_set is not None:
            eval_mse_histories = self.base_learner_.eval_sets_mse
            # Only the first evaluation set drives best-model tracking and
            # early stopping; both need a non-empty MSE history.
            if len(eval_mse_histories) > 0 and len(eval_mse_histories[0]) > 0:
                current_eval_mse = eval_mse_histories[0][-1]
                if (
                    not hasattr(self, "_best_eval_mse_")
                    or current_eval_mse < self._best_eval_mse_
                ):
                    self._best_eval_mse_ = current_eval_mse
                    self._best_iteration_ = n_iteration
                    self._best_base_learners_count_ = len(
                        self.base_learner_.base_learners_list
                    )

                if self._check_if_stop_early(
                    mse_eval_set=eval_mse_histories[0],
                    patience=self.patience,
                    target_error=self.target_error,
                    tol=self.tol,
                ):
                    logger.info(
                        f"Early stopping at iteration {n_iteration + 1} (best MSE:"
                        f" {self._best_eval_mse_:.6f})"
                    )
                    # NOTE(review): this overwrites the constructor
                    # hyper-parameter with the 0-based index of the last
                    # iteration; kept as-is because code outside this
                    # method may read n_iter after fitting.
                    self.n_iter = n_iteration
                    break

        # expand the ebm dataframe with the new columns starting from the selected path
        self._expand_ebm_dataframe(
            X=X, selected_path=best_path, main_label_name=anchor_nodes_label_name
        )

    self.train_mse_ = self.base_learner_.train_mse
    self.train_mae_ = self.base_learner_.train_mae

    if self.parameters_variable_importance is not None:
        self.variable_importance_: dict = (
            self.class_variable_importance_.compute_variable_importance(
                path_boost=self
            )
        )

    if eval_set is not None:
        self.eval_sets_mse_ = self.base_learner_.eval_sets_mse
        self.eval_sets_mae_ = self.base_learner_.eval_sets_mae

    self.columns_names_ = self.train_ebm_dataframe_.columns

    # Restore best model if enabled and we have a best iteration
    if (
        self.restore_best_model
        and eval_set is not None
        and hasattr(self, "_best_iteration_")
    ):
        final_iteration = len(self.base_learner_.base_learners_list) - 1
        if self._best_iteration_ < final_iteration:
            keep = self._best_base_learners_count_
            wrapper = self.base_learner_

            # Truncate the ensemble and its per-iteration records
            wrapper.base_learners_list = wrapper.base_learners_list[:keep]
            wrapper.considered_columns = wrapper.considered_columns[:keep]
            wrapper._target_variable_mean_ = wrapper._target_variable_mean_[:keep]
            wrapper.train_mse = wrapper.train_mse[:keep]
            wrapper.train_mae = wrapper.train_mae[:keep]

            # Also truncate eval set metrics (in place, per evaluation set)
            for i in range(len(wrapper.eval_sets_mse)):
                wrapper.eval_sets_mse[i] = wrapper.eval_sets_mse[i][:keep]
                wrapper.eval_sets_mae[i] = wrapper.eval_sets_mae[i][:keep]

            # Update the stored metrics
            self.train_mse_ = wrapper.train_mse
            self.train_mae_ = wrapper.train_mae
            self.eval_sets_mse_ = wrapper.eval_sets_mse
            self.eval_sets_mae_ = wrapper.eval_sets_mae

            logger.info(
                "Restored model to best iteration"
                f" {self._best_iteration_ + 1} with eval MSE"
                f" {self._best_eval_mse_:.6f}"
            )

    # Log training completion
    final_mse = self.train_mse_[-1] if len(self.train_mse_) > 0 else None
    logger.info(
        f"Training completed: {len(self.train_mse_)} iterations, final train MSE:"
        f" {final_mse}"
    )

    return self
490
+
491
+ def _check_if_stop_early(
492
+ self,
493
+ mse_eval_set: list[float],
494
+ patience: int | None = None,
495
+ target_error: float | None = None,
496
+ tol: float = 1e-4,
497
+ ) -> bool:
498
+ """
499
+ Determines whether to stop the training process early based on evaluation metrics.
500
+
501
+ Early stopping can be triggered under two conditions:
502
+ 1. If a `target_error` is specified: Training stops if the Mean Squared Error (MSE)
503
+ on the (first) evaluation set falls at or below this target.
504
+ 2. If `patience` is specified: Training stops if the MSE on the (first) evaluation
505
+ set has not improved by at least `tol` over `patience` iterations. This prevents
506
+ stopping on insignificant improvements.
507
+
508
+ Parameters
509
+ ----------
510
+ mse_eval_set : list[float]
511
+ A list of Mean Squared Errors (MSE) recorded for the first evaluation set
512
+ at each iteration so far.
513
+ patience : int or None, optional
514
+ The number of iterations to wait for an improvement before stopping.
515
+ If None, this condition for early stopping is disabled.
516
+ target_error : float or None, optional
517
+ A specific MSE value. If the evaluation MSE reaches this value or lower,
518
+ training stops. If None, this condition is disabled.
519
+ tol : float, default=1e-4
520
+ Minimum improvement required to consider as "improvement".
521
+ If MSE decreases by less than tol, it's not considered an improvement.
522
+
523
+ Returns
524
+ -------
525
+ bool
526
+ True if the conditions for early stopping are met, False otherwise.
527
+ Returns False if `patience` is None and `target_error` is None, or if
528
+ insufficient iterations have passed to evaluate the patience condition.
529
+ """
530
+
531
+ if target_error is not None:
532
+ # If a target error is specified, check if the last MSE is less than or equal to the target error
533
+ if mse_eval_set and mse_eval_set[-1] <= target_error:
534
+ return True
535
+ else:
536
+ return False
537
+
538
+ if patience is None:
539
+ return False
540
+
541
+ if len(mse_eval_set) < patience:
542
+ return False
543
+
544
+ # Check if improvement over `patience` iterations is below tolerance
545
+ # Compare current MSE to MSE from `patience` iterations ago
546
+ old_mse = mse_eval_set[-patience]
547
+ current_mse = mse_eval_set[-1]
548
+ improvement = old_mse - current_mse
549
+
550
+ # Stop if no significant improvement
551
+ return improvement < tol
552
+
553
def _expand_eval_ebm_dataframe_with_best_path(
    self, best_path, anchor_node_label_name, eval_set=None
):
    """
    Add the columns related to `best_path` to every evaluation EBM.

    Only columns tied to the selected path are added; no new paths are
    explored. The column names are taken from the training EBM, and only
    those not yet present in an evaluation EBM are generated for it.
    Entries of `eval_set` that are None are skipped. Does nothing when
    `eval_set` is None.
    """
    if eval_set is None:
        return

    path_columns = ExtendedBoostingMatrix.get_columns_related_to_path(
        best_path, self.train_ebm_dataframe_.columns
    )

    for position, entry in enumerate(eval_set):
        if entry is None:
            continue

        eval_graphs, _eval_target = entry
        # Each slot is a two-item list [ebm_dataframe, target]; the first
        # item is replaced in place below.
        slot = self.eval_set_ebm_df_and_target_[position]
        current_ebm = slot[0]

        # find the new columns in the eval set
        absent_columns = [
            name for name in path_columns if name not in current_ebm.columns
        ]
        generated = ExtendedBoostingMatrix.generate_new_columns_from_columns_names(
            dataset=eval_graphs,
            ebm_to_be_expanded=current_ebm,
            columns_names=absent_columns,
            main_label_name=anchor_node_label_name,
            replace_nan_with=self.replace_nan_with,
        )
        slot[0] = pd.concat([current_ebm, generated], axis=1)
590
+
591
def generate_ebm_for_dataset(self, dataset: "list[nx.Graph]", columns_names=None):
    """
    Generates an Extended Boosting Matrix (EBM) for a given dataset of graphs.

    If `columns_names` is provided, the EBM is generated for exactly these
    columns. Otherwise the columns are those related to the paths stored
    in `self.paths_selected_by_epb_`, looked up among
    `self.columns_names_`; duplicates are removed while keeping the order
    in which the columns were first collected.

    Parameters
    ----------
    dataset : list[nx.Graph]
        Graphs for which to generate the EBM.
    columns_names : list[str], optional
        Column names to generate. Defaults to None (derive them from the
        paths selected during fitting).

    Returns
    -------
    pd.DataFrame
        The generated Extended Boosting Matrix.

    Raises
    ------
    AssertionError
        If the model has not been fitted yet.
    """
    # getattr() keeps the documented AssertionError for an estimator that
    # was never fitted (the attribute does not exist before fit()).
    assert getattr(self, "is_fitted_", False), "the model has not been fitted yet"

    if columns_names is None:
        collected = []
        for path in self.paths_selected_by_epb_:
            collected.extend(
                ExtendedBoostingMatrix.get_columns_related_to_path(
                    path=path, columns_names=self.columns_names_
                )
            )
        # Order-preserving de-duplication (a plain set() would yield an
        # arbitrary column order).
        columns_names = list(dict.fromkeys(collected))

    return ExtendedBoostingMatrix.generate_new_columns_from_columns_names(
        dataset=dataset,
        columns_names=columns_names,
        main_label_name=self.name_of_label_attribute_,
        replace_nan_with=self.replace_nan_with,
    )
638
+
639
+ def predict(
640
+ self,
641
+ X: list[nx.Graph] | nx.Graph | None = None,
642
+ ebm_dataframe: pd.DataFrame | None = None,
643
+ ) -> list[numbers.Number]:
644
+ """
645
+ Predicts target values for the given input data.
646
+
647
+ The method can accept either a list of NetworkX graphs (`X`) or a pre-computed
648
+ Extended Boosting Matrix (`ebm_dataframe`).
649
+
650
+ Parameters
651
+ ----------
652
+ X : list[nx.Graph] | None, default=None
653
+ A list of NetworkX graph objects for which to make predictions.
654
+ Required if `ebm_dataframe` is not provided.
655
+ ebm_dataframe : pd.DataFrame | None, default=None
656
+ A pre-computed Extended Boosting Matrix. If provided, this matrix will be
657
+ used directly for prediction, bypassing the EBM generation from `X`.
658
+ Required if `X` is not provided.
659
+
660
+ Returns
661
+ -------
662
+ list[numbers.Number]
663
+ A list of predicted numerical values for the input samples.
664
+
665
+ Raises
666
+ ------
667
+ AssertionError
668
+ If the model has not been fitted yet (i.e., `fit` has not been called).
669
+ If neither `X` nor `ebm_dataframe` is provided.
670
+ """
671
+ assert X is not None or ebm_dataframe is not None
672
+ assert self.is_fitted_
673
+ if ebm_dataframe is None:
674
+ if isinstance(X, nx.Graph):
675
+ X = [X]
676
+ ebm_dataframe = self.generate_ebm_for_dataset(dataset=X)
677
+ return self.base_learner_.predict(ebm_dataframe)
678
+
679
+ def predict_step_by_step(
680
+ self,
681
+ X: list[nx.Graph] | nx.Graph | None = None,
682
+ ebm_dataframe: pd.DataFrame | None = None,
683
+ ) -> list[np.array]:
684
+ """
685
+ Generates predictions for each input sample at each boosting iteration.
686
+
687
+ This method takes either a list of NetworkX graphs or a precomputed
688
+ Extended Boosting Matrix (EBM) as input. It uses the trained base learner
689
+ to make predictions iteratively, returning a list where each element
690
+ is an array of predictions for all samples at a specific boosting step.
691
+
692
+ Parameters
693
+ ----------
694
+ X : list[nx.Graph] | None, default=None
695
+ A list of NetworkX graph objects for which to generate predictions.
696
+ Either `X` or `ebm_dataframe` must be provided.
697
+ ebm_dataframe : pd.DataFrame | None, default=None
698
+ A precomputed Extended Boosting Matrix. If provided, `X` is ignored.
699
+ Either `X` or `ebm_dataframe` must be provided.
700
+
701
+ Returns
702
+ -------
703
+ list[np.array]
704
+ A list of NumPy arrays. Each array contains the predictions for all
705
+ input samples at a specific boosting iteration. The outer list
706
+ corresponds to the iterations, and the inner arrays contain
707
+ the predictions.
708
+
709
+ Raises
710
+ ------
711
+ AssertionError
712
+ If the model has not been fitted (i.e., `self.is_fitted_` is False).
713
+ AssertionError
714
+ If both `X` and `ebm_dataframe` are None.
715
+ """
716
+ assert X is not None or ebm_dataframe is not None
717
+ assert self.is_fitted_
718
+ if ebm_dataframe is None:
719
+ if isinstance(X, nx.Graph):
720
+ X = [X]
721
+ ebm_dataframe = self.generate_ebm_for_dataset(dataset=X)
722
+ return self.base_learner_.predict_step_by_step(ebm_dataframe)
723
+
724
+ def evaluate(
725
+ self,
726
+ X: list[nx.Graph] | nx.Graph | None = None,
727
+ y=None,
728
+ ebm_dataframe: pd.DataFrame | None = None,
729
+ ):
730
+ """
731
+ Evaluates the model on the given dataset and returns the Mean Squared Error (MSE) for each iteration.
732
+
733
+ Parameters
734
+ ----------
735
+ X : list[nx.Graph] | None, default=None
736
+ A list of NetworkX graph objects representing the input samples.
737
+ y : array-like
738
+ The true target values corresponding to `X` or `ebm_dataframe`.
739
+ ebm_dataframe : pd.DataFrame | None, default=None
740
+ A pre-generated Extended Boosting Matrix for the input samples.
741
+ If provided, `X` is ignored for EBM generation.
742
+
743
+ Returns
744
+ -------
745
+ list[float]
746
+ A list of float values, where each value is the Mean Squared Error
747
+ of the model on the provided dataset at a specific boosting iteration.
748
+ The length of the list corresponds to the number of boosting iterations (`n_iter`).
749
+
750
+ Raises
751
+ ------
752
+ AssertionError
753
+ If `y` is None.
754
+ If both `X` and `ebm_dataframe` are None.
755
+ If the model has not been fitted yet (i.e., `fit` has not been called).
756
+ """
757
+ # it returns the evolution of the mse for each iteration
758
+ assert y is not None
759
+ assert X is not None or ebm_dataframe is not None
760
+ assert self.is_fitted_
761
+ if ebm_dataframe is None:
762
+ if isinstance(X, nx.Graph):
763
+ X = [X]
764
+ ebm_dataframe = self.generate_ebm_for_dataset(dataset=X)
765
+ return self.base_learner_.evaluate(ebm_dataframe, y)
766
+
767
def _expand_ebm_dataframe(
    self, X: list[nx.Graph], selected_path, main_label_name: str
):
    """
    Record `selected_path` and, when allowed, extend the training EBM.

    The per-iteration flag `_ebm_has_been_expanded_in_this_iteration` is
    always raised. A path that was already selected is left alone. A new
    path is added to `paths_selected_by_epb_`; only if it is shorter than
    `max_path_length` are the columns derived from it appended to
    `train_ebm_dataframe_`.
    """
    self._ebm_has_been_expanded_in_this_iteration = True

    already_selected = selected_path in self.paths_selected_by_epb_
    if already_selected:
        return

    reached_max_length = len(selected_path) >= self.max_path_length
    self.paths_selected_by_epb_.add(selected_path)
    if reached_max_length:
        # Paths at the length limit are remembered but not extended.
        return

    extension_columns = (
        ExtendedBoostingMatrix.new_columns_to_expand_ebm_dataframe_with_path(
            dataset=X,
            selected_path=selected_path,
            main_label_name=main_label_name,
            df_to_be_expanded=self.train_ebm_dataframe_,
            replace_nan_with=self.replace_nan_with,
        )
    )
    widened = pd.concat([self.train_ebm_dataframe_, extension_columns], axis=1)
    self.train_ebm_dataframe_ = widened
789
+
790
def _initialize_path_boosting(
    self,
    X,
    list_anchor_nodes_labels: list,
    main_label_name: str,
    eval_set: list[tuple[list[nx.Graph], Iterable]] = None,
):
    """
    Build the initial state needed by the boosting loop.

    Creates the training EBM from the anchor-node attributes, one
    `[ebm_dataframe, target]` pair per evaluation set (None is kept for
    entries of `eval_set` that are None), and a fresh
    `AdditiveModelWrapper` stored in `self.base_learner_`.
    """
    self.name_of_label_attribute = main_label_name

    def initial_ebm_for(graphs):
        # Same construction for the training and every evaluation dataset.
        return ExtendedBoostingMatrix.initialize_boosting_matrix_with_anchor_nodes_attributes(
            dataset=graphs,
            list_anchor_nodes_labels=list_anchor_nodes_labels,
            id_label_name=main_label_name,
            replace_nan_with=self.replace_nan_with,
        )

    # create extended boosting matrix for train dataset
    self.train_ebm_dataframe_ = initial_ebm_for(X)

    # generate extended boosting matrix for eval dataset
    self.eval_set_ebm_df_and_target_ = []
    if eval_set is not None:
        for entry in eval_set:
            if entry is None:
                self.eval_set_ebm_df_and_target_.append(None)
                continue
            eval_graphs, eval_target = entry
            eval_ebm = initial_ebm_for(eval_graphs)
            self.eval_set_ebm_df_and_target_.append([eval_ebm, eval_target])

    # initialize base learner wrapper
    self.base_learner_: AdditiveModelWrapper = AdditiveModelWrapper(
        BaseModelClass=self.BaseLearnerClass,
        base_model_class_kwargs=self.kwargs_for_base_learner,
        learning_rate=self.learning_rate,
        learning_rate_scheduler=self.learning_rate_scheduler,
    )
836
+
837
@staticmethod
def _find_best_path(
    train_ebm_dataframe: pd.DataFrame, y, SelectorClass, kwargs_for_selector
) -> tuple[int]:
    """
    Return the path whose frequency column the selector ranks highest.

    A selector is instantiated and fitted on the frequency part of the
    extended boosting matrix; the column with the largest entry in
    ``feature_importances_`` is then mapped back to its path.

    Note: this must stay a static method because it is also used by the
    variable importance class, to select variable importance by comparison.

    Parameters:
        train_ebm_dataframe (pd.DataFrame): Extended boosting matrix from
            which the frequency boosting matrix is extracted.
        y (array-like): The target values or negative gradient used to fit
            the selector.
        SelectorClass: Feature selector class (e.g., a regressor) exposing
            ``fit`` and ``feature_importances_``.
        kwargs_for_selector (dict): Keyword arguments for ``SelectorClass``.

    Returns:
        The path extracted from the name of the most important column.
    """
    selector = SelectorClass(**kwargs_for_selector)
    frequency_matrix = ExtendedBoostingMatrix.get_frequency_boosting_matrix(
        train_ebm_dataframe
    )

    # ``fit`` is expected to return the fitted selector; use its return value.
    selector = selector.fit(X=frequency_matrix, y=y)

    importances = np.array(selector.feature_importances_)
    top_column = frequency_matrix.columns[importances.argmax()]
    return ExtendedBoostingMatrix.get_path_from_column_name(top_column)
872
+
873
def _validate_data(
    self,
    X: list[nx.Graph] = "no_validation",
    y="no_validation",
    **check_params,
):
    """
    Validate ``X`` and/or ``y`` by delegating to ``util_validate_data``.

    Parameters
    ----------
    X : list of nx.Graph or the string "no_validation"
        Forwarded unchanged to ``util_validate_data``.
    y : array-like or the string "no_validation"
        Forwarded unchanged to ``util_validate_data``.
    **check_params
        Extra keyword arguments forwarded unchanged.

    Returns
    -------
    None
        Whatever ``util_validate_data`` returns is discarded.
    """
    # The model itself is passed along so the helper can inspect/update it.
    util_validate_data(
        model=self,
        X=X,
        y=y,
        **check_params,
    )
880
+
881
+ def plot_training_and_eval_errors(
882
+ self,
883
+ skip_first_n_iterations=False,
884
+ show=True,
885
+ save=False,
886
+ save_path: str | None = None,
887
+ ):
888
+ """
889
+ Plots the training and evaluation set errors (Mean Squared Error) over iterations.
890
+
891
+ This method visualizes the progression of the training error and, if
892
+ evaluation sets were provided during fitting, their respective errors
893
+ across the boosting iterations.
894
+
895
+ Parameters
896
+ ----------
897
+ skip_first_n_iterations : int or bool, default=False
898
+ If True, a default number of initial iterations (calculated based on
899
+ learning rate) are skipped in the plot, as early iterations can sometimes
900
+ be outliers.
901
+ If an integer, that specific number of initial iterations' errors are skipped.
902
+ If False or 0, all iterations' errors are plotted.
903
+ The actual skipping logic is handled by the underlying
904
+ `plot_training_and_eval_errors` utility function.
905
+ show : bool, default=True
906
+ If True, the plot is displayed.
907
+ save : bool, default=False
908
+ If True, the plot is saved to a file.
909
+ save_path : str | None, default=None
910
+ The directory where the plot will be saved. If None, the current
911
+ working directory is used.
912
+
913
+ """
914
+ if hasattr(self, "fitted_"):
915
+ if not self.fitted_:
916
+ raise ValueError(
917
+ "The model has not been fitted yet. Please call fit() before"
918
+ " plotting."
919
+ )
920
+
921
+ if hasattr(self, "mse_eval_set_"):
922
+ eval_sets_mse = self.mse_eval_set_
923
+ else:
924
+ eval_sets_mse = None
925
+ plot_training_and_eval_errors(
926
+ learning_rate=self.learning_rate,
927
+ train_mse=self.train_mse_,
928
+ mse_eval_set=eval_sets_mse,
929
+ skip_first_n_iterations=skip_first_n_iterations,
930
+ show=show,
931
+ save=save,
932
+ save_path=save_path,
933
+ )
934
+
935
+ def plot_variable_importance(
936
+ self, top_n_features: int | None = None, show: bool = True
937
+ ):
938
+ """
939
+ Plots the computed variable importance scores.
940
+
941
+ This method visualizes the importance of features (paths) as determined
942
+ by the SequentialPathBoost model. It uses the `variable_importance_`
943
+ attribute, which is populated during the `fit` method if
944
+ `parameters_variable_importance` was provided at initialization.
945
+ The visual characteristics of the plot are guided by the settings
946
+ contained within `self.parameters_variable_importance`.
947
+ show : bool, default=True
948
+ If True, the plot is displayed.
949
+ """
950
+ if hasattr(self, "fitted_"):
951
+ if not self.fitted_:
952
+ raise ValueError(
953
+ "The model has not been fitted yet. Please call fit() before"
954
+ " plotting."
955
+ )
956
+
957
+ if self.parameters_variable_importance is None:
958
+ raise ValueError(
959
+ "Variable importance is not computed. Please set"
960
+ " parameters_variable_importance in the constructor."
961
+ )
962
+ plot_variable_importance_utils(
963
+ variable_importance=self.variable_importance_,
964
+ parameters_variable_importance=self.parameters_variable_importance,
965
+ top_n=top_n_features,
966
+ show=show,
967
+ )
968
+
969
def get_mse_for_patience(self, patience: int, eval_set_index: int = 0) -> float:
    """
    Return the evaluation MSE at which patience-based stopping would halt.

    The MSE history of the selected evaluation set is scanned in order,
    counting consecutive iterations whose MSE is greater than or equal to
    the MSE of the previous iteration. As soon as that count reaches
    ``patience`` the MSE of the current iteration is returned. If the count
    never reaches ``patience``, the MSE of the last iteration is returned.

    Parameters
    ----------
    patience : int
        Number of consecutive non-improving iterations that triggers the
        stop. A value <= 0 returns the MSE of the first iteration.
    eval_set_index : int, default=0
        Index of the evaluation set whose history is inspected.

    Returns
    -------
    float
        The MSE at the stopping iteration, or the final MSE if the patience
        condition is never met.

    Raises
    ------
    ValueError
        If the model has no ``fitted_`` attribute, if no evaluation MSE
        history is available, if ``eval_set_index`` is out of bounds, or if
        ``patience`` exceeds the number of recorded iterations.
    """
    if not hasattr(self, "fitted_"):
        raise ValueError(
            "The model has not been fitted yet. Please call fit() before getting"
            " MSE for patience."
        )

    # NOTE(review): the sibling methods read ``mse_eval_set_`` while this one
    # historically read ``eval_sets_mse_``. Accept either name so the method
    # works regardless of which one fit() populates -- confirm and unify.
    mse_histories = getattr(self, "eval_sets_mse_", None)
    if mse_histories is None:
        mse_histories = getattr(self, "mse_eval_set_", None)
    if mse_histories is None:
        raise ValueError(
            "The model has not been evaluated on any evaluation set. Please provide"
            " an eval_set during fitting."
        )

    if len(mse_histories) <= eval_set_index:
        raise ValueError(
            f"Eval set index {eval_set_index} is out of bounds for the number of"
            f" evaluation sets: {len(mse_histories)}."
        )

    history = mse_histories[eval_set_index]
    if len(history) < patience:
        raise ValueError(
            f"Patience {patience} exceeds the number of training iterations."
        )

    if patience <= 0:
        # Degenerate patience: stopping happens at the very first iteration.
        return history[0]

    # Start comparing from the SECOND iteration: the first value has no
    # predecessor, so comparing it with itself would wrongly count as a
    # non-improving step.
    remaining = iter(history)
    previous_mse = next(remaining)  # safe: len(history) >= patience >= 1
    consecutive_increases = 0
    for current_mse in remaining:
        if current_mse >= previous_mse:
            consecutive_increases += 1
        else:
            consecutive_increases = 0
        previous_mse = current_mse
        if consecutive_increases >= patience:
            return current_mse

    # The patience condition was never hit: report the last recorded MSE.
    return history[-1]
1009
+
1010
def get_final_eval_set_mse(self):
    """
    Return the last recorded MSE of every evaluation set.

    Returns
    -------
    list
        One value per entry of ``mse_eval_set_``: the final element of that
        entry's MSE history.

    Raises
    ------
    AttributeError
        If the model has no ``mse_eval_set_`` attribute.
    """
    if not hasattr(self, "mse_eval_set_"):
        raise AttributeError(
            "Evaluation set MSE is not available. Please fit the model with"
            " eval_set."
        )

    # The final MSE of each evaluation set is the last item of its history.
    return [history[-1] for history in self.mse_eval_set_]