path-boost 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1096 @@
1
+ """
2
+ Extended Path Boost - A gradient boosting algorithm for graph-structured data.
3
+
4
+ This module provides the PathBoost class, an ensemble learning method that builds
5
+ interpretable models for graph data by discovering path-based features.
6
+ """
7
+
8
+ # Authors: scikit-learn-contrib developers
9
+ # License: BSD 3 clause
10
+
11
+
12
+ import os
13
+ import logging
14
+
15
# Limit the number of threads spawned by the numerical backends (MKL, NumExpr,
# OpenMP) during parallelization. The variables are exported here, ahead of the
# numerical imports that follow in this module.

max_n_threads = 2
os.environ["MKL_NUM_THREADS"] = str(max_n_threads)
os.environ["NUMEXPR_NUM_THREADS"] = str(max_n_threads)
os.environ["OMP_NUM_THREADS"] = str(max_n_threads)
21
+
22
+ import numbers
23
+ import numpy as np
24
+ import warnings
25
+ import itertools
26
+ import multiprocessing as mp
27
+ import networkx as nx
28
+ import matplotlib.pyplot as plt
29
+
30
+ from .utils.classes.sequential_path_boost import SequentialPathBoost
31
+ from .utils import cyclic_path_boost_utils as wbu
32
+ from .utils.classes.interfaces.interface_base_learner import BaseLearnerClassInterface
33
+ from .utils.variable_importance_according_to_path_boost import (
34
+ VariableImportance_ForSequentialPathBoost,
35
+ )
36
+ from .utils.classes.interfaces.interface_selector import SelectorClassInterface
37
+ from .utils.validate_data import util_validate_data
38
+ from .utils.plots_functions import (
39
+ plot_training_and_eval_errors,
40
+ plot_variable_importance_utils,
41
+ )
42
+ from typing import Iterable, List, Tuple, Optional, Union, Dict, Any, Type
43
+ from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, _fit_context
44
+ from sklearn.metrics import mean_squared_error
45
+ from sklearn.utils.validation import check_is_fitted, validate_data
46
+ from sklearn.tree import DecisionTreeRegressor
47
+ from sklearn.base import RegressorMixin
48
+
49
# Type aliases for better readability
GraphList = List[nx.Graph]  # a dataset: one networkx graph per sample
AnchorLabel = Union[int, str, Tuple]  # a single anchor-node label
AnchorLabelList = List[AnchorLabel]  # the labels the data is partitioned by
EvalSet = List[Tuple[GraphList, Iterable]]  # list of (graphs, targets) pairs
54
+
55
# tqdm is an optional dependency: `PathBoost.fit` uses it to display a progress
# bar during sequential training when `verbose=True`, and falls back to a plain
# loop when it is not installed.
try:
    from tqdm import tqdm

    TQDM_AVAILABLE = True
except ImportError:
    TQDM_AVAILABLE = False

# Set up logger for the module
logger = logging.getLogger("path_boost")
64
+
65
+
66
+ class PathBoost(BaseEstimator, RegressorMixin):
67
+ """
68
+ PathBoost is an ensemble learning method that builds a model by iteratively fitting
69
+ SequentialPathBoost models on different subsets of the data, partitioned by anchor nodes.
70
+ It is designed for graph-based data where paths originating from specified anchor nodes
71
+ are used as features.
72
+
73
+ The model trains a separate `SequentialPathBoost` instance for each unique anchor node
74
+ label provided. Predictions are then aggregated (averaged) from these individual models.
75
+ It supports parallel training of the `SequentialPathBoost` models across multiple cores.
76
+
77
+ Parameters
78
+ ----------
79
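n_iter : int, default=100
    The number of boosting iterations to perform for each `SequentialPathBoost` model.
patience : int or None, default=None
    Forwarded unchanged to every `SequentialPathBoost` model.
target_error : float or None, default=None
    Forwarded unchanged to every `SequentialPathBoost` model.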
81
+ max_path_length : int, default=10
82
+ The maximum length of paths to consider as features.
83
+ learning_rate : float, default=0.1
84
+ The learning rate shrinks the contribution of each base learner in the `SequentialPathBoost` model.
85
+ m_stops : list[int], default=None
86
+ A list of iteration numbers at which to stop boosting for specific models.
87
+ Currently, this parameter is validated but not fully implemented in the core logic.
88
+ BaseLearnerClass : type, default=sklearn.tree.DecisionTreeRegressor
89
+ The class of the base learner to be used within each boosting iteration in the `SequentialPathBoost` model.
90
+ Must implement the `BaseLearnerClassInterface`.
91
+ kwargs_for_base_learner : dict, default=None
92
+ Keyword arguments to be passed to the constructor of the `BaseLearnerClass`.
93
+ SelectorClass : type, default=sklearn.tree.DecisionTreeRegressor
94
+ The class of the feature selector used to identify the best paths in each iteration.
95
+ Must implement the `SelectorClassInterface`.
96
+ kwargs_for_selector : dict, default=None
97
+ Keyword arguments to be passed to the constructor of the `SelectorClass`.
98
+ parameters_variable_importance : dict, default=None
99
+ Parameters for computing variable importance. If None, variable importance is not computed.
100
+ Expected keys include 'criterion' = 'absolute' or 'relative', 'error_used' = 'mse' or 'mae', 'use_correlation' = True or False, 'normalize' = True or False.
101
replace_nan_with : any, default=np.nan
    Value used to replace NaN values encountered during feature generation. This is needed for base learners, such as linear models, that cannot handle NaN values.
103
+ verbose : bool, default=False
104
+ If True, prints progress messages during fitting.
105
+ n_of_cores : int, default=1
106
+ The number of CPU cores to use for parallel training of `SequentialPathBoost` models.
107
+ If 1, training is sequential.
108
+ """
109
+
110
# This dictionary defines the accepted types for the parameters.
# It is used to validate the constructor parameters within the `_fit_context`
# decorator applied to `fit`.
# NOTE(review): several keys ("X", "y", "eval_set", "list_anchor_nodes_labels",
# "anchor_nodes_label_name") name `fit` arguments rather than constructor
# parameters; they are kept untouched because other validation helpers may
# read them — confirm before removing.
_parameter_constraints = {
    "n_iter": [numbers.Integral],
    "max_path_length": [numbers.Integral],
    "learning_rate": [numbers.Integral, numbers.Real],
    "target_error": [numbers.Real, None],
    # The constructor parameter is called `kwargs_for_base_learner`; without
    # this entry it was never validated. The historical key below is kept for
    # backward compatibility.
    "kwargs_for_base_learner": [dict, None],
    "base_learner_kwargs": [dict, None],
    "BaseLearnerClass": [type],
    "SelectorClass": [type],
    "kwargs_for_selector": [dict, None],
    "eval_set": [list[tuple[list[nx.Graph], Iterable]], None],
    "list_anchor_nodes_labels": [list[tuple]],
    "X": [list[nx.Graph]],
    "y": [Iterable],
    "anchor_nodes_label_name": [str],
    "verbose": [bool],
    "n_of_cores": [numbers.Integral],
    "parameters_variable_importance": [dict, None],
}
130
+
131
def __init__(
    self,
    n_iter=100,
    patience: int | None = None,
    target_error: float | None = None,
    max_path_length=10,
    learning_rate=0.1,
    m_stops: list[int] = None,
    BaseLearnerClass=DecisionTreeRegressor,
    kwargs_for_base_learner=None,
    SelectorClass=DecisionTreeRegressor,
    kwargs_for_selector=None,
    parameters_variable_importance=None,
    replace_nan_with=np.nan,
    verbose: bool = False,
    n_of_cores: int = 1,
):
    """Store the hyper-parameters as given; nothing is validated or copied here."""
    # Boosting schedule and stopping criteria.
    self.n_iter: int = n_iter
    self.patience: int | None = patience
    self.target_error: float | None = target_error
    self.m_stops: list[int] | None = m_stops

    # Path features and shrinkage.
    self.max_path_length: int = max_path_length
    self.learning_rate: float = learning_rate
    self.replace_nan_with = replace_nan_with

    # Base learner and path selector, with their constructor keyword arguments.
    self.BaseLearnerClass: type[BaseLearnerClassInterface] = BaseLearnerClass
    self.kwargs_for_base_learner: dict | None = kwargs_for_base_learner
    self.SelectorClass: type[SelectorClassInterface] = SelectorClass
    self.kwargs_for_selector: dict | None = kwargs_for_selector

    # Variable importance, reporting and parallelism.
    self.parameters_variable_importance = parameters_variable_importance
    self.verbose: bool = verbose
    self.n_of_cores = n_of_cores
162
+
163
@_fit_context(prefer_skip_nested_validation=True)
def fit(
    self,
    X: list[nx.Graph],
    y: Iterable,
    anchor_nodes_label_name: str,
    list_anchor_nodes_labels: list[tuple],
    eval_set: list[tuple[list[nx.Graph], Iterable]] | None = None,
):
    """
    Fits the PathBoost model to the training data.

    This method trains a `SequentialPathBoost` model for each anchor node label.
    The training data `X` and `y` are partitioned based on `list_anchor_nodes_labels`
    and `anchor_nodes_label_name`. Each partition is used to train a corresponding
    `SequentialPathBoost` model; a label with no training graph gets `None` instead
    of a model. If more than one core is usable, the models are trained in parallel.

    After training, the overall training Mean Squared Error (MSE) is computed and,
    if `eval_set` is provided, the MSE evolution for each evaluation set. Variable
    importance is computed when `parameters_variable_importance` is set.

    The estimator is flagged as fitted (`is_fitted_`) only once every sub-model has
    been trained. If anything fails, the ``"normalize"`` entry that is temporarily
    forced to False in `parameters_variable_importance` is restored before the
    exception propagates.

    Parameters
    ----------
    X : list[nx.Graph]
        A list of NetworkX graph objects representing the training input samples.
    y : Iterable
        The target values (real numbers in regression) corresponding to `X`.
    anchor_nodes_label_name : str
        The name of the node attribute used to identify the anchor nodes,
        e.g. "feature_atomic_number" if anchors are defined by the atomic number.
    list_anchor_nodes_labels : list[tuple]
        A list of labels for the anchor nodes. The data is partitioned based on
        these labels, and a separate `SequentialPathBoost` model is trained for each.
    eval_set : list[tuple[list[nx.Graph], Iterable]] | None, default=None
        A list of (X_eval, y_eval) tuples on which the MSE evolution is computed
        after training.

    Returns
    -------
    self : object
        The fitted PathBoost estimator.
    """
    # Configure logging based on verbose flag
    if self.verbose:
        path_boost_logger = logging.getLogger("path_boost")
        path_boost_logger.setLevel(logging.INFO)
        if not path_boost_logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter("%(message)s"))
            path_boost_logger.addHandler(handler)

    self._default_kwargs_for_base_learner = {
        "max_depth": 3,
        "random_state": 0,
        "splitter": "best",
        "criterion": "squared_error",
    }

    self._default_kwargs_for_selector = {
        "max_depth": 1,
        "random_state": 0,
        "splitter": "best",
        "criterion": "squared_error",
    }

    self.anchor_nodes_label_name_ = anchor_nodes_label_name
    self.list_anchor_nodes_labels_ = list_anchor_nodes_labels

    X, y = self._validate_data(
        X=X,
        y=y,
        list_anchor_nodes_labels=list_anchor_nodes_labels,
        eval_set=eval_set,
        m_stops=self.m_stops,
        name_of_label_attribute=anchor_nodes_label_name,
        parameters_variable_importance=self.parameters_variable_importance,
        patience=self.patience,
    )

    # If variable importance is used, the sub-models must not normalize their
    # importances: the requested setting is remembered and applied when the
    # importances are combined.
    if self.parameters_variable_importance is not None:
        self.normalize_path_importance_: bool = (
            self.parameters_variable_importance.get("normalize", False)
        )
        self.parameters_variable_importance["normalize"] = False

    try:
        self._fit_anchor_models(
            X=X,
            y=y,
            anchor_nodes_label_name=anchor_nodes_label_name,
            eval_set=eval_set,
        )
    except BaseException:
        # Do not leave the caller's dictionary with the temporary override.
        if self.parameters_variable_importance is not None:
            self.parameters_variable_importance["normalize"] = (
                self.normalize_path_importance_
            )
        raise

    # `fit` should always return `self`
    return self

def _fit_anchor_models(self, X, y, anchor_nodes_label_name, eval_set):
    """Train one sub-model per anchor label, then compute errors and importances."""
    # divide the training dataset by anchor label
    indexes_of_train_graphs_for_each_anchor_label: list[list[int]] = (
        wbu.split_dataset_by_metal_centers(
            graphs_list=X,
            anchor_nodes_label_name=self.anchor_nodes_label_name_,
            anchor_nodes=self.list_anchor_nodes_labels_,
        )
    )

    train_datasets_for_each_anchor_label = []
    train_labels_for_each_anchor_label = []

    self.models_list_: list[SequentialPathBoost] = []

    m_stops_counter = 0
    # create a train dataset and an (untrained) model for every anchor label
    for i, _ in enumerate(self.list_anchor_nodes_labels_):
        train_indexes = indexes_of_train_graphs_for_each_anchor_label[i]
        train_dataset = [X[index] for index in train_indexes]
        train_labels = [y[index] for index in train_indexes]
        train_datasets_for_each_anchor_label.append(train_dataset)
        train_labels_for_each_anchor_label.append(train_labels)

        if len(train_dataset) == 0:
            # no training data for this label: keep the position with None
            self.models_list_.append(None)
            continue

        n_iter = self.n_iter
        # `m_stops` holds either one entry per anchor label or one entry per
        # model that is actually trained; the two cases are told apart by length.
        if self.m_stops is not None:
            if len(self.m_stops) == len(self.list_anchor_nodes_labels_):
                n_iter = self.m_stops[i]
            else:
                n_iter = self.m_stops[m_stops_counter]
                m_stops_counter += 1

        self.models_list_.append(
            SequentialPathBoost(
                n_iter=n_iter,
                patience=self.patience,
                target_error=self.target_error,
                max_path_length=self.max_path_length,
                learning_rate=self.learning_rate,
                BaseLearnerClass=self.BaseLearnerClass,
                SelectorClass=self.SelectorClass,
                kwargs_for_base_learner=self.kwargs_for_base_learner,
                kwargs_for_selector=self.kwargs_for_selector,
                parameters_variable_importance=self.parameters_variable_importance,
                replace_nan_with=self.replace_nan_with,
                verbose=self.verbose,
            )
        )

    # One work item per anchor label, consumed by `wbu.train_pattern_boosting`.
    input_for_parallelization = list(
        zip(
            self.models_list_,
            train_datasets_for_each_anchor_label,
            train_labels_for_each_anchor_label,
            self.list_anchor_nodes_labels_,
            [anchor_nodes_label_name] * len(self.list_anchor_nodes_labels_),
        )
    )

    number_of_effective_trained_models: int = sum(
        1 for model in self.models_list_ if model is not None
    )
    number_of_cores_used = min(
        mp.cpu_count(), self.n_of_cores, number_of_effective_trained_models
    )
    if number_of_cores_used <= 1:
        # Sequential training, with an optional progress bar.
        iterator = range(len(input_for_parallelization))
        if self.verbose and TQDM_AVAILABLE:
            iterator = tqdm(
                iterator,
                desc="Training anchor models",
                unit="model",
                bar_format=(
                    "{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"
                ),
            )
        path_boosting_models = [
            wbu.train_pattern_boosting(input_for_parallelization[i])
            for i in iterator
        ]
    else:
        with mp.get_context("spawn").Pool(number_of_cores_used) as pool:
            path_boosting_models = pool.map(
                wbu.train_pattern_boosting, input_for_parallelization
            )

    self.models_list_ = path_boosting_models
    # Only now is every sub-model trained: flag the estimator as fitted.
    self.is_fitted_ = True

    self.train_mse_ = self._compute_train_mse(
        number_of_observations_for_each_model=[
            len(dataset) for dataset in train_datasets_for_each_anchor_label
        ]
    )

    if eval_set is not None:
        self.mse_eval_set_ = [
            self.evaluate(X=eval_tuple[0], y=eval_tuple[1])
            for eval_tuple in eval_set
        ]

    if self.parameters_variable_importance is not None:
        self.compute_variable_importance()
371
+
372
def compute_variable_importance(self):
    """
    Combine the variable importances of the trained sub-models.

    The ``"normalize"`` setting remembered during `fit` is written back into
    `parameters_variable_importance`, and the combined result is stored in
    `variable_importance_`.

    Raises
    ------
    ValueError
        If `parameters_variable_importance` was not set in the constructor.
    """
    if self.parameters_variable_importance is None:
        raise ValueError(
            "Variable importance is not computed. Please set"
            " parameters_variable_importance in the constructor."
        )

    self.parameters_variable_importance["normalize"] = (
        self.normalize_path_importance_
    )

    self.variable_importance_ = VariableImportance_ForSequentialPathBoost(
        **self.parameters_variable_importance,
    ).combine_variable_importance_from_list_of_sequential_models(
        sequential_models=self.models_list_,
    )
382
+
383
+ def _compute_train_mse(self, number_of_observations_for_each_model: list[int]):
384
+ train_mse = np.zeros(self.n_iter)
385
+ for i, smc_model in enumerate(self.models_list_):
386
+ if smc_model is not None:
387
+ train_mse += (
388
+ np.array(smc_model.train_mse_)
389
+ * number_of_observations_for_each_model[i]
390
+ )
391
+ train_mse = train_mse / sum(number_of_observations_for_each_model)
392
+ return train_mse
393
+
394
def predict(self, X: List[nx.Graph]) -> List[float]:
    """
    Predict target values for the input graphs.

    This method partitions the input graphs by anchor nodes, generates
    predictions using the corresponding trained SequentialPathBoost models,
    and averages predictions across models for graphs with multiple anchor types.
    A graph for which no trained model produces a prediction (including graphs
    whose only anchor labels had no training data) is assigned 0.

    Parameters
    ----------
    X : List[nx.Graph]
        A list of NetworkX graph objects for which to make predictions.

    Returns
    -------
    predictions : List[float]
        Predicted target values for each input graph.

    Raises
    ------
    sklearn.exceptions.NotFittedError
        If the model has not been fitted yet.

    Examples
    --------
    >>> predictions = model.predict(X_test)  # doctest: +SKIP
    >>> print(f"Mean prediction: {np.mean(predictions):.3f}")  # doctest: +SKIP
    """
    # Check if fit had been called
    check_is_fitted(self)
    X = self._validate_data(X=X)

    n_anchor_labels = len(self.list_anchor_nodes_labels_)

    # divide the input by the anchor node
    indexes_of_graphs_for_each_anchor_label: list[list[int]] = (
        wbu.split_dataset_by_metal_centers(
            graphs_list=X,
            anchor_nodes_label_name=self.anchor_nodes_label_name_,
            anchor_nodes=self.list_anchor_nodes_labels_,
        )
    )

    # create the dataset for each anchor node
    datasets_for_each_anchor_label = [
        [X[index] for index in indexes_of_graphs_for_each_anchor_label[i]]
        for i in range(n_anchor_labels)
    ]

    # Anchor positions that actually hold a trained model.
    trained_positions = [
        i for i in range(n_anchor_labels) if self.models_list_[i] is not None
    ]
    number_of_dataset_to_be_predicted = sum(
        1 for dataset in datasets_for_each_anchor_label if len(dataset) != 0
    )
    number_of_cores_used = min(
        mp.cpu_count(),
        self.n_of_cores,
        number_of_dataset_to_be_predicted,
        len(trained_positions),
    )

    # One entry per anchor label; None where no model was trained.
    predictions_for_each_anchor_node = [None] * n_anchor_labels
    if number_of_cores_used <= 1:
        for i in trained_positions:
            predictions_for_each_anchor_node[i] = wbu.parallel_predict(
                input_from_parallelization=(
                    self.models_list_[i],
                    datasets_for_each_anchor_label[i],
                )
            )
    else:
        # As in the sequential branch, only trained models are asked to predict.
        input_for_parallelization = [
            (self.models_list_[i], datasets_for_each_anchor_label[i])
            for i in trained_positions
        ]
        with mp.get_context("spawn").Pool(number_of_cores_used) as pool:
            parallel_results = pool.map(
                wbu.parallel_predict, input_for_parallelization
            )
        for i, result in zip(trained_positions, parallel_results):
            predictions_for_each_anchor_node[i] = result

    # For every graph, collect the prediction of each anchor model that saw it
    # (keyed by anchor position, so a repeated index overwrites, not duplicates).
    votes_for_each_graph = [{} for _ in range(len(X))]
    for anchor_node_number in range(n_anchor_labels):
        anchor_predictions = predictions_for_each_anchor_node[anchor_node_number]
        if anchor_predictions is None:
            # No model for this label: its graphs get no vote from it.
            continue
        graph_indexes = indexes_of_graphs_for_each_anchor_label[anchor_node_number]
        for position, graph_number in enumerate(graph_indexes):
            votes_for_each_graph[graph_number][anchor_node_number] = (
                anchor_predictions[position]
            )

    # Average the available predictions of each graph, ignoring None values
    predictions = []
    for votes in votes_for_each_graph:
        available = [value for value in votes.values() if value is not None]
        predictions.append(np.mean(available) if len(available) > 0 else 0)

    predictions = [
        x if x is not None and not np.isnan(x) else 0 for x in predictions
    ]

    return predictions
523
+
524
def predict_step_by_step(self, X: list[nx.Graph]) -> list[list[float]]:
    """
    Predicts the target values for the input graphs at every boosting iteration.

    The input graphs are partitioned by anchor node, each trained sub-model
    produces its step-by-step predictions for its partition, and for every
    iteration the predictions available for a graph are averaged. A graph with
    no available prediction is assigned 0.

    If a sub-model recorded fewer than `n_iter` steps, its last recorded step
    is reused for the remaining iterations.

    Parameters
    ----------
    X : list[nx.Graph]
        A list of networkx graph objects to be used for prediction.

    Returns
    -------
    list[list[float]]
        One inner list per iteration (`n_iter` in total), each holding one
        prediction per input graph.
    """
    # Check if fit had been called
    check_is_fitted(self)
    X = self._validate_data(X=X)

    n_anchor_labels = len(self.list_anchor_nodes_labels_)

    # divide the input by the anchor node
    indexes_of_graphs_for_each_anchor_label: list[list[int]] = (
        wbu.split_dataset_by_metal_centers(
            graphs_list=X,
            anchor_nodes_label_name=self.anchor_nodes_label_name_,
            anchor_nodes=self.list_anchor_nodes_labels_,
        )
    )

    # create the dataset for each anchor node
    datasets_for_each_anchor_label = [
        [X[index] for index in indexes_of_graphs_for_each_anchor_label[i]]
        for i in range(n_anchor_labels)
    ]

    # Anchor positions that actually hold a trained model.
    trained_positions = [
        i for i in range(n_anchor_labels) if self.models_list_[i] is not None
    ]
    number_of_dataset_to_be_predicted = sum(
        1 for dataset in datasets_for_each_anchor_label if len(dataset) != 0
    )
    number_of_cores_used = min(
        mp.cpu_count(),
        self.n_of_cores,
        number_of_dataset_to_be_predicted,
        len(trained_positions),
    )

    # Step-by-step predictions per anchor label, indexed [iteration][graph];
    # None where no model was trained.
    step_by_step_predictions_for_each_anchor_node = [None] * n_anchor_labels
    if number_of_cores_used <= 1:
        for i in trained_positions:
            step_by_step_predictions_for_each_anchor_node[i] = (
                wbu.parallel_predict_step_by_step(
                    (self.models_list_[i], datasets_for_each_anchor_label[i])
                )
            )
    else:
        # As in the sequential branch, only trained models are asked to predict.
        input_for_parallelization = [
            (self.models_list_[i], datasets_for_each_anchor_label[i])
            for i in trained_positions
        ]
        with mp.get_context("spawn").Pool(number_of_cores_used) as pool:
            parallel_results = pool.map(
                wbu.parallel_predict_step_by_step, input_for_parallelization
            )
        for i, result in zip(trained_positions, parallel_results):
            step_by_step_predictions_for_each_anchor_node[i] = result

    predictions_step_by_step = []
    for iteration in range(self.n_iter):
        # For every graph, the prediction of each anchor model that saw it
        # (keyed by anchor position, so a repeated index overwrites).
        votes_for_each_graph = [{} for _ in range(len(X))]
        for anchor_node_number in range(n_anchor_labels):
            steps = step_by_step_predictions_for_each_anchor_node[anchor_node_number]
            graph_indexes = indexes_of_graphs_for_each_anchor_label[
                anchor_node_number
            ]
            if steps is None or len(graph_indexes) == 0:
                # No model for this label, or nothing to predict with it.
                continue
            # A model that stopped early keeps its final prediction afterwards.
            step_predictions = steps[min(iteration, len(steps) - 1)]
            for position, graph_number in enumerate(graph_indexes):
                votes_for_each_graph[graph_number][anchor_node_number] = (
                    step_predictions[position]
                )

        # Average the available predictions of each graph, ignoring None values
        averages = []
        for votes in votes_for_each_graph:
            available = [value for value in votes.values() if value is not None]
            averages.append(np.mean(available) if len(available) > 0 else 0)
        averages = [x if x is not None and not np.isnan(x) else 0 for x in averages]
        predictions_step_by_step.append(averages)

    return predictions_step_by_step
669
+
670
+ def _merge_values_from_single_path_boost(
671
+ self,
672
+ len_X: int,
673
+ indexes_of_graphs_for_each_anchor_label: list[list[int]],
674
+ values_for_each_anchor_node: list[list[float]],
675
+ ):
676
+ """
677
+ This method is used to merge (average) the values (predictions) from a SingleMetalCenterPathBoost instance into the current instance of PathBoost
678
+ """
679
+
680
+ averaged_values = [0 for _ in range(len_X)]
681
+ counter = [0 for _ in range(len_X)]
682
+ for graph_number in range(len_X):
683
+ for anchor_node_number in range(len(self.list_anchor_nodes_labels_)):
684
+ if (
685
+ graph_number
686
+ in indexes_of_graphs_for_each_anchor_label[anchor_node_number]
687
+ ):
688
+ graph_position_in_sub_dataset = (
689
+ indexes_of_graphs_for_each_anchor_label[
690
+ anchor_node_number
691
+ ].index(graph_number)
692
+ )
693
+ averaged_values[graph_number] += values_for_each_anchor_node[
694
+ anchor_node_number
695
+ ][graph_position_in_sub_dataset]
696
+ counter[graph_number] += 1
697
+
698
+ averaged_values = np.divide(
699
+ averaged_values,
700
+ counter,
701
+ out=np.zeros_like(averaged_values),
702
+ where=counter != 0,
703
+ )
704
+
705
+ return averaged_values
706
+
707
def evaluate(self, X: list[nx.Graph], y: Iterable) -> list[float]:
    """Return the MSE on `(X, y)` after each boosting iteration, in iteration order."""
    step_by_step_predictions = self.predict_step_by_step(X)
    return [
        mean_squared_error(y_true=y, y_pred=iteration_predictions)
        for iteration_predictions in step_by_step_predictions
    ]
715
+
716
def plot_training_and_eval_errors(
    self,
    skip_first_n_iterations: int | bool = True,
    plot_eval_sets_error=True,
    show=True,
    save=False,
    save_path: str | None = None,
):
    """
    Plots the training and evaluation set errors over iterations.

    The evaluation curves are passed to the plotting helper only when they
    were computed during `fit` and `plot_eval_sets_error` is exactly True.
    """
    include_eval_curves = plot_eval_sets_error is True and hasattr(
        self, "mse_eval_set_"
    )
    eval_sets_mse = self.mse_eval_set_ if include_eval_curves else None

    plot_training_and_eval_errors(
        learning_rate=self.learning_rate,
        train_mse=self.train_mse_,
        mse_eval_set=eval_sets_mse,
        skip_first_n_iterations=skip_first_n_iterations,
        show=show,
        save=save,
        save_path=save_path,
    )
740
+
741
def plot_variable_importance(
    self, top_n_features: int | None = None, show: bool = True
):
    """
    Plot the variable importance stored in `variable_importance_`.

    Raises
    ------
    ValueError
        If `parameters_variable_importance` was not set in the constructor.
    """
    importance_settings = self.parameters_variable_importance
    if importance_settings is None:
        raise ValueError(
            "Variable importance is not computed. Please set"
            " parameters_variable_importance in the constructor."
        )

    plot_variable_importance_utils(
        variable_importance=self.variable_importance_,
        parameters_variable_importance=importance_settings,
        top_n=top_n_features,
        show=show,
    )
755
+
756
def score(self, X, y, sample_weight=None):
    """
    Return the negated MSE reached on `(X, y)` at the last boosting iteration.

    This overrides the `score` method of `RegressorMixin`, so a higher value
    still means a better model. `sample_weight` is accepted but not used.
    """
    final_mse = self.evaluate(X=X, y=y)[-1]
    return -final_mse
765
+
766
def _validate_data(
    self,
    X="no_validation",
    y="no_validation",
    reset=True,
    validate_separately=False,
    **check_params,
):
    """
    Validate `X` and/or `y` and return whichever of them was supplied.

    The graph-specific checks are delegated to `util_validate_data`; when `y`
    is supplied it is additionally passed to scikit-learn's `validate_data`.
    The inputs themselves are returned unchanged.

    Parameters
    ----------
    X, y : object, default="no_validation"
        Data to validate. The string ``"no_validation"`` marks an argument
        that was not supplied.
    reset, validate_separately
        Forwarded to both validation helpers.
    **check_params
        Extra keyword arguments forwarded to `util_validate_data`.

    Returns
    -------
    `(X, y)` if both were supplied, `X` or `y` alone if only one was,
    and None if neither was.
    """
    # Detect the sentinel with a plain string test: comparing through
    # `np.array_equal` would make NumPy try to coerce the whole list of graphs
    # into an array on every check.
    X_supplied = not (isinstance(X, str) and X == "no_validation")
    y_supplied = not (isinstance(y, str) and y == "no_validation")

    util_validate_data(
        model=self,
        X=X,
        y=y,
        reset=reset,
        validate_separately=validate_separately,
        **check_params,
    )

    if y_supplied:
        validate_data(
            self,
            X="no_validation",
            y=y,
            reset=reset,
            validate_separately=validate_separately,
        )

    if X_supplied and y_supplied:
        return X, y
    if X_supplied:
        return X
    if y_supplied:
        return y
    return None
800
+
801
def get_final_eval_set_mse(self):
    """
    Returns the last MSE value of each evaluation set computed during fitting.

    Raises
    ------
    AttributeError
        If the model was not fitted with an `eval_set`.
    """
    if not hasattr(self, "mse_eval_set_"):
        raise AttributeError(
            "Evaluation set MSE is not available. Please fit the model with"
            " eval_set."
        )
    return [mse_evolution[-1] for mse_evolution in self.mse_eval_set_]
815
+
816
def save(self, filepath: str) -> None:
    """
    Save the fitted model to a file.

    The estimator itself (and therefore every trained sub-model it holds) is
    serialized with joblib, together with a small metadata dictionary: package
    version, save time, main hyper-parameters, anchor settings and the number
    of sub-models.

    Parameters
    ----------
    filepath : str
        The path where the model should be saved. The file extension
        `.joblib` is recommended but not required.

    Raises
    ------
    sklearn.exceptions.NotFittedError
        If the model has not been fitted yet.

    Examples
    --------
    >>> model = PathBoost(n_iter=50)  # doctest: +SKIP
    >>> model.fit(X_train, y_train, anchor_nodes_label_name='atomic_number',
    ...           list_anchor_nodes_labels=[1, 6, 7, 8])  # doctest: +SKIP
    >>> model.save('my_model.joblib')  # doctest: +SKIP

    See Also
    --------
    load : Load a saved model from file.
    """
    import joblib
    from datetime import datetime
    from ._version import __version__

    check_is_fitted(self)

    # The payload pairs the estimator with descriptive metadata.
    payload = {
        "model": self,
        "metadata": {
            "version": __version__,
            "saved_at": datetime.now().isoformat(),
            "n_iter": self.n_iter,
            "max_path_length": self.max_path_length,
            "learning_rate": self.learning_rate,
            "anchor_nodes_label_name": self.anchor_nodes_label_name_,
            "list_anchor_nodes_labels": self.list_anchor_nodes_labels_,
            "n_models": len(self.models_list_),
        },
    }

    joblib.dump(payload, filepath)
    logger.info(f"Model saved to {filepath}")
872
+
873
@classmethod
def load(cls, filepath: str) -> "PathBoost":
    """
    Load a saved PathBoost model from a file.

    This class method deserializes a model that was previously written by
    the `save` method and returns the stored estimator object.

    Parameters
    ----------
    filepath : str
        The path to the saved model file.

    Returns
    -------
    PathBoost
        The loaded model, ready for prediction.

    Raises
    ------
    FileNotFoundError
        If the specified file does not exist (raised while opening the file).
    ValueError
        If the file content is not a dictionary with a ``"model"`` entry, or
        if that entry is not an instance of the class ``load`` was called on.

    Warns
    -----
    UserWarning
        If the model was saved with a different version of the package (or
        the saved version is unknown).

    Examples
    --------
    >>> model = PathBoost.load('my_model.joblib')  # doctest: +SKIP
    >>> predictions = model.predict(X_test)  # doctest: +SKIP

    See Also
    --------
    save : Save a fitted model to file.

    Notes
    -----
    Models saved with older versions of the package may not be fully
    compatible. A warning will be issued if version mismatch is detected.

    Security: the file is read with ``joblib.load``, which is pickle-based.
    Unpickling can execute arbitrary code, so only load files that come from
    a trusted source.
    """
    import joblib
    from ._version import __version__

    # SECURITY: joblib.load unpickles the file and may run arbitrary code.
    # Never call this on files from untrusted sources.
    save_dict = joblib.load(filepath)

    # Validate the container written by `save`.
    if not isinstance(save_dict, dict) or "model" not in save_dict:
        raise ValueError(
            "Invalid model file. Expected a PathBoost save file, but got"
            f" {type(save_dict)}"
        )

    # Validate the payload itself so that a foreign object stored under
    # "model" is rejected here instead of failing later at predict time.
    model = save_dict["model"]
    if not isinstance(model, cls):
        raise ValueError(
            f"Invalid model file. Expected a {cls.__name__} instance under"
            f" 'model', but got {type(model)}"
        )

    # Metadata is optional; tolerate a missing or malformed record.
    metadata = save_dict.get("metadata")
    if not isinstance(metadata, dict):
        metadata = {}

    # Check version compatibility
    saved_version = metadata.get("version", "unknown")
    if saved_version != __version__:
        warnings.warn(
            (
                f"Model was saved with version {saved_version}, but current version"
                f" is {__version__}. Some features may not work correctly."
            ),
            UserWarning,
            stacklevel=2,  # attribute the warning to the caller of load()
        )

    logger.info(
        f"Model loaded from {filepath} (saved:"
        f" {metadata.get('saved_at', 'unknown')})"
    )
    return model
949
+
950
def predict_with_uncertainty(
    self, X: List[nx.Graph], confidence: float = 0.95
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Predict target values with uncertainty estimates across anchor models.

    A separate model is stored in ``models_list_`` for each anchor node
    label. Instead of only averaging the per-anchor predictions, this method
    also uses their spread to build a confidence interval around the mean
    prediction of every graph.

    Parameters
    ----------
    X : List[nx.Graph]
        A list of NetworkX graph objects for which to make predictions.
    confidence : float, default=0.95
        The confidence level for the prediction intervals. Must be strictly
        between 0 and 1. Common values are 0.90, 0.95, and 0.99.

    Returns
    -------
    predictions : np.ndarray
        Point estimates (mean over the contributing anchor models) for each
        input graph. Graphs for which no anchor model produced a prediction
        get 0. Shape: (n_samples,)
    lower_bounds : np.ndarray
        Lower bounds of the confidence interval for each prediction.
        Shape: (n_samples,)
    upper_bounds : np.ndarray
        Upper bounds of the confidence interval for each prediction.
        Shape: (n_samples,)

    Raises
    ------
    ValueError
        If confidence is not between 0 and 1.
        If fewer than 2 anchor node models were trained (uncertainty requires
        multiple models).

    Notes
    -----
    The uncertainty estimates are based on the variance across anchor node
    models. This approach is valid when:
    - Multiple anchor node types were used during training
    - Each graph contains nodes from multiple anchor types

    For every graph the interval is ``mean +/- t * s / sqrt(n)``, where ``n``
    is the number of anchor models that produced a prediction for that graph,
    ``s`` is their sample standard deviation (``ddof=1``) and ``t`` is the
    two-sided Student's t quantile with ``n - 1`` degrees of freedom. The t
    quantile (rather than the normal quantile) is used because ``n`` is
    typically very small. Graphs with fewer than 2 contributing models carry
    no spread information and get a zero-width interval.

    The intervals assume approximate normality of the prediction distribution
    across anchor models. For small numbers of anchor types, this assumption
    may not hold exactly.

    Examples
    --------
    >>> model = PathBoost(n_iter=50)  # doctest: +SKIP
    >>> model.fit(X_train, y_train, anchor_nodes_label_name='atomic_number',
    ...           list_anchor_nodes_labels=[6, 7, 8, 9])  # doctest: +SKIP
    >>> predictions, lower, upper = model.predict_with_uncertainty(X_test)  # doctest: +SKIP
    >>> for i in range(5):  # doctest: +SKIP
    ...     print(f"Pred: {predictions[i]:.2f}, CI: [{lower[i]:.2f}, {upper[i]:.2f}]")

    See Also
    --------
    predict : Standard prediction without uncertainty estimates.
    """
    from scipy import stats

    # Validate inputs
    check_is_fitted(self)

    if not 0 < confidence < 1:
        raise ValueError(f"confidence must be between 0 and 1, got {confidence}")

    n_effective_models = sum(1 for m in self.models_list_ if m is not None)
    if n_effective_models < 2:
        raise ValueError(
            "predict_with_uncertainty requires at least 2 trained anchor models, "
            f"but only {n_effective_models} model(s) were trained. "
            "Use predict() for standard predictions."
        )

    X = self._validate_data(X=X)

    # For each anchor label, the positions in X of the graphs routed to it.
    indexes_of_graphs_for_each_anchor_label: List[List[int]] = (
        wbu.split_dataset_by_metal_centers(
            graphs_list=X,
            anchor_nodes_label_name=self.anchor_nodes_label_name_,
            anchor_nodes=self.list_anchor_nodes_labels_,
        )
    )

    # Rows are input graphs, columns are anchor labels; NaN marks
    # "this anchor model did not predict this graph".
    predictions_matrix = np.full(
        (len(X), len(self.list_anchor_nodes_labels_)), np.nan
    )

    for anchor_idx, (model, _label, graph_indices) in enumerate(
        zip(
            self.models_list_,
            self.list_anchor_nodes_labels_,
            indexes_of_graphs_for_each_anchor_label,
        )
    ):
        if model is None or len(graph_indices) == 0:
            continue
        dataset = [X[index] for index in graph_indices]
        # Scatter this model's predictions back to the rows they belong to.
        predictions_matrix[graph_indices, anchor_idx] = np.ravel(
            model.predict(dataset)
        )

    # Compute statistics across anchor models. Rows with zero or one valid
    # entry trigger RuntimeWarnings in nanmean/nanstd; they are handled
    # explicitly below, so the warnings are silenced here.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        predictions = np.nanmean(predictions_matrix, axis=1)
        std_devs = np.nanstd(predictions_matrix, axis=1, ddof=1)

    # Count of models contributing to each prediction
    n_models_per_sample = np.sum(~np.isnan(predictions_matrix), axis=1)

    # Graphs without any prediction fall back to 0 (kept from the original
    # implementation, whose comment states this mirrors predict()).
    predictions = np.where(np.isnan(predictions), 0, predictions)

    # Only rows with at least two contributing models have a defined spread.
    has_interval = n_models_per_sample >= 2

    # Two-sided Student's t critical value, per sample, with n - 1 degrees
    # of freedom. Rows without an interval get a dummy df of 1 so the
    # quantile stays finite; their margin is zeroed out below anyway.
    alpha = 1 - confidence
    degrees_of_freedom = np.where(has_interval, n_models_per_sample - 1, 1)
    t_critical = stats.t.ppf(1 - alpha / 2, df=degrees_of_freedom)

    # Standard error of the mean; 0 where there is no spread information.
    standard_errors = np.where(
        has_interval,
        std_devs / np.sqrt(np.maximum(n_models_per_sample, 1)),
        0.0,
    )

    margin = t_critical * standard_errors
    lower_bounds = predictions - margin
    upper_bounds = predictions + margin

    return predictions, lower_bounds, upper_bounds
1091
+
1092
+
1093
if __name__ == "__main__":
    from sklearn.utils.estimator_checks import check_estimator

    # Run scikit-learn's estimator compliance checks on a default instance.
    # The result is deliberately not assigned back to `check_estimator`:
    # doing so would rebind the imported function to its return value.
    check_estimator(PathBoost())