path-boost 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- path_boost/__init__.py +18 -0
- path_boost/_path_boost.py +1096 -0
- path_boost/_version.py +24 -0
- path_boost/utils/__init__.py +2 -0
- path_boost/utils/classes/__init__.py +0 -0
- path_boost/utils/classes/additive_model_wrapper.py +301 -0
- path_boost/utils/classes/additive_model_wrapper_classifier.py +394 -0
- path_boost/utils/classes/extended_boosting_matrix.py +596 -0
- path_boost/utils/classes/interfaces/__init__.py +0 -0
- path_boost/utils/classes/interfaces/interface_base_learner.py +30 -0
- path_boost/utils/classes/interfaces/interface_selector.py +27 -0
- path_boost/utils/classes/sequential_path_boost.py +1023 -0
- path_boost/utils/classes/sequential_path_boost_classifier.py +840 -0
- path_boost/utils/cross_validation.py +49 -0
- path_boost/utils/cyclic_path_boost_utils.py +76 -0
- path_boost/utils/datasets_for_examples/__init__.py +2 -0
- path_boost/utils/datasets_for_examples/generate_example_dataset.py +304 -0
- path_boost/utils/discovery.py +217 -0
- path_boost/utils/plots_functions.py +153 -0
- path_boost/utils/validate_data.py +223 -0
- path_boost/utils/variable_importance_according_to_path_boost.py +341 -0
- path_boost-2.1.0.dist-info/METADATA +174 -0
- path_boost-2.1.0.dist-info/RECORD +26 -0
- path_boost-2.1.0.dist-info/WHEEL +5 -0
- path_boost-2.1.0.dist-info/licenses/LICENSE +21 -0
- path_boost-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1096 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Extended Path Boost - A gradient boosting algorithm for graph-structured data.
|
|
3
|
+
|
|
4
|
+
This module provides the PathBoost class, an ensemble learning method that builds
|
|
5
|
+
interpretable models for graph data by discovering path-based features.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# Authors: scikit-learn-contrib developers
|
|
9
|
+
# License: BSD 3 clause
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
import logging
|
|
14
|
+
|
|
15
|
+
# done to limit the number of spawned threads during parallelization
|
|
16
|
+
|
|
17
|
+
max_n_threads = 2
|
|
18
|
+
os.environ["MKL_NUM_THREADS"] = str(max_n_threads)
|
|
19
|
+
os.environ["NUMEXPR_NUM_THREADS"] = str(max_n_threads)
|
|
20
|
+
os.environ["OMP_NUM_THREADS"] = str(max_n_threads)
|
|
21
|
+
|
|
22
|
+
import numbers
|
|
23
|
+
import numpy as np
|
|
24
|
+
import warnings
|
|
25
|
+
import itertools
|
|
26
|
+
import multiprocessing as mp
|
|
27
|
+
import networkx as nx
|
|
28
|
+
import matplotlib.pyplot as plt
|
|
29
|
+
|
|
30
|
+
from .utils.classes.sequential_path_boost import SequentialPathBoost
|
|
31
|
+
from .utils import cyclic_path_boost_utils as wbu
|
|
32
|
+
from .utils.classes.interfaces.interface_base_learner import BaseLearnerClassInterface
|
|
33
|
+
from .utils.variable_importance_according_to_path_boost import (
|
|
34
|
+
VariableImportance_ForSequentialPathBoost,
|
|
35
|
+
)
|
|
36
|
+
from .utils.classes.interfaces.interface_selector import SelectorClassInterface
|
|
37
|
+
from .utils.validate_data import util_validate_data
|
|
38
|
+
from .utils.plots_functions import (
|
|
39
|
+
plot_training_and_eval_errors,
|
|
40
|
+
plot_variable_importance_utils,
|
|
41
|
+
)
|
|
42
|
+
from typing import Iterable, List, Tuple, Optional, Union, Dict, Any, Type
|
|
43
|
+
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, _fit_context
|
|
44
|
+
from sklearn.metrics import mean_squared_error
|
|
45
|
+
from sklearn.utils.validation import check_is_fitted, validate_data
|
|
46
|
+
from sklearn.tree import DecisionTreeRegressor
|
|
47
|
+
from sklearn.base import RegressorMixin
|
|
48
|
+
|
|
49
|
+
# Type aliases for better readability
|
|
50
|
+
GraphList = List[nx.Graph]
|
|
51
|
+
AnchorLabel = Union[int, str, Tuple]
|
|
52
|
+
AnchorLabelList = List[AnchorLabel]
|
|
53
|
+
EvalSet = List[Tuple[GraphList, Iterable]]
|
|
54
|
+
|
|
55
|
+
try:
|
|
56
|
+
from tqdm import tqdm
|
|
57
|
+
|
|
58
|
+
TQDM_AVAILABLE = True
|
|
59
|
+
except ImportError:
|
|
60
|
+
TQDM_AVAILABLE = False
|
|
61
|
+
|
|
62
|
+
# Set up logger for the module
|
|
63
|
+
logger = logging.getLogger("path_boost")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class PathBoost(BaseEstimator, RegressorMixin):
|
|
67
|
+
"""
|
|
68
|
+
PathBoost is an ensemble learning method that builds a model by iteratively fitting
|
|
69
|
+
SequentialPathBoost models on different subsets of the data, partitioned by anchor nodes.
|
|
70
|
+
It is designed for graph-based data where paths originating from specified anchor nodes
|
|
71
|
+
are used as features.
|
|
72
|
+
|
|
73
|
+
The model trains a separate `SequentialPathBoost` instance for each unique anchor node
|
|
74
|
+
label provided. Predictions are then aggregated (averaged) from these individual models.
|
|
75
|
+
It supports parallel training of the `SequentialPathBoost` models across multiple cores.
|
|
76
|
+
|
|
77
|
+
Parameters
|
|
78
|
+
----------
|
|
79
|
+
n_iter : int, default=100
|
|
80
|
+
The number of boosting iterations to perform for each `SequentialPathBoost` model.
|
|
81
|
+
max_path_length : int, default=10
|
|
82
|
+
The maximum length of paths to consider as features.
|
|
83
|
+
learning_rate : float, default=0.1
|
|
84
|
+
The learning rate shrinks the contribution of each base learner in the `SequentialPathBoost` model.
|
|
85
|
+
m_stops : list[int], default=None
|
|
86
|
+
A list of iteration numbers at which to stop boosting for specific models.
|
|
87
|
+
Currently, this parameter is validated but not fully implemented in the core logic.
|
|
88
|
+
BaseLearnerClass : type, default=sklearn.tree.DecisionTreeRegressor
|
|
89
|
+
The class of the base learner to be used within each boosting iteration in the `SequentialPathBoost` model.
|
|
90
|
+
Must implement the `BaseLearnerClassInterface`.
|
|
91
|
+
kwargs_for_base_learner : dict, default=None
|
|
92
|
+
Keyword arguments to be passed to the constructor of the `BaseLearnerClass`.
|
|
93
|
+
SelectorClass : type, default=sklearn.tree.DecisionTreeRegressor
|
|
94
|
+
The class of the feature selector used to identify the best paths in each iteration.
|
|
95
|
+
Must implement the `SelectorClassInterface`.
|
|
96
|
+
kwargs_for_selector : dict, default=None
|
|
97
|
+
Keyword arguments to be passed to the constructor of the `SelectorClass`.
|
|
98
|
+
parameters_variable_importance : dict, default=None
|
|
99
|
+
Parameters for computing variable importance. If None, variable importance is not computed.
|
|
100
|
+
Expected keys include 'criterion' = 'absolute' or 'relative', 'error_used' = 'mse' or 'mae', 'use_correlation' = True or False, 'normalize' = True or False.
|
|
101
|
+
replace_nan_with : any, default=np.nan
|
|
102
|
+
Value used to replace NaN values encountered during feature generation. This is needed for base learners, such as linear models, that cannot handle NaN values.
|
|
103
|
+
verbose : bool, default=False
|
|
104
|
+
If True, prints progress messages during fitting.
|
|
105
|
+
n_of_cores : int, default=1
|
|
106
|
+
The number of CPU cores to use for parallel training of `SequentialPathBoost` models.
|
|
107
|
+
If 1, training is sequential.
|
|
108
|
+
"""
|
|
109
|
+
|
|
110
|
+
# This is a dictionary allowing to define the type of parameters.
|
|
111
|
+
# It is used to validate the parameters within the `_fit_context` decorator.
|
|
112
|
+
_parameter_constraints = {
|
|
113
|
+
"n_iter": [numbers.Integral],
|
|
114
|
+
"max_path_length": [numbers.Integral],
|
|
115
|
+
"learning_rate": [numbers.Integral, numbers.Real],
|
|
116
|
+
"target_error": [numbers.Real, None],
|
|
117
|
+
"base_learner_kwargs": [dict, None],
|
|
118
|
+
"BaseLearnerClass": [type],
|
|
119
|
+
"SelectorClass": [type],
|
|
120
|
+
"kwargs_for_selector": [dict, None],
|
|
121
|
+
"eval_set": [list[tuple[list[nx.Graph], Iterable]], None],
|
|
122
|
+
"list_anchor_nodes_labels": [list[tuple]],
|
|
123
|
+
"X": [list[nx.Graph]],
|
|
124
|
+
"y": [Iterable],
|
|
125
|
+
"anchor_nodes_label_name": [str],
|
|
126
|
+
"verbose": [bool],
|
|
127
|
+
"n_of_cores": [numbers.Integral],
|
|
128
|
+
"parameters_variable_importance": [dict, None],
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
def __init__(
    self,
    n_iter=100,
    patience: int | None = None,
    target_error: float | None = None,
    max_path_length=10,
    learning_rate=0.1,
    m_stops: list[int] = None,
    BaseLearnerClass=DecisionTreeRegressor,
    kwargs_for_base_learner=None,
    SelectorClass=DecisionTreeRegressor,
    kwargs_for_selector=None,
    parameters_variable_importance=None,
    replace_nan_with=np.nan,
    verbose: bool = False,
    n_of_cores: int = 1,
):
    """
    Store the hyper-parameters on the instance.

    Every argument is saved unchanged under an attribute of the same name;
    no validation or computation happens here.
    """
    # Boosting schedule and stopping controls.
    self.n_iter: int = n_iter
    self.m_stops: list[int] | None = m_stops
    self.patience: int | None = patience
    self.target_error: float | None = target_error
    self.learning_rate: float = learning_rate

    # Path-feature generation.
    self.max_path_length: int = max_path_length
    self.replace_nan_with = replace_nan_with

    # Base learner and selector classes, with their constructor kwargs.
    self.BaseLearnerClass: type[BaseLearnerClassInterface] = BaseLearnerClass
    self.kwargs_for_base_learner: dict | None = kwargs_for_base_learner
    self.SelectorClass: type[SelectorClassInterface] = SelectorClass
    self.kwargs_for_selector: dict | None = kwargs_for_selector

    # Variable importance, logging and parallelism.
    self.parameters_variable_importance = parameters_variable_importance
    self.verbose: bool = verbose
    self.n_of_cores = n_of_cores
|
|
162
|
+
|
|
163
|
+
@_fit_context(prefer_skip_nested_validation=True)
def fit(
    self,
    X: list[nx.Graph],
    y: Iterable,
    anchor_nodes_label_name: str,
    list_anchor_nodes_labels: list[tuple],
    eval_set: list[tuple[list[nx.Graph], Iterable]] | None = None,
):
    """
    Fit one `SequentialPathBoost` sub-model per anchor label.

    The training graphs are split per anchor label with
    `wbu.split_dataset_by_metal_centers`. Every label that receives at least
    one graph gets its own `SequentialPathBoost`; labels without graphs get
    `None` in `models_list_`. Sub-models are trained through
    `wbu.train_pattern_boosting`, in a "spawn" process pool when more than
    one core is usable and sequentially otherwise.

    Parameters
    ----------
    X : list[nx.Graph]
        Training graphs.
    y : Iterable
        Target values, indexable by the positions of the graphs in `X`.
    anchor_nodes_label_name : str
        Name of the node attribute that identifies the anchor nodes.
    list_anchor_nodes_labels : list[tuple]
        Anchor labels; one sub-model slot is created per entry.
    eval_set : list[tuple[list[nx.Graph], Iterable]] | None, default=None
        Optional (X_eval, y_eval) pairs. When given, `evaluate` is called on
        each pair after training and the results are stored in
        `mse_eval_set_`.

    Returns
    -------
    self : object
        The fitted estimator.
    """
    # Verbose mode: raise the package logger to INFO and make sure it has a
    # handler, adding one only if none is installed yet.
    if self.verbose:
        package_logger = logging.getLogger("path_boost")
        package_logger.setLevel(logging.INFO)
        if not package_logger.handlers:
            stream_handler = logging.StreamHandler()
            stream_handler.setFormatter(logging.Formatter("%(message)s"))
            package_logger.addHandler(stream_handler)

    # Default tree settings; the two dictionaries differ only in depth.
    shared_tree_kwargs = {
        "random_state": 0,
        "splitter": "best",
        "criterion": "squared_error",
    }
    self._default_kwargs_for_base_learner = {"max_depth": 3, **shared_tree_kwargs}
    self._default_kwargs_for_selector = {"max_depth": 1, **shared_tree_kwargs}

    self.anchor_nodes_label_name_ = anchor_nodes_label_name
    self.list_anchor_nodes_labels_ = list_anchor_nodes_labels

    X, y = self._validate_data(
        X=X,
        y=y,
        list_anchor_nodes_labels=list_anchor_nodes_labels,
        eval_set=eval_set,
        m_stops=self.m_stops,
        name_of_label_attribute=anchor_nodes_label_name,
        parameters_variable_importance=self.parameters_variable_importance,
        patience=self.patience,
    )

    # Sub-models must not normalise their importances; the requested setting
    # is remembered here and put back by `compute_variable_importance`.
    importance_settings = self.parameters_variable_importance
    if importance_settings is not None:
        self.normalize_path_importance_: bool = importance_settings.get(
            "normalize", False
        )
        importance_settings["normalize"] = False

    self.is_fitted_ = True

    anchor_labels = self.list_anchor_nodes_labels_
    graph_indexes_per_anchor: list[list[int]] = wbu.split_dataset_by_metal_centers(
        graphs_list=X,
        anchor_nodes_label_name=self.anchor_nodes_label_name_,
        anchor_nodes=anchor_labels,
    )

    graphs_per_anchor = []
    targets_per_anchor = []
    self.models_list_: list[SequentialPathBoost] = []

    # Position of the next unused `m_stops` entry when `m_stops` holds one
    # value per trained model rather than one per anchor label.
    next_m_stop = 0
    for anchor_position in range(len(anchor_labels)):
        selected = graph_indexes_per_anchor[anchor_position]
        anchor_graphs = [X[k] for k in selected]
        anchor_targets = [y[k] for k in selected]
        graphs_per_anchor.append(anchor_graphs)
        targets_per_anchor.append(anchor_targets)

        if len(anchor_graphs) == 0:
            # No training data for this label: keep the slot, without a model.
            self.models_list_.append(None)
            continue

        if self.m_stops is None:
            iterations_for_model = self.n_iter
        elif len(self.m_stops) == len(anchor_labels):
            iterations_for_model = self.m_stops[anchor_position]
        else:
            iterations_for_model = self.m_stops[next_m_stop]
            next_m_stop += 1

        self.models_list_.append(
            SequentialPathBoost(
                n_iter=iterations_for_model,
                patience=self.patience,
                target_error=self.target_error,
                max_path_length=self.max_path_length,
                learning_rate=self.learning_rate,
                BaseLearnerClass=self.BaseLearnerClass,
                SelectorClass=self.SelectorClass,
                kwargs_for_base_learner=self.kwargs_for_base_learner,
                kwargs_for_selector=self.kwargs_for_selector,
                parameters_variable_importance=self.parameters_variable_importance,
                replace_nan_with=self.replace_nan_with,
                verbose=self.verbose,
            )
        )

    # One job tuple per anchor label:
    # (model, graphs, targets, anchor label, anchor attribute name).
    training_jobs = list(
        zip(
            self.models_list_,
            graphs_per_anchor,
            targets_per_anchor,
            anchor_labels,
            itertools.repeat(anchor_nodes_label_name),
        )
    )

    trainable_models: int = sum(model is not None for model in self.models_list_)
    number_of_cores_used = min(mp.cpu_count(), self.n_of_cores, trainable_models)

    if number_of_cores_used > 1:
        with mp.get_context("spawn").Pool(number_of_cores_used) as pool:
            fitted_models = pool.map(wbu.train_pattern_boosting, training_jobs)
    else:
        job_iterator = training_jobs
        if self.verbose and TQDM_AVAILABLE:
            # Progress bar for the sequential path only.
            job_iterator = tqdm(
                training_jobs,
                desc="Training anchor models",
                unit="model",
                bar_format=(
                    "{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"
                ),
            )
        fitted_models = [wbu.train_pattern_boosting(job) for job in job_iterator]

    self.models_list_ = fitted_models
    self.train_mse_ = self._compute_train_mse(
        number_of_observations_for_each_model=[
            len(anchor_graphs) for anchor_graphs in graphs_per_anchor
        ]
    )

    if eval_set is not None:
        self.mse_eval_set_ = [
            self.evaluate(X=eval_pair[0], y=eval_pair[1]) for eval_pair in eval_set
        ]

    if self.parameters_variable_importance is not None:
        self.compute_variable_importance()

    # `fit` should always return `self`
    return self
|
|
371
|
+
|
|
372
|
+
def compute_variable_importance(self):
    """
    Combine the variable importance of all sub-models into `variable_importance_`.

    The `normalize` entry of `parameters_variable_importance` is first set
    back to the value remembered in `normalize_path_importance_`. The
    combined importance is then built by
    `VariableImportance_ForSequentialPathBoost` from `models_list_`.
    """
    importance_settings = self.parameters_variable_importance
    importance_settings["normalize"] = self.normalize_path_importance_

    importance_combiner = VariableImportance_ForSequentialPathBoost(
        **importance_settings
    )
    self.variable_importance_ = (
        importance_combiner.combine_variable_importance_from_list_of_sequential_models(
            sequential_models=self.models_list_,
        )
    )
|
|
382
|
+
|
|
383
|
+
def _compute_train_mse(self, number_of_observations_for_each_model: list[int]):
    """
    Return the observation-weighted average of the sub-models' training MSE.

    Each non-``None`` entry of ``models_list_`` contributes its ``train_mse_``
    curve, weighted by the matching entry of
    `number_of_observations_for_each_model`. The weighted sum is divided by
    the total number of observations.

    A curve whose length differs from ``n_iter`` is aligned first: longer
    curves are cut to ``n_iter`` entries, and shorter non-empty curves are
    extended with their last value. Sub-models can have their own iteration
    count (``fit`` passes ``m_stops`` entries as ``n_iter``); without the
    alignment such a curve makes the accumulation raise a broadcasting
    error.

    Parameters
    ----------
    number_of_observations_for_each_model : list[int]
        Number of training graphs per entry of ``models_list_``.

    Returns
    -------
    numpy.ndarray
        Array of length ``n_iter``.
    """
    train_mse = np.zeros(self.n_iter)
    for i, smc_model in enumerate(self.models_list_):
        if smc_model is None:
            continue

        curve = np.array(smc_model.train_mse_)
        if curve.ndim == 1 and curve.size > 0 and curve.size != self.n_iter:
            if curve.size > self.n_iter:
                curve = curve[: self.n_iter]
            else:
                # Assumes the error of a sub-model that stopped early stays at
                # its last recorded value -- TODO confirm against
                # SequentialPathBoost.
                missing = self.n_iter - curve.size
                curve = np.concatenate([curve, np.full(missing, curve[-1])])

        train_mse += curve * number_of_observations_for_each_model[i]

    train_mse = train_mse / sum(number_of_observations_for_each_model)
    return train_mse
|
|
393
|
+
|
|
394
|
+
def predict(self, X: List[nx.Graph]) -> List[float]:
    """
    Predict target values for the input graphs.

    The graphs are split per anchor label with
    `wbu.split_dataset_by_metal_centers`, and each sub-dataset is predicted
    by the sub-model stored for that label. A graph that receives
    predictions from several anchor labels gets their mean. A graph that
    receives none gets 0, as does one whose mean is NaN.

    Anchor labels without a trained sub-model (a ``None`` entry in
    ``models_list_``) are skipped, even when some input graphs contain that
    anchor.

    Parameters
    ----------
    X : List[nx.Graph]
        A list of NetworkX graph objects for which to make predictions.

    Returns
    -------
    predictions : List[float]
        One predicted value per input graph.

    Raises
    ------
    sklearn.exceptions.NotFittedError
        If the model has not been fitted yet.

    Examples
    --------
    >>> predictions = model.predict(X_test)  # doctest: +SKIP
    >>> print(f"Mean prediction: {np.mean(predictions):.3f}")  # doctest: +SKIP
    """
    check_is_fitted(self)
    X = self._validate_data(X=X)

    number_of_anchors = len(self.list_anchor_nodes_labels_)

    # divide the input by the anchor node
    indexes_of_graphs_for_each_anchor_label: list[list[int]] = (
        wbu.split_dataset_by_metal_centers(
            graphs_list=X,
            anchor_nodes_label_name=self.anchor_nodes_label_name_,
            anchor_nodes=self.list_anchor_nodes_labels_,
        )
    )

    # create the dataset for each anchor node
    datasets_for_each_anchor_label = [
        [X[index] for index in indexes_of_graphs_for_each_anchor_label[anchor]]
        for anchor in range(number_of_anchors)
    ]

    number_of_effective_trained_models: int = sum(
        1 for model in self.models_list_ if model is not None
    )
    number_of_dataset_to_be_predicted = sum(
        1 for dataset in datasets_for_each_anchor_label if len(dataset) != 0
    )
    number_of_cores_used = min(
        mp.cpu_count(),
        self.n_of_cores,
        number_of_dataset_to_be_predicted,
        number_of_effective_trained_models,
    )

    if number_of_cores_used <= 1:
        predictions_for_each_anchor_node = []
        for i in range(len(datasets_for_each_anchor_label)):
            if self.models_list_[i] is None:
                predictions_for_each_anchor_node.append(None)
                continue
            predictions_for_each_anchor_node.append(
                wbu.parallel_predict(
                    input_from_parallelization=(
                        self.models_list_[i],
                        datasets_for_each_anchor_label[i],
                    )
                )
            )
    else:
        input_for_parallelization = list(
            zip(self.models_list_, datasets_for_each_anchor_label)
        )
        # Size the pool with the capped core count, as `fit` does, rather
        # than the raw `n_of_cores`.
        with mp.get_context("spawn").Pool(number_of_cores_used) as pool:
            predictions_for_each_anchor_node = pool.map(
                wbu.parallel_predict, input_for_parallelization
            )

    # Collect, for every graph, the predictions of all anchor labels that
    # cover it (in anchor order).
    values_for_each_graph = [[] for _ in range(len(X))]
    for anchor_node_number in range(number_of_anchors):
        anchor_predictions = predictions_for_each_anchor_node[anchor_node_number]
        if anchor_predictions is None:
            # No sub-model was trained for this anchor label.
            continue

        # One slot per graph and anchor; a repeated graph index keeps the
        # value written last.
        value_of_graph = {}
        for i, graph_number in enumerate(
            indexes_of_graphs_for_each_anchor_label[anchor_node_number]
        ):
            value_of_graph[graph_number] = anchor_predictions[i]

        for graph_number, value in value_of_graph.items():
            if value is not None:
                values_for_each_graph[graph_number].append(value)

    # Average per graph; fall back to 0 when there is nothing to average or
    # the average is NaN.
    predictions = []
    for values in values_for_each_graph:
        avg = np.mean(values) if len(values) > 0 else 0
        predictions.append(avg if not np.isnan(avg) else 0)

    return predictions
|
|
523
|
+
|
|
524
|
+
def predict_step_by_step(self, X: list[nx.Graph]) -> list[list[float]]:
    """
    Predict the target values of the input graphs at every boosting iteration.

    The graphs are split per anchor label with
    `wbu.split_dataset_by_metal_centers`, and each sub-dataset is passed with
    its sub-model to `wbu.parallel_predict_step_by_step`. For each of the
    ``n_iter`` iterations, a graph's prediction is the mean of the values it
    receives from the anchor labels covering it. It is 0 when it receives
    none or when the mean is NaN.

    Anchor labels without a trained sub-model (a ``None`` entry in
    ``models_list_``) are skipped. When a sub-model provides fewer than
    ``n_iter`` steps, its last step is reused for the remaining iterations.

    Parameters
    ----------
    X : list[nx.Graph]
        A list of networkx graph objects to be used for prediction.

    Returns
    -------
    list[list[float]]
        One inner list per iteration, holding one prediction per input graph.
    """
    check_is_fitted(self)
    X = self._validate_data(X=X)

    number_of_anchors = len(self.list_anchor_nodes_labels_)

    # divide the input by the anchor node
    indexes_of_graphs_for_each_anchor_label: list[list[int]] = (
        wbu.split_dataset_by_metal_centers(
            graphs_list=X,
            anchor_nodes_label_name=self.anchor_nodes_label_name_,
            anchor_nodes=self.list_anchor_nodes_labels_,
        )
    )

    # create the dataset for each anchor node
    datasets_for_each_anchor_label = [
        [X[index] for index in indexes_of_graphs_for_each_anchor_label[anchor]]
        for anchor in range(number_of_anchors)
    ]

    number_of_effective_trained_models: int = sum(
        1 for model in self.models_list_ if model is not None
    )
    number_of_dataset_to_be_predicted = sum(
        1 for dataset in datasets_for_each_anchor_label if len(dataset) != 0
    )
    number_of_cores_used = min(
        mp.cpu_count(),
        self.n_of_cores,
        number_of_dataset_to_be_predicted,
        number_of_effective_trained_models,
    )

    # get the step by step predictions for each anchor node
    if number_of_cores_used <= 1:
        step_by_step_predictions_for_each_anchor_node: list[
            list[list[numbers.Number]]
        ] = []
        for i in range(len(datasets_for_each_anchor_label)):
            if self.models_list_[i] is None:
                step_by_step_predictions_for_each_anchor_node.append(None)
                continue
            step_by_step_predictions_for_each_anchor_node.append(
                wbu.parallel_predict_step_by_step(
                    (self.models_list_[i], datasets_for_each_anchor_label[i])
                )
            )
    else:
        input_for_parallelization = list(
            zip(self.models_list_, datasets_for_each_anchor_label)
        )
        # Size the pool with the capped core count, as `fit` does, rather
        # than the raw `n_of_cores`.
        with mp.get_context("spawn").Pool(number_of_cores_used) as pool:
            step_by_step_predictions_for_each_anchor_node = pool.map(
                wbu.parallel_predict_step_by_step, input_for_parallelization
            )

    predictions_step_by_step = []
    for iteration in range(self.n_iter):
        # Predictions received by every graph at this iteration, in anchor
        # order.
        values_for_each_graph = [[] for _ in range(len(X))]

        for anchor_node_number in range(number_of_anchors):
            anchor_steps = step_by_step_predictions_for_each_anchor_node[
                anchor_node_number
            ]
            graph_indexes = indexes_of_graphs_for_each_anchor_label[
                anchor_node_number
            ]
            if (
                anchor_steps is None
                or len(graph_indexes) == 0
                or len(anchor_steps) == 0
            ):
                # No sub-model, no graph with this anchor, or no step at all.
                continue

            # Assumes a sub-model that ran fewer iterations keeps its final
            # prediction afterwards -- TODO confirm against
            # SequentialPathBoost.
            step_values = anchor_steps[min(iteration, len(anchor_steps) - 1)]

            # One slot per graph and anchor; a repeated graph index keeps the
            # value written last.
            value_of_graph = {}
            for i, graph_number in enumerate(graph_indexes):
                value_of_graph[graph_number] = step_values[i]

            for graph_number, value in value_of_graph.items():
                if value is not None:
                    values_for_each_graph[graph_number].append(value)

        # Average per graph; fall back to 0 when there is nothing to average
        # or the average is NaN.
        averages = []
        for values in values_for_each_graph:
            avg = np.mean(values) if len(values) > 0 else 0
            averages.append(avg if not np.isnan(avg) else 0)
        predictions_step_by_step.append(averages)

    return predictions_step_by_step
|
|
669
|
+
|
|
670
|
+
def _merge_values_from_single_path_boost(
    self,
    len_X: int,
    indexes_of_graphs_for_each_anchor_label: list[list[int]],
    values_for_each_anchor_node: list[list[float]],
):
    """
    Average the per-anchor values into one value per graph of the full dataset.

    Parameters
    ----------
    len_X : int
        Number of graphs in the full dataset; the result has this length.
    indexes_of_graphs_for_each_anchor_label : list[list[int]]
        One list per entry of ``self.list_anchor_nodes_labels_``. Each list
        holds the positions, in the full dataset, of the graphs that belong
        to that anchor's sub-dataset.
    values_for_each_anchor_node : list[list[float]]
        One list per anchor label, aligned element by element with the
        corresponding index list.

    Returns
    -------
    np.ndarray
        Float array of length ``len_X``. Entry ``g`` is the mean of the values
        of every anchor whose index list contains ``g``; it is ``0.0`` when no
        anchor contains ``g``.
    """
    sums = np.zeros(len_X, dtype=float)
    counts = np.zeros(len_X, dtype=int)

    for anchor_node_number in range(len(self.list_anchor_nodes_labels_)):
        graph_indexes = indexes_of_graphs_for_each_anchor_label[anchor_node_number]
        already_counted = set()
        for position, graph_number in enumerate(graph_indexes):
            # Only the first occurrence of an index contributes, and only
            # indexes that address a graph of the full dataset are considered.
            if graph_number in already_counted or not 0 <= graph_number < len_X:
                continue
            already_counted.add(graph_number)
            sums[graph_number] += values_for_each_anchor_node[anchor_node_number][
                position
            ]
            counts[graph_number] += 1

    # ``counts`` is an ndarray, so ``counts != 0`` is an element-wise mask and
    # graphs without any contribution keep the 0.0 stored in ``out``.
    return np.divide(
        sums,
        counts,
        out=np.zeros(len_X, dtype=float),
        where=counts != 0,
    )
|
|
706
|
+
|
|
707
|
+
def evaluate(self, X: list[nx.Graph], y: Iterable) -> list[float]:
    """
    Compute the mean squared error of the model after each boosting step.

    Parameters
    ----------
    X : list[nx.Graph]
        Graphs passed to ``self.predict_step_by_step``.
    y : Iterable
        True target values, compared against every step's predictions.

    Returns
    -------
    list[float]
        One ``mean_squared_error`` value per element returned by
        ``predict_step_by_step``, in the same order.
    """
    return [
        mean_squared_error(y_true=y, y_pred=step_prediction)
        for step_prediction in self.predict_step_by_step(X)
    ]
|
|
715
|
+
|
|
716
|
+
def plot_training_and_eval_errors(
    self,
    skip_first_n_iterations: int | bool = True,
    plot_eval_sets_error=True,
    show=True,
    save=False,
    save_path: str | None = None,
):
    """
    Plot the training error and, optionally, the evaluation-set errors.

    The work is delegated to the module-level ``plot_training_and_eval_errors``
    helper, which receives ``self.learning_rate`` and ``self.train_mse_``.

    Parameters
    ----------
    skip_first_n_iterations : int | bool, default=True
        Forwarded unchanged to the plotting helper.
    plot_eval_sets_error : bool, default=True
        The stored ``mse_eval_set_`` is forwarded only when this is exactly
        ``True`` and the attribute exists; otherwise ``None`` is forwarded.
    show, save, save_path
        Forwarded unchanged to the plotting helper.
    """
    include_eval_sets = (
        hasattr(self, "mse_eval_set_") and plot_eval_sets_error is True
    )
    plot_training_and_eval_errors(
        learning_rate=self.learning_rate,
        train_mse=self.train_mse_,
        mse_eval_set=self.mse_eval_set_ if include_eval_sets else None,
        skip_first_n_iterations=skip_first_n_iterations,
        show=show,
        save=save,
        save_path=save_path,
    )
|
|
740
|
+
|
|
741
|
+
def plot_variable_importance(
|
|
742
|
+
self, top_n_features: int | None = None, show: bool = True
|
|
743
|
+
):
|
|
744
|
+
if self.parameters_variable_importance is None:
|
|
745
|
+
raise ValueError(
|
|
746
|
+
"Variable importance is not computed. Please set"
|
|
747
|
+
" parameters_variable_importance in the constructor."
|
|
748
|
+
)
|
|
749
|
+
plot_variable_importance_utils(
|
|
750
|
+
variable_importance=self.variable_importance_,
|
|
751
|
+
parameters_variable_importance=self.parameters_variable_importance,
|
|
752
|
+
top_n=top_n_features,
|
|
753
|
+
show=show,
|
|
754
|
+
)
|
|
755
|
+
|
|
756
|
+
def score(self, X, y, sample_weight=None):
    """
    Return the negated mean squared error of the last boosting step.

    Parameters
    ----------
    X, y
        Passed to ``self.evaluate``.
    sample_weight : default=None
        Accepted but not used by this implementation.

    Returns
    -------
    float
        ``-mse`` where ``mse`` is the last element returned by
        ``self.evaluate``, so that a larger score means a better fit.
    """
    return -self.evaluate(X=X, y=y)[-1]
|
|
765
|
+
|
|
766
|
+
def _validate_data(
    self,
    X="no_validation",
    y="no_validation",
    reset=True,
    validate_separately=False,
    **check_params,
):
    """
    Validate ``X`` and/or ``y`` and return whichever of them was supplied.

    ``X`` and ``y`` are first passed together to ``util_validate_data``. When
    ``y`` was supplied it is additionally passed on its own to
    ``validate_data`` (presumably scikit-learn's helper; confirm against the
    file's imports).

    Parameters
    ----------
    X, y : object, default="no_validation"
        Data to validate. The string ``"no_validation"`` is the sentinel
        meaning "not supplied".
    reset : bool, default=True
        Forwarded to both validation helpers.
    validate_separately : bool, default=False
        Forwarded to both validation helpers.
    **check_params
        Forwarded to ``util_validate_data`` only.

    Returns
    -------
    tuple | object | None
        ``(X, y)`` when both were supplied, ``X`` or ``y`` alone when only one
        was supplied, ``None`` when neither was.
    """
    # Detect the sentinel once with a type-checked string comparison.
    # Comparing through ``np.array_equal`` would coerce the whole list of
    # graphs into an array on every check.
    skip_X = isinstance(X, str) and X == "no_validation"
    skip_y = isinstance(y, str) and y == "no_validation"

    util_validate_data(
        model=self,
        X=X,
        y=y,
        reset=reset,
        validate_separately=validate_separately,
        **check_params,
    )

    if not skip_y:
        validate_data(
            self,
            X="no_validation",
            y=y,
            reset=reset,
            validate_separately=validate_separately,
        )

    if not skip_X and not skip_y:
        return X, y
    if not skip_X:
        return X
    if not skip_y:
        return y
    return None
|
|
800
|
+
|
|
801
|
+
def get_final_eval_set_mse(self):
    """
    Return the last recorded MSE of every evaluation set.

    Returns
    -------
    list
        The last element of each entry of ``self.mse_eval_set_``, in order.

    Raises
    ------
    AttributeError
        If the model has no ``mse_eval_set_`` attribute.
    """
    if not hasattr(self, "mse_eval_set_"):
        raise AttributeError(
            "Evaluation set MSE is not available. Please fit the model with"
            " eval_set."
        )
    return [mse_history[-1] for mse_history in self.mse_eval_set_]
|
|
815
|
+
|
|
816
|
+
def save(self, filepath: str) -> None:
    """
    Write the fitted model, together with a metadata record, to ``filepath``.

    The object dumped with ``joblib`` is a dictionary with two keys:
    ``"model"`` (this instance) and ``"metadata"`` (package version, save
    timestamp, ``n_iter``, ``max_path_length``, ``learning_rate``, the anchor
    label name, the anchor labels and the number of sub-models).

    Parameters
    ----------
    filepath : str
        Destination of the dump. The ``.joblib`` extension is recommended but
        not required.

    Raises
    ------
    ValueError
        If the model has not been fitted yet. The check is performed by
        ``check_is_fitted``. NOTE(review): this assumes scikit-learn's
        helper, whose ``NotFittedError`` subclasses ``ValueError``; confirm
        against the file's imports.

    Examples
    --------
    >>> model = PathBoost(n_iter=50)  # doctest: +SKIP
    >>> model.fit(X_train, y_train, anchor_nodes_label_name='atomic_number',
    ...           list_anchor_nodes_labels=[1, 6, 7, 8])  # doctest: +SKIP
    >>> model.save('my_model.joblib')  # doctest: +SKIP

    See Also
    --------
    load : Load a saved model from file.
    """
    from datetime import datetime

    import joblib

    from ._version import __version__

    check_is_fitted(self)

    joblib.dump(
        {
            "model": self,
            "metadata": {
                "version": __version__,
                # Naive local time, stored as an ISO-8601 string.
                "saved_at": datetime.now().isoformat(),
                "n_iter": self.n_iter,
                "max_path_length": self.max_path_length,
                "learning_rate": self.learning_rate,
                "anchor_nodes_label_name": self.anchor_nodes_label_name_,
                "list_anchor_nodes_labels": self.list_anchor_nodes_labels_,
                "n_models": len(self.models_list_),
            },
        },
        filepath,
    )
    logger.info(f"Model saved to {filepath}")
|
|
872
|
+
|
|
873
|
+
@classmethod
def load(cls, filepath: str) -> "PathBoost":
    """
    Load a model previously written by ``save``.

    The file is read with ``joblib.load`` and must contain a dictionary with
    a ``"model"`` entry holding an instance of this class. The optional
    ``"metadata"`` entry is used for the version check and the log message.

    Parameters
    ----------
    filepath : str
        The path to the saved model file.

    Returns
    -------
    PathBoost
        The stored model instance.

    Raises
    ------
    FileNotFoundError
        If the specified file does not exist (propagated from
        ``joblib.load``).
    ValueError
        If the file does not contain a save dictionary, or if its ``"model"``
        entry is not an instance of this class.

    Warns
    -----
    UserWarning
        If the version recorded in the metadata differs from the installed
        package version, or if no version was recorded.

    Notes
    -----
    ``joblib.load`` is based on pickle and can execute arbitrary code while
    reading the file. Only load files that come from a trusted source.

    Examples
    --------
    >>> model = PathBoost.load('my_model.joblib')  # doctest: +SKIP
    >>> predictions = model.predict(X_test)  # doctest: +SKIP

    See Also
    --------
    save : Save a fitted model to file.
    """
    import joblib

    from ._version import __version__

    # SECURITY: unpickling runs code embedded in the file; the caller is
    # responsible for only passing trusted paths.
    save_dict = joblib.load(filepath)

    if not isinstance(save_dict, dict) or "model" not in save_dict:
        raise ValueError(
            "Invalid model file. Expected a PathBoost save file, but got"
            f" {type(save_dict)}"
        )

    model = save_dict["model"]
    # Enforce the documented contract: the payload must really be a model of
    # this class, not just any object stored under the "model" key.
    if not isinstance(model, cls):
        raise ValueError(
            f"Invalid model file. Expected a {cls.__name__} instance under"
            f" 'model', but got {type(model)}"
        )

    # A missing or malformed metadata entry is treated as "no metadata".
    metadata = save_dict.get("metadata")
    if not isinstance(metadata, dict):
        metadata = {}

    saved_version = metadata.get("version", "unknown")
    if saved_version != __version__:
        warnings.warn(
            (
                f"Model was saved with version {saved_version}, but current version"
                f" is {__version__}. Some features may not work correctly."
            ),
            UserWarning,
            stacklevel=2,  # attribute the warning to the caller of load()
        )

    logger.info(
        f"Model loaded from {filepath} (saved:"
        f" {metadata.get('saved_at', 'unknown')})"
    )
    return model
|
|
949
|
+
|
|
950
|
+
def predict_with_uncertainty(
    self, X: List[nx.Graph], confidence: float = 0.95
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Predict targets together with an interval derived from anchor-model spread.

    Every anchor model predicts the graphs of its own sub-dataset. For each
    graph the point estimate is the mean of the available anchor predictions,
    and the interval is that mean plus/minus a normal quantile times the
    standard error of the mean across the contributing anchor models.

    Parameters
    ----------
    X : List[nx.Graph]
        Graphs for which to make predictions.
    confidence : float, default=0.95
        Two-sided confidence level, strictly between 0 and 1.

    Returns
    -------
    predictions : np.ndarray
        Mean prediction per graph, shape ``(n_samples,)``. Graphs that no
        anchor model predicted get ``0``.
    lower_bounds : np.ndarray
        Lower interval bound per graph, shape ``(n_samples,)``.
    upper_bounds : np.ndarray
        Upper interval bound per graph, shape ``(n_samples,)``.

    Raises
    ------
    ValueError
        If ``confidence`` is not strictly between 0 and 1.
        If fewer than 2 entries of ``self.models_list_`` are not ``None``.

    Notes
    -----
    The standard deviation uses ``ddof=1``. A graph with fewer than two
    contributing anchor models gets a standard error of 0, so its lower and
    upper bounds coincide with its prediction.

    The multiplier is the standard normal quantile, not a Student-t quantile,
    so intervals built from very few anchor models are only approximate.

    Examples
    --------
    >>> model = PathBoost(n_iter=50)  # doctest: +SKIP
    >>> model.fit(X_train, y_train, anchor_nodes_label_name='atomic_number',
    ...           list_anchor_nodes_labels=[6, 7, 8, 9])  # doctest: +SKIP
    >>> predictions, lower, upper = model.predict_with_uncertainty(X_test)  # doctest: +SKIP
    >>> for i in range(5):  # doctest: +SKIP
    ...     print(f"Pred: {predictions[i]:.2f}, CI: [{lower[i]:.2f}, {upper[i]:.2f}]")

    See Also
    --------
    predict : Standard prediction without uncertainty estimates.
    """
    from scipy import stats

    check_is_fitted(self)

    if not 0 < confidence < 1:
        raise ValueError(f"confidence must be between 0 and 1, got {confidence}")

    n_effective_models = len([m for m in self.models_list_ if m is not None])
    if n_effective_models < 2:
        raise ValueError(
            "predict_with_uncertainty requires at least 2 trained anchor models, "
            f"but only {n_effective_models} model(s) were trained. "
            "Use predict() for standard predictions."
        )

    X = self._validate_data(X=X)

    # For every anchor label: positions in X of the graphs of its sub-dataset.
    graph_indexes_per_anchor: List[List[int]] = wbu.split_dataset_by_metal_centers(
        graphs_list=X,
        anchor_nodes_label_name=self.anchor_nodes_label_name_,
        anchor_nodes=self.list_anchor_nodes_labels_,
    )

    n_anchors = len(self.list_anchor_nodes_labels_)
    sub_datasets = [
        [X[index] for index in graph_indexes_per_anchor[anchor_position]]
        for anchor_position in range(n_anchors)
    ]

    # Rows are graphs, columns are anchor models; NaN marks "no prediction".
    predictions_matrix = np.full((len(X), n_anchors), np.nan)
    for anchor_idx, (model, dataset) in enumerate(
        zip(self.models_list_, sub_datasets)
    ):
        if model is None or len(dataset) == 0:
            continue
        preds = model.predict(dataset)
        for local_idx, global_idx in enumerate(graph_indexes_per_anchor[anchor_idx]):
            predictions_matrix[global_idx, anchor_idx] = preds[local_idx]

    with warnings.catch_warnings():
        # All-NaN rows and single-value rows (ddof=1) emit RuntimeWarnings;
        # both cases are handled explicitly below.
        warnings.simplefilter("ignore", category=RuntimeWarning)
        predictions = np.nanmean(predictions_matrix, axis=1)
        std_devs = np.nanstd(predictions_matrix, axis=1, ddof=1)

    # Number of anchor models that produced a prediction for each graph.
    n_models_per_sample = (~np.isnan(predictions_matrix)).sum(axis=1)

    # Graphs without any prediction fall back to 0.
    predictions = np.where(np.isnan(predictions), 0, predictions)

    # Two-sided standard normal quantile for the requested confidence level.
    alpha = 1 - confidence
    z_score = stats.norm.ppf(1 - alpha / 2)

    standard_errors = std_devs / np.sqrt(np.maximum(n_models_per_sample, 1))
    # With fewer than two contributing models there is no spread to measure.
    standard_errors = np.where(n_models_per_sample < 2, 0, standard_errors)

    half_widths = z_score * standard_errors
    lower_bounds = predictions - half_widths
    upper_bounds = predictions + half_widths

    return predictions, lower_bounds, upper_bounds
|
|
1091
|
+
|
|
1092
|
+
|
|
1093
|
+
if __name__ == "__main__":
    # When the module is executed as a script, run scikit-learn's estimator
    # checks against a default-constructed PathBoost.
    from sklearn.utils.estimator_checks import check_estimator

    # The return value is deliberately not bound to ``check_estimator``, so
    # the imported function is not shadowed by its own result.
    check_estimator(PathBoost())
|