path-boost 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- path_boost/__init__.py +18 -0
- path_boost/_path_boost.py +1096 -0
- path_boost/_version.py +24 -0
- path_boost/utils/__init__.py +2 -0
- path_boost/utils/classes/__init__.py +0 -0
- path_boost/utils/classes/additive_model_wrapper.py +301 -0
- path_boost/utils/classes/additive_model_wrapper_classifier.py +394 -0
- path_boost/utils/classes/extended_boosting_matrix.py +596 -0
- path_boost/utils/classes/interfaces/__init__.py +0 -0
- path_boost/utils/classes/interfaces/interface_base_learner.py +30 -0
- path_boost/utils/classes/interfaces/interface_selector.py +27 -0
- path_boost/utils/classes/sequential_path_boost.py +1023 -0
- path_boost/utils/classes/sequential_path_boost_classifier.py +840 -0
- path_boost/utils/cross_validation.py +49 -0
- path_boost/utils/cyclic_path_boost_utils.py +76 -0
- path_boost/utils/datasets_for_examples/__init__.py +2 -0
- path_boost/utils/datasets_for_examples/generate_example_dataset.py +304 -0
- path_boost/utils/discovery.py +217 -0
- path_boost/utils/plots_functions.py +153 -0
- path_boost/utils/validate_data.py +223 -0
- path_boost/utils/variable_importance_according_to_path_boost.py +341 -0
- path_boost-2.1.0.dist-info/METADATA +174 -0
- path_boost-2.1.0.dist-info/RECORD +26 -0
- path_boost-2.1.0.dist-info/WHEEL +5 -0
- path_boost-2.1.0.dist-info/licenses/LICENSE +21 -0
- path_boost-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import warnings
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from .classes.sequential_path_boost import SequentialPathBoost
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger("path_boost")
|
|
12
|
+
from collections import defaultdict
|
|
13
|
+
from .classes.extended_boosting_matrix import ExtendedBoostingMatrix
|
|
14
|
+
from sklearn.metrics import mean_squared_error, mean_absolute_error
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class VariableImportance_ForSequentialPathBoost:
    """Compute the importance of the paths selected by a ``SequentialPathBoost``.

    During training the booster calls :meth:`_update` once per iteration so
    that this object records, in order, the selected path, the columns of the
    extended boosting matrix available at that moment and the gradient used.
    After training, ``compute_variable_importance`` (bound in ``__init__`` to
    either the "absolute" or the "relative" implementation) turns those
    records into a ``{path: importance}`` dictionary.

    Note from the original authors: the relative criterion relies heavily on
    the loss passed to the base learner being MSE; with a different loss the
    correct behaviour of the algorithm is not guaranteed.
    """

    def __init__(
        self,
        criterion: str,
        use_correlation: bool = False,
        normalize: bool = True,
        error_used: str = "mse",
        normalization_value: float = 100,
    ):
        """Configure how the importance is computed.

        Args:
            criterion: ``"absolute"`` or ``"relative"``; selects the method
                bound to ``self.compute_variable_importance``.
            use_correlation: if true, importances are post-processed by
                :meth:`correlation_importance`.
            normalize: if true, importances are rescaled so that they sum to
                ``normalization_value``.
            error_used: ``"mse"`` or ``"mae"``; which training error series of
                the booster is read.
            normalization_value: target sum of the normalised importances.

        Raises:
            AssertionError: if ``error_used`` or ``criterion`` is not one of
                the accepted values.
        """
        assert (
            error_used == "mse" or error_used == "mae"
        ), f"error must be either mse or mae, but got {error_used}"
        assert (
            criterion == "absolute" or criterion == "relative"
        ), f"criterion must be either absolute or relative, but got {criterion}"

        self.error_used = error_used
        self.criterion = criterion
        self.use_correlation = use_correlation
        self.normalize = normalize

        self.normalization_value = normalization_value

        criterion_choices = {
            "absolute": self.compute_absolute_variable_importance,
            "relative": self.compute_relative_variable_importance,
        }
        self.compute_variable_importance = criterion_choices[criterion]

        # Per-iteration records, appended by `_update` in order of selection.
        self.selected_path_at_iteration = []
        self.columns_at_iteration = []
        self.gradient_at_iteration = []

    def _update(
        self,
        path_boost: "SequentialPathBoost",
        selected_path: tuple,
        iteration_number: int,
        # Quoted so that the `X | None` syntax is not evaluated at definition
        # time (it would raise TypeError on Python 3.9).
        gradient: "np.ndarray | None" = None,
    ):
        """Record the state of one boosting iteration.

        Called during training, after the base learner has been fitted for
        this iteration and before the extended boosting matrix is expanded.
        Per the original authors' notes, ``gradient`` is expected to be just
        ``y`` in the first (0-th) iteration.

        Args:
            path_boost: the booster being trained.
            selected_path: the path chosen in this iteration.
            iteration_number: must equal the number of training errors already
                stored on ``path_boost.base_learner_``.
            gradient: the target the base learner was fitted on.

        Raises:
            AssertionError: if the base learner has not been trained for this
                iteration, or the matrix has already been expanded.
        """
        # check that the base learner has been already trained in this iteration
        assert iteration_number == len(path_boost.base_learner_.train_mse), (
            f"iteration number {iteration_number} does not match the number of base"
            f" learners {len(path_boost.base_learner_.train_mse)}"
        )

        # check that the ebm has not been expanded yet
        assert path_boost._ebm_has_been_expanded_in_this_iteration is False

        self.selected_path_at_iteration.append(selected_path)
        self.columns_at_iteration.append(path_boost.train_ebm_dataframe_.columns)
        self.gradient_at_iteration.append(gradient)

    def compute_absolute_variable_importance(
        self, path_boost: "SequentialPathBoost"
    ) -> dict:
        """Importance as the training-error improvement credited to each path.

        For every iteration ``i >= 1`` the path selected at ``i`` is credited
        with ``train_error[i - 1] - train_error[i]``. Iteration 0 has no
        previous error to compare with, so its path is credited with the same
        improvement as iteration 1. (Comparing against the error of the label
        mean was rejected by the original authors: the base learner is limited
        by the learning rate, so its first fit can be worse than the mean.)

        Args:
            path_boost: fitted booster exposing ``n_iter`` and ``train_mse_``
                / ``train_mae_``.

        Returns:
            ``{path: importance}``, post-processed by
            :meth:`_get_correlation_and_normalize_if_needed`.
        """
        use_mse = self.error_used == "mse"
        train_errors = path_boost.train_mse_ if use_mse else path_boost.train_mae_

        error_improvement = defaultdict(float)
        previous_improvement = 0
        # Iteration 0 is skipped: there is no previous error to compare with.
        for iteration in range(1, path_boost.n_iter):
            path = self.selected_path_at_iteration[iteration]
            improvement = train_errors[iteration - 1] - train_errors[iteration]

            if improvement < 0 and previous_improvement > 0:
                if use_mse:
                    logger.debug(
                        f"error improvement between iteration {iteration} and"
                        f" {iteration - 1} is negative ({improvement}). This is"
                        " expected by the algorithm, but it might be a sign of"
                        " overfitting even if we are comparing the improvement on"
                        " the train error"
                    )
                else:
                    logger.debug(
                        f"error improvement between iteration {iteration} and"
                        f" {iteration - 1} is negative. This is expected by the"
                        " algorithm, but it might be a sign of overfitting even if"
                        " we are comparing the improvement on the train error"
                    )

            error_improvement[path] += improvement
            previous_improvement = improvement

            if iteration == 1:
                # Credit the path of iteration 0 with the same improvement.
                # Accumulate rather than assign: if the same path was selected
                # at iterations 0 and 1, assigning would overwrite the credit
                # just added above and iteration 0 would count for nothing.
                first_selected_path = self.selected_path_at_iteration[0]
                error_improvement[first_selected_path] += improvement

        return self._get_correlation_and_normalize_if_needed(
            path_boost=path_boost, error_improvement=error_improvement
        )

    def compute_relative_variable_importance(
        self, path_boost: "SequentialPathBoost"
    ) -> dict:
        """Importance relative to the runner-up path of each iteration.

        For every iteration ``i >= 1`` the selected path is removed from the
        frequency matrix available at that iteration, the booster's selector
        picks the best remaining ("second-best") path, and a fresh base
        learner is fitted on the columns of that path against the recorded
        gradient. The selected path is then credited with the error of that
        learner (its prediction scaled by the learning rate) minus the
        booster's recorded training error at iteration ``i``.

        Args:
            path_boost: fitted booster; its selector, base-learner class,
                learning rate, training matrix and training errors are used.

        Returns:
            ``{path: importance}``, post-processed by
            :meth:`_get_correlation_and_normalize_if_needed`.
        """
        if self.error_used == "mse":
            error_metric = mean_squared_error
            train_errors = path_boost.train_mse_
        else:
            error_metric = mean_absolute_error
            train_errors = path_boost.train_mae_

        error_improvement = defaultdict(float)
        # Iteration 0 is skipped (no previous error to compare with); starting
        # the range at 1 avoids building a frequency matrix that is discarded.
        for iteration in range(1, path_boost.n_iter):
            selected_path = self.selected_path_at_iteration[iteration]

            # Restrict the training matrix to the columns that existed when
            # this iteration's path was selected.
            ebm_at_iteration = path_boost.train_ebm_dataframe_[
                self.columns_at_iteration[iteration]
            ]
            frequency_matrix = ExtendedBoostingMatrix.get_frequency_boosting_matrix(
                ebm_at_iteration
            )
            best_path_column = (
                ExtendedBoostingMatrix.generate_frequency_column_name_for_path(
                    path_label=selected_path
                )
            )
            frequency_matrix_without_best_path = frequency_matrix.drop(
                best_path_column, axis=1, inplace=False
            )

            gradient = self.gradient_at_iteration[iteration]

            # get the second-best path
            second_best_path = path_boost._find_best_path(
                train_ebm_dataframe=frequency_matrix_without_best_path,
                y=gradient,
                SelectorClass=path_boost.SelectorClass,
                kwargs_for_selector=path_boost.kwargs_for_selector,
            )

            # fit a new base learner on the second-best path
            columns_to_keep = ExtendedBoostingMatrix.get_columns_related_to_path(
                second_best_path, ebm_at_iteration.columns
            )
            restricted_df = ebm_at_iteration[columns_to_keep]

            new_base_learner = path_boost.BaseLearnerClass(
                **path_boost.kwargs_for_base_learner
            )
            new_base_learner.fit(restricted_df, gradient)

            new_base_learner_prediction = path_boost.learning_rate * pd.Series(
                new_base_learner.predict(restricted_df)
            )
            new_base_learner_error = error_metric(
                y_true=gradient, y_pred=new_base_learner_prediction
            )

            # NOTE(review): the runner-up error is measured against the
            # gradient while train_errors comes from the booster itself;
            # confirm both are on the same scale.
            error_improvement[selected_path] += (
                new_base_learner_error - train_errors[iteration]
            )

        return self._get_correlation_and_normalize_if_needed(
            path_boost=path_boost, error_improvement=error_improvement
        )

    def _normalize_importance(self, importance: dict) -> dict:
        """Rescale ``importance`` so that its values sum to ``normalization_value``.

        When the values sum to zero the rescaling is undefined; in that case
        the values are returned unchanged (with a ``RuntimeWarning`` if the
        dictionary is not empty) instead of dividing by zero.
        """
        total = sum(importance.values())
        if total == 0:
            if importance:
                warnings.warn(
                    "The variable importances sum to zero, they can not be"
                    " normalized and are returned as they are.",
                    RuntimeWarning,
                    stacklevel=2,
                )
            return dict(importance)
        return {
            path: (value / total) * self.normalization_value
            for path, value in importance.items()
        }

    def _get_correlation_and_normalize_if_needed(
        self, path_boost: "SequentialPathBoost", error_improvement: dict
    ) -> dict:
        """Apply the optional correlation adjustment and normalisation.

        Args:
            path_boost: fitted booster, forwarded to
                :meth:`correlation_importance`.
            error_improvement: raw ``{path: importance}`` mapping; it is
                copied, not modified.

        Returns:
            A plain ``dict`` with the post-processed importances.
        """
        dict_error_improvement = dict(error_improvement)

        if self.use_correlation:
            # we need to compute the correlation between the paths
            dict_error_improvement = self.correlation_importance(
                path_boost, dict_error_improvement
            )

        if self.normalize:
            dict_error_improvement = self._normalize_importance(
                dict_error_improvement
            )
        return dict_error_improvement

    def correlation_importance(
        self, path_boost: "SequentialPathBoost", variable_importance: dict
    ) -> dict:
        """Add to each path a share of the importance of its proper prefixes.

        For every pair where ``second_path`` is a strict prefix of ``path``,
        the importance of ``path`` is increased by
        ``corr(path, second_path) * importance(second_path)``, where the
        correlation is taken between the two frequency columns of the
        training matrix. Pairs whose correlation is NaN are ignored.

        Args:
            path_boost: fitted booster providing ``train_ebm_dataframe_``.
            variable_importance: ``{path: importance}`` with tuple keys.

        Returns:
            A new ``{path: importance}`` dictionary; the input is not modified.
        """
        # Frequency-column name of every path that has some importance,
        # generated once instead of once per pair of paths.
        frequency_column_of_path = {
            path: ExtendedBoostingMatrix.generate_frequency_column_name_for_path(
                path_label=path
            )
            for path in variable_importance
        }

        # we want the ebm dataframe only for the paths that have some importance
        train_ebm_dataframe = path_boost.train_ebm_dataframe_[
            list(frequency_column_of_path.values())
        ]
        frequency_matrix = ExtendedBoostingMatrix.get_frequency_boosting_matrix(
            train_ebm_dataframe
        )

        # get the correlation matrix
        correlation_matrix = frequency_matrix.corr()

        correlation_variable_importance = dict()
        for path, importance in variable_importance.items():
            adjusted_importance = importance
            for second_path, second_importance in variable_importance.items():
                is_proper_prefix = (
                    len(path) > len(second_path)
                    and path[: len(second_path)] == second_path
                )
                if not is_proper_prefix:
                    continue

                corr = correlation_matrix.loc[
                    frequency_column_of_path[path],
                    frequency_column_of_path[second_path],
                ]
                # The correlation is added only when it is defined; it can be
                # NaN, presumably when one of the two columns is constant.
                if not pd.isna(corr):
                    adjusted_importance += corr * second_importance
            correlation_variable_importance[path] = adjusted_importance

        return correlation_variable_importance

    def combine_variable_importance_from_list_of_sequential_models(
        self,
        sequential_models: list,
    ) -> dict:
        """Merge the importances of several fitted sequential models.

        Each model's importances are weighted by its number of training rows.
        For every path the weighted values are summed and divided by
        ``(number of models reporting the path) * (total number of rows)``.
        ``None`` entries in ``sequential_models`` are ignored. If
        ``self.normalize`` is true the result is rescaled to sum to
        ``normalization_value``.

        Args:
            sequential_models: fitted models exposing ``train_ebm_dataframe_``
                and ``variable_importance_``; entries may be ``None``.

        Returns:
            ``{path: combined importance}``.
        """
        weighted_importances = defaultdict(list)
        total_obs = 0
        for sequential_model in sequential_models:
            if sequential_model is None:
                continue
            n_obs = sequential_model.train_ebm_dataframe_.shape[0]
            total_obs += n_obs
            for key, value in sequential_model.variable_importance_.items():
                weighted_importances[key].append(value * n_obs)

        averaged_variable_importance = {}
        for key, value_list in weighted_importances.items():
            # The denominator is zero when no model had any training row.
            if value_list and total_obs:
                averaged_variable_importance[key] = sum(value_list) / (
                    len(value_list) * total_obs
                )
            else:
                averaged_variable_importance[key] = 0

        if self.normalize:
            averaged_variable_importance = self._normalize_importance(
                averaged_variable_importance
            )

        return averaged_variable_importance
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: path_boost
|
|
3
|
+
Version: 2.1.0
|
|
4
|
+
Summary: Interpretable machine learning on graph-structured data using path-based boosting.
|
|
5
|
+
Author-email: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>, Guillaume Lemaitre <g.lemaitre58@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Claudio-Me/extended_path_boost
|
|
8
|
+
Project-URL: Issues, https://github.com/Claudio-Me/extended_path_boost/issues
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Operating System :: POSIX
|
|
15
|
+
Classifier: Operating System :: Unix
|
|
16
|
+
Classifier: Operating System :: MacOS
|
|
17
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: scikit-learn>=1.4.2
|
|
22
|
+
Requires-Dist: networkx>=3.0
|
|
23
|
+
Requires-Dist: pandas>=2.0
|
|
24
|
+
Requires-Dist: numpy>=1.24
|
|
25
|
+
Requires-Dist: matplotlib>=3.5
|
|
26
|
+
Requires-Dist: joblib>=1.2
|
|
27
|
+
Requires-Dist: scipy>=1.10
|
|
28
|
+
Provides-Extra: progress
|
|
29
|
+
Requires-Dist: tqdm>=4.64; extra == "progress"
|
|
30
|
+
Provides-Extra: all
|
|
31
|
+
Requires-Dist: tqdm>=4.64; extra == "all"
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
# Path Boost
|
|
35
|
+
|
|
36
|
+
Path Boost is a Python library for interpretable machine learning on graph-structured data. It implements the PathBoost and SequentialPathBoost algorithms, which iteratively construct features based on paths in graphs and use boosting to build predictive models. The library is designed for tasks where input data consists of collections of graphs (e.g., molecules, social networks) and supports variable importance analysis for interpretability.
|
|
37
|
+
|
|
38
|
+
## Features
|
|
39
|
+
|
|
40
|
+
- **PathBoost**: Ensemble learning over graph paths, partitioned by anchor nodes.
|
|
41
|
+
- **SequentialPathBoost**: Boosting with path-based features, iteratively expanding the feature space.
|
|
42
|
+
- **Variable Importance**: Quantifies the importance of paths/features in prediction.
|
|
43
|
+
- **Parallel Training**: Supports multi-core training for large datasets.
|
|
44
|
+
- **Evaluation and Visualization**: Built-in tools for error tracking and variable importance plotting.
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
47
|
+
|
|
48
|
+
Install from PyPI:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install path_boost
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Usage Example
|
|
55
|
+
|
|
56
|
+
Below is a minimal example using the `PathBoost` model:
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
import numpy as np
|
|
60
|
+
import networkx as nx
|
|
61
|
+
from sklearn.model_selection import train_test_split
|
|
62
|
+
from path_boost import PathBoost
|
|
63
|
+
from path_boost.utils.datasets_for_examples.generate_example_dataset import generate_synthetic_graph_dataset
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
if __name__ == "__main__":
|
|
69
|
+
|
|
70
|
+
# Generate synthetic dataset
|
|
71
|
+
nx_graphs, y, true_paths, true_weights = generate_synthetic_graph_dataset()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
list_anchor_nodes_labels = [0, 1, 2]
|
|
75
|
+
|
|
76
|
+
parameters_variable_importance: dict = {
|
|
77
|
+
'criterion': 'absolute',
|
|
78
|
+
'error_used': 'mse',
|
|
79
|
+
'use_correlation': False,
|
|
80
|
+
'normalize': True,
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
X_train, X_test, y_train, y_test = train_test_split(nx_graphs, y, test_size=0.25, random_state=42)
|
|
84
|
+
|
|
85
|
+
eval_set = [(X_test, y_test)]
|
|
86
|
+
|
|
87
|
+
path_boost = PathBoost(
|
|
88
|
+
n_iter=50, # Reduced for quicker example run
|
|
89
|
+
max_path_length=5,
|
|
90
|
+
learning_rate=0.1,
|
|
91
|
+
n_of_cores=1, # Set to >1 for parallel processing if desired
|
|
92
|
+
verbose=True,
|
|
93
|
+
parameters_variable_importance=parameters_variable_importance
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
# Fit the model
|
|
97
|
+
# anchor_nodes_label_name must correspond to the feature storing node types ('feature_0')
|
|
98
|
+
path_boost.fit(
|
|
99
|
+
X=X_train,
|
|
100
|
+
y=y_train,
|
|
101
|
+
eval_set=eval_set,
|
|
102
|
+
list_anchor_nodes_labels=list_anchor_nodes_labels,
|
|
103
|
+
anchor_nodes_label_name="feature_0" # Node types are in 'feature_0'
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
print(f"Generated {len(nx_graphs)} graphs.")
|
|
107
|
+
print(f"Example y values: {y[:5]}")
|
|
108
|
+
print(f"True paths definitions: {true_paths}")
|
|
109
|
+
print(f"True path weights: {true_weights}")
|
|
110
|
+
|
|
111
|
+
path_boost.plot_training_and_eval_errors(skip_first_n_iterations=0, plot_eval_sets_error=True)
|
|
112
|
+
if path_boost.parameters_variable_importance is not None and hasattr(path_boost, 'variable_importance_'):
|
|
113
|
+
path_boost.plot_variable_importance(top_n_features=10)
|
|
114
|
+
else:
|
|
115
|
+
print("Variable importance not computed or available.")
|
|
116
|
+
|
|
117
|
+
print("Example run finished.")
|
|
118
|
+
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## API Overview
|
|
122
|
+
|
|
123
|
+
### PathBoost
|
|
124
|
+
|
|
125
|
+
- `fit(X, y, anchor_nodes_label_name, list_anchor_nodes_labels, eval_set=None)`
|
|
126
|
+
- `predict(X)`
|
|
127
|
+
- `predict_step_by_step(X)`
|
|
128
|
+
- `evaluate(X, y)`
|
|
129
|
+
- `plot_training_and_eval_errors(skip_first_n_iterations=True)`
|
|
130
|
+
- `plot_variable_importance()`
|
|
131
|
+
- **Attributes:**
|
|
132
|
+
- `train_mse_`: Training error (MSE) at each iteration
|
|
133
|
+
- `mse_eval_set_`: Evaluation set error (MSE) at each iteration (if `eval_set` is provided)
|
|
134
|
+
- `variable_importance_`: Variable/path importance scores (if enabled)
|
|
135
|
+
- `is_fitted_`: Whether the model is fitted
|
|
136
|
+
- `models_list_`: List of fitted SequentialPathBoost models (one per anchor node)
|
|
137
|
+
- (Each SequentialPathBoost in `models_list_` exposes the attributes below)
|
|
138
|
+
|
|
139
|
+
### SequentialPathBoost
|
|
140
|
+
|
|
141
|
+
- `fit(X, y, list_anchor_nodes_labels, name_of_label_attribute, eval_set=None)`
|
|
142
|
+
- `predict(X)`
|
|
143
|
+
- `predict_step_by_step(X)`
|
|
144
|
+
- `evaluate(X, y)`
|
|
145
|
+
- `plot_training_and_eval_errors(skip_first_n_iterations=True)`
|
|
146
|
+
- `plot_variable_importance()`
|
|
147
|
+
- **Attributes:**
|
|
148
|
+
- `train_mse_`: Training error (MSE) at each iteration
|
|
149
|
+
- `train_mae_`: Training MAE at each iteration
|
|
150
|
+
- `eval_sets_mse_`: Evaluation set error (MSE) at each iteration (if `eval_set` is provided)
|
|
151
|
+
- `eval_sets_mae_`: Evaluation set MAE at each iteration (if `eval_set` is provided)
|
|
152
|
+
- `variable_importance_`: Variable/path importance scores (if enabled)
|
|
153
|
+
- `paths_selected_by_epb_`: Set of selected paths during boosting
|
|
154
|
+
- `columns_names_`: Names of EBM columns/features used
|
|
155
|
+
- `is_fitted_`: Whether the model is fitted
|
|
156
|
+
|
|
157
|
+
## Requirements
|
|
158
|
+
|
|
159
|
+
- Python 3.10+
|
|
160
|
+
- numpy
|
|
161
|
+
- pandas
|
|
162
|
+
- scikit-learn
|
|
163
|
+
- networkx
|
|
164
|
+
- matplotlib
|
|
165
|
+
|
|
166
|
+
(See `requirements.txt` for the full list.)
|
|
167
|
+
|
|
168
|
+
## Citation
|
|
169
|
+
|
|
170
|
+
If you use this library in your research, please cite the corresponding paper (add citation here).
|
|
171
|
+
|
|
172
|
+
## License
|
|
173
|
+
|
|
174
|
+
MIT License
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
path_boost/__init__.py,sha256=DJGVzv6WAkNXzYPAR4bLOP3Doa774hSHFDLj9dxP9fg,427
|
|
2
|
+
path_boost/_path_boost.py,sha256=4R0ZaoPApvxRiWhrlo1oTY7onedgiMMkdvU608EVs-g,43791
|
|
3
|
+
path_boost/_version.py,sha256=v9WVy_5yL3epBJmMyIAo86Fa1LzYly1fPB2fENrVANg,520
|
|
4
|
+
path_boost/utils/__init__.py,sha256=OLs978IP7BWkfydS9GzMQg4oPPAsQl9dEfxS2iQD9nI,67
|
|
5
|
+
path_boost/utils/cross_validation.py,sha256=kXslG2reDePdn4a0wtsHKDiOI7iW0xWGlz0XPqG3Y78,1467
|
|
6
|
+
path_boost/utils/cyclic_path_boost_utils.py,sha256=2--3OflOIDfqtT94nGmukScJGDdO77yU3ldUUeGfHmM,2843
|
|
7
|
+
path_boost/utils/discovery.py,sha256=fB7nHL8qQLoGJBuWtpqDr8ylvg_A6rRYy4z0EZXaWu8,7136
|
|
8
|
+
path_boost/utils/plots_functions.py,sha256=rKr9cQlnrZbwL42D-NMRgvuZq0LFuBMe4wYpq-1pMSw,5233
|
|
9
|
+
path_boost/utils/validate_data.py,sha256=wMILdnJaPys945VDN9LKJa5Tg7f1TaUrDgV6rReK_RI,8916
|
|
10
|
+
path_boost/utils/variable_importance_according_to_path_boost.py,sha256=BSsyyW43wxrl03pK6HHuTlSxh9UK7vqmCOP2oQsIQhg,15037
|
|
11
|
+
path_boost/utils/classes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
path_boost/utils/classes/additive_model_wrapper.py,sha256=WjWhzVAPrR7TrK92IrSeuP-xxi_SY3KfWx44kWcqmNk,10930
|
|
13
|
+
path_boost/utils/classes/additive_model_wrapper_classifier.py,sha256=D6vJ8Q7qEglayqlnL_3XD7HpSCfGJeTjHOA57R6UfE4,15044
|
|
14
|
+
path_boost/utils/classes/extended_boosting_matrix.py,sha256=wXzZkGsrlfOyUyvun7M__JCk_k2rJLL5a80ayIJ_sJY,23716
|
|
15
|
+
path_boost/utils/classes/sequential_path_boost.py,sha256=uXHVUaUC-J8_9_lppNczSUvHsfaiaDWfshCemC1ah2E,43544
|
|
16
|
+
path_boost/utils/classes/sequential_path_boost_classifier.py,sha256=_cDWLdIrxpi2hps6W-0mxYPiAQ9r5rLIbQ2U4VbpJSI,35659
|
|
17
|
+
path_boost/utils/classes/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
|
+
path_boost/utils/classes/interfaces/interface_base_learner.py,sha256=PtYXYMoknCn6U-0Bl_7hFo3DX-_830D1WuY9PH5jD3c,818
|
|
19
|
+
path_boost/utils/classes/interfaces/interface_selector.py,sha256=1xuCsgIwj5-6mU27R6mQLQWmqpgFj197UzIWwt-RQV0,673
|
|
20
|
+
path_boost/utils/datasets_for_examples/__init__.py,sha256=OLs978IP7BWkfydS9GzMQg4oPPAsQl9dEfxS2iQD9nI,67
|
|
21
|
+
path_boost/utils/datasets_for_examples/generate_example_dataset.py,sha256=gtHOyRYEWN5pT-LQZoHvvaWhmpnWcQ6s01UklJJKzgY,11929
|
|
22
|
+
path_boost-2.1.0.dist-info/licenses/LICENSE,sha256=oaQgL_11JDZk4U2flY7eCLcjXqHuVp0H9Novv7PrsV0,1092
|
|
23
|
+
path_boost-2.1.0.dist-info/METADATA,sha256=Ih3nwPzYMlS9Ym3upZP3-17GXk9dvFT2FEtUO_KqOlc,6147
|
|
24
|
+
path_boost-2.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
25
|
+
path_boost-2.1.0.dist-info/top_level.txt,sha256=Cz9AC5j1bERLpvkxax55nUSGy7G7g2HjbUO-XOxQ2Zs,11
|
|
26
|
+
path_boost-2.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Claudio Meggio
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
path_boost
|