path-boost 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- path_boost/__init__.py +18 -0
- path_boost/_path_boost.py +1096 -0
- path_boost/_version.py +24 -0
- path_boost/utils/__init__.py +2 -0
- path_boost/utils/classes/__init__.py +0 -0
- path_boost/utils/classes/additive_model_wrapper.py +301 -0
- path_boost/utils/classes/additive_model_wrapper_classifier.py +394 -0
- path_boost/utils/classes/extended_boosting_matrix.py +596 -0
- path_boost/utils/classes/interfaces/__init__.py +0 -0
- path_boost/utils/classes/interfaces/interface_base_learner.py +30 -0
- path_boost/utils/classes/interfaces/interface_selector.py +27 -0
- path_boost/utils/classes/sequential_path_boost.py +1023 -0
- path_boost/utils/classes/sequential_path_boost_classifier.py +840 -0
- path_boost/utils/cross_validation.py +49 -0
- path_boost/utils/cyclic_path_boost_utils.py +76 -0
- path_boost/utils/datasets_for_examples/__init__.py +2 -0
- path_boost/utils/datasets_for_examples/generate_example_dataset.py +304 -0
- path_boost/utils/discovery.py +217 -0
- path_boost/utils/plots_functions.py +153 -0
- path_boost/utils/validate_data.py +223 -0
- path_boost/utils/variable_importance_according_to_path_boost.py +341 -0
- path_boost-2.1.0.dist-info/METADATA +174 -0
- path_boost-2.1.0.dist-info/RECORD +26 -0
- path_boost-2.1.0.dist-info/WHEEL +5 -0
- path_boost-2.1.0.dist-info/licenses/LICENSE +21 -0
- path_boost-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
import networkx as nx
|
|
4
|
+
import numpy as np
|
|
5
|
+
from sklearn.model_selection import GridSearchCV, train_test_split
|
|
6
|
+
|
|
7
|
+
from path_boost._path_boost import PathBoost
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger("path_boost")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def independent_cross_validation_on_each_anchor_node(
    X: list[nx.Graph],
    y,
    param_grid: dict = None,
    list_anchor_nodes_labels: list = None,
    anchor_nodes_label_name: str = "feature_atomic_number",
    test_size: float = 0.25,
    random_state=None,
):
    """Grid-search PathBoost hyperparameters and score the winner on a held-out split.

    ``X`` and ``y`` are split once with ``train_test_split``. The training part is
    handed to a 3-fold ``GridSearchCV`` scored with ``neg_mean_squared_error``; the
    best estimator is then scored on the held-out part. Results are reported
    through the ``path_boost`` logger.

    Args:
        X: Graphs to learn from.
        y: Targets aligned with ``X``.
        param_grid: Grid passed to ``GridSearchCV``. When ``None`` a small built-in
            grid over ``learning_rate``, ``max_path_length`` and
            ``kwargs_for_base_learner`` is used.
        list_anchor_nodes_labels: Forwarded to ``PathBoost.fit``. When ``None`` the
            previously hard-coded value ``[25, 47, 48, 80]`` is used.
        anchor_nodes_label_name: Forwarded to ``PathBoost.fit``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        The fitted ``GridSearchCV`` instance. Earlier versions returned nothing,
        so callers that ignore the result are unaffected.
    """
    # TODO remove this hard coded param_grid
    if param_grid is None:
        param_grid = {
            "learning_rate": [0.01, 0.02, 0.05],
            "max_path_length": [3, 5, 7],
            "kwargs_for_base_learner": [{"max_depth": 3}, {"max_depth": 4}],
        }
    if list_anchor_nodes_labels is None:
        # Keep the value that used to be hard-coded in the fit call as the default.
        list_anchor_nodes_labels = [25, 47, 48, 80]

    # The body used to reference X_train / X_test / y_train / y_test without ever
    # defining them (NameError on every call); derive them from the arguments.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Initialize the PathBoost model
    path_boost = PathBoost(n_iter=10, n_of_cores=10, verbose=False)

    # Initialize GridSearchCV
    grid_search = GridSearchCV(
        estimator=path_boost,
        param_grid=param_grid,
        cv=3,
        scoring="neg_mean_squared_error",
    )

    # Fit the model on the training data; the extra keywords are forwarded to
    # the estimator's fit method.
    grid_search.fit(
        X_train,
        y_train,
        list_anchor_nodes_labels=list_anchor_nodes_labels,
        anchor_nodes_label_name=anchor_nodes_label_name,
    )

    logger.info("Best parameters found: %s", grid_search.best_params_)
    # best_score_ is a negated MSE, so flip the sign before reporting it.
    logger.info("Best cross-validation score: %s", -grid_search.best_score_)

    # Evaluate the best model on the held-out part
    best_model = grid_search.best_estimator_
    test_score = best_model.score(X_test, y_test)
    logger.info("Test set score: %s", test_score)

    return grid_search
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import networkx as nx
|
|
2
|
+
|
|
3
|
+
from .classes.extended_boosting_matrix import ExtendedBoostingMatrix
|
|
4
|
+
from .classes.sequential_path_boost import SequentialPathBoost
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def split_dataset_by_metal_centers(
    graphs_list: list[nx.Graph], anchor_nodes_label_name: str, anchor_nodes: list
) -> list[list[int]]:
    """
    Group graph indices by the anchor label found in each graph.

    For every entry of ``anchor_nodes`` the graphs are scanned in order, and the
    index of each graph for which
    ``ExtendedBoostingMatrix.find_labelled_path_in_nx_graph`` returns a non-empty
    result is collected. A graph may therefore appear in several groups, or in none.

    Args:
        graphs_list (list[nx.Graph]): Graphs to inspect.
        anchor_nodes_label_name (str): Name of the node attribute holding the label.
        anchor_nodes (list): Anchor labels; one output group is built per entry.

    Returns:
        list[list[int]]: One list of graph indices per anchor label, in the same
            order as ``anchor_nodes``.
    """

    def contains_anchor(graph, anchor_node_label) -> bool:
        # A non-empty search result means the label occurs in this graph.
        matches = ExtendedBoostingMatrix.find_labelled_path_in_nx_graph(
            graph=graph,
            path_labels=anchor_node_label,
            main_label_name=anchor_nodes_label_name,
        )
        return len(matches) > 0

    return [
        [
            position
            for position, graph in enumerate(graphs_list)
            if contains_anchor(graph, anchor_node_label)
        ]
        for anchor_node_label in anchor_nodes
    ]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def train_pattern_boosting(
    input_from_parallelization: tuple,
) -> SequentialPathBoost | None:
    """Fit the model packed in a work item and return it.

    The tuple is read by position: ``(model, X, y, anchor label, label attribute
    name)``. When the model slot is ``None`` nothing is fitted and ``None`` is
    returned without reading the remaining slots.
    """
    model: SequentialPathBoost = input_from_parallelization[0]
    if model is None:
        return None

    fit_arguments = {
        "X": input_from_parallelization[1],
        "y": input_from_parallelization[2],
        "eval_set": None,
        # The work item carries one anchor entry; fit receives it wrapped in a list.
        "list_anchor_nodes_labels": [input_from_parallelization[3]],
        "anchor_nodes_label_name": input_from_parallelization[4],
    }
    model.fit(**fit_arguments)
    return model
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def parallel_predict(input_from_parallelization: tuple):
    """Run ``model.predict`` for one packed ``(model, X)`` work item.

    Returns ``None`` when the model slot is ``None`` or ``X`` has length zero;
    otherwise returns whatever ``model.predict(X)`` returns.
    """
    model = input_from_parallelization[0]
    X = input_from_parallelization[1]
    # len() is used on purpose: X may be a container without plain truthiness.
    nothing_to_predict = model is None or len(X) == 0
    return None if nothing_to_predict else model.predict(X)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def parallel_predict_step_by_step(input_from_parallelization: tuple):
    """Run ``model.predict_step_by_step`` for one packed ``(model, X)`` work item.

    Returns ``None`` when the model slot is ``None`` or ``X`` has length zero;
    otherwise returns whatever ``model.predict_step_by_step(X)`` returns.
    """
    model = input_from_parallelization[0]
    X = input_from_parallelization[1]
    # len() is used on purpose: X may be a container without plain truthiness.
    nothing_to_predict = model is None or len(X) == 0
    return None if nothing_to_predict else model.predict_step_by_step(X)
|
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import networkx as nx
|
|
3
|
+
from sklearn.model_selection import train_test_split, GridSearchCV
|
|
4
|
+
from path_boost._path_boost import PathBoost
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _find_path_instances(graph, path_definition, type_attribute_name="feature_0"):
    """
    Enumerate the occurrences of a typed path inside a graph.

    ``path_definition`` is a sequence of node types. An occurrence is a list of
    node ids, each adjacent to the previous one and never repeated, whose
    ``type_attribute_name`` values equal the definition position by position.
    An empty definition yields an empty list.
    """
    path_length = len(path_definition)
    if path_length == 0:
        return []

    def has_type(node, position):
        # Nodes lacking the attribute compare as None and only match a None entry.
        return graph.nodes[node].get(type_attribute_name) == path_definition[position]

    matches = []
    for seed_node in graph.nodes:
        if not has_type(seed_node, 0):
            continue

        # Explicit stack of partial paths; the next position to match in the
        # definition is simply the current length of the partial path.
        pending = [[seed_node]]
        while pending:
            partial = pending.pop()
            if len(partial) == path_length:  # every position matched
                matches.append(partial)
                continue

            next_position = len(partial)
            for neighbor in graph.neighbors(partial[-1]):
                if neighbor not in partial and has_type(neighbor, next_position):
                    # a fresh list per extension keeps the partial paths independent
                    pending.append(partial + [neighbor])
    return matches
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _make_connected_random_graph(num_nodes, graph_density, seed, retry_seed):
    """Draw an Erdos-Renyi graph and return a connected graph derived from it.

    Two draws are attempted (``seed`` then ``retry_seed``). If the second draw is
    still disconnected, its largest connected component is returned. If that draw
    has no edges at all, a path graph with ``max(2, num_nodes // 2)`` nodes is
    returned instead. ``num_nodes`` is expected to be at least 2.
    """
    G = nx.erdos_renyi_graph(num_nodes, graph_density, seed=seed)
    if nx.is_connected(G):
        return G

    G = nx.erdos_renyi_graph(num_nodes, graph_density, seed=retry_seed)
    if nx.is_connected(G):
        return G

    if G.number_of_edges() > 0:
        largest_cc = max(nx.connected_components(G), key=len)
        return G.subgraph(largest_cc).copy()

    # An edgeless draw used to be returned as-is (isolated nodes only) because the
    # old "too small" fallback could never trigger; build a simple path instead.
    return nx.path_graph(max(2, num_nodes // 2), create_using=nx.Graph())


def generate_synthetic_graph_dataset(
    n_graphs=100,
    avg_n_nodes=15,
    std_n_nodes=5,
    graph_density=0.3,
    n_node_types=5,
    anchor_node_types: list | None = None,  # New parameter
    n_numerical_node_features=2,  # e.g., feature_1, feature_2
    n_edge_features=1,
    n_true_paths=3,
    avg_true_path_length=3,
    std_true_path_length=1,
    numerical_feature_idx_for_label=1,  # 0 for feature_0 (type), 1 for feature_1 etc.
    noise_std=0.5,
    random_state=42,
):
    """
    Generates a synthetic dataset of graphs.

    Labels 'y' are derived from predefined "true paths" found in the graphs.
    Node types are stored in 'feature_0'. Numerical features are 'feature_1', 'feature_2', ...
    Edge features are stored as 'edge_feature_0', 'edge_feature_1', ...

    For each graph, the label is the sum over all true paths and all of their
    occurrences of ``weight * (sum of the label feature over the occurrence's nodes)``,
    plus Gaussian noise with standard deviation ``noise_std``.

    Parameters
    ----------
    n_graphs : number of graphs to generate.
    avg_n_nodes, std_n_nodes : mean / std of the normal draw for the node count
        (the draw is floored at 2 nodes).
    graph_density : edge probability passed to ``nx.erdos_renyi_graph``.
    n_node_types : node types are the integers ``0 .. n_node_types - 1``.
    anchor_node_types : types allowed as the first element of a true path; when
        ``None`` or empty, every type is allowed.
    n_numerical_node_features : number of uniform(-1, 1) node features.
    n_edge_features : number of uniform(-1, 1) edge features.
    n_true_paths : number of base paths drawn; every prefix of a base path is
        also added, then duplicates are removed.
    avg_true_path_length, std_true_path_length : mean / std of the normal draw for
        the base path length (floored at 1).
    numerical_feature_idx_for_label : index ``i`` of the ``feature_i`` attribute
        summed along path occurrences; nodes lacking it contribute 0.
    noise_std : standard deviation of the label noise.
    random_state : seed for the NumPy generator; also offsets the per-graph
        seeds, so it must be an integer.

    Returns
    -------
    (graphs, y, true_paths_definitions, true_path_weights) where ``graphs`` is a
    list of ``nx.Graph``, ``y`` a NumPy array of labels, ``true_paths_definitions``
    a list of tuples of node types and ``true_path_weights`` the matching weights.

    Raises
    ------
    ValueError
        If an entry of ``anchor_node_types`` is not a valid node type.
    """
    rng = np.random.RandomState(random_state)

    possible_node_types = list(range(n_node_types))

    # Determine which types can start a true path
    valid_starting_types_for_true_paths = possible_node_types
    if anchor_node_types is not None and len(anchor_node_types) > 0:
        # Ensure anchor_node_types are valid w.r.t. n_node_types
        if not all(t in possible_node_types for t in anchor_node_types):
            raise ValueError(
                f"All anchor_node_types must be within the range [0, {n_node_types-1}]"
            )
        valid_starting_types_for_true_paths = anchor_node_types

    # 1. Define True Paths and their weights
    true_paths_definitions = []
    for _ in range(n_true_paths):
        # max(1, ...) guarantees at least one node per path
        path_len = int(max(1, rng.normal(avg_true_path_length, std_true_path_length)))

        # First element of the path must be from valid_starting_types_for_true_paths
        first_node_type = rng.choice(valid_starting_types_for_true_paths)
        path_def = (int(first_node_type),)
        if path_len > 1:
            remaining_path_types = rng.choice(possible_node_types, size=path_len - 1)
            path_def += tuple(int(x) for x in remaining_path_types)

        # Every prefix shares the (valid) first element, so all of them qualify.
        true_paths_definitions.extend(
            path_def[: end + 1] for end in range(len(path_def))
        )

    true_paths_definitions = list(set(true_paths_definitions))  # Remove duplicates

    # One weight per distinct path (prefixes included)
    true_path_weights = rng.uniform(-2, 2, size=len(true_paths_definitions))

    graphs = []
    y_labels = []

    # feature_0 is type, feature_1 is the first numerical, etc.
    label_feature_name = f"feature_{numerical_feature_idx_for_label}"

    for i_graph in range(n_graphs):
        # 2. Generate a random (connected) graph
        num_nodes = int(max(2, rng.normal(avg_n_nodes, std_n_nodes)))
        G = _make_connected_random_graph(
            num_nodes,
            graph_density,
            seed=random_state + i_graph,
            retry_seed=random_state + i_graph + n_graphs,
        )

        # 3. Assign node features
        for node_idx in G.nodes:
            G.nodes[node_idx]["feature_0"] = rng.choice(possible_node_types)  # Node type
            for f_idx in range(n_numerical_node_features):
                G.nodes[node_idx][f"feature_{f_idx + 1}"] = rng.uniform(-1, 1)

        # 4. Assign edge features
        for u, v in G.edges:
            for ef_idx in range(n_edge_features):
                G.edges[u, v][f"edge_feature_{ef_idx}"] = rng.uniform(-1, 1)

        # 5. Calculate label y for the graph
        current_y = 0.0
        for path_def, path_weight in zip(true_paths_definitions, true_path_weights):
            path_instances = _find_path_instances(
                G, path_def, type_attribute_name="feature_0"
            )
            for instance_nodes in path_instances:
                # Missing attributes count as 0, whichever feature is selected.
                feature_sum_on_instance = sum(
                    G.nodes[node].get(label_feature_name, 0) for node in instance_nodes
                )
                current_y += path_weight * feature_sum_on_instance

        current_y += rng.normal(0, noise_std)

        graphs.append(G)
        y_labels.append(current_y)

    return graphs, np.array(y_labels), true_paths_definitions, true_path_weights
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _run_grid_search_example():
    """Demo script: build a synthetic dataset, tune PathBoost with GridSearchCV,
    refit a final model with the best parameters and plot its results."""
    n_node_types = 5  # Total distinct types of nodes, e.g., 0, 1, 2, 3, 4
    # Node types used as anchors both for true path generation and for PathBoost
    anchor_types = [0, 1, 2]

    # --- Synthetic dataset ---
    nx_graphs, y, true_paths, true_weights = generate_synthetic_graph_dataset(
        n_graphs=100,
        avg_n_nodes=12,
        std_n_nodes=3,
        graph_density=0.4,
        n_node_types=n_node_types,
        anchor_node_types=anchor_types,
        n_numerical_node_features=2,  # feature_1, feature_2
        n_edge_features=1,
        n_true_paths=4,  # number of "base" true paths, prefixes will be added
        avg_true_path_length=3,
        std_true_path_length=1,
        numerical_feature_idx_for_label=1,  # Use feature_1 for label calculation
        noise_std=0.1,
        random_state=42,
    )

    print(f"Generated {len(nx_graphs)} graphs.")
    print(f"Example y values: {y[:5]}")
    print(f"True paths definitions (may include prefixes): {true_paths}")
    print(f"True path weights: {true_weights}")

    # Arguments forwarded to every PathBoost.fit call below
    fit_anchor_arguments = {
        "list_anchor_nodes_labels": anchor_types,  # same anchors as for generation
        "anchor_nodes_label_name": "feature_0",  # Node types are in 'feature_0'
    }

    X_train, X_test, y_train, y_test = train_test_split(
        nx_graphs, y, test_size=0.25, random_state=42
    )

    # --- GridSearchCV for hyperparameter tuning ---
    print("\nStarting GridSearchCV for hyperparameter tuning...")

    grid_search = GridSearchCV(
        # Base model for CV: fewer iterations for speed, variable importance disabled
        estimator=PathBoost(
            n_iter=20,
            n_of_cores=1,
            verbose=False,
            parameters_variable_importance=None,
        ),
        param_grid={
            "learning_rate": [0.01, 0.1, 0.8],
            "max_path_length": [3, 4],
            "kwargs_for_base_learner": [{"max_depth": 3}, {"max_depth": 5}],
        },
        scoring="neg_mean_squared_error",
        cv=3,
        verbose=1,
        n_jobs=1,
    )

    # The anchor keywords are needed by PathBoost.fit
    grid_search.fit(X_train, y_train, **fit_anchor_arguments)

    print("\nGridSearchCV finished.")
    print(f"Best parameters found: {grid_search.best_params_}")
    print(f"Best cross-validation score (Negative MSE): {grid_search.best_score_}")

    # --- Fit final model with best parameters ---
    print("\nFitting final PathBoost model with best parameters...")
    best_params_from_cv = grid_search.best_params_

    path_boost_final = PathBoost(
        max_path_length=best_params_from_cv["max_path_length"],
        learning_rate=best_params_from_cv["learning_rate"],
        kwargs_for_base_learner=best_params_from_cv["kwargs_for_base_learner"],
        verbose=True,
        parameters_variable_importance={
            "criterion": "absolute",
            "error_used": "mse",
            "use_correlation": False,
            "normalize": True,
        },
    )
    path_boost_final.fit(
        X=X_train,
        y=y_train,
        eval_set=[(X_test, y_test)],
        **fit_anchor_arguments,
    )

    print("\nPlotting results for the final tuned model...")
    path_boost_final.plot_training_and_eval_errors(
        skip_first_n_iterations=0, plot_eval_sets_error=True
    )

    importance_available = (
        path_boost_final.parameters_variable_importance is not None
        and hasattr(path_boost_final, "variable_importance_")
    )
    if importance_available:
        path_boost_final.plot_variable_importance(top_n_features=10)
    else:
        print("Variable importance not computed or available for the final model.")

    print("\nExample run with GridSearchCV finished.")


if __name__ == "__main__":
    _run_grid_search_example()
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""
|
|
2
|
+
The :mod:`path_boost.utils.discovery` module includes utilities to discover
|
|
3
|
+
objects (i.e. estimators, displays, functions) from the `path_boost` package.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
# Adapted from scikit-learn
|
|
7
|
+
# Authors: scikit-learn-contrib developers
|
|
8
|
+
# License: BSD 3 clause
|
|
9
|
+
|
|
10
|
+
import inspect
|
|
11
|
+
import pkgutil
|
|
12
|
+
from importlib import import_module
|
|
13
|
+
from operator import itemgetter
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from sklearn.base import (
|
|
17
|
+
BaseEstimator,
|
|
18
|
+
ClassifierMixin,
|
|
19
|
+
ClusterMixin,
|
|
20
|
+
RegressorMixin,
|
|
21
|
+
TransformerMixin,
|
|
22
|
+
)
|
|
23
|
+
from sklearn.utils._testing import ignore_warnings
|
|
24
|
+
|
|
25
|
+
_MODULE_TO_IGNORE = {"tests"}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def all_estimators(type_filter=None):
    """List every estimator class exposed by `path_boost`.

    The package is walked module by module and each public class deriving from
    `BaseEstimator` is collected. Abstract classes and anything living in a
    test module are left out.

    Parameters
    ----------
    type_filter : {"classifier", "regressor", "cluster", "transformer"} \
            or list of such str, default=None
        Restricts the result to estimators of the given kind(s). With a list,
        an estimator is kept when it matches at least one kind. With None,
        every estimator is returned.

    Returns
    -------
    estimators : list of tuples
        ``(name, class)`` pairs sorted by name, where ``name`` is the class name
        as a string and ``class`` is the class object itself.

    Raises
    ------
    ValueError
        If ``type_filter`` contains an entry that is not a supported kind.

    Examples
    --------
    >>> from path_boost.utils.discovery import all_estimators
    >>> estimators = all_estimators()
    >>> type(estimators)
    <class 'list'>
    """

    def is_abstract(c):
        # Classes without the attribute, or with an empty set, are concrete.
        return bool(getattr(c, "__abstractmethods__", ()))

    package_root = str(Path(__file__).parent.parent)  # path_boost package
    discovered = []
    # Ignore deprecation warnings triggered at import time and from walking
    # packages
    with ignore_warnings(category=FutureWarning):
        for _, module_name, _ in pkgutil.walk_packages(
            path=[package_root], prefix="path_boost."
        ):
            if not _MODULE_TO_IGNORE.isdisjoint(module_name.split(".")):
                continue
            module = import_module(module_name)
            discovered.extend(
                member
                for member in inspect.getmembers(module, inspect.isclass)
                if not member[0].startswith("_")
            )

    # Keep concrete BaseEstimator subclasses only (the base class itself excluded).
    estimators = [
        (name, est_cls)
        for name, est_cls in set(discovered)
        if issubclass(est_cls, BaseEstimator)
        and name != "BaseEstimator"
        and not is_abstract(est_cls)
    ]

    if type_filter is not None:
        if isinstance(type_filter, list):
            type_filter = list(type_filter)  # copy, entries are consumed below
        else:
            type_filter = [type_filter]

        mixin_by_kind = {
            "classifier": ClassifierMixin,
            "regressor": RegressorMixin,
            "transformer": TransformerMixin,
            "cluster": ClusterMixin,
        }
        selected = []
        for kind, mixin in mixin_by_kind.items():
            if kind not in type_filter:
                continue
            type_filter.remove(kind)
            selected.extend(est for est in estimators if issubclass(est[1], mixin))
        estimators = selected

        # Whatever is left over was not a recognised kind.
        if type_filter:
            raise ValueError(
                "Parameter type_filter must be 'classifier', "
                "'regressor', 'transformer', 'cluster' or "
                "None, got"
                f" {repr(type_filter)}."
            )

    # Deduplicate, then sort on the name only so classes are never compared.
    return sorted(set(estimators), key=itemgetter(0))
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def all_displays():
    """List every display class exposed by `path_boost`.

    A display is any public class whose name ends with ``"Display"``; modules
    belonging to test packages are skipped.

    Returns
    -------
    displays : list of tuples
        ``(name, class)`` pairs sorted by name, where ``name`` is the display
        class name as a string and ``class`` is the class object itself.

    Examples
    --------
    >>> from path_boost.utils.discovery import all_displays
    >>> displays = all_displays()
    """
    package_root = str(Path(__file__).parent.parent)  # path_boost package
    found = []
    # Ignore deprecation warnings triggered at import time and from walking
    # packages
    with ignore_warnings(category=FutureWarning):
        for _, module_name, _ in pkgutil.walk_packages(
            path=[package_root], prefix="path_boost."
        ):
            if not _MODULE_TO_IGNORE.isdisjoint(module_name.split(".")):
                continue
            module = import_module(module_name)
            for name, display_class in inspect.getmembers(module, inspect.isclass):
                if name.endswith("Display") and not name.startswith("_"):
                    found.append((name, display_class))

    # Deduplicate, then sort on the name only.
    return sorted(set(found), key=itemgetter(0))
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _is_checked_function(item):
    """Tell whether ``item`` is a public function defined inside `path_boost`.

    Returns True only for plain Python functions whose name does not start with
    an underscore and whose ``__module__`` starts with ``"path_boost."`` without
    ending in ``"estimator_checks"``.
    """
    if not inspect.isfunction(item) or item.__name__.startswith("_"):
        return False

    module_name = item.__module__
    inside_package = module_name.startswith("path_boost.")
    return inside_package and not module_name.endswith("estimator_checks")
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def all_functions():
    """List every public function exposed by `path_boost`.

    Modules belonging to test packages are skipped; within the others, only the
    members accepted by ``_is_checked_function`` are collected.

    Returns
    -------
    functions : list of tuples
        ``(name, function)`` pairs sorted by name, where ``name`` is the
        function name as a string and ``function`` is the function object.

    Examples
    --------
    >>> from path_boost.utils.discovery import all_functions
    >>> functions = all_functions()
    """
    package_root = str(Path(__file__).parent.parent)  # path_boost package
    collected = []
    # Ignore deprecation warnings triggered at import time and from walking
    # packages
    with ignore_warnings(category=FutureWarning):
        for _, module_name, _ in pkgutil.walk_packages(
            path=[package_root], prefix="path_boost."
        ):
            if not _MODULE_TO_IGNORE.isdisjoint(module_name.split(".")):
                continue

            module = import_module(module_name)
            for member_name, func in inspect.getmembers(module, _is_checked_function):
                # Filter on the name the module binds it to; report the
                # function's own __name__.
                if not member_name.startswith("_"):
                    collected.append((func.__name__, func))

    # Deduplicate, then sort on the name only so functions are never compared.
    return sorted(set(collected), key=itemgetter(0))
|