asf 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- asf/__init__.py +0 -0
- asf/cli/__init__.py +0 -0
- asf/cli/cli_train.py +161 -0
- asf/epm/__init__.py +4 -0
- asf/epm/epm.py +147 -0
- asf/epm/epm_tuner.py +141 -0
- asf/metrics/__init__.py +14 -0
- asf/metrics/baselines.py +127 -0
- asf/pre_selector/__init__.py +18 -0
- asf/pre_selector/abstract_pre_selector.py +32 -0
- asf/pre_selector/beam_search_pre_selection.py +84 -0
- asf/pre_selector/brute_force_pre_selection.py +83 -0
- asf/pre_selector/knee_of_the_curve_pre_selector.py +140 -0
- asf/pre_selector/marginal_contribution_based.py +97 -0
- asf/pre_selector/optimize_pre_selection.py +146 -0
- asf/pre_selector/sbs_pre_selection.py +75 -0
- asf/predictors/__init__.py +33 -0
- asf/predictors/abstract_predictor.py +155 -0
- asf/predictors/epm_extra_trees.py +137 -0
- asf/predictors/epm_random_forest.py +215 -0
- asf/predictors/linear_model.py +245 -0
- asf/predictors/mlp.py +329 -0
- asf/predictors/random_forest.py +293 -0
- asf/predictors/ranking_mlp.py +207 -0
- asf/predictors/regression_mlp.py +165 -0
- asf/predictors/sklearn_wrapper.py +108 -0
- asf/predictors/svm.py +301 -0
- asf/predictors/utils/datasets.py +90 -0
- asf/predictors/utils/losses.py +40 -0
- asf/predictors/utils/mlp.py +26 -0
- asf/predictors/xgboost.py +553 -0
- asf/preprocessing/__init__.py +27 -0
- asf/preprocessing/performace_scaling.py +500 -0
- asf/preprocessing/sklearn_preprocessor.py +49 -0
- asf/presolving/__init__.py +7 -0
- asf/presolving/asap_v2.py +277 -0
- asf/presolving/aspeed.py +189 -0
- asf/presolving/presolver.py +22 -0
- asf/scenario/__init__.py +3 -0
- asf/scenario/aslib_reader.py +207 -0
- asf/scenario/epmbench_reader.py +178 -0
- asf/selectors/__init__.py +28 -0
- asf/selectors/abstract_model_based_selector.py +70 -0
- asf/selectors/abstract_selector.py +249 -0
- asf/selectors/feature_generator.py +38 -0
- asf/selectors/joint_ranking.py +107 -0
- asf/selectors/mutli_class.py +60 -0
- asf/selectors/pairwise_classifier.py +252 -0
- asf/selectors/pairwise_regressor.py +235 -0
- asf/selectors/performance_model.py +156 -0
- asf/selectors/selector_pipeline.py +128 -0
- asf/selectors/selector_tuner.py +196 -0
- asf/selectors/simple_ranking.py +133 -0
- asf/selectors/survival_analysis.py +146 -0
- asf/utils/groupkfoldshuffle.py +29 -0
- asf-0.1.1.dist-info/METADATA +179 -0
- asf-0.1.1.dist-info/RECORD +60 -0
- asf-0.1.1.dist-info/WHEEL +5 -0
- asf-0.1.1.dist-info/licenses/LICENSE +21 -0
- asf-0.1.1.dist-info/top_level.txt +1 -0
asf/__init__.py
ADDED
File without changes
asf/cli/__init__.py
ADDED
File without changes
asf/cli/cli_train.py
ADDED
@@ -0,0 +1,161 @@
#!/usr/bin/env python3
"""CLI entry point for training selectors.

This script provides a command-line interface for training model-based selectors.
It allows users to specify the selector type, model, budget, and other parameters
to train and save the selector model.
"""

import argparse
from pathlib import Path
from functools import partial
from typing import Dict, Callable, List

import pandas as pd

from asf import selectors

import sklearn.ensemble  # imported as a submodule so getattr(sklearn.ensemble, ...) below works

# Mapping of file extensions to pandas read functions.
# Note: not every reader here accepts the index_col argument used below
# (e.g. pd.read_parquet does not); .csv is the safest choice.
pandas_read_map: Dict[str, Callable] = {
    ".csv": pd.read_csv,
    ".parquet": pd.read_parquet,
    ".json": pd.read_json,
    ".feather": pd.read_feather,
    ".hdf": pd.read_hdf,
    ".html": pd.read_html,
    ".xml": pd.read_xml,
}


def parser_function() -> argparse.ArgumentParser:
    """Define command line arguments for the CLI.

    Returns:
        argparse.ArgumentParser: The argument parser with defined arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--selector",
        choices=selectors.__implemented__,
        required=True,
        help="Selector to train",
    )
    parser.add_argument(
        "--model",
        default="RandomForestClassifier",
        help="Model to use for the selector. "
        "Make sure to specify as an attribute of sklearn.ensemble.",
    )
    parser.add_argument(
        "--budget",
        type=int,
        default=None,
        required=False,
        help="Budget for the solvers",
    )
    parser.add_argument(
        "--maximize",
        # Note: argparse's type=bool converts any non-empty string (even "False") to True.
        type=bool,
        default=False,
        required=False,
        help="Maximize the objective",
    )
    parser.add_argument(
        "--performance-metric",
        type=str,
        default="",
        required=False,
        help="Performance metric to optimize",
    )
    parser.add_argument(
        "--feature-data",
        type=Path,
        required=True,
        help="Path to feature data",
    )
    parser.add_argument(
        "--performance-data",
        type=Path,
        required=True,
        help="Path to performance data",
    )
    parser.add_argument(
        "--model-path",
        type=Path,
        required=True,
        help="Path to save model",
    )
    return parser


def build_cli_command(
    selector: selectors.AbstractModelBasedSelector,
    feature_data: Path,
    performance_data: Path,
    destination: Path,
) -> List[str]:
    """Build a CLI command from variables for async jobs.

    Args:
        selector (selectors.AbstractModelBasedSelector): Selector to train.
        feature_data (Path): Path to feature data DataFrame.
        performance_data (Path): Path to performance data DataFrame.
        destination (Path): Path to save the trained model.

    Returns:
        List[str]: A list of command-line arguments to execute the training job.
    """
    model_class = (
        selector.model_class.args[0]
        if isinstance(selector.model_class, partial)
        else selector.model_class
    )
    return [
        "python",
        str(Path(__file__).absolute()),
        "--selector",
        type(selector).__name__,
        "--model",
        f"{model_class.__name__}",
        "--budget",
        str(selector.budget),
        "--maximize",
        str(selector.maximize),
        "--performance-metric",
        str(selector.performance_metric),
        "--feature-data",
        str(feature_data),
        "--performance-data",
        str(performance_data),
        "--model-path",
        str(destination),
    ]


if __name__ == "__main__":
    parser = parser_function()
    args = parser.parse_args()

    # Resolve the selector and model classes from their names
    selector_class = getattr(selectors, args.selector)
    model_class = getattr(sklearn.ensemble, args.model)

    # Read the training data, dispatching on file extension
    features: pd.DataFrame = pandas_read_map[args.feature_data.suffix](
        args.feature_data, index_col=0
    )
    performance_data: pd.DataFrame = pandas_read_map[args.performance_data.suffix](
        args.performance_data, index_col=0
    )

    selector = selector_class(
        model_class,
        maximize=args.maximize,
        budget=args.budget,
    )
    selector.fit(features, performance_data)

    # Save the model to the specified path
    selector.save(args.model_path)
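A minimal sketch of invoking this entry point, assuming the module can be run via python -m and that a selector named PerformanceModel is exposed by asf.selectors (both are assumptions; consult selectors.__implemented__ for the real names):

import subprocess

subprocess.run(
    [
        "python", "-m", "asf.cli.cli_train",
        "--selector", "PerformanceModel",        # must appear in selectors.__implemented__
        "--model", "RandomForestRegressor",      # resolved via getattr(sklearn.ensemble, ...)
        "--feature-data", "features.csv",        # suffix selects the pandas reader (.csv -> pd.read_csv)
        "--performance-data", "performance.csv", # hypothetical input path
        "--model-path", "selector.pkl",          # hypothetical output path
    ],
    check=True,
)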
asf/epm/__init__.py
ADDED
asf/epm/epm.py
ADDED
@@ -0,0 +1,147 @@
from functools import partial
from typing import Type, Union, Optional

import pandas as pd
import numpy as np
from sklearn.base import RegressorMixin, TransformerMixin

from asf.preprocessing.performace_scaling import AbstractNormalization, LogNormalization
from asf.preprocessing.sklearn_preprocessor import get_default_preprocessor
from asf.predictors import SklearnWrapper
from asf.predictors.abstract_predictor import AbstractPredictor


class EPM:
    """
    The EPM (Empirical Performance Model) class is a wrapper for machine learning models
    that includes preprocessing, normalization, and optional inverse transformation of predictions.

    Attributes:
        predictor_class (Type[AbstractPredictor] | Type[RegressorMixin]): The class of the predictor to use.
        normalization_class (Type[AbstractNormalization]): The normalization class to apply to the target variable.
        transform_back (bool): Whether to apply inverse transformation to predictions.
        features_preprocessing (Union[str, TransformerMixin]): Preprocessing pipeline for features.
        predictor_config (Optional[dict]): Configuration for the predictor.
        predictor_kwargs (Optional[dict]): Additional keyword arguments for the predictor.
    """

    def __init__(
        self,
        predictor_class: Union[Type[AbstractPredictor], Type[RegressorMixin]],
        normalization_class: Type[AbstractNormalization] = LogNormalization,
        transform_back: bool = True,
        features_preprocessing: Union[str, TransformerMixin] = "default",
        categorical_features: Optional[list] = None,
        numerical_features: Optional[list] = None,
        predictor_config: Optional[dict] = None,
        predictor_kwargs: Optional[dict] = None,
    ):
        """
        Initialize the EPM model.

        Parameters:
            predictor_class (Type[AbstractPredictor] | Type[RegressorMixin]): The class of the predictor to use.
            normalization_class (Type[AbstractNormalization]): The normalization class to apply to the target variable.
            transform_back (bool): Whether to apply inverse transformation to predictions.
            features_preprocessing (Union[str, TransformerMixin]): Preprocessing pipeline for features.
            categorical_features (Optional[list]): List of categorical feature names.
            numerical_features (Optional[list]): List of numerical feature names.
            predictor_config (Optional[dict]): Configuration for the predictor.
            predictor_kwargs (Optional[dict]): Additional keyword arguments for the predictor.
        """
        # Plain sklearn regressor classes are wrapped so they expose the predictor interface
        if isinstance(predictor_class, type) and issubclass(
            predictor_class, RegressorMixin
        ):
            self.model_class = partial(SklearnWrapper, predictor_class)
        else:
            self.model_class = predictor_class

        self.predictor_class = predictor_class
        self.normalization_class = normalization_class
        self.transform_back = transform_back
        self.predictor_config = predictor_config
        self.predictor_kwargs = predictor_kwargs or {}
        self.numpy = False

        if features_preprocessing == "default":
            self.features_preprocessing = get_default_preprocessor(
                categorical_features=categorical_features,
                numerical_features=numerical_features,
            )
        else:
            self.features_preprocessing = features_preprocessing

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray, list],
        y: Union[pd.Series, np.ndarray, list],
        sample_weight: Optional[list] = None,
    ) -> "EPM":
        """
        Fit the EPM model to the data.

        Parameters:
            X (Union[pd.DataFrame, np.ndarray, list]): Features.
            y (Union[pd.Series, np.ndarray, list]): Target variable.
            sample_weight (Optional[list]): Sample weights (optional).

        Returns:
            EPM: The fitted EPM model.
        """
        if isinstance(X, np.ndarray) and isinstance(y, np.ndarray):
            # Wrap raw numpy input in pandas containers with generated column names
            X = pd.DataFrame(
                X,
                index=range(len(X)),
                columns=[f"f_{i}" for i in range(X.shape[1])],
            )
            y = pd.Series(y, index=range(len(y)))
            self.numpy = True

        if self.features_preprocessing is not None:
            X = self.features_preprocessing.fit_transform(X)

        self.normalization = self.normalization_class()
        self.normalization.fit(y)
        y = self.normalization.transform(y)

        if self.predictor_config is None:
            self.predictor = self.predictor_class()
        else:
            self.predictor = self.predictor_class.get_from_configuration(
                self.predictor_config, **self.predictor_kwargs
            )()

        self.predictor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X: Union[pd.DataFrame, np.ndarray, list]) -> list:
        """
        Predict using the fitted EPM model.

        Parameters:
            X (Union[pd.DataFrame, np.ndarray, list]): Features.

        Returns:
            list: Predicted values.
        """
        if self.numpy and isinstance(X, np.ndarray):
            X = pd.DataFrame(
                X,
                index=range(len(X)),
                columns=[f"f_{i}" for i in range(X.shape[1])],
            )

        if self.features_preprocessing is not None:
            X = self.features_preprocessing.transform(X)

        y_pred = self.predictor.predict(X)

        if self.transform_back:
            y_pred = self.normalization.inverse_transform(y_pred)

        return y_pred
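A usage sketch (not from the package): any scikit-learn regressor class can be passed as predictor_class, since it is wrapped via SklearnWrapper. Positive targets are assumed so the default LogNormalization is well-defined, and the default preprocessor is assumed to infer feature types when none are given:

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from asf.epm.epm import EPM

rng = np.random.default_rng(0)
X = rng.random((100, 5))             # numpy input: columns are auto-named f_0 .. f_4
y = rng.random(100) + 0.1            # strictly positive, safe for log scaling

epm = EPM(predictor_class=RandomForestRegressor)
epm.fit(X, y)                        # preprocessing and normalization happen inside
preds = epm.predict(X)               # transform_back=True maps predictions to the original scale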
asf/epm/epm_tuner.py
ADDED
@@ -0,0 +1,141 @@
from typing import Type, Union, Optional

import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from smac import HyperparameterOptimizationFacade, Scenario
from asf.utils.groupkfoldshuffle import GroupKFoldShuffle

from asf.epm.epm import EPM
from asf.preprocessing.performace_scaling import AbstractNormalization, LogNormalization
from asf.predictors.abstract_predictor import AbstractPredictor


def tune_epm(
    X: np.ndarray,
    y: np.ndarray,
    model_class: Type[AbstractPredictor],
    normalization_class: Type[AbstractNormalization] = LogNormalization,
    features_preprocessing: Union[str, TransformerMixin] = "default",
    categorical_features: Optional[list] = None,
    numerical_features: Optional[list] = None,
    groups: Optional[np.ndarray] = None,
    cv: int = 5,
    timeout: int = 3600,
    runcount_limit: int = 100,
    output_dir: str = "./smac_output",
    seed: int = 0,
    smac_metric: callable = mean_squared_error,
    smac_scenario_kwargs: Optional[dict] = None,
    smac_kwargs: Optional[dict] = None,
    predictor_kwargs: Optional[dict] = None,
) -> EPM:
    """
    Tune the Empirical Performance Model (EPM) using SMAC (Sequential Model-based Algorithm Configuration).

    Parameters
    ----------
    X : np.ndarray
        Feature matrix for training and validation.
    y : np.ndarray
        Target values corresponding to the feature matrix.
    model_class : Type[AbstractPredictor]
        The predictor class to be tuned.
    normalization_class : Type[AbstractNormalization], optional
        The normalization class to be applied to the data. Defaults to LogNormalization.
    features_preprocessing : Union[str, TransformerMixin], optional
        Preprocessing method for features. Defaults to "default".
    categorical_features : Optional[list], optional
        List of categorical feature names. Defaults to None.
    numerical_features : Optional[list], optional
        List of numerical feature names. Defaults to None.
    groups : Optional[np.ndarray], optional
        Group labels for cross-validation. Defaults to None.
    cv : int, optional
        Number of cross-validation folds. Defaults to 5.
    timeout : int, optional
        Time limit for the tuning process in seconds. Defaults to 3600.
    runcount_limit : int, optional
        Maximum number of configurations to evaluate. Defaults to 100.
    output_dir : str, optional
        Directory to store SMAC output. Defaults to "./smac_output".
    seed : int, optional
        Random seed for reproducibility. Defaults to 0.
    smac_metric : callable, optional
        Metric function to evaluate model performance. Defaults to mean_squared_error.
    smac_scenario_kwargs : Optional[dict], optional
        Additional keyword arguments for the SMAC scenario. Defaults to None.
    smac_kwargs : Optional[dict], optional
        Additional keyword arguments for SMAC optimization. Defaults to None.
    predictor_kwargs : Optional[dict], optional
        Additional keyword arguments for the predictor. Defaults to None.

    Returns
    -------
    EPM
        The tuned Empirical Performance Model instance.
    """
    # Avoid mutable default arguments; None is the documented default.
    smac_scenario_kwargs = smac_scenario_kwargs or {}
    smac_kwargs = smac_kwargs or {}
    predictor_kwargs = predictor_kwargs or {}

    if isinstance(X, np.ndarray) and isinstance(y, np.ndarray):
        X = pd.DataFrame(
            X,
            index=range(len(X)),
            columns=[f"f_{i}" for i in range(X.shape[1])],
        )
        y = pd.Series(y, index=range(len(y)))

    scenario = Scenario(
        configspace=model_class.get_configuration_space(),
        n_trials=runcount_limit,
        walltime_limit=timeout,
        deterministic=True,
        output_directory=output_dir,
        seed=seed,
        **smac_scenario_kwargs,
    )

    def target_function(config, seed):
        # Use group-aware splits when group labels are provided
        if groups is not None:
            kfold = GroupKFoldShuffle(n_splits=cv, shuffle=True, random_state=seed)
        else:
            kfold = KFold(n_splits=cv, shuffle=True, random_state=seed)

        scores = []
        for train_idx, test_idx in kfold.split(X, y, groups):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            epm = EPM(
                predictor_class=model_class,
                normalization_class=normalization_class,
                transform_back=True,
                predictor_config=config,
                predictor_kwargs=predictor_kwargs,
                features_preprocessing=features_preprocessing,
                categorical_features=categorical_features,
                numerical_features=numerical_features,
            )
            epm.fit(X_train, y_train)

            y_pred = epm.predict(X_test)
            score = smac_metric(y_test, y_pred)
            scores.append(score)

        return np.mean(scores)

    smac = HyperparameterOptimizationFacade(scenario, target_function, **smac_kwargs)
    best_config = smac.optimize()

    return EPM(
        predictor_class=model_class,
        normalization_class=normalization_class,
        transform_back=True,
        predictor_config=best_config,
        features_preprocessing=features_preprocessing,
        categorical_features=categorical_features,
        numerical_features=numerical_features,
    )
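A hedged sketch of a tuning call. The code above requires model_class to provide get_configuration_space() and get_from_configuration(); SomePredictor is a placeholder for one of the classes in asf.predictors:

import numpy as np
from asf.epm.epm_tuner import tune_epm

X = np.random.rand(200, 10)
y = np.random.rand(200) + 0.1

epm = tune_epm(
    X,
    y,
    model_class=SomePredictor,   # placeholder: an AbstractPredictor subclass
    cv=3,
    runcount_limit=20,           # evaluate at most 20 configurations
    timeout=600,                 # or stop after 10 minutes, whichever comes first
)
epm.fit(X, y)                    # tune_epm returns an unfitted EPM carrying the best config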
asf/metrics/__init__.py
ADDED
@@ -0,0 +1,14 @@
from asf.metrics.baselines import (
    single_best_solver,
    virtual_best_solver,
    running_time_selector_performance,
    running_time_closed_gap,
)


__all__ = [
    "single_best_solver",
    "virtual_best_solver",
    "running_time_selector_performance",
    "running_time_closed_gap",
]
asf/metrics/baselines.py
ADDED
@@ -0,0 +1,127 @@
import pandas as pd
from typing import Dict, List, Tuple, Optional


def single_best_solver(performance: pd.DataFrame, maximize: bool = False) -> float:
    """
    Selects the single best solver across all instances based on the aggregated performance.

    Args:
        performance (pd.DataFrame): The performance data for the algorithms.
        maximize (bool): Whether to maximize or minimize the performance.

    Returns:
        float: The best aggregated performance value across all instances.
    """
    perf_sum = performance.sum(axis=0)
    if maximize:
        return perf_sum.max()
    else:
        return perf_sum.min()


def virtual_best_solver(performance: pd.DataFrame, maximize: bool = False) -> float:
    """
    Selects the virtual best solver for each instance by choosing the best performance per instance.

    Args:
        performance (pd.DataFrame): The performance data for the algorithms.
        maximize (bool): Whether to maximize or minimize the performance.

    Returns:
        float: The sum of the best performance values for each instance.
    """
    if maximize:
        return performance.max(axis=1).sum()
    else:
        return performance.min(axis=1).sum()


def running_time_selector_performance(
    schedules: Dict[str, List[Tuple[str, float]]],
    performance: pd.DataFrame,
    budget: float = 5000,
    par: float = 10,
    feature_time: Optional[pd.DataFrame] = None,
) -> float:
    """
    Calculates the total running time for a selector based on the given schedules and performance data.

    Args:
        schedules (Dict[str, List[Tuple[str, float]]]): The schedules to evaluate, where each key is an instance
            and the value is a list of tuples (algorithm, allocated budget).
        performance (pd.DataFrame): The performance data for the algorithms.
        budget (float): The budget for the scenario.
        par (float): The penalization factor for unsolved instances.
        feature_time (Optional[pd.DataFrame]): The feature time data for each instance. Defaults to zero if not provided.

    Returns:
        float: The total running time summed over all instances, with each unsolved
        instance counted as budget * par.
    """
    if feature_time is None:
        feature_time = pd.DataFrame(
            0, index=performance.index, columns=["feature_time"]
        )
    total_time = {}
    for instance, schedule in schedules.items():
        allocated_times = {algorithm: 0 for algorithm in performance.columns}
        solved = False
        for algorithm, algo_budget in schedule:
            remaining_budget = (
                budget
                - sum(allocated_times.values())
                - feature_time.loc[instance].sum().item()
            )
            remaining_time_to_solve = performance.loc[instance, algorithm] - (
                algo_budget + allocated_times[algorithm]
            )
            if remaining_time_to_solve < 0:
                # The algorithm finishes within its allotted slot
                allocated_times[algorithm] = performance.loc[instance, algorithm]
                solved = True
                break
            elif remaining_time_to_solve <= remaining_budget:
                allocated_times[algorithm] += remaining_time_to_solve
            else:
                allocated_times[algorithm] += remaining_budget
                break
        if solved:
            total_time[instance] = (
                sum(allocated_times.values()) + feature_time.loc[instance].sum().item()
            )
        else:
            total_time[instance] = budget * par

    return sum(total_time.values())


def running_time_closed_gap(
    schedules: Dict[str, List[Tuple[str, float]]],
    performance: pd.DataFrame,
    budget: float,
    feature_time: pd.DataFrame,
    par: float = 10,
) -> float:
    """
    Calculates the closed gap metric for a given selector.

    Args:
        schedules (Dict[str, List[Tuple[str, float]]]): The schedules to evaluate.
        performance (pd.DataFrame): The performance data for the algorithms.
        budget (float): The budget for the scenario.
        feature_time (pd.DataFrame): The feature time data for each instance.
        par (float): The penalization factor for unsolved instances.

    Returns:
        float: The closed gap value, representing the improvement of the selector over the single best solver
        relative to the virtual best solver.
    """
    sbs_val = single_best_solver(performance, False)
    vbs_val = virtual_best_solver(performance, False)
    s_val = running_time_selector_performance(
        schedules, performance, budget, par, feature_time
    )

    return (sbs_val - s_val) / (sbs_val - vbs_val)
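A small worked example with made-up numbers, showing how the baselines relate (not from the package):

import pandas as pd
from asf.metrics.baselines import (
    single_best_solver,
    virtual_best_solver,
    running_time_selector_performance,
    running_time_closed_gap,
)

performance = pd.DataFrame(
    {"algo_a": [10.0, 200.0], "algo_b": [50.0, 20.0]},
    index=["inst_1", "inst_2"],
)

print(single_best_solver(performance))   # best column sum: algo_b, 70.0
print(virtual_best_solver(performance))  # per-instance best: 10.0 + 20.0 = 30.0

schedules = {
    "inst_1": [("algo_a", 15.0)],  # algo_a solves inst_1 in 10s, within its 15s slot
    "inst_2": [("algo_b", 25.0)],  # algo_b solves inst_2 in 20s, within its 25s slot
}
print(running_time_selector_performance(schedules, performance, budget=100.0))  # 30.0
print(running_time_closed_gap(schedules, performance, 100.0, None))  # (70-30)/(70-30) = 1.0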
asf/pre_selector/__init__.py
ADDED
@@ -0,0 +1,18 @@
from asf.pre_selector.marginal_contribution_based import (
    MarginalContributionBasedPreSelector,
)
from asf.pre_selector.optimize_pre_selection import OptimizePreSelection
from asf.pre_selector.sbs_pre_selection import SBSPreSelector
from asf.pre_selector.brute_force_pre_selection import BruteForcePreSelector
from asf.pre_selector.beam_search_pre_selection import BeamSearchPreSelector
from asf.pre_selector.knee_of_the_curve_pre_selector import KneeOfCurvePreSelector


__all__ = [
    "MarginalContributionBasedPreSelector",
    "OptimizePreSelection",
    "SBSPreSelector",
    "BruteForcePreSelector",
    "BeamSearchPreSelector",
    "KneeOfCurvePreSelector",
]
asf/pre_selector/abstract_pre_selector.py
ADDED
@@ -0,0 +1,32 @@
import pandas as pd
import numpy as np
from typing import Union, Optional


class AbstractPreSelector:
    """
    Abstract class for pre-selectors.
    """

    def __init__(self, n_algorithms: Optional[int] = None):
        """
        Initialize the pre-selector with the given configuration.

        Args:
            n_algorithms (Optional[int]): Number of algorithms the pre-selector should keep.
        """
        self.n_algorithms = n_algorithms

    def fit_transform(
        self, performance: Union[pd.DataFrame, np.ndarray]
    ) -> Union[pd.DataFrame, np.ndarray]:
        """
        Fit the pre-selector to the performance data and transform it.

        Args:
            performance (Union[pd.DataFrame, np.ndarray]): Performance data to fit and transform.

        Returns:
            Union[pd.DataFrame, np.ndarray]: Transformed performance data.
        """
        raise NotImplementedError(
            "fit_transform method must be implemented in subclasses."
        )