openstef 3.4.56__py3-none-any.whl → 4.0.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstef-4.0.0a3.dist-info/METADATA +177 -0
- openstef-4.0.0a3.dist-info/RECORD +4 -0
- {openstef-3.4.56.dist-info → openstef-4.0.0a3.dist-info}/WHEEL +1 -2
- openstef/__init__.py +0 -14
- openstef/__main__.py +0 -3
- openstef/app_settings.py +0 -19
- openstef/data/NL_terrestrial_radiation.csv +0 -25585
- openstef/data/NL_terrestrial_radiation.csv.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z +0 -0
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md +0 -18
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license +0 -3
- openstef/data/dutch_holidays.csv +0 -1759
- openstef/data/dutch_holidays.csv.license +0 -3
- openstef/data/pv_single_coefs.csv +0 -601
- openstef/data/pv_single_coefs.csv.license +0 -3
- openstef/data_classes/__init__.py +0 -3
- openstef/data_classes/data_prep.py +0 -99
- openstef/data_classes/model_specifications.py +0 -30
- openstef/data_classes/prediction_job.py +0 -135
- openstef/data_classes/split_function.py +0 -97
- openstef/enums.py +0 -140
- openstef/exceptions.py +0 -74
- openstef/feature_engineering/__init__.py +0 -3
- openstef/feature_engineering/apply_features.py +0 -138
- openstef/feature_engineering/bidding_zone_to_country_mapping.py +0 -106
- openstef/feature_engineering/cyclic_features.py +0 -161
- openstef/feature_engineering/data_preparation.py +0 -152
- openstef/feature_engineering/feature_adder.py +0 -206
- openstef/feature_engineering/feature_applicator.py +0 -202
- openstef/feature_engineering/general.py +0 -141
- openstef/feature_engineering/holiday_features.py +0 -231
- openstef/feature_engineering/lag_features.py +0 -165
- openstef/feature_engineering/missing_values_transformer.py +0 -141
- openstef/feature_engineering/rolling_features.py +0 -58
- openstef/feature_engineering/weather_features.py +0 -492
- openstef/metrics/__init__.py +0 -3
- openstef/metrics/figure.py +0 -303
- openstef/metrics/metrics.py +0 -486
- openstef/metrics/reporter.py +0 -222
- openstef/model/__init__.py +0 -3
- openstef/model/basecase.py +0 -82
- openstef/model/confidence_interval_applicator.py +0 -242
- openstef/model/fallback.py +0 -77
- openstef/model/metamodels/__init__.py +0 -3
- openstef/model/metamodels/feature_clipper.py +0 -90
- openstef/model/metamodels/grouped_regressor.py +0 -222
- openstef/model/metamodels/missing_values_handler.py +0 -138
- openstef/model/model_creator.py +0 -214
- openstef/model/objective.py +0 -426
- openstef/model/objective_creator.py +0 -65
- openstef/model/regressors/__init__.py +0 -3
- openstef/model/regressors/arima.py +0 -197
- openstef/model/regressors/custom_regressor.py +0 -64
- openstef/model/regressors/dazls.py +0 -116
- openstef/model/regressors/flatliner.py +0 -95
- openstef/model/regressors/gblinear_quantile.py +0 -334
- openstef/model/regressors/lgbm.py +0 -29
- openstef/model/regressors/linear.py +0 -90
- openstef/model/regressors/linear_quantile.py +0 -305
- openstef/model/regressors/regressor.py +0 -114
- openstef/model/regressors/xgb.py +0 -52
- openstef/model/regressors/xgb_multioutput_quantile.py +0 -261
- openstef/model/regressors/xgb_quantile.py +0 -228
- openstef/model/serializer.py +0 -431
- openstef/model/standard_deviation_generator.py +0 -81
- openstef/model_selection/__init__.py +0 -3
- openstef/model_selection/model_selection.py +0 -311
- openstef/monitoring/__init__.py +0 -3
- openstef/monitoring/performance_meter.py +0 -92
- openstef/monitoring/teams.py +0 -203
- openstef/pipeline/__init__.py +0 -3
- openstef/pipeline/create_basecase_forecast.py +0 -133
- openstef/pipeline/create_component_forecast.py +0 -168
- openstef/pipeline/create_forecast.py +0 -171
- openstef/pipeline/optimize_hyperparameters.py +0 -317
- openstef/pipeline/train_create_forecast_backtest.py +0 -163
- openstef/pipeline/train_model.py +0 -561
- openstef/pipeline/utils.py +0 -52
- openstef/postprocessing/__init__.py +0 -3
- openstef/postprocessing/postprocessing.py +0 -275
- openstef/preprocessing/__init__.py +0 -3
- openstef/preprocessing/preprocessing.py +0 -42
- openstef/settings.py +0 -15
- openstef/tasks/__init__.py +0 -3
- openstef/tasks/calculate_kpi.py +0 -324
- openstef/tasks/create_basecase_forecast.py +0 -118
- openstef/tasks/create_components_forecast.py +0 -162
- openstef/tasks/create_forecast.py +0 -145
- openstef/tasks/create_solar_forecast.py +0 -420
- openstef/tasks/create_wind_forecast.py +0 -80
- openstef/tasks/optimize_hyperparameters.py +0 -135
- openstef/tasks/split_forecast.py +0 -273
- openstef/tasks/train_model.py +0 -224
- openstef/tasks/utils/__init__.py +0 -3
- openstef/tasks/utils/dependencies.py +0 -107
- openstef/tasks/utils/predictionjobloop.py +0 -243
- openstef/tasks/utils/taskcontext.py +0 -160
- openstef/validation/__init__.py +0 -3
- openstef/validation/validation.py +0 -322
- openstef-3.4.56.dist-info/METADATA +0 -154
- openstef-3.4.56.dist-info/RECORD +0 -102
- openstef-3.4.56.dist-info/top_level.txt +0 -1
- /openstef-3.4.56.dist-info/LICENSE → /openstef-4.0.0a3.dist-info/licenses/LICENSE.md +0 -0
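Everything the 3.4.56 wheel shipped under `openstef/` is removed; the 4.0.0a3 pre-release wheel contains only packaging metadata (METADATA, RECORD, WHEEL, and the relocated LICENSE). A quick way to confirm this locally, assuming both wheels have been downloaded to the working directory under their standard file names:

```python
import zipfile

# Wheels are plain zip archives, so their member lists can be compared directly.
old = set(zipfile.ZipFile("openstef-3.4.56-py3-none-any.whl").namelist())
new = set(zipfile.ZipFile("openstef-4.0.0a3-py3-none-any.whl").namelist())

print(f"removed: {len(old - new)} files, added: {len(new - old)} files")
print("4.0.0a3 contents:", sorted(new))
```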
openstef/model/objective.py
DELETED
@@ -1,426 +0,0 @@
# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501
#
# SPDX-License-Identifier: MPL-2.0
import copy
from datetime import datetime, timezone
from typing import Any, Callable, Optional

import optuna
import pandas as pd

from openstef.enums import ModelType
from openstef.metrics import metrics
from openstef.metrics.reporter import Report, Reporter
from openstef.model.regressors.regressor import OpenstfRegressor
from openstef.model.standard_deviation_generator import StandardDeviationGenerator
from openstef.model_selection.model_selection import split_data_train_validation_test

EARLY_STOPPING_ROUNDS: int = 10
TEST_FRACTION: float = 0.15
VALIDATION_FRACTION: float = 0.15
# See https://xgboost.readthedocs.io/en/latest/parameter.html for all possibilities
EVAL_METRIC: str = "mae"

# https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args


class RegressorObjective:
    """Regressor optuna objective function.

    Use any of the derived classes for optimization using an optuna study.
    The constructor is used to set the "input_data", specify the splitting function
    and its arguments and optionally add some configuration.
    Next the instance will be called by the optuna study during optimization.

    Example usage:

    .. code-block:: py

        # initialize a (derived class) objective function
        objective = XGBRegressorObjective(input_data, test_fraction)
        # use the objective function
        study.optimize(objective)

    """

    def __init__(
        self,
        model: OpenstfRegressor,
        input_data: pd.DataFrame,
        split_func: Optional[Callable] = None,
        split_args: Optional[dict[str, Any]] = None,
        test_fraction=TEST_FRACTION,
        validation_fraction=VALIDATION_FRACTION,
        eval_metric=EVAL_METRIC,
        verbose=False,
    ):
        self.input_data = input_data
        self.train_data = None
        self.validation_data = None
        self.test_data = None
        self.model = model
        self.start_time = datetime.now(timezone.utc)
        self.test_fraction = test_fraction
        self.validation_fraction = validation_fraction
        self.eval_metric = eval_metric
        self.eval_metric_function = metrics.get_eval_metric_function(eval_metric)
        self.verbose = verbose
        # Should be set on derived classes
        self.model_type = None
        self.track_trials = {}

        # split function and arguments
        self.split_func = split_func
        self.split_args = split_args

        # default behavior for splitting
        if self.split_func is None:
            self.split_func = split_data_train_validation_test
            self.split_args = None

    def __call__(
        self,
        trial: optuna.trial.FrozenTrial,
    ) -> float:
        """Optuna objective function.

        Args: trial

        Returns:
            Mean absolute error for this trial.

        """
        # Perform data preprocessing
        split_args = self.split_args
        if split_args is None:
            split_args = {
                "stratification_min_max": True,
                "back_test": True,
            }
        (
            self.train_data,
            self.validation_data,
            self.test_data,
            self.operational_score_data,
        ) = self.split_func(
            self.input_data,
            test_fraction=self.test_fraction,
            validation_fraction=self.validation_fraction,
            **split_args,
        )

        # Test if first column is "load" and last column is "horizon"
        if (
            self.train_data.columns[0] != "load"
            or self.train_data.columns[-1] != "horizon"
        ):
            raise RuntimeError(
                "Column order in train input data not as expected, "
                "could not train a model!"
            )

        # Split in x, y data (x are the features, y is the load)
        train_x, train_y = self.train_data.iloc[:, 1:-1], self.train_data.iloc[:, 0]
        valid_x, valid_y = (
            self.validation_data.iloc[:, 1:-1],
            self.validation_data.iloc[:, 0],
        )
        test_x, test_y = self.test_data.iloc[:, 1:-1], self.test_data.iloc[:, 0]

        # Configure evals for early stopping
        eval_set = [(train_x, train_y), (valid_x, valid_y)]

        # get the parameters used in this trial
        hyper_params = self.get_params(trial)

        # insert parameters into model
        self.model.set_params(**hyper_params)

        # create the specific pruning callback
        pruning_callback = self.get_pruning_callback(trial)
        if pruning_callback is None:
            callbacks = None
        else:
            callbacks = [pruning_callback]

        # validation_0 and validation_1 are available
        self.model.fit(
            train_x,
            train_y,
            eval_set=eval_set,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose=self.verbose,
            eval_metric=self.eval_metric,
            callbacks=callbacks,
        )

        self.model.feature_importance_dataframe = self.model.set_feature_importance()

        # Do confidence interval determination
        self.model = StandardDeviationGenerator(
            self.validation_data
        ).generate_standard_deviation_data(self.model)

        forecast_y = self.model.predict(test_x)
        score = self.eval_metric_function(test_y, forecast_y)

        # Convert float32 to float because float32 is not JSON serializable
        self.track_trials[f" trial: {trial.number}"] = {
            "score": float(score),
            "params": hyper_params,
        }
        trial.set_user_attr(key="model", value=copy.deepcopy(self.model))
        return score

    def get_params(self, trial: optuna.trial.FrozenTrial) -> dict:
        """Get parameters for objective without model specific get_params function.

        Args: trial

        Returns:
            Dictionary with hyperparameter name as key and hyperparameter value as value.

        """
        default_params = {
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5),
            "alpha": trial.suggest_float("alpha", 0, 1.0),
            "lambda": trial.suggest_float("lambda", 1e-8, 1.0),
            "subsample": trial.suggest_float("subsample", 0.4, 1.0),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 16),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "max_delta_step": trial.suggest_int("max_delta_step", 0, 10),
        }

        # Compare the list to the default parameter space
        model_parameters = self.model.get_params()
        keys = [x for x in model_parameters.keys() if x in default_params.keys()]
        # create a dictionary with the matching parameters
        params = {parameter: default_params[parameter] for parameter in keys}

        return params

    def get_pruning_callback(self, trial: optuna.trial.FrozenTrial):
        return None

    def get_trial_track(self) -> dict:
        """Get a dictionary of all trials.

        Returns:
            Dict with all trials and their parameters

        """
        return self.track_trials

    def create_report(self, model: OpenstfRegressor) -> Report:
        """Generate a report from the data available inside the objective function.

        Args:
            model: OpenstfRegressor, model to create a report on

        Returns:
            Report about the model

        """
        # Report about the training process
        reporter = Reporter(self.train_data, self.validation_data, self.test_data)
        report = reporter.generate_report(model)

        return report

    @classmethod
    def get_default_values(cls) -> dict:
        return {
            "learning_rate": 0.3,
            "alpha": 0.0,
            "lambda": 1.0,
            "subsample": 1.0,
            "min_child_weight": 1,
            "max_depth": 6,
            "colsample_bytree": 1,
            "max_delta_step": 0,
        }


class XGBRegressorObjective(RegressorObjective):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model_type = ModelType.XGB

    # extend the parameters with the model specific ones per implementation
    def get_params(self, trial: optuna.trial.FrozenTrial) -> dict:
        """Get parameters for XGB Regressor Objective with objective specific parameters.

        Args: trial

        Returns:
            Dictionary with hyperparameter name as key and hyperparameter value as value.

        """
        # Filtered default parameters
        model_params = super().get_params(trial)

        # XGB specific parameters
        params = {
            "gamma": trial.suggest_float("gamma", 0.0, 1.0),
            "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        }
        return {**model_params, **params}

    def get_pruning_callback(self, trial: optuna.trial.FrozenTrial):
        return optuna.integration.XGBoostPruningCallback(
            trial, observation_key=f"validation_1-{self.eval_metric}"
        )

    @classmethod
    def get_default_values(cls) -> dict:
        default_parameter_values = super().get_default_values()
        default_parameter_values.update({"gamma": 0.0, "booster": "gbtree"})
        return default_parameter_values


class LGBRegressorObjective(RegressorObjective):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model_type = ModelType.LGB

    def get_params(self, trial: optuna.trial.FrozenTrial) -> dict:
        """Get parameters for LGB Regressor Objective with objective specific parameters.

        Args: trial

        Returns:
            Dictionary with hyperparameter name as key and hyperparameter value as value.

        """
        # Filtered default parameters
        model_params = super().get_params(trial)

        # LGB specific parameters
        params = {
            "num_leaves": trial.suggest_int("num_leaves", 16, 62),
            "boosting_type": trial.suggest_categorical(
                "boosting_type", ["gbdt", "dart", "rf"]
            ),
            "tree_learner": trial.suggest_categorical(
                "tree_learner", ["serial", "feature", "data", "voting"]
            ),
            "n_estimators": trial.suggest_int("n_estimators", 50, 150),
            "min_split_gain": trial.suggest_float("min_split_gain", 1e-8, 1),
            "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        }
        return {**model_params, **params}

    def get_pruning_callback(self, trial: optuna.trial.FrozenTrial):
        metric = self.eval_metric
        if metric == "mae":
            metric = "l1"
        return optuna.integration.LightGBMPruningCallback(
            trial, metric=metric, valid_name="valid_1"
        )


class XGBQuantileRegressorObjective(RegressorObjective):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model_type = ModelType.XGB_QUANTILE

    def get_params(self, trial: optuna.trial.FrozenTrial) -> dict:
        """Get parameters for XGBQuantile Regressor Objective with objective specific parameters.

        Args: trial

        Returns:
            Dictionary with hyperparameter name as key and hyperparameter value as value.

        """
        # Filtered default parameters
        model_params = super().get_params(trial)

        # XGB specific parameters
        params = {
            "gamma": trial.suggest_float("gamma", 1e-8, 1.0),
        }
        return {**model_params, **params}

    def get_pruning_callback(self, trial: optuna.trial.FrozenTrial):
        return optuna.integration.XGBoostPruningCallback(
            trial, observation_key=f"validation_1-{self.eval_metric}"
        )


class XGBMultioutputQuantileRegressorObjective(RegressorObjective):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model_type = ModelType.XGB_QUANTILE

    def get_params(self, trial: optuna.trial.FrozenTrial) -> dict:
        """Get parameters for XGB Multioutput Quantile Regressor Objective with objective specific parameters.

        Args: trial

        Returns:
            Dictionary with hyperparameter name as key and hyperparameter value as value.

        """
        # Filtered default parameters
        model_params = super().get_params(trial)

        # XGB specific parameters
        params = {
            "gamma": trial.suggest_float("gamma", 1e-8, 1.0),
            "arctan_smoothing": trial.suggest_float("arctan_smoothing", 0.025, 0.15),
        }
        return {**model_params, **params}

    def get_pruning_callback(self, trial: optuna.trial.FrozenTrial):
        return optuna.integration.XGBoostPruningCallback(
            trial, observation_key=f"validation_1-{self.eval_metric}"
        )


class LinearRegressorObjective(RegressorObjective):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model_type = ModelType.LINEAR

    def get_params(self, trial: optuna.trial.FrozenTrial) -> dict:
        """Get parameters for Linear Regressor Objective with objective specific parameters.

        Args: trial

        Returns:
            Dictionary with hyperparameter name as key and hyperparameter value as value.

        """
        # Imputation strategy
        params = {
            "imputation_strategy": trial.suggest_categorical(
                "imputation_strategy", ["mean", "median", "most_frequent"]
            ),
        }
        return params


class ARIMARegressorObjective(RegressorObjective):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model_type = ModelType.ARIMA

    def get_params(self, trial: optuna.trial.FrozenTrial) -> dict:
        """Get parameters for ARIMA Regressor Objective with objective specific parameters.

        Temporary, it seems strange to use optuna for ARIMA models,
        it is usually done via statistical analysis and heuristics.

        Args: trial

        Returns:
            Dictionary with hyperparameter name as key and hyperparameter value as value.

        """
        # Trend component
        params = {
            "trend": trial.suggest_categorical("trend", ["n", "c", "t", "ct"]),
        }
        return params
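For context on how these deleted objectives were consumed, the class docstring above only sketches the call; below is a slightly fuller, hedged sketch against the 3.4.56 API. The `model` and `input_data` values are placeholders: `model` stands for a pre-built `OpenstfRegressor` (e.g. the XGBoost wrapper) and `input_data` for an engineered feature frame whose first column is `load` and last column is `horizon`, as the `__call__` check above requires.

```python
import optuna

from openstef.model.objective import XGBRegressorObjective

model = ...       # placeholder: a pre-built OpenstfRegressor (e.g. the XGB wrapper)
input_data = ...  # placeholder: feature frame, first column "load", last column "horizon"

objective = XGBRegressorObjective(
    model,
    input_data,
    test_fraction=0.15,
    validation_fraction=0.15,
    eval_metric="mae",
)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=8)

# Each trial stored a deep copy of its trained model via set_user_attr above.
best_model = study.best_trial.user_attrs["model"]
report = objective.create_report(best_model)
print(objective.get_trial_track())
```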
openstef/model/objective_creator.py
DELETED
@@ -1,65 +0,0 @@
# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501
#
# SPDX-License-Identifier: MPL-2.0

from typing import Union

from openstef.enums import ModelType
from openstef.model.objective import (
    ARIMARegressorObjective,
    LGBRegressorObjective,
    LinearRegressorObjective,
    RegressorObjective,
    XGBQuantileRegressorObjective,
    XGBRegressorObjective,
    XGBMultioutputQuantileRegressorObjective,
)
from openstef.model.regressors.custom_regressor import (
    create_custom_objective,
    is_custom_type,
)


class ObjectiveCreator:
    OBJECTIVES = {
        ModelType.XGB: XGBRegressorObjective,
        ModelType.LGB: LGBRegressorObjective,
        ModelType.XGB_QUANTILE: XGBQuantileRegressorObjective,
        ModelType.XGB_MULTIOUTPUT_QUANTILE: XGBMultioutputQuantileRegressorObjective,
        ModelType.LINEAR: LinearRegressorObjective,
        ModelType.LINEAR_QUANTILE: LinearRegressorObjective,
        ModelType.GBLINEAR_QUANTILE: LinearRegressorObjective,
        ModelType.ARIMA: ARIMARegressorObjective,
    }

    @staticmethod
    def create_objective(model_type: Union[ModelType, str]) -> RegressorObjective:
        """Create an objective function based on model type.

        Args:
            model_type: Model type to construct.

        Raises:
            NotImplementedError: When using an invalid model_type.

        Returns:
            Objective function

        """
        try:
            # This will raise a ValueError when an invalid model_type str is used
            # and nothing when a ModelType enum is used.
            if is_custom_type(model_type):
                objective = create_custom_objective(model_type)
            else:
                model_type = ModelType(model_type)
                objective = ObjectiveCreator.OBJECTIVES[model_type]
        except ValueError as e:
            valid_types = [t.value for t in ModelType]
            raise NotImplementedError(
                f"No objective for '{model_type}', "
                f"valid model_types are: {valid_types} "
                "or import a custom model"
            ) from e

        return objective
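The factory above mapped both `ModelType` members and their string values onto the same objective classes, and wrapped unknown names in a `NotImplementedError`. A minimal sketch of that lookup behaviour against the 3.4.56 release (assuming, as in the 3.x enum, that `ModelType.XGB.value == "xgb"`):

```python
from openstef.enums import ModelType
from openstef.model.objective_creator import ObjectiveCreator

# An enum member and its string value resolve to the same objective class.
assert ObjectiveCreator.create_objective(ModelType.XGB) is ObjectiveCreator.create_objective("xgb")

try:
    ObjectiveCreator.create_objective("not-a-model")
except NotImplementedError as err:
    print(err)  # lists the valid model_type values
```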
openstef/model/regressors/arima.py
DELETED
@@ -1,197 +0,0 @@
# SPDX-FileCopyrightText: 2017-2023 Alliander N.V. <korte.termijn.prognoses@alliander.com> # noqa E501
#
# SPDX-License-Identifier: MPL-2.0
"""This module contains the SARIMAX regressor wrapper around the statsmodels implementation."""
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.model_selection import TimeSeriesSplit

from openstef.model.regressors.regressor import OpenstfRegressor


class ARIMAOpenstfRegressor(OpenstfRegressor):
    """Wrapper around the statsmodels implementation of the (S)ARIMA(X) model.

    Fitting a statsmodels ARIMA produces a results object which is used to perform the various computations around forecasting.
    (see https://www.statsmodels.org/dev/generated/statsmodels.tsa.arima.model.ARIMAResults.html)

    To make a prediction, it needs to update the results object's historic data,
    i.e. the past values of the target/endogenous data and the features/exogenous data,
    applying the fitted parameters to these new data unrelated to the original training data.
    This update can be performed by the method `update_historic_data`.

    In the following code, we use the statsmodels and scikit-learn terminology for the variables interchangeably:
    - the features 'x' are equivalent to the exogenous data: 'exog' for short.
    - the target 'y' is equivalent to the endogenous data: 'endog' for short.
    More information here https://www.statsmodels.org/stable/endog_exog.html.

    """

    def __init__(
        self,
        backtest_max_horizon=1440,
        order=(0, 0, 0),
        seasonal_order=(0, 0, 0, 0),
        trend=None,
    ):
        self.backtest_max_horizon = backtest_max_horizon
        self.order = order
        self.seasonal_order = seasonal_order
        self.trend = trend

    def fit(self, x, y, **kwargs):
        dates = x.index
        self.model_ = sm.tsa.arima.ARIMA(
            endog=y,
            exog=x,
            dates=dates,
            order=self.order,
            seasonal_order=self.seasonal_order,
            trend=self.trend,
        )
        self.results_ = self.model_.fit()
        self.feature_in_names_ = list(x.columns)
        return self

    def update_historic_data(self, x_past, y_past):
        """Apply the fitted parameters to new data unrelated to the original training data. It's a side-effect.

        Creates a new results object using the current fitted parameters,
        applied to a completely new dataset that is assumed to be unrelated to the model's original data.
        The new results can then be used for analysis or forecasting.
        It should be used before forecasting, to wedge the historic data just before the first forecast timestamp,
        with:
        - New observations from the modeled time-series process.
        - New observations of exogenous regressors.

        Parameters
        ----------
        x_past : pd.DataFrame
            The exogenous (features) data.
        y_past : pd.DataFrame
            The endogenous (target) data.

        """
        self.results_ = self.results_.apply(
            endog=y_past,
            exog=x_past,
        )

    def predict_quantile(self, start, end, exog, quantile):
        """Quantile prediction.

        It relies on the parameters' confidence intervals.

        Parameters
        ----------
        start : int, str, or datetime, optional
            Zero-indexed observation number at which to start forecasting, i.e.,
            the first forecast is start. Can also be a date string to parse or a datetime type.
            Default is the zeroth observation.
        end : int, str, or datetime, optional
            Zero-indexed observation number at which to end forecasting, i.e.,
            the last forecast is end. Can also be a date string to parse or a datetime type.
            However, if the dates index does not have a fixed frequency,
            end must be an integer index if you want out-of-sample prediction.
            Default is the last observation in the sample.
        exog : pd.DataFrame
            Exogenous data (features).
        quantile : float
            The quantile for the confidence interval.

        Returns
        -------
        pd.Series
            The quantile prediction.

        """
        alpha = quantile
        idx = 0
        if quantile > 0.5:
            alpha = 1 - quantile
            idx = 1
        return (
            self.results_.get_prediction(start, end, exog=exog)
            .conf_int(alpha=alpha)
            .iloc[:, idx]
        )

    def predict(self, x, quantile: float = 0.5, **kwargs):
        start = x.iloc[0].name
        end = x.iloc[-1].name
        predictions = self.results_.predict(start, end, exog=x).to_numpy()
        if quantile != 0.5:
            predictions = self.predict_quantile(start, end, exog=x, quantile=quantile)
        return predictions

    def set_feature_importance(self):
        """Because the report needs 'weight' and 'gain' as importance metrics, we set the values to these names.

        - 'weight' corresponds to the coefficient values
        - 'gain' corresponds to the p-value of the nullity test for each coefficient

        """
        importances = pd.DataFrame(
            {"weight": self.results_.params, "gain": self.results_.pvalues}
        )
        return importances

    @property
    def feature_names(self):
        """The names of the features used to train the model."""
        return self.feature_in_names_

    @property
    def can_predict_quantiles(self):
        """Indicates whether this model can make quantile predictions."""
        return True

    def score(self, x, y):
        """Compute the R2 score with a backtesting strategy.

        The backtest is performed by the time series cross-validator of scikit-learn, which
        returns the first k folds as the train set and the (k+1)th fold as the test set in the kth split.
        (see https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html)

        It needs to update the historic data with (x_past, y_past) for each split.

        """
        ys_true = []
        ys_pred = []

        # Build the cross-validator
        freq = pd.infer_freq(x.index)
        if not freq[0].isdigit():
            freq = f"1{freq}"
        max_horizon_delta = pd.Timedelta(self.backtest_max_horizon, "minutes")
        freq_delta = pd.Timedelta(freq)
        test_size = max_horizon_delta // freq_delta
        n_splits = (x.shape[0] // test_size) - 1
        time_series_cross_validator = TimeSeriesSplit(
            n_splits=n_splits, test_size=test_size
        )

        # Backtesting
        for apply_index, test_index in time_series_cross_validator.split(x):
            # Update the historic data to the current split (i.e. the k first folds)
            updated_results = self.results_.apply(
                y.iloc[apply_index], x.iloc[apply_index]
            )

            # The (k+1)th fold as the test data
            x_test, y_true_test = x.iloc[test_index], y.iloc[test_index]
            start_test = x_test.iloc[0].name
            end_test = x_test.iloc[-1].name

            # Compute and gather the predictions
            y_pred_test = updated_results.predict(
                start=start_test, end=end_test, exog=x_test
            )
            ys_true.append(y_true_test)
            ys_pred.append(y_pred_test)

        ys_true = np.concatenate(ys_true)
        ys_pred = np.concatenate(ys_pred)
        return r2_score(ys_true, ys_pred)
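To round off, a small synthetic example of the fit/predict cycle this wrapper offered in 3.4.56 (a sketch; the hourly index, column names, and ARIMA order are arbitrary choices, and statsmodels must be installed):

```python
import numpy as np
import pandas as pd

from openstef.model.regressors.arima import ARIMAOpenstfRegressor

# Hourly synthetic data: one exogenous feature and a noisy target.
index = pd.date_range("2024-01-01", periods=200, freq="h")
x = pd.DataFrame({"temperature": np.sin(np.arange(200) / 24)}, index=index)
y = pd.Series(
    0.5 * x["temperature"] + np.random.default_rng(0).normal(0, 0.1, 200),
    index=index,
    name="load",
)

# Train on the first ~7 days, keep the last 24 hours as the forecast window.
train_x, future_x = x.iloc[:176], x.iloc[176:]
train_y = y.iloc[:176]

model = ARIMAOpenstfRegressor(order=(1, 0, 0), backtest_max_horizon=1440)
model.fit(train_x, train_y)

median = model.predict(future_x)              # point forecast (0.5 quantile)
p90 = model.predict(future_x, quantile=0.90)  # upper confidence bound
print("backtested R2 on the training window:", model.score(train_x, train_y))
```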