openstef 3.4.56__py3-none-any.whl → 4.0.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstef-4.0.0a3.dist-info/METADATA +177 -0
- openstef-4.0.0a3.dist-info/RECORD +4 -0
- {openstef-3.4.56.dist-info → openstef-4.0.0a3.dist-info}/WHEEL +1 -2
- openstef/__init__.py +0 -14
- openstef/__main__.py +0 -3
- openstef/app_settings.py +0 -19
- openstef/data/NL_terrestrial_radiation.csv +0 -25585
- openstef/data/NL_terrestrial_radiation.csv.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z +0 -0
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md +0 -18
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license +0 -3
- openstef/data/dutch_holidays.csv +0 -1759
- openstef/data/dutch_holidays.csv.license +0 -3
- openstef/data/pv_single_coefs.csv +0 -601
- openstef/data/pv_single_coefs.csv.license +0 -3
- openstef/data_classes/__init__.py +0 -3
- openstef/data_classes/data_prep.py +0 -99
- openstef/data_classes/model_specifications.py +0 -30
- openstef/data_classes/prediction_job.py +0 -135
- openstef/data_classes/split_function.py +0 -97
- openstef/enums.py +0 -140
- openstef/exceptions.py +0 -74
- openstef/feature_engineering/__init__.py +0 -3
- openstef/feature_engineering/apply_features.py +0 -138
- openstef/feature_engineering/bidding_zone_to_country_mapping.py +0 -106
- openstef/feature_engineering/cyclic_features.py +0 -161
- openstef/feature_engineering/data_preparation.py +0 -152
- openstef/feature_engineering/feature_adder.py +0 -206
- openstef/feature_engineering/feature_applicator.py +0 -202
- openstef/feature_engineering/general.py +0 -141
- openstef/feature_engineering/holiday_features.py +0 -231
- openstef/feature_engineering/lag_features.py +0 -165
- openstef/feature_engineering/missing_values_transformer.py +0 -141
- openstef/feature_engineering/rolling_features.py +0 -58
- openstef/feature_engineering/weather_features.py +0 -492
- openstef/metrics/__init__.py +0 -3
- openstef/metrics/figure.py +0 -303
- openstef/metrics/metrics.py +0 -486
- openstef/metrics/reporter.py +0 -222
- openstef/model/__init__.py +0 -3
- openstef/model/basecase.py +0 -82
- openstef/model/confidence_interval_applicator.py +0 -242
- openstef/model/fallback.py +0 -77
- openstef/model/metamodels/__init__.py +0 -3
- openstef/model/metamodels/feature_clipper.py +0 -90
- openstef/model/metamodels/grouped_regressor.py +0 -222
- openstef/model/metamodels/missing_values_handler.py +0 -138
- openstef/model/model_creator.py +0 -214
- openstef/model/objective.py +0 -426
- openstef/model/objective_creator.py +0 -65
- openstef/model/regressors/__init__.py +0 -3
- openstef/model/regressors/arima.py +0 -197
- openstef/model/regressors/custom_regressor.py +0 -64
- openstef/model/regressors/dazls.py +0 -116
- openstef/model/regressors/flatliner.py +0 -95
- openstef/model/regressors/gblinear_quantile.py +0 -334
- openstef/model/regressors/lgbm.py +0 -29
- openstef/model/regressors/linear.py +0 -90
- openstef/model/regressors/linear_quantile.py +0 -305
- openstef/model/regressors/regressor.py +0 -114
- openstef/model/regressors/xgb.py +0 -52
- openstef/model/regressors/xgb_multioutput_quantile.py +0 -261
- openstef/model/regressors/xgb_quantile.py +0 -228
- openstef/model/serializer.py +0 -431
- openstef/model/standard_deviation_generator.py +0 -81
- openstef/model_selection/__init__.py +0 -3
- openstef/model_selection/model_selection.py +0 -311
- openstef/monitoring/__init__.py +0 -3
- openstef/monitoring/performance_meter.py +0 -92
- openstef/monitoring/teams.py +0 -203
- openstef/pipeline/__init__.py +0 -3
- openstef/pipeline/create_basecase_forecast.py +0 -133
- openstef/pipeline/create_component_forecast.py +0 -168
- openstef/pipeline/create_forecast.py +0 -171
- openstef/pipeline/optimize_hyperparameters.py +0 -317
- openstef/pipeline/train_create_forecast_backtest.py +0 -163
- openstef/pipeline/train_model.py +0 -561
- openstef/pipeline/utils.py +0 -52
- openstef/postprocessing/__init__.py +0 -3
- openstef/postprocessing/postprocessing.py +0 -275
- openstef/preprocessing/__init__.py +0 -3
- openstef/preprocessing/preprocessing.py +0 -42
- openstef/settings.py +0 -15
- openstef/tasks/__init__.py +0 -3
- openstef/tasks/calculate_kpi.py +0 -324
- openstef/tasks/create_basecase_forecast.py +0 -118
- openstef/tasks/create_components_forecast.py +0 -162
- openstef/tasks/create_forecast.py +0 -145
- openstef/tasks/create_solar_forecast.py +0 -420
- openstef/tasks/create_wind_forecast.py +0 -80
- openstef/tasks/optimize_hyperparameters.py +0 -135
- openstef/tasks/split_forecast.py +0 -273
- openstef/tasks/train_model.py +0 -224
- openstef/tasks/utils/__init__.py +0 -3
- openstef/tasks/utils/dependencies.py +0 -107
- openstef/tasks/utils/predictionjobloop.py +0 -243
- openstef/tasks/utils/taskcontext.py +0 -160
- openstef/validation/__init__.py +0 -3
- openstef/validation/validation.py +0 -322
- openstef-3.4.56.dist-info/METADATA +0 -154
- openstef-3.4.56.dist-info/RECORD +0 -102
- openstef-3.4.56.dist-info/top_level.txt +0 -1
- /openstef-3.4.56.dist-info/LICENSE → /openstef-4.0.0a3.dist-info/licenses/LICENSE.md +0 -0
|
@@ -1,317 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
|
|
2
|
-
#
|
|
3
|
-
# SPDX-License-Identifier: MPL-2.0
|
|
4
|
-
import logging
|
|
5
|
-
import os
|
|
6
|
-
from typing import Any
|
|
7
|
-
|
|
8
|
-
import optuna
|
|
9
|
-
import pandas as pd
|
|
10
|
-
import structlog
|
|
11
|
-
|
|
12
|
-
from openstef.data_classes.model_specifications import ModelSpecificationDataClass
|
|
13
|
-
from openstef.data_classes.prediction_job import PredictionJobDataClass
|
|
14
|
-
from openstef.exceptions import (
|
|
15
|
-
InputDataInsufficientError,
|
|
16
|
-
InputDataWrongColumnOrderError,
|
|
17
|
-
)
|
|
18
|
-
from openstef.feature_engineering.feature_applicator import TrainFeatureApplicator
|
|
19
|
-
from openstef.metrics.reporter import Report, Reporter
|
|
20
|
-
from openstef.model.model_creator import ModelCreator
|
|
21
|
-
from openstef.model.objective import RegressorObjective
|
|
22
|
-
from openstef.model.objective_creator import ObjectiveCreator
|
|
23
|
-
from openstef.model.regressors.regressor import OpenstfRegressor
|
|
24
|
-
from openstef.model.serializer import MLflowSerializer
|
|
25
|
-
from openstef.model_selection.model_selection import split_data_train_validation_test
|
|
26
|
-
from openstef.pipeline.train_model import (
|
|
27
|
-
DEFAULT_TRAIN_HORIZONS_HOURS,
|
|
28
|
-
train_model_pipeline_core,
|
|
29
|
-
)
|
|
30
|
-
from openstef.settings import Settings
|
|
31
|
-
from openstef.validation import validation
|
|
32
|
-
|
|
33
|
-
# Module-level logging configuration. These calls run once, at import time.
# Optuna's records are handed to the standard logging tree rather than being
# printed by optuna's own handler.
optuna.logging.enable_propagation()  # Propagate logs to the root logger.
optuna.logging.disable_default_handler()  # Stop showing logs in sys.stderr.

# Configure structlog with a filtering bound logger whose level is derived from
# Settings.log_level via logging.getLevelName.
structlog.configure(
    wrapper_class=structlog.make_filtering_bound_logger(
        logging.getLevelName(Settings.log_level)
    )
)
logger = structlog.get_logger(__name__)

# Defaults for the optuna study; both are passed to Study.optimize below.
# See https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study.optimize
N_TRIALS: int = 100  # The number of trials.
TIMEOUT: int = 600  # Stop study after the given number of second(s).
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def optimize_hyperparameters_pipeline(
    pj: PredictionJobDataClass,
    input_data: pd.DataFrame,
    mlflow_tracking_uri: str,
    artifact_folder: str,
    n_trials: int = N_TRIALS,
) -> dict:
    """Search for the best hyperparameters and persist the outcome.

    The heavy lifting is delegated to ``optimize_hyperparameters_pipeline_core``;
    this wrapper derives the training horizons from the prediction job, stores
    the resulting model through an ``MLflowSerializer`` and, when an artifact
    folder is given, writes the report to disk.

    The prediction job is read through ``pj["id"]``, ``pj["model"]`` and
    ``pj.train_horizons_minutes`` here (the core pipeline reads more fields).

    Args:
        pj: Prediction job.
        input_data: Raw training input data.
        mlflow_tracking_uri: Path/Uri handed to the MLflow serializer.
        artifact_folder: Folder under which the report is written, in a
            sub-folder named after the prediction job id. A falsy value skips
            writing the report.
        n_trials: The number of trials. Defaults to N_TRIALS.

    Raises:
        ValueError: Propagated from the core pipeline.
        InputDataInsufficientError: Propagated from the core pipeline.
        InputDataWrongColumnOrderError: Propagated from the core pipeline.

    Returns:
        Optimized hyperparameters.

    """
    # The prediction job stores horizons in minutes, the core pipeline takes hours.
    horizons_minutes = pj.train_horizons_minutes
    horizons = (
        DEFAULT_TRAIN_HORIZONS_HOURS
        if horizons_minutes is None
        else [minutes / 60 for minutes in horizons_minutes]
    )

    core_outcome = optimize_hyperparameters_pipeline_core(
        pj, input_data, horizons, n_trials
    )
    (
        best_model,
        model_specs,
        report,
        trials,
        best_trial_number,
        best_params,
    ) = core_outcome

    # Persist the model together with the optimisation results and the report.
    serializer = MLflowSerializer(mlflow_tracking_uri=mlflow_tracking_uri)
    job_id = str(pj["id"])
    serializer.save_model(
        model=best_model,
        experiment_name=job_id,
        model_type=pj["model"],
        model_specs=model_specs,
        report=report,
        phase="Hyperparameter_opt",
        trials=trials,
        trial_number=best_trial_number,
    )

    if artifact_folder:
        Reporter.write_report_to_disk(
            report=report,
            report_folder=os.path.join(artifact_folder, job_id),
        )

    return best_params
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
def optimize_hyperparameters_pipeline_core(
    pj: PredictionJobDataClass,
    input_data: pd.DataFrame,
    horizons: list[float] = DEFAULT_TRAIN_HORIZONS_HOURS,
    n_trials: int = N_TRIALS,
) -> tuple[
    OpenstfRegressor, ModelSpecificationDataClass, Report, dict, int, dict[str, Any]
]:
    """Validate the data, run the optuna study and train the final model.

    Steps, in order: sanity-check the input frame, validate and clean it, add
    features, optimise hyperparameters with optuna, and finally train a model
    with the regular training pipeline using the best hyperparameters found.

    Args:
        pj: Prediction job.
        input_data: Raw training input data.
        horizons: Horizons for feature engineering in hours. A string is
            treated as the name of a column in ``input_data``.
        n_trials: The number of trials. Defaults to N_TRIALS.

    Raises:
        ValueError: If ``horizons`` is a string naming a column that is not
            present in ``input_data``.
        InputDataInsufficientError: If the input dataframe is empty, or if too
            little data is left after validation and cleaning.
        InputDataWrongColumnOrderError: If the load column is missing in the
            input dataframe.

    Returns:
        - Best model,
        - Model specifications of the best model,
        - Report of the best training round,
        - Trials,
        - Best trial number,
        - Optimized hyperparameters.

    """
    # Reject unusable input before doing any work.
    if input_data.empty:
        raise InputDataInsufficientError("Input dataframe is empty")
    if "load" not in input_data.columns:
        raise InputDataWrongColumnOrderError(
            "Missing the load column in the input dataframe"
        )

    # Validate first, then drop the rows without a target value.
    checked_data = validation.validate(
        pj["id"],
        input_data,
        pj["flatliner_threshold_minutes"],
        pj["resolution_minutes"],
    )
    validated_data = validation.drop_target_na(checked_data)

    enough_data_left = validation.is_data_sufficient(
        validated_data, pj["completeness_threshold"], pj["minimal_table_length"]
    )
    if not enough_data_left:
        raise InputDataInsufficientError(
            f"Input data is insufficient for {pj['name']} after validation and cleaning"
        )

    # Feature selection comes from the default model specs when they are set.
    default_specs = pj.default_modelspecs
    if default_specs:
        feature_names = default_specs.feature_names
        feature_modules = default_specs.feature_modules
    else:
        feature_names, feature_modules = None, []

    if isinstance(horizons, str):
        if horizons not in set(input_data.columns):
            raise ValueError(
                f"The horizon parameter specifies a column name ({horizons}) missing in"
                " the input data."
            )
        # Sort data to avoid the same date being repeated multiple times.
        # NOTE(review): validated_data was derived above, so this ordering only
        # reaches the final train_model_pipeline_core call - confirm intended.
        input_data = input_data.sort_values(horizons)

    feature_applicator = TrainFeatureApplicator(
        horizons=horizons, feature_names=feature_names, feature_modules=feature_modules
    )
    validated_data_with_features = feature_applicator.add_features(
        validated_data, pj=pj
    )

    # create_objective returns a callable class; optuna_optimization instantiates it.
    objective_class = ObjectiveCreator.create_objective(model_type=pj["model"])
    study, objective = optuna_optimization(
        pj, objective_class, validated_data_with_features, n_trials
    )

    # The best model could also be accessed via study.user_attrs["best_model"].
    best_hyperparams = study.best_params
    logger.info(
        f"Finished hyperparameter optimization, error objective {study.best_value} "
        f"and params {best_hyperparams}"
    )

    # Store the quantiles with the hyperparameters so they end up in the model info.
    if pj["quantiles"]:
        best_hyperparams["quantiles"] = pj["quantiles"]

    optimized_specs = ModelSpecificationDataClass(
        id=pj["id"],
        feature_names=list(validated_data_with_features.columns),
        hyper_params=best_hyperparams,
    )

    # Train the operational model with the regular train pipeline: the
    # train/validation/test split used during optimisation is less suitable.
    model, report, model_specs, _ = train_model_pipeline_core(
        pj=pj, input_data=input_data, model_specs=optimized_specs
    )

    return (
        model,
        model_specs,
        report,
        objective.get_trial_track(),
        study.best_trial.number,
        study.best_params,
    )
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
def optuna_optimization(
    pj: PredictionJobDataClass,
    objective: type[RegressorObjective],
    validated_data_with_features: pd.DataFrame,
    n_trials: int,
) -> tuple[optuna.study.Study, RegressorObjective]:
    """Perform hyperparameter optimization with optuna.

    Args:
        pj: Prediction job.
        objective: Objective class for optuna. It is instantiated here with the
            model, the data and the split configuration.
        validated_data_with_features: Cleaned input dataframe with features.
        n_trials: Number of optuna trials.

    Returns:
        - Optimization study from optuna
        - The objective instance used by optuna

    """
    model = ModelCreator.create_model(pj["model"], **(pj.model_kwargs or {}))

    # Apply the default hyperparameters from the prediction job, restricted to
    # the parameters this model actually exposes.
    if pj.default_modelspecs:
        # Fetch the supported parameters once instead of once per key.
        supported_params = model.get_params()
        valid_hyper_parameters = {
            key: value
            for key, value in pj.default_modelspecs.hyper_params.items()
            if key in supported_params
        }
        model.set_params(**valid_hyper_parameters)

    study = optuna.create_study(
        study_name=pj["model"],
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
        direction="minimize",
    )

    # Start with evaluating the default set of parameters,
    # this way the optimization never gets worse than the default values.
    study.enqueue_trial(objective.get_default_values())

    if pj.train_split_func is None:
        split_func = split_data_train_validation_test
        split_args = {
            "stratification_min_max": True,
            "back_test": True,
        }
    else:
        split_func, split_args = pj.train_split_func.load(
            required_arguments=["data", "test_fraction", "validation_fraction"]
        )

    # Keep the instance under its own name rather than rebinding the parameter,
    # which holds the objective class.
    objective_instance = objective(
        model,
        validated_data_with_features,
        split_func=split_func,
        split_args=split_args,
    )

    # Optuna drives the trials; the study stops after n_trials or TIMEOUT
    # seconds, whichever comes first.
    study.optimize(
        objective_instance,
        n_trials=n_trials,
        callbacks=[_log_study_progress_and_save_best_model],
        show_progress_bar=False,
        timeout=TIMEOUT,
    )

    return study, objective_instance
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
def _log_study_progress_and_save_best_model(
    study: optuna.study.Study, trial: optuna.trial.FrozenTrial
) -> None:
    """Log the finished trial and remember the model of the best trial.

    Used as a callback for ``Study.optimize``.

    Args:
        study: The study the trial belongs to.
        trial: The trial that just finished.

    """
    # Collect trial data
    value = trial.value
    params = trial.params
    duration = (trial.datetime_complete - trial.datetime_start).total_seconds()

    # Log information about this trial. The first fragment ends with a space so
    # the two sentences do not run together in the emitted message.
    logger.debug(
        f"Trial {trial.number} finished with value: {value} and parameters: {params}. "
        f"Best trial is {study.best_trial.number}. Iteration took {duration} s"
    )

    # If this trial is the best so far, save its model as a study attribute.
    if study.best_trial.number == trial.number:
        study.set_user_attr(key="best_model", value=trial.user_attrs["model"])
|
|
@@ -1,163 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
|
|
2
|
-
#
|
|
3
|
-
# SPDX-License-Identifier: MPL-2.0
|
|
4
|
-
|
|
5
|
-
import pandas as pd
|
|
6
|
-
|
|
7
|
-
from openstef.data_classes.model_specifications import ModelSpecificationDataClass
|
|
8
|
-
from openstef.data_classes.prediction_job import PredictionJobDataClass
|
|
9
|
-
from openstef.model.confidence_interval_applicator import ConfidenceIntervalApplicator
|
|
10
|
-
from openstef.model.regressors.regressor import OpenstfRegressor
|
|
11
|
-
from openstef.model_selection.model_selection import backtest_split_default
|
|
12
|
-
from openstef.pipeline import train_model
|
|
13
|
-
from openstef.postprocessing.postprocessing import (
|
|
14
|
-
add_prediction_job_properties_to_forecast,
|
|
15
|
-
)
|
|
16
|
-
|
|
17
|
-
# Module-level defaults.
# NOTE(review): neither constant is referenced in the visible part of this
# module - confirm whether other modules import them.
# Presumably hours, matching the "horizons ... in hours" wording used below - TODO confirm.
DEFAULT_TRAIN_HORIZONS: list[float] = [0.25, 24.0]
DEFAULT_EARLY_STOPPING_ROUNDS: int = 10
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def train_model_and_forecast_back_test(
    pj: PredictionJobDataClass,
    modelspecs: ModelSpecificationDataClass,
    input_data: pd.DataFrame,
    training_horizons: list[float] = None,
    n_folds: int = 1,
) -> tuple[
    pd.DataFrame,
    list[OpenstfRegressor],
    list[pd.DataFrame],
    list[pd.DataFrame],
    list[pd.DataFrame],
]:
    """Run a back test: train and forecast once per fold, then combine.

    Features are computed once for the whole input. The backtest split function
    then yields the folds; for each fold a model is trained and a forecast is
    made on that fold's test set. The per-fold forecasts are concatenated and
    sorted by index.

    **DO NOT USE THIS PIPELINE FOR OPERATIONAL FORECASTS**

    Args:
        pj: Prediction job.
        modelspecs: Dataclass containing model specifications.
        input_data: Input data.
        training_horizons: Horizons to train on in hours, passed on to the
            feature computation step.
        n_folds: Number of folds handed to the backtest split function.

    Returns:
        - Forecast (pandas.DataFrame)
        - Fitted models (list[OpenStfRegressor])
        - Train data sets (list[pd.DataFrame])
        - Validation data sets (list[pd.DataFrame])
        - Test data sets (list[pd.DataFrame])

    Raises:
        ValueError: When the split function yields no folds (the per-fold
            results cannot be unpacked). Errors raised by the feature
            computation and training steps propagate unchanged.

    """
    # Use the split function configured on the prediction job, or the default.
    configured_split = pj.backtest_split_func
    if configured_split is None:
        backtest_split_func = backtest_split_default
        backtest_split_args = {"stratification_min_max": True}
    else:
        backtest_split_func, backtest_split_args = configured_split.load(
            required_arguments=["data", "n_folds"]
        )

    data_with_features = train_model.train_pipeline_step_compute_features(
        input_data=input_data, pj=pj, model_specs=modelspecs, horizons=training_horizons
    )

    # One (model, forecast, train, validation, test) tuple per fold. Every fold
    # is processed before the results are transposed below.
    per_fold_results = []
    folds = backtest_split_func(data_with_features, n_folds, **backtest_split_args)
    for train_data, validation_data, test_data, _ in folds:
        model_and_forecast = train_model_and_forecast_test_core(
            pj,
            modelspecs,
            train_data,
            validation_data,
            test_data,
        )
        per_fold_results.append(
            model_and_forecast + (train_data, validation_data, test_data)
        )

    # Transpose: from one tuple per fold to one sequence per result type.
    (
        models_folds,
        forecast_folds,
        train_data_folds,
        validation_data_folds,
        test_data_folds,
    ) = zip(*per_fold_results)

    combined_forecast = pd.concat(forecast_folds, axis=0).sort_index()
    return (
        combined_forecast,
        list(models_folds),
        list(train_data_folds),
        list(validation_data_folds),
        list(test_data_folds),
    )
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
def train_model_and_forecast_test_core(
    pj: PredictionJobDataClass,
    modelspecs: ModelSpecificationDataClass,
    train_data: pd.DataFrame,
    validation_data: pd.DataFrame,
    test_data: pd.DataFrame,
) -> tuple[OpenstfRegressor, pd.DataFrame]:
    """Train a model and forecast on the test set.

    The test frame is read positionally: the first column is taken as the
    realised load, the last column as the horizon, and everything in between
    as the model's input features.

    Args:
        pj: Prediction job.
        modelspecs: Dataclass containing model specifications.
        train_data: Train data with computed features.
        validation_data: Validation data with computed features.
        test_data: Test data with computed features.

    Returns:
        - The trained model
        - The forecast on the test set.

    Raises:
        Errors raised by ``train_model.train_pipeline_step_train_model``
        propagate unchanged.

    """
    model = train_model.train_pipeline_step_train_model(
        pj, modelspecs, train_data, validation_data
    )

    horizon_column = test_data.iloc[:, -1]

    # Predict on the feature columns only.
    predictions = model.predict(test_data.iloc[:, 1:-1])
    forecast = pd.DataFrame(index=test_data.index, data={"forecast": predictions})

    # Define tAhead to something meaningful in the context of a backtest.
    forecast["tAhead"] = horizon_column

    # Add confidence
    confidence_applicator = ConfidenceIntervalApplicator(
        model, test_data.iloc[:, 1:-1]
    )
    forecast = confidence_applicator.add_confidence_interval(forecast, pj)

    # Prepare for output
    forecast = add_prediction_job_properties_to_forecast(
        pj, forecast, algorithm_type="backtest"
    )

    # Add columns with realised load and horizon information.
    forecast["realised"] = test_data.iloc[:, 0]
    forecast["horizon"] = horizon_column

    return model, forecast
|