openstef 3.4.56__py3-none-any.whl → 4.0.0a3__py3-none-any.whl
This diff shows the content of publicly available package versions as released to their public registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registries.
- openstef-4.0.0a3.dist-info/METADATA +177 -0
- openstef-4.0.0a3.dist-info/RECORD +4 -0
- {openstef-3.4.56.dist-info → openstef-4.0.0a3.dist-info}/WHEEL +1 -2
- openstef/__init__.py +0 -14
- openstef/__main__.py +0 -3
- openstef/app_settings.py +0 -19
- openstef/data/NL_terrestrial_radiation.csv +0 -25585
- openstef/data/NL_terrestrial_radiation.csv.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z +0 -0
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md +0 -18
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license +0 -3
- openstef/data/dutch_holidays.csv +0 -1759
- openstef/data/dutch_holidays.csv.license +0 -3
- openstef/data/pv_single_coefs.csv +0 -601
- openstef/data/pv_single_coefs.csv.license +0 -3
- openstef/data_classes/__init__.py +0 -3
- openstef/data_classes/data_prep.py +0 -99
- openstef/data_classes/model_specifications.py +0 -30
- openstef/data_classes/prediction_job.py +0 -135
- openstef/data_classes/split_function.py +0 -97
- openstef/enums.py +0 -140
- openstef/exceptions.py +0 -74
- openstef/feature_engineering/__init__.py +0 -3
- openstef/feature_engineering/apply_features.py +0 -138
- openstef/feature_engineering/bidding_zone_to_country_mapping.py +0 -106
- openstef/feature_engineering/cyclic_features.py +0 -161
- openstef/feature_engineering/data_preparation.py +0 -152
- openstef/feature_engineering/feature_adder.py +0 -206
- openstef/feature_engineering/feature_applicator.py +0 -202
- openstef/feature_engineering/general.py +0 -141
- openstef/feature_engineering/holiday_features.py +0 -231
- openstef/feature_engineering/lag_features.py +0 -165
- openstef/feature_engineering/missing_values_transformer.py +0 -141
- openstef/feature_engineering/rolling_features.py +0 -58
- openstef/feature_engineering/weather_features.py +0 -492
- openstef/metrics/__init__.py +0 -3
- openstef/metrics/figure.py +0 -303
- openstef/metrics/metrics.py +0 -486
- openstef/metrics/reporter.py +0 -222
- openstef/model/__init__.py +0 -3
- openstef/model/basecase.py +0 -82
- openstef/model/confidence_interval_applicator.py +0 -242
- openstef/model/fallback.py +0 -77
- openstef/model/metamodels/__init__.py +0 -3
- openstef/model/metamodels/feature_clipper.py +0 -90
- openstef/model/metamodels/grouped_regressor.py +0 -222
- openstef/model/metamodels/missing_values_handler.py +0 -138
- openstef/model/model_creator.py +0 -214
- openstef/model/objective.py +0 -426
- openstef/model/objective_creator.py +0 -65
- openstef/model/regressors/__init__.py +0 -3
- openstef/model/regressors/arima.py +0 -197
- openstef/model/regressors/custom_regressor.py +0 -64
- openstef/model/regressors/dazls.py +0 -116
- openstef/model/regressors/flatliner.py +0 -95
- openstef/model/regressors/gblinear_quantile.py +0 -334
- openstef/model/regressors/lgbm.py +0 -29
- openstef/model/regressors/linear.py +0 -90
- openstef/model/regressors/linear_quantile.py +0 -305
- openstef/model/regressors/regressor.py +0 -114
- openstef/model/regressors/xgb.py +0 -52
- openstef/model/regressors/xgb_multioutput_quantile.py +0 -261
- openstef/model/regressors/xgb_quantile.py +0 -228
- openstef/model/serializer.py +0 -431
- openstef/model/standard_deviation_generator.py +0 -81
- openstef/model_selection/__init__.py +0 -3
- openstef/model_selection/model_selection.py +0 -311
- openstef/monitoring/__init__.py +0 -3
- openstef/monitoring/performance_meter.py +0 -92
- openstef/monitoring/teams.py +0 -203
- openstef/pipeline/__init__.py +0 -3
- openstef/pipeline/create_basecase_forecast.py +0 -133
- openstef/pipeline/create_component_forecast.py +0 -168
- openstef/pipeline/create_forecast.py +0 -171
- openstef/pipeline/optimize_hyperparameters.py +0 -317
- openstef/pipeline/train_create_forecast_backtest.py +0 -163
- openstef/pipeline/train_model.py +0 -561
- openstef/pipeline/utils.py +0 -52
- openstef/postprocessing/__init__.py +0 -3
- openstef/postprocessing/postprocessing.py +0 -275
- openstef/preprocessing/__init__.py +0 -3
- openstef/preprocessing/preprocessing.py +0 -42
- openstef/settings.py +0 -15
- openstef/tasks/__init__.py +0 -3
- openstef/tasks/calculate_kpi.py +0 -324
- openstef/tasks/create_basecase_forecast.py +0 -118
- openstef/tasks/create_components_forecast.py +0 -162
- openstef/tasks/create_forecast.py +0 -145
- openstef/tasks/create_solar_forecast.py +0 -420
- openstef/tasks/create_wind_forecast.py +0 -80
- openstef/tasks/optimize_hyperparameters.py +0 -135
- openstef/tasks/split_forecast.py +0 -273
- openstef/tasks/train_model.py +0 -224
- openstef/tasks/utils/__init__.py +0 -3
- openstef/tasks/utils/dependencies.py +0 -107
- openstef/tasks/utils/predictionjobloop.py +0 -243
- openstef/tasks/utils/taskcontext.py +0 -160
- openstef/validation/__init__.py +0 -3
- openstef/validation/validation.py +0 -322
- openstef-3.4.56.dist-info/METADATA +0 -154
- openstef-3.4.56.dist-info/RECORD +0 -102
- openstef-3.4.56.dist-info/top_level.txt +0 -1
- /openstef-3.4.56.dist-info/LICENSE → /openstef-4.0.0a3.dist-info/licenses/LICENSE.md +0 -0
openstef/pipeline/train_model.py
DELETED
@@ -1,561 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501
-#
-# SPDX-License-Identifier: MPL-2.0
-import logging
-import os
-from typing import Optional, Union, Tuple
-
-import pandas as pd
-import structlog
-
-from openstef.data_classes.model_specifications import ModelSpecificationDataClass
-from openstef.data_classes.prediction_job import PredictionJobDataClass
-from openstef.exceptions import (
-    InputDataInsufficientError,
-    InputDataWrongColumnOrderError,
-    OldModelHigherScoreError,
-    SkipSaveTrainingForecasts,
-)
-from openstef.feature_engineering.feature_applicator import TrainFeatureApplicator
-from openstef.metrics.reporter import Report, Reporter
-from openstef.model.model_creator import ModelCreator
-from openstef.model.regressors.regressor import OpenstfRegressor
-from openstef.model.serializer import MLflowSerializer
-from openstef.model.standard_deviation_generator import StandardDeviationGenerator
-from openstef.model_selection.model_selection import split_data_train_validation_test
-from openstef.settings import Settings
-from openstef.validation import validation
-
-DEFAULT_TRAIN_HORIZONS_HOURS: list[float] = [0.25, 47.0]
-MAXIMUM_MODEL_AGE: int = 7
-
-DEFAULT_EARLY_STOPPING_ROUNDS: int = 10
-PENALTY_FACTOR_OLD_MODEL: float = 1.2
-
-structlog.configure(
-    wrapper_class=structlog.make_filtering_bound_logger(
-        logging.getLevelName(Settings.log_level)
-    )
-)
-logger = structlog.get_logger(__name__)
-
-
-def train_model_pipeline(
-    pj: PredictionJobDataClass,
-    input_data: pd.DataFrame,
-    check_old_model_age: bool,
-    mlflow_tracking_uri: str,
-    artifact_folder: str,
-) -> Optional[tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]]:
-    """Middle level pipeline that takes care of all persistent storage dependencies.
-
-    Expected prediction jobs keys: "id",
-    "model", "hyper_params", "feature_names".
-
-    Args:
-        pj: Prediction job
-        input_data: Raw training input data
-        check_old_model_age: Check if training should be skipped because the model is too young
-        mlflow_tracking_uri: Tracking URI for MLFlow
-        artifact_folder: Path where artifacts, such as trained models, are stored
-
-    Returns:
-        If pj.save_train_forecasts is False, None is returned
-        Otherwise:
-            - The train dataset with forecasts
-            - The validation dataset with forecasts
-            - The test dataset with forecasts
-
-    Raises:
-        InputDataInsufficientError: when input data is insufficient.
-        InputDataWrongColumnOrderError: when input data has a invalid column order.
-            'load' column should be first and 'horizon' column last.
-        OldModelHigherScoreError: When old model is better than new model.
-        SkipSaveTrainingForecasts: If old model is better or younger than `MAXIMUM_MODEL_AGE`, the model is not saved.
-
-    """
-    # Initialize serializer
-    serializer = MLflowSerializer(mlflow_tracking_uri=mlflow_tracking_uri)
-
-    # Get old model and age
-    old_model, model_specs, old_model_age = train_pipeline_step_load_model(
-        pj, serializer
-    )
-
-    # Check old model age and continue yes/no
-    if (old_model_age < MAXIMUM_MODEL_AGE) and check_old_model_age:
-        logger.warning(
-            f"Old model is younger than {MAXIMUM_MODEL_AGE} days, skip training"
-        )
-        if pj.save_train_forecasts:
-            raise SkipSaveTrainingForecasts
-        return
-
-    # Train model with core pipeline
-    try:
-        if pj.train_horizons_minutes is None:
-            horizons = DEFAULT_TRAIN_HORIZONS_HOURS
-        else:
-            horizons = [
-                horizon_minutes / 60 for horizon_minutes in pj.train_horizons_minutes
-            ]
-
-        model, report, model_specs_updated, data_sets = train_model_pipeline_core(
-            pj,
-            model_specs,
-            input_data,
-            old_model,
-            horizons=horizons,
-        )
-    except OldModelHigherScoreError as OMHSE:
-        logger.error("Old model is better than new model", pid=pj["id"], exc_info=OMHSE)
-        if pj.save_train_forecasts:
-            raise SkipSaveTrainingForecasts from OMHSE
-        return
-
-    except InputDataInsufficientError as IDIE:
-        logger.error(
-            "Input data is insufficient after validation and cleaning",
-            pid=pj["id"],
-            exc_info=IDIE,
-        )
-        raise InputDataInsufficientError(IDIE)
-
-    except InputDataWrongColumnOrderError as IDWCOE:
-        logger.error(
-            "Wrong column order, 'load' column should be first and 'horizon' column"
-            " last.",
-            pid=pj["id"],
-            exc_info=IDWCOE,
-        )
-        raise InputDataWrongColumnOrderError(IDWCOE)
-
-    # Save model and report. Report is always saved to MLFlow and optionally to disk
-    serializer.save_model(
-        model=model,
-        experiment_name=str(pj["id"]),
-        model_type=pj["model"],
-        model_specs=model_specs_updated,
-        report=report,
-    )
-    if artifact_folder:
-        report_folder = os.path.join(artifact_folder, str(pj["id"]))
-        Reporter.write_report_to_disk(report=report, report_folder=report_folder)
-
-    # Clean up older models
-    serializer.remove_old_models(experiment_name=str(pj["id"]))
-
-    if pj.save_train_forecasts:
-        return data_sets
-
-
-def train_model_pipeline_core(
-    pj: PredictionJobDataClass,
-    model_specs: ModelSpecificationDataClass,
-    input_data: pd.DataFrame,
-    old_model: OpenstfRegressor = None,
-    horizons: list[float] = DEFAULT_TRAIN_HORIZONS_HOURS,
-) -> Tuple[
-    OpenstfRegressor,
-    Report,
-    ModelSpecificationDataClass,
-    tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame],
-]:
-    """Train model core pipeline.
-
-    Trains a new model given a prediction job, input data and compares it to an old model.
-    This pipeline has no database or persistent storage dependencies.
-
-    Args:
-        pj: Prediction job
-        model_specs: Dataclass containing model specifications
-        input_data: Input data
-        old_model: Old model to compare to. Defaults to None.
-        horizons: Horizons to train on in hours, relevant for feature engineering.
-
-    Raises:
-        InputDataInsufficientError: when input data is insufficient.
-        InputDataWrongColumnOrderError: when input data has a invalid column order.
-        OldModelHigherScoreError: When old model is better than new model.
-        InputDataOngoingZeroFlatlinerError: when all recent load measurements are zero.
-
-    Returns:
-        - Fitted_model (OpenstfRegressor)
-        - Report (Report)
-        - Modelspecs (ModelSpecificationDataClass)
-        - Datasets (tuple[pd.DataFrmae, pd.DataFrame, pd.Dataframe): The train, validation and test sets
-
-    """
-    # Call common pipeline
-    (
-        model,
-        report,
-        train_data,
-        validation_data,
-        test_data,
-        operational_score_data,
-    ) = train_pipeline_common(
-        pj,
-        model_specs,
-        input_data,
-        horizons,
-    )
-    model_specs.feature_names = list(train_data.columns)
-
-    # Check if new model is better than old model
-    if old_model:
-        combined = pd.concat([train_data, validation_data])
-        # skip the forecast column added at the end of dataframes
-        if pj.save_train_forecasts:
-            combined = combined.iloc[:, :-1]
-
-        x_data, y_data = (
-            operational_score_data.iloc[:, 1:-1],
-            operational_score_data.iloc[:, 0],
-        )
-
-        # Score method always returns R^2
-        score_new_model = model.score(x_data, y_data)
-
-        # Try to compare new model to old model.
-        # If this does not success, for example since the feature names of the
-        # old model differ from the new model, the new model is considered better
-        try:
-            score_old_model = old_model.score(x_data, y_data)
-
-            # Check if R^2 is better for old model
-            if score_old_model > score_new_model * PENALTY_FACTOR_OLD_MODEL:
-                raise OldModelHigherScoreError(
-                    f"Old model is better than new model for {pj['id']}."
-                )
-
-            logger.info(
-                "New model is better than old model, continuing with training procces"
-            )
-        except ValueError as e:
-            logger.info("Could not compare to old model", pid=pj["id"], exc_info=e)
-
-    return model, report, model_specs, (train_data, validation_data, test_data)
-
-
-def train_pipeline_common(
-    pj: PredictionJobDataClass,
-    model_specs: ModelSpecificationDataClass,
-    input_data: pd.DataFrame,
-    horizons: list[float],
-    test_fraction: float = 0.0,
-    backtest: bool = False,
-    test_data_predefined: pd.DataFrame = pd.DataFrame(),
-) -> tuple[
-    OpenstfRegressor, Report, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame
-]:
-    """Common pipeline shared with operational training and backtest training.
-
-    Args:
-        pj: Prediction job
-        model_specs: Dataclass containing model specifications
-        input_data: Input data
-        horizons: horizons to train on in hours.
-        test_fraction: fraction of data to use for testing
-        backtest: boolean if we need to do a backtest
-        test_data_predefined: Predefined test data frame to be used in the pipeline
-            (empty data frame by default)
-
-    Returns:
-        - The trained model
-        - Report
-        - The train data
-        - The validation data
-        - The test data
-
-    Raises:
-        InputDataInsufficientError: when input data is insufficient.
-        InputDataWrongColumnOrderError: when input data has a invalid column order.
-            'load' column should be first and 'horizon' column last.
-        InputDataOngoingZeroFlatlinerError: when all recent load measurements are zero.
-
-    """
-    data_with_features = train_pipeline_step_compute_features(
-        pj=pj,
-        model_specs=model_specs,
-        input_data=input_data,
-        horizons=horizons,
-    )
-
-    (
-        train_data,
-        validation_data,
-        test_data,
-        operational_score_data,
-    ) = train_pipeline_step_split_data(
-        data_with_features=data_with_features,
-        pj=pj,
-        test_fraction=test_fraction,
-        backtest=backtest,
-        test_data_predefined=test_data_predefined,
-    )
-
-    model = train_pipeline_step_train_model(
-        pj=pj,
-        model_specs=model_specs,
-        train_data=train_data,
-        validation_data=validation_data,
-    )
-
-    # Report about the training process
-    reporter = Reporter(train_data, validation_data, test_data, pj.quantiles)
-    report = reporter.generate_report(model)
-
-    if pj.save_train_forecasts:
-        train_data["forecast"] = model.predict(train_data.iloc[:, 1:-1])
-        validation_data["forecast"] = model.predict(validation_data.iloc[:, 1:-1])
-        test_data["forecast"] = model.predict(test_data.iloc[:, 1:-1])
-
-    return model, report, train_data, validation_data, test_data, operational_score_data
-
-
-def train_pipeline_step_load_model(
-    pj: PredictionJobDataClass, serializer: MLflowSerializer
-) -> Tuple[OpenstfRegressor, ModelSpecificationDataClass, Union[int, float]]:
-    old_model: Optional[OpenstfRegressor]
-    try:
-        old_model, model_specs = serializer.load_model(experiment_name=str(pj.id))
-        old_model_age = old_model.age  # Age attribute is openstef specific
-        return old_model, model_specs, old_model_age
-    except (AttributeError, FileNotFoundError, LookupError):
-        logger.warning("No old model found, training new model", pid=pj.id)
-    except Exception:
-        logger.exception("Old model could not be loaded, training new model", pid=pj.id)
-    old_model = None
-    old_model_age = float("inf")
-    if pj["default_modelspecs"] is not None:
-        model_specs = pj["default_modelspecs"]
-        if model_specs.id != pj.id:
-            raise RuntimeError(
-                "The id of the prediction job and its default model_specs do not"
-                " match."
-            )
-    else:
-        # create basic model_specs
-        model_specs = ModelSpecificationDataClass(id=pj["id"])
-
-    return old_model, model_specs, old_model_age
-
-
-def train_pipeline_step_compute_features(
-    pj: PredictionJobDataClass,
-    model_specs: ModelSpecificationDataClass,
-    input_data: pd.DataFrame,
-    horizons=list[float],
-) -> pd.DataFrame:
-    """Compute features and perform consistency checks.
-
-    Args:
-        pj: Prediction job
-        model_specs: Dataclass containing model specifications
-        input_data: Input data
-        horizons: horizons to train on in hours.
-
-    Returns:
-        The dataframe with features need to train the model
-
-    Raises:
-        InputDataInsufficientError: when input data is insufficient.
-        InputDataWrongColumnOrderError: when input data has a invalid column order.
-        ValueError: when the horizon is a string and the corresponding column in not in the input data
-        InputDataOngoingZeroFlatlinerError: when all recent load measurements are zero.
-
-    """
-    if input_data.empty:
-        raise InputDataInsufficientError("Input dataframe is empty")
-    elif "load" not in input_data.columns:
-        raise InputDataWrongColumnOrderError(
-            "Missing the load column in the input dataframe"
-        )
-
-    if isinstance(horizons, str):
-        if horizons not in set(input_data.columns):
-            raise ValueError(
-                f"The horizon parameter specifies a column name ({horizons}) missing in"
-                " the input data."
-            )
-        else:
-            # sort data to avoid same date repeated multiple time
-            input_data = input_data.sort_values(horizons)
-    # Validate and clean data
-    validated_data = validation.drop_target_na(
-        validation.validate(
-            pj["id"],
-            input_data,
-            pj["flatliner_threshold_minutes"],
-            pj["resolution_minutes"],
-        )
-    )
-    # Check if sufficient data is left after cleaning
-    if not validation.is_data_sufficient(
-        validated_data,
-        pj["completeness_threshold"],
-        pj["minimal_table_length"],
-    ):
-        raise InputDataInsufficientError(
-            "Input data is insufficient, after validation and cleaning"
-        )
-
-    # Custom data prep or legacy behavior
-    if pj.data_prep_class:
-        data_prep_class, data_prep_args = pj.data_prep_class.load()
-        data_with_features = data_prep_class(
-            pj=pj,
-            model_specs=model_specs,
-            horizons=horizons,
-            **data_prep_args,
-        ).prepare_train_data(validated_data)
-    else:
-        data_with_features = TrainFeatureApplicator(
-            horizons=horizons,
-            feature_names=model_specs.feature_names,
-            feature_modules=model_specs.feature_modules,
-        ).add_features(validated_data, pj=pj)
-
-    return data_with_features
-
-
-def train_pipeline_step_train_model(
-    pj: PredictionJobDataClass,
-    model_specs: ModelSpecificationDataClass,
-    train_data: pd.DataFrame,
-    validation_data: pd.DataFrame,
-) -> OpenstfRegressor:
-    """Train the model.
-
-    Args:
-        pj: Prediction job
-        model_specs: Dataclass containing model specifications
-        train_data: The training data
-        validation_data: The test data
-
-    Returns:
-        The trained model
-
-    Raises:
-        NotImplementedError: When using invalid model type in the prediction job.
-        InputDataWrongColumnOrderError: When 'load' column is not first and 'horizon' column is not last.
-
-    """
-    # Test if first column is "load" and last column is "horizon"
-    if train_data.columns[0] != "load" or train_data.columns[-1] != "horizon":
-        raise InputDataWrongColumnOrderError(
-            f"Wrong column order for {pj['id']} "
-            "'load' column should be first and 'horizon' column last."
-        )
-
-    # Create relevant model
-    model = ModelCreator.create_model(
-        pj["model"],
-        quantiles=pj["quantiles"],
-        **(pj.model_kwargs or {}),
-    )
-
-    # split x and y data
-    train_x, train_y = train_data.iloc[:, 1:-1], train_data.iloc[:, 0]
-    validation_x, validation_y = (
-        validation_data.iloc[:, 1:-1],
-        validation_data.iloc[:, 0],
-    )
-
-    # Configure evals for early stopping
-    eval_set = [(train_x, train_y), (validation_x, validation_y)]
-
-    # Set relevant hyperparameters
-    # define protected hyperparams which are derived from prediction_job
-    protected_hyperparams = ["quantiles"]
-    valid_hyper_parameters = {
-        key: value
-        for key, value in model_specs.hyper_params.items()
-        if key in model.get_params().keys() and key not in protected_hyperparams
-    }
-
-    # Add early stopping to set_params if this is supported by the model
-    if "early_stopping_rounds" in model.get_params().keys():
-        valid_hyper_parameters.update(
-            dict(early_stopping_rounds=DEFAULT_EARLY_STOPPING_ROUNDS)
-        )
-
-    # Temporary fix to allow xgboost version upgrade -> set n_estimators if present and None
-    if not valid_hyper_parameters.get("n_estimators", True):
-        valid_hyper_parameters.update(dict(n_estimators=100))
-        logging.info("Deprecation warning: n_estimators=None found, overwriting.")
-
-    model.set_params(**valid_hyper_parameters)
-    model.fit(
-        train_x,
-        train_y,
-        eval_set=eval_set,
-        verbose=False,
-    )
-    # Gets the feature importance df or None if we don't have feature importance
-    model.feature_importance_dataframe = model.set_feature_importance()
-
-    logging.info("Fitted a new model, not yet stored")
-
-    # Do confidence interval determination
-    model = StandardDeviationGenerator(
-        validation_data
-    ).generate_standard_deviation_data(model)
-
-    return model
-
-
-def train_pipeline_step_split_data(
-    data_with_features: pd.DataFrame,
-    pj: PredictionJobDataClass,
-    test_fraction: float,
-    backtest: bool = False,
-    test_data_predefined: pd.DataFrame = pd.DataFrame(),
-) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-    """The default way to perform train, val, test split.
-
-    Args:
-        data_with_features: Input data
-        pj: Prediction job
-        test_fraction: fraction of data to use for testing
-        backtest: boolean if we need to do a backtest
-        test_data_predefined: Predefined test data frame to be used in the pipeline
-            (empty data frame by default)
-
-    Returns:
-        - Train dataset
-        - Validation dataset
-        - Test dataset
-
-    """
-    # if test_data is predefined, apply the pipeline only on the remaining data
-    if not test_data_predefined.empty:
-        test_data_predefined = data_with_features[
-            data_with_features.index.isin(test_data_predefined.index)
-        ].sort_index()
-        data_with_features = data_with_features[
-            ~data_with_features.index.isin(test_data_predefined.index)
-        ].sort_index()
-
-    # Split data
-    if pj.train_split_func is None:
-        split_func = split_data_train_validation_test
-        split_args = {
-            "stratification_min_max": True,
-            "back_test": backtest,
-        }
-    else:
-        split_func, split_args = pj.train_split_func.load(
-            required_arguments=["data", "test_fraction"]
-        )
-
-    train_data, validation_data, test_data, operational_score_data = split_func(
-        data_with_features, test_fraction, **split_args
-    )
-
-    # if test_data is predefined, use this over the returned test_data of split function
-    if not test_data_predefined.empty:
-        test_data = test_data_predefined
-
-    return train_data, validation_data, test_data, operational_score_data
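For readers comparing against a 3.4.x integration, a minimal usage sketch of the removed entry point is given below. Only the call signature is taken from the deleted module above; the `pj` prediction job is assumed to already exist (its construction is deployment-specific and not shown), and the DataFrame, tracking URI, and artifact folder values are illustrative placeholders, not defaults.

```python
# Hedged sketch: how the removed 3.4.x training entry point was typically called.
import pandas as pd

from openstef.data_classes.prediction_job import PredictionJobDataClass
from openstef.pipeline.train_model import train_model_pipeline  # module removed in 4.0.0a3

pj: PredictionJobDataClass = ...  # assumption: an existing, deployment-specific prediction job

# Raw training data; the pipeline expects the 'load' target as the first column.
input_data = pd.DataFrame(
    {"load": [1.0, 1.2, 0.9], "radiation": [0.0, 10.0, 20.0]},
    index=pd.date_range("2023-01-01", periods=3, freq="15min"),
)

datasets = train_model_pipeline(
    pj=pj,
    input_data=input_data,
    check_old_model_age=True,        # skip training when the stored model is younger than 7 days
    mlflow_tracking_uri="./mlruns",  # illustrative local MLflow store
    artifact_folder="./artifacts",   # report is also written to <artifact_folder>/<pj id>
)

# A (train, validation, test) tuple is only returned when pj.save_train_forecasts is True.
if datasets is not None:
    train_df, validation_df, test_df = datasets
```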
openstef/pipeline/utils.py
DELETED
@@ -1,52 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501
-#
-# SPDX-License-Identifier: MPL-2.0
-from datetime import datetime
-
-import pandas as pd
-import scipy.ndimage as mnts
-
-
-def generate_forecast_datetime_range(
-    forecast_data: pd.DataFrame,
-) -> tuple[datetime, datetime]:
-    """Generate forecast range based on last cluster of null values in first target column of forecast data.
-
-    Example:
-
-        A forecast dataset with data between 2021-11-05 and 2021-11-19, and the
-        target column 'load' as first column is given as input to this function. The first
-        column 'load' has null values between 2021-11-17 04:00:00 and 2021-11-19 05:00:00.
-        The null values at the end of the column indicate when forecasts are needed.
-        Therefore this function sets starting time of forecasts as 2021-11-17 04:00:00 and
-        end time of forecasts as 2021-11-19 05:00:00.
-
-    Args:
-        forecast_data: The forecast dataframe.
-
-    Returns:
-        Start and end datetimes of the forecast range.
-
-    Raises:
-        ValueError: If the target column does not have null values.
-
-    """
-    # By labeling the True/False values (based on the isnull() statement) as clusters,
-    # we find what True value belongs to what cluster and the amount of True clusters.
-    label_clusters, n_clusters = mnts.label(forecast_data.iloc[:, 0].isnull().values)
-
-    # If there are zero true clusters, it means the target column does not have nulls
-    if n_clusters == 0:
-        raise ValueError(
-            "Forecast target column must have null values to indicate "
-            "when forecast starts and ends."
-        )
-
-    # If there are multiple true clusters, we select the last cluster as forecast range
-    forecast_range_data = forecast_data.loc[label_clusters == n_clusters]
-    # We select first datetime index of last cluster
-    forecast_start_dt = forecast_range_data.index[0].to_pydatetime()
-
-    # Forecast end is based on last datetime of given forecast data
-    forecast_end_dt = forecast_data.index[-1].to_pydatetime()
-    return forecast_start_dt, forecast_end_dt
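The removed helper depends only on pandas and scipy, so its behavior is easy to reproduce outside openstef. The self-contained sketch below mirrors the logic shown above (label the NaN runs in the target column, take the last run as the forecast window); the eight-row DataFrame is illustrative data, not taken from the package.

```python
# Self-contained sketch of what generate_forecast_datetime_range did in 3.4.x.
import numpy as np
import pandas as pd
import scipy.ndimage as mnts

index = pd.date_range("2021-11-17 00:00:00", periods=8, freq="h")
forecast_data = pd.DataFrame(
    {"load": [1.0, 2.0, np.nan, 3.0, np.nan, np.nan, np.nan, np.nan]}, index=index
)

# Label each consecutive run of NaN values in the first (target) column.
label_clusters, n_clusters = mnts.label(forecast_data.iloc[:, 0].isnull().values)
if n_clusters == 0:
    raise ValueError("Target column must contain null values to mark the forecast range.")

# The last NaN run marks where the forecast starts; the end is the last timestamp.
forecast_range = forecast_data.loc[label_clusters == n_clusters]
forecast_start_dt = forecast_range.index[0].to_pydatetime()
forecast_end_dt = forecast_data.index[-1].to_pydatetime()

print(forecast_start_dt, forecast_end_dt)
# 2021-11-17 04:00:00 2021-11-17 07:00:00
```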