openstef 3.4.10__py3-none-any.whl → 3.4.44__py3-none-any.whl
This diff compares the publicly released contents of the two package versions as they appear in their respective public registries; it is provided for informational purposes only.
- openstef/app_settings.py +19 -0
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z +0 -0
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md +18 -0
- openstef/data/dutch_holidays.csv +1759 -0
- openstef/data_classes/data_prep.py +1 -1
- openstef/data_classes/prediction_job.py +15 -9
- openstef/enums.py +108 -9
- openstef/exceptions.py +1 -1
- openstef/feature_engineering/apply_features.py +25 -6
- openstef/feature_engineering/bidding_zone_to_country_mapping.py +106 -0
- openstef/feature_engineering/cyclic_features.py +102 -0
- openstef/feature_engineering/data_preparation.py +12 -5
- openstef/feature_engineering/feature_applicator.py +1 -5
- openstef/feature_engineering/general.py +14 -0
- openstef/feature_engineering/holiday_features.py +35 -26
- openstef/feature_engineering/missing_values_transformer.py +141 -0
- openstef/feature_engineering/weather_features.py +7 -0
- openstef/metrics/figure.py +3 -0
- openstef/metrics/metrics.py +58 -1
- openstef/metrics/reporter.py +7 -0
- openstef/model/confidence_interval_applicator.py +28 -3
- openstef/model/model_creator.py +54 -41
- openstef/model/objective.py +17 -34
- openstef/model/objective_creator.py +13 -12
- openstef/model/regressors/arima.py +1 -1
- openstef/model/regressors/dazls.py +35 -96
- openstef/model/regressors/flatliner.py +95 -0
- openstef/model/regressors/linear_quantile.py +296 -0
- openstef/model/regressors/xgb.py +23 -0
- openstef/model/regressors/xgb_multioutput_quantile.py +261 -0
- openstef/model/regressors/xgb_quantile.py +3 -0
- openstef/model/serializer.py +10 -0
- openstef/model_selection/model_selection.py +4 -1
- openstef/monitoring/performance_meter.py +1 -2
- openstef/monitoring/teams.py +11 -0
- openstef/pipeline/create_basecase_forecast.py +11 -1
- openstef/pipeline/create_component_forecast.py +24 -28
- openstef/pipeline/create_forecast.py +20 -1
- openstef/pipeline/optimize_hyperparameters.py +18 -16
- openstef/pipeline/train_create_forecast_backtest.py +11 -1
- openstef/pipeline/train_model.py +31 -12
- openstef/pipeline/utils.py +3 -0
- openstef/postprocessing/postprocessing.py +29 -0
- openstef/settings.py +15 -0
- openstef/tasks/calculate_kpi.py +23 -20
- openstef/tasks/create_basecase_forecast.py +15 -7
- openstef/tasks/create_components_forecast.py +24 -8
- openstef/tasks/create_forecast.py +9 -6
- openstef/tasks/create_solar_forecast.py +4 -4
- openstef/tasks/optimize_hyperparameters.py +2 -2
- openstef/tasks/split_forecast.py +9 -2
- openstef/tasks/train_model.py +9 -7
- openstef/tasks/utils/taskcontext.py +7 -0
- openstef/validation/validation.py +28 -3
- {openstef-3.4.10.dist-info → openstef-3.4.44.dist-info}/METADATA +65 -57
- openstef-3.4.44.dist-info/RECORD +97 -0
- {openstef-3.4.10.dist-info → openstef-3.4.44.dist-info}/WHEEL +1 -1
- openstef/data/dazls_model_3.4.0/dazls_stored_3.4.0_adaptation_model.z +0 -0
- openstef/data/dazls_model_3.4.0/dazls_stored_3.4.0_adaptation_model_features.z +0 -0
- openstef/data/dazls_model_3.4.0/dazls_stored_3.4.0_adaptation_model_scaler.z +0 -0
- openstef/data/dazls_model_3.4.0/dazls_stored_3.4.0_domain_model.z +0 -0
- openstef/data/dazls_model_3.4.0/dazls_stored_3.4.0_domain_model.z.license +0 -3
- openstef/data/dazls_model_3.4.0/dazls_stored_3.4.0_domain_model_features.z +0 -2
- openstef/data/dazls_model_3.4.0/dazls_stored_3.4.0_domain_model_features.z.license +0 -3
- openstef/data/dazls_model_3.4.0/dazls_stored_3.4.0_domain_model_scaler.z +0 -0
- openstef/data/dazls_model_3.4.0/dazls_stored_3.4.0_domain_model_scaler.z.license +0 -3
- openstef/data/dazls_model_3.4.0/dazls_stored_3.4.0_target.z +0 -0
- openstef/data/dazls_model_3.4.0/dazls_stored_3.4.0_target.z.license +0 -3
- openstef/data/dazls_model_3.4.0/dazls_stored_3.4.0_target_scaler.z +0 -6
- openstef/data/dazls_model_3.4.0/dazls_stored_3.4.0_target_scaler.z.license +0 -3
- openstef/data/dutch_holidays_2020-2022.csv +0 -831
- openstef/data/dutch_holidays_2020-2022.csv.license +0 -3
- openstef/feature_engineering/historic_features.py +0 -40
- openstef/model/regressors/proloaf.py +0 -281
- openstef/tasks/run_tracy.py +0 -145
- openstef-3.4.10.dist-info/RECORD +0 -104
- /openstef/data/{dazls_model_3.4.0/dazls_stored_3.4.0_adaptation_model.z.license → dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license} +0 -0
- /openstef/data/{dazls_model_3.4.0/dazls_stored_3.4.0_adaptation_model_features.z.license → dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license} +0 -0
- /openstef/data/{dazls_model_3.4.0/dazls_stored_3.4.0_adaptation_model_scaler.z.license → dutch_holidays.csv.license} +0 -0
- {openstef-3.4.10.dist-info → openstef-3.4.44.dist-info}/LICENSE +0 -0
- {openstef-3.4.10.dist-info → openstef-3.4.44.dist-info}/top_level.txt +0 -0
openstef/pipeline/create_basecase_forecast.py
CHANGED
@@ -1,13 +1,14 @@
 # SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501
 #
 # SPDX-License-Identifier: MPL-2.0
+import logging
 from pathlib import Path

 import pandas as pd
 import structlog

 from openstef.data_classes.prediction_job import PredictionJobDataClass
-from openstef.exceptions import
+from openstef.exceptions import InputDataOngoingZeroFlatlinerError, NoRealisedLoadError
 from openstef.feature_engineering.feature_applicator import (
     OperationalPredictFeatureApplicator,
 )
@@ -18,6 +19,7 @@ from openstef.postprocessing.postprocessing import (
     add_components_base_case_forecast,
     add_prediction_job_properties_to_forecast,
 )
+from openstef.settings import Settings
 from openstef.validation import validation

 MODEL_LOCATION = Path(".")
@@ -38,7 +40,15 @@ def create_basecase_forecast_pipeline(
     Returns:
         Base case forecast

+    Raises:
+        NoRealisedLoadError: When no realised load for given datetime range.
+
     """
+    structlog.configure(
+        wrapper_class=structlog.make_filtering_bound_logger(
+            logging.getLevelName(Settings.log_level)
+        )
+    )
     logger = structlog.get_logger(__name__)

     logger.info("Preprocessing data for basecase forecast")
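The same `structlog.configure(...)` block now appears at the top of each pipeline function, filtering log output to the level configured on the new `Settings` object. A minimal runnable sketch of what this pattern does, with the literal "INFO" standing in for `Settings.log_level`:

```python
import logging

import structlog

# Filter structlog output below the configured level; logging.getLevelName maps
# a level name to its numeric value (e.g. "INFO" -> 20).
structlog.configure(
    wrapper_class=structlog.make_filtering_bound_logger(logging.getLevelName("INFO"))
)

logger = structlog.get_logger(__name__)
logger.debug("dropped: below the configured level")
logger.info("emitted: at or above the configured level")
```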
openstef/pipeline/create_component_forecast.py
CHANGED
@@ -2,7 +2,10 @@
 #
 # SPDX-License-Identifier: MPL-2.0

+import logging
+
 import joblib
+import numpy as np
 import pandas as pd
 import structlog

@@ -11,12 +14,11 @@ from openstef import PROJECT_ROOT
 from openstef.data_classes.prediction_job import PredictionJobDataClass
 from openstef.enums import ForecastType
 from openstef.model.regressors.dazls import Dazls
-
-import numpy as np
+from openstef.settings import Settings

 # Set the path for the Dazls stored model
 DAZLS_STORED = str(
-    PROJECT_ROOT / "openstef" / "data" / "dazls_model_3.4.
+    PROJECT_ROOT / "openstef" / "data" / "dazls_model_3.4.24" / "dazls_stored_3.4.24_"
 )


@@ -96,47 +98,34 @@ def create_components_forecast_pipeline(
            "algtype"

     """
+    structlog.configure(
+        wrapper_class=structlog.make_filtering_bound_logger(
+            logging.getLevelName(Settings.log_level)
+        )
+    )
     logger = structlog.get_logger(__name__)
     logger.info("Make components prediction", pid=pj["id"])

     # Make component forecasts
     try:
-
+        dazls_input_data = create_input(pj, input_data, weather_data)

         # Save and load the model as .sav file (or as .z file)
         # For the code contact: korte.termijn.prognoses@alliander.com
         dazls_model = Dazls()
-        dazls_model.
-        dazls_model.domain_model_scaler = joblib.load(
-            DAZLS_STORED + "domain_model_scaler.z"
-        )
-        dazls_model.domain_model_input_columns = joblib.load(
-            DAZLS_STORED + "domain_model_features.z"
-        )
+        dazls_model.model_ = joblib.load(DAZLS_STORED + "baseline_model.z")

-        dazls_model.adaptation_model = joblib.load(DAZLS_STORED + "adaptation_model.z")
-        dazls_model.adaptation_model_scaler = joblib.load(
-            DAZLS_STORED + "adaptation_model_scaler.z"
-        )
-        dazls_model.adaptation_model_input_columns = joblib.load(
-            DAZLS_STORED + "adaptation_model_features.z"
-        )
-
-        dazls_model.target_columns = joblib.load(DAZLS_STORED + "target.z")
-        dazls_model.target_scaler = joblib.load(DAZLS_STORED + "target_scaler.z")
-
-        logger = structlog.get_logger(__name__)
         logger.info("DAZLS model loaded", dazls_model=str(dazls_model))

         # Use the predict function of Dazls model
         # As input data we use the input_data function which takes into consideration what we want as an input for the forecast and what Dazls can accept as an input
-        forecasts = dazls_model.predict(x=
+        forecasts = dazls_model.predict(x=dazls_input_data)

         # Set the columns for the output forecast dataframe
         forecasts = pd.DataFrame(
             forecasts,
             columns=["forecast_wind_on_shore", "forecast_solar"],
-            index=
+            index=dazls_input_data.index,
         )

         # Make post-processed forecasts for solar and wind power
@@ -151,18 +140,25 @@ def create_components_forecast_pipeline(

         # Make forecast for the component: "forecast_other"
         forecasts["forecast_other"] = (
-
+            dazls_input_data["total_load"]
             - forecasts["forecast_solar"]
             - forecasts["forecast_wind_on_shore"]
         )
+
+        # Make sure the forecasts have the same form as the input data. Pad with 0 if necessary
+        forecasts = forecasts.reindex(index=input_data.index, fill_value=0)
     except Exception as e:
-        # In case something goes wrong we fall back on
+        # In case something goes wrong we fall back on an a zero-filled dataframe
         logger.warning(
             f"Could not make component forecasts: {e}, falling back on series of"
             " zeros!",
             exc_info=e,
         )
-        forecasts = pd.DataFrame(
+        forecasts = pd.DataFrame(
+            data=0,
+            index=input_data.index,
+            columns=["forecast_wind_on_shore", "forecast_solar", "forecast_other"],
+        )

     # Prepare for output
     # Add more prediction properties to the forecast ("pid","customer","description","type","algtype)
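The new `reindex(..., fill_value=0)` step pads the component forecast back onto the full input index, so the output always has the same shape as the input even when DAZLS produced fewer rows. A small illustration with made-up timestamps and values (not openstef code):

```python
import pandas as pd

idx = pd.date_range("2024-01-01", periods=4, freq="15min")

# Suppose the component model only produced forecasts for the first two timestamps.
forecasts = pd.DataFrame(
    {"forecast_wind_on_shore": [1.0, 2.0], "forecast_solar": [0.5, 0.6]},
    index=idx[:2],
)

# Pad the missing timestamps with zeros so the result matches the input index.
forecasts = forecasts.reindex(index=idx, fill_value=0)
print(forecasts)
```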
openstef/pipeline/create_forecast.py
CHANGED
@@ -1,6 +1,8 @@
 # SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501
 #
 # SPDX-License-Identifier: MPL-2.0
+import logging
+
 import pandas as pd
 import structlog

@@ -16,7 +18,9 @@ from openstef.model.serializer import MLflowSerializer
 from openstef.pipeline.utils import generate_forecast_datetime_range
 from openstef.postprocessing.postprocessing import (
     add_prediction_job_properties_to_forecast,
+    sort_quantiles,
 )
+from openstef.settings import Settings
 from openstef.validation import validation


@@ -40,6 +44,10 @@ def create_forecast_pipeline(
     Returns:
         DataFrame with the forecast

+    Raises:
+        InputDataOngoingZeroFlatlinerError: When all recent load measurements are zero.
+        LookupError: When no model is found for the given prediction job in MLflow.
+
     """
     prediction_model_pid = pj["id"]
     # Use the alternative forecast model if it's specify in the pj
@@ -64,7 +72,7 @@ def create_forecast_pipeline_core(
     Computes the forecasts and confidence intervals given a prediction job and input data.
     This pipeline has no database or persisitent storage dependencies.

-    Expected prediction job keys: "resolution_minutes", "
+    Expected prediction job keys: "resolution_minutes", "id", "type",
     "name", "quantiles"

     Args:
@@ -76,7 +84,15 @@ def create_forecast_pipeline_core(
     Returns:
         Forecast

+    Raises:
+        InputDataOngoingZeroFlatlinerError: When all recent load measurements are zero.
+
     """
+    structlog.configure(
+        wrapper_class=structlog.make_filtering_bound_logger(
+            logging.getLevelName(Settings.log_level)
+        )
+    )
     logger = structlog.get_logger(__name__)

     fallback_strategy = "extreme_day"  # this can later be expanded
@@ -142,6 +158,9 @@ def create_forecast_pipeline_core(
         model, forecast_input_data
     ).add_confidence_interval(forecast, pj)

+    # Sort quantiles - prevents crossing and is statistically sound
+    forecast = sort_quantiles(forecast)
+
     # Prepare for output
     forecast = add_prediction_job_properties_to_forecast(
         pj,
openstef/pipeline/optimize_hyperparameters.py
CHANGED
@@ -1,8 +1,9 @@
 # SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501
 #
 # SPDX-License-Identifier: MPL-2.0
+import logging
 import os
-from typing import Any
+from typing import Any

 import optuna
 import pandas as pd
@@ -21,16 +22,22 @@ from openstef.model.objective import RegressorObjective
 from openstef.model.objective_creator import ObjectiveCreator
 from openstef.model.regressors.regressor import OpenstfRegressor
 from openstef.model.serializer import MLflowSerializer
+from openstef.model_selection.model_selection import split_data_train_validation_test
 from openstef.pipeline.train_model import (
     DEFAULT_TRAIN_HORIZONS_HOURS,
     train_model_pipeline_core,
 )
+from openstef.settings import Settings
 from openstef.validation import validation
-from openstef.model_selection.model_selection import split_data_train_validation_test

 optuna.logging.enable_propagation()  # Propagate logs to the root logger.
 optuna.logging.disable_default_handler()  # Stop showing logs in sys.stderr.

+structlog.configure(
+    wrapper_class=structlog.make_filtering_bound_logger(
+        logging.getLevelName(Settings.log_level)
+    )
+)
 logger = structlog.get_logger(__name__)

 # See https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study.optimize
@@ -59,6 +66,9 @@ def optimize_hyperparameters_pipeline(

     Raises:
         ValueError: If the input_date is insufficient.
+        InputDataInsufficientError: If the input dataframe is empty.
+        InputDataWrongColumnOrderError: If the load column is missing in the input dataframe.
+        OldModelHigherScoreError: When old model is better than new model.

     Returns:
         Optimized hyperparameters.
@@ -119,6 +129,10 @@ def optimize_hyperparameters_pipeline_core(

     Raises:
         ValueError: If the input_date is insufficient.
+        InputDataInsufficientError: If the input dataframe is empty.
+        InputDataWrongColumnOrderError: If the load column is missing in the input dataframe.
+        OldModelHigherScoreError: When old model is better than new model.
+        InputDataOngoingZeroFlatlinerError: When all recent load measurements are zero.

     Returns:
         - Best model,
@@ -175,18 +189,6 @@ def optimize_hyperparameters_pipeline_core(
         horizons=horizons, feature_names=feature_names, feature_modules=feature_modules
     ).add_features(validated_data, pj=pj)

-    # Adds additional proloaf features to the input data, historic_load (equal to the load, first column)
-    if pj["model"] == "proloaf" and "historic_load" not in list(
-        validated_data_with_features.columns
-    ):
-        validated_data_with_features[
-            "historic_load"
-        ] = validated_data_with_features.iloc[:, 0]
-        # Make sure horizons is last column
-        temp_cols = validated_data_with_features.columns.tolist()
-        new_cols = temp_cols[:-2] + [temp_cols[-1]] + [temp_cols[-2]]
-        validated_data_with_features = validated_data_with_features[new_cols]
-
     # Create objective (NOTE: this is a callable class)
     objective = ObjectiveCreator.create_objective(model_type=pj["model"])

@@ -245,7 +247,7 @@ def optuna_optimization(
         - The objective object used by optuna

     """
-    model = ModelCreator.create_model(pj["model"])
+    model = ModelCreator.create_model(pj["model"], **(pj.model_kwargs or {}))
     # Apply set to default hyperparameters if they are specified in the pj
     if pj.default_modelspecs:
         valid_hyper_parameters = {
@@ -268,7 +270,7 @@ def optuna_optimization(
     if pj.train_split_func is None:
         split_func = split_data_train_validation_test
         split_args = {
-            "stratification_min_max":
+            "stratification_min_max": True,
             "back_test": True,
         }
     else:
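`ModelCreator.create_model(pj["model"], **(pj.model_kwargs or {}))` forwards any extra keyword arguments configured on the prediction job to the regressor constructor; the `or {}` keeps the unpacking valid when `model_kwargs` is `None`. A sketch of the idiom with a stand-in factory (the keyword names below are illustrative, not taken from this diff):

```python
def create_model(model_type: str, **kwargs):
    # Stand-in for ModelCreator.create_model; the real factory builds an OpenstfRegressor.
    return {"model_type": model_type, **kwargs}


model_kwargs = None  # e.g. pj.model_kwargs when nothing extra is configured
print(create_model("xgb", **(model_kwargs or {})))  # -> {'model_type': 'xgb'}

model_kwargs = {"n_estimators": 200}  # illustrative keyword only
print(create_model("xgb", **(model_kwargs or {})))  # extra kwargs are forwarded
```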
openstef/pipeline/train_create_forecast_backtest.py
CHANGED
@@ -56,10 +56,16 @@ def train_model_and_forecast_back_test(
         - Validation data sets (list[pd.DataFrame])
         - Test data sets (list[pd.DataFrame])

+    Raises:
+        InputDataInsufficientError: when input data is insufficient.
+        InputDataWrongColumnOrderError: when input data has a invalid column order.
+        ValueError: when the horizon is a string and the corresponding column in not in the input data
+        InputDataOngoingZeroFlatlinerError: when all recent load measurements are zero.
+
     """
     if pj.backtest_split_func is None:
         backtest_split_func = backtest_split_default
-        backtest_split_args = {"stratification_min_max":
+        backtest_split_args = {"stratification_min_max": True}
     else:
         backtest_split_func, backtest_split_args = pj.backtest_split_func.load(
             required_arguments=["data", "n_folds"]
@@ -124,6 +130,10 @@ def train_model_and_forecast_test_core(
         - The trained model
         - The forecast on the test set.

+    Raises:
+        NotImplementedError: When using invalid model type in the prediction job.
+        InputDataWrongColumnOrderError: When 'load' column is not first and 'horizon' column is not last.
+
     """
     model = train_model.train_pipeline_step_train_model(
         pj, modelspecs, train_data, validation_data
openstef/pipeline/train_model.py
CHANGED
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: MPL-2.0
 import logging
 import os
-from typing import Optional, Union
+from typing import Optional, Union, Tuple

 import pandas as pd
 import structlog
@@ -23,6 +23,7 @@ from openstef.model.regressors.regressor import OpenstfRegressor
 from openstef.model.serializer import MLflowSerializer
 from openstef.model.standard_deviation_generator import StandardDeviationGenerator
 from openstef.model_selection.model_selection import split_data_train_validation_test
+from openstef.settings import Settings
 from openstef.validation import validation

 DEFAULT_TRAIN_HORIZONS_HOURS: list[float] = [0.25, 47.0]
@@ -31,6 +32,11 @@ MAXIMUM_MODEL_AGE: int = 7
 DEFAULT_EARLY_STOPPING_ROUNDS: int = 10
 PENALTY_FACTOR_OLD_MODEL: float = 1.2

+structlog.configure(
+    wrapper_class=structlog.make_filtering_bound_logger(
+        logging.getLevelName(Settings.log_level)
+    )
+)
 logger = structlog.get_logger(__name__)


@@ -60,6 +66,13 @@ def train_model_pipeline(
         - The validation dataset with forecasts
         - The test dataset with forecasts

+    Raises:
+        InputDataInsufficientError: when input data is insufficient.
+        InputDataWrongColumnOrderError: when input data has a invalid column order.
+            'load' column should be first and 'horizon' column last.
+        OldModelHigherScoreError: When old model is better than new model.
+        SkipSaveTrainingForecasts: If old model is better or younger than `MAXIMUM_MODEL_AGE`, the model is not saved.
+
     """
     # Initialize serializer
     serializer = MLflowSerializer(mlflow_tracking_uri=mlflow_tracking_uri)
@@ -142,7 +155,7 @@ def train_model_pipeline_core(
     input_data: pd.DataFrame,
     old_model: OpenstfRegressor = None,
     horizons: list[float] = DEFAULT_TRAIN_HORIZONS_HOURS,
-) ->
+) -> Tuple[
     OpenstfRegressor,
     Report,
     ModelSpecificationDataClass,
@@ -164,6 +177,7 @@ def train_model_pipeline_core(
         InputDataInsufficientError: when input data is insufficient.
         InputDataWrongColumnOrderError: when input data has a invalid column order.
         OldModelHigherScoreError: When old model is better than new model.
+        InputDataOngoingZeroFlatlinerError: when all recent load measurements are zero.

     Returns:
         - Fitted_model (OpenstfRegressor)
@@ -172,8 +186,6 @@ def train_model_pipeline_core(
         - Datasets (tuple[pd.DataFrmae, pd.DataFrame, pd.Dataframe): The train, validation and test sets

     """
-    logger = structlog.get_logger(__name__)
-
     # Call common pipeline
     (
         model,
@@ -234,7 +246,9 @@ def train_pipeline_common(
     test_fraction: float = 0.0,
     backtest: bool = False,
     test_data_predefined: pd.DataFrame = pd.DataFrame(),
-) -> tuple[
+) -> tuple[
+    OpenstfRegressor, Report, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame
+]:
     """Common pipeline shared with operational training and backtest training.

     Args:
@@ -257,6 +271,8 @@ def train_pipeline_common(
     Raises:
         InputDataInsufficientError: when input data is insufficient.
         InputDataWrongColumnOrderError: when input data has a invalid column order.
+            'load' column should be first and 'horizon' column last.
+        InputDataOngoingZeroFlatlinerError: when all recent load measurements are zero.

     """
     data_with_features = train_pipeline_step_compute_features(
@@ -300,7 +316,8 @@ def train_pipeline_common(

 def train_pipeline_step_load_model(
     pj: PredictionJobDataClass, serializer: MLflowSerializer
-) ->
+) -> Tuple[OpenstfRegressor, ModelSpecificationDataClass, Union[int, float]]:
+    old_model: Optional[OpenstfRegressor]
     try:
         old_model, model_specs = serializer.load_model(experiment_name=str(pj.id))
         old_model_age = old_model.age  # Age attribute is openstef specific
@@ -346,12 +363,9 @@ def train_pipeline_step_compute_features(
         InputDataInsufficientError: when input data is insufficient.
         InputDataWrongColumnOrderError: when input data has a invalid column order.
         ValueError: when the horizon is a string and the corresponding column in not in the input data
+        InputDataOngoingZeroFlatlinerError: when all recent load measurements are zero.

     """
-    if pj["model"] == "proloaf":
-        # proloaf is only able to train with one horizon
-        horizons = [horizons[0]]
-
     if input_data.empty:
         raise InputDataInsufficientError("Input dataframe is empty")
     elif "load" not in input_data.columns:
@@ -423,6 +437,10 @@ def train_pipeline_step_train_model(
     Returns:
         The trained model

+    Raises:
+        NotImplementedError: When using invalid model type in the prediction job.
+        InputDataWrongColumnOrderError: When 'load' column is not first and 'horizon' column is not last.
+
     """
     # Test if first column is "load" and last column is "horizon"
     if train_data.columns[0] != "load" or train_data.columns[-1] != "horizon":
@@ -435,6 +453,7 @@ def train_pipeline_step_train_model(
     model = ModelCreator.create_model(
         pj["model"],
         quantiles=pj["quantiles"],
+        **(pj.model_kwargs or {}),
     )

     # split x and y data
@@ -493,7 +512,7 @@ def train_pipeline_step_split_data(
     test_fraction: float,
     backtest: bool = False,
     test_data_predefined: pd.DataFrame = pd.DataFrame(),
-) ->
+) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """The default way to perform train, val, test split.

     Args:
@@ -523,7 +542,7 @@ def train_pipeline_step_split_data(
     if pj.train_split_func is None:
         split_func = split_data_train_validation_test
         split_args = {
-            "stratification_min_max":
+            "stratification_min_max": True,
             "back_test": backtest,
         }
     else:
openstef/pipeline/utils.py
CHANGED
@@ -27,6 +27,9 @@ def generate_forecast_datetime_range(
     Returns:
         Start and end datetimes of the forecast range.

+    Raises:
+        ValueError: If the target column does not have null values.
+
     """
     # By labeling the True/False values (based on the isnull() statement) as clusters,
     # we find what True value belongs to what cluster and the amount of True clusters.
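The new `Raises` note documents that `generate_forecast_datetime_range` derives the forecast window from the null values in the target column, so input without any nulls has nothing to forecast. A simplified sketch of that contract (assuming nulls only occur at the end of the series; this is not the library implementation):

```python
import numpy as np
import pandas as pd


def forecast_range_sketch(df: pd.DataFrame, target: str = "load"):
    # Simplified: the forecast range is the trailing block of null target values.
    nulls = df[target].isnull()
    if not nulls.any():
        raise ValueError("Target column has no null values, nothing to forecast.")
    return df.index[nulls][0], df.index[-1]


idx = pd.date_range("2024-01-01", periods=6, freq="15min")
df = pd.DataFrame({"load": [1.0, 1.2, 1.1, np.nan, np.nan, np.nan]}, index=idx)
print(forecast_range_sketch(df))  # (timestamp of first null, last timestamp)
```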
openstef/postprocessing/postprocessing.py
CHANGED
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501
 #
 # SPDX-License-Identifier: MPL-2.0
+import logging
 from enum import Enum

 import numpy as np
@@ -10,6 +11,7 @@ import structlog
 from openstef.data_classes.prediction_job import PredictionJobDataClass
 from openstef.enums import ForecastType
 from openstef.feature_engineering import weather_features
+from openstef.settings import Settings

 # this is the default for "Lagerwey100"
 TURBINE_DATA = {
@@ -219,6 +221,11 @@ def add_prediction_job_properties_to_forecast(
         Dataframe with added metadata.

     """
+    structlog.configure(
+        wrapper_class=structlog.make_filtering_bound_logger(
+            logging.getLevelName(Settings.log_level)
+        )
+    )
     logger = structlog.get_logger(__name__)

     logger.info("Postproces in preparation of storing")
@@ -244,3 +251,25 @@ def add_prediction_job_properties_to_forecast(
     forecast["algtype"] = algorithm_type

     return forecast
+
+
+def sort_quantiles(
+    forecast: pd.DataFrame, quantile_col_start="quantile_P"
+) -> pd.DataFrame:
+    """Sort quantile values so quantiles do not cross.
+
+    This function assumes that all quantile columns start with 'quantile_P' For more academic details on why this is
+    mathematically sounds, please refer to Quantile and Probability Curves Without Crossing (Chernozhukov, 2010)
+
+    """
+    p_columns = [col for col in forecast.columns if col.startswith(quantile_col_start)]
+
+    if len(p_columns) == 0:
+        return forecast
+
+    # sort the columns
+    p_columns = np.sort(p_columns)
+
+    forecast.loc[:, p_columns] = forecast[p_columns].apply(sorted, axis=1).to_list()
+
+    return forecast
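A short usage example of the new `sort_quantiles` helper; the values are made up, and the point is that crossing quantile columns are re-sorted row by row:

```python
import pandas as pd

from openstef.postprocessing.postprocessing import sort_quantiles

# One row where the quantiles cross: P10 > P50.
forecast = pd.DataFrame(
    {"quantile_P10": [12.0], "quantile_P50": [10.0], "quantile_P90": [15.0]}
)

sorted_forecast = sort_quantiles(forecast)
print(sorted_forecast)  # P10=10.0, P50=12.0, P90=15.0 -> monotone again
```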
openstef/settings.py
ADDED
@@ -0,0 +1,15 @@
+# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501
+#
+# SPDX-License-Identifier: MPL-2.0
+
+from functools import lru_cache
+
+from openstef.app_settings import AppSettings
+
+
+@lru_cache
+def _get_app_settings() -> AppSettings:
+    return AppSettings()
+
+
+Settings = _get_app_settings()
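Because `_get_app_settings` is wrapped in `lru_cache`, repeated calls return the same `AppSettings` instance, and the module-level `Settings` is that shared object. A sketch of the singleton behaviour with a stand-in class (`AppSettings` itself lives in the newly added `openstef/app_settings.py`, whose contents are not shown in this diff; only the `log_level` field is relied on above):

```python
from functools import lru_cache


class FakeAppSettings:
    """Stand-in for openstef.app_settings.AppSettings."""

    def __init__(self):
        self.log_level = "INFO"  # assumed field; the pipelines read Settings.log_level


@lru_cache
def _get_app_settings() -> FakeAppSettings:
    return FakeAppSettings()


Settings = _get_app_settings()
assert Settings is _get_app_settings()  # cached: one shared instance
print(Settings.log_level)
```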