openstef 3.4.56__py3-none-any.whl → 4.0.0a3__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- openstef-4.0.0a3.dist-info/METADATA +177 -0
- openstef-4.0.0a3.dist-info/RECORD +4 -0
- {openstef-3.4.56.dist-info → openstef-4.0.0a3.dist-info}/WHEEL +1 -2
- openstef/__init__.py +0 -14
- openstef/__main__.py +0 -3
- openstef/app_settings.py +0 -19
- openstef/data/NL_terrestrial_radiation.csv +0 -25585
- openstef/data/NL_terrestrial_radiation.csv.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z +0 -0
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md +0 -18
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license +0 -3
- openstef/data/dutch_holidays.csv +0 -1759
- openstef/data/dutch_holidays.csv.license +0 -3
- openstef/data/pv_single_coefs.csv +0 -601
- openstef/data/pv_single_coefs.csv.license +0 -3
- openstef/data_classes/__init__.py +0 -3
- openstef/data_classes/data_prep.py +0 -99
- openstef/data_classes/model_specifications.py +0 -30
- openstef/data_classes/prediction_job.py +0 -135
- openstef/data_classes/split_function.py +0 -97
- openstef/enums.py +0 -140
- openstef/exceptions.py +0 -74
- openstef/feature_engineering/__init__.py +0 -3
- openstef/feature_engineering/apply_features.py +0 -138
- openstef/feature_engineering/bidding_zone_to_country_mapping.py +0 -106
- openstef/feature_engineering/cyclic_features.py +0 -161
- openstef/feature_engineering/data_preparation.py +0 -152
- openstef/feature_engineering/feature_adder.py +0 -206
- openstef/feature_engineering/feature_applicator.py +0 -202
- openstef/feature_engineering/general.py +0 -141
- openstef/feature_engineering/holiday_features.py +0 -231
- openstef/feature_engineering/lag_features.py +0 -165
- openstef/feature_engineering/missing_values_transformer.py +0 -141
- openstef/feature_engineering/rolling_features.py +0 -58
- openstef/feature_engineering/weather_features.py +0 -492
- openstef/metrics/__init__.py +0 -3
- openstef/metrics/figure.py +0 -303
- openstef/metrics/metrics.py +0 -486
- openstef/metrics/reporter.py +0 -222
- openstef/model/__init__.py +0 -3
- openstef/model/basecase.py +0 -82
- openstef/model/confidence_interval_applicator.py +0 -242
- openstef/model/fallback.py +0 -77
- openstef/model/metamodels/__init__.py +0 -3
- openstef/model/metamodels/feature_clipper.py +0 -90
- openstef/model/metamodels/grouped_regressor.py +0 -222
- openstef/model/metamodels/missing_values_handler.py +0 -138
- openstef/model/model_creator.py +0 -214
- openstef/model/objective.py +0 -426
- openstef/model/objective_creator.py +0 -65
- openstef/model/regressors/__init__.py +0 -3
- openstef/model/regressors/arima.py +0 -197
- openstef/model/regressors/custom_regressor.py +0 -64
- openstef/model/regressors/dazls.py +0 -116
- openstef/model/regressors/flatliner.py +0 -95
- openstef/model/regressors/gblinear_quantile.py +0 -334
- openstef/model/regressors/lgbm.py +0 -29
- openstef/model/regressors/linear.py +0 -90
- openstef/model/regressors/linear_quantile.py +0 -305
- openstef/model/regressors/regressor.py +0 -114
- openstef/model/regressors/xgb.py +0 -52
- openstef/model/regressors/xgb_multioutput_quantile.py +0 -261
- openstef/model/regressors/xgb_quantile.py +0 -228
- openstef/model/serializer.py +0 -431
- openstef/model/standard_deviation_generator.py +0 -81
- openstef/model_selection/__init__.py +0 -3
- openstef/model_selection/model_selection.py +0 -311
- openstef/monitoring/__init__.py +0 -3
- openstef/monitoring/performance_meter.py +0 -92
- openstef/monitoring/teams.py +0 -203
- openstef/pipeline/__init__.py +0 -3
- openstef/pipeline/create_basecase_forecast.py +0 -133
- openstef/pipeline/create_component_forecast.py +0 -168
- openstef/pipeline/create_forecast.py +0 -171
- openstef/pipeline/optimize_hyperparameters.py +0 -317
- openstef/pipeline/train_create_forecast_backtest.py +0 -163
- openstef/pipeline/train_model.py +0 -561
- openstef/pipeline/utils.py +0 -52
- openstef/postprocessing/__init__.py +0 -3
- openstef/postprocessing/postprocessing.py +0 -275
- openstef/preprocessing/__init__.py +0 -3
- openstef/preprocessing/preprocessing.py +0 -42
- openstef/settings.py +0 -15
- openstef/tasks/__init__.py +0 -3
- openstef/tasks/calculate_kpi.py +0 -324
- openstef/tasks/create_basecase_forecast.py +0 -118
- openstef/tasks/create_components_forecast.py +0 -162
- openstef/tasks/create_forecast.py +0 -145
- openstef/tasks/create_solar_forecast.py +0 -420
- openstef/tasks/create_wind_forecast.py +0 -80
- openstef/tasks/optimize_hyperparameters.py +0 -135
- openstef/tasks/split_forecast.py +0 -273
- openstef/tasks/train_model.py +0 -224
- openstef/tasks/utils/__init__.py +0 -3
- openstef/tasks/utils/dependencies.py +0 -107
- openstef/tasks/utils/predictionjobloop.py +0 -243
- openstef/tasks/utils/taskcontext.py +0 -160
- openstef/validation/__init__.py +0 -3
- openstef/validation/validation.py +0 -322
- openstef-3.4.56.dist-info/METADATA +0 -154
- openstef-3.4.56.dist-info/RECORD +0 -102
- openstef-3.4.56.dist-info/top_level.txt +0 -1
- /openstef-3.4.56.dist-info/LICENSE → /openstef-4.0.0a3.dist-info/licenses/LICENSE.md +0 -0
openstef/postprocessing/postprocessing.py
DELETED

@@ -1,275 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501
-#
-# SPDX-License-Identifier: MPL-2.0
-import logging
-from enum import Enum
-
-import numpy as np
-import pandas as pd
-import structlog
-
-from openstef.data_classes.prediction_job import PredictionJobDataClass
-from openstef.enums import ForecastType
-from openstef.feature_engineering import weather_features
-from openstef.settings import Settings
-
-# This is the default for "Lagerwey100"
-TURBINE_DATA = {
-    "rated_power": 1,
-    "slope_center": 8.07,
-    "steepness": 0.664,
-}
-
-# Set value to define the precision of power; this is needed because comparing to zero sometimes leads to issues for very small values.
-SMALLEST_POWER_UNIT: float = 0.000001
-
-
-def normalize_and_convert_weather_data_for_splitting(
-    weather_data: pd.DataFrame,
-) -> pd.DataFrame:
-    """Normalize and convert weather data for use in energy splitting.
-
-    Args:
-        weather_data: Weather data with "windspeed_100m" and "radiation".
-
-    Returns:
-        Dataframe with "windpower" and "radiation" columns.
-
-    """
-    # Check we have "windspeed_100m" and "radiation" available
-    if not all(
-        elem in weather_data.columns for elem in ["windspeed_100m", "radiation"]
-    ):
-        raise ValueError("weather data does not contain required data!")
-
-    # Prepare output dataframe
-    output_dataframe = pd.DataFrame()
-
-    # Normalize weather data
-    output_dataframe["radiation"] = (
-        weather_data["radiation"]
-        / np.percentile(weather_data["radiation"].dropna(), 99.0)
-        * -1
-    )
-    wind_ref_series = weather_features.calculate_windspeed_at_hubheight(
-        weather_data["windspeed_100m"], fromheight=100
-    )
-    wind_ref = wind_ref_series.to_frame()
-    wind_ref = calculate_wind_power(wind_ref)
-    wind_ref *= -1
-
-    output_dataframe["windpower"] = wind_ref
-    return output_dataframe
-
-
-def calculate_wind_power(
-    windspeed_100m: pd.DataFrame,
-) -> pd.DataFrame:
-    """Calculate the generated wind power based on the wind speed.
-
-    Values are related through the power curve, which is
-    described by turbine_data. Default values are used and are normalized to 1MWp.
-
-    Args:
-        windspeed_100m: Example: ``pd.DataFrame(index=datetime, columns=["windspeed_100m"])``
-
-    Returns:
-        Example output ``pd.DataFrame(index=datetime, columns=["windenergy"])``
-
-    """
-    generated_power = TURBINE_DATA["rated_power"] / (
-        1
-        + np.exp(
-            -TURBINE_DATA["steepness"] * (windspeed_100m - TURBINE_DATA["slope_center"])
-        )
-    )
-    return generated_power["windspeed_100m"].rename("windenergy").to_frame()
-
-
-def split_forecast_in_components(
-    forecast: pd.DataFrame, weather_data: pd.DataFrame, split_coefs: dict
-) -> dict[str, pd.DataFrame]:
-    """Make estimates of energy components based on a given forecast.
-
-    Args:
-        forecast: KTP load forecast
-        weather_data: Weather data for energy splitting; at least "windspeed_100m" and "radiation"
-        split_coefs: Previously determined splitting coefficients for the prediction job
-
-    Returns:
-        Forecast dataframe for each component
-
-    """
-    # Normalize weather data
-    weather_ref_profiles = normalize_and_convert_weather_data_for_splitting(
-        weather_data
-    )
-
-    # Check input
-    if not all(
-        elem in ["windpower", "radiation"]
-        for elem in list(weather_ref_profiles.columns)
-    ):
-        raise ValueError("weather data does not contain required data!")
-
-    # Merge to ensure datetime index is the same
-    weather_ref_profiles = forecast.merge(
-        weather_ref_profiles, how="outer", right_index=True, left_index=True
-    )
-    # Drop rows with duplicate indices
-    weather_ref_profiles = weather_ref_profiles[
-        ~weather_ref_profiles.index.duplicated()
-    ]
-    weather_ref_profiles.replace([np.inf, -np.inf], np.nan).dropna(inplace=True)
-
-    # Prepare output dictionary and list of forecast types
-    components = forecast.copy(deep=True)
-
-    # Calculate profiles of estimated components
-    components["forecast_wind_on_shore"] = (
-        split_coefs["wind_ref"] * weather_ref_profiles["windpower"]
-    )
-    components["forecast_solar"] = (
-        split_coefs["pv_ref"] * weather_ref_profiles["radiation"]
-    )
-    components["forecast_other"] = (
-        weather_ref_profiles["forecast"]
-        - components["forecast_solar"]
-        - components["forecast_wind_on_shore"]
-    )
-
-    # Check that the sign of the production components is negative; raise if the sign is wrong
-    if components["forecast_wind_on_shore"].sum() > 0:
-        raise ValueError("Sign of estimated wind_on_shore component is positive!")
-    if components["forecast_solar"].sum() > 0:
-        raise ValueError("Sign of estimated solar component is positive!")
-
-    # Post-process predictions to ensure realistic values
-    components["forecast_solar"] = post_process_wind_solar(
-        components["forecast_solar"], ForecastType.SOLAR
-    )
-    components["forecast_wind_on_shore"] = post_process_wind_solar(
-        components["forecast_wind_on_shore"], ForecastType.WIND
-    )
-
-    return components.drop("forecast", axis=1).drop("stdev", axis=1).dropna()
-
-
-def post_process_wind_solar(
-    forecast: pd.Series, forecast_type: ForecastType
-) -> pd.DataFrame:
-    """Carry out postprocessing for wind and solar power generators.
-
-    As these points will always produce energy, predicted energy consumption is
-    set to zero. This function enforces the assumption that production is negative
-    and consumption positive.
-
-    Args:
-        forecast: Series with forecast data.
-        forecast_type: Specifies the type of forecast. This can be retrieved
-            from the prediction job as pj['forecast_type']
-
-    Returns:
-        Post-processed forecast.
-
-    """
-    if forecast_type not in [ForecastType.WIND, ForecastType.SOLAR]:
-        return forecast
-
-    # For wind and solar the forecasted value should always be negative.
-    forecast.loc[forecast > (-1 * SMALLEST_POWER_UNIT)] = 0
-
-    # Write the changes back to the forecast
-    return forecast
-
-
-def add_components_base_case_forecast(basecase_forecast: pd.DataFrame) -> pd.DataFrame:
-    """Make a basecase forecast for the forecast_other component.
-
-    This makes a simple basecase components forecast available and ensures that
-    the sum of the components (other, wind and solar) is equal to the normal
-    basecase forecast. This is important for sending GLMD messages correctly to TenneT!
-
-    Args:
-        basecase_forecast: pd.DataFrame with basecase forecast
-
-    Returns:
-        basecase_forecast: pd.DataFrame with extra "forecast_other" component
-
-    """
-    basecase_forecast["forecast_other"] = basecase_forecast["forecast"]
-    return basecase_forecast
-
-
-def add_prediction_job_properties_to_forecast(
-    pj: PredictionJobDataClass,
-    forecast: pd.DataFrame,
-    algorithm_type: str,
-    forecast_type: Enum = None,
-    forecast_quality: str = None,
-) -> pd.DataFrame:
-    """Add prediction job metadata to a forecast dataframe.
-
-    Args:
-        pj: Prediction job.
-        forecast: Forecast dataframe
-        algorithm_type: Type of algorithm used for making the forecast.
-        forecast_type: Type of the forecast. Defaults to None.
-        forecast_quality: Quality of the forecast. Defaults to None.
-
-    Returns:
-        Dataframe with added metadata.
-
-    """
-    structlog.configure(
-        wrapper_class=structlog.make_filtering_bound_logger(
-            logging.getLevelName(Settings.log_level)
-        )
-    )
-    logger = structlog.get_logger(__name__)
-
-    logger.info("Postprocess in preparation of storing")
-    if forecast_type is None:
-        forecast_type = pj["forecast_type"]
-    else:
-        # Get the value from the enum
-        forecast_type = forecast_type.value
-
-    # NOTE this field is only used when making the basecase forecast and fallback
-    if forecast_quality is not None:
-        forecast["quality"] = forecast_quality
-
-    # TODO rename prediction job typ to type
-    # TODO algtype = model_file_path, perhaps we can find a more logical name
-    # TODO perhaps better to make a forecast its own class!
-    # TODO double check and sync this with make_basecase_forecast (other fields are added)
-    # !!!!! TODO fix the requirement for customer
-    forecast["pid"] = pj["id"]
-    forecast["customer"] = pj["name"]
-    forecast["description"] = pj["description"]
-    forecast["type"] = forecast_type
-    forecast["algtype"] = algorithm_type
-
-    return forecast
-
-
-def sort_quantiles(
-    forecast: pd.DataFrame, quantile_col_start="quantile_P"
-) -> pd.DataFrame:
-    """Sort quantile values so quantiles do not cross.
-
-    This function assumes that all quantile columns start with 'quantile_P'. For more academic detail on why this is
-    mathematically sound, please refer to Quantile and Probability Curves Without Crossing (Chernozhukov, 2010).
-
-    """
-    p_columns = [col for col in forecast.columns if col.startswith(quantile_col_start)]
-
-    if len(p_columns) == 0:
-        return forecast
-
-    # Sort the columns
-    p_columns = np.sort(p_columns)
-
-    forecast.loc[:, p_columns] = forecast[p_columns].apply(sorted, axis=1).to_list()
-
-    return forecast
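The deleted module above bundles two self-contained helpers worth illustrating: the logistic power curve behind calculate_wind_power and the row-wise sort behind sort_quantiles. The following is a minimal sketch that copies both function bodies from the hunk above and runs them on toy data; the toy frames and the printed expectations are ours, not part of openstef.

import numpy as np
import pandas as pd

TURBINE_DATA = {"rated_power": 1, "slope_center": 8.07, "steepness": 0.664}

def calculate_wind_power(windspeed_100m: pd.DataFrame) -> pd.DataFrame:
    # Logistic power curve normalized to 1 MWp: P(v) = rated / (1 + exp(-k * (v - v0)))
    generated_power = TURBINE_DATA["rated_power"] / (
        1
        + np.exp(
            -TURBINE_DATA["steepness"] * (windspeed_100m - TURBINE_DATA["slope_center"])
        )
    )
    return generated_power["windspeed_100m"].rename("windenergy").to_frame()

def sort_quantiles(forecast: pd.DataFrame, quantile_col_start="quantile_P") -> pd.DataFrame:
    # Sorting each row across the quantile columns prevents quantile crossing
    p_columns = [col for col in forecast.columns if col.startswith(quantile_col_start)]
    if len(p_columns) == 0:
        return forecast
    p_columns = np.sort(p_columns)
    forecast.loc[:, p_columns] = forecast[p_columns].apply(sorted, axis=1).to_list()
    return forecast

wind = pd.DataFrame({"windspeed_100m": [0.0, 4.0, 8.07, 12.0, 25.0]})
print(calculate_wind_power(wind))  # ~0 at low wind, exactly 0.5 at slope_center, ~1 when saturated

fc = pd.DataFrame(
    {"quantile_P10": [5.0, 2.0], "quantile_P50": [3.0, 4.0], "quantile_P90": [8.0, 1.0]}
)
print(sort_quantiles(fc))  # each row is now non-decreasing across P10..P90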
openstef/preprocessing/preprocessing.py
DELETED

@@ -1,42 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501
-#
-# SPDX-License-Identifier: MPL-2.0
-
-import numpy as np
-import pandas as pd
-
-
-def replace_repeated_values_with_nan(
-    df: pd.DataFrame, threshold: int, column_name: str
-) -> pd.DataFrame:
-    """Replace sequentially repeated values with NaN.
-
-    Args:
-        df: Data with potentially repeating values.
-        threshold: The minimum number of sequentially repeated values needed to trigger the replacement with NaN.
-        column_name: Column name of the input dataframe with repeating values.
-
-    Returns:
-        DataFrame, similar to df, with the desired values set to NaN.
-
-    """
-    data = df.copy()
-
-    # Add a boolean column to mark sequential duplicates
-    data["temp_is_duplicate"] = data[column_name].eq(data[column_name].shift(1))
-
-    # Create a unique identifier for each sequence with the same value, so we can easily remove the correct sequences
-    data["temp_repeated_group"] = (~data["temp_is_duplicate"]).cumsum()
-
-    # Create a mask of sequences larger than or equal to the threshold value
-    mask = (
-        data.groupby("temp_repeated_group")[column_name].transform("count") >= threshold
-    )
-
-    # Replace the masked values with NaN
-    data.loc[mask, column_name] = np.nan
-
-    # Drop temporary columns
-    data = data.drop(["temp_is_duplicate", "temp_repeated_group"], axis=1)
-
-    return data
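The cumsum trick in this deleted function is compact enough that a toy run helps: each run of equal values receives a single group id, because the counter only advances where the value changes, and any run whose length reaches the threshold is blanked wholesale. A minimal sketch, with the function body copied from the hunk above and made-up data:

import numpy as np
import pandas as pd

def replace_repeated_values_with_nan(df, threshold, column_name):
    data = df.copy()
    # True wherever the value equals the previous value (a sequential duplicate)
    data["temp_is_duplicate"] = data[column_name].eq(data[column_name].shift(1))
    # Each run of equal values gets one group id: the counter only advances on a change
    data["temp_repeated_group"] = (~data["temp_is_duplicate"]).cumsum()
    # Mask every row belonging to a run at least `threshold` long
    mask = data.groupby("temp_repeated_group")[column_name].transform("count") >= threshold
    data.loc[mask, column_name] = np.nan
    return data.drop(["temp_is_duplicate", "temp_repeated_group"], axis=1)

df = pd.DataFrame({"load": [1.0, 7.0, 7.0, 7.0, 2.0, 2.0, 3.0]})
print(replace_repeated_values_with_nan(df, threshold=3, column_name="load"))
# The run of three 7.0s becomes NaN; the shorter run of 2.0s is kept.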
openstef/settings.py
DELETED

@@ -1,15 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501
-#
-# SPDX-License-Identifier: MPL-2.0
-
-from functools import lru_cache
-
-from openstef.app_settings import AppSettings
-
-
-@lru_cache
-def _get_app_settings() -> AppSettings:
-    return AppSettings()
-
-
-Settings = _get_app_settings()
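The deleted settings module is a textbook cached singleton: decorating a zero-argument factory with functools.lru_cache memoizes its single result, so every importer sees the same AppSettings instance. A minimal sketch of the same pattern, with a hypothetical plain class standing in for openstef.app_settings.AppSettings:

from functools import lru_cache

class AppSettings:
    # Stand-in for openstef.app_settings.AppSettings (hypothetical, for illustration)
    log_level: str = "INFO"

@lru_cache
def _get_app_settings() -> AppSettings:
    # With no arguments, lru_cache caches the one return value,
    # so repeated calls hand back the same object
    return AppSettings()

Settings = _get_app_settings()
assert _get_app_settings() is Settings  # cached: same instance every call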
openstef/tasks/__init__.py
DELETED

openstef/tasks/calculate_kpi.py
DELETED

@@ -1,324 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501
-#
-# SPDX-License-Identifier: MPL-2.0
-
-"""This module contains the CRON job that is periodically executed to calculate key performance indicators (KPIs).
-
-This code assumes prognoses are available from the persistent storage.
-If these are not available, run create_forecast.py to train all models.
-
-The following tasks are carried out:
-  1: Calculate the KPI for a given pid. Ignore SplitEnergy
-  2: Create figures
-  3: Write KPI to database
-
-Example:
-    This module is meant to be called directly from a CRON job.
-    Alternatively this code can be run directly by running::
-        $ python calculate_kpi.py
-
-"""
-import logging
-
-# Import builtins
-from datetime import datetime, timedelta
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-import structlog
-
-from openstef.data_classes.prediction_job import PredictionJobDataClass
-from openstef.enums import ModelType
-from openstef.exceptions import NoPredictedLoadError, NoRealisedLoadError
-from openstef.metrics import metrics
-from openstef.settings import Settings
-from openstef.tasks.utils.predictionjobloop import PredictionJobLoop
-from openstef.tasks.utils.taskcontext import TaskContext
-from openstef.validation import validation
-
-# Thresholds for retraining and optimizing
-THRESHOLD_RETRAINING = 0.25
-THRESHOLD_OPTIMIZING = 0.50
-
-
-def main(model_type: ModelType = None, config=None, database=None) -> None:
-    taskname = Path(__file__).name.replace(".py", "")
-
-    if database is None or config is None:
-        raise RuntimeError(
-            "Please specify a config object and/or database connection object. These"
-            " can be found in the openstef-dbc package."
-        )
-
-    if model_type is None:
-        model_type = [ml.value for ml in ModelType]
-
-    with TaskContext(taskname, config, database) as context:
-        # Set start and end time
-        start_time = datetime.utcnow() - timedelta(days=1)
-        end_time = datetime.utcnow()
-
-        PredictionJobLoop(context, model_type=model_type).map(
-            check_kpi_task,
-            context,
-            start_time=start_time,
-            end_time=end_time,
-        )
-
-
-def check_kpi_task(
-    pj: PredictionJobDataClass,
-    context: TaskContext,
-    start_time: datetime,
-    end_time: datetime,
-    threshold_optimizing=THRESHOLD_OPTIMIZING,
-    threshold_retraining=THRESHOLD_RETRAINING,
-) -> None:
-    # Apply default parameters if none are provided
-    if start_time is None:
-        start_time = datetime.utcnow() - timedelta(days=1)
-    if end_time is None:
-        end_time = datetime.utcnow()
-
-    # Get realised load data
-    realised = context.database.get_load_pid(pj["id"], start_time, end_time, "15T")
-
-    # Get predicted load
-    predicted_load = context.database.get_predicted_load_tahead(
-        pj, start_time, end_time
-    )
-
-    # Get basecase prediction
-    load_1_week_before = context.database.get_load_pid(
-        pj["id"], start_time - timedelta(days=7), end_time - timedelta(days=7), "15T"
-    )
-    if len(load_1_week_before) > 0:
-        basecase = load_1_week_before.shift(periods=7, freq="d")
-    else:
-        basecase = pd.DataFrame()
-
-    kpis = calc_kpi_for_specific_pid(pj["id"], realised, predicted_load, basecase)
-    # Write KPIs to database
-    context.database.write_kpi(pj, kpis)
-
-    # Add pid to the list of pids that should be retrained or optimized if
-    # performance is insufficient
-    if kpis["47.0h"]["rMAE"] > threshold_retraining:
-        context.logger.warning(
-            "Need to retrain model, retraining threshold rMAE 47h exceeded",
-            t_ahead="47.0h",
-            rMAE=kpis["47.0h"]["rMAE"],
-            retraining_threshold=threshold_retraining,
-        )
-
-    if kpis["47.0h"]["rMAE"] > threshold_optimizing:
-        context.logger.warning(
-            "Need to optimize hyperparameters, optimizing threshold rMAE 47h exceeded",
-            t_ahead="47.0h",
-            rMAE=kpis["47.0h"]["rMAE"],
-            optimizing_threshold=threshold_optimizing,
-        )
-
-
-def calc_kpi_for_specific_pid(
-    pid: int,
-    realised: pd.DataFrame,
-    predicted_load: pd.DataFrame,
-    basecase: pd.DataFrame,
-) -> dict:
-    """Check the model performance based on a pid. This function:
-
-    - loads and combines forecast and realised data
-    - calculates several key performance indicators (KPIs)
-      These metrics include:
-        - RMSE,
-        - bias,
-        - NSME (model efficiency, between -inf and 1)
-        - Mean absolute Error
-
-    Args:
-        pid: Prediction ID for a given prediction job
-        realised: Realised load.
-        predicted_load: Predicted load.
-        basecase: Basecase predicted load.
-
-    Returns:
-        - Dictionary that includes a dictionary for each t_ahead.
-        - Dict includes end date and window (in days) for clarification
-
-    Raises:
-        NoPredictedLoadError: When there is no predicted load for the given datetime range.
-        NoRealisedLoadError: When there is no realised load for the given datetime range.
-
-    Example:
-        To get the rMAE for the 24 hours ahead prediction: kpis['24h']['rMAE']
-
-    """
-    COMPLETENESS_REALISED_THRESHOLDS = 0.7
-    COMPLETENESS_PREDICTED_LOAD_THRESHOLD = 0.7
-
-    structlog.configure(
-        wrapper_class=structlog.make_filtering_bound_logger(
-            logging.getLevelName(Settings.log_level)
-        )
-    )
-    logger = structlog.get_logger(__name__)
-
-    # If predicted is empty
-    if len(predicted_load) == 0:
-        raise NoPredictedLoadError(pid)
-
-    # If realised is empty
-    if len(realised) == 0:
-        raise NoRealisedLoadError(pid)
-
-    # Define start and end time
-    start_time = realised.index.min().to_pydatetime()
-    end_time = realised.index.max().to_pydatetime()
-
-    completeness_realised = validation.calc_completeness_dataframe(realised)[0]
-
-    # Interpolate missing data if needed
-    realised = realised.resample("15T").interpolate(limit=3)
-
-    completeness_predicted_load = validation.calc_completeness_dataframe(predicted_load)
-
-    # Combine the forecast and the realised to make sure indices are matched nicely
-    combined = pd.merge(realised, predicted_load, left_index=True, right_index=True)
-
-    # Add basecase (load in same time period 7 days ago)
-    # Check if basecase is not empty, else make a dummy dataframe
-    if len(basecase) == 0:
-        basecase = pd.DataFrame(columns=["load"])
-    basecase = basecase.rename(columns=dict(load="basecase"))
-
-    combined = combined.merge(basecase, how="left", left_index=True, right_index=True)
-
-    # Raise exception in case of constant load
-    if combined.load.nunique() == 1:
-        logger.warning(
-            "The load is constant! KPIs will still be calculated, but relative metrics"
-            " will be nan."
-        )
-
-    # Define output dictionary
-    kpis = dict()
-
-    # Extract t_aheads from predicted_load,
-    # Make a list of tuples with [(forecast_xh, stdev_xh),(..,..),..]
-    hor_list = [
-        ("forecast_" + t_ahead, "stdev_" + t_ahead)
-        for t_ahead in set(col.split("_")[1] for col in predicted_load.columns)
-    ]
-
-    # Cast end_time to a pandas Timestamp for reporting
-    date = pd.to_datetime(end_time)
-
-    # Calculate model metrics and add them to the output dictionary
-    logger.info("Start calculating kpis")
-    for hor_cols in hor_list:
-        t_ahead_h = hor_cols[0].split("_")[1]
-        fc = combined[hor_cols[0]]  # load predictions
-        st = combined[hor_cols[1]]  # standard deviations of load predictions
-
-        completeness_predicted_load_specific_hor = (
-            validation.calc_completeness_dataframe(fc.to_frame(name=t_ahead_h))[0]
-        )
-        kpis.update(
-            {
-                t_ahead_h: {
-                    "RMSE": metrics.rmse(combined["load"], fc),
-                    "bias": metrics.bias(combined["load"], fc),
-                    "NSME": metrics.nsme(combined["load"], fc),
-                    "MAE": metrics.mae(combined["load"], fc),
-                    "rMAE": metrics.r_mae(combined["load"], fc),
-                    "rMAE_highest": metrics.r_mae_highest(combined["load"], fc),
-                    "rMNE_highest": metrics.r_mne_highest(combined["load"], fc),
-                    "rMPE_highest": metrics.r_mpe_highest(combined["load"], fc),
-                    "rMAE_lowest": metrics.r_mae_lowest(combined["load"], fc),
-                    "skill_score_basecase": metrics.skill_score(
-                        combined["load"],
-                        combined["basecase"],
-                        np.mean(combined["basecase"]),
-                    ),
-                    "skill_score": metrics.skill_score(
-                        combined["load"], fc, np.mean(combined["basecase"])
-                    ),
-                    "skill_score_positive_peaks": metrics.skill_score_positive_peaks(
-                        combined["load"], fc, np.mean(combined["basecase"])
-                    ),
-                    "skill_score_positive_peaks_basecase": metrics.skill_score_positive_peaks(
-                        combined["load"],
-                        combined["basecase"],
-                        np.mean(combined["basecase"]),
-                    ),
-                    "franks_skill_score": metrics.franks_skill_score(
-                        combined["load"], fc, combined["basecase"]
-                    ),
-                    "franks_skill_score_peaks": metrics.franks_skill_score_peaks(
-                        combined["load"], fc, combined["basecase"]
-                    ),
-                    "load_range": combined["load"].max() - combined["load"].min(),
-                    "frac_in_1sdev": metrics.frac_in_stdev(combined["load"], fc, st),
-                    "frac_in_2sdev": metrics.frac_in_stdev(
-                        combined["load"], fc, 2 * st
-                    ),
-                    "completeness_realised": completeness_realised,
-                    "completeness_predicted": completeness_predicted_load_specific_hor,
-                    "date": date,
-                    "window_days": np.round(
-                        (end_time - start_time).total_seconds() / 60.0 / 60.0 / 24.0
-                    ),
-                }
-            }
-        )
-
-        if completeness_realised < COMPLETENESS_REALISED_THRESHOLDS:
-            logger.warning(
-                "Completeness realised load too low",
-                prediction_id=pid,
-                start_time=start_time,
-                end_time=end_time,
-                completeness=completeness_realised,
-                completeness_threshold=COMPLETENESS_REALISED_THRESHOLDS,
-            )
-            set_incomplete_kpi_to_nan(kpis, t_ahead_h)
-        if completeness_predicted_load.any() < COMPLETENESS_PREDICTED_LOAD_THRESHOLD:
-            logger.warning(
-                "Completeness predicted load of specific horizon too low",
-                prediction_id=pid,
-                horizon=t_ahead_h,
-                start_time=start_time,
-                end_time=end_time,
-                completeness=completeness_predicted_load,
-                completeness_threshold=COMPLETENESS_PREDICTED_LOAD_THRESHOLD,
-            )
-            set_incomplete_kpi_to_nan(kpis, t_ahead_h)
-
-    # Return output dictionary
-    return kpis
-
-
-def set_incomplete_kpi_to_nan(kpis: dict, t_ahead_h: str) -> None:
-    """Set the given horizon's KPI metrics to NaN because the underlying data is incomplete.
-
-    Args:
-        kpis: the kpis
-        t_ahead_h: t_ahead_h
-
-    """
-    kpi_metrics = list(kpis[t_ahead_h].keys())
-    # Set to nan
-    for kpi in kpi_metrics:
-        if kpi not in [
-            "completeness_realised",
-            "completeness_predicted",
-            "date",
-            "window_days",
-        ]:
-            kpis[t_ahead_h].update({kpi: np.nan})
-
-
-if __name__ == "__main__":
-    main()
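The core of the deleted task is its per-horizon loop: horizon labels are parsed out of paired forecast_xh/stdev_xh column names, and the resulting rMAE is compared against the retraining and optimizing thresholds. The sketch below mimics that loop on toy data; it computes rMAE as MAE divided by the realised load range, which is an assumption on our part — the deleted openstef.metrics module defines the exact formulas used by the real task.

import pandas as pd

combined = pd.DataFrame(
    {
        "load": [10.0, 12.0, 11.0, 15.0],
        "forecast_24.0h": [11.0, 11.5, 10.0, 14.0],
        "stdev_24.0h": [1.0, 1.0, 1.0, 1.0],
        "forecast_47.0h": [9.0, 13.0, 12.5, 16.0],
        "stdev_47.0h": [1.5, 1.5, 1.5, 1.5],
    }
)

# Same pairing trick as the deleted task: derive horizon labels from column names
horizons = set(col.split("_")[1] for col in combined.columns if col.startswith("forecast_"))
hor_list = [("forecast_" + t_ahead, "stdev_" + t_ahead) for t_ahead in horizons]

kpis = {}
for fc_col, st_col in hor_list:
    t_ahead_h = fc_col.split("_")[1]
    mae = (combined["load"] - combined[fc_col]).abs().mean()
    load_range = combined["load"].max() - combined["load"].min()
    # Simplified rMAE: MAE relative to the realised load range (assumed definition)
    kpis[t_ahead_h] = {"MAE": mae, "rMAE": mae / load_range}

print(kpis["47.0h"]["rMAE"])  # compare against THRESHOLD_RETRAINING = 0.25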