openstef 3.4.56__py3-none-any.whl → 4.0.0a3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in the supported public registries, and is provided for informational purposes only.
- openstef-4.0.0a3.dist-info/METADATA +177 -0
- openstef-4.0.0a3.dist-info/RECORD +4 -0
- {openstef-3.4.56.dist-info → openstef-4.0.0a3.dist-info}/WHEEL +1 -2
- openstef/__init__.py +0 -14
- openstef/__main__.py +0 -3
- openstef/app_settings.py +0 -19
- openstef/data/NL_terrestrial_radiation.csv +0 -25585
- openstef/data/NL_terrestrial_radiation.csv.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z +0 -0
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md +0 -18
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license +0 -3
- openstef/data/dutch_holidays.csv +0 -1759
- openstef/data/dutch_holidays.csv.license +0 -3
- openstef/data/pv_single_coefs.csv +0 -601
- openstef/data/pv_single_coefs.csv.license +0 -3
- openstef/data_classes/__init__.py +0 -3
- openstef/data_classes/data_prep.py +0 -99
- openstef/data_classes/model_specifications.py +0 -30
- openstef/data_classes/prediction_job.py +0 -135
- openstef/data_classes/split_function.py +0 -97
- openstef/enums.py +0 -140
- openstef/exceptions.py +0 -74
- openstef/feature_engineering/__init__.py +0 -3
- openstef/feature_engineering/apply_features.py +0 -138
- openstef/feature_engineering/bidding_zone_to_country_mapping.py +0 -106
- openstef/feature_engineering/cyclic_features.py +0 -161
- openstef/feature_engineering/data_preparation.py +0 -152
- openstef/feature_engineering/feature_adder.py +0 -206
- openstef/feature_engineering/feature_applicator.py +0 -202
- openstef/feature_engineering/general.py +0 -141
- openstef/feature_engineering/holiday_features.py +0 -231
- openstef/feature_engineering/lag_features.py +0 -165
- openstef/feature_engineering/missing_values_transformer.py +0 -141
- openstef/feature_engineering/rolling_features.py +0 -58
- openstef/feature_engineering/weather_features.py +0 -492
- openstef/metrics/__init__.py +0 -3
- openstef/metrics/figure.py +0 -303
- openstef/metrics/metrics.py +0 -486
- openstef/metrics/reporter.py +0 -222
- openstef/model/__init__.py +0 -3
- openstef/model/basecase.py +0 -82
- openstef/model/confidence_interval_applicator.py +0 -242
- openstef/model/fallback.py +0 -77
- openstef/model/metamodels/__init__.py +0 -3
- openstef/model/metamodels/feature_clipper.py +0 -90
- openstef/model/metamodels/grouped_regressor.py +0 -222
- openstef/model/metamodels/missing_values_handler.py +0 -138
- openstef/model/model_creator.py +0 -214
- openstef/model/objective.py +0 -426
- openstef/model/objective_creator.py +0 -65
- openstef/model/regressors/__init__.py +0 -3
- openstef/model/regressors/arima.py +0 -197
- openstef/model/regressors/custom_regressor.py +0 -64
- openstef/model/regressors/dazls.py +0 -116
- openstef/model/regressors/flatliner.py +0 -95
- openstef/model/regressors/gblinear_quantile.py +0 -334
- openstef/model/regressors/lgbm.py +0 -29
- openstef/model/regressors/linear.py +0 -90
- openstef/model/regressors/linear_quantile.py +0 -305
- openstef/model/regressors/regressor.py +0 -114
- openstef/model/regressors/xgb.py +0 -52
- openstef/model/regressors/xgb_multioutput_quantile.py +0 -261
- openstef/model/regressors/xgb_quantile.py +0 -228
- openstef/model/serializer.py +0 -431
- openstef/model/standard_deviation_generator.py +0 -81
- openstef/model_selection/__init__.py +0 -3
- openstef/model_selection/model_selection.py +0 -311
- openstef/monitoring/__init__.py +0 -3
- openstef/monitoring/performance_meter.py +0 -92
- openstef/monitoring/teams.py +0 -203
- openstef/pipeline/__init__.py +0 -3
- openstef/pipeline/create_basecase_forecast.py +0 -133
- openstef/pipeline/create_component_forecast.py +0 -168
- openstef/pipeline/create_forecast.py +0 -171
- openstef/pipeline/optimize_hyperparameters.py +0 -317
- openstef/pipeline/train_create_forecast_backtest.py +0 -163
- openstef/pipeline/train_model.py +0 -561
- openstef/pipeline/utils.py +0 -52
- openstef/postprocessing/__init__.py +0 -3
- openstef/postprocessing/postprocessing.py +0 -275
- openstef/preprocessing/__init__.py +0 -3
- openstef/preprocessing/preprocessing.py +0 -42
- openstef/settings.py +0 -15
- openstef/tasks/__init__.py +0 -3
- openstef/tasks/calculate_kpi.py +0 -324
- openstef/tasks/create_basecase_forecast.py +0 -118
- openstef/tasks/create_components_forecast.py +0 -162
- openstef/tasks/create_forecast.py +0 -145
- openstef/tasks/create_solar_forecast.py +0 -420
- openstef/tasks/create_wind_forecast.py +0 -80
- openstef/tasks/optimize_hyperparameters.py +0 -135
- openstef/tasks/split_forecast.py +0 -273
- openstef/tasks/train_model.py +0 -224
- openstef/tasks/utils/__init__.py +0 -3
- openstef/tasks/utils/dependencies.py +0 -107
- openstef/tasks/utils/predictionjobloop.py +0 -243
- openstef/tasks/utils/taskcontext.py +0 -160
- openstef/validation/__init__.py +0 -3
- openstef/validation/validation.py +0 -322
- openstef-3.4.56.dist-info/METADATA +0 -154
- openstef-3.4.56.dist-info/RECORD +0 -102
- openstef-3.4.56.dist-info/top_level.txt +0 -1
- /openstef-3.4.56.dist-info/LICENSE → /openstef-4.0.0a3.dist-info/licenses/LICENSE.md +0 -0
openstef/tasks/split_forecast.py
DELETED
@@ -1,273 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501
-#
-# SPDX-License-Identifier: MPL-2.0
-
-"""This module contains the CRON job that is periodically executed to make prognoses of solar features.
-
-These features are useful for splitting the load
-in solar and wind contributions.
-This is achieved by carrying out the following steps:
-  1. Get the wind and solar reference data for the specific location of the
-     customer
-  2. Get the TDCV (Typical Domestic Consumption Values) data
-  3. Fit a linear combination of above time series to the historic load data to
-     determine the contributions of each energy source.
-  4. Write the resulting coefficients to the SQL database.
-
-Example:
-    This module is meant to be called directly from a CRON job. A description of
-    the CRON job can be found in the /k8s/CronJobs folder.
-    Alternatively this code can be run directly by running::
-
-        $ python split_forecast.py
-
-"""
-import logging
-from datetime import datetime
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-import scipy.optimize
-import structlog
-
-import openstef.monitoring.teams as monitoring
-from openstef.data_classes.prediction_job import PredictionJobDataClass
-from openstef.enums import ModelType
-from openstef.settings import Settings
-from openstef.tasks.utils.predictionjobloop import PredictionJobLoop
-from openstef.tasks.utils.taskcontext import TaskContext
-
-COEF_MAX_FRACTION_DIFF = 0.3
-
-
-def main(config=None, database=None):
-    taskname = Path(__file__).name.replace(".py", "")
-
-    if database is None or config is None:
-        raise RuntimeError(
-            "Please specify a config object and/or database connection object. These"
-            " can be found in the openstef-dbc package."
-        )
-
-    with TaskContext(taskname, config, database) as context:
-        model_type = [ml.value for ml in ModelType]
-
-        PredictionJobLoop(
-            context,
-            model_type=model_type,
-        ).map(split_forecast_task, context)
-
-
-def split_forecast_task(
-    pj: PredictionJobDataClass,
-    context: TaskContext,
-) -> pd.DataFrame:
-    """Function that carries out the energy splitting for a specific prediction job with id pid.
-
-    Args:
-        pj: Prediction job
-
-    Returns:
-        Energy splitting coefficients.
-
-    """
-    structlog.configure(
-        wrapper_class=structlog.make_filtering_bound_logger(
-            logging.getLevelName(Settings.log_level)
-        )
-    )
-    logger = structlog.get_logger(__name__)
-
-    logger.info("Start splitting energy", pid=pj["id"])
-
-    # Get input for splitting
-    input_split_function = context.database.get_input_energy_splitting(pj)
-
-    # Old split method;
-    # find_components() gives two things:
-    # - the split components (load, solar, wind, consumption, Inschatting (=sum of others) )
-    # - coefdict: coefficients of each component; these are not yet an output of Dazls. Let's discuss with JM if we want that
-
-    # Carry out the splitting
-    components, coefdict = find_components(input_split_function)
-
-    # Calculate mean absolute error (MAE)
-    # TODO: use a standard metric function for this
-    error = components[["load", "Inschatting"]].diff(axis=1).iloc[:, 1]
-    mae = error.abs().mean()
-    coefdict.update({"MAE": mae})
-    coefsdf = convert_coefdict_to_coefsdf(pj, input_split_function, coefdict)
-
-    # Get the coefs of previous runs and check if new coefs are valid
-    last_coefsdict = context.database.get_energy_split_coefs(pj)
-    last_coefsdf = convert_coefdict_to_coefsdf(pj, input_split_function, last_coefsdict)
-    invalid_coefs = determine_invalid_coefs(coefsdf, last_coefsdf)
-    if not invalid_coefs.empty:
-        # If coefs not valid, do not update the coefs in the db and send teams
-        # message that something strange is happening
-        monitoring.post_teams(
-            f"New splitting coefficient(s) for pid **{pj['id']}** deviate strongly "
-            "from previously stored coefficients.",
-            url=context.config.teams_monitoring_url,
-            invalid_coefficients=invalid_coefs,
-            coefficients_df=coefsdf,
-        )
-        # Use the last known coefficients for further processing
-        return last_coefsdf
-    else:
-        # Save Results
-        context.database.write_energy_splitting_coefficients(
-            coefsdf, if_exists="append"
-        )
-        logger.info(
-            "Successfully wrote energy split coefficients to database", pid=pj["id"]
-        )
-        return coefsdf
-
-
-def determine_invalid_coefs(
-    new_coefs: pd.DataFrame, last_coefs: pd.DataFrame
-) -> pd.DataFrame:
-    """Determine which new coefficients are invalid and return them.
-
-    Args:
-        new_coefs: df of new coefficients for standard load
-            profiles (i.e. wind, solar, household)
-        last_coefs: df of last coefficients for standard load
-            profiles (i.e. wind, solar, household)
-
-    Returns:
-        Dataframe with invalid coefficients
-
-    """
-    merged_coefs = pd.merge(
-        last_coefs, new_coefs, on="coef_name", how="left", suffixes=["_last", "_new"]
-    )
-    # calculate difference between new and last coefficients, if no new
-    # coefficient, set difference to inf
-    # If coefficient name is not present in new coefficients list, fail. If coefficient
-    # name is not present in last coefficients list, add it.
-    merged_coefs["difference"] = (
-        (merged_coefs.coef_value_last - merged_coefs.coef_value_new)
-        .abs()
-        .fillna(np.inf)
-    )
-    # Check if the absolute difference between last coefficients and new coefficients
-    # is more than COEF_MAX_FRACTION_DIFF x absolute value of last coefficient
-    invalid_coefs = merged_coefs[
-        merged_coefs.difference
-        > (COEF_MAX_FRACTION_DIFF * merged_coefs.coef_value_last).abs()
-    ]
-    return invalid_coefs
-
-
-def convert_coefdict_to_coefsdf(
-    pj: PredictionJobDataClass, input_split_function: pd.DataFrame, coefdict: dict
-) -> pd.DataFrame:
-    """Convert dictionary of coefficients to dataframe with additional data for db storage.
-
-    Args:
-        pj: prediction job
-        input_split_function: df of columns of standard load profiles,
-            i.e. wind, solar, household
-        coefdict: dict of coefficient per standard load profile
-
-    Returns:
-        DataFrame of coefficients to insert in sql
-
-    """
-    #
-    sql_column_labels = ["pid", "date_start", "date_end", "created"]
-    sql_colum_values = [
-        pj["id"],
-        input_split_function.index.min().date(),
-        input_split_function.index.max().date(),
-        datetime.utcnow(),
-    ]
-    coefsdf = pd.DataFrame(
-        {"coef_name": list(coefdict.keys()), "coef_value": list(coefdict.values())}
-    )
-    for i, column in enumerate(sql_column_labels):
-        coefsdf[column] = sql_colum_values[i]
-
-    return coefsdf
-
-
-def find_components(
-    df: pd.DataFrame, zero_bound: bool = True
-) -> tuple[pd.DataFrame, dict]:
-    """Function that does the actual energy splitting.
-
-    Args:
-        df: Input data. The dataframe should contain these columns
-            in exactly this order: [load, wind_ref, pv_ref, multiple tdcv columns]
-        zero_bound: If zero_bound is True coefficients can't be negative.
-
-    Returns:
-        tuple:
-            - DataFrame containing the wind and solar components
-            - Dict with the coefficients that result from the fitting
-
-    """
-    load = df.iloc[:, 0]
-    wind_ref = df.iloc[:, 1]
-    pv_ref = df.iloc[:, 2]
-
-    # Define scaler
-    nedu_scaler = (load.max() - load.min()) / 10
-
-    # Come up with initial guess for the fitting
-    p_wind_guess = 1.0
-    ppv_guess = 1.0
-    p0 = [p_wind_guess, ppv_guess] + (len(df.columns) - 3) * [nedu_scaler]
-
-    # Define fitting bounds
-    if zero_bound:
-        bounds = (0, "inf")
-    else:
-        bounds = ("-inf", "inf")
-
-    # Define function to fit
-    def weighted_sum(x, *args):
-        if len(x) != len(args):
-            raise ValueError("Length of args should match len of x")
-        weights = np.array([v for v in args])
-        return np.dot(x.T, weights)
-
-    # Carry out fitting
-    # See https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.curve_fit.html # noqa
-    coefs, cov = scipy.optimize.curve_fit(
-        weighted_sum,
-        xdata=df.iloc[:, 1:].values.T,
-        ydata=load.values,
-        p0=p0,
-        bounds=bounds,
-        method="trf",
-    )
-
-    # Set 'almost zero' to zero
-    coefs[coefs < 0.1] = 0
-
-    # Reconstruct historical load
-    hist = weighted_sum(df.iloc[:, 1:].values.T, *coefs)
-    histp0 = weighted_sum(df.iloc[:, 1:].values.T, *p0)
-
-    # Make a nice dataframe to return the components
-    components = df.iloc[:, [0]].copy()
-    components["Inschatting"] = hist.T
-    components["p0"] = histp0.T
-    components["Windopwek"] = wind_ref * coefs[0]
-    components["Zonne-opwek"] = pv_ref * coefs[1]
-    components["StandaardVerbruik"] = (df.iloc[:, 3:] * coefs[2:]).sum(axis=1)
-    components["Residu"] = -1 * components.iloc[:, 0:2].diff(axis=1).iloc[:, 1]
-
-    # Make nice dictionary to return coefficients
-    coefdict = {name: value for name, value in zip(df.columns[1:], coefs)}
-
-    # Return result
-    return components, coefdict
-
-
-if __name__ == "__main__":
-    main()
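
The fit at the heart of find_components is a bounded linear least-squares problem: the measured load is regressed onto the reference profiles, and the fitted weights become the split coefficients. Below is a minimal, self-contained sketch of the same technique on synthetic data; all profile names and values are illustrative stand-ins, not part of the package::

    import numpy as np
    import scipy.optimize

    rng = np.random.default_rng(0)
    n = 96  # one day of 15-minute intervals

    # Synthetic stand-ins for the wind/solar/TDCV reference profiles
    wind_ref = rng.uniform(0, 1, n)
    pv_ref = np.clip(np.sin(np.linspace(0, np.pi, n)), 0, None)
    tdcv = 0.5 + 0.5 * rng.uniform(0, 1, n)
    refs = np.vstack([wind_ref, pv_ref, tdcv])  # shape: (n_profiles, n)

    # Build a "measured" load from known coefficients plus noise
    true_coefs = np.array([2.0, 3.0, 1.5])
    load = true_coefs @ refs + rng.normal(0, 0.05, n)

    def weighted_sum(x, *weights):
        # One weight per reference profile; x has shape (n_profiles, n)
        return np.dot(x.T, np.array(weights))

    # Non-negative bounds mirror the zero_bound=True branch above
    coefs, _ = scipy.optimize.curve_fit(
        weighted_sum, xdata=refs, ydata=load,
        p0=[1.0, 1.0, 1.0], bounds=(0, np.inf), method="trf",
    )
    coefs[coefs < 0.1] = 0  # treat 'almost zero' as zero, as the task does
    print(dict(zip(["wind", "pv", "tdcv"], coefs.round(2))))

The recovered coefficients should land close to (2.0, 3.0, 1.5), which is exactly how the task infers the wind, solar, and standard-consumption shares of an unknown load mix.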
openstef/tasks/train_model.py
DELETED
@@ -1,224 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501
-#
-# SPDX-License-Identifier: MPL-2.0
-"""This module contains the CRON job that is periodically executed to retrain the prognosis models.
-
-For this the following steps are carried out:
-  1. Get historic training data (TDCV, Load, Weather and day_ahead_electricity_price price data)
-  2. Apply features
-  3. Train and Test the new model
-  4. Check if new model performs better than the old model
-  5. Store the model if it performs better
-  6. Send slack message to inform the users
-
-Example:
-    This module is meant to be called directly from a CRON job. A description of
-    the CRON job can be found in the /k8s/CronJobs folder.
-    Alternatively this code can be run directly by running::
-
-        $ python model_train.py
-
-"""
-from datetime import datetime, timedelta
-from pathlib import Path
-
-import pandas as pd
-
-from openstef.data_classes.prediction_job import PredictionJobDataClass
-from openstef.enums import ModelType, PipelineType
-from openstef.exceptions import (
-    InputDataOngoingZeroFlatlinerError,
-    SkipSaveTrainingForecasts,
-)
-from openstef.model.serializer import MLflowSerializer
-from openstef.pipeline.train_model import (
-    MAXIMUM_MODEL_AGE,
-    train_model_pipeline,
-    train_pipeline_step_load_model,
-)
-from openstef.tasks.utils.predictionjobloop import PredictionJobLoop
-from openstef.tasks.utils.taskcontext import TaskContext
-
-TRAINING_PERIOD_DAYS: int = 120
-DEFAULT_CHECK_MODEL_AGE: bool = True
-
-
-def train_model_task(
-    pj: PredictionJobDataClass,
-    context: TaskContext,
-    check_old_model_age: bool = DEFAULT_CHECK_MODEL_AGE,
-    datetime_start: datetime = None,
-    datetime_end: datetime = None,
-) -> None:
-    """Train model task.
-
-    Top level task that trains a new model and makes sure the best available model is
-    stored. On this task level all database and context manager dependencies are resolved.
-
-    Expected prediction job keys: "id", "model", "lat", "lon", "name"
-
-    Args:
-        pj: Prediction job
-        context: Context object that holds a config manager and a
-            database connection.
-        check_old_model_age: check if model is too young to be retrained
-        datetime_start: Start
-        datetime_end: End
-
-    Raises:
-        SkipSaveTrainingForecasts: If old model is better or too young, you don't need to save the training forecast.
-        InputDataOngoingZeroFlatlinerError: If all recent load measurements are zero.
-
-    """
-    # Check pipeline types
-    if PipelineType.TRAIN not in pj.pipelines_to_run:
-        context.logger.info(
-            "Skip this PredictionJob because train pipeline is not specified in the pj."
-        )
-        return
-
-    # TODO: Improve implementation by using a field in the database and leveraging the
-    # `pipelines_to_run` attribute of the `PredictionJobDataClass` object. This
-    # would require a change to the MySQL datamodel.
-    if (
-        context.config.externally_posted_forecasts_pids
-        and pj.id in context.config.externally_posted_forecasts_pids
-    ):
-        context.logger.info(
-            "Skip this PredictionJob because its forecasts are posted by an external process."
-        )
-        return
-
-    # Get the paths for storing model and reports from the config manager
-    mlflow_tracking_uri = context.config.paths_mlflow_tracking_uri
-    context.logger.debug(f"MLflow tracking uri: {mlflow_tracking_uri}")
-    artifact_folder = context.config.paths_artifact_folder
-    context.logger.debug(f"Artifact folder: {artifact_folder}")
-
-    context.perf_meter.checkpoint("Added metadata to PredictionJob")
-
-    # Check the model age before retrieving the input data to speed up train job.
-    # (The exact same model age check is also part of the "train_model_pipeline".)
-
-    # Initialize serializer
-    serializer = MLflowSerializer(mlflow_tracking_uri=mlflow_tracking_uri)
-
-    # Get old model and age
-    _, _, old_model_age = train_pipeline_step_load_model(pj, serializer)
-
-    # Check old model age and continue yes/no
-    if (old_model_age < MAXIMUM_MODEL_AGE) and check_old_model_age:
-        context.perf_meter.checkpoint(
-            f"Old model is younger than {MAXIMUM_MODEL_AGE} days, skip training"
-        )
-        if pj.save_train_forecasts:
-            raise SkipSaveTrainingForecasts
-        return
-
-    # Define start and end of the training input data
-    training_period_days_to_fetch = (
-        TRAINING_PERIOD_DAYS
-        if pj.data_balancing_ratio is None
-        else int(pj.data_balancing_ratio * TRAINING_PERIOD_DAYS)
-    )
-
-    if datetime_end is None:
-        datetime_end = datetime.utcnow()
-    if datetime_start is None:
-        datetime_start = datetime_end - timedelta(days=training_period_days_to_fetch)
-
-    # Get training input data from database
-    input_data = context.database.get_model_input(
-        pid=pj["id"],
-        location=[pj["lat"], pj["lon"]],
-        datetime_start=datetime_start,
-        datetime_end=datetime_end,
-    )
-
-    # If data balancing is enabled, fetch data from 1 year ago and combine it with the
-    # current data
-    if pj.data_balancing_ratio is not None:
-        # Because the data is from the past, we can use the data from the "future"
-        balanced_datetime_start = datetime_end - timedelta(days=365)
-        balanced_datetime_end = balanced_datetime_start + timedelta(
-            days=training_period_days_to_fetch
-        )
-
-        balanced_input_data = context.database.get_model_input(
-            pid=pj["id"],
-            location=[pj["lat"], pj["lon"]],
-            datetime_start=balanced_datetime_start,
-            datetime_end=balanced_datetime_end,
-        )
-
-        input_data = pd.concat(
-            [
-                balanced_input_data,
-                input_data,
-            ]
-        )
-
-    context.perf_meter.checkpoint("Retrieved timeseries input")
-
-    # Execute the model training pipeline
-    try:
-        data_sets = train_model_pipeline(
-            pj,
-            input_data,
-            check_old_model_age=check_old_model_age,
-            mlflow_tracking_uri=mlflow_tracking_uri,
-            artifact_folder=artifact_folder,
-        )
-
-        if data_sets:
-            context.perf_meter.checkpoint("Model trained")
-        else:
-            context.perf_meter.checkpoint("Model not trained")
-
-        if pj.save_train_forecasts:
-            if data_sets is None:
-                raise RuntimeError("Forecasts were not retrieved")
-            if not hasattr(context.database, "write_train_forecasts"):
-                raise RuntimeError(
-                    "Database connector does not support 'write_train_forecasts' while "
-                    "the 'save_train_forecasts' option was activated."
-                )
-            context.database.write_train_forecasts(pj, data_sets)
-            context.logger.debug("Saved forecasts from trained model on datasets")
-    except SkipSaveTrainingForecasts:
-        context.logger.debug("Skip saving forecasts")
-    except InputDataOngoingZeroFlatlinerError:
-        if (
-            context.config.known_zero_flatliners
-            and pj.id in context.config.known_zero_flatliners
-        ):
-            context.logger.info(
-                "No model was trained for this known zero flatliner. No model needs to be trained either, since the fallback forecasts are sufficient."
-            )
-            return
-        else:
-            raise InputDataOngoingZeroFlatlinerError(
-                'All recent load measurements are zero. Check the load profile of this pid as well as related/neighbouring prediction jobs. Afterwards, consider adding this pid to the "known_zero_flatliners" app_setting and possibly removing other pids from the same app_setting.'
-            )
-
-
-def main(model_type=None, config=None, database=None):
-    if database is None or config is None:
-        raise RuntimeError(
-            "Please specify a config object and/or database connection object. These"
-            " can be found in the openstef-dbc package."
-        )
-
-    if model_type is None:
-        model_type = [ml.value for ml in ModelType]
-
-    taskname = Path(__file__).name.replace(".py", "")
-    datetime_now = datetime.utcnow()
-    with TaskContext(taskname, config, database) as context:
-        PredictionJobLoop(context, model_type=model_type).map(
-            train_model_task, context, datetime_end=datetime_now
-        )
-
-
-if __name__ == "__main__":
-    main()
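
The trickiest part of train_model_task above is the window arithmetic when pj.data_balancing_ratio is set: the fetch period shrinks by the ratio, and a second window of the same length is fetched starting one year before datetime_end. Below is a small standalone sketch re-deriving both windows; training_windows is a hypothetical helper written only for illustration, not part of the package::

    from datetime import datetime, timedelta

    TRAINING_PERIOD_DAYS = 120

    def training_windows(data_balancing_ratio=None, now=None):
        # Hypothetical helper mirroring the date logic in train_model_task
        now = now or datetime.utcnow()
        days = (
            TRAINING_PERIOD_DAYS
            if data_balancing_ratio is None
            else int(data_balancing_ratio * TRAINING_PERIOD_DAYS)
        )
        windows = [(now - timedelta(days=days), now)]  # recent window
        if data_balancing_ratio is not None:
            # Second window of equal length, starting one year earlier
            start = now - timedelta(days=365)
            windows.append((start, start + timedelta(days=days)))
        return windows

    # With ratio 0.5: a recent 60-day window plus a 60-day window
    # starting 365 days before "now"
    for start, end in training_windows(0.5, now=datetime(2024, 6, 1)):
        print(start.date(), "->", end.date())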
openstef/tasks/utils/dependencies.py
DELETED
@@ -1,107 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501
-#
-# SPDX-License-Identifier: MPL-2.0
-import random
-from typing import Iterable, Sequence, Set, Union
-
-import networkx as nx
-
-from openstef.data_classes.prediction_job import PredictionJobDataClass
-
-NodeIdType = Union[str, int]
-EdgeType = tuple[NodeIdType, NodeIdType]
-
-
-def has_dependencies(pjs: Iterable[PredictionJobDataClass]) -> bool:
-    """Test whether some prediction jobs have dependencies information.
-
-    Args:
-        pjs: The list of prediction jobs
-
-    Returns:
-        True if some dependency information was found.
-
-    """
-    for pj in pjs:
-        if pj.depends_on is not None and len(pj.depends_on) > 0:
-            return True
-    return False
-
-
-def build_graph_structure(
-    pjs: Iterable[PredictionJobDataClass],
-) -> tuple[Set[NodeIdType], Set[EdgeType]]:
-    """Build the graph of dependencies between prediction jobs.
-
-    Args:
-        pjs: The Iterable of prediction jobs
-
-    Returns:
-        - The set of node ids of the graph
-        - The set of edges in the graph
-
-    """
-    nodes = set()
-    edges = set()
-
-    for pj in pjs:
-        nodes.add(pj["id"])
-        if pj.depends_on is not None:
-            for j in pj.depends_on:
-                edges.add((j, pj["id"]))
-
-    return nodes, edges
-
-
-def build_nx_graph(
-    nodes: Iterable[NodeIdType], edges: Iterable[EdgeType]
-) -> nx.DiGraph:
-    """Build a networkx Directed Graph.
-
-    Args:
-        nodes: The sequence of node ids
-        edges: The sequence of edges
-
-    Returns:
-        The dependency graph
-
-    """
-    graph = nx.DiGraph()
-    graph.add_nodes_from(nodes)
-    graph.add_edges_from(edges)
-    return graph
-
-
-def find_groups(
-    pjs: Sequence[PredictionJobDataClass], randomize_groups: bool = False
-) -> tuple[nx.DiGraph, list[list[PredictionJobDataClass]]]:
-    """Find a sequence of prediction job groups respecting dependencies.
-
-    Compute groups of prediction jobs such that the prediction jobs in a group
-    depend on at least one prediction job in the previous group and do not depend
-    on a prediction job in the following groups.
-    This means that all the prediction jobs in a group can be run in parallel and that
-    if groups are treated in the given order, the dependencies of a prediction job have
-    already been treated when the prediction job is run.
-
-    Args:
-        pjs: The sequence of prediction jobs
-        randomize_groups: Whether subgroups should be randomized.
-
-    Returns:
-        - The dependency graph
-        - The list of prediction job groups
-
-    """
-    nodes, edges = build_graph_structure(pjs)
-    graph = build_nx_graph(nodes, edges)
-    groups = list(nx.topological_generations(graph))
-
-    if randomize_groups:
-        for group in groups:
-            random.shuffle(group)
-
-    # Convert groups of pj ids to groups of pjs
-    pj_id_map = {pj["id"]: i for i, pj in enumerate(pjs)}
-    pj_groups = [[pjs[pj_id_map[pj_id]] for pj_id in group] for group in groups]
-    return graph, pj_groups
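
find_groups leans on networkx's topological_generations, which partitions a directed acyclic graph into layers such that every node depends only on nodes in earlier layers. A minimal sketch of that grouping, with plain dicts standing in for PredictionJobDataClass objects::

    import networkx as nx

    # Illustrative prediction jobs; dicts stand in for PredictionJobDataClass
    pjs = [
        {"id": "a", "depends_on": None},
        {"id": "b", "depends_on": ["a"]},
        {"id": "c", "depends_on": ["a"]},
        {"id": "d", "depends_on": ["b", "c"]},
    ]

    graph = nx.DiGraph()
    graph.add_nodes_from(pj["id"] for pj in pjs)
    # Edges run (dependency, dependent), as in build_graph_structure
    graph.add_edges_from(
        (dep, pj["id"]) for pj in pjs for dep in (pj["depends_on"] or [])
    )

    print(list(nx.topological_generations(graph)))
    # [['a'], ['b', 'c'], ['d']]: each group can run in parallel,
    # and every dependency sits in an earlier group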