openstef 3.4.56__py3-none-any.whl → 4.0.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. openstef-4.0.0a3.dist-info/METADATA +177 -0
  2. openstef-4.0.0a3.dist-info/RECORD +4 -0
  3. {openstef-3.4.56.dist-info → openstef-4.0.0a3.dist-info}/WHEEL +1 -2
  4. openstef/__init__.py +0 -14
  5. openstef/__main__.py +0 -3
  6. openstef/app_settings.py +0 -19
  7. openstef/data/NL_terrestrial_radiation.csv +0 -25585
  8. openstef/data/NL_terrestrial_radiation.csv.license +0 -3
  9. openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z +0 -0
  10. openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license +0 -3
  11. openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md +0 -18
  12. openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license +0 -3
  13. openstef/data/dutch_holidays.csv +0 -1759
  14. openstef/data/dutch_holidays.csv.license +0 -3
  15. openstef/data/pv_single_coefs.csv +0 -601
  16. openstef/data/pv_single_coefs.csv.license +0 -3
  17. openstef/data_classes/__init__.py +0 -3
  18. openstef/data_classes/data_prep.py +0 -99
  19. openstef/data_classes/model_specifications.py +0 -30
  20. openstef/data_classes/prediction_job.py +0 -135
  21. openstef/data_classes/split_function.py +0 -97
  22. openstef/enums.py +0 -140
  23. openstef/exceptions.py +0 -74
  24. openstef/feature_engineering/__init__.py +0 -3
  25. openstef/feature_engineering/apply_features.py +0 -138
  26. openstef/feature_engineering/bidding_zone_to_country_mapping.py +0 -106
  27. openstef/feature_engineering/cyclic_features.py +0 -161
  28. openstef/feature_engineering/data_preparation.py +0 -152
  29. openstef/feature_engineering/feature_adder.py +0 -206
  30. openstef/feature_engineering/feature_applicator.py +0 -202
  31. openstef/feature_engineering/general.py +0 -141
  32. openstef/feature_engineering/holiday_features.py +0 -231
  33. openstef/feature_engineering/lag_features.py +0 -165
  34. openstef/feature_engineering/missing_values_transformer.py +0 -141
  35. openstef/feature_engineering/rolling_features.py +0 -58
  36. openstef/feature_engineering/weather_features.py +0 -492
  37. openstef/metrics/__init__.py +0 -3
  38. openstef/metrics/figure.py +0 -303
  39. openstef/metrics/metrics.py +0 -486
  40. openstef/metrics/reporter.py +0 -222
  41. openstef/model/__init__.py +0 -3
  42. openstef/model/basecase.py +0 -82
  43. openstef/model/confidence_interval_applicator.py +0 -242
  44. openstef/model/fallback.py +0 -77
  45. openstef/model/metamodels/__init__.py +0 -3
  46. openstef/model/metamodels/feature_clipper.py +0 -90
  47. openstef/model/metamodels/grouped_regressor.py +0 -222
  48. openstef/model/metamodels/missing_values_handler.py +0 -138
  49. openstef/model/model_creator.py +0 -214
  50. openstef/model/objective.py +0 -426
  51. openstef/model/objective_creator.py +0 -65
  52. openstef/model/regressors/__init__.py +0 -3
  53. openstef/model/regressors/arima.py +0 -197
  54. openstef/model/regressors/custom_regressor.py +0 -64
  55. openstef/model/regressors/dazls.py +0 -116
  56. openstef/model/regressors/flatliner.py +0 -95
  57. openstef/model/regressors/gblinear_quantile.py +0 -334
  58. openstef/model/regressors/lgbm.py +0 -29
  59. openstef/model/regressors/linear.py +0 -90
  60. openstef/model/regressors/linear_quantile.py +0 -305
  61. openstef/model/regressors/regressor.py +0 -114
  62. openstef/model/regressors/xgb.py +0 -52
  63. openstef/model/regressors/xgb_multioutput_quantile.py +0 -261
  64. openstef/model/regressors/xgb_quantile.py +0 -228
  65. openstef/model/serializer.py +0 -431
  66. openstef/model/standard_deviation_generator.py +0 -81
  67. openstef/model_selection/__init__.py +0 -3
  68. openstef/model_selection/model_selection.py +0 -311
  69. openstef/monitoring/__init__.py +0 -3
  70. openstef/monitoring/performance_meter.py +0 -92
  71. openstef/monitoring/teams.py +0 -203
  72. openstef/pipeline/__init__.py +0 -3
  73. openstef/pipeline/create_basecase_forecast.py +0 -133
  74. openstef/pipeline/create_component_forecast.py +0 -168
  75. openstef/pipeline/create_forecast.py +0 -171
  76. openstef/pipeline/optimize_hyperparameters.py +0 -317
  77. openstef/pipeline/train_create_forecast_backtest.py +0 -163
  78. openstef/pipeline/train_model.py +0 -561
  79. openstef/pipeline/utils.py +0 -52
  80. openstef/postprocessing/__init__.py +0 -3
  81. openstef/postprocessing/postprocessing.py +0 -275
  82. openstef/preprocessing/__init__.py +0 -3
  83. openstef/preprocessing/preprocessing.py +0 -42
  84. openstef/settings.py +0 -15
  85. openstef/tasks/__init__.py +0 -3
  86. openstef/tasks/calculate_kpi.py +0 -324
  87. openstef/tasks/create_basecase_forecast.py +0 -118
  88. openstef/tasks/create_components_forecast.py +0 -162
  89. openstef/tasks/create_forecast.py +0 -145
  90. openstef/tasks/create_solar_forecast.py +0 -420
  91. openstef/tasks/create_wind_forecast.py +0 -80
  92. openstef/tasks/optimize_hyperparameters.py +0 -135
  93. openstef/tasks/split_forecast.py +0 -273
  94. openstef/tasks/train_model.py +0 -224
  95. openstef/tasks/utils/__init__.py +0 -3
  96. openstef/tasks/utils/dependencies.py +0 -107
  97. openstef/tasks/utils/predictionjobloop.py +0 -243
  98. openstef/tasks/utils/taskcontext.py +0 -160
  99. openstef/validation/__init__.py +0 -3
  100. openstef/validation/validation.py +0 -322
  101. openstef-3.4.56.dist-info/METADATA +0 -154
  102. openstef-3.4.56.dist-info/RECORD +0 -102
  103. openstef-3.4.56.dist-info/top_level.txt +0 -1
  104. /openstef-3.4.56.dist-info/LICENSE → /openstef-4.0.0a3.dist-info/licenses/LICENSE.md +0 -0
@@ -1,486 +0,0 @@
1
- # SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
2
- #
3
- # SPDX-License-Identifier: MPL-2.0
4
-
5
- # This file contains the Objective loss functions for quantile regression from:
6
- # https://gist.github.com/Nikolay-Lysenko/06769d701c1d9c9acb9a66f2f9d7a6c7
7
- #
8
- # SPDX-FileCopyrightText: 2017 Nikolay Lysenko
9
- #
10
- # SPDX-License-Identifier: MIT
11
- """This module contains all metrics to assess forecast quality."""
12
- from typing import Callable
13
-
14
- import numpy as np
15
- import pandas as pd
16
- import xgboost
17
-
18
-
19
def get_eval_metric_function(metric_name: str) -> Callable:
    """Look up an evaluation metric function by its name.

    Args:
        metric_name: Name under which the metric is registered, e.g. ``"rmse"``.

    Returns:
        The function that calculates the requested metric.

    Raises:
        KeyError: If no metric is registered under ``metric_name``.

    """
    # Registry of every metric that can be requested by name.
    available_metrics = {
        "rmse": rmse,
        "bias": bias,
        "nsme": nsme,
        "mae": mae,
        "r_mae": r_mae,
        "frac_in_stdev": frac_in_stdev,
        "r_mae_highest": r_mae_highest,
        "r_mne_highest": r_mne_highest,
        "r_mpe_highest": r_mpe_highest,
        "r_mae_lowest": r_mae_lowest,
        "skill_score": skill_score,
        "skill_score_positive_peaks": skill_score_positive_peaks,
        "franks_skill_score": franks_skill_score,
        "franks_skill_score_peaks": franks_skill_score_peaks,
    }

    if metric_name not in available_metrics:
        raise KeyError(f"Unknown evaluation metric function {metric_name}")

    return available_metrics[metric_name]
53
-
54
-
55
def rmse(realised: pd.Series, forecast: pd.Series) -> float:
    """Calculate the Root Mean Square Error between realised and forecast values.

    Args:
        realised: Realised load.
        forecast: Forecasted load.

    Returns:
        Root Mean Square Error.

    """
    squared_errors = (realised - forecast) ** 2
    return np.sqrt(squared_errors.mean())
67
-
68
-
69
def bias(realised: pd.Series, forecast: pd.Series) -> float:
    """Calculate the bias as the mean signed error (forecast minus realised).

    A positive value means the forecast is on average higher than the
    realised values; the result is in the same unit as the inputs.

    Args:
        realised: Realised load.
        forecast: Forecasted load.

    Returns:
        Bias.

    """
    signed_error = forecast - realised
    return np.mean(signed_error)
81
-
82
-
83
def nsme(realised: pd.Series, forecast: pd.Series) -> float:
    """Calculate the Nash-Sutcliffe model efficiency of a forecast.

    Args:
        realised: Realised load.
        forecast: Forecasted load.

    Returns:
        Nash-Sutcliffe model efficiency. When the division raises a
        ``ZeroDivisionError`` (the spread of ``realised`` around its mean sums
        to zero), 1 is returned.

    """
    # The builtin ``sum`` is used on purpose: the fallback below relies on the
    # division raising ZeroDivisionError.
    squared_error_sum = sum((forecast - realised) ** 2)
    squared_spread_sum = sum((realised - np.mean(realised)) ** 2)

    try:
        ratio = squared_error_sum / squared_spread_sum
    except ZeroDivisionError:
        return 1

    return 1 - ratio
100
-
101
-
102
def mae(realised: pd.Series, forecast: pd.Series) -> float:
    """Calculate the mean absolute error between realised and forecast values.

    Args:
        realised: Realised load.
        forecast: Forecasted load.

    Returns:
        Mean absolute error.

    """
    absolute_errors = np.abs(forecast - realised)
    return np.mean(absolute_errors)
105
-
106
-
107
def r_mae(realised: pd.Series, forecast: pd.Series) -> float:
    """Calculate the relative mean absolute error of a forecast.

    The mean absolute error is divided by the load range (maximum minus
    minimum) of the ``realised`` values that are passed in.

    Args:
        realised: Realised load.
        forecast: Forecasted load.

    Returns:
        Relative mean absolute error; NaN when the load range is zero.

    """
    # Load range over the entire dataset; a zero range becomes NaN so that the
    # division below does not divide by zero.
    load_range = realised.max() - realised.min()
    if load_range == 0:
        load_range = np.nan

    mean_absolute_error = np.mean(np.abs(forecast - realised))
    return mean_absolute_error / load_range
121
-
122
-
123
def frac_in_stdev(realised: pd.Series, forecast: pd.Series, stdev: pd.Series) -> float:
    """Calculate the fraction of forecasts whose absolute error is within ``stdev``.

    Args:
        realised: Realised load.
        forecast: Forecasted load.
        stdev: Standard deviation that the absolute error is compared against.

    Returns:
        Fraction of forecast points with an absolute error of at most
        ``stdev``, rounded to two decimals.

    """
    absolute_error = (forecast - realised).abs()
    n_outside = len(forecast[absolute_error > stdev])
    fraction_inside = 1 - (n_outside / len(forecast))
    return round(fraction_inside, 2)
127
-
128
-
129
def r_mae_highest(
    realised: pd.Series, forecast: pd.Series, percentile: float = 0.95
) -> float:
    """Calculate the relative mean absolute error for the highest realised values.

    With the default ``percentile`` of 0.95 the 5 percent highest realised
    values are evaluated. The mean absolute error on those points is divided
    by the load range (maximum minus minimum) of all ``realised`` values.

    Args:
        realised: Realised load.
        forecast: Forecasted load.
        percentile: Threshold above which realised values count as "highest".
            Values up to and including 1 are interpreted as a fraction
            (0.95 means the 95th percentile); values above 1 are interpreted
            as a percentage (95 means the 95th percentile).

    Returns:
        Relative mean absolute error on the highest realised values; NaN when
        the load range is zero.

    Raises:
        ValueError: If the length of the realised and forecast arrays are not equal.

    """
    # Check if length of both arrays is equal
    if len(np.array(realised)) != len(np.array(forecast)):
        raise ValueError(
            "Error metric can only be calculated for arrays of equal length!"
        )

    # Determine load range on entire dataset; zero range becomes NaN to avoid
    # dividing by zero.
    load_range = realised.max() - realised.min()
    if load_range == 0:
        load_range = np.nan

    # The default (0.95) is a fraction. Passing it to ``np.percentile``, which
    # works on a 0-100 scale, selected nearly all points instead of the top
    # 5 percent, so the threshold is computed with ``np.quantile``.
    quantile = percentile / 100 if percentile > 1 else percentile
    highest_values = realised > np.quantile(realised, quantile)

    # Mean absolute error on the selected points, relative to the load range
    peak_mae = np.mean(np.abs(forecast[highest_values] - realised[highest_values]))

    return peak_mae / load_range
160
-
161
-
162
def r_mne_highest(realised: pd.Series, forecast: pd.Series) -> float:
    """Calculate the relative mean negative error for the 5 percent highest realised values.

    Only points where the forecast is lower than the realised value are
    averaged, so this measure quantifies how much peaks are underestimated.
    The result is divided by the load range of all ``realised`` values.

    Args:
        realised: Realised load; its ``name`` is used as column label.
        forecast: Forecasted load; its ``name`` is used as column label.

    Returns:
        Relative mean negative error, 0.0 when fewer than two peak points are
        underestimated, or 99999.0 when the result is NaN.

    """
    load_col = realised.name
    forecast_col = forecast.name

    # Combine series in one DataFrame
    combined = pd.concat([realised, forecast], axis=1)
    load = combined[load_col]

    # Determine load range on entire dataset; zero range becomes NaN
    load_range = load.max() - load.min()
    if load_range == 0:
        load_range = np.nan

    # Keep only the rows belonging to the 5 percent highest realised load values
    combined["highest"] = load[load > load.quantile(0.95)]
    combined = combined[~np.isnan(combined["highest"])]

    # Forecast error on the selected points; negative means underestimation
    diff = combined[forecast_col] - combined[load_col]
    underestimates = diff[diff < 0]

    if len(underestimates) < 2:
        return 0.0

    relative_error = np.mean(underestimates) / load_range

    return 99999.0 if np.isnan(relative_error) else relative_error
197
-
198
-
199
def r_mpe_highest(realised: pd.Series, forecast: pd.Series) -> float:
    """Calculate the relative mean positive error for the 5 percent highest realised values.

    Only points where the forecast is higher than the realised value are
    averaged, so this measure quantifies how much peaks are overestimated.
    The result is divided by the load range of all ``realised`` values.

    Args:
        realised: Realised load; its ``name`` is used as column label.
        forecast: Forecasted load; its ``name`` is used as column label.

    Returns:
        Relative mean positive error, 0.0 when fewer than two peak points are
        overestimated, or 99999.0 when the result is NaN.

    """
    load_col = realised.name
    forecast_col = forecast.name

    # Combine series in one DataFrame
    combined = pd.concat([realised, forecast], axis=1)
    load = combined[load_col]

    # Determine load range on entire dataset; zero range becomes NaN
    load_range = load.max() - load.min()
    if load_range == 0:
        load_range = np.nan

    # Keep only the rows belonging to the 5 percent highest realised load values
    combined["highest"] = load[load > load.quantile(0.95)]
    combined = combined[~np.isnan(combined["highest"])]

    # Forecast error on the selected points; positive means overestimation
    diff = combined[forecast_col] - combined[load_col]
    overestimates = diff[diff > 0]

    if len(overestimates) < 2:
        return 0.0

    relative_error = np.mean(overestimates) / load_range

    return 99999.0 if np.isnan(relative_error) else relative_error
234
-
235
-
236
def r_mae_lowest(
    realised: pd.Series, forecast: pd.Series, quantile: float = 0.05
) -> float:
    """Calculate the relative mean absolute error for the lowest realised values.

    With the default ``quantile`` of 0.05 the 5 percent lowest realised values
    are evaluated. The mean absolute error on those points is divided by the
    load range (maximum minus minimum) of all ``realised`` values.

    Args:
        realised: Realised load.
        forecast: Forecasted load.
        quantile: Quantile (fraction) below which realised values count as "lowest".

    Returns:
        Relative mean absolute error on the lowest realised values; NaN when
        the load range is zero.

    """
    # Determine load range on entire dataset; zero range becomes NaN
    load_range = realised.max() - realised.min()
    if load_range == 0:
        load_range = np.nan

    # Select the realised values strictly below the requested quantile
    threshold = np.quantile(realised, quantile)
    lowest_values = realised < threshold

    # Mean absolute error on the selected points, relative to the load range
    lowest_mae = np.mean(np.abs(forecast[lowest_values] - realised[lowest_values]))

    return lowest_mae / load_range
257
-
258
-
259
def skill_score(realised: pd.Series, forecast: pd.Series, mean: pd.Series) -> float:
    """Calculate the skill score of a forecast relative to a reference.

    The score is one minus the ratio between the mean absolute error of the
    forecast and the mean absolute error of the reference ``mean``.

    Args:
        realised: Realised load.
        forecast: Forecasted load.
        mean: Reference values that the forecast is compared against.

    Returns:
        Skill score, or 0 when the score is NaN.

    """
    # Align the reference with the realised/forecast index via a DataFrame column
    combined = pd.concat([realised, forecast], axis=1)
    combined["mean"] = mean
    reference = combined["mean"]

    forecast_mae = np.mean(np.abs(forecast - realised))
    reference_mae = np.mean(np.abs(reference - realised))
    score = 1 - (forecast_mae / reference_mae)

    return 0 if np.isnan(score) else score
275
-
276
-
277
def skill_score_positive_peaks(
    realised: pd.Series, forecast: pd.Series, mean: pd.Series
) -> float:
    """Calculate the skill score on the 5 percent highest realised values.

    Args:
        realised: Realised load; its ``name`` is used as column label.
        forecast: Forecasted load; its ``name`` is used as column label.
        mean: Reference values passed on to ``skill_score``.

    Returns:
        Skill score on the positive peaks, or 0 when the score is NaN.

    """
    # Combine series in one DataFrame
    combined = pd.concat([realised, forecast], axis=1)
    load = combined[realised.name]

    # Keep only the rows belonging to the 5 percent highest realised load values
    combined["highest"] = load[load > load.quantile(0.95)]
    peaks = combined[~np.isnan(combined["highest"])]

    # Skill score for the selected points
    peak_score = skill_score(peaks[realised.name], peaks[forecast.name], mean)

    return 0 if np.isnan(peak_score) else peak_score
299
-
300
-
301
def franks_skill_score(
    realised: pd.Series, forecast: pd.Series, basecase: pd.Series, range_: float = 1.0
) -> float:
    """Calculate Franks skill score.

    The score is the improvement in mean absolute error of the forecast over
    the basecase, divided by ``range_``.

    Args:
        realised: Realised load; its ``name`` is used as column label.
        forecast: Forecasted load.
        basecase: Basecase forecast that serves as reference.
        range_: Value to normalise with. The value 1.0 acts as "not given":
            in that case the load range (maximum minus minimum) of
            ``realised`` is used instead.

    Returns:
        Franks skill score, or 0 when the score is NaN.

    """
    # Combine series in one DataFrame
    combined = pd.concat([realised, forecast], axis=1)

    if range_ == 1.0:
        load = combined[realised.name]
        load_range = load.max() - load.min()
        # A zero range becomes NaN, which results in a score of 0 below
        range_ = load_range if load_range != 0 else np.nan

    basecase_mae = np.mean(np.abs(basecase - realised))
    forecast_mae = np.mean(np.abs(forecast - realised))
    score = (basecase_mae - forecast_mae) / range_

    return 0 if np.isnan(score) else score
320
-
321
-
322
def franks_skill_score_peaks(
    realised: pd.Series, forecast: pd.Series, basecase: pd.Series
) -> float:
    """Calculate Franks skill score on the 5 percent highest realised values.

    The load range used for normalisation is determined on all ``realised``
    values, before the peaks are selected.

    Args:
        realised: Realised load; its ``name`` is used as column label.
        forecast: Forecasted load; its ``name`` is used as column label.
        basecase: Basecase forecast; its ``name`` is used as column label.

    Returns:
        Franks skill score on the positive peaks, or 0 when the score is NaN.

    """
    # Combine series in one DataFrame
    combined = pd.concat([realised, forecast, basecase], axis=1)
    load = combined[realised.name]

    # Load range on the entire dataset; a zero range becomes NaN
    load_range = load.max() - load.min()
    if load_range == 0:
        load_range = np.nan

    # Keep only the rows belonging to the 5 percent highest realised load values
    combined["highest"] = load[load > load.quantile(0.95)]
    peaks = combined[~np.isnan(combined["highest"])]

    # Franks skill score for the selected points
    peak_score = franks_skill_score(
        peaks[realised.name],
        peaks[forecast.name],
        peaks[basecase.name],
        range_=load_range,
    )

    return 0 if np.isnan(peak_score) else peak_score
352
-
353
-
354
- # Objective loss functions for quantile regression, from: https://gist.github.com/Nikolay-Lysenko/06769d701c1d9c9acb9a66f2f9d7a6c7
355
-
356
- # SPDX-FileCopyrightText: 2017 Nikolay Lysenko
357
- #
358
- # SPDX-License-Identifier: MIT
359
-
360
-
361
def xgb_quantile_eval(
    preds: np.ndarray, dmatrix: xgboost.DMatrix, quantile: float = 0.2
) -> tuple[str, float]:
    """Customized evaluation metric that equals the quantile regression loss (pinball loss).

    Quantile regression is regression that estimates a specified quantile of
    the target's distribution conditional on given features.

    Args:
        preds: Predicted values.
        dmatrix: xgboost.DMatrix of the input data; its labels are used as
            the true values.
        quantile: Target quantile.

    Returns:
        Tuple of the metric name (``"q<quantile>_loss"``) and the mean
        pinball loss, ignoring NaN values.

    See also:
        https://gist.github.com/Nikolay-Lysenko/06769d701c1d9c9acb9a66f2f9d7a6c7

    """
    labels = dmatrix.get_label()

    # Over-predictions are weighted with (1 - quantile), under-predictions
    # with quantile.
    over_prediction_loss = (preds >= labels) * (1 - quantile) * (preds - labels)
    under_prediction_loss = (preds < labels) * quantile * (labels - preds)

    return (
        f"q{quantile}_loss",
        np.nanmean(over_prediction_loss + under_prediction_loss),
    )
389
-
390
-
391
def xgb_quantile_obj(
    preds: np.ndarray, dmatrix: xgboost.DMatrix, quantile: float = 0.2
) -> tuple[np.ndarray, np.ndarray]:
    """Quantile regression objective function.

    Computes first-order derivative of quantile regression loss and a non-degenerate substitute for second-order
    derivative.

    Substitute is returned instead of zeros, because XGBoost requires non-zero second-order derivatives. See
    this page: https://github.com/dmlc/xgboost/issues/1825 to see why it is possible to use this trick. However, be sure
    that hyperparameter named `max_delta_step` is small enough to satisfy:``0.5 * max_delta_step <= min(quantile, 1 - quantile)``.

    Args:
        preds: numpy.ndarray
        dmatrix: xgboost.DMatrix
        quantile: float between 0 and 1

    Returns:
        Gradient and Hessian

    Raises:
        ValueError: If ``quantile`` is not between 0 and 1.

    See also:
        https://gist.github.com/Nikolay-Lysenko/06769d701c1d9c9acb9a66f2f9d7a6c7

    Reasoning for the hessian:
        https://gist.github.com/Nikolay-Lysenko/06769d701c1d9c9acb9a66f2f9d7a6c7#gistcomment-2322558

    """
    # Explicit check instead of ``assert``: assertions are removed when Python
    # runs with -O, which would silently disable this validation.
    if not 0 <= quantile <= 1:
        raise ValueError("Quantile value must be float between 0 and 1.")

    labels = dmatrix.get_label()
    errors = preds - labels

    left_mask = errors < 0
    right_mask = errors > 0

    # The factor `* errors` is different from the original implementation, however
    # this addition makes the objective function scalable with the size of the error.
    # This solves issues with regression on large (>100) input data.
    grad = (quantile * left_mask + (1 - quantile) * right_mask) * errors
    hess = np.ones_like(preds)

    return grad, hess
436
-
437
-
438
def arctan_loss(y_true, y_pred, taus, s=0.1):
    """Compute the gradient and second derivative of the arctan pinball loss.

    The flat inputs are reshaped to one row per data point and one column per
    quantile: with 100 data points and 10 quantiles, an array of 1000 values
    becomes a (100 x 10) array. The derivatives are then calculated column by
    column. Legibility was chosen over efficiency.

    Args:
        y_true: An array containing the true observations.
        y_pred: An array containing the predicted quantiles.
        taus: A list containing the true desired coverage of the quantiles.
        s: A smoothing parameter.

    Returns:
        grad: An array containing the (negative) gradients with respect to y_pred.
        hess: An array containing the second derivative with respect to y_pred.

    """
    n_values = len(y_true)
    n_quantiles = len(taus)  # The number of columns
    shape = (n_values // n_quantiles, n_quantiles)

    # Each column corresponds to a quantile, each row to a data point.
    predictions = np.reshape(y_pred, shape)
    targets = np.reshape(y_true, shape)

    # Differences between target and prediction, scaled by the smoothing parameter
    scaled_diff = (targets - predictions) / s

    # Output arrays take their dtype from the predictions
    grad = np.zeros_like(predictions)
    hess = np.zeros_like(predictions)
    for column, tau in enumerate(taus):
        z = scaled_diff[:, column]
        denominator = 1 + z**2
        grad[:, column] = (
            tau - 0.5 + 1 / np.pi * np.arctan(z) + z / (np.pi) * denominator**-1
        )
        hess[:, column] = 2 / (np.pi * s) * denominator ** (-2)

    # Flatten back to the original shape and average over the quantiles.
    flat_grad = grad.reshape(n_values)
    flat_hess = hess.reshape(n_values)
    return -flat_grad / n_quantiles, flat_hess / n_quantiles
@@ -1,222 +0,0 @@
1
- # SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
2
- #
3
- # SPDX-License-Identifier: MPL-2.0
4
- """Defines reporter class."""
5
- import logging
6
- import os
7
- import warnings
8
- from dataclasses import dataclass
9
-
10
- import numpy as np
11
- import pandas as pd
12
- import sklearn
13
- import structlog
14
- from mlflow.models import ModelSignature, infer_signature
15
- from plotly.graph_objects import Figure
16
-
17
- from openstef.metrics import figure
18
- from openstef.metrics.metrics import bias, mae, nsme, r_mae, rmse
19
- from openstef.model.regressors.regressor import OpenstfRegressor
20
- from openstef.settings import Settings
21
-
22
-
23
@dataclass
class Report:
    """Dataclass to hold a report describing the training process.

    The fields are declared so that ``@dataclass`` generates ``__init__``,
    ``__repr__`` and ``__eq__`` from them. With a hand-written ``__init__``
    and no declared fields, the generated ``__eq__`` compared nothing and
    every two reports were considered equal.

    Args:
        feature_importance_figure: Figure with feature importance.
        data_series_figures: Figures with input data time series, by name.
        metrics: Dict with metrics.
        signature: Model signature.

    """

    # Figure with feature importance
    feature_importance_figure: Figure
    # Figures with input data time series, keyed by figure name
    data_series_figures: dict[str, Figure]
    # Dict with metrics
    metrics: dict
    # Model signature
    signature: ModelSignature
47
-
48
-
49
class Reporter:
    """Reporter class that generates reports describing the training process."""

    def __init__(
        self,
        train_data: pd.DataFrame = None,
        validation_data: pd.DataFrame = None,
        test_data: pd.DataFrame = None,
        quantiles: list[float] = None,
    ) -> None:
        """Initializes reporter.

        Args:
            train_data: Dataframe with training data. Although it defaults to
                None it is required in practice: its ``horizon`` column is
                read here.
            validation_data: Dataframe with validation data
            test_data: Dataframe with test data
            quantiles: List of predicted quantiles that have to be plotted.

        """
        self.horizons = train_data.horizon.unique()
        self.predicted_data_list = []
        self.input_data_list = [train_data, validation_data, test_data]
        self.quantiles = [] if quantiles is None else sorted(quantiles)

    def generate_report(
        self,
        model: OpenstfRegressor,
    ) -> Report:
        """Generate a report on a given model.

        Args:
            model: the model to create a report on

        Returns:
            Report object containing info about the model

        """
        # Get training (input_data_list[0]) and validation (input_data_list[1]) set.
        # The first column is the target; the first and last columns are not
        # part of the model input.
        train_x, train_y = (
            self.input_data_list[0].iloc[:, 1:-1],
            self.input_data_list[0].iloc[:, 0],
        )
        valid_x, valid_y = (
            self.input_data_list[1].iloc[:, 1:-1],
            self.input_data_list[1].iloc[:, 0],
        )

        data_series_figures = self._make_data_series_figures(model)

        # feature_importance_dataframe should be a dataframe, to create a figure
        # can be None if we have no feature importance
        if isinstance(model.feature_importance_dataframe, pd.DataFrame):
            feature_importance_figure = figure.plot_feature_importance(
                model.feature_importance_dataframe
            )
        # If it isn't a dataframe the figure is set to None, so it is not created
        else:
            feature_importance_figure = None

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            if model.can_predict_quantiles:
                fiabilities = self.get_fiabilities(
                    {q: model.predict(valid_x, quantile=q) for q in self.quantiles},
                    valid_y,
                )
            else:
                fiabilities = {}

            report = Report(
                data_series_figures=data_series_figures,
                feature_importance_figure=feature_importance_figure,
                metrics={
                    **self.get_metrics(model.predict(valid_x), valid_y),
                    **fiabilities,
                },
                signature=infer_signature(train_x, train_y),
            )

        return report

    @staticmethod
    def get_fiabilities(quantiles: dict[float, np.array], y_true: np.array) -> dict:
        """Calculate per quantile the fraction of predictions at or above the true value.

        Args:
            quantiles: Predicted values per quantile.
            y_true: True values.

        Returns:
            Dict with one ``fiability_at_q<quantile>`` entry per quantile.

        """
        return {
            f"fiability_at_q{alpha}": np.mean(qhat >= y_true)
            for alpha, qhat in quantiles.items()
        }

    @staticmethod
    def get_metrics(y_pred: np.array, y_true: np.array) -> dict:
        """Calculate the metrics for a prediction.

        Args:
            y_pred: np.array
            y_true: np.array

        Returns:
            Metrics for the prediction; metrics that raise a ValueError are left out.

        """
        metric_dict = {
            "bias": bias,
            "NSME": nsme,
            "MAE": mae,
            "R_MAE": r_mae,
            "RMSE": rmse,
            "explained_variance": sklearn.metrics.explained_variance_score,
            "MSE": sklearn.metrics.mean_squared_error,
            "r2": sklearn.metrics.r2_score,
        }
        results = {}
        for name, metric in metric_dict.items():
            try:
                results[name] = metric(y_true, y_pred)
            except ValueError:
                # Deliberate best effort: skip metrics that cannot be calculated
                continue
        return results

    @staticmethod
    def write_report_to_disk(report: Report, report_folder: str):
        """Write report to disk; e.g. for viewing report of latest models using grafana.

        Args:
            report: Report of which the figures are written as html files.
            report_folder: Folder to write to; nothing is written when it is empty.

        """
        # Initialize logger and serializer
        structlog.configure(
            wrapper_class=structlog.make_filtering_bound_logger(
                logging.getLevelName(Settings.log_level)
            )
        )
        logger = structlog.get_logger(__name__)
        if report_folder:
            # Create path if it does not exist; exist_ok avoids a failure when
            # the folder is created by another process in the meantime.
            os.makedirs(report_folder, exist_ok=True)
            logger.info(f"Writing reports to {report_folder}")
            # write feature importance figure
            if report.feature_importance_figure:  # only write if figure is not none
                report.feature_importance_figure.write_html(
                    os.path.join(report_folder, "weight_plot.html")
                )
            # Write predictors. The loop variable is not called ``figure`` so
            # that it does not shadow the imported ``figure`` module.
            for name, series_figure in report.data_series_figures.items():
                if series_figure:  # only write if figure is not none
                    series_figure.write_html(
                        os.path.join(report_folder, f"{name}.html")
                    )

    def _make_data_series_figures(self, model: OpenstfRegressor) -> dict:
        """Make data series figures.

        Args:
            model: Model used to make a forecast for every input data set.

        Returns:
            Dict with one ``Predictor<horizon>`` figure per horizon.

        """
        # Start from an empty list so that calling this method again does not
        # stack the forecasts of a previous call on top of the new ones.
        self.predicted_data_list = []

        # Make model predictions
        for data_set in self.input_data_list:
            # First ("load") and last ("horizon") are removed here
            # as they are not expected by the model as prediction input
            model_input = data_set.iloc[:, 1:-1]
            forecast = pd.DataFrame(
                index=data_set.index, data={"forecast": model.predict(model_input)}
            )

            # Add the lowest and highest quantile when the model supports them
            if model.can_predict_quantiles and len(self.quantiles) >= 2:
                forecast.loc[:, f"q{100 * self.quantiles[0]}"] = model.predict(
                    model_input, quantile=self.quantiles[0]
                )
                forecast.loc[:, f"q{100 * self.quantiles[-1]}"] = model.predict(
                    model_input, quantile=self.quantiles[-1]
                )

            self.predicted_data_list.append(forecast)

        # Make plots for the data series, one per horizon
        return {
            f"Predictor{horizon}": figure.plot_data_series(
                data=self.input_data_list,
                predict_data=self.predicted_data_list,
                horizon=horizon,
            )
            for horizon in self.horizons
        }
@@ -1,3 +0,0 @@
1
- # SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
2
- #
3
- # SPDX-License-Identifier: MPL-2.0