openstef 3.4.56__py3-none-any.whl → 4.0.0a3__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in their respective public registries.
- openstef-4.0.0a3.dist-info/METADATA +177 -0
- openstef-4.0.0a3.dist-info/RECORD +4 -0
- {openstef-3.4.56.dist-info → openstef-4.0.0a3.dist-info}/WHEEL +1 -2
- openstef/__init__.py +0 -14
- openstef/__main__.py +0 -3
- openstef/app_settings.py +0 -19
- openstef/data/NL_terrestrial_radiation.csv +0 -25585
- openstef/data/NL_terrestrial_radiation.csv.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z +0 -0
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md +0 -18
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license +0 -3
- openstef/data/dutch_holidays.csv +0 -1759
- openstef/data/dutch_holidays.csv.license +0 -3
- openstef/data/pv_single_coefs.csv +0 -601
- openstef/data/pv_single_coefs.csv.license +0 -3
- openstef/data_classes/__init__.py +0 -3
- openstef/data_classes/data_prep.py +0 -99
- openstef/data_classes/model_specifications.py +0 -30
- openstef/data_classes/prediction_job.py +0 -135
- openstef/data_classes/split_function.py +0 -97
- openstef/enums.py +0 -140
- openstef/exceptions.py +0 -74
- openstef/feature_engineering/__init__.py +0 -3
- openstef/feature_engineering/apply_features.py +0 -138
- openstef/feature_engineering/bidding_zone_to_country_mapping.py +0 -106
- openstef/feature_engineering/cyclic_features.py +0 -161
- openstef/feature_engineering/data_preparation.py +0 -152
- openstef/feature_engineering/feature_adder.py +0 -206
- openstef/feature_engineering/feature_applicator.py +0 -202
- openstef/feature_engineering/general.py +0 -141
- openstef/feature_engineering/holiday_features.py +0 -231
- openstef/feature_engineering/lag_features.py +0 -165
- openstef/feature_engineering/missing_values_transformer.py +0 -141
- openstef/feature_engineering/rolling_features.py +0 -58
- openstef/feature_engineering/weather_features.py +0 -492
- openstef/metrics/__init__.py +0 -3
- openstef/metrics/figure.py +0 -303
- openstef/metrics/metrics.py +0 -486
- openstef/metrics/reporter.py +0 -222
- openstef/model/__init__.py +0 -3
- openstef/model/basecase.py +0 -82
- openstef/model/confidence_interval_applicator.py +0 -242
- openstef/model/fallback.py +0 -77
- openstef/model/metamodels/__init__.py +0 -3
- openstef/model/metamodels/feature_clipper.py +0 -90
- openstef/model/metamodels/grouped_regressor.py +0 -222
- openstef/model/metamodels/missing_values_handler.py +0 -138
- openstef/model/model_creator.py +0 -214
- openstef/model/objective.py +0 -426
- openstef/model/objective_creator.py +0 -65
- openstef/model/regressors/__init__.py +0 -3
- openstef/model/regressors/arima.py +0 -197
- openstef/model/regressors/custom_regressor.py +0 -64
- openstef/model/regressors/dazls.py +0 -116
- openstef/model/regressors/flatliner.py +0 -95
- openstef/model/regressors/gblinear_quantile.py +0 -334
- openstef/model/regressors/lgbm.py +0 -29
- openstef/model/regressors/linear.py +0 -90
- openstef/model/regressors/linear_quantile.py +0 -305
- openstef/model/regressors/regressor.py +0 -114
- openstef/model/regressors/xgb.py +0 -52
- openstef/model/regressors/xgb_multioutput_quantile.py +0 -261
- openstef/model/regressors/xgb_quantile.py +0 -228
- openstef/model/serializer.py +0 -431
- openstef/model/standard_deviation_generator.py +0 -81
- openstef/model_selection/__init__.py +0 -3
- openstef/model_selection/model_selection.py +0 -311
- openstef/monitoring/__init__.py +0 -3
- openstef/monitoring/performance_meter.py +0 -92
- openstef/monitoring/teams.py +0 -203
- openstef/pipeline/__init__.py +0 -3
- openstef/pipeline/create_basecase_forecast.py +0 -133
- openstef/pipeline/create_component_forecast.py +0 -168
- openstef/pipeline/create_forecast.py +0 -171
- openstef/pipeline/optimize_hyperparameters.py +0 -317
- openstef/pipeline/train_create_forecast_backtest.py +0 -163
- openstef/pipeline/train_model.py +0 -561
- openstef/pipeline/utils.py +0 -52
- openstef/postprocessing/__init__.py +0 -3
- openstef/postprocessing/postprocessing.py +0 -275
- openstef/preprocessing/__init__.py +0 -3
- openstef/preprocessing/preprocessing.py +0 -42
- openstef/settings.py +0 -15
- openstef/tasks/__init__.py +0 -3
- openstef/tasks/calculate_kpi.py +0 -324
- openstef/tasks/create_basecase_forecast.py +0 -118
- openstef/tasks/create_components_forecast.py +0 -162
- openstef/tasks/create_forecast.py +0 -145
- openstef/tasks/create_solar_forecast.py +0 -420
- openstef/tasks/create_wind_forecast.py +0 -80
- openstef/tasks/optimize_hyperparameters.py +0 -135
- openstef/tasks/split_forecast.py +0 -273
- openstef/tasks/train_model.py +0 -224
- openstef/tasks/utils/__init__.py +0 -3
- openstef/tasks/utils/dependencies.py +0 -107
- openstef/tasks/utils/predictionjobloop.py +0 -243
- openstef/tasks/utils/taskcontext.py +0 -160
- openstef/validation/__init__.py +0 -3
- openstef/validation/validation.py +0 -322
- openstef-3.4.56.dist-info/METADATA +0 -154
- openstef-3.4.56.dist-info/RECORD +0 -102
- openstef-3.4.56.dist-info/top_level.txt +0 -1
- /openstef-3.4.56.dist-info/LICENSE → /openstef-4.0.0a3.dist-info/licenses/LICENSE.md +0 -0
```diff
--- a/openstef/model/regressors/custom_regressor.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Alliander N.V. <korte.termijn.prognoses@alliander.com>  # noqa E501
-#
-# SPDX-License-Identifier: MPL-2.0
-"""This module defines the custom regressor."""
-import inspect
-from abc import abstractmethod
-from importlib import import_module
-from typing import Type
-
-import pandas as pd
-
-from openstef.model.objective import (
-    EVAL_METRIC,
-    TEST_FRACTION,
-    VALIDATION_FRACTION,
-    RegressorObjective,
-)
-from openstef.model.regressors.regressor import OpenstfRegressor
-
-
-class CustomOpenstfRegressor(OpenstfRegressor):
-    """A custom regressor allows to load any custom model that is not included with openSTEF."""
-
-    @staticmethod
-    @abstractmethod
-    def valid_kwargs() -> list[str]:
-        ...
-
-    @classmethod
-    @abstractmethod
-    def objective(self) -> Type[RegressorObjective]:
-        ...
-
-
-def load_custom_model(custom_model_path) -> CustomOpenstfRegressor:
-    """Load the external custom model."""
-    path_elements = custom_model_path.split(".")
-    module_path = ".".join(path_elements[:-1])
-    module = import_module(module_path)
-    model_name = path_elements[-1]
-    model_class = getattr(module, model_name)
-
-    if (
-        not inspect.isclass(model_class)
-        or inspect.isabstract(model_class)
-        or not issubclass(model_class, CustomOpenstfRegressor)
-    ):
-        raise ValueError(
-            f"The path {custom_model_path!r} does not correspond to a concrete"
-            " CustomOpenstfRegressor subclass"
-        )
-
-    return model_class
-
-
-def is_custom_type(model_type):
-    return isinstance(model_type, str) and "." in model_type
-
-
-def create_custom_objective(
-    custom_model_path,
-):
-    model_class = load_custom_model(custom_model_path)
-    return model_class.objective()
```
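Context for the removal above: `load_custom_model` turned a dotted import path into a concrete `CustomOpenstfRegressor` subclass. A minimal standalone sketch of that resolution pattern (the `resolve_class` helper and the stdlib path below are illustrative, not part of OpenSTEF):

```python
import inspect
from importlib import import_module

def resolve_class(dotted_path: str) -> type:
    """Resolve 'package.module.ClassName' to the class object, like load_custom_model did."""
    module_path, _, class_name = dotted_path.rpartition(".")
    cls = getattr(import_module(module_path), class_name)
    if not inspect.isclass(cls) or inspect.isabstract(cls):
        raise ValueError(f"{dotted_path!r} is not a concrete class")
    return cls

# Resolves a stdlib class as a stand-in for a custom regressor path:
print(resolve_class("collections.OrderedDict"))  # <class 'collections.OrderedDict'>
```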
```diff
--- a/openstef/model/regressors/dazls.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com>  # noqa E501
-#
-# SPDX-License-Identifier: MPL-2.0
-"""This module defines the DAZL model."""
-import numpy as np
-from sklearn.base import BaseEstimator
-from sklearn.compose import TransformedTargetRegressor
-from sklearn.linear_model import LinearRegression
-from sklearn.metrics import mean_squared_error, r2_score
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import MinMaxScaler
-
-
-class Dazls(BaseEstimator):
-    """DAZLS model.
-
-    The model carries out wind and solar power prediction for unseen target substations using training data from other
-    substations with known components.
-
-    """
-
-    model_: Pipeline
-
-    def __init__(self):
-        """Initialize DAZL model."""
-        self.__name__ = "DAZLS"
-
-        regressor = TransformedTargetRegressor(
-            regressor=LinearRegression(),
-            transformer=MinMaxScaler(clip=True),
-        )
-
-        self.model_ = Pipeline(
-            [("scaler", MinMaxScaler(clip=True)), ("regressor", regressor)]
-        )
-
-        # The input columns for the domain and adaptation models (with description)
-        self.baseline_input_columns = [
-            "radiation",  # Weather parameter
-            "windspeed_100m",  # Weather parameter
-            "total_load",
-        ]
-        self.target_columns = ["total_wind_part", "total_solar_part"]
-
-    def fit(self, features, target):
-        """Fit the model.
-
-        In this function we scale the input of the domain and adaptation models of the DAZLS MODEL. Then we fit the
-        two models. We separate the features into domain_model_input, adaptation_model_input and target, and we use them
-        for the fitting and the training of the models.
-
-        Args:
-            features: inputs for domain and adaptation model (domain_model_input, adaptation_model_input)
-            target: the expected output (y_train)
-
-        """
-        x, y = (
-            features.loc[:, self.baseline_input_columns],
-            target.loc[:, self.target_columns],
-        )
-
-        self.model_.fit(x, y)
-
-    def predict(self, x: np.array):
-        """Make a prediction.
-
-        For the prediction we use the test data x. We use domain_model_input_columns and
-        adaptation_model_input_columns to separate x in test data for domain model and adaptation model respectively.
-
-        There is an option available to return the domain model and adaptation model predictions separately to more
-        easily investigate the effectiveness of the models.
-
-        Args:
-            x: domain_model_test_data, adaptation_model_test_data
-            return_sub_preds : a flag value indicating to return the predictions of the domain model and adaptation
-                model separately. (Default: False.)
-
-        Returns:
-            prediction: The output prediction after both models.
-
-        """
-        model_test_data = x.loc[:, self.baseline_input_columns]
-
-        return self.model_.predict(model_test_data)
-
-    def score(self, truth, prediction):
-        """Evaluation of the prediction's output.
-
-        Args:
-            truth: real values
-            prediction: predicted values
-
-        Returns:
-            RMSE and R2 scores
-
-        """
-        rmse = (mean_squared_error(truth, prediction)) ** 0.5
-        r2_score_value = r2_score(truth, prediction)
-        return rmse, r2_score_value
-
-    def __str__(self):
-        """String method of the DAZLs model, provides a summary of the model for easy inspection.
-
-        Returns:
-            Summary represented by a string
-
-        """
-        summary_str = (
-            f"{self.__name__} model summary:\n\n"
-            f"Model: {self.model_} \n"
-            f"\tInput columns: {self.baseline_input_columns} \n"
-            f"\tScaler: {self.model_['scaler']} \n\n"
-            f"\tRegressor: {self.model_['regressor']} \n\n"
-        )
-
-        return summary_str
```
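Structurally, the deleted `Dazls` estimator is a `MinMaxScaler` feeding a target-scaled `LinearRegression`, mapping three input columns to two component targets. A sketch of the same pipeline shape on synthetic data (column names come from the deleted source; the data and relationships are made up for illustration):

```python
import numpy as np
import pandas as pd
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

rng = np.random.default_rng(0)
features = pd.DataFrame(
    rng.uniform(size=(100, 3)),
    columns=["radiation", "windspeed_100m", "total_load"],
)
# Synthetic wind/solar components loosely derived from the inputs.
target = pd.DataFrame(
    {
        "total_wind_part": features["windspeed_100m"] * 0.8,
        "total_solar_part": features["radiation"] * 0.6,
    }
)

# Same structure as the deleted Dazls.model_ pipeline.
model = Pipeline(
    [
        ("scaler", MinMaxScaler(clip=True)),
        (
            "regressor",
            TransformedTargetRegressor(
                regressor=LinearRegression(),
                transformer=MinMaxScaler(clip=True),
            ),
        ),
    ]
)
model.fit(features, target)
print(model.predict(features.head()))  # two columns: wind part, solar part
```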
```diff
--- a/openstef/model/regressors/flatliner.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com>  # noqa E501
-#
-# SPDX-License-Identifier: MPL-2.0
-import re
-from typing import List
-
-import numpy as np
-import pandas as pd
-from sklearn.base import RegressorMixin
-from sklearn.utils.validation import check_is_fitted
-
-from openstef.model.regressors.regressor import OpenstfRegressor
-
-
-class FlatlinerRegressor(OpenstfRegressor, RegressorMixin):
-    feature_names_: List[str] = []
-
-    def __init__(self, quantiles=None):
-        """Initialize FlatlinerRegressor.
-
-        The model always predicts 0.0, regardless of the input features. The model is meant to be used for flatliner
-        locations that still expect a prediction while preserving the prediction interface.
-
-        """
-        super().__init__()
-        self.quantiles = quantiles
-
-    @property
-    def feature_names(self) -> list:
-        """The names of the features used to train the model."""
-        check_is_fitted(self)
-        return self.feature_names_
-
-    @staticmethod
-    def _get_importance_names():
-        return {
-            "gain_importance_name": "total_gain",
-            "weight_importance_name": "weight",
-        }
-
-    @property
-    def can_predict_quantiles(self) -> bool:
-        """Attribute that indicates if the model predict particular quantiles."""
-        return True
-
-    def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin:
-        """Fits flatliner model.
-
-        Args:
-            x: Feature matrix
-            y: Labels
-
-        Returns:
-            Fitted LinearQuantile model
-
-        """
-        self.feature_names_ = list(x.columns)
-        self.feature_importances_ = np.ones(len(self.feature_names_)) / (
-            len(self.feature_names_) or 1.0
-        )
-
-        return self
-
-    def predict(self, x: pd.DataFrame, quantile: float = 0.5, **kwargs) -> np.array:
-        """Makes a prediction for a desired quantile.
-
-        Args:
-            x: Feature matrix
-            quantile: Quantile for which a prediciton is desired,
-                note that only quantile are available for which a model is trained,
-                and that this is a quantile-model specific keyword
-
-        Returns:
-            Prediction
-
-        Raises:
-            ValueError in case no model is trained for the requested quantile
-
-        """
-        check_is_fitted(self)
-
-        return np.zeros(x.shape[0])
-
-    def _get_feature_importance_from_linear(self, quantile: float = 0.5) -> np.array:
-        check_is_fitted(self)
-        return np.array([0.0 for _ in self.feature_names_])
-
-    @classmethod
-    def _get_param_names(cls):
-        return [
-            "quantiles",
-        ]
-
-    def __sklearn_is_fitted__(self) -> bool:
-        return True
```
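Usage-wise, the flatliner preserved the quantile-prediction interface while always returning zeros. A short sketch of that contract, assuming openstef 3.4.x is still installed:

```python
import pandas as pd
from openstef.model.regressors.flatliner import FlatlinerRegressor

x = pd.DataFrame({"windspeed_100m": [3.0, 4.0], "radiation": [0.1, 0.2]})
y = pd.Series([0.0, 0.0], name="load")

model = FlatlinerRegressor(quantiles=(0.1, 0.5, 0.9))
model.fit(x, y)
print(model.predict(x, quantile=0.9))  # array([0., 0.]) for any quantile
print(model.feature_names)             # ['windspeed_100m', 'radiation']
```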
```diff
--- a/openstef/model/regressors/gblinear_quantile.py
+++ /dev/null
@@ -1,334 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com>  # noqa E501
-#
-# SPDX-License-Identifier: MPL-2.0
-import math
-import re
-from typing import Union, Optional, List
-
-import numpy as np
-import pandas as pd
-from sklearn.model_selection import train_test_split
-import xgboost as xgb
-from sklearn.preprocessing import StandardScaler
-from sklearn.utils.validation import check_is_fitted
-
-from openstef.feature_engineering.missing_values_transformer import (
-    MissingValuesTransformer,
-)
-from openstef.model.metamodels.feature_clipper import FeatureClipper
-from openstef.model.regressors.regressor import OpenstfRegressor
-
-DEFAULT_QUANTILES: tuple[float, ...] = (0.9, 0.5, 0.1)
-
-
-class GBLinearQuantileOpenstfRegressor(OpenstfRegressor):
-    is_fitted_: bool = False
-
-    TO_KEEP_FEATURES: List[str] = [
-        "T-7d",
-        "T-1d",
-    ]
-    TO_IGNORE_FEATURES: List[str] = [
-        "Month",
-        "Quarter",
-    ]
-
-    def __init__(
-        self,
-        quantiles: tuple[float, ...] = DEFAULT_QUANTILES,
-        missing_values: Union[int, float, str, None] = np.nan,
-        imputation_strategy: Optional[str] = "mean",
-        fill_value: Union[str, int, float] = None,
-        weight_scale_percentile: int = 95,
-        weight_exponent: float = 1,
-        weight_floor: float = 0.1,
-        validation_fraction: float = 0.2,
-        no_fill_future_values_features: List[str] = None,
-        clipped_features: List[str] = None,
-        learning_rate: float = 0.15,
-        num_boost_round: int = 500,
-        early_stopping_rounds: int = 10,
-        reg_alpha: float = 0.0001,
-        reg_lambda: float = 0.1,
-        updater: str = "shotgun",
-        feature_selector: str = "shuffle",
-        top_k: int = 0,
-    ):
-        super().__init__()
-
-        # Check if quantile 0.5 is present. This is required.
-        if 0.5 not in quantiles:
-            raise ValueError(
-                "Cannot train quantile model as 0.5 is not in requested quantiles!"
-            )
-
-        if clipped_features is None:
-            clipped_features = ["APX"]
-
-        self.quantiles = quantiles
-        self.weight_scale_percentile = weight_scale_percentile
-        self.weight_exponent = weight_exponent
-        self.weight_floor = weight_floor
-        self.imputer_ = MissingValuesTransformer(
-            missing_values=missing_values,
-            imputation_strategy=imputation_strategy,
-            fill_value=fill_value,
-            no_fill_future_values_features=no_fill_future_values_features,
-        )
-        self.x_scaler_ = StandardScaler()
-        self.y_scaler_ = StandardScaler()
-        self.validation_fraction = validation_fraction
-        self.model_: xgb.Booster = None
-        self.feature_clipper_ = FeatureClipper(columns=clipped_features)
-
-        self.learning_rate = learning_rate
-        self.num_boost_round = num_boost_round
-        self.early_stopping_rounds = early_stopping_rounds
-        self.reg_alpha = reg_alpha
-        self.reg_labmda = reg_lambda
-        self.updater = updater
-        self.feature_selector = feature_selector
-        self.top_k = top_k
-
-    @property
-    def feature_names(self) -> list:
-        """The names of the features used to train the model."""
-        check_is_fitted(self)
-        return self.imputer_.non_null_feature_names
-
-    @staticmethod
-    def _get_importance_names():
-        return {
-            "gain_importance_name": "total_gain",
-            "weight_importance_name": "weight",
-        }
-
-    @property
-    def can_predict_quantiles(self) -> bool:
-        """Attribute that indicates if the model predict particular quantiles."""
-        return True
-
-    def _is_feature_ignored(self, feature_name: str) -> bool:
-        """Check if a feature is ignored by the model.
-
-        Args:
-            feature_name: Feature name
-
-        Returns:
-            True if the feature is ignored, False otherwise
-
-        """
-
-        if feature_name in self.TO_KEEP_FEATURES:
-            return False
-
-        return (
-            # Ignore named features
-            feature_name in self.TO_IGNORE_FEATURES
-            or
-            # Ignore holiday features
-            re.match(r"is_", feature_name) is not None
-            or
-            # Ignore lag features
-            re.match(r"T-", feature_name) is not None
-            or
-            # Ignore infeed MFFBAS profiles
-            re.match(r"E\d.*_I", feature_name) is not None
-        )
-
-    def _remove_ignored_features(self, x: pd.DataFrame) -> pd.DataFrame:
-        """Remove ignored features from the input data.
-
-        Args:
-            x: Input data
-
-        Returns:
-            Data without ignored features
-
-        """
-        return x.drop(columns=[c for c in x.columns if self._is_feature_ignored(c)])
-
-    def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> OpenstfRegressor:
-        if not isinstance(y, pd.Series):
-            y = pd.Series(np.asarray(y), name="load")
-
-        x = self._remove_ignored_features(x)
-        self.feature_clipper_.fit(x)
-
-        # Fix nan columns
-        x, y = self.imputer_.fit_transform(x, y)
-        if x.isna().any().any():
-            raise ValueError(
-                "There are nan values in the input data. Set "
-                "imputation_strategy to solve them."
-            )
-
-        # Apply feature scaling
-        x_scaled = self.x_scaler_.fit_transform(x)
-        y_scaled = self.y_scaler_.fit_transform(y.to_frame())[:, 0]
-
-        # Add more focus on extreme / peak values
-        sample_weight = self._calculate_sample_weights(y.values.squeeze())
-
-        # Split the data into training and validation sets
-        x_train, x_val, y_train, y_val, weight_train, weight_val = train_test_split(
-            x_scaled,
-            y_scaled,
-            sample_weight,
-            test_size=self.validation_fraction,
-            random_state=42,
-        )
-
-        # Preserve feature names
-        x_train = pd.DataFrame(x_train, columns=x.columns)
-        x_val = pd.DataFrame(x_val, columns=x.columns)
-
-        dtrain = xgb.DMatrix(x_train, label=y_train, weight=weight_train)
-        dval = xgb.DMatrix(x_val, label=y_val, weight=weight_val)
-
-        xgb_params = {
-            # Use the quantile objective function.
-            "objective": "reg:quantileerror",  # This is pinball loss
-            "booster": "gblinear",
-            "updater": self.updater,
-            "alpha": self.reg_alpha,
-            "lambda": self.reg_labmda,
-            "feature_selector": self.feature_selector,
-            "quantile_alpha": np.array(self.quantiles),
-            "learning_rate": self.learning_rate,
-        }
-
-        if self.top_k > 0:
-            xgb_params["top_k"] = self.top_k
-
-        self.model_ = xgb.train(
-            params=xgb_params,
-            dtrain=dtrain,
-            num_boost_round=self.num_boost_round,
-            early_stopping_rounds=self.early_stopping_rounds,
-            evals=[(dtrain, "train"), (dval, "val")],
-        )
-
-        self._Booster = self.model_
-
-        self.is_fitted_ = True
-
-        self.feature_importances_ = self._get_feature_importances_from_booster(
-            self.model_
-        )
-
-        return self
-
-    def _calculate_sample_weights(self, y: np.array):
-        """Calculate sample weights based on the y values of arbitrary scale.
-
-        The resulting weights are in the range [0,1] and are used to put more emphasis
-        on certain samples. The sample weighting function does:
-
-        * Rescale data to a [-1, 1] range using quantile scaling. 90% of the data will
-          be within this range. Rest is outside.
-        * Calculate the weight by taking the exponent of scaled data.
-          * exponent=0: Results in uniform weights for all samples.
-          * exponent=1: Results in linearly increasing weights for samples that are
-            closer to the extremes.
-          * exponent>1: Results in exponentially increasing weights for samples that are
-            closer to the extremes.
-        * Clip the data to [0, 1] range with weight_floor as the minimum weight.
-          * Weight floor is used to make sure that all the samples are considered.
-
-        """
-        return np.clip(
-            _weight_exp(
-                _scale_percentile(y, percentile=self.weight_scale_percentile),
-                exponent=self.weight_exponent,
-            ),
-            a_min=self.weight_floor,
-            a_max=1,
-        )
-
-    def predict(self, x: pd.DataFrame, quantile: float = 0.5, **kwargs) -> np.array:
-        check_is_fitted(self)
-
-        # Preprocess input data
-        x = self._remove_ignored_features(x)
-        x = self.feature_clipper_.transform(x)
-        x = self.imputer_.transform(x)
-        x_scaled = self.x_scaler_.transform(x)
-
-        # Preserve feature names
-        x_scaled = pd.DataFrame(x_scaled, columns=x.columns)
-
-        d_x_scaled = xgb.DMatrix(x_scaled)
-
-        # Make prediction
-        y_pred = self.model_.predict(d_x_scaled)
-
-        # When multiple quantiles are trained,
-        # we need to select the requested quantile
-        if len(self.quantiles) > 1:
-            # Get index of the quantile value in the quantiles list
-            quantile_index = self.quantiles.index(quantile)
-
-            # Get the quantile prediction
-            y_pred = y_pred[:, quantile_index]
-
-        # Inverse scaling
-        y_pred = self.y_scaler_.inverse_transform(y_pred.reshape(-1, 1))[:, 0]
-
-        return y_pred
-
-    @classmethod
-    def _get_feature_importances_from_booster(cls, booster: xgb.Booster) -> np.ndarray:
-        """Gets feature importances from a XGB booster.
-
-        This is based on the feature_importance_ property defined in:
-        https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py.
-
-        Args:
-            booster: Booster object,
-                most of the times the median model (quantile=0.5) is preferred
-
-        Returns:
-            Ndarray with normalized feature importances.
-
-        """
-        # Get score
-        score = booster.get_score(importance_type="weight")
-
-        if type(next(iter(score.values()))) is list:
-            num_quantiles = len(next(iter(score.values())))
-
-            # Select middle quantile, assuming odd number of quantiles
-            quantile_index = num_quantiles // 2
-
-            score = {f: score[f][quantile_index] for f in score}
-
-        # Get feature names from booster
-        feature_names = booster.feature_names
-
-        # Get importance
-        feature_importance = [score.get(f, 0.0) for f in feature_names]
-        # Convert to array
-        features_importance_array = np.array(feature_importance, dtype=np.float32)
-
-        total = features_importance_array.sum()  # For normalizing
-        if total == 0:
-            return features_importance_array
-        return features_importance_array / total  # Normalize
-
-    @classmethod
-    def _get_param_names(cls):
-        return [
-            "quantiles",
-        ]
-
-    def __sklearn_is_fitted__(self) -> bool:
-        return self.is_fitted_
-
-
-def _scale_percentile(x: np.ndarray, percentile: int = 95):
-    return np.abs(x / np.percentile(np.abs(x), percentile))
-
-
-def _weight_exp(x: np.ndarray, exponent: float = 1):
-    return np.abs(x) ** exponent
```
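The sample-weighting scheme documented in `_calculate_sample_weights` is easy to reproduce standalone. A numeric sketch of the same percentile-scale, exponentiate, clip chain (the `sample_weights` helper is illustrative and mirrors the deleted `_scale_percentile`/`_weight_exp` helpers):

```python
import numpy as np

def sample_weights(y, percentile=95, exponent=1.0, floor=0.1):
    # Scale by the chosen percentile of |y|, raise to the exponent,
    # then clip into [floor, 1] so every sample keeps some weight.
    scaled = np.abs(y / np.percentile(np.abs(y), percentile))
    return np.clip(scaled**exponent, a_min=floor, a_max=1.0)

y = np.array([0.0, 10.0, -50.0, 100.0, -200.0])
print(sample_weights(y))
# [0.1    0.1    0.2778 0.5556 1.    ] -- extremes get full weight,
# small loads are floored at 0.1.
```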
```diff
--- a/openstef/model/regressors/lgbm.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com>  # noqa E501
-#
-# SPDX-License-Identifier: MPL-2.0
-
-from lightgbm import LGBMRegressor
-
-from openstef.model.regressors.regressor import OpenstfRegressor
-
-
-class LGBMOpenstfRegressor(LGBMRegressor, OpenstfRegressor):
-    """LGBM Regressor which implements the Openstf regressor API."""
-
-    gain_importance_name = "gain"
-    weight_importance_name = "split"
-
-    @property
-    def feature_names(self):
-        return self._Booster.feature_name()
-
-    @property
-    def can_predict_quantiles(self):
-        return False
-
-    @staticmethod
-    def _get_importance_names():
-        return {
-            "gain_importance_name": "gain",
-            "weight_importance_name": "split",
-        }
```
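The LightGBM wrapper adds only metadata on top of `LGBMRegressor`; note that `can_predict_quantiles` is `False`, so callers needing quantile forecasts had to pick a different regressor. A usage sketch, assuming openstef 3.4.x and lightgbm are installed (the data is made up):

```python
import numpy as np
import pandas as pd
from openstef.model.regressors.lgbm import LGBMOpenstfRegressor

rng = np.random.default_rng(0)
x = pd.DataFrame(rng.normal(size=(200, 3)),
                 columns=["radiation", "windspeed_100m", "temperature"])
y = pd.Series(x.sum(axis=1), name="load")

model = LGBMOpenstfRegressor(n_estimators=10)  # LGBMRegressor kwargs pass through
model.fit(x, y)
print(model.feature_names)          # booster feature names, via _Booster
print(model.can_predict_quantiles)  # False: point forecasts only
```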