openstef 3.4.56__py3-none-any.whl → 4.0.0a3__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- openstef-4.0.0a3.dist-info/METADATA +177 -0
- openstef-4.0.0a3.dist-info/RECORD +4 -0
- {openstef-3.4.56.dist-info → openstef-4.0.0a3.dist-info}/WHEEL +1 -2
- openstef/__init__.py +0 -14
- openstef/__main__.py +0 -3
- openstef/app_settings.py +0 -19
- openstef/data/NL_terrestrial_radiation.csv +0 -25585
- openstef/data/NL_terrestrial_radiation.csv.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z +0 -0
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md +0 -18
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license +0 -3
- openstef/data/dutch_holidays.csv +0 -1759
- openstef/data/dutch_holidays.csv.license +0 -3
- openstef/data/pv_single_coefs.csv +0 -601
- openstef/data/pv_single_coefs.csv.license +0 -3
- openstef/data_classes/__init__.py +0 -3
- openstef/data_classes/data_prep.py +0 -99
- openstef/data_classes/model_specifications.py +0 -30
- openstef/data_classes/prediction_job.py +0 -135
- openstef/data_classes/split_function.py +0 -97
- openstef/enums.py +0 -140
- openstef/exceptions.py +0 -74
- openstef/feature_engineering/__init__.py +0 -3
- openstef/feature_engineering/apply_features.py +0 -138
- openstef/feature_engineering/bidding_zone_to_country_mapping.py +0 -106
- openstef/feature_engineering/cyclic_features.py +0 -161
- openstef/feature_engineering/data_preparation.py +0 -152
- openstef/feature_engineering/feature_adder.py +0 -206
- openstef/feature_engineering/feature_applicator.py +0 -202
- openstef/feature_engineering/general.py +0 -141
- openstef/feature_engineering/holiday_features.py +0 -231
- openstef/feature_engineering/lag_features.py +0 -165
- openstef/feature_engineering/missing_values_transformer.py +0 -141
- openstef/feature_engineering/rolling_features.py +0 -58
- openstef/feature_engineering/weather_features.py +0 -492
- openstef/metrics/__init__.py +0 -3
- openstef/metrics/figure.py +0 -303
- openstef/metrics/metrics.py +0 -486
- openstef/metrics/reporter.py +0 -222
- openstef/model/__init__.py +0 -3
- openstef/model/basecase.py +0 -82
- openstef/model/confidence_interval_applicator.py +0 -242
- openstef/model/fallback.py +0 -77
- openstef/model/metamodels/__init__.py +0 -3
- openstef/model/metamodels/feature_clipper.py +0 -90
- openstef/model/metamodels/grouped_regressor.py +0 -222
- openstef/model/metamodels/missing_values_handler.py +0 -138
- openstef/model/model_creator.py +0 -214
- openstef/model/objective.py +0 -426
- openstef/model/objective_creator.py +0 -65
- openstef/model/regressors/__init__.py +0 -3
- openstef/model/regressors/arima.py +0 -197
- openstef/model/regressors/custom_regressor.py +0 -64
- openstef/model/regressors/dazls.py +0 -116
- openstef/model/regressors/flatliner.py +0 -95
- openstef/model/regressors/gblinear_quantile.py +0 -334
- openstef/model/regressors/lgbm.py +0 -29
- openstef/model/regressors/linear.py +0 -90
- openstef/model/regressors/linear_quantile.py +0 -305
- openstef/model/regressors/regressor.py +0 -114
- openstef/model/regressors/xgb.py +0 -52
- openstef/model/regressors/xgb_multioutput_quantile.py +0 -261
- openstef/model/regressors/xgb_quantile.py +0 -228
- openstef/model/serializer.py +0 -431
- openstef/model/standard_deviation_generator.py +0 -81
- openstef/model_selection/__init__.py +0 -3
- openstef/model_selection/model_selection.py +0 -311
- openstef/monitoring/__init__.py +0 -3
- openstef/monitoring/performance_meter.py +0 -92
- openstef/monitoring/teams.py +0 -203
- openstef/pipeline/__init__.py +0 -3
- openstef/pipeline/create_basecase_forecast.py +0 -133
- openstef/pipeline/create_component_forecast.py +0 -168
- openstef/pipeline/create_forecast.py +0 -171
- openstef/pipeline/optimize_hyperparameters.py +0 -317
- openstef/pipeline/train_create_forecast_backtest.py +0 -163
- openstef/pipeline/train_model.py +0 -561
- openstef/pipeline/utils.py +0 -52
- openstef/postprocessing/__init__.py +0 -3
- openstef/postprocessing/postprocessing.py +0 -275
- openstef/preprocessing/__init__.py +0 -3
- openstef/preprocessing/preprocessing.py +0 -42
- openstef/settings.py +0 -15
- openstef/tasks/__init__.py +0 -3
- openstef/tasks/calculate_kpi.py +0 -324
- openstef/tasks/create_basecase_forecast.py +0 -118
- openstef/tasks/create_components_forecast.py +0 -162
- openstef/tasks/create_forecast.py +0 -145
- openstef/tasks/create_solar_forecast.py +0 -420
- openstef/tasks/create_wind_forecast.py +0 -80
- openstef/tasks/optimize_hyperparameters.py +0 -135
- openstef/tasks/split_forecast.py +0 -273
- openstef/tasks/train_model.py +0 -224
- openstef/tasks/utils/__init__.py +0 -3
- openstef/tasks/utils/dependencies.py +0 -107
- openstef/tasks/utils/predictionjobloop.py +0 -243
- openstef/tasks/utils/taskcontext.py +0 -160
- openstef/validation/__init__.py +0 -3
- openstef/validation/validation.py +0 -322
- openstef-3.4.56.dist-info/METADATA +0 -154
- openstef-3.4.56.dist-info/RECORD +0 -102
- openstef-3.4.56.dist-info/top_level.txt +0 -1
- /openstef-3.4.56.dist-info/LICENSE → /openstef-4.0.0a3.dist-info/licenses/LICENSE.md +0 -0
openstef/model/regressors/xgb_multioutput_quantile.py
@@ -1,261 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
-#
-# SPDX-License-Identifier: MPL-2.0
-from functools import partial
-from typing import Dict, Optional, Sequence, Tuple, Union
-
-import numpy as np
-import pandas as pd
-import sklearn.base
-import xgboost as xgb
-from sklearn.compose import TransformedTargetRegressor
-from sklearn.preprocessing import StandardScaler
-from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
-from xgboost import Booster
-
-import openstef.metrics.metrics as metrics
-from openstef.model.regressors.regressor import OpenstfRegressor
-
-DEFAULT_QUANTILES: tuple[float, ...] = (0.9, 0.5, 0.1)
-
-
-class XGBMultiOutputQuantileOpenstfRegressor(OpenstfRegressor):
-    r"""Model that provides multioutput quantile regression with XGBoost by default using the arctan loss function.
-
-    Arctan loss:
-    Reference: https://github.com/LaurensSluyterman/XGBoost_quantile_regression/tree/master
-    The key idea is to use a smooth approximation of the pinball loss, the arctan
-    pinball loss, that has a relatively large second derivative.
-
-    The approximation is given by:
-    $$L^{(\text{arctan})}_{\tau, s}(u) = (\tau - 0.5 + \frac{\arctan (u/s)}{\pi})u + \frac{s}{\pi}$$. # noqa E501
-
-    Some important settings:
-
-    * The parameter $s$ in the loss function determines the amount of smoothing. A
-      smaller value gives a closer approximation but also a much smaller second
-      derivative. A larger value gives more conservative quantiles: when $\tau$
-      is larger than 0.5, the quantile becomes larger, and vice versa.
-      Values between 0.05 and 0.1 appear to work well. It may be a good idea to
-      optimize this parameter.
-    * Set min-child-weight to zero. The second derivatives can be a lot smaller
-      than 1 and this parameter may prevent any splits.
-    * Use a relatively small max-delta-step. We used a default of 0.5.
-      This prevents excessive steps that could happen due to the relatively
-      small second derivative.
-    * For the same reason, use a slightly lower learning rate of 0.05.
-
-    """
-
-    estimator_: TransformedTargetRegressor
-    quantile_indices_: Dict[float, int]
-
-    @staticmethod
-    def _get_importance_names():
-        return {
-            "gain_importance_name": "total_gain",
-            "weight_importance_name": "weight",
-        }
-
-    def __init__(
-        self,
-        quantiles: tuple[float, ...] = DEFAULT_QUANTILES,
-        gamma: float = 0.0,
-        colsample_bytree: float = 1.0,
-        subsample: float = 1.0,
-        min_child_weight: int = 0,
-        max_depth: int = 6,
-        learning_rate: float = 0.22,
-        alpha: float = 0.0,
-        max_delta_step: int = 0.5,
-        arctan_smoothing: float = 0.055,
-        early_stopping_rounds: Optional[int] = None,
-    ):
-        """Initialize XGBMultiQuantileRegressor.
-
-        Model that provides quantile regression with XGBoost.
-        For each desired quantile an XGBoost model is trained;
-        these can later be used to predict quantiles.
-
-        Args:
-            quantiles: Tuple with desired quantiles, quantile 0.5 is required.
-                For example: (0.1, 0.5, 0.9)
-            gamma: Gamma.
-            colsample_bytree: Colsample by tree.
-            subsample: Subsample.
-            min_child_weight: Minimum child weight.
-            max_depth: Maximum depth.
-            learning_rate: Learning rate.
-            alpha: Alpha.
-            max_delta_step: Maximum delta step.
-            arctan_smoothing: Smoothing parameter of the arctan loss function.
-            early_stopping_rounds: Number of rounds to stop training if no improvement
-                is made.
-
-        Raises:
-            ValueError in case quantile 0.5 is not in the requested quantiles.
-
-        """
-        super().__init__()
-        if 0.5 not in quantiles:
-            raise ValueError(
-                "Cannot train quantile model as 0.5 is not in requested quantiles!"
-            )
-
-        self.quantiles = quantiles
-
-        # Set attributes for hyperparameters
-        self.subsample = subsample
-        self.min_child_weight = min_child_weight
-        self.max_depth = max_depth
-        self.gamma = gamma
-        self.alpha = alpha
-        self.max_delta_step = max_delta_step
-        self.colsample_bytree = colsample_bytree
-        self.learning_rate = learning_rate
-        self.early_stopping_rounds = early_stopping_rounds
-        self.arctan_smoothing = arctan_smoothing
-
-        # Get fitting parameters - only those accepted by the XGBoost regressor
-        xgb_regressor_params = {
-            key: value
-            for key, value in self.get_params().items()
-            if key in xgb.XGBRegressor().get_params().keys()
-        }
-
-        # Define the model
-        objective = partial(
-            metrics.arctan_loss, taus=self.quantiles, s=arctan_smoothing
-        )
-        xgb_model: xgb.XGBRegressor = xgb.XGBRegressor(
-            objective=objective,
-            base_score=0,
-            multi_strategy="one_output_per_tree",
-            **xgb_regressor_params,
-        )
-        self.estimator_ = TransformedTargetRegressor(
-            regressor=xgb_model, transformer=StandardScaler()
-        )
-
-        # Set quantile indices to remap multioutput predictions
-        self.quantile_indices_ = {
-            quantile: i for i, quantile in enumerate(self.quantiles)
-        }
-
-    def fit(
-        self,
-        x: np.array,
-        y: np.array,
-        eval_set: Optional[Sequence[Tuple[np.array, np.array]]] = None,
-        verbose: Optional[Union[bool, int]] = 0,
-        **kwargs
-    ) -> OpenstfRegressor:
-        """Fits xgb quantile model.
-
-        Args:
-            x: Feature matrix.
-            y: Labels.
-            eval_set: Evaluation set to monitor training performance.
-            verbose: Verbosity level (disabled by default).
-
-        Returns:
-            Fitted XGBQuantile model.
-
-        """
-        if isinstance(y, pd.Series):
-            y = y.to_numpy()
-
-        if not isinstance(x, pd.DataFrame):
-            x = pd.DataFrame(np.asarray(x))
-
-        # Check/validate input
-        check_X_y(x, y, force_all_finite="allow-nan")
-
-        # Prepare inputs
-        y_multioutput = replicate_for_multioutput(y, len(self.quantiles))
-
-        # Define watchlist if eval_set is defined
-        eval_set_multioutput = []
-        if eval_set:
-            for x_eval, y_eval in eval_set:
-                if isinstance(y_eval, pd.Series):
-                    y_eval = y_eval.to_numpy()
-
-                y_eval_multioutput = replicate_for_multioutput(
-                    y=y_eval, num_quantiles=len(self.quantiles)
-                )
-                eval_set_multioutput.append((x_eval, y_eval_multioutput))
-
-            eval_set_multioutput.append((x, y_multioutput))
-
-        self.estimator_.fit(
-            X=x.copy(deep=True),
-            y=y_multioutput,
-            eval_set=eval_set_multioutput,
-            verbose=verbose,
-        )
-
-        # Update state of the estimator
-        self.feature_importances_ = self.estimator_.regressor_.feature_importances_
-        self.is_fitted_ = True
-
-        return self
-
-    def predict(self, x: np.array, quantile: float = 0.5) -> np.array:
-        """Makes a prediction for a desired quantile.
-
-        Args:
-            x: Feature matrix.
-            quantile: Quantile for which a prediction is desired;
-                note that predictions are only available for quantiles for which
-                a model is trained, and that this is a quantile-model specific keyword.
-
-        Returns:
-            Prediction
-
-        Raises:
-            ValueError in case no model is trained for the requested quantile.
-
-        """
-        # Check if model is trained for this quantile
-        if quantile not in self.quantiles:
-            raise ValueError("No model trained for requested quantile!")
-
-        # Check/validate input
-        check_array(x, force_all_finite="allow-nan")
-        check_is_fitted(self)
-
-        # best_iteration is only available if early stopping was used during training
-        prediction: np.array
-        if hasattr(self.estimator_, "best_iteration"):
-            prediction = self.estimator_.predict(
-                X=x,
-                iteration_range=(0, self.estimator_.best_iteration + 1),
-            )
-        else:
-            prediction = self.estimator_.predict(X=x)
-
-        quantile_index = self.quantile_indices_[quantile]
-        return prediction[:, quantile_index]
-
-    @property
-    def feature_names(self):
-        return self.estimator_.feature_names_in_
-
-    @property
-    def can_predict_quantiles(self):
-        return True
-
-
-def replicate_for_multioutput(y: np.array, num_quantiles: int) -> np.array:
-    """Replicates a 1D array to a 2D array for multioutput regression.
-
-    Args:
-        y: 1D array.
-        num_quantiles: Number of columns in the output array.
-
-    Returns:
-        2D array with shape (len(y), num_quantiles)
-
-    """
-    return np.repeat(y[:, None], num_quantiles, axis=1)
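For reference, the arctan pinball loss in the removed docstring maps directly onto an XGBoost custom objective: XGBoost's second-order updates need a gradient and a strictly positive Hessian, which is exactly what the smoothing provides. The sketch below is illustrative only; the removed module delegates this to openstef.metrics.metrics.arctan_loss, which is bound to the full tuple of quantiles (taus) and the smoothing parameter s and whose exact signature is not shown in this diff.

import numpy as np

def arctan_pinball_grad_hess(y_true, y_pred, tau: float, s: float = 0.055):
    """Gradient and Hessian of L(u) = (tau - 0.5 + arctan(u/s)/pi) * u + s/pi.

    Here u = y_true - y_pred; derivatives are taken with respect to y_pred.
    """
    u = y_true - y_pred
    # dL/dy_pred = -dL/du
    grad = -(tau - 0.5 + np.arctan(u / s) / np.pi + u * s / (np.pi * (s**2 + u**2)))
    # d2L/du2 = 2 s^3 / (pi (s^2 + u^2)^2): strictly positive, unlike the exact
    # pinball loss, whose second derivative is zero almost everywhere. This is
    # why the docstring recommends min_child_weight = 0 and a lower learning rate.
    hess = 2 * s**3 / (np.pi * (s**2 + u**2) ** 2)
    return grad, hess

In the multioutput setup above, the same residual computation is applied column-wise with one tau per output column, which is why replicate_for_multioutput tiles y into len(quantiles) identical columns before fitting.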
openstef/model/regressors/xgb_quantile.py
@@ -1,228 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
-#
-# SPDX-License-Identifier: MPL-2.0
-from functools import partial
-
-import numpy as np
-import xgboost as xgb
-from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
-from xgboost import Booster
-
-import openstef.metrics.metrics as metrics
-from openstef.model.regressors.regressor import OpenstfRegressor
-
-DEFAULT_QUANTILES: tuple[float, ...] = (0.9, 0.5, 0.1)
-
-
-class XGBQuantileOpenstfRegressor(OpenstfRegressor):
-    @staticmethod
-    def _get_importance_names():
-        return {
-            "gain_importance_name": "total_gain",
-            "weight_importance_name": "weight",
-        }
-
-    def __init__(
-        self,
-        quantiles: tuple[float, ...] = DEFAULT_QUANTILES,
-        gamma: float = 0.0,
-        colsample_bytree: float = 1.0,
-        subsample: float = 1.0,
-        min_child_weight: int = 1,
-        max_depth: int = 6,
-        learning_rate: float = 0.3,
-        alpha: float = 0.0,
-        max_delta_step: int = 0,
-    ):
-        """Initialize XGBQuantileRegressor.
-
-        Model that provides quantile regression with XGBoost.
-        For each desired quantile an XGBoost model is trained;
-        these can later be used to predict quantiles.
-
-        Args:
-            quantiles: Tuple with desired quantiles, quantile 0.5 is required.
-                For example: (0.1, 0.5, 0.9)
-            gamma: Gamma
-            colsample_bytree: Colsample by tree
-            subsample: Subsample
-            min_child_weight: Minimum child weight
-            max_depth: Maximum depth
-            learning_rate: Learning rate
-            alpha: Alpha
-            max_delta_step: Maximum delta step
-
-        Raises:
-            ValueError in case quantile 0.5 is not in the requested quantiles
-
-        """
-        super().__init__()
-        # Check that quantile 0.5 is present; this is required
-        if 0.5 not in quantiles:
-            raise ValueError(
-                "Cannot train quantile model as 0.5 is not in requested quantiles!"
-            )
-
-        self.quantiles = quantiles
-
-        # Set attributes for hyperparameters
-        self.subsample = subsample
-        self.min_child_weight = min_child_weight
-        self.max_depth = max_depth
-        self.gamma = gamma
-        self.alpha = alpha
-        self.max_delta_step = max_delta_step
-        self.colsample_bytree = colsample_bytree
-        self.learning_rate = learning_rate
-
-    def fit(self, x: np.array, y: np.array, **kwargs) -> OpenstfRegressor:
-        """Fits xgb quantile model.
-
-        Args:
-            x: Feature matrix
-            y: Labels
-
-        Returns:
-            Fitted XGBQuantile model
-
-        """
-        # TODO: specify these required kwargs in the function definition
-        early_stopping_rounds = kwargs.get("early_stopping_rounds", None)
-        eval_set = kwargs.get("eval_set", None)
-
-        # Check/validate input
-        check_X_y(x, y, force_all_finite="allow-nan")
-
-        # Convert x and y to dmatrix input
-        dtrain = xgb.DMatrix(x.copy(deep=True), label=y.copy(deep=True))
-
-        # Define watchlist if eval_set is defined
-        if eval_set:
-            dval = xgb.DMatrix(
-                eval_set[1][0].copy(deep=True),
-                label=eval_set[1][1].copy(deep=True),
-            )
-
-            # Define data sets to be monitored during training; the last
-            # (validation) one will be used for early stopping
-            watchlist = [(dtrain, "train"), (dval, "validation")]
-        else:
-            watchlist = ()
-
-        # Get fitting parameters - only those accepted by the XGBoost regressor
-        xgb_regressor_params = {
-            key: value
-            for key, value in self.get_params().items()
-            if key in xgb.XGBRegressor().get_params().keys()
-        }
-
-        quantile_models = {}
-
-        for quantile in self.quantiles:
-            # Define objective callback functions specifically for desired quantile
-            xgb_quantile_eval_this_quantile = partial(
-                metrics.xgb_quantile_eval, quantile=quantile
-            )
-            xgb_quantile_obj_this_quantile = partial(
-                metrics.xgb_quantile_obj, quantile=quantile
-            )
-
-            # Train quantile model
-            quantile_models[quantile] = xgb.train(
-                params=xgb_regressor_params,
-                dtrain=dtrain,
-                evals=watchlist,
-                # Can be large because we are early stopping anyway
-                num_boost_round=100,
-                obj=xgb_quantile_obj_this_quantile,
-                feval=xgb_quantile_eval_this_quantile,
-                verbose_eval=False,
-                early_stopping_rounds=early_stopping_rounds,
-            )
-
-        # Set weights and features from the 0.5 (median) model
-        self.feature_importances_ = self.get_feature_importances_from_booster(
-            quantile_models[0.5]
-        )
-        self._Booster = quantile_models[0.5]  # Used for feature names later on
-        # Update state of the estimator
-        self.estimators_ = quantile_models
-        self.is_fitted_ = True
-
-        return self
-
-    def predict(self, x: np.array, quantile: float = 0.5) -> np.array:
-        """Makes a prediction for a desired quantile.
-
-        Args:
-            x: Feature matrix
-            quantile: Quantile for which a prediction is desired;
-                note that predictions are only available for quantiles for which
-                a model is trained, and that this is a quantile-model specific keyword
-
-        Returns:
-            Prediction
-
-        Raises:
-            ValueError in case no model is trained for the requested quantile
-
-        """
-        # Check if model is trained for this quantile
-        if quantile not in self.quantiles:
-            raise ValueError("No model trained for requested quantile!")
-
-        # Check/validate input
-        check_array(x, force_all_finite="allow-nan")
-        check_is_fitted(self)
-
-        # Convert array to dmatrix
-        dmatrix_input = xgb.DMatrix(x.copy(deep=True))
-
-        # best_iteration is only available if early stopping was used during training
-        if hasattr(self.estimators_[quantile], "best_iteration"):
-            return self.estimators_[quantile].predict(
-                dmatrix_input,
-                iteration_range=(0, self.estimators_[quantile].best_iteration + 1),
-            )
-        else:
-            return self.estimators_[quantile].predict(dmatrix_input)
-
-    @classmethod
-    def get_feature_importances_from_booster(cls, booster: Booster) -> np.ndarray:
-        """Gets feature importances from an XGB booster.
-
-        This is based on the feature_importance_ property defined in:
-        https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py.
-
-        Args:
-            booster: Booster object;
-                most of the time the median model (quantile=0.5) is preferred
-
-        Returns:
-            Ndarray with normalized feature importances.
-
-        """
-        # Get score
-        score = booster.get_score(importance_type="gain")
-
-        # Get feature names from booster
-        feature_names = booster.feature_names
-
-        # Get importance
-        feature_importance = [score.get(f, 0.0) for f in feature_names]
-        # Convert to array
-        features_importance_array = np.array(feature_importance, dtype=np.float32)
-
-        total = features_importance_array.sum()  # For normalizing
-        if total == 0:
-            return features_importance_array
-        return features_importance_array / total  # Normalize
-
-    @property
-    def feature_names(self):
-        return self._Booster.feature_names
-
-    @property
-    def can_predict_quantiles(self):
-        return True
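The single-output variant above trains one booster per quantile, binding the quantile into the objective with functools.partial. A minimal self-contained sketch of that pattern follows; the pinball objective here is a hypothetical stand-in for openstef.metrics.metrics.xgb_quantile_obj, whose implementation is not part of this diff.

from functools import partial

import numpy as np
import xgboost as xgb

def pinball_obj(y_pred: np.ndarray, dtrain: xgb.DMatrix, quantile: float):
    """Gradient/Hessian of the exact pinball loss, bound to one quantile.

    The exact pinball loss has a zero second derivative, so a constant
    surrogate Hessian is used here; that weakness is what the arctan
    smoothing in xgb_multioutput_quantile.py addresses.
    """
    u = dtrain.get_label() - y_pred  # residual
    grad = np.where(u >= 0, -quantile, 1.0 - quantile)
    hess = np.ones_like(u)
    return grad, hess

# Train one booster per quantile, as the removed fit() does with xgb.train.
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4))
y = X @ rng.normal(size=4) + rng.normal(size=200)
dtrain = xgb.DMatrix(X, label=y)

quantile_models = {
    q: xgb.train(
        params={"max_depth": 4, "eta": 0.3},
        dtrain=dtrain,
        num_boost_round=50,
        obj=partial(pinball_obj, quantile=q),
    )
    for q in (0.1, 0.5, 0.9)
}
print(quantile_models[0.9].predict(dtrain)[:3])

Keeping the boosters in a dict keyed by quantile mirrors the removed estimators_ attribute: predict(x, quantile=q) then reduces to a lookup plus a plain Booster.predict call.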