emhass 0.10.6__py3-none-any.whl → 0.15.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- emhass/command_line.py +1827 -735
- emhass/connection_manager.py +108 -0
- emhass/data/associations.csv +98 -0
- emhass/data/cec_inverters.pbz2 +0 -0
- emhass/data/cec_modules.pbz2 +0 -0
- emhass/data/config_defaults.json +120 -0
- emhass/forecast.py +1482 -622
- emhass/img/emhass_icon.png +0 -0
- emhass/machine_learning_forecaster.py +565 -212
- emhass/machine_learning_regressor.py +162 -122
- emhass/optimization.py +1724 -590
- emhass/retrieve_hass.py +1104 -248
- emhass/static/advanced.html +9 -1
- emhass/static/basic.html +4 -2
- emhass/static/configuration_list.html +48 -0
- emhass/static/configuration_script.js +956 -0
- emhass/static/data/param_definitions.json +592 -0
- emhass/static/script.js +377 -322
- emhass/static/style.css +270 -13
- emhass/templates/configuration.html +77 -0
- emhass/templates/index.html +23 -14
- emhass/templates/template.html +4 -5
- emhass/utils.py +1797 -428
- emhass/web_server.py +850 -448
- emhass/websocket_client.py +224 -0
- emhass-0.15.5.dist-info/METADATA +164 -0
- emhass-0.15.5.dist-info/RECORD +34 -0
- {emhass-0.10.6.dist-info → emhass-0.15.5.dist-info}/WHEEL +1 -2
- emhass-0.15.5.dist-info/entry_points.txt +2 -0
- emhass-0.10.6.dist-info/METADATA +0 -622
- emhass-0.10.6.dist-info/RECORD +0 -26
- emhass-0.10.6.dist-info/entry_points.txt +0 -2
- emhass-0.10.6.dist-info/top_level.txt +0 -1
- {emhass-0.10.6.dist-info → emhass-0.15.5.dist-info/licenses}/LICENSE +0 -0
emhass/machine_learning_forecaster.py +565 -212

@@ -1,44 +1,61 @@
-
-# -*- coding: utf-8 -*-
-
+import asyncio
 import logging
-import copy
 import time
-
-import pandas as pd
-import numpy as np
+import warnings
 
-
-
-from
+import numpy as np
+import pandas as pd
+from skforecast.model_selection import (
+    TimeSeriesFold,
+    backtesting_forecaster,
+    bayesian_search_forecaster,
+)
+from skforecast.recursive import ForecasterRecursive
+from sklearn.ensemble import (
+    AdaBoostRegressor,
+    ExtraTreesRegressor,
+    GradientBoostingRegressor,
+    RandomForestRegressor,
+)
+from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
 from sklearn.metrics import r2_score
+from sklearn.neighbors import KNeighborsRegressor
+from sklearn.neural_network import MLPRegressor
+from sklearn.svm import SVR
+from sklearn.tree import DecisionTreeRegressor
 
-from
-from skforecast.model_selection import bayesian_search_forecaster
-from skforecast.model_selection import backtesting_forecaster
+from emhass import utils
 
-import warnings
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 
+
 class MLForecaster:
     r"""
     A forecaster class using machine learning models with auto-regressive approach and features\
     based on timestamp information (hour, day, week, etc).
-
+
     This class uses the `skforecast` module and the machine learning models are from `scikit-learn`.
-
+
     It exposes three main methods:
-
+
     - `fit`: to train a model with the passed data.
-
+
     - `predict`: to obtain a forecast from a pre-trained model.
-
-    - `tune`: to optimize the models hyperparameters using bayesian optimization.
-
+
+    - `tune`: to optimize the models hyperparameters using bayesian optimization.
+
     """
 
-    def __init__(
-
+    def __init__(
+        self,
+        data: pd.DataFrame,
+        model_type: str,
+        var_model: str,
+        sklearn_model: str,
+        num_lags: int,
+        emhass_conf: dict,
+        logger: logging.Logger,
+    ) -> None:
         r"""Define constructor for the forecast class.
 
         :param data: The data that will be used for train/test
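For orientation, the reworked constructor can be exercised as below. This is a minimal sketch: the sensor name, the `model_type` tag, and the synthetic series are illustrative assumptions, not values taken from this diff (`emhass_conf` normally carries the EMHASS paths dictionary).

```python
import logging

import numpy as np
import pandas as pd

from emhass.machine_learning_forecaster import MLForecaster

# Hypothetical 30-minute load series (10 days) with a proper DatetimeIndex,
# which the constructor needs since it immediately runs _prepare_data().
index = pd.date_range("2024-01-01", periods=10 * 48, freq="30min")
data = pd.DataFrame(
    {"sensor.power_load_no_var_loads": np.random.default_rng(0).normal(500.0, 100.0, len(index))},
    index=index,
)

mlf = MLForecaster(
    data=data,
    model_type="load_forecast",  # free-form tag, assumed here
    var_model="sensor.power_load_no_var_loads",
    sklearn_model="KNeighborsRegressor",  # a key of the model registry added below
    num_lags=48,
    emhass_conf={},  # assumed empty for the sketch
    logger=logging.getLogger(__name__),
)
```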
@@ -69,47 +86,104 @@ class MLForecaster:
         self.emhass_conf = emhass_conf
         self.logger = logger
         self.is_tuned = False
+        self.forecaster: ForecasterRecursive | None = None
+        self.optimize_results: pd.DataFrame | None = None
+        self.optimize_results_object = None
+
         # A quick data preparation
+        self._prepare_data()
+
+    def _prepare_data(self):
+        """Prepare the input data by cleaning and sorting."""
         self.data.index = pd.to_datetime(self.data.index)
         self.data.sort_index(inplace=True)
-        self.data = self.data[~self.data.index.duplicated(keep=
-
-    @staticmethod
-    def add_date_features(data: pd.DataFrame) -> pd.DataFrame:
-        """Add date features from the input DataFrame timestamp
-
-        :param data: The input DataFrame
-        :type data: pd.DataFrame
-        :return: The DataFrame with the added features
-        :rtype: pd.DataFrame
-        """
-        df = copy.deepcopy(data)
-        df['year'] = [i.year for i in df.index]
-        df['month'] = [i.month for i in df.index]
-        df['day_of_week'] = [i.dayofweek for i in df.index]
-        df['day_of_year'] = [i.dayofyear for i in df.index]
-        df['day'] = [i.day for i in df.index]
-        df['hour'] = [i.hour for i in df.index]
-        return df
+        self.data = self.data[~self.data.index.duplicated(keep="first")]
 
     @staticmethod
     def neg_r2_score(y_true, y_pred):
         """The negative of the r2 score."""
         return -r2_score(y_true, y_pred)
-
+
+    @staticmethod
+    async def interpolate_async(data: pd.DataFrame) -> pd.DataFrame:
+        """Interpolate missing values asynchronously."""
+        return await asyncio.to_thread(data.interpolate, method="linear", axis=0, limit=None)
+
     @staticmethod
-    def
+    def get_lags_list_from_frequency(freq: pd.Timedelta) -> list[int]:
+        """Calculate appropriate lag values based on data frequency.
+
+        The lags represent different time horizons (6h, 12h, 1d, 1.5d, 2d, 2.5d, 3d).
+        This method scales these horizons according to the actual data frequency.
+
+        :param freq: The frequency of the data as a pandas Timedelta
+        :type freq: pd.Timedelta
+        :return: A list of lag values appropriate for the data frequency
+        :rtype: list[int]
+        """
+        # Define target time horizons in hours
+        target_horizons_hours = [6, 12, 24, 36, 48, 60, 72]
+
+        # Calculate frequency in hours
+        freq_hours = freq.total_seconds() / 3600
+
+        # Calculate lags for each horizon
+        lags = [int(round(horizon / freq_hours)) for horizon in target_horizons_hours]
+
+        # Remove duplicates and ensure minimum value of 1
+        lags = sorted({max(1, lag) for lag in lags})
+
+        return lags
+
+    @staticmethod
+    async def generate_exog(data_last_window, periods, var_name):
         """Generate the exogenous data for future timestamps."""
-        forecast_dates = pd.date_range(
-
-
-
-
-        exog =
+        forecast_dates = pd.date_range(
+            start=data_last_window.index[-1] + data_last_window.index.freq,
+            periods=periods,
+            freq=data_last_window.index.freq,
+        )
+        exog = pd.DataFrame({var_name: [np.nan] * periods}, index=forecast_dates)
+        exog = utils.add_date_features(exog)
         return exog
-
-    def
-
+
+    def _get_sklearn_model(self, model_name: str):
+        """Get the sklearn model instance based on the model name."""
+        seed = 42
+        models = {
+            "LinearRegression": LinearRegression(),
+            "RidgeRegression": Ridge(),
+            "LassoRegression": Lasso(random_state=seed),
+            "ElasticNet": ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=seed),
+            "KNeighborsRegressor": KNeighborsRegressor(),
+            "DecisionTreeRegressor": DecisionTreeRegressor(ccp_alpha=0.0, random_state=seed),
+            "SVR": SVR(),
+            "RandomForestRegressor": RandomForestRegressor(
+                min_samples_leaf=1, max_features=1.0, random_state=seed
+            ),
+            "ExtraTreesRegressor": ExtraTreesRegressor(
+                min_samples_leaf=1, max_features=1.0, random_state=seed
+            ),
+            "GradientBoostingRegressor": GradientBoostingRegressor(
+                learning_rate=0.1, random_state=seed
+            ),
+            "AdaBoostRegressor": AdaBoostRegressor(learning_rate=1.0, random_state=seed),
+            "MLPRegressor": MLPRegressor(hidden_layer_sizes=(100,), random_state=seed),
+        }
+
+        if model_name not in models:
+            self.logger.error(
+                f"Passed sklearn model {model_name} is not valid. Defaulting to KNeighborsRegressor"
+            )
+            return KNeighborsRegressor()
+
+        return models[model_name]
+
+    async def fit(
+        self,
+        split_date_delta: str | None = "48h",
+        perform_backtest: bool | None = False,
+    ) -> tuple[pd.DataFrame, pd.DataFrame]:
         r"""The fit method to train the ML model.
 
         :param split_date_delta: The delta from now to `split_date_delta` that will be used \
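The horizon scaling in the new `get_lags_list_from_frequency` helper is easy to verify by hand; a short check, under the assumption of 30-minute data:

```python
import pandas as pd

# Same arithmetic as get_lags_list_from_frequency: each 6h-72h horizon
# divided by the step length in hours gives the lag count.
freq_hours = pd.Timedelta("30min").total_seconds() / 3600  # 0.5 h per step
horizons_hours = [6, 12, 24, 36, 48, 60, 72]
print(sorted({max(1, int(round(h / freq_hours))) for h in horizons_hours}))
# [12, 24, 48, 72, 96, 120, 144]

# With hourly data the lags equal the horizons themselves:
print(sorted({max(1, int(round(h / 1.0))) for h in horizons_hours}))
# [6, 12, 24, 36, 48, 60, 72]
```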
@@ -121,72 +195,133 @@ class MLForecaster:
         :return: The DataFrame containing the forecast data results without and with backtest
         :rtype: Tuple[pd.DataFrame, pd.DataFrame]
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self.
-
-
-
-
-        )
-
-
-
-
-
-
-
-
-
-        self.logger.info(f"Prediction R2 score of fitted model on test data: {pred_metric}")
-        # Packing results in a DataFrame
-        df_pred = pd.DataFrame(index=self.data_exo.index,columns=['train','test','pred'])
-        df_pred['train'] = self.data_train[self.var_model]
-        df_pred['test'] = self.data_test[self.var_model]
-        df_pred['pred'] = predictions
-        df_pred_backtest = None
-        if perform_backtest is True:
-            # Using backtesting tool to evaluate the model
-            self.logger.info("Performing simple backtesting of fitted model")
+        try:
+            self.logger.info("Performing a forecast model fit for " + self.model_type)
+
+            # Check if variable exists in data
+            if self.var_model not in self.data.columns:
+                raise KeyError(
+                    f"Variable '{self.var_model}' not found in data columns: {list(self.data.columns)}"
+                )
+
+            # Preparing the data: adding exogenous features
+            self.data_exo = pd.DataFrame(index=self.data.index)
+            self.data_exo = utils.add_date_features(self.data_exo)
+            self.data_exo[self.var_model] = self.data[self.var_model]
+
+            self.data_exo = await self.interpolate_async(self.data_exo)
+
+            # train/test split
+            self.date_train = (
+                self.data_exo.index[-1] - pd.Timedelta("5days") + self.data_exo.index.freq
+            )  # The last 5 days
+            self.date_split = (
+                self.data_exo.index[-1] - pd.Timedelta(split_date_delta) + self.data_exo.index.freq
+            )  # The last 48h
+            self.data_train = self.data_exo.loc[: self.date_split - self.data_exo.index.freq, :]
+            self.data_test = self.data_exo.loc[self.date_split :, :]
+            self.steps = len(self.data_test)
+
+            # Pick correct sklearn model
+            base_model = self._get_sklearn_model(self.sklearn_model)
+
+            # Define the forecaster object
+            self.forecaster = ForecasterRecursive(estimator=base_model, lags=self.num_lags)
+
+            # Fit and time it
+            self.logger.info("Training a " + self.sklearn_model + " model")
             start_time = time.time()
-
-
-
-
-
-
-            steps = self.num_lags,
-            metric = MLForecaster.neg_r2_score,
-            refit = False,
-            verbose = False
+
+            await asyncio.to_thread(
+                self.forecaster.fit,
+                y=self.data_train[self.var_model],
+                exog=self.data_train.drop(self.var_model, axis=1),
+                store_in_sample_residuals=True,
             )
-
-
-
-
-
-
-
-
-
+
+            fit_time = time.time() - start_time
+            self.logger.info(f"Elapsed time for model fit: {fit_time}")
+
+            # Make a prediction to print metrics
+            predictions = await asyncio.to_thread(
+                self.forecaster.predict,
+                steps=self.steps,
+                exog=self.data_test.drop(self.var_model, axis=1),
+            )
+            pred_metric = await asyncio.to_thread(
+                r2_score, self.data_test[self.var_model], predictions
+            )
+            self.logger.info(f"Prediction R2 score of fitted model on test data: {pred_metric}")
+
+            # Packing results in a DataFrame
+            df_pred = pd.DataFrame(index=self.data_exo.index, columns=["train", "test", "pred"])
+
+            df_pred["train"] = self.data_train[self.var_model]
+            df_pred["test"] = self.data_test[self.var_model]
+            df_pred["pred"] = predictions
+
+            df_pred_backtest = None
+
+            if perform_backtest is True:
+                # Using backtesting tool to evaluate the model
+                self.logger.info("Performing simple backtesting of fitted model")
+                start_time = time.time()
+                cv = TimeSeriesFold(
+                    steps=self.num_lags,
+                    initial_train_size=None,
+                    fixed_train_size=False,
+                    gap=0,
+                    allow_incomplete_fold=True,
+                    refit=False,
+                )
+
+                metric, predictions_backtest = await asyncio.to_thread(
+                    backtesting_forecaster,
+                    forecaster=self.forecaster,
+                    y=self.data_train[self.var_model],
+                    exog=self.data_train.drop(self.var_model, axis=1),
+                    cv=cv,
+                    metric=MLForecaster.neg_r2_score,
+                    verbose=False,
+                    show_progress=True,
+                )
+
+                backtest_time = time.time() - start_time
+                backtest_r2 = -metric
+                self.logger.info(f"Elapsed backtesting time: {backtest_time}")
+                self.logger.info(f"Backtest R2 score: {backtest_r2}")
+                df_pred_backtest = pd.DataFrame(
+                    index=self.data_exo.index, columns=["train", "pred"]
+                )
+                df_pred_backtest["train"] = self.data_exo[self.var_model]
+                # Handle skforecast 0.18.0+ DataFrame output with fold column
+                if isinstance(predictions_backtest, pd.DataFrame):
+                    # Extract the 'pred' column from the DataFrame
+                    pred_values = (
+                        predictions_backtest["pred"]
+                        if "pred" in predictions_backtest.columns
+                        else predictions_backtest.iloc[:, -1]
+                    )
+                else:
+                    # If it's a Series, use it directly
+                    pred_values = predictions_backtest
+
+                # Use loc to align indices properly - only assign where indices match
+                df_pred_backtest.loc[pred_values.index, "pred"] = pred_values
+
+            return df_pred, df_pred_backtest
+
+        except asyncio.CancelledError:
+            self.logger.info("Model training was cancelled")
+            raise
+        except Exception as e:
+            self.logger.error(f"Error during model fitting: {e}")
+            raise
+
+    async def predict(
+        self,
+        data_last_window: pd.DataFrame | None = None,
+    ) -> pd.Series:
         """The predict method to generate forecasts from a previously fitted ML model.
 
         :param data_last_window: The data that will be used to generate the new forecast, this \
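`fit` is now a coroutine: the blocking skforecast `fit`/`predict` calls are pushed to worker threads through `asyncio.to_thread`, so a caller needs an event loop. A minimal driver sketch, reusing the hypothetical `mlf` instance from the first example:

```python
import asyncio

async def train(mlf):
    # perform_backtest=True additionally evaluates the fitted model with
    # backtesting_forecaster over a non-refitting TimeSeriesFold, as in
    # the hunk above; otherwise df_pred_backtest comes back as None.
    return await mlf.fit(split_date_delta="48h", perform_backtest=True)

df_pred, df_pred_backtest = asyncio.run(train(mlf))
```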
@@ -197,109 +332,327 @@ class MLForecaster:
         :return: A pandas series containing the generated forecasts.
         :rtype: pd.Series
         """
-
-
-
-
-        if
-
-
-
-
+        try:
+            if self.forecaster is None:
+                raise ValueError("Model has not been fitted yet. Call fit() first.")
+
+            if data_last_window is None:
+                predictions = await asyncio.to_thread(
+                    self.forecaster.predict,
+                    steps=self.num_lags,
+                    exog=self.data_test.drop(self.var_model, axis=1),
+                )
             else:
-
-
-
-
-
-
-
+                data_last_window = await self.interpolate_async(data_last_window)
+
+                if self.is_tuned:
+                    exog = await self.generate_exog(data_last_window, self.lags_opt, self.var_model)
+
+                    predictions = await asyncio.to_thread(
+                        self.forecaster.predict,
+                        steps=self.lags_opt,
+                        last_window=data_last_window[self.var_model],
+                        exog=exog.drop(self.var_model, axis=1),
+                    )
+                else:
+                    exog = await self.generate_exog(data_last_window, self.num_lags, self.var_model)
+
+                    predictions = await asyncio.to_thread(
+                        self.forecaster.predict,
+                        steps=self.num_lags,
+                        last_window=data_last_window[self.var_model],
+                        exog=exog.drop(self.var_model, axis=1),
+                    )
+
+            return predictions
+
+        except asyncio.CancelledError:
+            self.logger.info("Prediction was cancelled")
+            raise
+        except Exception as e:
+            self.logger.error(f"Error during prediction: {e}")
+            raise
+
+    def _get_search_space(self, debug: bool, lags_list: list[int] | None = None):
+        """Get the hyperparameter search space for the given model.
+
+        :param debug: If True, use simplified search space for faster testing
+        :type debug: bool
+        :param lags_list: List of lag values to use. If None, uses default values
+        :type lags_list: list[int] | None
+        """
+        if lags_list is None:
+            lags_list = [6, 12, 24, 36, 48, 60, 72]
+
+        debug_lags = [3]
+
+        def get_lags(trial):
+            return trial.suggest_categorical("lags", debug_lags if debug else lags_list)
+
+        def svr_search_space(trial):
+            # Base SVR parameters
+            search = {
+                "C": trial.suggest_float("C", 0.1, 1.0)
+                if debug
+                else trial.suggest_float("C", 1e-2, 100.0, log=True),
+                "epsilon": trial.suggest_float("epsilon", 0.01, 1.0),
+                "kernel": trial.suggest_categorical("kernel", ["linear", "rbf"]),
+                "gamma": trial.suggest_categorical(
+                    "gamma", ["scale", "auto", 0.01, 0.1, 1.0, 10.0]
+                ),
+                "lags": get_lags(trial),
+            }
+            return search
+
+        # Registry of search space generators
+        search_spaces = {
+            "LinearRegression": lambda trial: {
+                "fit_intercept": trial.suggest_categorical(
+                    "fit_intercept", [True] if debug else [True, False]
+                ),
+                "lags": get_lags(trial),
+            },
+            "RidgeRegression": lambda trial: {
+                "alpha": trial.suggest_float("alpha", 0.1, 1.0)
+                if debug
+                else trial.suggest_float("alpha", 1e-4, 100.0, log=True),
+                "lags": get_lags(trial),
+            },
+            "LassoRegression": lambda trial: {
+                "alpha": trial.suggest_float("alpha", 0.1, 1.0)
+                if debug
+                else trial.suggest_float("alpha", 1e-4, 100.0, log=True),
+                "lags": get_lags(trial),
+            },
+            "ElasticNet": lambda trial: {
+                "alpha": trial.suggest_float("alpha", 0.0, 2.0),
+                "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),
+                "selection": trial.suggest_categorical(
+                    "selection", ["random"] if debug else ["cyclic", "random"]
+                ),
+                "lags": get_lags(trial),
+            },
+            "KNeighborsRegressor": lambda trial: {
+                "n_neighbors": trial.suggest_int("n_neighbors", 2, 2)
+                if debug
+                else trial.suggest_int("n_neighbors", 2, 20),
+                "leaf_size": trial.suggest_int("leaf_size", 20, 20)
+                if debug
+                else trial.suggest_int("leaf_size", 20, 40),
+                "weights": trial.suggest_categorical(
+                    "weights", ["uniform"] if debug else ["uniform", "distance"]
+                ),
+                "lags": get_lags(trial),
+            },
+            "DecisionTreeRegressor": lambda trial: {
+                "max_depth": trial.suggest_int("max_depth", 2, 5)
+                if debug
+                else trial.suggest_int("max_depth", 2, 20),
+                "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
+                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
+                "lags": get_lags(trial),
+            },
+            "SVR": svr_search_space,
+            "RandomForestRegressor": lambda trial: {
+                "n_estimators": trial.suggest_int("n_estimators", 10, 20)
+                if debug
+                else trial.suggest_int("n_estimators", 50, 300),
+                "max_depth": trial.suggest_int("max_depth", 3, 20),
+                "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
+                "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
+                "lags": get_lags(trial),
+            },
+            "ExtraTreesRegressor": lambda trial: {
+                "n_estimators": trial.suggest_int("n_estimators", 10, 20)
+                if debug
+                else trial.suggest_int("n_estimators", 50, 300),
+                "max_depth": trial.suggest_int("max_depth", 3, 20),
+                "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
+                "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
+                "lags": get_lags(trial),
+            },
+            "GradientBoostingRegressor": lambda trial: {
+                "n_estimators": trial.suggest_int("n_estimators", 10, 20)
+                if debug
+                else trial.suggest_int("n_estimators", 50, 300),
+                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5),
+                "max_depth": trial.suggest_int("max_depth", 3, 10),
+                "lags": get_lags(trial),
+            },
+            "AdaBoostRegressor": lambda trial: {
+                "n_estimators": trial.suggest_int("n_estimators", 10, 20)
+                if debug
+                else trial.suggest_int("n_estimators", 50, 300),
+                "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0),
+                "lags": get_lags(trial),
+            },
+            "MLPRegressor": lambda trial: {
+                "learning_rate_init": trial.suggest_float("learning_rate_init", 0.001, 0.01),
+                "hidden_layer_sizes": trial.suggest_categorical(
+                    "hidden_layer_sizes", [(50,), (100,), (50, 50)]
+                ),
+                "activation": trial.suggest_categorical("activation", ["relu", "tanh"]),
+                "alpha": trial.suggest_float("alpha", 1e-5, 1e-1, log=True),
+                "lags": get_lags(trial),
+            },
+        }
+
+        if self.sklearn_model not in search_spaces:
+            raise ValueError(f"Unsupported model for tuning: {self.sklearn_model}")
+
+        return search_spaces[self.sklearn_model]
+
+    async def tune(
+        self,
+        split_date_delta: str | None = "48h",
+        n_trials: int = 10,
+        debug: bool | None = False,
+    ) -> pd.DataFrame:
         """Tuning a previously fitted model using bayesian optimization.
 
+        :param split_date_delta: The delta from now to `split_date_delta` that will be used \
+            as the test period to evaluate the model, defaults to '48h'.\
+            This define the training/validation split for the tuning process.
+        :type split_date_delta: Optional[str], optional
         :param debug: Set to True for testing and faster optimizations, defaults to False
         :type debug: Optional[bool], optional
+        :param n_trials: Number of trials for bayesian optimization, defaults to 10
+        :type n_trials: Optional[int], optional
         :return: The DataFrame with the forecasts using the optimized model.
         :rtype: pd.DataFrame
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            if self.forecaster is None:
+                raise ValueError("Model has not been fitted yet. Call fit() first.")
+
+            # Calculate appropriate lags based on data frequency
+            freq_timedelta = pd.Timedelta(self.data_exo.index.freq)
+            lags_list = MLForecaster.get_lags_list_from_frequency(freq_timedelta)
+            self.logger.info(
+                f"Using lags list based on data frequency ({self.data_exo.index.freq}): {lags_list}"
+            )
+
+            # Get the search space for this model
+            search_space = self._get_search_space(debug, lags_list)
+
+            # Bayesian search hyperparameter and lags with skforecast/optuna
             if debug:
-
-
-                'lags': trial.suggest_categorical('lags', [3])}
-                return search_space
+                refit = False
+                num_lags = 3
             else:
-
-
-
-
-
-
-
+                refit = True
+                num_lags = self.num_lags
+            # The optimization routine call
+            self.logger.info("Bayesian hyperparameter optimization with backtesting")
+            start_time = time.time()
+
+            # Use the 'y' data that will be passed to the optimizer
+            data_to_tune = self.data_train[self.var_model]
+
+            # Calculate the new split date and initial_train_size based on the passed split_date_delta
+            try:
+                date_split = (
+                    data_to_tune.index[-1]
+                    - pd.Timedelta(split_date_delta)
+                    + data_to_tune.index.freq
+                )
+                initial_train_size = len(data_to_tune.loc[: date_split - data_to_tune.index.freq])
+            except (ValueError, TypeError):
+                self.logger.warning(
+                    f"Invalid split_date_delta: {split_date_delta}. Falling back to 5 days."
+                )
+                date_split = (
+                    data_to_tune.index[-1] - pd.Timedelta("5days") + data_to_tune.index.freq
+                )
+                initial_train_size = len(data_to_tune.loc[: date_split - data_to_tune.index.freq])
+
+            # Check if the calculated initial_train_size is valid
+            window_size = num_lags  # This is what skforecast will use as window_size
             if debug:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                window_size = 3  # Match debug lags
+
+            if initial_train_size <= window_size:
+                self.logger.warning(
+                    f"Calculated initial_train_size ({initial_train_size}) is <= window_size ({window_size})."
+                )
+                self.logger.warning(
+                    "This is likely because split_date_delta is too large for the dataset."
+                )
+                MIN_SAMPLES_FOR_KNN = 6
+                new_train_size = window_size + MIN_SAMPLES_FOR_KNN
+                self.logger.warning(
+                    f"Adjusting initial_train_size to {new_train_size} to attempt recovery."
+                )
+                initial_train_size = new_train_size
+
+            cv = TimeSeriesFold(
+                steps=num_lags,
+                initial_train_size=initial_train_size,
+                fixed_train_size=True,
+                gap=0,
+                skip_folds=None,
+                allow_incomplete_fold=True,
+                refit=refit,
+            )
+
+            (
+                self.optimize_results,
+                self.optimize_results_object,
+            ) = await asyncio.to_thread(
+                bayesian_search_forecaster,
+                forecaster=self.forecaster,
+                y=self.data_train[self.var_model],
+                exog=self.data_train.drop(self.var_model, axis=1),
+                cv=cv,
+                search_space=search_space,
+                metric=MLForecaster.neg_r2_score,
+                n_trials=n_trials,
+                random_state=123,
+                return_best=True,
+            )
+
+            optimization_time = time.time() - start_time
+            self.logger.info(f"Elapsed time: {optimization_time}")
+
+            self.is_tuned = True
+
+            predictions_opt = await asyncio.to_thread(
+                self.forecaster.predict,
+                steps=self.num_lags,
+                exog=self.data_test.drop(self.var_model, axis=1),
+            )
+
+            freq_hours = self.data_exo.index.freq.delta.seconds / 3600
+            self.lags_opt = int(np.round(len(self.optimize_results.iloc[0]["lags"])))
+            self.days_needed = int(np.round(self.lags_opt * freq_hours / 24))
+
+            df_pred_opt = pd.DataFrame(
+                index=self.data_exo.index, columns=["train", "test", "pred_optim"]
+            )
+            df_pred_opt["train"] = self.data_train[self.var_model]
+            df_pred_opt["test"] = self.data_test[self.var_model]
+            df_pred_opt["pred_optim"] = predictions_opt
+
+            pred_optim_metric_train = -self.optimize_results.iloc[0]["neg_r2_score"]
+            self.logger.info(
+                f"R2 score for optimized prediction in train period: {pred_optim_metric_train}"
+            )
+
+            pred_optim_metric_test = await asyncio.to_thread(
+                r2_score,
+                df_pred_opt.loc[predictions_opt.index, "test"],
+                df_pred_opt.loc[predictions_opt.index, "pred_optim"],
+            )
+            self.logger.info(
+                f"R2 score for optimized prediction in test period: {pred_optim_metric_test}"
+            )
+            self.logger.info("Number of optimal lags obtained: " + str(self.lags_opt))
+
+            return df_pred_opt
+
+        except asyncio.CancelledError:
+            self.logger.info("Model tuning was cancelled")
+            raise
+        except Exception as e:
+            self.logger.error(f"Error during model tuning: {e}")
+            raise