emhass 0.11.4__py3-none-any.whl → 0.15.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- emhass/command_line.py +1481 -811
- emhass/connection_manager.py +108 -0
- emhass/data/associations.csv +37 -2
- emhass/data/cec_inverters.pbz2 +0 -0
- emhass/data/cec_modules.pbz2 +0 -0
- emhass/data/config_defaults.json +53 -49
- emhass/forecast.py +1264 -731
- emhass/img/emhass_icon.png +0 -0
- emhass/machine_learning_forecaster.py +534 -281
- emhass/machine_learning_regressor.py +141 -125
- emhass/optimization.py +1173 -585
- emhass/retrieve_hass.py +958 -263
- emhass/static/advanced.html +7 -0
- emhass/static/configuration_list.html +5 -1
- emhass/static/configuration_script.js +146 -62
- emhass/static/data/param_definitions.json +215 -48
- emhass/static/script.js +58 -26
- emhass/static/style.css +6 -8
- emhass/templates/configuration.html +5 -3
- emhass/templates/index.html +8 -6
- emhass/templates/template.html +4 -5
- emhass/utils.py +1152 -403
- emhass/web_server.py +565 -379
- emhass/websocket_client.py +224 -0
- emhass-0.15.5.dist-info/METADATA +164 -0
- emhass-0.15.5.dist-info/RECORD +34 -0
- {emhass-0.11.4.dist-info → emhass-0.15.5.dist-info}/WHEEL +1 -2
- emhass-0.15.5.dist-info/entry_points.txt +2 -0
- emhass-0.11.4.dist-info/METADATA +0 -666
- emhass-0.11.4.dist-info/RECORD +0 -32
- emhass-0.11.4.dist-info/entry_points.txt +0 -2
- emhass-0.11.4.dist-info/top_level.txt +0 -1
- {emhass-0.11.4.dist-info → emhass-0.15.5.dist-info/licenses}/LICENSE +0 -0
emhass/machine_learning_forecaster.py

@@ -1,22 +1,30 @@
-
-# -*- coding: utf-8 -*-
-
+import asyncio
 import logging
-import copy
 import time
-
-import pandas as pd
-import numpy as np
-
-from sklearn.linear_model import LinearRegression
-from sklearn.linear_model import ElasticNet
-from sklearn.neighbors import KNeighborsRegressor
-from sklearn.metrics import r2_score
+import warnings
 
+import numpy as np
+import pandas as pd
+from skforecast.model_selection import (
+    TimeSeriesFold,
+    backtesting_forecaster,
+    bayesian_search_forecaster,
+)
 from skforecast.recursive import ForecasterRecursive
-from
+from sklearn.ensemble import (
+    AdaBoostRegressor,
+    ExtraTreesRegressor,
+    GradientBoostingRegressor,
+    RandomForestRegressor,
+)
+from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
+from sklearn.metrics import r2_score
+from sklearn.neighbors import KNeighborsRegressor
+from sklearn.neural_network import MLPRegressor
+from sklearn.svm import SVR
+from sklearn.tree import DecisionTreeRegressor
 
-import
+from emhass import utils
 
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 
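The import hunk tracks skforecast's reorganized module layout: the cross-validation and search utilities (`TimeSeriesFold`, `backtesting_forecaster`, `bayesian_search_forecaster`) now come from `skforecast.model_selection`, and the recursive forecaster from `skforecast.recursive`. A minimal toy sketch (not emhass code) of how these entry points combine, using the `estimator=` keyword this diff adopts further down:

```python
# Toy illustration of the skforecast entry points imported above.
import numpy as np
import pandas as pd
from skforecast.recursive import ForecasterRecursive
from sklearn.neighbors import KNeighborsRegressor

# A synthetic half-hourly series standing in for a power load signal.
y = pd.Series(
    np.sin(np.arange(200) / 10.0),
    index=pd.date_range("2024-01-01", periods=200, freq="30min"),
)
forecaster = ForecasterRecursive(estimator=KNeighborsRegressor(), lags=24)
forecaster.fit(y=y)
print(forecaster.predict(steps=12))  # 12-step-ahead recursive forecast
```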
@@ -25,17 +33,17 @@ class MLForecaster:
     r"""
     A forecaster class using machine learning models with auto-regressive approach and features\
     based on timestamp information (hour, day, week, etc).
-
+
     This class uses the `skforecast` module and the machine learning models are from `scikit-learn`.
-
+
     It exposes three main methods:
-
+
     - `fit`: to train a model with the passed data.
-
+
     - `predict`: to obtain a forecast from a pre-trained model.
-
-    - `tune`: to optimize the models hyperparameters using bayesian optimization.
-
+
+    - `tune`: to optimize the models hyperparameters using bayesian optimization.
+
     """
 
     def __init__(
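This docstring hunk only reshuffles blank lines, but the three methods it advertises (`fit`, `predict`, `tune`) become coroutines in the hunks below. A hypothetical driver sketch follows; the positional constructor arguments are an assumption inferred from the attributes this diff references (`self.data`, `self.model_type`, `self.var_model`, `self.sklearn_model`, `self.num_lags`, `self.emhass_conf`, `self.logger`), not a confirmed signature:

```python
# Hypothetical end-to-end driver for the now-async MLForecaster API.
import asyncio
import logging

import pandas as pd

from emhass.machine_learning_forecaster import MLForecaster

async def run_forecaster(data: pd.DataFrame) -> pd.Series:
    # Constructor arguments are assumed, see the note above.
    mlf = MLForecaster(
        data, "load_forecast", "sensor.power_load",
        "KNeighborsRegressor", 48, {}, logging.getLogger(__name__),
    )
    df_pred, df_pred_backtest = await mlf.fit(perform_backtest=True)
    await mlf.tune(n_trials=10)   # optional bayesian optimization
    return await mlf.predict()    # forecast with the (tuned) model

# forecast = asyncio.run(run_forecaster(my_data))  # my_data: time-indexed DataFrame
```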
@@ -78,36 +86,57 @@ class MLForecaster:
         self.emhass_conf = emhass_conf
         self.logger = logger
         self.is_tuned = False
+        self.forecaster: ForecasterRecursive | None = None
+        self.optimize_results: pd.DataFrame | None = None
+        self.optimize_results_object = None
+
         # A quick data preparation
+        self._prepare_data()
+
+    def _prepare_data(self):
+        """Prepare the input data by cleaning and sorting."""
         self.data.index = pd.to_datetime(self.data.index)
         self.data.sort_index(inplace=True)
         self.data = self.data[~self.data.index.duplicated(keep="first")]
 
-    @staticmethod
-    def add_date_features(data: pd.DataFrame) -> pd.DataFrame:
-        """Add date features from the input DataFrame timestamp
-
-        :param data: The input DataFrame
-        :type data: pd.DataFrame
-        :return: The DataFrame with the added features
-        :rtype: pd.DataFrame
-        """
-        df = copy.deepcopy(data)
-        df["year"] = [i.year for i in df.index]
-        df["month"] = [i.month for i in df.index]
-        df["day_of_week"] = [i.dayofweek for i in df.index]
-        df["day_of_year"] = [i.dayofyear for i in df.index]
-        df["day"] = [i.day for i in df.index]
-        df["hour"] = [i.hour for i in df.index]
-        return df
-
     @staticmethod
     def neg_r2_score(y_true, y_pred):
         """The negative of the r2 score."""
         return -r2_score(y_true, y_pred)
 
     @staticmethod
-    def
+    async def interpolate_async(data: pd.DataFrame) -> pd.DataFrame:
+        """Interpolate missing values asynchronously."""
+        return await asyncio.to_thread(data.interpolate, method="linear", axis=0, limit=None)
+
+    @staticmethod
+    def get_lags_list_from_frequency(freq: pd.Timedelta) -> list[int]:
+        """Calculate appropriate lag values based on data frequency.
+
+        The lags represent different time horizons (6h, 12h, 1d, 1.5d, 2d, 2.5d, 3d).
+        This method scales these horizons according to the actual data frequency.
+
+        :param freq: The frequency of the data as a pandas Timedelta
+        :type freq: pd.Timedelta
+        :return: A list of lag values appropriate for the data frequency
+        :rtype: list[int]
+        """
+        # Define target time horizons in hours
+        target_horizons_hours = [6, 12, 24, 36, 48, 60, 72]
+
+        # Calculate frequency in hours
+        freq_hours = freq.total_seconds() / 3600
+
+        # Calculate lags for each horizon
+        lags = [int(round(horizon / freq_hours)) for horizon in target_horizons_hours]
+
+        # Remove duplicates and ensure minimum value of 1
+        lags = sorted({max(1, lag) for lag in lags})
+
+        return lags
+
+    @staticmethod
+    async def generate_exog(data_last_window, periods, var_name):
         """Generate the exogenous data for future timestamps."""
         forecast_dates = pd.date_range(
             start=data_last_window.index[-1] + data_last_window.index.freq,
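The new `get_lags_list_from_frequency` helper replaces a hard-coded `[6, 12, 24, 36, 48, 60, 72]` lag list with one scaled to the data's sampling period. A standalone copy of the helper (same body as in the hunk above) to show the arithmetic:

```python
# Standalone check of the lag-scaling arithmetic introduced above.
import pandas as pd

def get_lags_list_from_frequency(freq: pd.Timedelta) -> list[int]:
    target_horizons_hours = [6, 12, 24, 36, 48, 60, 72]  # 6h .. 3d horizons
    freq_hours = freq.total_seconds() / 3600
    lags = [int(round(h / freq_hours)) for h in target_horizons_hours]
    return sorted({max(1, lag) for lag in lags})  # dedupe, floor at 1

print(get_lags_list_from_frequency(pd.Timedelta("30min")))
# -> [12, 24, 48, 72, 96, 120, 144]  (each horizon needs twice as many steps)
print(get_lags_list_from_frequency(pd.Timedelta("1h")))
# -> [6, 12, 24, 36, 48, 60, 72]     (hourly data keeps the nominal list)
```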
@@ -115,14 +144,46 @@ class MLForecaster:
             freq=data_last_window.index.freq,
         )
         exog = pd.DataFrame({var_name: [np.nan] * periods}, index=forecast_dates)
-        exog =
+        exog = utils.add_date_features(exog)
         return exog
 
-    def
+    def _get_sklearn_model(self, model_name: str):
+        """Get the sklearn model instance based on the model name."""
+        seed = 42
+        models = {
+            "LinearRegression": LinearRegression(),
+            "RidgeRegression": Ridge(),
+            "LassoRegression": Lasso(random_state=seed),
+            "ElasticNet": ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=seed),
+            "KNeighborsRegressor": KNeighborsRegressor(),
+            "DecisionTreeRegressor": DecisionTreeRegressor(ccp_alpha=0.0, random_state=seed),
+            "SVR": SVR(),
+            "RandomForestRegressor": RandomForestRegressor(
+                min_samples_leaf=1, max_features=1.0, random_state=seed
+            ),
+            "ExtraTreesRegressor": ExtraTreesRegressor(
+                min_samples_leaf=1, max_features=1.0, random_state=seed
+            ),
+            "GradientBoostingRegressor": GradientBoostingRegressor(
+                learning_rate=0.1, random_state=seed
+            ),
+            "AdaBoostRegressor": AdaBoostRegressor(learning_rate=1.0, random_state=seed),
+            "MLPRegressor": MLPRegressor(hidden_layer_sizes=(100,), random_state=seed),
+        }
+
+        if model_name not in models:
+            self.logger.error(
+                f"Passed sklearn model {model_name} is not valid. Defaulting to KNeighborsRegressor"
+            )
+            return KNeighborsRegressor()
+
+        return models[model_name]
+
+    async def fit(
         self,
-        split_date_delta:
-        perform_backtest:
-    ) ->
+        split_date_delta: str | None = "48h",
+        perform_backtest: bool | None = False,
+    ) -> tuple[pd.DataFrame, pd.DataFrame]:
         r"""The fit method to train the ML model.
 
         :param split_date_delta: The delta from now to `split_date_delta` that will be used \
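`_get_sklearn_model` centralizes model selection in a dict registry seeded for reproducibility (`seed = 42`), logging an error and falling back to `KNeighborsRegressor` for unknown names. The pattern in isolation, with a trimmed model set and a plain logger:

```python
# Minimal standalone version of the registry-with-fallback lookup above.
import logging

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor

logger = logging.getLogger(__name__)

def get_sklearn_model(model_name: str):
    models = {
        "LinearRegression": LinearRegression(),
        "RidgeRegression": Ridge(),
        "KNeighborsRegressor": KNeighborsRegressor(),
    }
    if model_name not in models:
        # Unknown names degrade gracefully instead of raising.
        logger.error(f"Passed sklearn model {model_name} is not valid. "
                     "Defaulting to KNeighborsRegressor")
        return KNeighborsRegressor()
    return models[model_name]

print(type(get_sklearn_model("RidgeRegression")).__name__)  # Ridge
print(type(get_sklearn_model("NotAModel")).__name__)        # KNeighborsRegressor
```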
@@ -134,100 +195,133 @@ class MLForecaster:
         :return: The DataFrame containing the forecast data results without and with backtest
         :rtype: Tuple[pd.DataFrame, pd.DataFrame]
         """
-
-
-
-
-
-
-
-
-
-
-
-        self.data_exo.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self.
-
-
-
-
-
-
-        self.forecaster = ForecasterRecursive(
-            regressor = base_model,
-            lags = self.num_lags
-        )
-        # Fit and time it
-        self.logger.info("Training a " + self.sklearn_model + " model")
-        start_time = time.time()
-        self.forecaster.fit(
-            y=self.data_train[self.var_model],
-            exog=self.data_train.drop(self.var_model, axis=1),
-        )
-        self.logger.info(f"Elapsed time for model fit: {time.time() - start_time}")
-        # Make a prediction to print metrics
-        predictions = self.forecaster.predict(
-            steps=self.steps, exog=self.data_test.drop(self.var_model, axis=1)
-        )
-        pred_metric = r2_score(self.data_test[self.var_model], predictions)
-        self.logger.info(
-            f"Prediction R2 score of fitted model on test data: {pred_metric}"
-        )
-        # Packing results in a DataFrame
-        df_pred = pd.DataFrame(
-            index=self.data_exo.index, columns=["train", "test", "pred"]
-        )
-        df_pred["train"] = self.data_train[self.var_model]
-        df_pred["test"] = self.data_test[self.var_model]
-        df_pred["pred"] = predictions
-        df_pred_backtest = None
-        if perform_backtest is True:
-            # Using backtesting tool to evaluate the model
-            self.logger.info("Performing simple backtesting of fitted model")
+        try:
+            self.logger.info("Performing a forecast model fit for " + self.model_type)
+
+            # Check if variable exists in data
+            if self.var_model not in self.data.columns:
+                raise KeyError(
+                    f"Variable '{self.var_model}' not found in data columns: {list(self.data.columns)}"
+                )
+
+            # Preparing the data: adding exogenous features
+            self.data_exo = pd.DataFrame(index=self.data.index)
+            self.data_exo = utils.add_date_features(self.data_exo)
+            self.data_exo[self.var_model] = self.data[self.var_model]
+
+            self.data_exo = await self.interpolate_async(self.data_exo)
+
+            # train/test split
+            self.date_train = (
+                self.data_exo.index[-1] - pd.Timedelta("5days") + self.data_exo.index.freq
+            )  # The last 5 days
+            self.date_split = (
+                self.data_exo.index[-1] - pd.Timedelta(split_date_delta) + self.data_exo.index.freq
+            )  # The last 48h
+            self.data_train = self.data_exo.loc[: self.date_split - self.data_exo.index.freq, :]
+            self.data_test = self.data_exo.loc[self.date_split :, :]
+            self.steps = len(self.data_test)
+
+            # Pick correct sklearn model
+            base_model = self._get_sklearn_model(self.sklearn_model)
+
+            # Define the forecaster object
+            self.forecaster = ForecasterRecursive(estimator=base_model, lags=self.num_lags)
+
+            # Fit and time it
+            self.logger.info("Training a " + self.sklearn_model + " model")
             start_time = time.time()
-
-
-
-
-
-
-            refit = False
+
+            await asyncio.to_thread(
+                self.forecaster.fit,
+                y=self.data_train[self.var_model],
+                exog=self.data_train.drop(self.var_model, axis=1),
+                store_in_sample_residuals=True,
             )
-
-
-
-
-
-
-
-
+
+            fit_time = time.time() - start_time
+            self.logger.info(f"Elapsed time for model fit: {fit_time}")
+
+            # Make a prediction to print metrics
+            predictions = await asyncio.to_thread(
+                self.forecaster.predict,
+                steps=self.steps,
+                exog=self.data_test.drop(self.var_model, axis=1),
             )
-
-
-            df_pred_backtest = pd.DataFrame(
-                index=self.data_exo.index, columns=["train", "pred"]
+            pred_metric = await asyncio.to_thread(
+                r2_score, self.data_test[self.var_model], predictions
             )
-
-
-
+            self.logger.info(f"Prediction R2 score of fitted model on test data: {pred_metric}")
+
+            # Packing results in a DataFrame
+            df_pred = pd.DataFrame(index=self.data_exo.index, columns=["train", "test", "pred"])
+
+            df_pred["train"] = self.data_train[self.var_model]
+            df_pred["test"] = self.data_test[self.var_model]
+            df_pred["pred"] = predictions
+
+            df_pred_backtest = None
+
+            if perform_backtest is True:
+                # Using backtesting tool to evaluate the model
+                self.logger.info("Performing simple backtesting of fitted model")
+                start_time = time.time()
+                cv = TimeSeriesFold(
+                    steps=self.num_lags,
+                    initial_train_size=None,
+                    fixed_train_size=False,
+                    gap=0,
+                    allow_incomplete_fold=True,
+                    refit=False,
+                )
+
+                metric, predictions_backtest = await asyncio.to_thread(
+                    backtesting_forecaster,
+                    forecaster=self.forecaster,
+                    y=self.data_train[self.var_model],
+                    exog=self.data_train.drop(self.var_model, axis=1),
+                    cv=cv,
+                    metric=MLForecaster.neg_r2_score,
+                    verbose=False,
+                    show_progress=True,
+                )
+
+                backtest_time = time.time() - start_time
+                backtest_r2 = -metric
+                self.logger.info(f"Elapsed backtesting time: {backtest_time}")
+                self.logger.info(f"Backtest R2 score: {backtest_r2}")
+                df_pred_backtest = pd.DataFrame(
+                    index=self.data_exo.index, columns=["train", "pred"]
+                )
+                df_pred_backtest["train"] = self.data_exo[self.var_model]
+                # Handle skforecast 0.18.0+ DataFrame output with fold column
+                if isinstance(predictions_backtest, pd.DataFrame):
+                    # Extract the 'pred' column from the DataFrame
+                    pred_values = (
+                        predictions_backtest["pred"]
+                        if "pred" in predictions_backtest.columns
+                        else predictions_backtest.iloc[:, -1]
+                    )
+                else:
+                    # If it's a Series, use it directly
+                    pred_values = predictions_backtest
+
+                # Use loc to align indices properly - only assign where indices match
+                df_pred_backtest.loc[pred_values.index, "pred"] = pred_values
+
+            return df_pred, df_pred_backtest
 
-
+        except asyncio.CancelledError:
+            self.logger.info("Model training was cancelled")
+            raise
+        except Exception as e:
+            self.logger.error(f"Error during model fitting: {e}")
+            raise
+
+    async def predict(
+        self,
+        data_last_window: pd.DataFrame | None = None,
+    ) -> pd.Series:
         """The predict method to generate forecasts from a previously fitted ML model.
 
         :param data_last_window: The data that will be used to generate the new forecast, this \
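Every blocking skforecast/scikit-learn call in the rewritten `fit` is routed through `asyncio.to_thread`, so a long training run no longer stalls the event loop serving the web UI. The pattern in isolation (standard library only):

```python
# The offloading pattern used throughout the new fit()/predict()/tune():
# asyncio.to_thread runs a blocking callable in a worker thread and awaits it.
import asyncio
import time

def blocking_fit(seconds: float) -> str:
    time.sleep(seconds)  # stand-in for forecaster.fit(...)
    return f"fitted in {seconds}s"

async def main() -> None:
    # The event loop stays free while the "fit" runs in a worker thread,
    # and the surrounding task remains cancellable (CancelledError is
    # re-raised, as in the new except clauses above).
    result = await asyncio.to_thread(blocking_fit, 0.5)
    print(result)

asyncio.run(main())
```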
@@ -238,168 +332,327 @@ class MLForecaster:
         :return: A pandas series containing the generated forecasts.
         :rtype: pd.Series
         """
-
-
-
-
-
-
-
-        )
-        if self.is_tuned:
-            exog = MLForecaster.generate_exog(
-                data_last_window, self.lags_opt, self.var_model
-            )
-            predictions = self.forecaster.predict(
-                steps=self.lags_opt,
-                last_window=data_last_window[self.var_model],
-                exog=exog.drop(self.var_model, axis=1),
-            )
-        else:
-            exog = MLForecaster.generate_exog(
-                data_last_window, self.num_lags, self.var_model
-            )
-            predictions = self.forecaster.predict(
+        try:
+            if self.forecaster is None:
+                raise ValueError("Model has not been fitted yet. Call fit() first.")
+
+            if data_last_window is None:
+                predictions = await asyncio.to_thread(
+                    self.forecaster.predict,
                     steps=self.num_lags,
-
-                exog=exog.drop(self.var_model, axis=1),
+                    exog=self.data_test.drop(self.var_model, axis=1),
                 )
-
+            else:
+                data_last_window = await self.interpolate_async(data_last_window)
+
+                if self.is_tuned:
+                    exog = await self.generate_exog(data_last_window, self.lags_opt, self.var_model)
+
+                    predictions = await asyncio.to_thread(
+                        self.forecaster.predict,
+                        steps=self.lags_opt,
+                        last_window=data_last_window[self.var_model],
+                        exog=exog.drop(self.var_model, axis=1),
+                    )
+                else:
+                    exog = await self.generate_exog(data_last_window, self.num_lags, self.var_model)
+
+                    predictions = await asyncio.to_thread(
+                        self.forecaster.predict,
+                        steps=self.num_lags,
+                        last_window=data_last_window[self.var_model],
+                        exog=exog.drop(self.var_model, axis=1),
+                    )
+
+            return predictions
+
+        except asyncio.CancelledError:
+            self.logger.info("Prediction was cancelled")
+            raise
+        except Exception as e:
+            self.logger.error(f"Error during prediction: {e}")
+            raise
 
-    def
+    def _get_search_space(self, debug: bool, lags_list: list[int] | None = None):
+        """Get the hyperparameter search space for the given model.
+
+        :param debug: If True, use simplified search space for faster testing
+        :type debug: bool
+        :param lags_list: List of lag values to use. If None, uses default values
+        :type lags_list: list[int] | None
+        """
+        if lags_list is None:
+            lags_list = [6, 12, 24, 36, 48, 60, 72]
+
+        debug_lags = [3]
+
+        def get_lags(trial):
+            return trial.suggest_categorical("lags", debug_lags if debug else lags_list)
+
+        def svr_search_space(trial):
+            # Base SVR parameters
+            search = {
+                "C": trial.suggest_float("C", 0.1, 1.0)
+                if debug
+                else trial.suggest_float("C", 1e-2, 100.0, log=True),
+                "epsilon": trial.suggest_float("epsilon", 0.01, 1.0),
+                "kernel": trial.suggest_categorical("kernel", ["linear", "rbf"]),
+                "gamma": trial.suggest_categorical(
+                    "gamma", ["scale", "auto", 0.01, 0.1, 1.0, 10.0]
+                ),
+                "lags": get_lags(trial),
+            }
+            return search
+
+        # Registry of search space generators
+        search_spaces = {
+            "LinearRegression": lambda trial: {
+                "fit_intercept": trial.suggest_categorical(
+                    "fit_intercept", [True] if debug else [True, False]
+                ),
+                "lags": get_lags(trial),
+            },
+            "RidgeRegression": lambda trial: {
+                "alpha": trial.suggest_float("alpha", 0.1, 1.0)
+                if debug
+                else trial.suggest_float("alpha", 1e-4, 100.0, log=True),
+                "lags": get_lags(trial),
+            },
+            "LassoRegression": lambda trial: {
+                "alpha": trial.suggest_float("alpha", 0.1, 1.0)
+                if debug
+                else trial.suggest_float("alpha", 1e-4, 100.0, log=True),
+                "lags": get_lags(trial),
+            },
+            "ElasticNet": lambda trial: {
+                "alpha": trial.suggest_float("alpha", 0.0, 2.0),
+                "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),
+                "selection": trial.suggest_categorical(
+                    "selection", ["random"] if debug else ["cyclic", "random"]
+                ),
+                "lags": get_lags(trial),
+            },
+            "KNeighborsRegressor": lambda trial: {
+                "n_neighbors": trial.suggest_int("n_neighbors", 2, 2)
+                if debug
+                else trial.suggest_int("n_neighbors", 2, 20),
+                "leaf_size": trial.suggest_int("leaf_size", 20, 20)
+                if debug
+                else trial.suggest_int("leaf_size", 20, 40),
+                "weights": trial.suggest_categorical(
+                    "weights", ["uniform"] if debug else ["uniform", "distance"]
+                ),
+                "lags": get_lags(trial),
+            },
+            "DecisionTreeRegressor": lambda trial: {
+                "max_depth": trial.suggest_int("max_depth", 2, 5)
+                if debug
+                else trial.suggest_int("max_depth", 2, 20),
+                "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
+                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
+                "lags": get_lags(trial),
+            },
+            "SVR": svr_search_space,
+            "RandomForestRegressor": lambda trial: {
+                "n_estimators": trial.suggest_int("n_estimators", 10, 20)
+                if debug
+                else trial.suggest_int("n_estimators", 50, 300),
+                "max_depth": trial.suggest_int("max_depth", 3, 20),
+                "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
+                "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
+                "lags": get_lags(trial),
+            },
+            "ExtraTreesRegressor": lambda trial: {
+                "n_estimators": trial.suggest_int("n_estimators", 10, 20)
+                if debug
+                else trial.suggest_int("n_estimators", 50, 300),
+                "max_depth": trial.suggest_int("max_depth", 3, 20),
+                "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
+                "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
+                "lags": get_lags(trial),
+            },
+            "GradientBoostingRegressor": lambda trial: {
+                "n_estimators": trial.suggest_int("n_estimators", 10, 20)
+                if debug
+                else trial.suggest_int("n_estimators", 50, 300),
+                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5),
+                "max_depth": trial.suggest_int("max_depth", 3, 10),
+                "lags": get_lags(trial),
+            },
+            "AdaBoostRegressor": lambda trial: {
+                "n_estimators": trial.suggest_int("n_estimators", 10, 20)
+                if debug
+                else trial.suggest_int("n_estimators", 50, 300),
+                "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0),
+                "lags": get_lags(trial),
+            },
+            "MLPRegressor": lambda trial: {
+                "learning_rate_init": trial.suggest_float("learning_rate_init", 0.001, 0.01),
+                "hidden_layer_sizes": trial.suggest_categorical(
+                    "hidden_layer_sizes", [(50,), (100,), (50, 50)]
+                ),
+                "activation": trial.suggest_categorical("activation", ["relu", "tanh"]),
+                "alpha": trial.suggest_float("alpha", 1e-5, 1e-1, log=True),
+                "lags": get_lags(trial),
+            },
+        }
+
+        if self.sklearn_model not in search_spaces:
+            raise ValueError(f"Unsupported model for tuning: {self.sklearn_model}")
+
+        return search_spaces[self.sklearn_model]
+
+    async def tune(
+        self,
+        split_date_delta: str | None = "48h",
+        n_trials: int = 10,
+        debug: bool | None = False,
+    ) -> pd.DataFrame:
         """Tuning a previously fitted model using bayesian optimization.
 
+        :param split_date_delta: The delta from now to `split_date_delta` that will be used \
+            as the test period to evaluate the model, defaults to '48h'.\
+            This define the training/validation split for the tuning process.
+        :type split_date_delta: Optional[str], optional
         :param debug: Set to True for testing and faster optimizations, defaults to False
         :type debug: Optional[bool], optional
+        :param n_trials: Number of trials for bayesian optimization, defaults to 10
+        :type n_trials: Optional[int], optional
         :return: The DataFrame with the forecasts using the optimized model.
         :rtype: pd.DataFrame
         """
-
-
-
+        try:
+            if self.forecaster is None:
+                raise ValueError("Model has not been fitted yet. Call fit() first.")
 
-
-
-
-
-
-
-            }
-            return search_space
-        else:
+            # Calculate appropriate lags based on data frequency
+            freq_timedelta = pd.Timedelta(self.data_exo.index.freq)
+            lags_list = MLForecaster.get_lags_list_from_frequency(freq_timedelta)
+            self.logger.info(
+                f"Using lags list based on data frequency ({self.data_exo.index.freq}): {lags_list}"
+            )
 
-
-
-                "fit_intercept": trial.suggest_categorical(
-                    "fit_intercept", [True, False]
-                ),
-                "lags": trial.suggest_categorical(
-                    "lags", [6, 12, 24, 36, 48, 60, 72]
-                ),
-            }
-            return search_space
-        elif self.sklearn_model == "ElasticNet":
-            if debug:
+            # Get the search space for this model
+            search_space = self._get_search_space(debug, lags_list)
 
-
-
-
-
-            }
-            return search_space
+            # Bayesian search hyperparameter and lags with skforecast/optuna
+            if debug:
+                refit = False
+                num_lags = 3
             else:
+                refit = True
+                num_lags = self.num_lags
+            # The optimization routine call
+            self.logger.info("Bayesian hyperparameter optimization with backtesting")
+            start_time = time.time()
+
+            # Use the 'y' data that will be passed to the optimizer
+            data_to_tune = self.data_train[self.var_model]
 
-
-
-
-
-
-
-
-
-
-
-            }
-
-
+            # Calculate the new split date and initial_train_size based on the passed split_date_delta
+            try:
+                date_split = (
+                    data_to_tune.index[-1]
+                    - pd.Timedelta(split_date_delta)
+                    + data_to_tune.index.freq
+                )
+                initial_train_size = len(data_to_tune.loc[: date_split - data_to_tune.index.freq])
+            except (ValueError, TypeError):
+                self.logger.warning(
+                    f"Invalid split_date_delta: {split_date_delta}. Falling back to 5 days."
+                )
+                date_split = (
+                    data_to_tune.index[-1] - pd.Timedelta("5days") + data_to_tune.index.freq
+                )
+                initial_train_size = len(data_to_tune.loc[: date_split - data_to_tune.index.freq])
+
+            # Check if the calculated initial_train_size is valid
+            window_size = num_lags  # This is what skforecast will use as window_size
             if debug:
+                window_size = 3  # Match debug lags
 
-
-
-
-
-
-
-
+            if initial_train_size <= window_size:
+                self.logger.warning(
+                    f"Calculated initial_train_size ({initial_train_size}) is <= window_size ({window_size})."
+                )
+                self.logger.warning(
+                    "This is likely because split_date_delta is too large for the dataset."
+                )
+                MIN_SAMPLES_FOR_KNN = 6
+                new_train_size = window_size + MIN_SAMPLES_FOR_KNN
+                self.logger.warning(
+                    f"Adjusting initial_train_size to {new_train_size} to attempt recovery."
+                )
+                initial_train_size = new_train_size
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        df_pred_opt
-
-
-
-
-
-
-
+            cv = TimeSeriesFold(
+                steps=num_lags,
+                initial_train_size=initial_train_size,
+                fixed_train_size=True,
+                gap=0,
+                skip_folds=None,
+                allow_incomplete_fold=True,
+                refit=refit,
+            )
+
+            (
+                self.optimize_results,
+                self.optimize_results_object,
+            ) = await asyncio.to_thread(
+                bayesian_search_forecaster,
+                forecaster=self.forecaster,
+                y=self.data_train[self.var_model],
+                exog=self.data_train.drop(self.var_model, axis=1),
+                cv=cv,
+                search_space=search_space,
+                metric=MLForecaster.neg_r2_score,
+                n_trials=n_trials,
+                random_state=123,
+                return_best=True,
+            )
+
+            optimization_time = time.time() - start_time
+            self.logger.info(f"Elapsed time: {optimization_time}")
+
+            self.is_tuned = True
+
+            predictions_opt = await asyncio.to_thread(
+                self.forecaster.predict,
+                steps=self.num_lags,
+                exog=self.data_test.drop(self.var_model, axis=1),
+            )
+
+            freq_hours = self.data_exo.index.freq.delta.seconds / 3600
+            self.lags_opt = int(np.round(len(self.optimize_results.iloc[0]["lags"])))
+            self.days_needed = int(np.round(self.lags_opt * freq_hours / 24))
+
+            df_pred_opt = pd.DataFrame(
+                index=self.data_exo.index, columns=["train", "test", "pred_optim"]
+            )
+            df_pred_opt["train"] = self.data_train[self.var_model]
+            df_pred_opt["test"] = self.data_test[self.var_model]
+            df_pred_opt["pred_optim"] = predictions_opt
+
+            pred_optim_metric_train = -self.optimize_results.iloc[0]["neg_r2_score"]
+            self.logger.info(
+                f"R2 score for optimized prediction in train period: {pred_optim_metric_train}"
+            )
+
+            pred_optim_metric_test = await asyncio.to_thread(
+                r2_score,
+                df_pred_opt.loc[predictions_opt.index, "test"],
+                df_pred_opt.loc[predictions_opt.index, "pred_optim"],
+            )
+            self.logger.info(
+                f"R2 score for optimized prediction in test period: {pred_optim_metric_test}"
+            )
+            self.logger.info("Number of optimal lags obtained: " + str(self.lags_opt))
+
+            return df_pred_opt
+
+        except asyncio.CancelledError:
+            self.logger.info("Model tuning was cancelled")
+            raise
+        except Exception as e:
+            self.logger.error(f"Error during model tuning: {e}")
+            raise