emhass 0.11.4__py3-none-any.whl → 0.15.5__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.
@@ -1,22 +1,30 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
+import asyncio
 import logging
-import copy
 import time
-from typing import Optional, Tuple
-import pandas as pd
-import numpy as np
-
-from sklearn.linear_model import LinearRegression
-from sklearn.linear_model import ElasticNet
-from sklearn.neighbors import KNeighborsRegressor
-from sklearn.metrics import r2_score
+import warnings
 
+import numpy as np
+import pandas as pd
+from skforecast.model_selection import (
+    TimeSeriesFold,
+    backtesting_forecaster,
+    bayesian_search_forecaster,
+)
 from skforecast.recursive import ForecasterRecursive
-from skforecast.model_selection import bayesian_search_forecaster, backtesting_forecaster, TimeSeriesFold
+from sklearn.ensemble import (
+    AdaBoostRegressor,
+    ExtraTreesRegressor,
+    GradientBoostingRegressor,
+    RandomForestRegressor,
+)
+from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
+from sklearn.metrics import r2_score
+from sklearn.neighbors import KNeighborsRegressor
+from sklearn.neural_network import MLPRegressor
+from sklearn.svm import SVR
+from sklearn.tree import DecisionTreeRegressor
 
-import warnings
+from emhass import utils
 
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 
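Note: the `from emhass import utils` line added above replaces this module's own date-feature helper (the `add_date_features` static method removed in a later hunk). As a hedged sketch, the feature set involved looks like the following vectorized restatement of the removed method; the actual `utils.add_date_features` implementation is not shown in this diff:

    import pandas as pd

    def add_date_features(df: pd.DataFrame) -> pd.DataFrame:
        # Timestamp-derived features, mirroring the removed static method
        # (which built the same columns with list comprehensions).
        out = df.copy()
        out["year"] = out.index.year
        out["month"] = out.index.month
        out["day_of_week"] = out.index.dayofweek
        out["day_of_year"] = out.index.dayofyear
        out["day"] = out.index.day
        out["hour"] = out.index.hour
        return out

    idx = pd.date_range("2024-01-01", periods=4, freq="30min")
    print(add_date_features(pd.DataFrame(index=idx)))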
@@ -25,17 +33,17 @@ class MLForecaster:
     r"""
     A forecaster class using machine learning models with auto-regressive approach and features\
     based on timestamp information (hour, day, week, etc).
-
+
     This class uses the `skforecast` module and the machine learning models are from `scikit-learn`.
-
+
     It exposes three main methods:
-
+
     - `fit`: to train a model with the passed data.
-
+
     - `predict`: to obtain a forecast from a pre-trained model.
-
-    - `tune`: to optimize the models hyperparameters using bayesian optimization.
-
+
+    - `tune`: to optimize the models hyperparameters using bayesian optimization.
+
     """
 
     def __init__(
@@ -78,36 +86,57 @@ class MLForecaster:
         self.emhass_conf = emhass_conf
         self.logger = logger
         self.is_tuned = False
+        self.forecaster: ForecasterRecursive | None = None
+        self.optimize_results: pd.DataFrame | None = None
+        self.optimize_results_object = None
+
         # A quick data preparation
+        self._prepare_data()
+
+    def _prepare_data(self):
+        """Prepare the input data by cleaning and sorting."""
         self.data.index = pd.to_datetime(self.data.index)
         self.data.sort_index(inplace=True)
         self.data = self.data[~self.data.index.duplicated(keep="first")]
 
-    @staticmethod
-    def add_date_features(data: pd.DataFrame) -> pd.DataFrame:
-        """Add date features from the input DataFrame timestamp
-
-        :param data: The input DataFrame
-        :type data: pd.DataFrame
-        :return: The DataFrame with the added features
-        :rtype: pd.DataFrame
-        """
-        df = copy.deepcopy(data)
-        df["year"] = [i.year for i in df.index]
-        df["month"] = [i.month for i in df.index]
-        df["day_of_week"] = [i.dayofweek for i in df.index]
-        df["day_of_year"] = [i.dayofyear for i in df.index]
-        df["day"] = [i.day for i in df.index]
-        df["hour"] = [i.hour for i in df.index]
-        return df
-
     @staticmethod
     def neg_r2_score(y_true, y_pred):
         """The negative of the r2 score."""
         return -r2_score(y_true, y_pred)
 
     @staticmethod
-    def generate_exog(data_last_window, periods, var_name):
+    async def interpolate_async(data: pd.DataFrame) -> pd.DataFrame:
+        """Interpolate missing values asynchronously."""
+        return await asyncio.to_thread(data.interpolate, method="linear", axis=0, limit=None)
+
+    @staticmethod
+    def get_lags_list_from_frequency(freq: pd.Timedelta) -> list[int]:
+        """Calculate appropriate lag values based on data frequency.
+
+        The lags represent different time horizons (6h, 12h, 1d, 1.5d, 2d, 2.5d, 3d).
+        This method scales these horizons according to the actual data frequency.
+
+        :param freq: The frequency of the data as a pandas Timedelta
+        :type freq: pd.Timedelta
+        :return: A list of lag values appropriate for the data frequency
+        :rtype: list[int]
+        """
+        # Define target time horizons in hours
+        target_horizons_hours = [6, 12, 24, 36, 48, 60, 72]
+
+        # Calculate frequency in hours
+        freq_hours = freq.total_seconds() / 3600
+
+        # Calculate lags for each horizon
+        lags = [int(round(horizon / freq_hours)) for horizon in target_horizons_hours]
+
+        # Remove duplicates and ensure minimum value of 1
+        lags = sorted({max(1, lag) for lag in lags})
+
+        return lags
+
+    @staticmethod
+    async def generate_exog(data_last_window, periods, var_name):
         """Generate the exogenous data for future timestamps."""
         forecast_dates = pd.date_range(
             start=data_last_window.index[-1] + data_last_window.index.freq,
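The new `get_lags_list_from_frequency` helper replaces a hard-coded hourly lag grid with one scaled to the data's sampling period. A standalone restatement of the same arithmetic shows what it yields at two common frequencies:

    import pandas as pd

    def lags_from_frequency(freq: pd.Timedelta) -> list[int]:
        # Horizons of 6h, 12h, 1d, 1.5d, 2d, 2.5d and 3d, expressed as a
        # number of steps at the given sampling frequency.
        target_horizons_hours = [6, 12, 24, 36, 48, 60, 72]
        freq_hours = freq.total_seconds() / 3600
        lags = [int(round(h / freq_hours)) for h in target_horizons_hours]
        return sorted({max(1, lag) for lag in lags})

    print(lags_from_frequency(pd.Timedelta("30min")))  # [12, 24, 48, 72, 96, 120, 144]
    print(lags_from_frequency(pd.Timedelta("1h")))     # [6, 12, 24, 36, 48, 60, 72]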
@@ -115,14 +144,46 @@ class MLForecaster:
             freq=data_last_window.index.freq,
         )
         exog = pd.DataFrame({var_name: [np.nan] * periods}, index=forecast_dates)
-        exog = MLForecaster.add_date_features(exog)
+        exog = utils.add_date_features(exog)
         return exog
 
-    def fit(
+    def _get_sklearn_model(self, model_name: str):
+        """Get the sklearn model instance based on the model name."""
+        seed = 42
+        models = {
+            "LinearRegression": LinearRegression(),
+            "RidgeRegression": Ridge(),
+            "LassoRegression": Lasso(random_state=seed),
+            "ElasticNet": ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=seed),
+            "KNeighborsRegressor": KNeighborsRegressor(),
+            "DecisionTreeRegressor": DecisionTreeRegressor(ccp_alpha=0.0, random_state=seed),
+            "SVR": SVR(),
+            "RandomForestRegressor": RandomForestRegressor(
+                min_samples_leaf=1, max_features=1.0, random_state=seed
+            ),
+            "ExtraTreesRegressor": ExtraTreesRegressor(
+                min_samples_leaf=1, max_features=1.0, random_state=seed
+            ),
+            "GradientBoostingRegressor": GradientBoostingRegressor(
+                learning_rate=0.1, random_state=seed
+            ),
+            "AdaBoostRegressor": AdaBoostRegressor(learning_rate=1.0, random_state=seed),
+            "MLPRegressor": MLPRegressor(hidden_layer_sizes=(100,), random_state=seed),
+        }
+
+        if model_name not in models:
+            self.logger.error(
+                f"Passed sklearn model {model_name} is not valid. Defaulting to KNeighborsRegressor"
+            )
+            return KNeighborsRegressor()
+
+        return models[model_name]
+
+    async def fit(
         self,
-        split_date_delta: Optional[str] = "48h",
-        perform_backtest: Optional[bool] = False,
-    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        split_date_delta: str | None = "48h",
+        perform_backtest: bool | None = False,
+    ) -> tuple[pd.DataFrame, pd.DataFrame]:
         r"""The fit method to train the ML model.
 
         :param split_date_delta: The delta from now to `split_date_delta` that will be used \
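Since `fit` is now a coroutine (as are `predict` and `tune` below), synchronous callers need an event loop. A hedged usage sketch, assuming an already-constructed `MLForecaster` instance named `mlf` (constructor arguments elided):

    import asyncio

    async def train_and_forecast(mlf):
        # fit() returns the fit-period and optional backtest DataFrames;
        # predict() returns a pd.Series of forecasts.
        df_pred, df_pred_backtest = await mlf.fit(
            split_date_delta="48h", perform_backtest=True
        )
        return await mlf.predict()

    # From synchronous code:
    # forecast = asyncio.run(train_and_forecast(mlf))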
@@ -134,100 +195,133 @@ class MLForecaster:
         :return: The DataFrame containing the forecast data results without and with backtest
         :rtype: Tuple[pd.DataFrame, pd.DataFrame]
         """
-        self.logger.info("Performing a forecast model fit for " + self.model_type)
-        # Preparing the data: adding exogenous features
-        self.data_exo = pd.DataFrame(index=self.data.index)
-        self.data_exo = MLForecaster.add_date_features(self.data_exo)
-        self.data_exo[self.var_model] = self.data[self.var_model]
-        self.data_exo = self.data_exo.interpolate(method="linear", axis=0, limit=None)
-        # train/test split
-        self.date_train = (
-            self.data_exo.index[-1] - pd.Timedelta("5days") + self.data_exo.index.freq
-        )  # The last 5 days
-        self.date_split = (
-            self.data_exo.index[-1]
-            - pd.Timedelta(split_date_delta)
-            + self.data_exo.index.freq
-        )  # The last 48h
-        self.data_train = self.data_exo.loc[
-            : self.date_split - self.data_exo.index.freq, :
-        ]
-        self.data_test = self.data_exo.loc[self.date_split :, :]
-        self.steps = len(self.data_test)
-        # Pick correct sklearn model
-        if self.sklearn_model == "LinearRegression":
-            base_model = LinearRegression()
-        elif self.sklearn_model == "ElasticNet":
-            base_model = ElasticNet()
-        elif self.sklearn_model == "KNeighborsRegressor":
-            base_model = KNeighborsRegressor()
-        else:
-            self.logger.error(
-                "Passed sklearn model "
-                + self.sklearn_model
-                + " is not valid. Defaulting to KNeighborsRegressor"
-            )
-            base_model = KNeighborsRegressor()
-        # Define the forecaster object
-        self.forecaster = ForecasterRecursive(
-            regressor = base_model,
-            lags = self.num_lags
-        )
-        # Fit and time it
-        self.logger.info("Training a " + self.sklearn_model + " model")
-        start_time = time.time()
-        self.forecaster.fit(
-            y=self.data_train[self.var_model],
-            exog=self.data_train.drop(self.var_model, axis=1),
-        )
-        self.logger.info(f"Elapsed time for model fit: {time.time() - start_time}")
-        # Make a prediction to print metrics
-        predictions = self.forecaster.predict(
-            steps=self.steps, exog=self.data_test.drop(self.var_model, axis=1)
-        )
-        pred_metric = r2_score(self.data_test[self.var_model], predictions)
-        self.logger.info(
-            f"Prediction R2 score of fitted model on test data: {pred_metric}"
-        )
-        # Packing results in a DataFrame
-        df_pred = pd.DataFrame(
-            index=self.data_exo.index, columns=["train", "test", "pred"]
-        )
-        df_pred["train"] = self.data_train[self.var_model]
-        df_pred["test"] = self.data_test[self.var_model]
-        df_pred["pred"] = predictions
-        df_pred_backtest = None
-        if perform_backtest is True:
-            # Using backtesting tool to evaluate the model
-            self.logger.info("Performing simple backtesting of fitted model")
+        try:
+            self.logger.info("Performing a forecast model fit for " + self.model_type)
+
+            # Check if variable exists in data
+            if self.var_model not in self.data.columns:
+                raise KeyError(
+                    f"Variable '{self.var_model}' not found in data columns: {list(self.data.columns)}"
+                )
+
+            # Preparing the data: adding exogenous features
+            self.data_exo = pd.DataFrame(index=self.data.index)
+            self.data_exo = utils.add_date_features(self.data_exo)
+            self.data_exo[self.var_model] = self.data[self.var_model]
+
+            self.data_exo = await self.interpolate_async(self.data_exo)
+
+            # train/test split
+            self.date_train = (
+                self.data_exo.index[-1] - pd.Timedelta("5days") + self.data_exo.index.freq
+            )  # The last 5 days
+            self.date_split = (
+                self.data_exo.index[-1] - pd.Timedelta(split_date_delta) + self.data_exo.index.freq
+            )  # The last 48h
+            self.data_train = self.data_exo.loc[: self.date_split - self.data_exo.index.freq, :]
+            self.data_test = self.data_exo.loc[self.date_split :, :]
+            self.steps = len(self.data_test)
+
+            # Pick correct sklearn model
+            base_model = self._get_sklearn_model(self.sklearn_model)
+
+            # Define the forecaster object
+            self.forecaster = ForecasterRecursive(estimator=base_model, lags=self.num_lags)
+
+            # Fit and time it
+            self.logger.info("Training a " + self.sklearn_model + " model")
             start_time = time.time()
-            cv = TimeSeriesFold(
-                steps = self.num_lags,
-                initial_train_size = None,
-                fixed_train_size = False,
-                gap = 0,
-                allow_incomplete_fold = True,
-                refit = False
+
+            await asyncio.to_thread(
+                self.forecaster.fit,
+                y=self.data_train[self.var_model],
+                exog=self.data_train.drop(self.var_model, axis=1),
+                store_in_sample_residuals=True,
             )
-            metric, predictions_backtest = backtesting_forecaster(
-                forecaster = self.forecaster,
-                y = self.data_train[self.var_model],
-                exog = self.data_train.drop(self.var_model, axis=1),
-                cv = cv,
-                metric = MLForecaster.neg_r2_score,
-                verbose = False,
-                show_progress = True
+
+            fit_time = time.time() - start_time
+            self.logger.info(f"Elapsed time for model fit: {fit_time}")
+
+            # Make a prediction to print metrics
+            predictions = await asyncio.to_thread(
+                self.forecaster.predict,
+                steps=self.steps,
+                exog=self.data_test.drop(self.var_model, axis=1),
             )
-            self.logger.info(f"Elapsed backtesting time: {time.time() - start_time}")
-            self.logger.info(f"Backtest R2 score: {-metric}")
-            df_pred_backtest = pd.DataFrame(
-                index=self.data_exo.index, columns=["train", "pred"]
+            pred_metric = await asyncio.to_thread(
+                r2_score, self.data_test[self.var_model], predictions
             )
-            df_pred_backtest["train"] = self.data_exo[self.var_model]
-            df_pred_backtest["pred"] = predictions_backtest
-        return df_pred, df_pred_backtest
+            self.logger.info(f"Prediction R2 score of fitted model on test data: {pred_metric}")
+
+            # Packing results in a DataFrame
+            df_pred = pd.DataFrame(index=self.data_exo.index, columns=["train", "test", "pred"])
+
+            df_pred["train"] = self.data_train[self.var_model]
+            df_pred["test"] = self.data_test[self.var_model]
+            df_pred["pred"] = predictions
+
+            df_pred_backtest = None
+
+            if perform_backtest is True:
+                # Using backtesting tool to evaluate the model
+                self.logger.info("Performing simple backtesting of fitted model")
+                start_time = time.time()
+                cv = TimeSeriesFold(
+                    steps=self.num_lags,
+                    initial_train_size=None,
+                    fixed_train_size=False,
+                    gap=0,
+                    allow_incomplete_fold=True,
+                    refit=False,
+                )
+
+                metric, predictions_backtest = await asyncio.to_thread(
+                    backtesting_forecaster,
+                    forecaster=self.forecaster,
+                    y=self.data_train[self.var_model],
+                    exog=self.data_train.drop(self.var_model, axis=1),
+                    cv=cv,
+                    metric=MLForecaster.neg_r2_score,
+                    verbose=False,
+                    show_progress=True,
+                )
+
+                backtest_time = time.time() - start_time
+                backtest_r2 = -metric
+                self.logger.info(f"Elapsed backtesting time: {backtest_time}")
+                self.logger.info(f"Backtest R2 score: {backtest_r2}")
+                df_pred_backtest = pd.DataFrame(
+                    index=self.data_exo.index, columns=["train", "pred"]
+                )
+                df_pred_backtest["train"] = self.data_exo[self.var_model]
+                # Handle skforecast 0.18.0+ DataFrame output with fold column
+                if isinstance(predictions_backtest, pd.DataFrame):
+                    # Extract the 'pred' column from the DataFrame
+                    pred_values = (
+                        predictions_backtest["pred"]
+                        if "pred" in predictions_backtest.columns
+                        else predictions_backtest.iloc[:, -1]
+                    )
+                else:
+                    # If it's a Series, use it directly
+                    pred_values = predictions_backtest
+
+                # Use loc to align indices properly - only assign where indices match
+                df_pred_backtest.loc[pred_values.index, "pred"] = pred_values
+
+            return df_pred, df_pred_backtest
 
-    def predict(self, data_last_window: Optional[pd.DataFrame] = None) -> pd.Series:
+        except asyncio.CancelledError:
+            self.logger.info("Model training was cancelled")
+            raise
+        except Exception as e:
+            self.logger.error(f"Error during model fitting: {e}")
+            raise
+
+    async def predict(
+        self,
+        data_last_window: pd.DataFrame | None = None,
+    ) -> pd.Series:
         """The predict method to generate forecasts from a previously fitted ML model.
 
         :param data_last_window: The data that will be used to generate the new forecast, this \
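Throughout the rewritten `fit`, blocking skforecast and scikit-learn calls are wrapped in `asyncio.to_thread`, which runs them in a worker thread so the event loop stays responsive. A self-contained sketch of that pattern, with `time.sleep` standing in for the real training call:

    import asyncio
    import time

    def blocking_train() -> str:
        time.sleep(0.1)  # stand-in for forecaster.fit(...)
        return "fitted"

    async def main():
        # Runs in a worker thread; other tasks on the loop keep running.
        result = await asyncio.to_thread(blocking_train)
        print(result)

    asyncio.run(main())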
@@ -238,168 +332,327 @@ class MLForecaster:
         :return: A pandas series containing the generated forecasts.
         :rtype: pd.Series
         """
-        if data_last_window is None:
-            predictions = self.forecaster.predict(
-                steps=self.num_lags, exog=self.data_test.drop(self.var_model, axis=1)
-            )
-        else:
-            data_last_window = data_last_window.interpolate(
-                method="linear", axis=0, limit=None
-            )
-            if self.is_tuned:
-                exog = MLForecaster.generate_exog(
-                    data_last_window, self.lags_opt, self.var_model
-                )
-                predictions = self.forecaster.predict(
-                    steps=self.lags_opt,
-                    last_window=data_last_window[self.var_model],
-                    exog=exog.drop(self.var_model, axis=1),
-                )
-            else:
-                exog = MLForecaster.generate_exog(
-                    data_last_window, self.num_lags, self.var_model
-                )
-                predictions = self.forecaster.predict(
+        try:
+            if self.forecaster is None:
+                raise ValueError("Model has not been fitted yet. Call fit() first.")
+
+            if data_last_window is None:
+                predictions = await asyncio.to_thread(
+                    self.forecaster.predict,
                     steps=self.num_lags,
-                    last_window=data_last_window[self.var_model],
-                    exog=exog.drop(self.var_model, axis=1),
+                    exog=self.data_test.drop(self.var_model, axis=1),
                 )
-        return predictions
+            else:
+                data_last_window = await self.interpolate_async(data_last_window)
+
+                if self.is_tuned:
+                    exog = await self.generate_exog(data_last_window, self.lags_opt, self.var_model)
+
+                    predictions = await asyncio.to_thread(
+                        self.forecaster.predict,
+                        steps=self.lags_opt,
+                        last_window=data_last_window[self.var_model],
+                        exog=exog.drop(self.var_model, axis=1),
+                    )
+                else:
+                    exog = await self.generate_exog(data_last_window, self.num_lags, self.var_model)
+
+                    predictions = await asyncio.to_thread(
+                        self.forecaster.predict,
+                        steps=self.num_lags,
+                        last_window=data_last_window[self.var_model],
+                        exog=exog.drop(self.var_model, axis=1),
+                    )
+
+            return predictions
+
+        except asyncio.CancelledError:
+            self.logger.info("Prediction was cancelled")
+            raise
+        except Exception as e:
+            self.logger.error(f"Error during prediction: {e}")
+            raise
 
-    def tune(self, debug: Optional[bool] = False) -> pd.DataFrame:
+    def _get_search_space(self, debug: bool, lags_list: list[int] | None = None):
+        """Get the hyperparameter search space for the given model.
+
+        :param debug: If True, use simplified search space for faster testing
+        :type debug: bool
+        :param lags_list: List of lag values to use. If None, uses default values
+        :type lags_list: list[int] | None
+        """
+        if lags_list is None:
+            lags_list = [6, 12, 24, 36, 48, 60, 72]
+
+        debug_lags = [3]
+
+        def get_lags(trial):
+            return trial.suggest_categorical("lags", debug_lags if debug else lags_list)
+
+        def svr_search_space(trial):
+            # Base SVR parameters
+            search = {
+                "C": trial.suggest_float("C", 0.1, 1.0)
+                if debug
+                else trial.suggest_float("C", 1e-2, 100.0, log=True),
+                "epsilon": trial.suggest_float("epsilon", 0.01, 1.0),
+                "kernel": trial.suggest_categorical("kernel", ["linear", "rbf"]),
+                "gamma": trial.suggest_categorical(
+                    "gamma", ["scale", "auto", 0.01, 0.1, 1.0, 10.0]
+                ),
+                "lags": get_lags(trial),
+            }
+            return search
+
+        # Registry of search space generators
+        search_spaces = {
+            "LinearRegression": lambda trial: {
+                "fit_intercept": trial.suggest_categorical(
+                    "fit_intercept", [True] if debug else [True, False]
+                ),
+                "lags": get_lags(trial),
+            },
+            "RidgeRegression": lambda trial: {
+                "alpha": trial.suggest_float("alpha", 0.1, 1.0)
+                if debug
+                else trial.suggest_float("alpha", 1e-4, 100.0, log=True),
+                "lags": get_lags(trial),
+            },
+            "LassoRegression": lambda trial: {
+                "alpha": trial.suggest_float("alpha", 0.1, 1.0)
+                if debug
+                else trial.suggest_float("alpha", 1e-4, 100.0, log=True),
+                "lags": get_lags(trial),
+            },
+            "ElasticNet": lambda trial: {
+                "alpha": trial.suggest_float("alpha", 0.0, 2.0),
+                "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),
+                "selection": trial.suggest_categorical(
+                    "selection", ["random"] if debug else ["cyclic", "random"]
+                ),
+                "lags": get_lags(trial),
+            },
+            "KNeighborsRegressor": lambda trial: {
+                "n_neighbors": trial.suggest_int("n_neighbors", 2, 2)
+                if debug
+                else trial.suggest_int("n_neighbors", 2, 20),
+                "leaf_size": trial.suggest_int("leaf_size", 20, 20)
+                if debug
+                else trial.suggest_int("leaf_size", 20, 40),
+                "weights": trial.suggest_categorical(
+                    "weights", ["uniform"] if debug else ["uniform", "distance"]
+                ),
+                "lags": get_lags(trial),
+            },
+            "DecisionTreeRegressor": lambda trial: {
+                "max_depth": trial.suggest_int("max_depth", 2, 5)
+                if debug
+                else trial.suggest_int("max_depth", 2, 20),
+                "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
+                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
+                "lags": get_lags(trial),
+            },
+            "SVR": svr_search_space,
+            "RandomForestRegressor": lambda trial: {
+                "n_estimators": trial.suggest_int("n_estimators", 10, 20)
+                if debug
+                else trial.suggest_int("n_estimators", 50, 300),
+                "max_depth": trial.suggest_int("max_depth", 3, 20),
+                "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
+                "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
+                "lags": get_lags(trial),
+            },
+            "ExtraTreesRegressor": lambda trial: {
+                "n_estimators": trial.suggest_int("n_estimators", 10, 20)
+                if debug
+                else trial.suggest_int("n_estimators", 50, 300),
+                "max_depth": trial.suggest_int("max_depth", 3, 20),
+                "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
+                "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
+                "lags": get_lags(trial),
+            },
+            "GradientBoostingRegressor": lambda trial: {
+                "n_estimators": trial.suggest_int("n_estimators", 10, 20)
+                if debug
+                else trial.suggest_int("n_estimators", 50, 300),
+                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5),
+                "max_depth": trial.suggest_int("max_depth", 3, 10),
+                "lags": get_lags(trial),
+            },
+            "AdaBoostRegressor": lambda trial: {
+                "n_estimators": trial.suggest_int("n_estimators", 10, 20)
+                if debug
+                else trial.suggest_int("n_estimators", 50, 300),
+                "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0),
+                "lags": get_lags(trial),
+            },
+            "MLPRegressor": lambda trial: {
+                "learning_rate_init": trial.suggest_float("learning_rate_init", 0.001, 0.01),
+                "hidden_layer_sizes": trial.suggest_categorical(
+                    "hidden_layer_sizes", [(50,), (100,), (50, 50)]
+                ),
+                "activation": trial.suggest_categorical("activation", ["relu", "tanh"]),
+                "alpha": trial.suggest_float("alpha", 1e-5, 1e-1, log=True),
+                "lags": get_lags(trial),
+            },
+        }
+
+        if self.sklearn_model not in search_spaces:
+            raise ValueError(f"Unsupported model for tuning: {self.sklearn_model}")
+
+        return search_spaces[self.sklearn_model]
+
+    async def tune(
+        self,
+        split_date_delta: str | None = "48h",
+        n_trials: int = 10,
+        debug: bool | None = False,
+    ) -> pd.DataFrame:
         """Tuning a previously fitted model using bayesian optimization.
 
+        :param split_date_delta: The delta from now to `split_date_delta` that will be used \
+            as the test period to evaluate the model, defaults to '48h'.\
+            This define the training/validation split for the tuning process.
+        :type split_date_delta: Optional[str], optional
         :param debug: Set to True for testing and faster optimizations, defaults to False
         :type debug: Optional[bool], optional
+        :param n_trials: Number of trials for bayesian optimization, defaults to 10
+        :type n_trials: Optional[int], optional
         :return: The DataFrame with the forecasts using the optimized model.
         :rtype: pd.DataFrame
         """
-        # Regressor hyperparameters search space
-        if self.sklearn_model == "LinearRegression":
-            if debug:
+        try:
+            if self.forecaster is None:
+                raise ValueError("Model has not been fitted yet. Call fit() first.")
 
-                def search_space(trial):
-                    search_space = {
-                        "fit_intercept": trial.suggest_categorical(
-                            "fit_intercept", [True]
-                        ),
-                        "lags": trial.suggest_categorical("lags", [3]),
-                    }
-                    return search_space
-            else:
+            # Calculate appropriate lags based on data frequency
+            freq_timedelta = pd.Timedelta(self.data_exo.index.freq)
+            lags_list = MLForecaster.get_lags_list_from_frequency(freq_timedelta)
+            self.logger.info(
+                f"Using lags list based on data frequency ({self.data_exo.index.freq}): {lags_list}"
+            )
 
-                def search_space(trial):
-                    search_space = {
-                        "fit_intercept": trial.suggest_categorical(
-                            "fit_intercept", [True, False]
-                        ),
-                        "lags": trial.suggest_categorical(
-                            "lags", [6, 12, 24, 36, 48, 60, 72]
-                        ),
-                    }
-                    return search_space
-        elif self.sklearn_model == "ElasticNet":
-            if debug:
+            # Get the search space for this model
+            search_space = self._get_search_space(debug, lags_list)
 
-                def search_space(trial):
-                    search_space = {
-                        "selection": trial.suggest_categorical("selection", ["random"]),
-                        "lags": trial.suggest_categorical("lags", [3]),
-                    }
-                    return search_space
+            # Bayesian search hyperparameter and lags with skforecast/optuna
+            if debug:
+                refit = False
+                num_lags = 3
             else:
+                refit = True
+                num_lags = self.num_lags
+            # The optimization routine call
+            self.logger.info("Bayesian hyperparameter optimization with backtesting")
+            start_time = time.time()
+
+            # Use the 'y' data that will be passed to the optimizer
+            data_to_tune = self.data_train[self.var_model]
 
-                def search_space(trial):
-                    search_space = {
-                        "alpha": trial.suggest_float("alpha", 0.0, 2.0),
-                        "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),
-                        "selection": trial.suggest_categorical(
-                            "selection", ["cyclic", "random"]
-                        ),
-                        "lags": trial.suggest_categorical(
-                            "lags", [6, 12, 24, 36, 48, 60, 72]
-                        ),
-                    }
-                    return search_space
-        elif self.sklearn_model == "KNeighborsRegressor":
+            # Calculate the new split date and initial_train_size based on the passed split_date_delta
+            try:
+                date_split = (
+                    data_to_tune.index[-1]
+                    - pd.Timedelta(split_date_delta)
+                    + data_to_tune.index.freq
+                )
+                initial_train_size = len(data_to_tune.loc[: date_split - data_to_tune.index.freq])
+            except (ValueError, TypeError):
+                self.logger.warning(
+                    f"Invalid split_date_delta: {split_date_delta}. Falling back to 5 days."
+                )
+                date_split = (
+                    data_to_tune.index[-1] - pd.Timedelta("5days") + data_to_tune.index.freq
+                )
+                initial_train_size = len(data_to_tune.loc[: date_split - data_to_tune.index.freq])
+
+            # Check if the calculated initial_train_size is valid
+            window_size = num_lags  # This is what skforecast will use as window_size
             if debug:
+                window_size = 3  # Match debug lags
 
-                def search_space(trial):
-                    search_space = {
-                        "weights": trial.suggest_categorical("weights", ["uniform"]),
-                        "lags": trial.suggest_categorical("lags", [3]),
-                    }
-                    return search_space
-            else:
+            if initial_train_size <= window_size:
+                self.logger.warning(
+                    f"Calculated initial_train_size ({initial_train_size}) is <= window_size ({window_size})."
+                )
+                self.logger.warning(
+                    "This is likely because split_date_delta is too large for the dataset."
+                )
+                MIN_SAMPLES_FOR_KNN = 6
+                new_train_size = window_size + MIN_SAMPLES_FOR_KNN
+                self.logger.warning(
+                    f"Adjusting initial_train_size to {new_train_size} to attempt recovery."
+                )
+                initial_train_size = new_train_size
 
-                def search_space(trial):
-                    search_space = {
-                        "n_neighbors": trial.suggest_int("n_neighbors", 2, 20),
-                        "leaf_size": trial.suggest_int("leaf_size", 20, 40),
-                        "weights": trial.suggest_categorical(
-                            "weights", ["uniform", "distance"]
-                        ),
-                        "lags": trial.suggest_categorical(
-                            "lags", [6, 12, 24, 36, 48, 60, 72]
-                        ),
-                    }
-                    return search_space
-
-        # Bayesian search hyperparameter and lags with skforecast/optuna
-        # Lags used as predictors
-        if debug:
-            refit = False
-            num_lags = 3
-        else:
-            refit = True
-            num_lags = self.num_lags
-        # The optimization routine call
-        self.logger.info("Bayesian hyperparameter optimization with backtesting")
-        start_time = time.time()
-        cv = TimeSeriesFold(
-            steps = num_lags,
-            initial_train_size = len(self.data_exo.loc[:self.date_train]),
-            fixed_train_size = True,
-            gap = 0,
-            skip_folds = None,
-            allow_incomplete_fold = True,
-            refit = refit
-        )
-        self.optimize_results, self.optimize_results_object = bayesian_search_forecaster(
-            forecaster = self.forecaster,
-            y = self.data_train[self.var_model],
-            exog = self.data_train.drop(self.var_model, axis=1),
-            cv = cv,
-            search_space = search_space,
-            metric = MLForecaster.neg_r2_score,
-            n_trials = 10,
-            random_state = 123,
-            return_best = True
-        )
-        self.logger.info(f"Elapsed time: {time.time() - start_time}")
-        self.is_tuned = True
-        predictions_opt = self.forecaster.predict(
-            steps=self.num_lags, exog=self.data_test.drop(self.var_model, axis=1)
-        )
-        freq_hours = self.data_exo.index.freq.delta.seconds / 3600
-        self.lags_opt = int(np.round(len(self.optimize_results.iloc[0]["lags"])))
-        self.days_needed = int(np.round(self.lags_opt * freq_hours / 24))
-        df_pred_opt = pd.DataFrame(
-            index=self.data_exo.index, columns=["train", "test", "pred_optim"]
-        )
-        df_pred_opt["train"] = self.data_train[self.var_model]
-        df_pred_opt["test"] = self.data_test[self.var_model]
-        df_pred_opt["pred_optim"] = predictions_opt
-        pred_optim_metric_train = -self.optimize_results.iloc[0]["neg_r2_score"]
-        self.logger.info(
-            f"R2 score for optimized prediction in train period: {pred_optim_metric_train}"
-        )
-        pred_optim_metric_test = r2_score(
-            df_pred_opt.loc[predictions_opt.index, "test"],
-            df_pred_opt.loc[predictions_opt.index, "pred_optim"],
-        )
-        self.logger.info(
-            f"R2 score for optimized prediction in test period: {pred_optim_metric_test}"
-        )
-        self.logger.info("Number of optimal lags obtained: " + str(self.lags_opt))
-        return df_pred_opt
+            cv = TimeSeriesFold(
+                steps=num_lags,
+                initial_train_size=initial_train_size,
+                fixed_train_size=True,
+                gap=0,
+                skip_folds=None,
+                allow_incomplete_fold=True,
+                refit=refit,
+            )
+
+            (
+                self.optimize_results,
+                self.optimize_results_object,
+            ) = await asyncio.to_thread(
+                bayesian_search_forecaster,
+                forecaster=self.forecaster,
+                y=self.data_train[self.var_model],
+                exog=self.data_train.drop(self.var_model, axis=1),
+                cv=cv,
+                search_space=search_space,
+                metric=MLForecaster.neg_r2_score,
+                n_trials=n_trials,
+                random_state=123,
+                return_best=True,
+            )
+
+            optimization_time = time.time() - start_time
+            self.logger.info(f"Elapsed time: {optimization_time}")
+
+            self.is_tuned = True
+
+            predictions_opt = await asyncio.to_thread(
+                self.forecaster.predict,
+                steps=self.num_lags,
+                exog=self.data_test.drop(self.var_model, axis=1),
+            )
+
+            freq_hours = self.data_exo.index.freq.delta.seconds / 3600
+            self.lags_opt = int(np.round(len(self.optimize_results.iloc[0]["lags"])))
+            self.days_needed = int(np.round(self.lags_opt * freq_hours / 24))
+
+            df_pred_opt = pd.DataFrame(
+                index=self.data_exo.index, columns=["train", "test", "pred_optim"]
+            )
+            df_pred_opt["train"] = self.data_train[self.var_model]
+            df_pred_opt["test"] = self.data_test[self.var_model]
+            df_pred_opt["pred_optim"] = predictions_opt
+
+            pred_optim_metric_train = -self.optimize_results.iloc[0]["neg_r2_score"]
+            self.logger.info(
+                f"R2 score for optimized prediction in train period: {pred_optim_metric_train}"
+            )
+
+            pred_optim_metric_test = await asyncio.to_thread(
+                r2_score,
+                df_pred_opt.loc[predictions_opt.index, "test"],
+                df_pred_opt.loc[predictions_opt.index, "pred_optim"],
+            )
+            self.logger.info(
+                f"R2 score for optimized prediction in test period: {pred_optim_metric_test}"
+            )
+            self.logger.info("Number of optimal lags obtained: " + str(self.lags_opt))
+
+            return df_pred_opt
+
+        except asyncio.CancelledError:
+            self.logger.info("Model tuning was cancelled")
+            raise
+        except Exception as e:
+            self.logger.error(f"Error during model tuning: {e}")
+            raise
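The split arithmetic added to `tune` converts `split_date_delta` into an `initial_train_size` sample count before building the `TimeSeriesFold`. A worked standalone example of the same computation, on ten days of synthetic half-hourly data with the default '48h' delta:

    import pandas as pd

    idx = pd.date_range("2024-01-01", periods=480, freq="30min")  # 10 days
    y = pd.Series(range(len(idx)), index=idx)

    split_date_delta = "48h"
    date_split = idx[-1] - pd.Timedelta(split_date_delta) + idx.freq
    initial_train_size = len(y.loc[: date_split - idx.freq])
    print(initial_train_size)  # 384 -> first 8 days train, last 2 days evaluate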