emhass 0.10.6__py3-none-any.whl → 0.15.5__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,44 +1,61 @@
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
-
+ import asyncio
  import logging
- import copy
  import time
- from typing import Optional, Tuple
- import pandas as pd
- import numpy as np
+ import warnings

- from sklearn.linear_model import LinearRegression
- from sklearn.linear_model import ElasticNet
- from sklearn.neighbors import KNeighborsRegressor
+ import numpy as np
+ import pandas as pd
+ from skforecast.model_selection import (
+ TimeSeriesFold,
+ backtesting_forecaster,
+ bayesian_search_forecaster,
+ )
+ from skforecast.recursive import ForecasterRecursive
+ from sklearn.ensemble import (
+ AdaBoostRegressor,
+ ExtraTreesRegressor,
+ GradientBoostingRegressor,
+ RandomForestRegressor,
+ )
+ from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
  from sklearn.metrics import r2_score
+ from sklearn.neighbors import KNeighborsRegressor
+ from sklearn.neural_network import MLPRegressor
+ from sklearn.svm import SVR
+ from sklearn.tree import DecisionTreeRegressor

- from skforecast.ForecasterAutoreg import ForecasterAutoreg
- from skforecast.model_selection import bayesian_search_forecaster
- from skforecast.model_selection import backtesting_forecaster
+ from emhass import utils

- import warnings
  warnings.filterwarnings("ignore", category=DeprecationWarning)

+
  class MLForecaster:
  r"""
  A forecaster class using machine learning models with auto-regressive approach and features\
  based on timestamp information (hour, day, week, etc).
-
+
  This class uses the `skforecast` module and the machine learning models are from `scikit-learn`.
-
+
  It exposes three main methods:
-
+
  - `fit`: to train a model with the passed data.
-
+
  - `predict`: to obtain a forecast from a pre-trained model.
-
- - `tune`: to optimize the models hyperparameters using bayesian optimization.
-
+
+ - `tune`: to optimize the models hyperparameters using bayesian optimization.
+
  """

- def __init__(self, data: pd.DataFrame, model_type: str, var_model: str, sklearn_model: str,
- num_lags: int, emhass_conf: dict, logger: logging.Logger) -> None:
+ def __init__(
+ self,
+ data: pd.DataFrame,
+ model_type: str,
+ var_model: str,
+ sklearn_model: str,
+ num_lags: int,
+ emhass_conf: dict,
+ logger: logging.Logger,
+ ) -> None:
  r"""Define constructor for the forecast class.

  :param data: The data that will be used for train/test
@@ -69,47 +86,104 @@ class MLForecaster:
  self.emhass_conf = emhass_conf
  self.logger = logger
  self.is_tuned = False
+ self.forecaster: ForecasterRecursive | None = None
+ self.optimize_results: pd.DataFrame | None = None
+ self.optimize_results_object = None
+
  # A quick data preparation
+ self._prepare_data()
+
+ def _prepare_data(self):
+ """Prepare the input data by cleaning and sorting."""
  self.data.index = pd.to_datetime(self.data.index)
  self.data.sort_index(inplace=True)
- self.data = self.data[~self.data.index.duplicated(keep='first')]
-
- @staticmethod
- def add_date_features(data: pd.DataFrame) -> pd.DataFrame:
- """Add date features from the input DataFrame timestamp
-
- :param data: The input DataFrame
- :type data: pd.DataFrame
- :return: The DataFrame with the added features
- :rtype: pd.DataFrame
- """
- df = copy.deepcopy(data)
- df['year'] = [i.year for i in df.index]
- df['month'] = [i.month for i in df.index]
- df['day_of_week'] = [i.dayofweek for i in df.index]
- df['day_of_year'] = [i.dayofyear for i in df.index]
- df['day'] = [i.day for i in df.index]
- df['hour'] = [i.hour for i in df.index]
- return df
+ self.data = self.data[~self.data.index.duplicated(keep="first")]

  @staticmethod
  def neg_r2_score(y_true, y_pred):
  """The negative of the r2 score."""
  return -r2_score(y_true, y_pred)
-
+
+ @staticmethod
+ async def interpolate_async(data: pd.DataFrame) -> pd.DataFrame:
+ """Interpolate missing values asynchronously."""
+ return await asyncio.to_thread(data.interpolate, method="linear", axis=0, limit=None)
+
  @staticmethod
- def generate_exog(data_last_window, periods, var_name):
+ def get_lags_list_from_frequency(freq: pd.Timedelta) -> list[int]:
+ """Calculate appropriate lag values based on data frequency.
+
+ The lags represent different time horizons (6h, 12h, 1d, 1.5d, 2d, 2.5d, 3d).
+ This method scales these horizons according to the actual data frequency.
+
+ :param freq: The frequency of the data as a pandas Timedelta
+ :type freq: pd.Timedelta
+ :return: A list of lag values appropriate for the data frequency
+ :rtype: list[int]
+ """
+ # Define target time horizons in hours
+ target_horizons_hours = [6, 12, 24, 36, 48, 60, 72]
+
+ # Calculate frequency in hours
+ freq_hours = freq.total_seconds() / 3600
+
+ # Calculate lags for each horizon
+ lags = [int(round(horizon / freq_hours)) for horizon in target_horizons_hours]
+
+ # Remove duplicates and ensure minimum value of 1
+ lags = sorted({max(1, lag) for lag in lags})
+
+ return lags
+
+ @staticmethod
+ async def generate_exog(data_last_window, periods, var_name):
  """Generate the exogenous data for future timestamps."""
- forecast_dates = pd.date_range(start=data_last_window.index[-1]+data_last_window.index.freq,
- periods=periods,
- freq=data_last_window.index.freq)
- exog = pd.DataFrame({var_name:[np.nan]*periods},
- index=forecast_dates)
- exog = MLForecaster.add_date_features(exog)
+ forecast_dates = pd.date_range(
+ start=data_last_window.index[-1] + data_last_window.index.freq,
+ periods=periods,
+ freq=data_last_window.index.freq,
+ )
+ exog = pd.DataFrame({var_name: [np.nan] * periods}, index=forecast_dates)
+ exog = utils.add_date_features(exog)
  return exog
-
- def fit(self, split_date_delta: Optional[str] = '48h', perform_backtest: Optional[bool] = False
- ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+
+ def _get_sklearn_model(self, model_name: str):
+ """Get the sklearn model instance based on the model name."""
+ seed = 42
+ models = {
+ "LinearRegression": LinearRegression(),
+ "RidgeRegression": Ridge(),
+ "LassoRegression": Lasso(random_state=seed),
+ "ElasticNet": ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=seed),
+ "KNeighborsRegressor": KNeighborsRegressor(),
+ "DecisionTreeRegressor": DecisionTreeRegressor(ccp_alpha=0.0, random_state=seed),
+ "SVR": SVR(),
+ "RandomForestRegressor": RandomForestRegressor(
+ min_samples_leaf=1, max_features=1.0, random_state=seed
+ ),
+ "ExtraTreesRegressor": ExtraTreesRegressor(
+ min_samples_leaf=1, max_features=1.0, random_state=seed
+ ),
+ "GradientBoostingRegressor": GradientBoostingRegressor(
+ learning_rate=0.1, random_state=seed
+ ),
+ "AdaBoostRegressor": AdaBoostRegressor(learning_rate=1.0, random_state=seed),
+ "MLPRegressor": MLPRegressor(hidden_layer_sizes=(100,), random_state=seed),
+ }
+
+ if model_name not in models:
+ self.logger.error(
+ f"Passed sklearn model {model_name} is not valid. Defaulting to KNeighborsRegressor"
+ )
+ return KNeighborsRegressor()
+
+ return models[model_name]
+
+ async def fit(
+ self,
+ split_date_delta: str | None = "48h",
+ perform_backtest: bool | None = False,
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
  r"""The fit method to train the ML model.

  :param split_date_delta: The delta from now to `split_date_delta` that will be used \
@@ -121,72 +195,133 @@ class MLForecaster:
  :return: The DataFrame containing the forecast data results without and with backtest
  :rtype: Tuple[pd.DataFrame, pd.DataFrame]
  """
- self.logger.info("Performing a forecast model fit for "+self.model_type)
- # Preparing the data: adding exogenous features
- self.data_exo = pd.DataFrame(index=self.data.index)
- self.data_exo = MLForecaster.add_date_features(self.data_exo)
- self.data_exo[self.var_model] = self.data[self.var_model]
- self.data_exo = self.data_exo.interpolate(method='linear', axis=0, limit=None)
- # train/test split
- self.date_train = self.data_exo.index[-1]-pd.Timedelta('5days')+self.data_exo.index.freq # The last 5 days
- self.date_split = self.data_exo.index[-1]-pd.Timedelta(split_date_delta)+self.data_exo.index.freq # The last 48h
- self.data_train = self.data_exo.loc[:self.date_split-self.data_exo.index.freq,:]
- self.data_test = self.data_exo.loc[self.date_split:,:]
- self.steps = len(self.data_test)
- # Pick correct sklearn model
- if self.sklearn_model == 'LinearRegression':
- base_model = LinearRegression()
- elif self.sklearn_model == 'ElasticNet':
- base_model = ElasticNet()
- elif self.sklearn_model == 'KNeighborsRegressor':
- base_model = KNeighborsRegressor()
- else:
- self.logger.error("Passed sklearn model "+self.sklearn_model+" is not valid")
- # Define the forecaster object
- self.forecaster = ForecasterAutoreg(
- regressor = base_model,
- lags = self.num_lags
- )
- # Fit and time it
- self.logger.info("Training a "+self.sklearn_model+" model")
- start_time = time.time()
- self.forecaster.fit(y=self.data_train[self.var_model],
- exog=self.data_train.drop(self.var_model, axis=1))
- self.logger.info(f"Elapsed time for model fit: {time.time() - start_time}")
- # Make a prediction to print metrics
- predictions = self.forecaster.predict(steps=self.steps, exog=self.data_test.drop(self.var_model, axis=1))
- pred_metric = r2_score(self.data_test[self.var_model],predictions)
- self.logger.info(f"Prediction R2 score of fitted model on test data: {pred_metric}")
- # Packing results in a DataFrame
- df_pred = pd.DataFrame(index=self.data_exo.index,columns=['train','test','pred'])
- df_pred['train'] = self.data_train[self.var_model]
- df_pred['test'] = self.data_test[self.var_model]
- df_pred['pred'] = predictions
- df_pred_backtest = None
- if perform_backtest is True:
- # Using backtesting tool to evaluate the model
- self.logger.info("Performing simple backtesting of fitted model")
+ try:
+ self.logger.info("Performing a forecast model fit for " + self.model_type)
+
+ # Check if variable exists in data
+ if self.var_model not in self.data.columns:
+ raise KeyError(
+ f"Variable '{self.var_model}' not found in data columns: {list(self.data.columns)}"
+ )
+
+ # Preparing the data: adding exogenous features
+ self.data_exo = pd.DataFrame(index=self.data.index)
+ self.data_exo = utils.add_date_features(self.data_exo)
+ self.data_exo[self.var_model] = self.data[self.var_model]
+
+ self.data_exo = await self.interpolate_async(self.data_exo)
+
+ # train/test split
+ self.date_train = (
+ self.data_exo.index[-1] - pd.Timedelta("5days") + self.data_exo.index.freq
+ ) # The last 5 days
+ self.date_split = (
+ self.data_exo.index[-1] - pd.Timedelta(split_date_delta) + self.data_exo.index.freq
+ ) # The last 48h
+ self.data_train = self.data_exo.loc[: self.date_split - self.data_exo.index.freq, :]
+ self.data_test = self.data_exo.loc[self.date_split :, :]
+ self.steps = len(self.data_test)
+
+ # Pick correct sklearn model
+ base_model = self._get_sklearn_model(self.sklearn_model)
+
+ # Define the forecaster object
+ self.forecaster = ForecasterRecursive(estimator=base_model, lags=self.num_lags)
+
+ # Fit and time it
+ self.logger.info("Training a " + self.sklearn_model + " model")
  start_time = time.time()
- metric, predictions_backtest = backtesting_forecaster(
- forecaster = self.forecaster,
- y = self.data_train[self.var_model],
- exog = self.data_train.drop(self.var_model, axis=1),
- initial_train_size = None,
- fixed_train_size = False,
- steps = self.num_lags,
- metric = MLForecaster.neg_r2_score,
- refit = False,
- verbose = False
+
+ await asyncio.to_thread(
+ self.forecaster.fit,
+ y=self.data_train[self.var_model],
+ exog=self.data_train.drop(self.var_model, axis=1),
+ store_in_sample_residuals=True,
  )
- self.logger.info(f"Elapsed backtesting time: {time.time() - start_time}")
- self.logger.info(f"Backtest R2 score: {-metric}")
- df_pred_backtest = pd.DataFrame(index=self.data_exo.index,columns=['train','pred'])
- df_pred_backtest['train'] = self.data_exo[self.var_model]
- df_pred_backtest['pred'] = predictions_backtest
- return df_pred, df_pred_backtest
-
- def predict(self, data_last_window: Optional[pd.DataFrame] = None
- ) -> pd.Series:
+
+ fit_time = time.time() - start_time
+ self.logger.info(f"Elapsed time for model fit: {fit_time}")
+
+ # Make a prediction to print metrics
+ predictions = await asyncio.to_thread(
+ self.forecaster.predict,
+ steps=self.steps,
+ exog=self.data_test.drop(self.var_model, axis=1),
+ )
+ pred_metric = await asyncio.to_thread(
+ r2_score, self.data_test[self.var_model], predictions
+ )
+ self.logger.info(f"Prediction R2 score of fitted model on test data: {pred_metric}")
+
+ # Packing results in a DataFrame
+ df_pred = pd.DataFrame(index=self.data_exo.index, columns=["train", "test", "pred"])
+
+ df_pred["train"] = self.data_train[self.var_model]
+ df_pred["test"] = self.data_test[self.var_model]
+ df_pred["pred"] = predictions
+
+ df_pred_backtest = None
+
+ if perform_backtest is True:
+ # Using backtesting tool to evaluate the model
+ self.logger.info("Performing simple backtesting of fitted model")
+ start_time = time.time()
+ cv = TimeSeriesFold(
+ steps=self.num_lags,
+ initial_train_size=None,
+ fixed_train_size=False,
+ gap=0,
+ allow_incomplete_fold=True,
+ refit=False,
+ )
+
+ metric, predictions_backtest = await asyncio.to_thread(
+ backtesting_forecaster,
+ forecaster=self.forecaster,
+ y=self.data_train[self.var_model],
+ exog=self.data_train.drop(self.var_model, axis=1),
+ cv=cv,
+ metric=MLForecaster.neg_r2_score,
+ verbose=False,
+ show_progress=True,
+ )
+
+ backtest_time = time.time() - start_time
+ backtest_r2 = -metric
+ self.logger.info(f"Elapsed backtesting time: {backtest_time}")
+ self.logger.info(f"Backtest R2 score: {backtest_r2}")
+ df_pred_backtest = pd.DataFrame(
+ index=self.data_exo.index, columns=["train", "pred"]
+ )
+ df_pred_backtest["train"] = self.data_exo[self.var_model]
+ # Handle skforecast 0.18.0+ DataFrame output with fold column
+ if isinstance(predictions_backtest, pd.DataFrame):
+ # Extract the 'pred' column from the DataFrame
+ pred_values = (
+ predictions_backtest["pred"]
+ if "pred" in predictions_backtest.columns
+ else predictions_backtest.iloc[:, -1]
+ )
+ else:
+ # If it's a Series, use it directly
+ pred_values = predictions_backtest
+
+ # Use loc to align indices properly - only assign where indices match
+ df_pred_backtest.loc[pred_values.index, "pred"] = pred_values
+
+ return df_pred, df_pred_backtest
+
+ except asyncio.CancelledError:
+ self.logger.info("Model training was cancelled")
+ raise
+ except Exception as e:
+ self.logger.error(f"Error during model fitting: {e}")
+ raise
+
+ async def predict(
+ self,
+ data_last_window: pd.DataFrame | None = None,
+ ) -> pd.Series:
  """The predict method to generate forecasts from a previously fitted ML model.

  :param data_last_window: The data that will be used to generate the new forecast, this \
@@ -197,109 +332,327 @@ class MLForecaster:
  :return: A pandas series containing the generated forecasts.
  :rtype: pd.Series
  """
- if data_last_window is None:
- predictions = self.forecaster.predict(steps=self.num_lags, exog=self.data_test.drop(self.var_model, axis=1))
- else:
- data_last_window = data_last_window.interpolate(method='linear', axis=0, limit=None)
- if self.is_tuned:
- exog = MLForecaster.generate_exog(data_last_window, self.lags_opt, self.var_model)
- predictions = self.forecaster.predict(steps=self.lags_opt,
- last_window=data_last_window[self.var_model],
- exog=exog.drop(self.var_model, axis=1))
+ try:
+ if self.forecaster is None:
+ raise ValueError("Model has not been fitted yet. Call fit() first.")
+
+ if data_last_window is None:
+ predictions = await asyncio.to_thread(
+ self.forecaster.predict,
+ steps=self.num_lags,
+ exog=self.data_test.drop(self.var_model, axis=1),
+ )
  else:
- exog = MLForecaster.generate_exog(data_last_window, self.num_lags, self.var_model)
- predictions = self.forecaster.predict(steps=self.num_lags,
- last_window=data_last_window[self.var_model],
- exog=exog.drop(self.var_model, axis=1))
- return predictions
-
- def tune(self, debug: Optional[bool] = False) -> pd.DataFrame:
+ data_last_window = await self.interpolate_async(data_last_window)
+
+ if self.is_tuned:
+ exog = await self.generate_exog(data_last_window, self.lags_opt, self.var_model)
+
+ predictions = await asyncio.to_thread(
+ self.forecaster.predict,
+ steps=self.lags_opt,
+ last_window=data_last_window[self.var_model],
+ exog=exog.drop(self.var_model, axis=1),
+ )
+ else:
+ exog = await self.generate_exog(data_last_window, self.num_lags, self.var_model)
+
+ predictions = await asyncio.to_thread(
+ self.forecaster.predict,
+ steps=self.num_lags,
+ last_window=data_last_window[self.var_model],
+ exog=exog.drop(self.var_model, axis=1),
+ )
+
+ return predictions
+
+ except asyncio.CancelledError:
+ self.logger.info("Prediction was cancelled")
+ raise
+ except Exception as e:
+ self.logger.error(f"Error during prediction: {e}")
+ raise
+
+ def _get_search_space(self, debug: bool, lags_list: list[int] | None = None):
+ """Get the hyperparameter search space for the given model.
+
+ :param debug: If True, use simplified search space for faster testing
+ :type debug: bool
+ :param lags_list: List of lag values to use. If None, uses default values
+ :type lags_list: list[int] | None
+ """
+ if lags_list is None:
+ lags_list = [6, 12, 24, 36, 48, 60, 72]
+
+ debug_lags = [3]
+
+ def get_lags(trial):
+ return trial.suggest_categorical("lags", debug_lags if debug else lags_list)
+
+ def svr_search_space(trial):
+ # Base SVR parameters
+ search = {
+ "C": trial.suggest_float("C", 0.1, 1.0)
+ if debug
+ else trial.suggest_float("C", 1e-2, 100.0, log=True),
+ "epsilon": trial.suggest_float("epsilon", 0.01, 1.0),
+ "kernel": trial.suggest_categorical("kernel", ["linear", "rbf"]),
+ "gamma": trial.suggest_categorical(
+ "gamma", ["scale", "auto", 0.01, 0.1, 1.0, 10.0]
+ ),
+ "lags": get_lags(trial),
+ }
+ return search
+
+ # Registry of search space generators
+ search_spaces = {
+ "LinearRegression": lambda trial: {
+ "fit_intercept": trial.suggest_categorical(
+ "fit_intercept", [True] if debug else [True, False]
+ ),
+ "lags": get_lags(trial),
+ },
+ "RidgeRegression": lambda trial: {
+ "alpha": trial.suggest_float("alpha", 0.1, 1.0)
+ if debug
+ else trial.suggest_float("alpha", 1e-4, 100.0, log=True),
+ "lags": get_lags(trial),
+ },
+ "LassoRegression": lambda trial: {
+ "alpha": trial.suggest_float("alpha", 0.1, 1.0)
+ if debug
+ else trial.suggest_float("alpha", 1e-4, 100.0, log=True),
+ "lags": get_lags(trial),
+ },
+ "ElasticNet": lambda trial: {
+ "alpha": trial.suggest_float("alpha", 0.0, 2.0),
+ "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),
+ "selection": trial.suggest_categorical(
+ "selection", ["random"] if debug else ["cyclic", "random"]
+ ),
+ "lags": get_lags(trial),
+ },
+ "KNeighborsRegressor": lambda trial: {
+ "n_neighbors": trial.suggest_int("n_neighbors", 2, 2)
+ if debug
+ else trial.suggest_int("n_neighbors", 2, 20),
+ "leaf_size": trial.suggest_int("leaf_size", 20, 20)
+ if debug
+ else trial.suggest_int("leaf_size", 20, 40),
+ "weights": trial.suggest_categorical(
+ "weights", ["uniform"] if debug else ["uniform", "distance"]
+ ),
+ "lags": get_lags(trial),
+ },
+ "DecisionTreeRegressor": lambda trial: {
+ "max_depth": trial.suggest_int("max_depth", 2, 5)
+ if debug
+ else trial.suggest_int("max_depth", 2, 20),
+ "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
+ "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
+ "lags": get_lags(trial),
+ },
+ "SVR": svr_search_space,
+ "RandomForestRegressor": lambda trial: {
+ "n_estimators": trial.suggest_int("n_estimators", 10, 20)
+ if debug
+ else trial.suggest_int("n_estimators", 50, 300),
+ "max_depth": trial.suggest_int("max_depth", 3, 20),
+ "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
+ "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
+ "lags": get_lags(trial),
+ },
+ "ExtraTreesRegressor": lambda trial: {
+ "n_estimators": trial.suggest_int("n_estimators", 10, 20)
+ if debug
+ else trial.suggest_int("n_estimators", 50, 300),
+ "max_depth": trial.suggest_int("max_depth", 3, 20),
+ "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
+ "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
+ "lags": get_lags(trial),
+ },
+ "GradientBoostingRegressor": lambda trial: {
+ "n_estimators": trial.suggest_int("n_estimators", 10, 20)
+ if debug
+ else trial.suggest_int("n_estimators", 50, 300),
+ "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5),
+ "max_depth": trial.suggest_int("max_depth", 3, 10),
+ "lags": get_lags(trial),
+ },
+ "AdaBoostRegressor": lambda trial: {
+ "n_estimators": trial.suggest_int("n_estimators", 10, 20)
+ if debug
+ else trial.suggest_int("n_estimators", 50, 300),
+ "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0),
+ "lags": get_lags(trial),
+ },
+ "MLPRegressor": lambda trial: {
+ "learning_rate_init": trial.suggest_float("learning_rate_init", 0.001, 0.01),
+ "hidden_layer_sizes": trial.suggest_categorical(
+ "hidden_layer_sizes", [(50,), (100,), (50, 50)]
+ ),
+ "activation": trial.suggest_categorical("activation", ["relu", "tanh"]),
+ "alpha": trial.suggest_float("alpha", 1e-5, 1e-1, log=True),
+ "lags": get_lags(trial),
+ },
+ }
+
+ if self.sklearn_model not in search_spaces:
+ raise ValueError(f"Unsupported model for tuning: {self.sklearn_model}")
+
+ return search_spaces[self.sklearn_model]
+
+ async def tune(
+ self,
+ split_date_delta: str | None = "48h",
+ n_trials: int = 10,
+ debug: bool | None = False,
+ ) -> pd.DataFrame:
  """Tuning a previously fitted model using bayesian optimization.

+ :param split_date_delta: The delta from now to `split_date_delta` that will be used \
+ as the test period to evaluate the model, defaults to '48h'.\
+ This define the training/validation split for the tuning process.
+ :type split_date_delta: Optional[str], optional
  :param debug: Set to True for testing and faster optimizations, defaults to False
  :type debug: Optional[bool], optional
+ :param n_trials: Number of trials for bayesian optimization, defaults to 10
+ :type n_trials: Optional[int], optional
  :return: The DataFrame with the forecasts using the optimized model.
  :rtype: pd.DataFrame
  """
- # Regressor hyperparameters search space
- if self.sklearn_model == 'LinearRegression':
- if debug:
- def search_space(trial):
- search_space = {'fit_intercept': trial.suggest_categorical('fit_intercept', [True]),
- 'lags': trial.suggest_categorical('lags', [3])}
- return search_space
- else:
- def search_space(trial):
- search_space = {'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]),
- 'lags': trial.suggest_categorical('lags', [6, 12, 24, 36, 48, 60, 72])}
- return search_space
- elif self.sklearn_model == 'ElasticNet':
+ try:
+ if self.forecaster is None:
+ raise ValueError("Model has not been fitted yet. Call fit() first.")
+
+ # Calculate appropriate lags based on data frequency
+ freq_timedelta = pd.Timedelta(self.data_exo.index.freq)
+ lags_list = MLForecaster.get_lags_list_from_frequency(freq_timedelta)
+ self.logger.info(
+ f"Using lags list based on data frequency ({self.data_exo.index.freq}): {lags_list}"
+ )
+
+ # Get the search space for this model
+ search_space = self._get_search_space(debug, lags_list)
+
+ # Bayesian search hyperparameter and lags with skforecast/optuna
  if debug:
- def search_space(trial):
- search_space = {'selection': trial.suggest_categorical('selection', ['random']),
- 'lags': trial.suggest_categorical('lags', [3])}
- return search_space
+ refit = False
+ num_lags = 3
  else:
- def search_space(trial):
- search_space = {'alpha': trial.suggest_float('alpha', 0.0, 2.0),
- 'l1_ratio': trial.suggest_float('l1_ratio', 0.0, 1.0),
- 'selection': trial.suggest_categorical('selection', ['cyclic', 'random']),
- 'lags': trial.suggest_categorical('lags', [6, 12, 24, 36, 48, 60, 72])}
- return search_space
- elif self.sklearn_model == 'KNeighborsRegressor':
+ refit = True
+ num_lags = self.num_lags
+ # The optimization routine call
+ self.logger.info("Bayesian hyperparameter optimization with backtesting")
+ start_time = time.time()
+
+ # Use the 'y' data that will be passed to the optimizer
+ data_to_tune = self.data_train[self.var_model]
+
+ # Calculate the new split date and initial_train_size based on the passed split_date_delta
+ try:
+ date_split = (
+ data_to_tune.index[-1]
+ - pd.Timedelta(split_date_delta)
+ + data_to_tune.index.freq
+ )
+ initial_train_size = len(data_to_tune.loc[: date_split - data_to_tune.index.freq])
+ except (ValueError, TypeError):
+ self.logger.warning(
+ f"Invalid split_date_delta: {split_date_delta}. Falling back to 5 days."
+ )
+ date_split = (
+ data_to_tune.index[-1] - pd.Timedelta("5days") + data_to_tune.index.freq
+ )
+ initial_train_size = len(data_to_tune.loc[: date_split - data_to_tune.index.freq])
+
+ # Check if the calculated initial_train_size is valid
+ window_size = num_lags # This is what skforecast will use as window_size
  if debug:
- def search_space(trial):
- search_space = {'weights': trial.suggest_categorical('weights', ['uniform']),
- 'lags': trial.suggest_categorical('lags', [3])}
- return search_space
- else:
- def search_space(trial):
- search_space = {'n_neighbors': trial.suggest_int('n_neighbors', 2, 20),
- 'leaf_size': trial.suggest_int('leaf_size', 20, 40),
- 'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
- 'lags': trial.suggest_categorical('lags', [6, 12, 24, 36, 48, 60, 72])}
- return search_space
- # Bayesian search hyperparameter and lags with skforecast/optuna
- # Lags used as predictors
- if debug:
- refit = False
- num_lags = 3
- else:
- refit = True
- num_lags = self.num_lags
- # The optimization routine call
- self.logger.info("Bayesian hyperparameter optimization with backtesting")
- start_time = time.time()
- self.optimize_results, self.optimize_results_object = bayesian_search_forecaster(
- forecaster = self.forecaster,
- y = self.data_train[self.var_model],
- exog = self.data_train.drop(self.var_model, axis=1),
- search_space = search_space,
- steps = num_lags,
- metric = MLForecaster.neg_r2_score,
- refit = refit,
- initial_train_size = len(self.data_exo.loc[:self.date_train]),
- fixed_train_size = True,
- n_trials = 10,
- random_state = 123,
- return_best = True,
- verbose = False,
- engine = 'optuna'
- )
- self.logger.info(f"Elapsed time: {time.time() - start_time}")
- self.is_tuned = True
- predictions_opt = self.forecaster.predict(steps=self.num_lags, exog=self.data_test.drop(self.var_model, axis=1))
- freq_hours = self.data_exo.index.freq.delta.seconds/3600
- self.lags_opt = int(np.round(len(self.optimize_results.iloc[0]['lags'])))
- self.days_needed = int(np.round(self.lags_opt*freq_hours/24))
- df_pred_opt = pd.DataFrame(index=self.data_exo.index,columns=['train','test','pred_optim'])
- df_pred_opt['train'] = self.data_train[self.var_model]
- df_pred_opt['test'] = self.data_test[self.var_model]
- df_pred_opt['pred_optim'] = predictions_opt
- pred_optim_metric_train = -self.optimize_results.iloc[0]['neg_r2_score']
- self.logger.info(f"R2 score for optimized prediction in train period: {pred_optim_metric_train}")
- pred_optim_metric_test = r2_score(df_pred_opt.loc[predictions_opt.index,'test'],
- df_pred_opt.loc[predictions_opt.index,'pred_optim'])
- self.logger.info(f"R2 score for optimized prediction in test period: {pred_optim_metric_test}")
- self.logger.info("Number of optimal lags obtained: "+str(self.lags_opt))
- return df_pred_opt
+ window_size = 3 # Match debug lags
+
+ if initial_train_size <= window_size:
+ self.logger.warning(
+ f"Calculated initial_train_size ({initial_train_size}) is <= window_size ({window_size})."
+ )
+ self.logger.warning(
+ "This is likely because split_date_delta is too large for the dataset."
+ )
+ MIN_SAMPLES_FOR_KNN = 6
+ new_train_size = window_size + MIN_SAMPLES_FOR_KNN
+ self.logger.warning(
+ f"Adjusting initial_train_size to {new_train_size} to attempt recovery."
+ )
+ initial_train_size = new_train_size
+
+ cv = TimeSeriesFold(
+ steps=num_lags,
+ initial_train_size=initial_train_size,
+ fixed_train_size=True,
+ gap=0,
+ skip_folds=None,
+ allow_incomplete_fold=True,
+ refit=refit,
+ )
+
+ (
+ self.optimize_results,
+ self.optimize_results_object,
+ ) = await asyncio.to_thread(
+ bayesian_search_forecaster,
+ forecaster=self.forecaster,
+ y=self.data_train[self.var_model],
+ exog=self.data_train.drop(self.var_model, axis=1),
+ cv=cv,
+ search_space=search_space,
+ metric=MLForecaster.neg_r2_score,
+ n_trials=n_trials,
+ random_state=123,
+ return_best=True,
+ )
+
+ optimization_time = time.time() - start_time
+ self.logger.info(f"Elapsed time: {optimization_time}")
+
+ self.is_tuned = True
+
+ predictions_opt = await asyncio.to_thread(
+ self.forecaster.predict,
+ steps=self.num_lags,
+ exog=self.data_test.drop(self.var_model, axis=1),
+ )
+
+ freq_hours = self.data_exo.index.freq.delta.seconds / 3600
+ self.lags_opt = int(np.round(len(self.optimize_results.iloc[0]["lags"])))
+ self.days_needed = int(np.round(self.lags_opt * freq_hours / 24))
+
+ df_pred_opt = pd.DataFrame(
+ index=self.data_exo.index, columns=["train", "test", "pred_optim"]
+ )
+ df_pred_opt["train"] = self.data_train[self.var_model]
+ df_pred_opt["test"] = self.data_test[self.var_model]
+ df_pred_opt["pred_optim"] = predictions_opt
+
+ pred_optim_metric_train = -self.optimize_results.iloc[0]["neg_r2_score"]
+ self.logger.info(
+ f"R2 score for optimized prediction in train period: {pred_optim_metric_train}"
+ )
+
+ pred_optim_metric_test = await asyncio.to_thread(
+ r2_score,
+ df_pred_opt.loc[predictions_opt.index, "test"],
+ df_pred_opt.loc[predictions_opt.index, "pred_optim"],
+ )
+ self.logger.info(
+ f"R2 score for optimized prediction in test period: {pred_optim_metric_test}"
+ )
+ self.logger.info("Number of optimal lags obtained: " + str(self.lags_opt))
+
+ return df_pred_opt
+
+ except asyncio.CancelledError:
+ self.logger.info("Model tuning was cancelled")
+ raise
+ except Exception as e:
+ self.logger.error(f"Error during model tuning: {e}")
+ raise
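
For context, the diff above turns fit, predict, and tune into coroutines, so callers now have to await them from an event loop. Below is a minimal usage sketch based only on the constructor and method signatures shown in the diff; the module path, input file, frequency, and column name are assumptions for illustration, not part of the diff.

# Illustrative sketch only: driving the async MLForecaster shown in the diff above.
# The module path, CSV file, frequency, and sensor/column names are hypothetical.
import asyncio
import logging

import pandas as pd

from emhass.machine_learning_forecaster import MLForecaster  # module path assumed


async def main() -> None:
    logger = logging.getLogger("emhass")
    # Hypothetical load data with a regular DatetimeIndex (fit relies on index.freq)
    data = pd.read_csv("power_load.csv", index_col=0, parse_dates=True).asfreq("30min")
    mlf = MLForecaster(
        data=data,
        model_type="load_forecast",
        var_model="sensor_power_load",  # must be a column of `data`
        sklearn_model="KNeighborsRegressor",
        num_lags=48,
        emhass_conf={},
        logger=logger,
    )
    # fit/predict/tune are coroutines in 0.15.5, so they are awaited
    df_pred, df_pred_backtest = await mlf.fit(split_date_delta="48h", perform_backtest=True)
    forecast = await mlf.predict()
    print(forecast.head())


if __name__ == "__main__":
    asyncio.run(main())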