emhass 0.12.4__py3-none-any.whl → 0.12.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
@@ -1,397 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- import copy
5
- import logging
6
- import time
7
- import warnings
8
- from typing import Optional, Tuple
9
-
10
- import numpy as np
11
- import pandas as pd
12
- from skforecast.ForecasterAutoreg import ForecasterAutoreg
13
- from skforecast.model_selection import (
14
- backtesting_forecaster,
15
- bayesian_search_forecaster,
16
- )
17
- from sklearn.linear_model import ElasticNet, LinearRegression
18
- from sklearn.metrics import r2_score
19
- from sklearn.neighbors import KNeighborsRegressor
20
-
21
- warnings.filterwarnings("ignore", category=DeprecationWarning)
22
-
23
-
24
class MLForecaster:
    r"""
    A forecaster class using machine learning models with auto-regressive approach and features\
    based on timestamp information (hour, day, week, etc).

    This class uses the `skforecast` module and the machine learning models are from `scikit-learn`.

    It exposes three main methods:

    - `fit`: to train a model with the passed data.

    - `predict`: to obtain a forecast from a pre-trained model.

    - `tune`: to optimize the models hyperparameters using bayesian optimization.

    """

    # (column name, timestamp attribute) pairs, in the order the columns are added.
    _DATE_FEATURES = (
        ("year", "year"),
        ("month", "month"),
        ("day_of_week", "dayofweek"),
        ("day_of_year", "dayofyear"),
        ("day", "day"),
        ("hour", "hour"),
    )
    # Model names understood by `fit` and `tune`; anything else falls back to
    # KNeighborsRegressor in both methods.
    _SUPPORTED_MODELS = ("LinearRegression", "ElasticNet", "KNeighborsRegressor")
    _FALLBACK_MODEL = "KNeighborsRegressor"
    # Candidate lag counts explored by `tune` (normal / debug runs).
    _LAGS_GRID = (6, 12, 24, 36, 48, 60, 72)
    _DEBUG_LAGS_GRID = (3,)

    def __init__(
        self,
        data: pd.DataFrame,
        model_type: str,
        var_model: str,
        sklearn_model: str,
        num_lags: int,
        emhass_conf: dict,
        logger: logging.Logger,
    ) -> None:
        r"""Define constructor for the forecast class.

        :param data: The data that will be used for train/test. The passed object is \
            left untouched: the index conversion, sorting and de-duplication are done on a copy.
        :type data: pd.DataFrame
        :param model_type: A unique name defining this model and useful to identify \
            for what it will be used for.
        :type model_type: str
        :param var_model: The name of the sensor to retrieve data from Home Assistant. \
            Example: `sensor.power_load_no_var_loads`.
        :type var_model: str
        :param sklearn_model: The `scikit-learn` model that will be used. For now only \
            these options are possible: `LinearRegression`, `ElasticNet` and `KNeighborsRegressor`.
        :type sklearn_model: str
        :param num_lags: The number of auto-regression lags to consider. A good starting point \
            is to fix this as one day. For example if your time step is 30 minutes, then fix this \
            to 48, if the time step is 1 hour then fix this to 24 and so on.
        :type num_lags: int
        :param emhass_conf: Dictionary containing the needed emhass paths
        :type emhass_conf: dict
        :param logger: The passed logger object
        :type logger: logging.Logger
        """
        self.model_type = model_type
        self.var_model = var_model
        self.sklearn_model = sklearn_model
        self.num_lags = num_lags
        self.emhass_conf = emhass_conf
        self.logger = logger
        self.is_tuned = False
        # A quick data preparation. A shallow copy is enough to give this object
        # its own index without touching the caller's DataFrame.
        prepared = data.copy(deep=False)
        prepared.index = pd.to_datetime(prepared.index)
        # Stable sort so that keep="first" below refers to the original row order.
        prepared = prepared.sort_index(kind="mergesort")
        self.data = prepared[~prepared.index.duplicated(keep="first")]

    @staticmethod
    def add_date_features(data: pd.DataFrame) -> pd.DataFrame:
        """Add date features from the input DataFrame timestamp

        The columns `year`, `month`, `day_of_week`, `day_of_year`, `day` and `hour`
        are computed from each element of the index and added to a copy of `data`.

        :param data: The input DataFrame, indexed by timestamps
        :type data: pd.DataFrame
        :return: A copy of the DataFrame with the added features
        :rtype: pd.DataFrame
        """
        df = copy.deepcopy(data)
        for column, attribute in MLForecaster._DATE_FEATURES:
            df[column] = [getattr(stamp, attribute) for stamp in df.index]
        return df

    @staticmethod
    def neg_r2_score(y_true, y_pred):
        """The negative of the r2 score."""
        return -r2_score(y_true, y_pred)

    @staticmethod
    def generate_exog(data_last_window, periods, var_name):
        """Generate the exogenous data for future timestamps.

        :param data_last_window: Data whose index carries a `freq`; the generated \
            timestamps start one step after its last index value.
        :param periods: The number of future timestamps to generate.
        :param var_name: Name of the (NaN filled) placeholder column for the target.
        :return: A DataFrame with the placeholder column and the date features.
        """
        step = data_last_window.index.freq
        forecast_dates = pd.date_range(
            start=data_last_window.index[-1] + step,
            periods=periods,
            freq=step,
        )
        exog = pd.DataFrame({var_name: [np.nan] * periods}, index=forecast_dates)
        return MLForecaster.add_date_features(exog)

    @staticmethod
    def _freq_to_hours(freq) -> float:
        """Return the duration of a fixed index frequency expressed in hours."""
        # `total_seconds()` keeps whole days, unlike the `.seconds` component.
        return pd.Timedelta(freq).total_seconds() / 3600

    def _make_base_model(self):
        """Instantiate the scikit-learn regressor named by `self.sklearn_model`."""
        factories = {
            "LinearRegression": LinearRegression,
            "ElasticNet": ElasticNet,
            "KNeighborsRegressor": KNeighborsRegressor,
        }
        factory = factories.get(self.sklearn_model)
        if factory is None:
            self.logger.error(
                "Passed sklearn model "
                + self.sklearn_model
                + " is not valid. Defaulting to KNeighborsRegressor"
            )
            factory = KNeighborsRegressor
        return factory()

    def _build_search_space(self, debug):
        """Return the callable describing the hyperparameter search space.

        The returned function takes a trial object exposing `suggest_categorical`,
        `suggest_float` and `suggest_int`. An unknown model name uses the
        KNeighborsRegressor space, mirroring the fallback done in `fit`.
        """
        model_name = self.sklearn_model
        if model_name not in MLForecaster._SUPPORTED_MODELS:
            self.logger.error(
                "Passed sklearn model "
                + str(model_name)
                + " is not valid. Defaulting to KNeighborsRegressor"
            )
            model_name = MLForecaster._FALLBACK_MODEL
        lags_grid = list(
            MLForecaster._DEBUG_LAGS_GRID if debug else MLForecaster._LAGS_GRID
        )

        def search_space(trial):
            if model_name == "LinearRegression":
                params = {
                    "fit_intercept": trial.suggest_categorical(
                        "fit_intercept", [True] if debug else [True, False]
                    ),
                }
            elif model_name == "ElasticNet":
                if debug:
                    params = {
                        "selection": trial.suggest_categorical("selection", ["random"]),
                    }
                else:
                    params = {
                        "alpha": trial.suggest_float("alpha", 0.0, 2.0),
                        "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),
                        "selection": trial.suggest_categorical(
                            "selection", ["cyclic", "random"]
                        ),
                    }
            elif debug:
                params = {
                    "weights": trial.suggest_categorical("weights", ["uniform"]),
                }
            else:
                params = {
                    "n_neighbors": trial.suggest_int("n_neighbors", 2, 20),
                    "leaf_size": trial.suggest_int("leaf_size", 20, 40),
                    "weights": trial.suggest_categorical(
                        "weights", ["uniform", "distance"]
                    ),
                }
            # The lags are always suggested last.
            params["lags"] = trial.suggest_categorical("lags", lags_grid)
            return params

        return search_space

    def fit(
        self,
        split_date_delta: Optional[str] = "48h",
        perform_backtest: Optional[bool] = False,
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        r"""The fit method to train the ML model.

        :param split_date_delta: The delta from now to `split_date_delta` that will be used \
            as the test period to evaluate the model, defaults to '48h'
        :type split_date_delta: Optional[str], optional
        :param perform_backtest: If true then a back testing routine is performed to evaluate \
            the performance of the model on the complete train set, defaults to False
        :type perform_backtest: Optional[bool], optional
        :return: The DataFrame containing the forecast data results without and with backtest. \
            The second element is `None` when no backtest was requested.
        :rtype: Tuple[pd.DataFrame, pd.DataFrame]
        """
        self.logger.info("Performing a forecast model fit for %s", self.model_type)
        # Preparing the data: adding exogenous features
        exo = MLForecaster.add_date_features(pd.DataFrame(index=self.data.index))
        exo[self.var_model] = self.data[self.var_model]
        self.data_exo = exo.interpolate(method="linear", axis=0, limit=None)
        # train/test split
        step = self.data_exo.index.freq
        last_stamp = self.data_exo.index[-1]
        self.date_train = last_stamp - pd.Timedelta("5days") + step  # The last 5 days
        # The test period, 48h by default
        self.date_split = last_stamp - pd.Timedelta(split_date_delta) + step
        self.data_train = self.data_exo.loc[: self.date_split - step, :]
        self.data_test = self.data_exo.loc[self.date_split :, :]
        self.steps = len(self.data_test)
        # Define the forecaster object around the requested sklearn model
        self.forecaster = ForecasterAutoreg(
            regressor=self._make_base_model(), lags=self.num_lags
        )
        # Fit and time it
        self.logger.info("Training a " + self.sklearn_model + " model")
        start_time = time.time()
        self.forecaster.fit(
            y=self.data_train[self.var_model],
            exog=self.data_train.drop(self.var_model, axis=1),
        )
        self.logger.info(f"Elapsed time for model fit: {time.time() - start_time}")
        # Make a prediction to print metrics
        predictions = self.forecaster.predict(
            steps=self.steps, exog=self.data_test.drop(self.var_model, axis=1)
        )
        pred_metric = r2_score(self.data_test[self.var_model], predictions)
        self.logger.info(
            f"Prediction R2 score of fitted model on test data: {pred_metric}"
        )
        # Packing results in a DataFrame
        df_pred = pd.DataFrame(
            index=self.data_exo.index, columns=["train", "test", "pred"]
        )
        df_pred["train"] = self.data_train[self.var_model]
        df_pred["test"] = self.data_test[self.var_model]
        df_pred["pred"] = predictions
        df_pred_backtest = None
        if perform_backtest:
            # Using backtesting tool to evaluate the model
            self.logger.info("Performing simple backtesting of fitted model")
            start_time = time.time()
            metric, predictions_backtest = backtesting_forecaster(
                forecaster=self.forecaster,
                y=self.data_train[self.var_model],
                exog=self.data_train.drop(self.var_model, axis=1),
                steps=self.num_lags,
                initial_train_size=None,
                allow_incomplete_fold=True,
                gap=0,
                metric=MLForecaster.neg_r2_score,
                verbose=False,
                refit=False,
                show_progress=True,
            )
            self.logger.info(f"Elapsed backtesting time: {time.time() - start_time}")
            # The metric is the negated R2, so flip the sign back for reporting
            self.logger.info(f"Backtest R2 score: {-metric}")
            df_pred_backtest = pd.DataFrame(
                index=self.data_exo.index, columns=["train", "pred"]
            )
            df_pred_backtest["train"] = self.data_exo[self.var_model]
            df_pred_backtest["pred"] = predictions_backtest
        return df_pred, df_pred_backtest

    def predict(self, data_last_window: Optional[pd.DataFrame] = None) -> pd.Series:
        """The predict method to generate forecasts from a previously fitted ML model.

        :param data_last_window: The data that will be used to generate the new forecast, this \
            will be freshly retrieved from Home Assistant. This data is needed because the forecast \
            model is an auto-regressive model with lags. If not passed then the data used during the \
            model train is used, defaults to None
        :type data_last_window: Optional[pd.DataFrame], optional
        :return: A pandas series containing the generated forecasts.
        :rtype: pd.Series
        """
        if data_last_window is None:
            return self.forecaster.predict(
                steps=self.num_lags, exog=self.data_test.drop(self.var_model, axis=1)
            )
        data_last_window = data_last_window.interpolate(
            method="linear", axis=0, limit=None
        )
        # After `tune` the forecast horizon follows the optimized number of lags
        horizon = self.lags_opt if self.is_tuned else self.num_lags
        exog = MLForecaster.generate_exog(data_last_window, horizon, self.var_model)
        return self.forecaster.predict(
            steps=horizon,
            last_window=data_last_window[self.var_model],
            exog=exog.drop(self.var_model, axis=1),
        )

    def tune(self, debug: Optional[bool] = False) -> pd.DataFrame:
        """Tuning a previously fitted model using bayesian optimization.

        Requires `fit` to have been called first, as the forecaster and the
        train/test data it stores on the instance are reused here.

        :param debug: Set to True for testing and faster optimizations, defaults to False
        :type debug: Optional[bool], optional
        :return: The DataFrame with the forecasts using the optimized model.
        :rtype: pd.DataFrame
        """
        # Regressor hyperparameters search space
        search_space = self._build_search_space(debug)
        # Bayesian search hyperparameter and lags with skforecast/optuna
        # Lags used as predictors
        if debug:
            refit = False
            num_lags = 3
        else:
            refit = True
            num_lags = self.num_lags
        # The optimization routine call
        self.logger.info("Bayesian hyperparameter optimization with backtesting")
        start_time = time.time()
        self.optimize_results, self.optimize_results_object = (
            bayesian_search_forecaster(
                forecaster=self.forecaster,
                y=self.data_train[self.var_model],
                exog=self.data_train.drop(self.var_model, axis=1),
                search_space=search_space,
                metric=MLForecaster.neg_r2_score,
                n_trials=10,
                random_state=123,
                steps=num_lags,
                initial_train_size=len(self.data_exo.loc[: self.date_train]),
                return_best=True,
                fixed_train_size=True,
                gap=0,
                allow_incomplete_fold=True,
                skip_folds=None,
                refit=refit,
            )
        )
        self.logger.info(f"Elapsed time: {time.time() - start_time}")
        # Record the optimized lags before flagging the model as tuned, so that
        # `predict` never sees `is_tuned` without `lags_opt`.
        self.lags_opt = len(self.optimize_results.iloc[0]["lags"])
        freq_hours = MLForecaster._freq_to_hours(self.data_exo.index.freq)
        self.days_needed = int(np.round(self.lags_opt * freq_hours / 24))
        self.is_tuned = True
        predictions_opt = self.forecaster.predict(
            steps=self.num_lags, exog=self.data_test.drop(self.var_model, axis=1)
        )
        df_pred_opt = pd.DataFrame(
            index=self.data_exo.index, columns=["train", "test", "pred_optim"]
        )
        df_pred_opt["train"] = self.data_train[self.var_model]
        df_pred_opt["test"] = self.data_test[self.var_model]
        df_pred_opt["pred_optim"] = predictions_opt
        # The search metric is the negated R2, so flip the sign back for reporting
        pred_optim_metric_train = -self.optimize_results.iloc[0]["neg_r2_score"]
        self.logger.info(
            f"R2 score for optimized prediction in train period: {pred_optim_metric_train}"
        )
        pred_optim_metric_test = r2_score(
            df_pred_opt.loc[predictions_opt.index, "test"],
            df_pred_opt.loc[predictions_opt.index, "pred_optim"],
        )
        self.logger.info(
            f"R2 score for optimized prediction in test period: {pred_optim_metric_test}"
        )
        self.logger.info("Number of optimal lags obtained: " + str(self.lags_opt))
        return df_pred_opt
@@ -1,275 +0,0 @@
1
- """Machine learning regressor module."""
2
-
3
- from __future__ import annotations
4
-
5
- import copy
6
- import time
7
- import warnings
8
- from typing import TYPE_CHECKING
9
-
10
- import numpy as np
11
- import pandas as pd
12
- from sklearn.ensemble import (
13
- AdaBoostRegressor,
14
- GradientBoostingRegressor,
15
- RandomForestRegressor,
16
- )
17
- from sklearn.linear_model import Lasso, LinearRegression, Ridge
18
- from sklearn.metrics import r2_score
19
- from sklearn.model_selection import GridSearchCV, train_test_split
20
- from sklearn.pipeline import make_pipeline
21
- from sklearn.preprocessing import StandardScaler
22
-
23
- if TYPE_CHECKING:
24
- import logging
25
-
26
- warnings.filterwarnings("ignore", category=DeprecationWarning)
27
-
28
# Registry of the supported regression methods. Each entry maps the public
# method name to its scikit-learn estimator ("model") and to the grid of
# hyperparameters explored for it ("param_grid"). The grid keys are prefixed
# with the lower-cased estimator class name.
REGRESSION_METHODS = {
    method_name: {"model": estimator, "param_grid": grid}
    for method_name, estimator, grid in (
        (
            "LinearRegression",
            LinearRegression(),
            {
                "linearregression__fit_intercept": [True, False],
                "linearregression__positive": [True, False],
            },
        ),
        (
            "RidgeRegression",
            Ridge(),
            {"ridge__alpha": [0.1, 1.0, 10.0]},
        ),
        (
            "LassoRegression",
            Lasso(),
            {"lasso__alpha": [0.1, 1.0, 10.0]},
        ),
        (
            "RandomForestRegression",
            RandomForestRegressor(),
            {"randomforestregressor__n_estimators": [50, 100, 200]},
        ),
        (
            "GradientBoostingRegression",
            GradientBoostingRegressor(),
            {
                "gradientboostingregressor__n_estimators": [50, 100, 200],
                "gradientboostingregressor__learning_rate": [0.01, 0.1, 0.2],
            },
        ),
        (
            "AdaBoostRegression",
            AdaBoostRegressor(),
            {
                "adaboostregressor__n_estimators": [50, 100, 200],
                "adaboostregressor__learning_rate": [0.01, 0.1, 0.2],
            },
        ),
    )
}
63
-
64
-
65
class MLRegressor:
    r"""A forecaster class using machine learning models.

    This class uses the `sklearn` module and the machine learning models are \
    from `scikit-learn`.

    It exposes two main methods:

    - `fit`: to train a model with the passed data.

    - `predict`: to obtain a forecast from a pre-trained model.

    """

    # (feature name, timestamp attribute) pairs, in the order the columns are added.
    _DATE_ATTRIBUTES = (
        ("year", "year"),
        ("month", "month"),
        ("day_of_week", "dayofweek"),
        ("day_of_year", "dayofyear"),
        ("day", "day"),
        ("hour", "hour"),
    )

    def __init__(
        self: MLRegressor,
        data: pd.DataFrame,
        model_type: str,
        regression_model: str,
        features: list,
        target: str,
        timestamp: str,
        logger: logging.Logger,
    ) -> None:
        r"""Define constructor for the forecast class.

        :param data: The data that will be used for train/test
        :type data: pd.DataFrame
        :param model_type: A unique name defining this model and useful to identify \
            for what it will be used for.
        :type model_type: str
        :param regression_model: The model that will be used. For now only \
            these options are possible: `LinearRegression`, `RidgeRegression`, \
            `LassoRegression`, `RandomForestRegression`, \
            `GradientBoostingRegression` and `AdaBoostRegression`.
        :type regression_model: str
        :param features: A list of features. \
            Example: [`solar_production`, `degree_days`].
        :type features: list
        :param target: The target(to be predicted). \
            Example: `heating_hours`.
        :type target: str
        :param timestamp: If defined, the column key that has to be used as timestamp.
        :type timestamp: str
        :param logger: The passed logger object
        :type logger: logging.Logger
        """
        self.features = features
        self.target = target
        self.timestamp = timestamp
        self.model_type = model_type
        self.regression_model = regression_model
        self.logger = logger
        # Keep a sorted copy of the data without duplicated index entries
        ordered = data.sort_index()
        self.data = ordered[~ordered.index.duplicated(keep="first")]
        # Filled in by `fit`
        self.data_exo = None
        self.steps = None
        self.model = None
        self.grid_search = None

    @staticmethod
    def add_date_features(
        data: pd.DataFrame, date_features: list, timestamp: str
    ) -> pd.DataFrame:
        """Add date features from the input DataFrame timestamp.

        :param data: The input DataFrame
        :type data: pd.DataFrame
        :param date_features: The features to add, any of `year`, `month`, \
            `day_of_week`, `day_of_year`, `day` and `hour`. Other names are ignored.
        :type date_features: list
        :param timestamp: The column containing the timestamp
        :type timestamp: str
        :return: A copy of the DataFrame with the timestamp column converted to \
            datetimes and the requested features added
        :rtype: pd.DataFrame
        """
        df = copy.deepcopy(data)  # noqa: PD901
        # Read the column named by `timestamp`, not a hard-coded "timestamp" key
        df[timestamp] = pd.to_datetime(df[timestamp])
        for feature, attribute in MLRegressor._DATE_ATTRIBUTES:
            if feature in date_features:
                df[feature] = [getattr(stamp, attribute) for stamp in df[timestamp]]
        return df

    def get_regression_model(self: MLRegressor) -> tuple[object | None, dict | None]:
        r"""
        Get the base model and parameter grid for the specified regression model.
        Returns a tuple containing the base model and parameter grid corresponding to \
        the specified regression model.

        The estimator is the instance stored in `REGRESSION_METHODS`, not a new copy.

        :param self: The instance of the MLRegressor class.
        :type self: MLRegressor
        :return: A tuple containing the base model and parameter grid, or \
            `(None, None)` when the model name is not valid.
        :rtype: tuple[object | None, dict | None]
        """
        method = REGRESSION_METHODS.get(self.regression_model)
        if method is None:
            self.logger.error(
                "Passed model %s is not valid",
                self.regression_model,
            )
            return None, None
        return method["model"], method["param_grid"]

    def fit(self: MLRegressor, date_features: list | None = None) -> bool:
        r"""Fit the model using the provided data.

        :param date_features: A list of 'date_features' to take into account when \
            fitting the model.
        :type date_features: list
        :return: True if the model was fitted, False if the regression model \
            name is not valid
        :rtype: bool
        :raises KeyError: If a feature or the target is not a column of the data
        """
        self.logger.info("Performing a MLRegressor fit for %s", self.model_type)
        # Fail early, with the offending names, when a needed column is absent
        missing = [
            column
            for column in [*self.features, self.target]
            if column not in self.data.columns
        ]
        if missing:
            raise KeyError(f"Columns not found in the data: {missing}")
        keep_columns = list(self.features)
        if self.timestamp is not None:
            keep_columns.append(self.timestamp)
        keep_columns.append(self.target)
        # Selecting the columns yields a new frame, `self.data` is left untouched
        selected = self.data.columns.intersection(keep_columns)
        self.data_exo = self.data[selected].reset_index(drop=True)
        if date_features is not None:
            if self.timestamp is not None:
                self.data_exo = MLRegressor.add_date_features(
                    self.data_exo,
                    date_features,
                    self.timestamp,
                )
            else:
                self.logger.error(
                    "If no timestamp provided, you can't use date_features, going "
                    "further without date_features.",
                )
        y = self.data_exo[self.target]
        self.data_exo = self.data_exo.drop(self.target, axis=1)
        if self.timestamp is not None:
            # The raw timestamp is not a feature, only the derived date columns are
            self.data_exo = self.data_exo.drop(self.timestamp, axis=1)
        X = self.data_exo
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        self.steps = len(X_test)
        base_model, param_grid = self.get_regression_model()
        if base_model is None:
            return False
        self.model = make_pipeline(StandardScaler(), base_model)
        # Create a grid search object
        self.grid_search = GridSearchCV(
            self.model,
            param_grid,
            cv=5,
            scoring="neg_mean_squared_error",
            refit=True,
            verbose=0,
            n_jobs=-1,
        )
        # Fit the grid search object to the data
        self.logger.info("Training a %s model", self.regression_model)
        start_time = time.time()
        self.grid_search.fit(X_train.values, y_train.values)
        self.logger.info("Elapsed time for model fit: %s", time.time() - start_time)
        self.model = self.grid_search.best_estimator_
        # Make predictions on the held-out rows to report a score
        predictions = pd.Series(
            self.model.predict(X_test.values), index=X_test.index
        )
        pred_metric = r2_score(y_test, predictions)
        self.logger.info(
            "Prediction R2 score of fitted model on test data: %s",
            pred_metric,
        )
        return True

    def predict(self: MLRegressor, new_values: list) -> np.ndarray:
        """Predict a new value.

        `fit` must have completed successfully before, as the prediction is
        delegated to the model it stores on the instance.

        :param new_values: The new values for the features \
            (in the same order as the features list). \
            Example: [2.24, 5.68].
        :type new_values: list
        :return: The np.ndarray containing the predicted value.
        :rtype: np.ndarray
        """
        self.logger.info("Performing a prediction for %s", self.model_type)
        # Wrap the values so the model receives a single row
        new_values = np.array([new_values])
        return self.model.predict(new_values)