emhass 0.12.4__py3-none-any.whl → 0.12.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {emhass-0.12.4.dist-info → emhass-0.12.5.dist-info}/METADATA +34 -17
- {emhass-0.12.4.dist-info → emhass-0.12.5.dist-info}/RECORD +5 -17
- emhass/__init__.py +0 -0
- emhass/command_line.py +0 -1748
- emhass/data/emhass_inverters.csv +0 -8
- emhass/data/emhass_modules.csv +0 -6
- emhass/forecast.py +0 -1348
- emhass/img/emhass_icon.png +0 -0
- emhass/machine_learning_forecaster.py +0 -397
- emhass/machine_learning_regressor.py +0 -275
- emhass/optimization.py +0 -1504
- emhass/retrieve_hass.py +0 -670
- emhass/utils.py +0 -1678
- emhass/web_server.py +0 -756
- {emhass-0.12.4.dist-info → emhass-0.12.5.dist-info}/WHEEL +0 -0
- {emhass-0.12.4.dist-info → emhass-0.12.5.dist-info}/entry_points.txt +0 -0
- {emhass-0.12.4.dist-info → emhass-0.12.5.dist-info}/licenses/LICENSE +0 -0
emhass/img/emhass_icon.png
DELETED
Binary file
|
@@ -1,397 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
import copy
|
5
|
-
import logging
|
6
|
-
import time
|
7
|
-
import warnings
|
8
|
-
from typing import Optional, Tuple
|
9
|
-
|
10
|
-
import numpy as np
|
11
|
-
import pandas as pd
|
12
|
-
from skforecast.ForecasterAutoreg import ForecasterAutoreg
|
13
|
-
from skforecast.model_selection import (
|
14
|
-
backtesting_forecaster,
|
15
|
-
bayesian_search_forecaster,
|
16
|
-
)
|
17
|
-
from sklearn.linear_model import ElasticNet, LinearRegression
|
18
|
-
from sklearn.metrics import r2_score
|
19
|
-
from sklearn.neighbors import KNeighborsRegressor
|
20
|
-
|
21
|
-
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
22
|
-
|
23
|
-
|
24
|
-
class MLForecaster:
    r"""
    A forecaster class using machine learning models with auto-regressive approach and features\
    based on timestamp information (hour, day, week, etc).

    This class uses the `skforecast` module and the machine learning models are from `scikit-learn`.

    It exposes three main methods:

    - `fit`: to train a model with the passed data.

    - `predict`: to obtain a forecast from a pre-trained model.

    - `tune`: to optimize the models hyperparameters using bayesian optimization.

    """

    # Names accepted for `sklearn_model`; anything else falls back to KNeighborsRegressor.
    _SUPPORTED_MODELS = ("LinearRegression", "ElasticNet", "KNeighborsRegressor")
    # Lag candidates explored by `tune` when not running in debug mode.
    _LAGS_GRID = (6, 12, 24, 36, 48, 60, 72)
    # (output column, pandas Timestamp attribute) pairs, in the order the columns are added.
    _DATE_FEATURES = (
        ("year", "year"),
        ("month", "month"),
        ("day_of_week", "dayofweek"),
        ("day_of_year", "dayofyear"),
        ("day", "day"),
        ("hour", "hour"),
    )

    def __init__(
        self,
        data: pd.DataFrame,
        model_type: str,
        var_model: str,
        sklearn_model: str,
        num_lags: int,
        emhass_conf: dict,
        logger: logging.Logger,
    ) -> None:
        r"""Define constructor for the forecast class.

        :param data: The data that will be used for train/test
        :type data: pd.DataFrame
        :param model_type: A unique name defining this model and useful to identify \
            for what it will be used for.
        :type model_type: str
        :param var_model: The name of the sensor to retrieve data from Home Assistant. \
            Example: `sensor.power_load_no_var_loads`.
        :type var_model: str
        :param sklearn_model: The `scikit-learn` model that will be used. For now only \
            this options are possible: `LinearRegression`, `ElasticNet` and `KNeighborsRegressor`.
        :type sklearn_model: str
        :param num_lags: The number of auto-regression lags to consider. A good starting point \
            is to fix this as one day. For example if your time step is 30 minutes, then fix this \
            to 48, if the time step is 1 hour the fix this to 24 and so on.
        :type num_lags: int
        :param emhass_conf: Dictionary containing the needed emhass paths
        :type emhass_conf: dict
        :param logger: The passed logger object
        :type logger: logging.Logger
        """
        self.model_type = model_type
        self.var_model = var_model
        self.sklearn_model = sklearn_model
        self.num_lags = num_lags
        self.emhass_conf = emhass_conf
        self.logger = logger
        self.is_tuned = False
        # A quick data preparation. Work on a shallow copy so that re-indexing
        # and sorting never alter the DataFrame object owned by the caller.
        prepared = data.copy(deep=False)
        prepared.index = pd.to_datetime(prepared.index)
        prepared = prepared.sort_index()
        self.data = prepared[~prepared.index.duplicated(keep="first")]

    @staticmethod
    def add_date_features(data: pd.DataFrame) -> pd.DataFrame:
        """Add date features from the input DataFrame timestamp

        :param data: The input DataFrame, indexed by timestamps
        :type data: pd.DataFrame
        :return: A copy of the DataFrame with the added `year`, `month`, \
            `day_of_week`, `day_of_year`, `day` and `hour` columns
        :rtype: pd.DataFrame
        """
        df = data.copy()
        for column, attribute in MLForecaster._DATE_FEATURES:
            df[column] = [getattr(stamp, attribute) for stamp in df.index]
        return df

    @staticmethod
    def neg_r2_score(y_true, y_pred):
        """The negative of the r2 score."""
        return -r2_score(y_true, y_pred)

    @staticmethod
    def generate_exog(data_last_window, periods, var_name):
        """Generate the exogenous data for future timestamps.

        The generated index starts one step after the last timestamp of
        `data_last_window` and reuses the frequency of its index. The
        `var_name` column is filled with NaN.
        """
        step = data_last_window.index.freq
        forecast_dates = pd.date_range(
            start=data_last_window.index[-1] + step,
            periods=periods,
            freq=step,
        )
        exog = pd.DataFrame({var_name: [np.nan] * periods}, index=forecast_dates)
        return MLForecaster.add_date_features(exog)

    @staticmethod
    def _freq_in_hours(freq) -> float:
        """Return the duration of a fixed index frequency expressed in hours.

        `total_seconds` is used so that frequencies of one day or more are not
        truncated to their sub-day remainder.
        """
        return pd.Timedelta(freq).total_seconds() / 3600

    def _resolve_model_name(self) -> str:
        """Return the model name to use, defaulting to KNeighborsRegressor when invalid."""
        if self.sklearn_model in self._SUPPORTED_MODELS:
            return self.sklearn_model
        self.logger.error(
            "Passed sklearn model "
            + str(self.sklearn_model)
            + " is not valid. Defaulting to KNeighborsRegressor"
        )
        return "KNeighborsRegressor"

    def _build_search_space(self, debug):
        """Build the optuna search-space callable matching the configured model.

        An unrecognised model name uses the KNeighborsRegressor space, which
        mirrors the fallback applied by `fit`.
        """
        model_name = self._resolve_model_name()
        lag_choices = [3] if debug else list(self._LAGS_GRID)

        def search_space(trial):
            if model_name == "LinearRegression":
                space = {
                    "fit_intercept": trial.suggest_categorical(
                        "fit_intercept", [True] if debug else [True, False]
                    ),
                }
            elif model_name == "ElasticNet":
                if debug:
                    space = {
                        "selection": trial.suggest_categorical("selection", ["random"]),
                    }
                else:
                    space = {
                        "alpha": trial.suggest_float("alpha", 0.0, 2.0),
                        "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),
                        "selection": trial.suggest_categorical(
                            "selection", ["cyclic", "random"]
                        ),
                    }
            else:
                if debug:
                    space = {
                        "weights": trial.suggest_categorical("weights", ["uniform"]),
                    }
                else:
                    space = {
                        "n_neighbors": trial.suggest_int("n_neighbors", 2, 20),
                        "leaf_size": trial.suggest_int("leaf_size", 20, 40),
                        "weights": trial.suggest_categorical(
                            "weights", ["uniform", "distance"]
                        ),
                    }
            # The lags are always suggested last, after the regressor parameters.
            space["lags"] = trial.suggest_categorical("lags", lag_choices)
            return space

        return search_space

    def fit(
        self,
        split_date_delta: Optional[str] = "48h",
        perform_backtest: Optional[bool] = False,
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        r"""The fit method to train the ML model.

        :param split_date_delta: The delta from now to `split_date_delta` that will be used \
            as the test period to evaluate the model, defaults to '48h'
        :type split_date_delta: Optional[str], optional
        :param perform_backtest: If `True` then a back testing routine is performed to evaluate \
            the performance of the model on the complete train set, defaults to False
        :type perform_backtest: Optional[bool], optional
        :return: The DataFrame containing the forecast data results without and with backtest. \
            The second element is `None` when no backtest is performed.
        :rtype: Tuple[pd.DataFrame, pd.DataFrame]
        """
        self.logger.info("Performing a forecast model fit for " + self.model_type)
        # Preparing the data: adding exogenous features
        self.data_exo = pd.DataFrame(index=self.data.index)
        self.data_exo = MLForecaster.add_date_features(self.data_exo)
        self.data_exo[self.var_model] = self.data[self.var_model]
        self.data_exo = self.data_exo.interpolate(method="linear", axis=0, limit=None)
        # train/test split
        step = self.data_exo.index.freq
        last_stamp = self.data_exo.index[-1]
        self.date_train = last_stamp - pd.Timedelta("5days") + step  # The last 5 days
        self.date_split = last_stamp - pd.Timedelta(split_date_delta) + step
        self.data_train = self.data_exo.loc[: self.date_split - step, :]
        self.data_test = self.data_exo.loc[self.date_split :, :]
        self.steps = len(self.data_test)
        # Pick correct sklearn model
        model_factories = {
            "LinearRegression": LinearRegression,
            "ElasticNet": ElasticNet,
            "KNeighborsRegressor": KNeighborsRegressor,
        }
        base_model = model_factories[self._resolve_model_name()]()
        # Define the forecaster object
        self.forecaster = ForecasterAutoreg(regressor=base_model, lags=self.num_lags)
        # Fit and time it
        self.logger.info("Training a " + self.sklearn_model + " model")
        start_time = time.time()
        self.forecaster.fit(
            y=self.data_train[self.var_model],
            exog=self.data_train.drop(self.var_model, axis=1),
        )
        self.logger.info(f"Elapsed time for model fit: {time.time() - start_time}")
        # Make a prediction to print metrics
        predictions = self.forecaster.predict(
            steps=self.steps, exog=self.data_test.drop(self.var_model, axis=1)
        )
        pred_metric = r2_score(self.data_test[self.var_model], predictions)
        self.logger.info(
            f"Prediction R2 score of fitted model on test data: {pred_metric}"
        )
        # Packing results in a DataFrame
        df_pred = pd.DataFrame(
            index=self.data_exo.index, columns=["train", "test", "pred"]
        )
        df_pred["train"] = self.data_train[self.var_model]
        df_pred["test"] = self.data_test[self.var_model]
        df_pred["pred"] = predictions
        df_pred_backtest = None
        if perform_backtest is True:
            # Using backtesting tool to evaluate the model
            self.logger.info("Performing simple backtesting of fitted model")
            start_time = time.time()
            metric, predictions_backtest = backtesting_forecaster(
                forecaster=self.forecaster,
                y=self.data_train[self.var_model],
                exog=self.data_train.drop(self.var_model, axis=1),
                steps=self.num_lags,
                initial_train_size=None,
                allow_incomplete_fold=True,
                gap=0,
                metric=MLForecaster.neg_r2_score,
                verbose=False,
                refit=False,
                show_progress=True,
            )
            self.logger.info(f"Elapsed backtesting time: {time.time() - start_time}")
            # The metric is the negated R2, so flip the sign back for reporting.
            self.logger.info(f"Backtest R2 score: {-metric}")
            df_pred_backtest = pd.DataFrame(
                index=self.data_exo.index, columns=["train", "pred"]
            )
            df_pred_backtest["train"] = self.data_exo[self.var_model]
            df_pred_backtest["pred"] = predictions_backtest
        return df_pred, df_pred_backtest

    def predict(self, data_last_window: Optional[pd.DataFrame] = None) -> pd.Series:
        """The predict method to generate forecasts from a previously fitted ML model.

        :param data_last_window: The data that will be used to generate the new forecast, this \
            will be freshly retrieved from Home Assistant. This data is needed because the forecast \
            model is an auto-regressive model with lags. If not passed then the data used during the \
            model train is used, defaults to None
        :type data_last_window: Optional[pd.DataFrame], optional
        :return: A pandas series containing the generated forecasts.
        :rtype: pd.Series
        """
        if data_last_window is None:
            return self.forecaster.predict(
                steps=self.num_lags, exog=self.data_test.drop(self.var_model, axis=1)
            )
        data_last_window = data_last_window.interpolate(
            method="linear", axis=0, limit=None
        )
        # A tuned model forecasts over the optimized number of lags.
        horizon = self.lags_opt if self.is_tuned else self.num_lags
        exog = MLForecaster.generate_exog(data_last_window, horizon, self.var_model)
        return self.forecaster.predict(
            steps=horizon,
            last_window=data_last_window[self.var_model],
            exog=exog.drop(self.var_model, axis=1),
        )

    def tune(self, debug: Optional[bool] = False) -> pd.DataFrame:
        """Tuning a previously fitted model using bayesian optimization.

        :param debug: Set to True for testing and faster optimizations, defaults to False
        :type debug: Optional[bool], optional
        :return: The DataFrame with the forecasts using the optimized model.
        :rtype: pd.DataFrame
        """
        # Regressor hyperparameters search space
        search_space = self._build_search_space(debug)
        # Bayesian search hyperparameter and lags with skforecast/optuna
        # Lags used as predictors
        if debug:
            refit = False
            num_lags = 3
        else:
            refit = True
            num_lags = self.num_lags
        # The optimization routine call
        self.logger.info("Bayesian hyperparameter optimization with backtesting")
        start_time = time.time()
        self.optimize_results, self.optimize_results_object = (
            bayesian_search_forecaster(
                forecaster=self.forecaster,
                y=self.data_train[self.var_model],
                exog=self.data_train.drop(self.var_model, axis=1),
                search_space=search_space,
                metric=MLForecaster.neg_r2_score,
                n_trials=10,
                random_state=123,
                steps=num_lags,
                initial_train_size=len(self.data_exo.loc[: self.date_train]),
                return_best=True,
                fixed_train_size=True,
                gap=0,
                allow_incomplete_fold=True,
                skip_folds=None,
                refit=refit,
            )
        )
        self.logger.info(f"Elapsed time: {time.time() - start_time}")
        self.is_tuned = True
        predictions_opt = self.forecaster.predict(
            steps=self.num_lags, exog=self.data_test.drop(self.var_model, axis=1)
        )
        freq_hours = self._freq_in_hours(self.data_exo.index.freq)
        self.lags_opt = int(np.round(len(self.optimize_results.iloc[0]["lags"])))
        self.days_needed = int(np.round(self.lags_opt * freq_hours / 24))
        df_pred_opt = pd.DataFrame(
            index=self.data_exo.index, columns=["train", "test", "pred_optim"]
        )
        df_pred_opt["train"] = self.data_train[self.var_model]
        df_pred_opt["test"] = self.data_test[self.var_model]
        df_pred_opt["pred_optim"] = predictions_opt
        # The search metric is the negated R2, so flip the sign back for reporting.
        pred_optim_metric_train = -self.optimize_results.iloc[0]["neg_r2_score"]
        self.logger.info(
            f"R2 score for optimized prediction in train period: {pred_optim_metric_train}"
        )
        pred_optim_metric_test = r2_score(
            df_pred_opt.loc[predictions_opt.index, "test"],
            df_pred_opt.loc[predictions_opt.index, "pred_optim"],
        )
        self.logger.info(
            f"R2 score for optimized prediction in test period: {pred_optim_metric_test}"
        )
        self.logger.info("Number of optimal lags obtained: " + str(self.lags_opt))
        return df_pred_opt
|
@@ -1,275 +0,0 @@
|
|
1
|
-
"""Machine learning regressor module."""
|
2
|
-
|
3
|
-
from __future__ import annotations
|
4
|
-
|
5
|
-
import copy
|
6
|
-
import time
|
7
|
-
import warnings
|
8
|
-
from typing import TYPE_CHECKING
|
9
|
-
|
10
|
-
import numpy as np
|
11
|
-
import pandas as pd
|
12
|
-
from sklearn.ensemble import (
|
13
|
-
AdaBoostRegressor,
|
14
|
-
GradientBoostingRegressor,
|
15
|
-
RandomForestRegressor,
|
16
|
-
)
|
17
|
-
from sklearn.linear_model import Lasso, LinearRegression, Ridge
|
18
|
-
from sklearn.metrics import r2_score
|
19
|
-
from sklearn.model_selection import GridSearchCV, train_test_split
|
20
|
-
from sklearn.pipeline import make_pipeline
|
21
|
-
from sklearn.preprocessing import StandardScaler
|
22
|
-
|
23
|
-
if TYPE_CHECKING:
|
24
|
-
import logging
|
25
|
-
|
26
|
-
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
27
|
-
|
28
|
-
# Lookup table of the supported regression methods. Each entry holds the
# estimator instance under "model" and the hyper-parameter grid under
# "param_grid". The grid keys are prefixed with the lower-cased estimator class
# name followed by a double underscore.
REGRESSION_METHODS = dict(
    LinearRegression=dict(
        model=LinearRegression(),
        param_grid={
            "linearregression__fit_intercept": [True, False],
            "linearregression__positive": [True, False],
        },
    ),
    RidgeRegression=dict(
        model=Ridge(),
        param_grid={"ridge__alpha": [0.1, 1.0, 10.0]},
    ),
    LassoRegression=dict(
        model=Lasso(),
        param_grid={"lasso__alpha": [0.1, 1.0, 10.0]},
    ),
    RandomForestRegression=dict(
        model=RandomForestRegressor(),
        param_grid={"randomforestregressor__n_estimators": [50, 100, 200]},
    ),
    GradientBoostingRegression=dict(
        model=GradientBoostingRegressor(),
        param_grid={
            "gradientboostingregressor__n_estimators": [50, 100, 200],
            "gradientboostingregressor__learning_rate": [0.01, 0.1, 0.2],
        },
    ),
    AdaBoostRegression=dict(
        model=AdaBoostRegressor(),
        param_grid={
            "adaboostregressor__n_estimators": [50, 100, 200],
            "adaboostregressor__learning_rate": [0.01, 0.1, 0.2],
        },
    ),
)
|
63
|
-
|
64
|
-
|
65
|
-
class MLRegressor:
    r"""A forecaster class using machine learning models.

    This class uses the `sklearn` module and the machine learning models are \
    from `scikit-learn`.

    It exposes two main methods:

    - `fit`: to train a model with the passed data.

    - `predict`: to obtain a forecast from a pre-trained model.

    """

    # Maps each supported date feature to the pandas Timestamp attribute that
    # provides it. Insertion order is the order in which columns are added.
    _DATE_FEATURE_ATTRS = {
        "year": "year",
        "month": "month",
        "day_of_week": "dayofweek",
        "day_of_year": "dayofyear",
        "day": "day",
        "hour": "hour",
    }

    def __init__(
        self: MLRegressor,
        data: pd.DataFrame,
        model_type: str,
        regression_model: str,
        features: list,
        target: str,
        timestamp: str,
        logger: logging.Logger,
    ) -> None:
        r"""Define constructor for the forecast class.

        :param data: The data that will be used for train/test
        :type data: pd.DataFrame
        :param model_type: A unique name defining this model and useful to identify \
            for what it will be used for.
        :type model_type: str
        :param regression_model: The model that will be used. For now only \
            this options are possible: `LinearRegression`, `RidgeRegression`, \
            `LassoRegression`, `RandomForestRegression`, \
            `GradientBoostingRegression` and `AdaBoostRegression`.
        :type regression_model: str
        :param features: A list of features. \
            Example: [`solar_production`, `degree_days`].
        :type features: list
        :param target: The target(to be predicted). \
            Example: `heating_hours`.
        :type target: str
        :param timestamp: If defined, the column key that has to be used of timestamp.
        :type timestamp: str
        :param logger: The passed logger object
        :type logger: logging.Logger
        """
        self.features = features
        self.target = target
        self.timestamp = timestamp
        self.model_type = model_type
        self.regression_model = regression_model
        self.logger = logger
        # Sort by index and keep only the first row of any duplicated index label.
        ordered = data.sort_index()
        self.data = ordered[~ordered.index.duplicated(keep="first")]
        self.data_exo = None
        self.steps = None
        self.model = None
        self.grid_search = None

    @staticmethod
    def add_date_features(
        data: pd.DataFrame, date_features: list, timestamp: str
    ) -> pd.DataFrame:
        """Add date features from the input DataFrame timestamp.

        :param data: The input DataFrame
        :type data: pd.DataFrame
        :param date_features: The features to add, any of `year`, `month`, \
            `day_of_week`, `day_of_year`, `day` and `hour`
        :type date_features: list
        :param timestamp: The column containing the timestamp
        :type timestamp: str
        :return: A copy of the DataFrame with the added features and the \
            timestamp column converted to datetimes
        :rtype: pd.DataFrame
        """
        df = data.copy()
        # Read the column named by `timestamp`, not a hard-coded "timestamp" key.
        df[timestamp] = pd.to_datetime(df[timestamp])
        for feature, attribute in MLRegressor._DATE_FEATURE_ATTRS.items():
            if feature in date_features:
                df[feature] = [getattr(stamp, attribute) for stamp in df[timestamp]]
        return df

    def get_regression_model(self: MLRegressor) -> tuple[object | None, dict | None]:
        r"""
        Get the base model and parameter grid for the specified regression model.
        Returns a tuple containing the base model and parameter grid corresponding to \
        the specified regression model.

        :param self: The instance of the MLRegressor class.
        :type self: MLRegressor
        :return: A tuple containing the base model and parameter grid, or \
            `(None, None)` when the model name is not supported.
        :rtype: tuple[object | None, dict | None]
        """
        method = REGRESSION_METHODS.get(self.regression_model)
        if method is None:
            self.logger.error(
                "Passed model %s is not valid",
                self.regression_model,
            )
            return None, None
        return method["model"], method["param_grid"]

    def fit(self: MLRegressor, date_features: list | None = None) -> bool:
        r"""Fit the model using the provided data.

        :param date_features: A list of 'date_features' to take into account when \
            fitting the model.
        :type date_features: list
        :return: `True` if successful, `False` when the regression model name \
            is not supported
        :rtype: bool
        """
        self.logger.info("Performing a MLRegressor fit for %s", self.model_type)
        self.data_exo = pd.DataFrame(self.data)
        # Selecting the feature and target columns raises KeyError early when
        # one of them is missing from the data.
        self.data_exo[self.features] = self.data[self.features]
        self.data_exo[self.target] = self.data[self.target]
        keep_columns = list(self.features)
        if self.timestamp is not None:
            keep_columns.append(self.timestamp)
        keep_columns.append(self.target)
        self.data_exo = self.data_exo[self.data_exo.columns.intersection(keep_columns)]
        self.data_exo = self.data_exo.reset_index(drop=True)
        if date_features is not None:
            if self.timestamp is not None:
                self.data_exo = MLRegressor.add_date_features(
                    self.data_exo,
                    date_features,
                    self.timestamp,
                )
            else:
                # Adjacent literals keep the message free of stray indentation.
                self.logger.error(
                    "If no timestamp provided, you can't use date_features, going "
                    "further without date_features.",
                )
        y = self.data_exo[self.target]
        self.data_exo = self.data_exo.drop(self.target, axis=1)
        if self.timestamp is not None:
            self.data_exo = self.data_exo.drop(self.timestamp, axis=1)
        X = self.data_exo
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        self.steps = len(X_test)
        base_model, param_grid = self.get_regression_model()
        if base_model is None:
            return False
        self.model = make_pipeline(StandardScaler(), base_model)
        # Create a grid search object
        self.grid_search = GridSearchCV(
            self.model,
            param_grid,
            cv=5,
            scoring="neg_mean_squared_error",
            refit=True,
            verbose=0,
            n_jobs=-1,
        )
        # Fit the grid search object to the data
        self.logger.info("Training a %s model", self.regression_model)
        start_time = time.time()
        self.grid_search.fit(X_train.values, y_train.values)
        self.logger.info("Elapsed time for model fit: %s", time.time() - start_time)
        self.model = self.grid_search.best_estimator_
        # Make predictions
        predictions = self.model.predict(X_test.values)
        predictions = pd.Series(predictions, index=X_test.index)
        pred_metric = r2_score(y_test, predictions)
        self.logger.info(
            "Prediction R2 score of fitted model on test data: %s",
            pred_metric,
        )
        return True

    def predict(self: MLRegressor, new_values: list) -> np.ndarray:
        """Predict a new value.

        :param new_values: The new values for the features \
            (in the same order as the features list). \
            Example: [2.24, 5.68].
        :type new_values: list
        :return: The np.ndarray containing the predicted value.
        :rtype: np.ndarray
        """
        self.logger.info("Performing a prediction for %s", self.model_type)
        # Wrap the values in an outer list so the model receives a single 2-D sample.
        new_values = np.array([new_values])
        return self.model.predict(new_values)
|