emhass 0.10.6__py3-none-any.whl → 0.15.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- import copy
5
+ import asyncio
6
6
  import time
7
7
  import warnings
8
8
  from typing import TYPE_CHECKING
@@ -11,20 +11,36 @@ import numpy as np
11
11
  import pandas as pd
12
12
  from sklearn.ensemble import (
13
13
  AdaBoostRegressor,
14
+ ExtraTreesRegressor,
14
15
  GradientBoostingRegressor,
15
16
  RandomForestRegressor,
16
17
  )
17
- from sklearn.linear_model import Lasso, LinearRegression, Ridge
18
+ from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
18
19
  from sklearn.metrics import r2_score
19
20
  from sklearn.model_selection import GridSearchCV, train_test_split
21
+ from sklearn.neighbors import KNeighborsRegressor
22
+ from sklearn.neural_network import MLPRegressor
20
23
  from sklearn.pipeline import make_pipeline
21
24
  from sklearn.preprocessing import StandardScaler
25
+ from sklearn.svm import SVR
26
+ from sklearn.tree import DecisionTreeRegressor
27
+
28
+ from emhass import utils
22
29
 
23
30
  if TYPE_CHECKING:
24
31
  import logging
25
32
 
26
33
  warnings.filterwarnings("ignore", category=DeprecationWarning)
27
34
 
35
+
36
+ # AUTHORITATIVE SOURCE: Supported regression models for MLRegressor and adjust_pv_forecast
37
+ # When adding/removing models, also update:
38
+ # - src/emhass/static/data/param_definitions.json (adjusted_pv_regression_model select_options)
39
+ # - docs/config.md (adjusted_pv_regression_model description)
40
+ # - docs/forecasts.md (Model Training section)
41
+ # - src/emhass/forecast.py (adjust_pv_forecast_fit docstring)
42
+ # Define a seed for reproducibility
43
+ seed = 42
28
44
  REGRESSION_METHODS = {
29
45
  "LinearRegression": {
30
46
  "model": LinearRegression(),
@@ -35,30 +51,81 @@ REGRESSION_METHODS = {
35
51
  },
36
52
  "RidgeRegression": {
37
53
  "model": Ridge(),
38
- "param_grid": {"ridge__alpha": [0.1, 1.0, 10.0]},
54
+ "param_grid": {"ridge__alpha": [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]},
39
55
  },
40
56
  "LassoRegression": {
41
- "model": Lasso(),
42
- "param_grid": {"lasso__alpha": [0.1, 1.0, 10.0]},
57
+ "model": Lasso(random_state=seed),
58
+ "param_grid": {"lasso__alpha": [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]},
59
+ },
60
+ "ElasticNet": {
61
+ "model": ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=seed),
62
+ "param_grid": {
63
+ "elasticnet__alpha": [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
64
+ "elasticnet__l1_ratio": [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
65
+ },
66
+ },
67
+ "KNeighborsRegressor": {
68
+ "model": KNeighborsRegressor(),
69
+ "param_grid": {
70
+ "kneighborsregressor__n_neighbors": [3, 5, 7, 10, 15],
71
+ "kneighborsregressor__weights": ["uniform", "distance"],
72
+ },
73
+ },
74
+ "DecisionTreeRegressor": {
75
+ "model": DecisionTreeRegressor(ccp_alpha=0.0, random_state=seed),
76
+ "param_grid": {
77
+ "decisiontreeregressor__max_depth": [None, 5, 10, 20],
78
+ "decisiontreeregressor__min_samples_split": [2, 5, 10],
79
+ },
80
+ },
81
+ "SVR": {
82
+ "model": SVR(),
83
+ "param_grid": {
84
+ "svr__C": [0.1, 1, 10, 100],
85
+ "svr__gamma": ["scale", "auto"],
86
+ "svr__kernel": ["rbf", "linear"],
87
+ },
43
88
  },
44
- "RandomForestRegression": {
45
- "model": RandomForestRegressor(),
46
- "param_grid": {"randomforestregressor__n_estimators": [50, 100, 200]},
89
+ "RandomForestRegressor": {
90
+ "model": RandomForestRegressor(min_samples_leaf=1, max_features=1.0, random_state=seed),
91
+ "param_grid": {
92
+ "randomforestregressor__n_estimators": [50, 100, 200],
93
+ "randomforestregressor__max_depth": [None, 10, 20],
94
+ "randomforestregressor__max_features": ["sqrt", "log2", None],
95
+ },
47
96
  },
48
- "GradientBoostingRegression": {
49
- "model": GradientBoostingRegressor(),
97
+ "ExtraTreesRegressor": {
98
+ "model": ExtraTreesRegressor(min_samples_leaf=1, max_features=1.0, random_state=seed),
99
+ "param_grid": {
100
+ "extratreesregressor__n_estimators": [50, 100, 200],
101
+ "extratreesregressor__max_depth": [None, 10, 20],
102
+ "extratreesregressor__max_features": ["sqrt", "log2", None],
103
+ },
104
+ },
105
+ "GradientBoostingRegressor": {
106
+ "model": GradientBoostingRegressor(learning_rate=0.1, random_state=seed),
50
107
  "param_grid": {
51
108
  "gradientboostingregressor__n_estimators": [50, 100, 200],
52
109
  "gradientboostingregressor__learning_rate": [0.01, 0.1, 0.2],
110
+ "gradientboostingregressor__max_depth": [3, 5, 10],
53
111
  },
54
112
  },
55
- "AdaBoostRegression": {
56
- "model": AdaBoostRegressor(),
113
+ "AdaBoostRegressor": {
114
+ "model": AdaBoostRegressor(learning_rate=1.0, random_state=seed),
57
115
  "param_grid": {
58
116
  "adaboostregressor__n_estimators": [50, 100, 200],
59
117
  "adaboostregressor__learning_rate": [0.01, 0.1, 0.2],
60
118
  },
61
119
  },
120
+ "MLPRegressor": {
121
+ "model": MLPRegressor(hidden_layer_sizes=(100,), random_state=seed),
122
+ "param_grid": {
123
+ "mlpregressor__hidden_layer_sizes": [(50,), (100,), (50, 50)],
124
+ "mlpregressor__activation": ["relu", "tanh"],
125
+ "mlpregressor__alpha": [1e-4, 1e-3],
126
+ "mlpregressor__max_iter": [500],
127
+ },
128
+ },
62
129
  }
63
130
 
64
131
 
@@ -76,8 +143,16 @@ class MLRegressor:
76
143
 
77
144
  """
78
145
 
79
- def __init__(self: MLRegressor, data: pd.DataFrame, model_type: str, regression_model: str,
80
- features: list, target: str, timestamp: str, logger: logging.Logger) -> None:
146
+ def __init__(
147
+ self: MLRegressor,
148
+ data: pd.DataFrame,
149
+ model_type: str,
150
+ regression_model: str,
151
+ features: list[str],
152
+ target: str,
153
+ timestamp: str,
154
+ logger: logging.Logger,
155
+ ) -> None:
81
156
  r"""Define constructor for the forecast class.
82
157
 
83
158
  :param data: The data that will be used for train/test
@@ -101,48 +176,45 @@ class MLRegressor:
101
176
  :param logger: The passed logger object
102
177
  :type logger: logging.Logger
103
178
  """
104
- self.data = data
179
+ self.data = data.sort_index()
105
180
  self.features = features
106
181
  self.target = target
107
182
  self.timestamp = timestamp
108
183
  self.model_type = model_type
109
184
  self.regression_model = regression_model
110
185
  self.logger = logger
111
- self.data = self.data.sort_index()
186
+
112
187
  self.data = self.data[~self.data.index.duplicated(keep="first")]
113
- self.data_exo = None
114
- self.steps = None
188
+ self.data_exo: pd.DataFrame | None = None
189
+ self.steps: int | None = None
115
190
  self.model = None
116
- self.grid_search = None
191
+ self.grid_search: GridSearchCV | None = None
117
192
 
118
- @staticmethod
119
- def add_date_features(data: pd.DataFrame, date_features: list, timestamp: str) -> pd.DataFrame:
120
- """Add date features from the input DataFrame timestamp.
193
+ def _prepare_data(self, date_features: list[str] | None) -> tuple[pd.DataFrame, pd.Series]:
194
+ self.data_exo = self.data.copy()
195
+ self.data_exo[self.features] = self.data[self.features]
196
+ self.data_exo[self.target] = self.data[self.target]
121
197
 
122
- :param data: The input DataFrame
123
- :type data: pd.DataFrame
124
- :param timestamp: The column containing the timestamp
125
- :type timestamp: str
126
- :return: The DataFrame with the added features
127
- :rtype: pd.DataFrame
128
- """
129
- df = copy.deepcopy(data) # noqa: PD901
130
- df[timestamp] = pd.to_datetime(df["timestamp"])
131
- if "year" in date_features:
132
- df["year"] = [i.year for i in df["timestamp"]]
133
- if "month" in date_features:
134
- df["month"] = [i.month for i in df["timestamp"]]
135
- if "day_of_week" in date_features:
136
- df["day_of_week"] = [i.dayofweek for i in df["timestamp"]]
137
- if "day_of_year" in date_features:
138
- df["day_of_year"] = [i.dayofyear for i in df["timestamp"]]
139
- if "day" in date_features:
140
- df["day"] = [i.day for i in df["timestamp"]]
141
- if "hour" in date_features:
142
- df["hour"] = [i.day for i in df["timestamp"]]
143
- return df
144
-
145
- def get_regression_model(self: MLRegressor) -> tuple[str, str]:
198
+ keep_columns = list(self.features)
199
+ if self.timestamp:
200
+ keep_columns.append(self.timestamp)
201
+ keep_columns.append(self.target)
202
+ self.data_exo = self.data_exo[keep_columns].reset_index(drop=True)
203
+
204
+ if date_features and self.timestamp:
205
+ self.data_exo = utils.add_date_features(
206
+ self.data_exo, timestamp=self.timestamp, date_features=date_features
207
+ )
208
+ elif date_features:
209
+ self.logger.warning("Timestamp is required for date_features. Skipping date features.")
210
+
211
+ y = self.data_exo[self.target]
212
+ X = self.data_exo.drop(
213
+ columns=[self.target, self.timestamp] if self.timestamp else [self.target]
214
+ )
215
+ return X, y
216
+
217
+ def _get_model_and_params(self) -> tuple[GridSearchCV, dict] | tuple[None, None]:
146
218
  r"""
147
219
  Get the base model and parameter grid for the specified regression model.
148
220
  Returns a tuple containing the base model and parameter grid corresponding to \
@@ -153,90 +225,57 @@ class MLRegressor:
153
225
  :return: A tuple containing the base model and parameter grid.
154
226
  :rtype: tuple[sklearn.pipeline.Pipeline, dict] | tuple[None, None]
155
227
  """
156
- if self.regression_model == "LinearRegression":
157
- base_model = REGRESSION_METHODS["LinearRegression"]["model"]
158
- param_grid = REGRESSION_METHODS["LinearRegression"]["param_grid"]
159
- elif self.regression_model == "RidgeRegression":
160
- base_model = REGRESSION_METHODS["RidgeRegression"]["model"]
161
- param_grid = REGRESSION_METHODS["RidgeRegression"]["param_grid"]
162
- elif self.regression_model == "LassoRegression":
163
- base_model = REGRESSION_METHODS["LassoRegression"]["model"]
164
- param_grid = REGRESSION_METHODS["LassoRegression"]["param_grid"]
165
- elif self.regression_model == "RandomForestRegression":
166
- base_model = REGRESSION_METHODS["RandomForestRegression"]["model"]
167
- param_grid = REGRESSION_METHODS["RandomForestRegression"]["param_grid"]
168
- elif self.regression_model == "GradientBoostingRegression":
169
- base_model = REGRESSION_METHODS["GradientBoostingRegression"]["model"]
170
- param_grid = REGRESSION_METHODS["GradientBoostingRegression"]["param_grid"]
171
- elif self.regression_model == "AdaBoostRegression":
172
- base_model = REGRESSION_METHODS["AdaBoostRegression"]["model"]
173
- param_grid = REGRESSION_METHODS["AdaBoostRegression"]["param_grid"]
174
- else:
175
- self.logger.error(
176
- "Passed model %s is not valid",
177
- self.regression_model,
178
- )
179
- return None
180
- return base_model, param_grid
228
+ method = REGRESSION_METHODS.get(self.regression_model)
229
+ if not method:
230
+ self.logger.error("Invalid regression model: %s", self.regression_model)
231
+ return None, None
232
+
233
+ pipeline = make_pipeline(StandardScaler(), method["model"])
234
+ param_grid = method["param_grid"]
235
+ return pipeline, param_grid
181
236
 
182
- def fit(self: MLRegressor, date_features: list | None = None) -> None:
237
+ async def fit(self: MLRegressor, date_features: list[str] | None = None) -> bool:
183
238
  r"""Fit the model using the provided data.
184
239
 
185
240
  :param date_features: A list of 'date_features' to take into account when \
186
241
  fitting the model.
187
242
  :type date_features: list[str] | None
243
+ :return: True once the model has been fitted, False if the regression model name is invalid
244
+ :rtype: bool
188
245
  """
189
- self.logger.info("Performing a MLRegressor fit for %s", self.model_type)
190
- self.data_exo = pd.DataFrame(self.data)
191
- self.data_exo[self.features] = self.data[self.features]
192
- self.data_exo[self.target] = self.data[self.target]
193
- keep_columns = []
194
- keep_columns.extend(self.features)
195
- if self.timestamp is not None:
196
- keep_columns.append(self.timestamp)
197
- keep_columns.append(self.target)
198
- self.data_exo = self.data_exo[self.data_exo.columns.intersection(keep_columns)]
199
- self.data_exo = self.data_exo.reset_index(drop=True)
200
- if date_features is not None:
201
- if self.timestamp is not None:
202
- self.data_exo = MLRegressor.add_date_features(
203
- self.data_exo,
204
- date_features,
205
- self.timestamp,
206
- )
207
- else:
208
- self.logger.error(
209
- "If no timestamp provided, you can't use date_features, going \
210
- further without date_features.",
211
- )
212
- y = self.data_exo[self.target]
213
- self.data_exo = self.data_exo.drop(self.target, axis=1)
214
- if self.timestamp is not None:
215
- self.data_exo = self.data_exo.drop(self.timestamp, axis=1)
216
- X = self.data_exo
246
+ self.logger.info("Fitting MLRegressor model for %s", self.model_type)
247
+
248
+ X, y = self._prepare_data(date_features)
217
249
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
218
250
  self.steps = len(X_test)
219
- base_model, param_grid = self.get_regression_model()
220
- self.model = make_pipeline(StandardScaler(), base_model)
221
- # Create a grid search object
222
- self.grid_search = GridSearchCV(self.model, param_grid, cv=5, scoring="neg_mean_squared_error",
223
- refit=True, verbose=0, n_jobs=-1)
224
- # Fit the grid search object to the data
225
- self.logger.info("Training a %s model", self.regression_model)
226
- start_time = time.time()
227
- self.grid_search.fit(X_train.values, y_train.values)
228
- self.logger.info("Elapsed time for model fit: %s", time.time() - start_time)
229
- self.model = self.grid_search.best_estimator_
230
- # Make predictions
231
- predictions = self.model.predict(X_test.values)
232
- predictions = pd.Series(predictions, index=X_test.index)
233
- pred_metric = r2_score(y_test, predictions)
234
- self.logger.info(
235
- "Prediction R2 score of fitted model on test data: %s",
236
- pred_metric,
251
+
252
+ model_pipeline, param_grid = self._get_model_and_params()
253
+ if model_pipeline is None:
254
+ return False
255
+
256
+ self.grid_search = GridSearchCV(
257
+ model_pipeline,
258
+ param_grid,
259
+ cv=5,
260
+ scoring="neg_mean_squared_error",
261
+ refit=True,
262
+ verbose=0,
263
+ n_jobs=-1,
237
264
  )
238
265
 
239
- def predict(self: MLRegressor, new_values: list) -> np.ndarray:
266
+ self.logger.info("Training model: %s", self.regression_model)
267
+ start = time.time()
268
+ await asyncio.to_thread(self.grid_search.fit, X_train.values, y_train.values)
269
+ self.logger.info("Model fit completed in %.2f seconds", time.time() - start)
270
+
271
+ self.model = self.grid_search.best_estimator_
272
+
273
+ predictions = await asyncio.to_thread(self.model.predict, X_test.values)
274
+ r2 = r2_score(y_test, predictions)
275
+ self.logger.info("R2 score on test set: %.4f", r2)
276
+ return True
277
+
278
+ async def predict(self: MLRegressor, new_values: list[float]) -> np.ndarray:
240
279
  """Predict a new value.
241
280
 
242
281
  :param new_values: The new values for the features \
@@ -246,6 +285,7 @@ class MLRegressor:
246
285
  :return: The np.ndarray containing the predicted value.
247
286
  :rtype: np.ndarray
248
287
  """
249
- self.logger.info("Performing a prediction for %s", self.model_type)
250
- new_values = np.array([new_values])
251
- return self.model.predict(new_values)
288
+ self.logger.info("Making prediction with model %s", self.model_type)
289
+ new_values_array = np.array([new_values])
290
+ prediction = await asyncio.to_thread(self.model.predict, new_values_array)
291
+ return prediction