emhass 0.11.4__py3-none-any.whl → 0.15.5__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- import copy
5
+ import asyncio
6
6
  import time
7
7
  import warnings
8
8
  from typing import TYPE_CHECKING
@@ -11,20 +11,36 @@ import numpy as np
11
11
  import pandas as pd
12
12
  from sklearn.ensemble import (
13
13
  AdaBoostRegressor,
14
+ ExtraTreesRegressor,
14
15
  GradientBoostingRegressor,
15
16
  RandomForestRegressor,
16
17
  )
17
- from sklearn.linear_model import Lasso, LinearRegression, Ridge
18
+ from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
18
19
  from sklearn.metrics import r2_score
19
20
  from sklearn.model_selection import GridSearchCV, train_test_split
21
+ from sklearn.neighbors import KNeighborsRegressor
22
+ from sklearn.neural_network import MLPRegressor
20
23
  from sklearn.pipeline import make_pipeline
21
24
  from sklearn.preprocessing import StandardScaler
25
+ from sklearn.svm import SVR
26
+ from sklearn.tree import DecisionTreeRegressor
27
+
28
+ from emhass import utils
22
29
 
23
30
  if TYPE_CHECKING:
24
31
  import logging
25
32
 
26
33
  warnings.filterwarnings("ignore", category=DeprecationWarning)
27
34
 
35
+
36
+ # AUTHORITATIVE SOURCE: Supported regression models for MLRegressor and adjust_pv_forecast
37
+ # When adding/removing models, also update:
38
+ # - src/emhass/static/data/param_definitions.json (adjusted_pv_regression_model select_options)
39
+ # - docs/config.md (adjusted_pv_regression_model description)
40
+ # - docs/forecasts.md (Model Training section)
41
+ # - src/emhass/forecast.py (adjust_pv_forecast_fit docstring)
42
+ # Define a seed for reproducibility
43
+ seed = 42
28
44
  REGRESSION_METHODS = {
29
45
  "LinearRegression": {
30
46
  "model": LinearRegression(),
@@ -35,30 +51,81 @@ REGRESSION_METHODS = {
35
51
  },
36
52
  "RidgeRegression": {
37
53
  "model": Ridge(),
38
- "param_grid": {"ridge__alpha": [0.1, 1.0, 10.0]},
54
+ "param_grid": {"ridge__alpha": [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]},
39
55
  },
40
56
  "LassoRegression": {
41
- "model": Lasso(),
42
- "param_grid": {"lasso__alpha": [0.1, 1.0, 10.0]},
57
+ "model": Lasso(random_state=seed),
58
+ "param_grid": {"lasso__alpha": [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]},
59
+ },
60
+ "ElasticNet": {
61
+ "model": ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=seed),
62
+ "param_grid": {
63
+ "elasticnet__alpha": [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
64
+ "elasticnet__l1_ratio": [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
65
+ },
66
+ },
67
+ "KNeighborsRegressor": {
68
+ "model": KNeighborsRegressor(),
69
+ "param_grid": {
70
+ "kneighborsregressor__n_neighbors": [3, 5, 7, 10, 15],
71
+ "kneighborsregressor__weights": ["uniform", "distance"],
72
+ },
73
+ },
74
+ "DecisionTreeRegressor": {
75
+ "model": DecisionTreeRegressor(ccp_alpha=0.0, random_state=seed),
76
+ "param_grid": {
77
+ "decisiontreeregressor__max_depth": [None, 5, 10, 20],
78
+ "decisiontreeregressor__min_samples_split": [2, 5, 10],
79
+ },
80
+ },
81
+ "SVR": {
82
+ "model": SVR(),
83
+ "param_grid": {
84
+ "svr__C": [0.1, 1, 10, 100],
85
+ "svr__gamma": ["scale", "auto"],
86
+ "svr__kernel": ["rbf", "linear"],
87
+ },
43
88
  },
44
- "RandomForestRegression": {
45
- "model": RandomForestRegressor(),
46
- "param_grid": {"randomforestregressor__n_estimators": [50, 100, 200]},
89
+ "RandomForestRegressor": {
90
+ "model": RandomForestRegressor(min_samples_leaf=1, max_features=1.0, random_state=seed),
91
+ "param_grid": {
92
+ "randomforestregressor__n_estimators": [50, 100, 200],
93
+ "randomforestregressor__max_depth": [None, 10, 20],
94
+ "randomforestregressor__max_features": ["sqrt", "log2", None],
95
+ },
47
96
  },
48
- "GradientBoostingRegression": {
49
- "model": GradientBoostingRegressor(),
97
+ "ExtraTreesRegressor": {
98
+ "model": ExtraTreesRegressor(min_samples_leaf=1, max_features=1.0, random_state=seed),
99
+ "param_grid": {
100
+ "extratreesregressor__n_estimators": [50, 100, 200],
101
+ "extratreesregressor__max_depth": [None, 10, 20],
102
+ "extratreesregressor__max_features": ["sqrt", "log2", None],
103
+ },
104
+ },
105
+ "GradientBoostingRegressor": {
106
+ "model": GradientBoostingRegressor(learning_rate=0.1, random_state=seed),
50
107
  "param_grid": {
51
108
  "gradientboostingregressor__n_estimators": [50, 100, 200],
52
109
  "gradientboostingregressor__learning_rate": [0.01, 0.1, 0.2],
110
+ "gradientboostingregressor__max_depth": [3, 5, 10],
53
111
  },
54
112
  },
55
- "AdaBoostRegression": {
56
- "model": AdaBoostRegressor(),
113
+ "AdaBoostRegressor": {
114
+ "model": AdaBoostRegressor(learning_rate=1.0, random_state=seed),
57
115
  "param_grid": {
58
116
  "adaboostregressor__n_estimators": [50, 100, 200],
59
117
  "adaboostregressor__learning_rate": [0.01, 0.1, 0.2],
60
118
  },
61
119
  },
120
+ "MLPRegressor": {
121
+ "model": MLPRegressor(hidden_layer_sizes=(100,), random_state=seed),
122
+ "param_grid": {
123
+ "mlpregressor__hidden_layer_sizes": [(50,), (100,), (50, 50)],
124
+ "mlpregressor__activation": ["relu", "tanh"],
125
+ "mlpregressor__alpha": [1e-4, 1e-3],
126
+ "mlpregressor__max_iter": [500],
127
+ },
128
+ },
62
129
  }
63
130
 
64
131
 
@@ -81,7 +148,7 @@ class MLRegressor:
81
148
  data: pd.DataFrame,
82
149
  model_type: str,
83
150
  regression_model: str,
84
- features: list,
151
+ features: list[str],
85
152
  target: str,
86
153
  timestamp: str,
87
154
  logger: logging.Logger,
@@ -109,50 +176,45 @@ class MLRegressor:
109
176
  :param logger: The passed logger object
110
177
  :type logger: logging.Logger
111
178
  """
112
- self.data = data
179
+ self.data = data.sort_index()
113
180
  self.features = features
114
181
  self.target = target
115
182
  self.timestamp = timestamp
116
183
  self.model_type = model_type
117
184
  self.regression_model = regression_model
118
185
  self.logger = logger
119
- self.data = self.data.sort_index()
186
+
120
187
  self.data = self.data[~self.data.index.duplicated(keep="first")]
121
- self.data_exo = None
122
- self.steps = None
188
+ self.data_exo: pd.DataFrame | None = None
189
+ self.steps: int | None = None
123
190
  self.model = None
124
- self.grid_search = None
191
+ self.grid_search: GridSearchCV | None = None
125
192
 
126
- @staticmethod
127
- def add_date_features(
128
- data: pd.DataFrame, date_features: list, timestamp: str
129
- ) -> pd.DataFrame:
130
- """Add date features from the input DataFrame timestamp.
193
+ def _prepare_data(self, date_features: list[str] | None) -> tuple[pd.DataFrame, pd.Series]:
194
+ self.data_exo = self.data.copy()
195
+ self.data_exo[self.features] = self.data[self.features]
196
+ self.data_exo[self.target] = self.data[self.target]
131
197
 
132
- :param data: The input DataFrame
133
- :type data: pd.DataFrame
134
- :param timestamp: The column containing the timestamp
135
- :type timestamp: str
136
- :return: The DataFrame with the added features
137
- :rtype: pd.DataFrame
138
- """
139
- df = copy.deepcopy(data) # noqa: PD901
140
- df[timestamp] = pd.to_datetime(df["timestamp"])
141
- if "year" in date_features:
142
- df["year"] = [i.year for i in df["timestamp"]]
143
- if "month" in date_features:
144
- df["month"] = [i.month for i in df["timestamp"]]
145
- if "day_of_week" in date_features:
146
- df["day_of_week"] = [i.dayofweek for i in df["timestamp"]]
147
- if "day_of_year" in date_features:
148
- df["day_of_year"] = [i.dayofyear for i in df["timestamp"]]
149
- if "day" in date_features:
150
- df["day"] = [i.day for i in df["timestamp"]]
151
- if "hour" in date_features:
152
- df["hour"] = [i.day for i in df["timestamp"]]
153
- return df
154
-
155
- def get_regression_model(self: MLRegressor) -> tuple[str, str]:
198
+ keep_columns = list(self.features)
199
+ if self.timestamp:
200
+ keep_columns.append(self.timestamp)
201
+ keep_columns.append(self.target)
202
+ self.data_exo = self.data_exo[keep_columns].reset_index(drop=True)
203
+
204
+ if date_features and self.timestamp:
205
+ self.data_exo = utils.add_date_features(
206
+ self.data_exo, timestamp=self.timestamp, date_features=date_features
207
+ )
208
+ elif date_features:
209
+ self.logger.warning("Timestamp is required for date_features. Skipping date features.")
210
+
211
+ y = self.data_exo[self.target]
212
+ X = self.data_exo.drop(
213
+ columns=[self.target, self.timestamp] if self.timestamp else [self.target]
214
+ )
215
+ return X, y
216
+
217
+ def _get_model_and_params(self) -> tuple[GridSearchCV, dict] | tuple[None, None]:
156
218
  r"""
157
219
  Get the base model and parameter grid for the specified regression model.
158
220
  Returns a tuple containing the base model and parameter grid corresponding to \
@@ -163,33 +225,16 @@ class MLRegressor:
163
225
  :return: A tuple containing the base model and parameter grid.
164
226
  :rtype: tuple[str, str]
165
227
  """
166
- if self.regression_model == "LinearRegression":
167
- base_model = REGRESSION_METHODS["LinearRegression"]["model"]
168
- param_grid = REGRESSION_METHODS["LinearRegression"]["param_grid"]
169
- elif self.regression_model == "RidgeRegression":
170
- base_model = REGRESSION_METHODS["RidgeRegression"]["model"]
171
- param_grid = REGRESSION_METHODS["RidgeRegression"]["param_grid"]
172
- elif self.regression_model == "LassoRegression":
173
- base_model = REGRESSION_METHODS["LassoRegression"]["model"]
174
- param_grid = REGRESSION_METHODS["LassoRegression"]["param_grid"]
175
- elif self.regression_model == "RandomForestRegression":
176
- base_model = REGRESSION_METHODS["RandomForestRegression"]["model"]
177
- param_grid = REGRESSION_METHODS["RandomForestRegression"]["param_grid"]
178
- elif self.regression_model == "GradientBoostingRegression":
179
- base_model = REGRESSION_METHODS["GradientBoostingRegression"]["model"]
180
- param_grid = REGRESSION_METHODS["GradientBoostingRegression"]["param_grid"]
181
- elif self.regression_model == "AdaBoostRegression":
182
- base_model = REGRESSION_METHODS["AdaBoostRegression"]["model"]
183
- param_grid = REGRESSION_METHODS["AdaBoostRegression"]["param_grid"]
184
- else:
185
- self.logger.error(
186
- "Passed model %s is not valid",
187
- self.regression_model,
188
- )
228
+ method = REGRESSION_METHODS.get(self.regression_model)
229
+ if not method:
230
+ self.logger.error("Invalid regression model: %s", self.regression_model)
189
231
  return None, None
190
- return base_model, param_grid
191
232
 
192
- def fit(self: MLRegressor, date_features: list | None = None) -> bool:
233
+ pipeline = make_pipeline(StandardScaler(), method["model"])
234
+ param_grid = method["param_grid"]
235
+ return pipeline, param_grid
236
+
237
+ async def fit(self: MLRegressor, date_features: list[str] | None = None) -> bool:
193
238
  r"""Fit the model using the provided data.
194
239
 
195
240
  :param date_features: A list of 'date_features' to take into account when \
@@ -198,45 +243,18 @@ class MLRegressor:
198
243
  :return: bool if successful
199
244
  :rtype: bool
200
245
  """
201
- self.logger.info("Performing a MLRegressor fit for %s", self.model_type)
202
- self.data_exo = pd.DataFrame(self.data)
203
- self.data_exo[self.features] = self.data[self.features]
204
- self.data_exo[self.target] = self.data[self.target]
205
- keep_columns = []
206
- keep_columns.extend(self.features)
207
- if self.timestamp is not None:
208
- keep_columns.append(self.timestamp)
209
- keep_columns.append(self.target)
210
- self.data_exo = self.data_exo[self.data_exo.columns.intersection(keep_columns)]
211
- self.data_exo = self.data_exo.reset_index(drop=True)
212
- if date_features is not None:
213
- if self.timestamp is not None:
214
- self.data_exo = MLRegressor.add_date_features(
215
- self.data_exo,
216
- date_features,
217
- self.timestamp,
218
- )
219
- else:
220
- self.logger.error(
221
- "If no timestamp provided, you can't use date_features, going \
222
- further without date_features.",
223
- )
224
- y = self.data_exo[self.target]
225
- self.data_exo = self.data_exo.drop(self.target, axis=1)
226
- if self.timestamp is not None:
227
- self.data_exo = self.data_exo.drop(self.timestamp, axis=1)
228
- X = self.data_exo
229
- X_train, X_test, y_train, y_test = train_test_split(
230
- X, y, test_size=0.2, random_state=42
231
- )
246
+ self.logger.info("Fitting MLRegressor model for %s", self.model_type)
247
+
248
+ X, y = self._prepare_data(date_features)
249
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
232
250
  self.steps = len(X_test)
233
- base_model, param_grid = self.get_regression_model()
234
- if base_model is None:
251
+
252
+ model_pipeline, param_grid = self._get_model_and_params()
253
+ if model_pipeline is None:
235
254
  return False
236
- self.model = make_pipeline(StandardScaler(), base_model)
237
- # Create a grid search object
255
+
238
256
  self.grid_search = GridSearchCV(
239
- self.model,
257
+ model_pipeline,
240
258
  param_grid,
241
259
  cv=5,
242
260
  scoring="neg_mean_squared_error",
@@ -244,23 +262,20 @@ class MLRegressor:
244
262
  verbose=0,
245
263
  n_jobs=-1,
246
264
  )
247
- # Fit the grid search object to the data
248
- self.logger.info("Training a %s model", self.regression_model)
249
- start_time = time.time()
250
- self.grid_search.fit(X_train.values, y_train.values)
251
- self.logger.info("Elapsed time for model fit: %s", time.time() - start_time)
265
+
266
+ self.logger.info("Training model: %s", self.regression_model)
267
+ start = time.time()
268
+ await asyncio.to_thread(self.grid_search.fit, X_train.values, y_train.values)
269
+ self.logger.info("Model fit completed in %.2f seconds", time.time() - start)
270
+
252
271
  self.model = self.grid_search.best_estimator_
253
- # Make predictions
254
- predictions = self.model.predict(X_test.values)
255
- predictions = pd.Series(predictions, index=X_test.index)
256
- pred_metric = r2_score(y_test, predictions)
257
- self.logger.info(
258
- "Prediction R2 score of fitted model on test data: %s",
259
- pred_metric,
260
- )
272
+
273
+ predictions = await asyncio.to_thread(self.model.predict, X_test.values)
274
+ r2 = r2_score(y_test, predictions)
275
+ self.logger.info("R2 score on test set: %.4f", r2)
261
276
  return True
262
277
 
263
- def predict(self: MLRegressor, new_values: list) -> np.ndarray:
278
+ async def predict(self: MLRegressor, new_values: list[float]) -> np.ndarray:
264
279
  """Predict a new value.
265
280
 
266
281
  :param new_values: The new values for the features \
@@ -270,6 +285,7 @@ class MLRegressor:
270
285
  :return: The np.ndarray containing the predicted value.
271
286
  :rtype: np.ndarray
272
287
  """
273
- self.logger.info("Performing a prediction for %s", self.model_type)
274
- new_values = np.array([new_values])
275
- return self.model.predict(new_values)
288
+ self.logger.info("Making prediction with model %s", self.model_type)
289
+ new_values_array = np.array([new_values])
290
+ prediction = await asyncio.to_thread(self.model.predict, new_values_array)
291
+ return prediction