openstef 3.4.73__py3-none-any.whl → 3.4.75__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
openstef/model/regressors/median.py

@@ -1,3 +1,7 @@
+ # SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com>
+ #
+ # SPDX-License-Identifier: MPL-2.0
+
  """This module contains the median regressor."""

  import numpy as np
@@ -12,6 +16,8 @@ from openstef.model.regressors.regressor import OpenstfRegressor
  class MedianRegressor(OpenstfRegressor, RegressorMixin):
  """
  Median regressor implementing the OpenSTEF regressor API.
+ Note that this is an autoregressive model, meaning that it uses the previous
+ predictions to predict the next value.

  This regressor is good for predicting two types of signals:
  - Signals with very slow dynamics compared to the sampling rate, possibly
@@ -20,6 +26,19 @@ class MedianRegressor(OpenstfRegressor, RegressorMixin):
  depend on unknown features, but tend to be stable in each state. An example of
  this may be waste heat delivered from an industrial process. Using a median
  over the last few timesteps adds some hysteresis to avoid triggering on noise.
+
+ Tips for using this regressor:
+ - Set the lags to be evenly spaced and at a frequency matching the
+ frequency of the input data. For example, if the input data is at 15
+ minute intervals, set the lags to be at 15 minute intervals as well.
+ - Use a small training dataset, since there are no actual parameters to train.
+ - Set the frequency of the input data index to avoid inferring it. Inference might be
+ a problem if we get very small chunks of data in training or validation sets.
+ - Use only one training horizon, since the regressor will use the same lags for all
+ training horizons.
+ - Allow for missing data by setting completeness_threshold to 0. If the prediction horizon
+ is larger than the context window there will be a lot of NaNs in the input data, but
+ the autoregression solves that.
  """

  def __init__(self):
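To make the tips above concrete, here is a minimal usage sketch (an editorial addition, not part of the released package). The lag column names and values are invented, but they follow the evenly spaced, frequency-matched "T-<minutes>min" naming that the new fit and predict methods shown further down validate.

# Minimal usage sketch (illustrative only; lag names and values are made up,
# but follow the "T-<minutes>min" convention that fit validates).
import numpy as np
import pandas as pd

from openstef.model.regressors.median import MedianRegressor

rng = np.random.default_rng(0)
index = pd.date_range("2025-01-01", periods=8, freq="15min")
lags = pd.DataFrame(
    {
        "T-15min": rng.random(8),
        "T-30min": rng.random(8),
        "T-45min": rng.random(8),
    },
    index=index,
)
target = pd.Series(rng.random(8), index=index)

model = MedianRegressor()
model.fit(lags, target)           # stores the lag names, their order, and the 15-minute frequency
prediction = model.predict(lags)  # row-wise median of the lags, with autoregressive NaN filling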
@@ -37,10 +56,160 @@ class MedianRegressor(OpenstfRegressor, RegressorMixin):
  check_is_fitted(self, "feature_names_")
  return self.feature_names_

+ @property
+ def frequency(self) -> int:
+ """Retrieve the model input frequency.
+
+ Returns:
+ The frequency of the model input in minutes.
+
+ """
+ check_is_fitted(self, "frequency_")
+ return self.frequency_
+
  @property
  def can_predict_quantiles(self) -> bool:
  return False

+ @staticmethod
+ def _get_importance_names():
+ # This function does not do much, but needs to be implemented
+ # in order to get the feature importances from the regressor.
+ # Keys need to be these specific strings, values can be anything
+ return {
+ "gain_importance_name": "exists",
+ "weight_importance_name": "exists",
+ }
+
+ @staticmethod
+ def _infer_frequency(index: pd.DatetimeIndex) -> pd.Timedelta:
+ """
+ Infer the frequency of a pandas DatetimeIndex if the freq attribute is not set.
+ This method calculates the most common time difference between consecutive timestamps,
+ which is more permissive of missing chunks of data than the pandas infer_freq method.
+
+ Args:
+ index (pd.DatetimeIndex): The datetime index to infer the frequency from.
+
+ Returns:
+ pd.Timedelta: The inferred frequency as a pandas Timedelta.
+ """
+ if len(index) < 2:
+ raise ValueError(
+ "Cannot infer frequency from an index with fewer than 2 timestamps."
+ )
+
+ # Calculate the differences between consecutive timestamps
+ deltas = index.to_series().diff().dropna()
+
+ # Find the most common difference
+ inferred_freq = deltas.mode().iloc[0]
+ return inferred_freq
+
+ def _frequency_matches(self, index: pd.DatetimeIndex) -> bool:
+ """
+ Check if the frequency of the input data matches the model frequency.
+
+ Args:
+ index (pd.DatetimeIndex): The datetime index of the input data to check.
+
+ Returns:
+ bool: True if the frequencies match, False otherwise.
+ """
+ if not isinstance(index, pd.DatetimeIndex):
+ raise ValueError(
+ "The index of the input data must be a pandas DatetimeIndex."
+ )
+
+ if index.freq is None:
+ input_frequency = self._infer_frequency(index)
+ else:
+ input_frequency = index.freq
+
+ return input_frequency == pd.Timedelta(minutes=self.frequency)
+
+ @staticmethod
+ def _extract_and_validate_lags(
+ x: pd.DataFrame,
+ ) -> tuple[tuple[str], int, list[tuple[str, int]]]:
+ """Extract and validate the lag features from the input data.
+
+ This method checks that the lag features are evenly spaced and match the frequency of the input data.
+ It also extracts the lag features and their corresponding time deltas.
+ Args:
+ x (pd.DataFrame): The input data containing lag features.
+ Returns:
+ tuple: A tuple containing:
+ - A list of feature names, sorted by their lag in minutes.
+ - The frequency of the lag features in minutes.
+ - A list of tuples containing the lag feature names and their corresponding time deltas in minutes.
+ """
+ # Check that the input data contains the required lag features
+ feature_names = list(x.columns[x.columns.str.startswith("T-")])
+ if len(feature_names) == 0:
+ raise ValueError("No lag features found in the input data.")
+
+ # Convert all lags to minutes to make comparable
+ feature_to_lags_in_min = []
+ for feature in feature_names:
+ if feature.endswith("min"):
+ lag_in_min = int(feature.split("-")[1].split("min")[0])
+ elif feature.endswith("d"):
+ lag_in_min = int(feature.split("-")[1].split("d")[0]) * 60 * 24
+ else:
+ raise ValueError(
+ f"Feature name '{feature}' does not follow the expected format."
+ " Expected format is 'T-<lag_in_minutes>' or 'T-<lag_in_days>d'."
+ )
+ feature_to_lags_in_min.append((feature, lag_in_min))
+
+ # Sort the features by lag in minutes
+ feature_to_lags_in_min.sort(key=lambda x: x[1])
+ sorted_features, sorted_lags_in_min = zip(*feature_to_lags_in_min)
+
+ # Check that the lags are evenly spaced
+ diffs = np.diff(sorted_lags_in_min)
+ unique_diffs = np.unique(diffs)
+ if len(unique_diffs) > 1:
+ raise ValueError(
+ "Lag features are not evenly spaced. "
+ f"Got lags with differences: {unique_diffs} min. "
+ "Please ensure that the lag features are generated correctly."
+ )
+ frequency = unique_diffs[0]
+
+ return sorted_features, frequency, feature_to_lags_in_min
+
+ @staticmethod
+ def _fill_diagonal_with_median(
+ lag_array: np.ndarray, start: int, end: int, median: float
+ ):
+ # Use the calculated median to fill in future lag values where this prediction would be used as input.
+
+ # If the start index is beyond the array bounds, no future updates are needed from this step.
+ if start >= lag_array.shape[0]:
+ return lag_array
+
+ # Ensure the end index does not exceed the array bounds.
+ end = min(end, lag_array.shape[0])
+
+ # Get a view of the sub-array where the diagonal needs to be filled.
+ # The slice represents future time steps (rows) and corresponding lag features (columns).
+ # Rows: from 'start' up to (but not including) 'end'
+ # Columns: from 0 up to (but not including) 'end - start'
+ # This selects the part of the array where lag_array[start + k, k] resides for k in range(end - start).
+ view = lag_array[start:end, 0 : (end - start)]
+
+ # Create a mask for NaNs on the diagonal
+ diagonal_nan_mask = np.isnan(np.diag(view))
+
+ # Only update if there are NaNs on the diagonal
+ if np.any(diagonal_nan_mask):
+ # Create a temporary array to hold the new diagonal
+ updated_diagonal = np.diag(view).copy()
+ updated_diagonal[diagonal_nan_mask] = median
+ np.fill_diagonal(view, updated_diagonal)
+
  def predict(self, x: pd.DataFrame, **kwargs) -> np.array:
  """
  Predict the median of the lag features for each time step in the context window.
@@ -53,21 +222,105 @@
  If any lag feature is NaN, this will be ignored.
  If all lag features are NaN, the regressor will return NaN.
  """
+ if not self._frequency_matches(x.index):
+ raise ValueError(
+ f"The input data frequency ({x.index.freq}) does not match the model frequency ({self.frequency})."
+ )
+
+ # Check that the input data contains the required lag features
+ missing_features = set(self.feature_names) - set(x.columns)
+ if missing_features:
+ raise ValueError(
+ f"The input data is missing the following lag features: {missing_features}"
+ )

- lag_df = x.loc[:, self.feature_names]
- median = lag_df.median(axis=1, skipna=True)
+ # Reindex the input data to ensure there are no gaps in the time series.
+ # This is important for the autoregressive logic that follows.
+ # Store the original index to return predictions aligned with the input.
+ original_index = x.index.copy()
+ first_index = x.index[0]
+ last_index = x.index[-1]
+ freq = pd.Timedelta(minutes=self.frequency)
+ # Create a new date range with the expected frequency.
+ new_index = pd.date_range(first_index, last_index, freq=freq)
+ # Reindex the input DataFrame, filling any new timestamps with NaN.
+ x = x.reindex(new_index, fill_value=np.nan)

- return median
+ # Select only the lag feature columns in the specified order.
+ lag_df = x[self.feature_names]
+
+ # Convert the lag DataFrame and its index to NumPy arrays for faster processing.
+ lag_array = lag_df.to_numpy()
+ time_index = lag_df.index.to_numpy()
+ # Initialize the prediction array with NaNs.
+ prediction = np.full(lag_array.shape[0], np.nan)
+
+ # Calculate the time step size based on the model frequency.
+ step_size = pd.Timedelta(minutes=self.frequency)
+ # Determine the number of steps corresponding to the smallest and largest lags.
+ smallest_lag_steps = int(
+ self.lags_to_time_deltas_[self.feature_names[0]] / step_size
+ )
+ largest_lag_steps = int(
+ self.lags_to_time_deltas_[self.feature_names[-1]] / step_size
+ )
+
+ # Iterate through each time step in the reindexed data.
+ for time_step in range(lag_array.shape[0]):
+ # Get the lag features for the current time step.
+ current_lags = lag_array[time_step]
+ # Calculate the median of the available lag features, ignoring NaNs.
+ median = np.nanmedian(current_lags)
+ # Store the calculated median in the prediction array.
+ prediction[time_step] = median
+
+ # If the median calculation resulted in NaN (e.g., all lags were NaN), skip the autoregression step.
+ if np.isnan(median):
+ continue
+
+ # Auto-regressive step: update the lag array for future time steps.
+ # Calculate the start and end indices in the future time steps that will be affected.
+ start, end = (
+ time_step + smallest_lag_steps,
+ time_step + largest_lag_steps + 1,
+ )
+ self._fill_diagonal_with_median(lag_array, start, end, median)
+
+ # Convert the prediction array back to a pandas DataFrame using the reindexed time index.
+ prediction_df = pd.DataFrame(prediction, index=time_index, columns=["median"])
+ # Select only the predictions corresponding to the original input index.
+ prediction = prediction_df.loc[original_index].to_numpy().flatten()
+
+ # Return the final predictions as a flattened NumPy array.
+ return prediction

  def fit(self, x: pd.DataFrame, y: pd.DataFrame, **kwargs) -> RegressorMixin:
  """This model does not have any hyperparameters to fit,
- but it does need to know the feature names of the lag features.
+ but it does need to know the feature names of the lag features and the order of these.
+
+ Lag features are expected to be evenly spaced and match the frequency of the input data.
+ The lag features are expected to be named in the format T-<lag_in_minutes>min or T-<lag_in_days>d.
+ For example, T-1min, T-2min, T-3min or T-1d, T-2d.

  Which lag features are used is determined by the feature engineering step.
  """
- self.feature_names_ = list(x.columns[x.columns.str.startswith("T-")])
- if len(self.feature_names_) == 0:
- raise ValueError("No lag features found in the input data.")
+ feature_names, frequency, feature_to_lags_in_min = (
+ self._extract_and_validate_lags(x)
+ )
+
+ self.feature_names_ = list(feature_names)
+ self.frequency_ = frequency
+ self.lags_to_time_deltas_ = {
+ key: pd.Timedelta(minutes=val) for key, val in feature_to_lags_in_min
+ }
+
+ # Check that the frequency of the input data matches the frequency of the lags
+ if not self._frequency_matches(
+ x.index.drop_duplicates()
+ ): # Several training horizons give duplicates
+ raise ValueError(
+ f"The input data frequency ({x.index.freq}) does not match the model frequency ({self.frequency})."
+ )

  self.feature_importances_ = np.ones(len(self.feature_names_)) / (
  len(self.feature_names_) or 1.0
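The least obvious part of the new predict method is the autoregressive update done by _fill_diagonal_with_median: the median computed at one time step is written onto the diagonal of the future rows that would use it as a lag, but only where those lags are still NaN. The following standalone NumPy sketch mirrors that indexing with hypothetical numbers (it does not call the package code) for an assumed setup of three lags, T-15min, T-30min and T-45min, at a 15-minute step.

# Standalone sketch of the autoregressive fill (hypothetical numbers).
import numpy as np

# 6 time steps, 3 lag columns: T-15min, T-30min, T-45min (smallest lag = 1 step).
lag_array = np.array(
    [
        [1.0, 2.0, 3.0],
        [np.nan, 1.0, 2.0],
        [np.nan, np.nan, 1.0],
        [np.nan, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
    ]
)

time_step = 0
median = np.nanmedian(lag_array[time_step])    # median of [1, 2, 3] -> 2.0
start, end = time_step + 1, time_step + 3 + 1  # rows that would use this prediction as a lag

view = lag_array[start:end, 0 : (end - start)]  # rows 1-3, lag columns 0-2
diag = np.diag(view).copy()
diag[np.isnan(diag)] = median                   # only fill lags that are still NaN
np.fill_diagonal(view, diag)                    # writes through the view into lag_array

# lag_array rows 1-3 now read:
#   [2.0, 1.0, 2.0]   row 1: T-15min (1 step ahead) filled with the row-0 prediction
#   [nan, 2.0, 1.0]   row 2: T-30min (2 steps ahead) filled
#   [nan, nan, 2.0]   row 3: T-45min (3 steps ahead) filled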
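Putting fit and predict together, the practical effect is that a horizon longer than the available context no longer yields NaN predictions: missing lags are filled with earlier medians. Below is a hedged end-to-end sketch with made-up values, assuming only the API shown in this diff; the expected output is traced by hand from the logic above, not from running the released package.

# End-to-end sketch (illustrative data): the last four rows have no lag values,
# so their predictions come entirely from the autoregressive fill.
import numpy as np
import pandas as pd

from openstef.model.regressors.median import MedianRegressor

index = pd.date_range("2025-01-01", periods=6, freq="15min")
lags = pd.DataFrame(
    {
        "T-15min": [10.0, 11.0, np.nan, np.nan, np.nan, np.nan],
        "T-30min": [12.0, 10.0, np.nan, np.nan, np.nan, np.nan],
    },
    index=index,
)

model = MedianRegressor()
model.fit(lags, pd.Series(np.zeros(6), index=index))
print(model.predict(lags))
# Expected: [11.0, 10.5, 10.75, 10.625, 10.6875, 10.65625]
# Later steps reuse earlier medians as lag values instead of returning NaN.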
openstef-3.4.73.dist-info/METADATA → openstef-3.4.75.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: openstef
- Version: 3.4.73
+ Version: 3.4.75
  Summary: Open short term energy forecaster
  Home-page: https://github.com/OpenSTEF/openstef
  Author: Alliander N.V
openstef-3.4.73.dist-info/RECORD → openstef-3.4.75.dist-info/RECORD

@@ -64,7 +64,7 @@ openstef/model/regressors/gblinear_quantile.py,sha256=PKQL_TAXa3Kw9oZrKC6Uvo_n2N
  openstef/model/regressors/lgbm.py,sha256=zCdn1euEdSFxYJzH8XqQFFnb6R4JVUnmineKjX_Gy-g,800
  openstef/model/regressors/linear.py,sha256=uOvZMLGZH_9nXfmS5honCMfyVeyGXP1Cza9A_BdXlVw,3665
  openstef/model/regressors/linear_quantile.py,sha256=zIpGo9deMeTZdwFWoZ3FstX74mYdlAhfg-YOsPRFl0k,10534
- openstef/model/regressors/median.py,sha256=ITM5QhqfvjMjfk8fuHbyVWyWgld1NTiWajaaWAugbis,2697
+ openstef/model/regressors/median.py,sha256=i6nqSsKHnMxA06Ea6SNWIn4f8lvAaMz58Smx3bZ731E,14132
  openstef/model/regressors/regressor.py,sha256=0um575rTEkzYb1E5IAOuTlsZDhmb7eI5byu5e062NRs,3469
  openstef/model/regressors/xgb.py,sha256=uhV9Wm90aOkjByTm-O2xpt2kpANRxAqQvv5mA0H1uBc,1294
  openstef/model/regressors/xgb_multioutput_quantile.py,sha256=xWzA7tymC_o-F1OS3I7vUKf9zP6RR1ZglEeY4NAgjU0,9146
@@ -104,8 +104,8 @@ openstef/tasks/utils/predictionjobloop.py,sha256=Ysy3zF5lzPMz_asYDKeF5m0qgVT3tCt
  openstef/tasks/utils/taskcontext.py,sha256=O-LZ_wHEl5vbT8oB7EYtOeMkvk6EqCnI1-KiyER7Eu4,5407
  openstef/validation/__init__.py,sha256=bIyGTSA4V5VoOLTwdaiJJAnozmpSzvQooVYlsf8H4eU,163
  openstef/validation/validation.py,sha256=r6UqkdH5TMjsGfn8Ta07K1jkqmrVmwcPGfyQvMmZyO4,11459
- openstef-3.4.73.dist-info/licenses/LICENSE,sha256=7Pm2fWFFHHUG5lDHed1vl5CjzxObIXQglnYsEdtjo_k,14907
- openstef-3.4.73.dist-info/METADATA,sha256=7Nxg4a2GePqCFf19ANnEIze222JfNmypS88wt05cdfg,8834
- openstef-3.4.73.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
- openstef-3.4.73.dist-info/top_level.txt,sha256=kD0H4PqrQoncZ957FvqwfBxa89kTrun4Z_RAPs_HhLs,9
- openstef-3.4.73.dist-info/RECORD,,
+ openstef-3.4.75.dist-info/licenses/LICENSE,sha256=7Pm2fWFFHHUG5lDHed1vl5CjzxObIXQglnYsEdtjo_k,14907
+ openstef-3.4.75.dist-info/METADATA,sha256=8Q6z9qqtBZrKepnjG6MSSXzfrSlZ8U2enknksgTBRBI,8834
+ openstef-3.4.75.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
+ openstef-3.4.75.dist-info/top_level.txt,sha256=kD0H4PqrQoncZ957FvqwfBxa89kTrun4Z_RAPs_HhLs,9
+ openstef-3.4.75.dist-info/RECORD,,
openstef-3.4.73.dist-info/WHEEL → openstef-3.4.75.dist-info/WHEEL

@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (79.0.1)
+ Generator: setuptools (80.4.0)
  Root-Is-Purelib: true
  Tag: py3-none-any