spotforecast2 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. spotforecast2/.DS_Store +0 -0
  2. spotforecast2/__init__.py +2 -0
  3. spotforecast2/data/__init__.py +0 -0
  4. spotforecast2/data/data.py +130 -0
  5. spotforecast2/data/fetch_data.py +209 -0
  6. spotforecast2/exceptions.py +681 -0
  7. spotforecast2/forecaster/.DS_Store +0 -0
  8. spotforecast2/forecaster/__init__.py +7 -0
  9. spotforecast2/forecaster/base.py +448 -0
  10. spotforecast2/forecaster/metrics.py +527 -0
  11. spotforecast2/forecaster/recursive/__init__.py +4 -0
  12. spotforecast2/forecaster/recursive/_forecaster_equivalent_date.py +1075 -0
  13. spotforecast2/forecaster/recursive/_forecaster_recursive.py +939 -0
  14. spotforecast2/forecaster/recursive/_warnings.py +15 -0
  15. spotforecast2/forecaster/utils.py +954 -0
  16. spotforecast2/model_selection/__init__.py +5 -0
  17. spotforecast2/model_selection/bayesian_search.py +453 -0
  18. spotforecast2/model_selection/grid_search.py +314 -0
  19. spotforecast2/model_selection/random_search.py +151 -0
  20. spotforecast2/model_selection/split_base.py +357 -0
  21. spotforecast2/model_selection/split_one_step.py +245 -0
  22. spotforecast2/model_selection/split_ts_cv.py +634 -0
  23. spotforecast2/model_selection/utils_common.py +718 -0
  24. spotforecast2/model_selection/utils_metrics.py +103 -0
  25. spotforecast2/model_selection/validation.py +685 -0
  26. spotforecast2/preprocessing/__init__.py +30 -0
  27. spotforecast2/preprocessing/_binner.py +378 -0
  28. spotforecast2/preprocessing/_common.py +123 -0
  29. spotforecast2/preprocessing/_differentiator.py +123 -0
  30. spotforecast2/preprocessing/_rolling.py +136 -0
  31. spotforecast2/preprocessing/curate_data.py +254 -0
  32. spotforecast2/preprocessing/imputation.py +92 -0
  33. spotforecast2/preprocessing/outlier.py +114 -0
  34. spotforecast2/preprocessing/split.py +139 -0
  35. spotforecast2/py.typed +0 -0
  36. spotforecast2/utils/__init__.py +43 -0
  37. spotforecast2/utils/convert_to_utc.py +44 -0
  38. spotforecast2/utils/data_transform.py +208 -0
  39. spotforecast2/utils/forecaster_config.py +344 -0
  40. spotforecast2/utils/generate_holiday.py +70 -0
  41. spotforecast2/utils/validation.py +569 -0
  42. spotforecast2/weather/__init__.py +0 -0
  43. spotforecast2/weather/weather_client.py +288 -0
  44. spotforecast2-0.0.1.dist-info/METADATA +47 -0
  45. spotforecast2-0.0.1.dist-info/RECORD +46 -0
  46. spotforecast2-0.0.1.dist-info/WHEEL +4 -0
@@ -0,0 +1,1075 @@
1
+ from __future__ import annotations
2
+ from typing import Callable, Any
3
+ import warnings
4
+ import sys
5
+ import numpy as np
6
+ import pandas as pd
7
+ from sklearn.exceptions import NotFittedError
8
+
9
+ from spotforecast2.exceptions import MissingValuesWarning
10
+ from spotforecast2.preprocessing import QuantileBinner
11
+
12
+ # from spotforecast2._version import __version__ # Skipping version for now or mock it
13
+ from spotforecast2.forecaster.utils import (
14
+ check_extract_values_and_index,
15
+ get_style_repr_html,
16
+ check_residuals_input,
17
+ )
18
+ from spotforecast2.utils import (
19
+ check_y,
20
+ check_interval,
21
+ expand_index,
22
+ check_predict_input,
23
+ )
24
+ from ._warnings import ResidualsUsageWarning
25
+
26
+ # Mock version if not present
27
+ try:
28
+ from spotforecast2._version import __version__
29
+ except ImportError:
30
+ __version__ = "0.0.1"
31
+
32
+
33
+ class ForecasterEquivalentDate:
34
+ """
35
+ This forecaster predicts future values based on the most recent equivalent
36
+ date. It also allows to aggregate multiple past values of the equivalent
37
+ date using a function (e.g. mean, median, max, min, etc.). The equivalent
38
+ date is calculated by moving back in time a specified number of steps (offset).
39
+ The offset can be defined as an integer or as a pandas DateOffset. This
40
+ approach is useful as a baseline, but it is a simplistic method and may not
41
+ capture complex underlying patterns.
42
+
43
+ Args:
44
+ offset (int, pandas.tseries.offsets.DateOffset): Number of steps to go back
45
+ in time to find the most recent equivalent date to the target period.
46
+ If `offset` is an integer, it represents the number of steps to go back
47
+ in time. For example, if the frequency of the time series is daily,
48
+ `offset = 7` means that the most recent data similar to the target
49
+ period is the value observed 7 days ago.
50
+ Pandas DateOffsets can also be used to move forward a given number of
51
+ valid dates. For example, Bday(2) can be used to move back two business
52
+ days. If the date does not start on a valid date, it is first moved to a
53
+ valid date. For example, if the date is a Saturday, it is moved to the
54
+ previous Friday. Then, the offset is applied. If the result is a non-valid
55
+ date, it is moved to the next valid date. For example, if the date
56
+ is a Sunday, it is moved to the next Monday.
57
+ For more information about offsets, see
58
+ https://pandas.pydata.org/docs/reference/offset_frequency.html.
59
+ n_offsets (int, optional): Number of equivalent dates (multiple of offset)
60
+ used in the prediction. Defaults to 1.
61
+ If `n_offsets` is greater than 1, the values at the equivalent dates are
62
+ aggregated using the `agg_func` function. For example, if the frequency
63
+ of the time series is daily, `offset = 7`, `n_offsets = 2` and
64
+ `agg_func = np.mean`, the predicted value will be the mean of the values
65
+ observed 7 and 14 days ago.
66
+ agg_func (Callable, optional): Function used to aggregate the values of the
67
+ equivalent dates when the number of equivalent dates (`n_offsets`) is
68
+ greater than 1. Defaults to np.mean.
69
+ binner_kwargs (dict, optional): Additional arguments to pass to the
70
+ `QuantileBinner` used to discretize the residuals into k bins according
71
+ to the predicted values associated with each residual. Available arguments
72
+ are: `n_bins`, `method`, `subsample`, `random_state` and `dtype`.
73
+ Argument `method` is passed internally to the function `numpy.percentile`.
74
+ Defaults to None.
75
+ forecaster_id (str, int, optional): Name used as an identifier of the
76
+ forecaster. Defaults to None.
77
+
78
+ Attributes:
79
+ offset (int, pandas.tseries.offsets.DateOffset): Number of steps to go back
80
+ in time to find the most recent equivalent date to the target period.
81
+ n_offsets (int): Number of equivalent dates (multiple of offset) used in
82
+ the prediction.
83
+ agg_func (Callable): Function used to aggregate the values of the equivalent
84
+ dates when the number of equivalent dates (`n_offsets`) is greater than 1.
85
+ window_size (int): Number of past values needed to include the last
86
+ equivalent dates according to the `offset` and `n_offsets`.
87
+ last_window_ (pandas Series): This window represents the most recent data
88
+ observed by the predictor during its training phase. It contains the
89
+ past values needed to include the last equivalent date according the
90
+ `offset` and `n_offsets`.
91
+ index_type_ (type): Type of index of the input used in training.
92
+ index_freq_ (str): Frequency of Index of the input used in training.
93
+ training_range_ (pandas Index): First and last values of index of the data
94
+ used during training.
95
+ series_name_in_ (str): Names of the series provided by the user during training.
96
+ in_sample_residuals_ (numpy ndarray): Residuals of the model when predicting
97
+ training data. Only stored up to 10_000 values.
98
+ in_sample_residuals_by_bin_ (dict): In sample residuals binned according to
99
+ the predicted value each residual is associated with. The number of
100
+ residuals stored per bin is limited to `10_000 // self.binner.n_bins_`
101
+ in the form `{bin: residuals}`.
102
+ out_sample_residuals_ (numpy ndarray): Residuals of the model when predicting
103
+ non-training data. Only stored up to 10_000 values. Use
104
+ `set_out_sample_residuals()` method to set values.
105
+ out_sample_residuals_by_bin_ (dict): Out of sample residuals binned
106
+ according to the predicted value each residual is associated with.
107
+ The number of residuals stored per bin is limited to
108
+ `10_000 // self.binner.n_bins_` in the form `{bin: residuals}`.
109
+ binner (spotforecast.preprocessing.QuantileBinner): `QuantileBinner` used to
110
+ discretize residuals into k bins according to the predicted values
111
+ associated with each residual.
112
+ binner_intervals_ (dict): Intervals used to discretize residuals into k bins
113
+ according to the predicted values associated with each residual.
114
+ binner_kwargs (dict): Additional arguments to pass to the `QuantileBinner`.
115
+ creation_date (str): Date of creation.
116
+ is_fitted (bool): Tag to identify if the estimator has been fitted (trained).
117
+ fit_date (str): Date of last fit.
118
+ spotforecast_version (str): Version of spotforecast library used to create
119
+ the forecaster.
120
+ python_version (str): Version of python used to create the forecaster.
121
+ forecaster_id (str, int): Name used as an identifier of the forecaster.
122
+
123
+ Examples:
124
+ >>> import pandas as pd
125
+ >>> import numpy as np
126
+ >>> from spotforecast2.forecaster.recursive import ForecasterEquivalentDate
127
+ >>> # Series with daily frequency
128
+ >>> data = pd.Series(
129
+ ... data = np.arange(14),
130
+ ... index = pd.date_range(start='2022-01-01', periods=14, freq='D')
131
+ ... )
132
+ >>> # Forecast based on the value 7 days ago
133
+ >>> forecaster = ForecasterEquivalentDate(offset=7)
134
+ >>> forecaster.fit(y=data)
135
+ >>> forecaster.predict(steps=3)
136
+ 2022-01-15 7
137
+ 2022-01-16 8
138
+ 2022-01-17 9
139
+ Freq: D, Name: pred, dtype: int64
140
+ """
141
+
142
+ def __init__(
143
+ self,
144
+ offset: int | pd.tseries.offsets.DateOffset,
145
+ n_offsets: int = 1,
146
+ agg_func: Callable = np.mean,
147
+ binner_kwargs: dict[str, object] | None = None,
148
+ forecaster_id: str | int | None = None,
149
+ ) -> None:
150
+
151
+ self.offset = offset
152
+ self.n_offsets = n_offsets
153
+ self.agg_func = agg_func
154
+ self.last_window_ = None
155
+ self.index_type_ = None
156
+ self.index_freq_ = None
157
+ self.training_range_ = None
158
+ self.series_name_in_ = None
159
+ self.in_sample_residuals_ = None
160
+ self.out_sample_residuals_ = None
161
+ self.in_sample_residuals_by_bin_ = None
162
+ self.out_sample_residuals_by_bin_ = None
163
+ self.creation_date = pd.Timestamp.today().strftime("%Y-%m-%d %H:%M:%S")
164
+ self.is_fitted = False
165
+ self.fit_date = None
166
+ self.spotforecast_version = __version__
167
+ self.python_version = sys.version.split(" ")[0]
168
+ self.forecaster_id = forecaster_id
169
+ self._probabilistic_mode = "binned"
170
+ self.estimator = None
171
+ self.differentiation = None
172
+ self.differentiation_max = None
173
+ self.window_size = None # Defaults to None, validated later
174
+
175
+ if not isinstance(self.offset, (int, pd.tseries.offsets.DateOffset)):
176
+ raise TypeError(
177
+ "`offset` must be an integer greater than 0 or a "
178
+ "pandas.tseries.offsets. Find more information about offsets in "
179
+ "https://pandas.pydata.org/docs/reference/offset_frequency.html"
180
+ )
181
+
182
+ if isinstance(self.offset, int):
183
+ self.window_size = self.offset * self.n_offsets
184
+
185
+ self.binner_kwargs = binner_kwargs
186
+ if binner_kwargs is None:
187
+ self.binner_kwargs = {
188
+ "n_bins": 10,
189
+ "method": "linear",
190
+ "subsample": 200000,
191
+ "random_state": 789654,
192
+ "dtype": np.float64,
193
+ }
194
+ self.binner = QuantileBinner(**self.binner_kwargs)
195
+ self.binner_intervals_ = None
196
+
197
+ self.__spotforecast_tags__ = {
198
+ "library": "spotforecast",
199
+ "forecaster_name": "ForecasterEquivalentDate",
200
+ "forecaster_task": "regression",
201
+ "forecasting_scope": "single-series", # single-series | global
202
+ "forecasting_strategy": "recursive", # recursive | direct | deep_learning
203
+ "index_types_supported": ["pandas.RangeIndex", "pandas.DatetimeIndex"],
204
+ "requires_index_frequency": True,
205
+ "allowed_input_types_series": ["pandas.Series"],
206
+ "supports_exog": False,
207
+ "allowed_input_types_exog": [],
208
+ "handles_missing_values_series": False,
209
+ "handles_missing_values_exog": False,
210
+ "supports_lags": False,
211
+ "supports_window_features": False,
212
+ "supports_transformer_series": False,
213
+ "supports_transformer_exog": False,
214
+ "supports_weight_func": False,
215
+ "supports_differentiation": False,
216
+ "prediction_types": ["point", "interval"],
217
+ "supports_probabilistic": True,
218
+ "probabilistic_methods": ["conformal"],
219
+ "handles_binned_residuals": True,
220
+ }
221
+
222
+ def __repr__(self) -> str:
223
+ """
224
+ Information displayed when a Forecaster object is printed.
225
+ """
226
+
227
+ info = (
228
+ f"{'=' * len(type(self).__name__)} \n"
229
+ f"{type(self).__name__} \n"
230
+ f"{'=' * len(type(self).__name__)} \n"
231
+ f"Offset: {self.offset} \n"
232
+ f"Number of offsets: {self.n_offsets} \n"
233
+ f"Aggregation function: {self.agg_func.__name__} \n"
234
+ f"Window size: {self.window_size} \n"
235
+ f"Series name: {self.series_name_in_} \n"
236
+ f"Training range: {self.training_range_.to_list() if self.is_fitted else None} \n"
237
+ f"Training index type: {str(self.index_type_).split('.')[-1][:-2] if self.is_fitted else None} \n"
238
+ f"Training index frequency: {self.index_freq_ if self.is_fitted else None} \n"
239
+ f"Creation date: {self.creation_date} \n"
240
+ f"Last fit date: {self.fit_date} \n"
241
+ f"spotforecast version: {self.spotforecast_version} \n"
242
+ f"Python version: {self.python_version} \n"
243
+ f"Forecaster id: {self.forecaster_id} \n"
244
+ )
245
+
246
+ return info
247
+
248
+ def _repr_html_(self) -> str:
249
+ """
250
+ HTML representation of the object.
251
+ The "General Information" section is expanded by default.
252
+ """
253
+
254
+ style, unique_id = get_style_repr_html(self.is_fitted)
255
+
256
+ content = f"""
257
+ <div class="container-{unique_id}">
258
+ <p style="font-size: 1.5em; font-weight: bold; margin-block-start: 0.83em; margin-block-end: 0.83em;">{type(self).__name__}</p>
259
+ <details open>
260
+ <summary>General Information</summary>
261
+ <ul>
262
+ <li><strong>Estimator:</strong> {type(self.estimator).__name__}</li>
263
+ <li><strong>Offset:</strong> {self.offset}</li>
264
+ <li><strong>Number of offsets:</strong> {self.n_offsets}</li>
265
+ <li><strong>Aggregation function:</strong> {self.agg_func.__name__}</li>
266
+ <li><strong>Window size:</strong> {self.window_size}</li>
267
+ <li><strong>Creation date:</strong> {self.creation_date}</li>
268
+ <li><strong>Last fit date:</strong> {self.fit_date}</li>
269
+ <li><strong>spotforecast version:</strong> {self.spotforecast_version}</li>
270
+ <li><strong>Python version:</strong> {self.python_version}</li>
271
+ <li><strong>Forecaster id:</strong> {self.forecaster_id}</li>
272
+ </ul>
273
+ </details>
274
+ <details>
275
+ <summary>Training Information</summary>
276
+ <ul>
277
+ <li><strong>Training range:</strong> {self.training_range_.to_list() if self.is_fitted else 'Not fitted'}</li>
278
+ <li><strong>Training index type:</strong> {str(self.index_type_).split('.')[-1][:-2] if self.is_fitted else 'Not fitted'}</li>
279
+ <li><strong>Training index frequency:</strong> {self.index_freq_ if self.is_fitted else 'Not fitted'}</li>
280
+ </ul>
281
+ </div>
282
+ """
283
+
284
+ return style + content
285
+
286
+ def fit(
287
+ self,
288
+ y: pd.Series,
289
+ store_in_sample_residuals: bool = False,
290
+ random_state: int = 123,
291
+ exog: Any = None,
292
+ ) -> None:
293
+ """
294
+ Training Forecaster.
295
+
296
+ Args:
297
+ y (pandas Series): Training time series.
298
+ store_in_sample_residuals (bool, optional): If `True`, in-sample
299
+ residuals will be stored in the forecaster object after fitting
300
+ (`in_sample_residuals_` and `in_sample_residuals_by_bin_` attributes).
301
+ If `False`, only the intervals of the bins are stored. Defaults to False.
302
+ random_state (int, optional): Set a seed for the random generator so
303
+ that the stored sample residuals are always deterministic. Defaults to 123.
304
+ exog (Ignored): Not used, present here for API consistency by convention.
305
+
306
+ Returns:
307
+ None
308
+ """
309
+
310
+ if not isinstance(y, pd.Series):
311
+ raise TypeError(
312
+ f"`y` must be a pandas Series with a DatetimeIndex or a RangeIndex. "
313
+ f"Found {type(y)}."
314
+ )
315
+
316
+ if isinstance(self.offset, pd.tseries.offsets.DateOffset):
317
+ if not isinstance(y.index, pd.DatetimeIndex):
318
+ raise TypeError(
319
+ "If `offset` is a pandas DateOffset, the index of `y` must be a "
320
+ "pandas DatetimeIndex with frequency."
321
+ )
322
+ elif y.index.freq is None:
323
+ try:
324
+ y.index.freq = pd.infer_freq(y.index)
325
+ except (ValueError, TypeError):
326
+ raise TypeError(
327
+ "If `offset` is a pandas DateOffset, the index of `y` must be a "
328
+ "pandas DatetimeIndex with frequency."
329
+ )
330
+ if y.index.freq is None:
331
+ raise TypeError(
332
+ "If `offset` is a pandas DateOffset, the index of `y` must be a "
333
+ "pandas DatetimeIndex with frequency."
334
+ )
335
+
336
+ # Reset values in case the forecaster has already been fitted.
337
+ self.last_window_ = None
338
+ self.index_type_ = None
339
+ self.index_freq_ = None
340
+ self.training_range_ = None
341
+ self.series_name_in_ = None
342
+ self.is_fitted = False
343
+
344
+ _, y_index = check_extract_values_and_index(
345
+ data=y, data_label="`y`", return_values=False
346
+ )
347
+
348
+ if isinstance(self.offset, pd.tseries.offsets.DateOffset):
349
+ # Calculate the window_size in steps for compatibility with the
350
+ # check_predict_input function. This is not a exact calculation
351
+ # because the offset follows the calendar rules and the distance
352
+ # between two dates may not be constant.
353
+ first_valid_index = y_index[-1] - self.offset * self.n_offsets
354
+
355
+ try:
356
+ window_size_idx_start = y_index.get_loc(first_valid_index)
357
+ window_size_idx_end = y_index.get_loc(y_index[-1])
358
+ self.window_size = window_size_idx_end - window_size_idx_start
359
+ except KeyError:
360
+ raise ValueError(
361
+ f"The length of `y` ({len(y)}), must be greater than or equal "
362
+ f"to the window size ({self.window_size}). This is because "
363
+ f"the offset ({self.offset}) is larger than the available "
364
+ f"data. Try to decrease the size of the offset ({self.offset}), "
365
+ f"the number of `n_offsets` ({self.n_offsets}) or increase the "
366
+ f"size of `y`."
367
+ )
368
+ else:
369
+ if len(y) <= self.window_size:
370
+ raise ValueError(
371
+ f"Length of `y` must be greater than the maximum window size "
372
+ f"needed by the forecaster. This is because "
373
+ f"the offset ({self.offset}) is larger than the available "
374
+ f"data. Try to decrease the size of the offset ({self.offset}), "
375
+ f"the number of `n_offsets` ({self.n_offsets}) or increase the "
376
+ f"size of `y`.\n"
377
+ f" Length `y`: {len(y)}.\n"
378
+ f" Max window size: {self.window_size}.\n"
379
+ )
380
+
381
+ self.is_fitted = True
382
+ self.series_name_in_ = y.name if y.name is not None else "y"
383
+ self.fit_date = pd.Timestamp.today().strftime("%Y-%m-%d %H:%M:%S")
384
+ self.training_range_ = y_index[[0, -1]]
385
+ self.index_type_ = type(y_index)
386
+ self.index_freq_ = (
387
+ y_index.freq if isinstance(y_index, pd.DatetimeIndex) else y_index.step
388
+ )
389
+
390
+ # NOTE: This is done to save time during fit in functions such as backtesting()
391
+ if self._probabilistic_mode is not False:
392
+ self._binning_in_sample_residuals(
393
+ y=y,
394
+ store_in_sample_residuals=store_in_sample_residuals,
395
+ random_state=random_state,
396
+ )
397
+
398
+ # The last time window of training data is stored so that equivalent
399
+ # dates are available when calling the `predict` method.
400
+ # Store the whole series to avoid errors when the offset is larger
401
+ # than the data available.
402
+ self.last_window_ = y.copy()
403
+
404
    def _binning_in_sample_residuals(
        self,
        y: pd.Series,
        store_in_sample_residuals: bool = False,
        random_state: int = 123,
    ) -> None:
        """
        Bin residuals according to the predicted value each residual is
        associated with. First a `spotforecast.preprocessing.QuantileBinner` object
        is fitted to the predicted values. Then, residuals are binned according
        to the predicted value each residual is associated with. Residuals are
        stored in the forecaster object as `in_sample_residuals_` and
        `in_sample_residuals_by_bin_`.

        The number of residuals stored per bin is limited to
        `10_000 // self.binner.n_bins_`. The total number of residuals stored is
        `10_000`.
        **New in version 0.17.0**

        Args:
            y (pandas Series): Training time series.
            store_in_sample_residuals (bool, optional): If `True`, in-sample
                residuals will be stored in the forecaster object after fitting
                (`in_sample_residuals_` and `in_sample_residuals_by_bin_` attributes).
                If `False`, only the intervals of the bins are stored. Defaults to False.
            random_state (int, optional): Set a seed for the random generator so
                that the stored sample residuals are always deterministic. Defaults to 123.

        Returns:
            None
        """

        if isinstance(self.offset, pd.tseries.offsets.DateOffset):
            # Calendar offsets: for each multiple of the offset, shift the
            # index back and take the observed values as "predictions" for
            # the dates whose equivalent date lies inside the series.
            y_preds = []
            for n_off in range(1, self.n_offsets + 1):
                idx = y.index - self.offset * n_off
                # Keep only target dates whose equivalent date exists in `y`.
                mask = idx >= y.index[0]
                y_pred = y.loc[idx[mask]]
                # Re-align the shifted values to the tail of the original
                # index so they can be stacked column-wise below.
                # NOTE(review): assumes every masked equivalent date is
                # present in `y.index` — TODO confirm for irregular calendars.
                y_pred.index = y.index[-mask.sum() :]
                y_preds.append(y_pred)

            # One column per offset multiple, aligned on the target dates.
            y_preds = pd.concat(y_preds, axis=1).to_numpy()
            y_true = y.to_numpy()[-len(y_preds) :]

        else:
            # Integer offsets: shifting by `offset * n_off` gives, for each
            # position, the value observed that many steps earlier. The first
            # `window_size` positions have no full history and are dropped.
            y_preds = [
                y.shift(self.offset * n_off)[self.window_size :]
                for n_off in range(1, self.n_offsets + 1)
            ]
            y_preds = np.column_stack(y_preds)
            y_true = y.to_numpy()[self.window_size :]

        # Aggregate the equivalent-date values row-wise (one row per target
        # date) to obtain the in-sample prediction for that date.
        y_pred = np.apply_along_axis(self.agg_func, axis=1, arr=y_preds)

        residuals = y_true - y_pred

        if self._probabilistic_mode == "binned":
            # Drop rows where either the prediction or the residual is NaN
            # before fitting the binner.
            data = pd.DataFrame({"prediction": y_pred, "residuals": residuals}).dropna()
            y_pred = data["prediction"].to_numpy()
            residuals = data["residuals"].to_numpy()

            # Bin boundaries are learned from the in-sample predictions.
            self.binner.fit(y_pred)
            self.binner_intervals_ = self.binner.intervals_

        if store_in_sample_residuals:
            rng = np.random.default_rng(seed=random_state)
            if self._probabilistic_mode == "binned":
                # Assign each residual to the bin of its prediction.
                data["bin"] = self.binner.transform(y_pred).astype(int)
                self.in_sample_residuals_by_bin_ = (
                    data.groupby("bin")["residuals"].apply(np.array).to_dict()
                )

                # Cap the residuals stored per bin so the total stays ~10_000.
                max_sample = 10_000 // self.binner.n_bins_
                for k, v in self.in_sample_residuals_by_bin_.items():
                    if len(v) > max_sample:
                        # Sampling with replacement via random integer indices.
                        sample = v[rng.integers(low=0, high=len(v), size=max_sample)]
                        self.in_sample_residuals_by_bin_[k] = sample

                # Ensure every bin key exists, even if no residual fell in it.
                for k in self.binner_intervals_.keys():
                    if k not in self.in_sample_residuals_by_bin_:
                        self.in_sample_residuals_by_bin_[k] = np.array([])

                # Empty bins are back-filled with a random draw from the
                # pooled residuals so interval prediction never hits an
                # empty bin.
                empty_bins = [
                    k
                    for k, v in self.in_sample_residuals_by_bin_.items()
                    if v.size == 0
                ]
                if empty_bins:
                    empty_bin_size = min(max_sample, len(residuals))
                    for k in empty_bins:
                        self.in_sample_residuals_by_bin_[k] = rng.choice(
                            a=residuals, size=empty_bin_size, replace=False
                        )

            # Cap the global residual store at 10_000 values (sampled with
            # replacement via random integer indices).
            if len(residuals) > 10_000:
                residuals = residuals[
                    rng.integers(low=0, high=len(residuals), size=10_000)
                ]

            self.in_sample_residuals_ = residuals
504
+
505
    def predict(
        self,
        steps: int,
        last_window: pd.Series | None = None,
        check_inputs: bool = True,
        exog: Any = None,
    ) -> pd.Series:
        """
        Predict n steps ahead.

        Args:
            steps (int): Number of steps to predict.
            last_window (pandas Series, optional): Past values needed to select the
                last equivalent dates according to the offset. If `last_window = None`,
                the values stored in `self.last_window_` are used and the predictions
                start immediately after the training data. Defaults to None.
            check_inputs (bool, optional): If `True`, the input is checked for
                possible warnings and errors with the `check_predict_input` function.
                This argument is created for internal use and is not recommended to
                be changed. Defaults to True.
            exog (Ignored): Not used, present here for API consistency by convention.

        Returns:
            pd.Series: Predicted values.
        """

        if last_window is None:
            # Default to the window stored during fit(); predictions then
            # start immediately after the training data.
            last_window = self.last_window_

        if check_inputs:
            check_predict_input(
                forecaster_name=type(self).__name__,
                steps=steps,
                is_fitted=self.is_fitted,
                exog_in_=False,
                index_type_=self.index_type_,
                index_freq_=self.index_freq_,
                window_size=self.window_size,
                last_window=last_window,
            )

        # Future index of length `steps` continuing `last_window`'s index.
        prediction_index = expand_index(index=last_window.index, steps=steps)

        if isinstance(self.offset, int):

            last_window_values = last_window.to_numpy(copy=True).ravel()
            # Negative positions of the equivalent dates within the window,
            # tiled so predictions beyond one offset cycle reuse the same
            # pattern (e.g. offset=7 -> [-7..-1, -7..-1, ...][:steps]).
            equivalent_indexes = np.tile(
                np.arange(-self.offset, 0), int(np.ceil(steps / self.offset))
            )
            equivalent_indexes = equivalent_indexes[:steps]

            if self.n_offsets == 1:
                equivalent_values = last_window_values[equivalent_indexes]
                predictions = equivalent_values.ravel()

            if self.n_offsets > 1:
                # Stack one row of indexes per offset multiple, then
                # aggregate column-wise across the n_offsets values.
                equivalent_indexes = [
                    equivalent_indexes - n * self.offset
                    for n in np.arange(self.n_offsets)
                ]
                equivalent_indexes = np.vstack(equivalent_indexes)
                equivalent_values = last_window_values[equivalent_indexes]
                predictions = np.apply_along_axis(
                    self.agg_func, axis=0, arr=equivalent_values
                )

            predictions = pd.Series(
                data=predictions, index=prediction_index, name="pred"
            )

        if isinstance(self.offset, pd.tseries.offsets.DateOffset):

            last_window = last_window.copy()
            max_allowed_date = last_window.index[-1]

            # For every date in prediction_index, calculate the n offsets.
            # Dates beyond `max_allowed_date` (i.e. still in the forecast
            # horizon) are skipped and the search keeps stepping back until
            # `n_offsets` observed equivalent dates are collected.
            offset_dates = []
            for date in prediction_index:
                selected_offsets = []
                while len(selected_offsets) < self.n_offsets:
                    offset_date = date - self.offset
                    if offset_date <= max_allowed_date:
                        selected_offsets.append(offset_date)
                    date = offset_date
                offset_dates.append(selected_offsets)

            offset_dates = np.array(offset_dates)

            # Select the values of the time series corresponding to the each
            # offset date. If the offset date is not in the time series, the
            # value is set to NaN (reindex semantics).
            equivalent_values = (
                last_window.reindex(offset_dates.ravel())
                .to_numpy()
                .reshape(-1, self.n_offsets)
            )
            equivalent_values = pd.DataFrame(
                data=equivalent_values,
                index=prediction_index,
                columns=[f"offset_{i}" for i in range(self.n_offsets)],
            )

            # Error if all values are missing
            if equivalent_values.isnull().all().all():
                raise ValueError(
                    f"All equivalent values are missing. This is caused by using "
                    f"an offset ({self.offset}) larger than the available data. "
                    f"Try to decrease the size of the offset ({self.offset}), "
                    f"the number of `n_offsets` ({self.n_offsets}) or increase the "
                    f"size of `last_window`. In backtesting, this error may be "
                    f"caused by using an `initial_train_size` too small."
                )

            # Warning if equivalent values are missing for some steps: those
            # steps are aggregated over fewer than `n_offsets` values.
            incomplete_offsets = equivalent_values.isnull().any(axis=1)
            incomplete_offsets = incomplete_offsets[incomplete_offsets].index
            if not incomplete_offsets.empty:
                warnings.warn(
                    f"Steps: {incomplete_offsets.strftime('%Y-%m-%d').to_list()} "
                    f"are calculated with less than {self.n_offsets} `n_offsets`. "
                    f"To avoid this, increase the `last_window` size or decrease "
                    f"the number of `n_offsets`. The current configuration requires "
                    f"a total offset of {self.offset * self.n_offsets}.",
                    MissingValuesWarning,
                )

            # Row-wise aggregation over the n_offsets equivalent values.
            aggregate_values = equivalent_values.apply(self.agg_func, axis=1)
            predictions = aggregate_values.rename("pred")

        return predictions
635
+
636
+ def predict_interval(
637
+ self,
638
+ steps: int,
639
+ last_window: pd.Series | None = None,
640
+ method: str = "conformal",
641
+ interval: float | list[float] | tuple[float] = [5, 95],
642
+ use_in_sample_residuals: bool = True,
643
+ use_binned_residuals: bool = True,
644
+ random_state: Any = None,
645
+ exog: Any = None,
646
+ n_boot: Any = None,
647
+ ) -> pd.DataFrame:
648
+ """
649
+ Predict n steps ahead and estimate prediction intervals using conformal
650
+ prediction method. Refer to the References section for additional
651
+ details on this method.
652
+
653
+ Args:
654
+ steps (int): Number of steps to predict.
655
+ last_window (pandas Series, optional): Past values needed to select the
656
+ last equivalent dates according to the offset. If `last_window = None`,
657
+ the values stored in `self.last_window_` are used and the predictions
658
+ start immediately after the training data. Defaults to None.
659
+ method (str, optional): Technique used to estimate prediction intervals.
660
+ Available options:
661
+ - 'conformal': Employs the conformal prediction split method for
662
+ interval estimation [1]_. Defaults to 'conformal'.
663
+ interval (float, list, tuple, optional): Confidence level of the
664
+ prediction interval. Interpretation depends on the method used:
665
+ - If `float`, represents the nominal (expected) coverage (between 0
666
+ and 1). For instance, `interval=0.95` corresponds to `[2.5, 97.5]`
667
+ percentiles.
668
+ - If `list` or `tuple`, defines the exact percentiles to compute,
669
+ which must be between 0 and 100 inclusive. For example, interval
670
+ of 95% should be as `interval = [2.5, 97.5]`.
671
+ - When using `method='conformal'`, the interval must be a float or
672
+ a list/tuple defining a symmetric interval. Defaults to [5, 95].
673
+ use_in_sample_residuals (bool, optional): If `True`, residuals from the
674
+ training data are used as proxy of prediction error to create predictions.
675
+ If `False`, out of sample residuals (calibration) are used.
676
+ Out-of-sample residuals must be precomputed using Forecaster's
677
+ `set_out_sample_residuals()` method. Defaults to True.
678
+ use_binned_residuals (bool, optional): If `True`, residuals are selected
679
+ based on the predicted values (binned selection).
680
+ If `False`, residuals are selected randomly. Defaults to True.
681
+ random_state (Ignored): Not used, present here for API consistency by convention.
682
+ exog (Ignored): Not used, present here for API consistency by convention.
683
+ n_boot (Ignored): Not used, present here for API consistency by convention.
684
+
685
+ Returns:
686
+ pd.DataFrame: Values predicted by the forecaster and their estimated interval.
687
+ - pred: predictions.
688
+ - lower_bound: lower bound of the interval.
689
+ - upper_bound: upper bound of the interval.
690
+
691
+ References:
692
+ .. [1] MAPIE - Model Agnostic Prediction Interval Estimator.
693
+ https://mapie.readthedocs.io/en/stable/theoretical_description_regression.html#the-split-method
694
+ """
695
+
696
+ if method != "conformal":
697
+ raise ValueError(
698
+ f"Method '{method}' is not supported. Only 'conformal' is available."
699
+ )
700
+
701
+ if last_window is None:
702
+ last_window = self.last_window_
703
+
704
+ check_predict_input(
705
+ forecaster_name=type(self).__name__,
706
+ steps=steps,
707
+ is_fitted=self.is_fitted,
708
+ exog_in_=False,
709
+ index_type_=self.index_type_,
710
+ index_freq_=self.index_freq_,
711
+ window_size=self.window_size,
712
+ last_window=last_window,
713
+ )
714
+
715
+ check_residuals_input(
716
+ forecaster_name=type(self).__name__,
717
+ use_in_sample_residuals=use_in_sample_residuals,
718
+ in_sample_residuals_=self.in_sample_residuals_,
719
+ out_sample_residuals_=self.out_sample_residuals_,
720
+ use_binned_residuals=use_binned_residuals,
721
+ in_sample_residuals_by_bin_=self.in_sample_residuals_by_bin_,
722
+ out_sample_residuals_by_bin_=self.out_sample_residuals_by_bin_,
723
+ )
724
+
725
+ if isinstance(interval, (list, tuple)):
726
+ check_interval(interval=interval, ensure_symmetric_intervals=True)
727
+ nominal_coverage = (interval[1] - interval[0]) / 100
728
+ else:
729
+ check_interval(alpha=interval, alpha_literal="interval")
730
+ nominal_coverage = interval
731
+
732
+ if use_in_sample_residuals:
733
+ residuals = self.in_sample_residuals_
734
+ residuals_by_bin = self.in_sample_residuals_by_bin_
735
+ else:
736
+ residuals = self.out_sample_residuals_
737
+ residuals_by_bin = self.out_sample_residuals_by_bin_
738
+
739
+ prediction_index = expand_index(index=last_window.index, steps=steps)
740
+
741
+ if isinstance(self.offset, int):
742
+
743
+ last_window_values = last_window.to_numpy(copy=True).ravel()
744
+ equivalent_indexes = np.tile(
745
+ np.arange(-self.offset, 0), int(np.ceil(steps / self.offset))
746
+ )
747
+ equivalent_indexes = equivalent_indexes[:steps]
748
+
749
+ if self.n_offsets == 1:
750
+ equivalent_values = last_window_values[equivalent_indexes]
751
+ predictions = equivalent_values.ravel()
752
+
753
+ if self.n_offsets > 1:
754
+ equivalent_indexes = [
755
+ equivalent_indexes - n * self.offset
756
+ for n in np.arange(self.n_offsets)
757
+ ]
758
+ equivalent_indexes = np.vstack(equivalent_indexes)
759
+ equivalent_values = last_window_values[equivalent_indexes]
760
+ predictions = np.apply_along_axis(
761
+ self.agg_func, axis=0, arr=equivalent_values
762
+ )
763
+
764
+ if isinstance(self.offset, pd.tseries.offsets.DateOffset):
765
+
766
+ last_window = last_window.copy()
767
+ max_allowed_date = last_window.index[-1]
768
+
769
+ # For every date in prediction_index, calculate the n offsets
770
+ offset_dates = []
771
+ for date in prediction_index:
772
+ selected_offsets = []
773
+ while len(selected_offsets) < self.n_offsets:
774
+ offset_date = date - self.offset
775
+ if offset_date <= max_allowed_date:
776
+ selected_offsets.append(offset_date)
777
+ date = offset_date
778
+ offset_dates.append(selected_offsets)
779
+
780
+ offset_dates = np.array(offset_dates)
781
+
782
+ # Select the values of the time series corresponding to the each
783
+ # offset date. If the offset date is not in the time series, the
784
+ # value is set to NaN.
785
+ equivalent_values = (
786
+ last_window.reindex(offset_dates.ravel())
787
+ .to_numpy()
788
+ .reshape(-1, self.n_offsets)
789
+ )
790
+ equivalent_values = pd.DataFrame(
791
+ data=equivalent_values,
792
+ index=prediction_index,
793
+ columns=[f"offset_{i}" for i in range(self.n_offsets)],
794
+ )
795
+
796
+ # Error if all values are missing
797
+ if equivalent_values.isnull().all().all():
798
+ raise ValueError(
799
+ f"All equivalent values are missing. This is caused by using "
800
+ f"an offset ({self.offset}) larger than the available data. "
801
+ f"Try to decrease the size of the offset ({self.offset}), "
802
+ f"the number of `n_offsets` ({self.n_offsets}) or increase the "
803
+ f"size of `last_window`. In backtesting, this error may be "
804
+ f"caused by using an `initial_train_size` too small."
805
+ )
806
+
807
+ # Warning if equivalent values are missing
808
+ incomplete_offsets = equivalent_values.isnull().any(axis=1)
809
+ incomplete_offsets = incomplete_offsets[incomplete_offsets].index
810
+ if not incomplete_offsets.empty:
811
+ warnings.warn(
812
+ f"Steps: {incomplete_offsets.strftime('%Y-%m-%d').to_list()} "
813
+ f"are calculated with less than {self.n_offsets} `n_offsets`. "
814
+ f"To avoid this, increase the `last_window` size or decrease "
815
+ f"the number of `n_offsets`. The current configuration requires "
816
+ f"a total offset of {self.offset * self.n_offsets}.",
817
+ MissingValuesWarning,
818
+ )
819
+
820
+ aggregate_values = equivalent_values.apply(self.agg_func, axis=1)
821
+ predictions = aggregate_values.to_numpy()
822
+
823
+ if use_binned_residuals:
824
+ correction_factor_by_bin = {
825
+ k: np.quantile(np.abs(v), nominal_coverage)
826
+ for k, v in residuals_by_bin.items()
827
+ }
828
+ replace_func = np.vectorize(lambda x: correction_factor_by_bin[x])
829
+ predictions_bin = self.binner.transform(predictions)
830
+ correction_factor = replace_func(predictions_bin)
831
+ else:
832
+ correction_factor = np.quantile(np.abs(residuals), nominal_coverage)
833
+
834
+ lower_bound = predictions - correction_factor
835
+ upper_bound = predictions + correction_factor
836
+ predictions = np.column_stack([predictions, lower_bound, upper_bound])
837
+
838
+ predictions = pd.DataFrame(
839
+ data=predictions,
840
+ index=prediction_index,
841
+ columns=["pred", "lower_bound", "upper_bound"],
842
+ )
843
+
844
+ return predictions
845
+
846
+ def set_in_sample_residuals(
847
+ self, y: pd.Series, random_state: int = 123, exog: Any = None
848
+ ) -> None:
849
+ """
850
+ Set in-sample residuals in case they were not calculated during the
851
+ training process.
852
+
853
+ In-sample residuals are calculated as the difference between the true
854
+ values and the predictions made by the forecaster using the training
855
+ data. The following internal attributes are updated:
856
+
857
+ + `in_sample_residuals_`: residuals stored in a numpy ndarray.
858
+ + `binner_intervals_`: intervals used to bin the residuals are calculated
859
+ using the quantiles of the predicted values.
860
+ + `in_sample_residuals_by_bin_`: residuals are binned according to the
861
+ predicted value they are associated with and stored in a dictionary, where
862
+ the keys are the intervals of the predicted values and the values are
863
+ the residuals associated with that range.
864
+
865
+ A total of 10_000 residuals are stored in the attribute `in_sample_residuals_`.
866
+ If the number of residuals is greater than 10_000, a random sample of
867
+ 10_000 residuals is stored. The number of residuals stored per bin is
868
+ limited to `10_000 // self.binner.n_bins_`.
869
+
870
+ Parameters
871
+ ----------
872
+ y : pandas Series
873
+ Training time series.
874
+ random_state : int, default 123
875
+ Sets a seed to the random sampling for reproducible output.
876
+ exog : Ignored
877
+ Not used, present here for API consistency by convention.
878
+
879
+ Returns
880
+ -------
881
+ None
882
+
883
+ """
884
+
885
+ if not self.is_fitted:
886
+ raise NotFittedError(
887
+ "This forecaster is not fitted yet. Call `fit` with appropriate "
888
+ "arguments before using `set_in_sample_residuals()`."
889
+ )
890
+
891
+ check_y(y=y)
892
+ y_index_range = check_extract_values_and_index(
893
+ data=y, data_label="`y`", return_values=False
894
+ )[1][[0, -1]]
895
+ if not y_index_range.equals(self.training_range_):
896
+ raise IndexError(
897
+ f"The index range of `y` does not match the range "
898
+ f"used during training. Please ensure the index is aligned "
899
+ f"with the training data.\n"
900
+ f" Expected : {self.training_range_}\n"
901
+ f" Received : {y_index_range}"
902
+ )
903
+
904
+ self._binning_in_sample_residuals(
905
+ y=y, store_in_sample_residuals=True, random_state=random_state
906
+ )
907
+
908
+ def set_out_sample_residuals(
909
+ self,
910
+ y_true: np.ndarray | pd.Series,
911
+ y_pred: np.ndarray | pd.Series,
912
+ append: bool = False,
913
+ random_state: int = 123,
914
+ ) -> None:
915
+ """
916
+ Set new values to the attribute `out_sample_residuals_`. Out of sample
917
+ residuals are meant to be calculated using observations that did not
918
+ participate in the training process. Two internal attributes are updated:
919
+
920
+ + `out_sample_residuals_`: residuals stored in a numpy ndarray.
921
+ + `out_sample_residuals_by_bin_`: residuals are binned according to the
922
+ predicted value they are associated with and stored in a dictionary, where
923
+ the keys are the intervals of the predicted values and the values are
924
+ the residuals associated with that range. If a bin binning is empty, it
925
+ is filled with a random sample of residuals from other bins. This is done
926
+ to ensure that all bins have at least one residual and can be used in the
927
+ prediction process.
928
+
929
+ A total of 10_000 residuals are stored in the attribute `out_sample_residuals_`.
930
+ If the number of residuals is greater than 10_000, a random sample of
931
+ 10_000 residuals is stored. The number of residuals stored per bin is
932
+ limited to `10_000 // self.binner.n_bins_`.
933
+
934
+ Parameters
935
+ ----------
936
+ y_true : numpy ndarray, pandas Series
937
+ True values of the time series from which the residuals have been
938
+ calculated.
939
+ y_pred : numpy ndarray, pandas Series
940
+ Predicted values of the time series.
941
+ append : bool, default False
942
+ If `True`, new residuals are added to the once already stored in the
943
+ forecaster. If after appending the new residuals, the limit of
944
+ `10_000 // self.binner.n_bins_` values per bin is reached, a random
945
+ sample of residuals is stored.
946
+ random_state : int, default 123
947
+ Sets a seed to the random sampling for reproducible output.
948
+
949
+ Returns
950
+ -------
951
+ None
952
+
953
+ """
954
+
955
+ if not self.is_fitted:
956
+ raise NotFittedError(
957
+ "This forecaster is not fitted yet. Call `fit` with appropriate "
958
+ "arguments before using `set_out_sample_residuals()`."
959
+ )
960
+
961
+ if not isinstance(y_true, (np.ndarray, pd.Series)):
962
+ raise TypeError(
963
+ f"`y_true` argument must be `numpy ndarray` or `pandas Series`. "
964
+ f"Got {type(y_true)}."
965
+ )
966
+
967
+ if not isinstance(y_pred, (np.ndarray, pd.Series)):
968
+ raise TypeError(
969
+ f"`y_pred` argument must be `numpy ndarray` or `pandas Series`. "
970
+ f"Got {type(y_pred)}."
971
+ )
972
+
973
+ if len(y_true) != len(y_pred):
974
+ raise ValueError(
975
+ f"`y_true` and `y_pred` must have the same length. "
976
+ f"Got {len(y_true)} and {len(y_pred)}."
977
+ )
978
+
979
+ if isinstance(y_true, pd.Series) and isinstance(y_pred, pd.Series):
980
+ if not y_true.index.equals(y_pred.index):
981
+ raise ValueError("`y_true` and `y_pred` must have the same index.")
982
+
983
+ if not isinstance(y_pred, np.ndarray):
984
+ y_pred = y_pred.to_numpy()
985
+ if not isinstance(y_true, np.ndarray):
986
+ y_true = y_true.to_numpy()
987
+
988
+ data = pd.DataFrame(
989
+ {"prediction": y_pred, "residuals": y_true - y_pred}
990
+ ).dropna()
991
+ y_pred = data["prediction"].to_numpy()
992
+ residuals = data["residuals"].to_numpy()
993
+
994
+ data["bin"] = self.binner.transform(y_pred).astype(int)
995
+ residuals_by_bin = data.groupby("bin")["residuals"].apply(np.array).to_dict()
996
+
997
+ out_sample_residuals = (
998
+ np.array([])
999
+ if self.out_sample_residuals_ is None
1000
+ else self.out_sample_residuals_
1001
+ )
1002
+ out_sample_residuals_by_bin = (
1003
+ {}
1004
+ if self.out_sample_residuals_by_bin_ is None
1005
+ else self.out_sample_residuals_by_bin_
1006
+ )
1007
+ if append:
1008
+ out_sample_residuals = np.concatenate([out_sample_residuals, residuals])
1009
+ for k, v in residuals_by_bin.items():
1010
+ if k in out_sample_residuals_by_bin:
1011
+ out_sample_residuals_by_bin[k] = np.concatenate(
1012
+ (out_sample_residuals_by_bin[k], v)
1013
+ )
1014
+ else:
1015
+ out_sample_residuals_by_bin[k] = v
1016
+ else:
1017
+ out_sample_residuals = residuals
1018
+ out_sample_residuals_by_bin = residuals_by_bin
1019
+
1020
+ max_samples = 10_000 // self.binner.n_bins_
1021
+ rng = np.random.default_rng(seed=random_state)
1022
+ for k, v in out_sample_residuals_by_bin.items():
1023
+ if len(v) > max_samples:
1024
+ sample = rng.choice(a=v, size=max_samples, replace=False)
1025
+ out_sample_residuals_by_bin[k] = sample
1026
+
1027
+ bin_keys = (
1028
+ [] if self.binner_intervals_ is None else self.binner_intervals_.keys()
1029
+ )
1030
+ for k in bin_keys:
1031
+ if k not in out_sample_residuals_by_bin:
1032
+ out_sample_residuals_by_bin[k] = np.array([])
1033
+
1034
+ empty_bins = [k for k, v in out_sample_residuals_by_bin.items() if v.size == 0]
1035
+ if empty_bins:
1036
+ warnings.warn(
1037
+ f"The following bins have no out of sample residuals: {empty_bins}. "
1038
+ f"No predicted values fall in the interval "
1039
+ f"{[self.binner_intervals_[bin] for bin in empty_bins]}. "
1040
+ f"Empty bins will be filled with a random sample of residuals.",
1041
+ ResidualsUsageWarning,
1042
+ )
1043
+ empty_bin_size = min(max_samples, len(out_sample_residuals))
1044
+ for k in empty_bins:
1045
+ out_sample_residuals_by_bin[k] = rng.choice(
1046
+ a=out_sample_residuals, size=empty_bin_size, replace=False
1047
+ )
1048
+
1049
+ if len(out_sample_residuals) > 10_000:
1050
+ out_sample_residuals = rng.choice(
1051
+ a=out_sample_residuals, size=10_000, replace=False
1052
+ )
1053
+
1054
+ self.out_sample_residuals_ = out_sample_residuals
1055
+ self.out_sample_residuals_by_bin_ = out_sample_residuals_by_bin
1056
+
1057
+ def get_tags(self) -> dict[str, Any]:
1058
+ """
1059
+ Return the tags that characterize the behavior of the forecaster.
1060
+
1061
+ Returns:
1062
+ dict: Dictionary with forecaster tags.
1063
+ """
1064
+
1065
+ return self.__spotforecast_tags__
1066
+
1067
+ def summary(self) -> None:
1068
+ """
1069
+ Show forecaster information.
1070
+
1071
+ Returns:
1072
+ None
1073
+ """
1074
+
1075
+ print(self)