spotforecast2 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spotforecast2/.DS_Store +0 -0
- spotforecast2/__init__.py +2 -0
- spotforecast2/data/__init__.py +0 -0
- spotforecast2/data/data.py +130 -0
- spotforecast2/data/fetch_data.py +209 -0
- spotforecast2/exceptions.py +681 -0
- spotforecast2/forecaster/.DS_Store +0 -0
- spotforecast2/forecaster/__init__.py +7 -0
- spotforecast2/forecaster/base.py +448 -0
- spotforecast2/forecaster/metrics.py +527 -0
- spotforecast2/forecaster/recursive/__init__.py +4 -0
- spotforecast2/forecaster/recursive/_forecaster_equivalent_date.py +1075 -0
- spotforecast2/forecaster/recursive/_forecaster_recursive.py +939 -0
- spotforecast2/forecaster/recursive/_warnings.py +15 -0
- spotforecast2/forecaster/utils.py +954 -0
- spotforecast2/model_selection/__init__.py +5 -0
- spotforecast2/model_selection/bayesian_search.py +453 -0
- spotforecast2/model_selection/grid_search.py +314 -0
- spotforecast2/model_selection/random_search.py +151 -0
- spotforecast2/model_selection/split_base.py +357 -0
- spotforecast2/model_selection/split_one_step.py +245 -0
- spotforecast2/model_selection/split_ts_cv.py +634 -0
- spotforecast2/model_selection/utils_common.py +718 -0
- spotforecast2/model_selection/utils_metrics.py +103 -0
- spotforecast2/model_selection/validation.py +685 -0
- spotforecast2/preprocessing/__init__.py +30 -0
- spotforecast2/preprocessing/_binner.py +378 -0
- spotforecast2/preprocessing/_common.py +123 -0
- spotforecast2/preprocessing/_differentiator.py +123 -0
- spotforecast2/preprocessing/_rolling.py +136 -0
- spotforecast2/preprocessing/curate_data.py +254 -0
- spotforecast2/preprocessing/imputation.py +92 -0
- spotforecast2/preprocessing/outlier.py +114 -0
- spotforecast2/preprocessing/split.py +139 -0
- spotforecast2/py.typed +0 -0
- spotforecast2/utils/__init__.py +43 -0
- spotforecast2/utils/convert_to_utc.py +44 -0
- spotforecast2/utils/data_transform.py +208 -0
- spotforecast2/utils/forecaster_config.py +344 -0
- spotforecast2/utils/generate_holiday.py +70 -0
- spotforecast2/utils/validation.py +569 -0
- spotforecast2/weather/__init__.py +0 -0
- spotforecast2/weather/weather_client.py +288 -0
- spotforecast2-0.0.1.dist-info/METADATA +47 -0
- spotforecast2-0.0.1.dist-info/RECORD +46 -0
- spotforecast2-0.0.1.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,1075 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Callable, Any
|
|
3
|
+
import warnings
|
|
4
|
+
import sys
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from sklearn.exceptions import NotFittedError
|
|
8
|
+
|
|
9
|
+
from spotforecast2.exceptions import MissingValuesWarning
|
|
10
|
+
from spotforecast2.preprocessing import QuantileBinner
|
|
11
|
+
|
|
12
|
+
# from spotforecast2._version import __version__ # Skipping version for now or mock it
|
|
13
|
+
from spotforecast2.forecaster.utils import (
|
|
14
|
+
check_extract_values_and_index,
|
|
15
|
+
get_style_repr_html,
|
|
16
|
+
check_residuals_input,
|
|
17
|
+
)
|
|
18
|
+
from spotforecast2.utils import (
|
|
19
|
+
check_y,
|
|
20
|
+
check_interval,
|
|
21
|
+
expand_index,
|
|
22
|
+
check_predict_input,
|
|
23
|
+
)
|
|
24
|
+
from ._warnings import ResidualsUsageWarning
|
|
25
|
+
|
|
26
|
+
# Fall back to a hard-coded version when the package does not ship a
# generated `_version` module (e.g. when running from a source checkout).
try:
    from spotforecast2._version import __version__
except ImportError:
    __version__ = "0.0.1"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ForecasterEquivalentDate:
|
|
34
|
+
"""
|
|
35
|
+
This forecaster predicts future values based on the most recent equivalent
|
|
36
|
+
date. It also allows to aggregate multiple past values of the equivalent
|
|
37
|
+
date using a function (e.g. mean, median, max, min, etc.). The equivalent
|
|
38
|
+
date is calculated by moving back in time a specified number of steps (offset).
|
|
39
|
+
The offset can be defined as an integer or as a pandas DateOffset. This
|
|
40
|
+
approach is useful as a baseline, but it is a simplistic method and may not
|
|
41
|
+
capture complex underlying patterns.
|
|
42
|
+
|
|
43
|
+
Args:
|
|
44
|
+
offset (int, pandas.tseries.offsets.DateOffset): Number of steps to go back
|
|
45
|
+
in time to find the most recent equivalent date to the target period.
|
|
46
|
+
If `offset` is an integer, it represents the number of steps to go back
|
|
47
|
+
in time. For example, if the frequency of the time series is daily,
|
|
48
|
+
`offset = 7` means that the most recent data similar to the target
|
|
49
|
+
period is the value observed 7 days ago.
|
|
50
|
+
Pandas DateOffsets can also be used to move forward a given number of
|
|
51
|
+
valid dates. For example, Bday(2) can be used to move back two business
|
|
52
|
+
days. If the date does not start on a valid date, it is first moved to a
|
|
53
|
+
valid date. For example, if the date is a Saturday, it is moved to the
|
|
54
|
+
previous Friday. Then, the offset is applied. If the result is a non-valid
|
|
55
|
+
date, it is moved to the next valid date. For example, if the date
|
|
56
|
+
is a Sunday, it is moved to the next Monday.
|
|
57
|
+
For more information about offsets, see
|
|
58
|
+
https://pandas.pydata.org/docs/reference/offset_frequency.html.
|
|
59
|
+
n_offsets (int, optional): Number of equivalent dates (multiple of offset)
|
|
60
|
+
used in the prediction. Defaults to 1.
|
|
61
|
+
If `n_offsets` is greater than 1, the values at the equivalent dates are
|
|
62
|
+
aggregated using the `agg_func` function. For example, if the frequency
|
|
63
|
+
of the time series is daily, `offset = 7`, `n_offsets = 2` and
|
|
64
|
+
`agg_func = np.mean`, the predicted value will be the mean of the values
|
|
65
|
+
observed 7 and 14 days ago.
|
|
66
|
+
agg_func (Callable, optional): Function used to aggregate the values of the
|
|
67
|
+
equivalent dates when the number of equivalent dates (`n_offsets`) is
|
|
68
|
+
greater than 1. Defaults to np.mean.
|
|
69
|
+
binner_kwargs (dict, optional): Additional arguments to pass to the
|
|
70
|
+
`QuantileBinner` used to discretize the residuals into k bins according
|
|
71
|
+
to the predicted values associated with each residual. Available arguments
|
|
72
|
+
are: `n_bins`, `method`, `subsample`, `random_state` and `dtype`.
|
|
73
|
+
Argument `method` is passed internally to the function `numpy.percentile`.
|
|
74
|
+
Defaults to None.
|
|
75
|
+
forecaster_id (str, int, optional): Name used as an identifier of the
|
|
76
|
+
forecaster. Defaults to None.
|
|
77
|
+
|
|
78
|
+
Attributes:
|
|
79
|
+
offset (int, pandas.tseries.offsets.DateOffset): Number of steps to go back
|
|
80
|
+
in time to find the most recent equivalent date to the target period.
|
|
81
|
+
n_offsets (int): Number of equivalent dates (multiple of offset) used in
|
|
82
|
+
the prediction.
|
|
83
|
+
agg_func (Callable): Function used to aggregate the values of the equivalent
|
|
84
|
+
dates when the number of equivalent dates (`n_offsets`) is greater than 1.
|
|
85
|
+
window_size (int): Number of past values needed to include the last
|
|
86
|
+
equivalent dates according to the `offset` and `n_offsets`.
|
|
87
|
+
last_window_ (pandas Series): This window represents the most recent data
|
|
88
|
+
observed by the predictor during its training phase. It contains the
|
|
89
|
+
past values needed to include the last equivalent date according the
|
|
90
|
+
`offset` and `n_offsets`.
|
|
91
|
+
index_type_ (type): Type of index of the input used in training.
|
|
92
|
+
index_freq_ (str): Frequency of Index of the input used in training.
|
|
93
|
+
training_range_ (pandas Index): First and last values of index of the data
|
|
94
|
+
used during training.
|
|
95
|
+
series_name_in_ (str): Names of the series provided by the user during training.
|
|
96
|
+
in_sample_residuals_ (numpy ndarray): Residuals of the model when predicting
|
|
97
|
+
training data. Only stored up to 10_000 values.
|
|
98
|
+
in_sample_residuals_by_bin_ (dict): In sample residuals binned according to
|
|
99
|
+
the predicted value each residual is associated with. The number of
|
|
100
|
+
residuals stored per bin is limited to `10_000 // self.binner.n_bins_`
|
|
101
|
+
in the form `{bin: residuals}`.
|
|
102
|
+
out_sample_residuals_ (numpy ndarray): Residuals of the model when predicting
|
|
103
|
+
non-training data. Only stored up to 10_000 values. Use
|
|
104
|
+
`set_out_sample_residuals()` method to set values.
|
|
105
|
+
out_sample_residuals_by_bin_ (dict): Out of sample residuals binned
|
|
106
|
+
according to the predicted value each residual is associated with.
|
|
107
|
+
The number of residuals stored per bin is limited to
|
|
108
|
+
`10_000 // self.binner.n_bins_` in the form `{bin: residuals}`.
|
|
109
|
+
binner (spotforecast.preprocessing.QuantileBinner): `QuantileBinner` used to
|
|
110
|
+
discretize residuals into k bins according to the predicted values
|
|
111
|
+
associated with each residual.
|
|
112
|
+
binner_intervals_ (dict): Intervals used to discretize residuals into k bins
|
|
113
|
+
according to the predicted values associated with each residual.
|
|
114
|
+
binner_kwargs (dict): Additional arguments to pass to the `QuantileBinner`.
|
|
115
|
+
creation_date (str): Date of creation.
|
|
116
|
+
is_fitted (bool): Tag to identify if the estimator has been fitted (trained).
|
|
117
|
+
fit_date (str): Date of last fit.
|
|
118
|
+
spotforecast_version (str): Version of spotforecast library used to create
|
|
119
|
+
the forecaster.
|
|
120
|
+
python_version (str): Version of python used to create the forecaster.
|
|
121
|
+
forecaster_id (str, int): Name used as an identifier of the forecaster.
|
|
122
|
+
|
|
123
|
+
Examples:
|
|
124
|
+
>>> import pandas as pd
|
|
125
|
+
>>> import numpy as np
|
|
126
|
+
>>> from spotforecast2.forecaster.recursive import ForecasterEquivalentDate
|
|
127
|
+
>>> # Series with daily frequency
|
|
128
|
+
>>> data = pd.Series(
|
|
129
|
+
... data = np.arange(14),
|
|
130
|
+
... index = pd.date_range(start='2022-01-01', periods=14, freq='D')
|
|
131
|
+
... )
|
|
132
|
+
>>> # Forecast based on the value 7 days ago
|
|
133
|
+
>>> forecaster = ForecasterEquivalentDate(offset=7)
|
|
134
|
+
>>> forecaster.fit(y=data)
|
|
135
|
+
>>> forecaster.predict(steps=3)
|
|
136
|
+
2022-01-15 7
|
|
137
|
+
2022-01-16 8
|
|
138
|
+
2022-01-17 9
|
|
139
|
+
Freq: D, Name: pred, dtype: int64
|
|
140
|
+
"""
|
|
141
|
+
|
|
142
|
+
def __init__(
    self,
    offset: int | pd.tseries.offsets.DateOffset,
    n_offsets: int = 1,
    agg_func: Callable = np.mean,
    binner_kwargs: dict[str, object] | None = None,
    forecaster_id: str | int | None = None,
) -> None:
    """
    Initialize the forecaster.

    Stores the configuration, validates `offset`, pre-computes the window
    size for integer offsets and builds the `QuantileBinner` used to
    discretize residuals for probabilistic prediction.

    Args:
        offset (int, pandas.tseries.offsets.DateOffset): Steps (or calendar
            offset) to go back in time to find the equivalent date.
        n_offsets (int, optional): Number of equivalent dates aggregated per
            prediction. Defaults to 1.
        agg_func (Callable, optional): Aggregation applied across the
            `n_offsets` equivalent values. Defaults to np.mean.
        binner_kwargs (dict, optional): Extra arguments for `QuantileBinner`.
            Defaults to None (library defaults are used).
        forecaster_id (str, int, optional): Identifier of the forecaster.
            Defaults to None.

    Raises:
        TypeError: If `offset` is neither an int nor a pandas DateOffset.
    """

    self.offset = offset
    self.n_offsets = n_offsets
    self.agg_func = agg_func

    # Training-state attributes; populated by `fit()`.
    self.last_window_ = None
    self.index_type_ = None
    self.index_freq_ = None
    self.training_range_ = None
    self.series_name_in_ = None

    # Residual containers used by the probabilistic prediction machinery.
    self.in_sample_residuals_ = None
    self.out_sample_residuals_ = None
    self.in_sample_residuals_by_bin_ = None
    self.out_sample_residuals_by_bin_ = None

    # Bookkeeping / metadata.
    self.creation_date = pd.Timestamp.today().strftime("%Y-%m-%d %H:%M:%S")
    self.is_fitted = False
    self.fit_date = None
    self.spotforecast_version = __version__
    self.python_version = sys.version.split(" ")[0]
    self.forecaster_id = forecaster_id
    self._probabilistic_mode = "binned"
    self.estimator = None
    self.differentiation = None
    self.differentiation_max = None
    # Known up-front only for integer offsets; for DateOffsets it is
    # computed in `fit()` because it depends on the calendar.
    self.window_size = None

    if not isinstance(self.offset, (int, pd.tseries.offsets.DateOffset)):
        raise TypeError(
            "`offset` must be an integer greater than 0 or a "
            "pandas.tseries.offsets. Find more information about offsets in "
            "https://pandas.pydata.org/docs/reference/offset_frequency.html"
        )

    if isinstance(self.offset, int):
        self.window_size = self.offset * self.n_offsets

    # `None` selects the library defaults; an explicit dict (even an empty
    # one) is forwarded unchanged to the binner.
    self.binner_kwargs = (
        {
            "n_bins": 10,
            "method": "linear",
            "subsample": 200000,
            "random_state": 789654,
            "dtype": np.float64,
        }
        if binner_kwargs is None
        else binner_kwargs
    )
    self.binner = QuantileBinner(**self.binner_kwargs)
    self.binner_intervals_ = None

    # Static capability tags describing this forecaster type.
    self.__spotforecast_tags__ = {
        "library": "spotforecast",
        "forecaster_name": "ForecasterEquivalentDate",
        "forecaster_task": "regression",
        "forecasting_scope": "single-series",  # single-series | global
        "forecasting_strategy": "recursive",  # recursive | direct | deep_learning
        "index_types_supported": ["pandas.RangeIndex", "pandas.DatetimeIndex"],
        "requires_index_frequency": True,
        "allowed_input_types_series": ["pandas.Series"],
        "supports_exog": False,
        "allowed_input_types_exog": [],
        "handles_missing_values_series": False,
        "handles_missing_values_exog": False,
        "supports_lags": False,
        "supports_window_features": False,
        "supports_transformer_series": False,
        "supports_transformer_exog": False,
        "supports_weight_func": False,
        "supports_differentiation": False,
        "prediction_types": ["point", "interval"],
        "supports_probabilistic": True,
        "probabilistic_methods": ["conformal"],
        "handles_binned_residuals": True,
    }
|
|
221
|
+
|
|
222
|
+
def __repr__(self) -> str:
|
|
223
|
+
"""
|
|
224
|
+
Information displayed when a Forecaster object is printed.
|
|
225
|
+
"""
|
|
226
|
+
|
|
227
|
+
info = (
|
|
228
|
+
f"{'=' * len(type(self).__name__)} \n"
|
|
229
|
+
f"{type(self).__name__} \n"
|
|
230
|
+
f"{'=' * len(type(self).__name__)} \n"
|
|
231
|
+
f"Offset: {self.offset} \n"
|
|
232
|
+
f"Number of offsets: {self.n_offsets} \n"
|
|
233
|
+
f"Aggregation function: {self.agg_func.__name__} \n"
|
|
234
|
+
f"Window size: {self.window_size} \n"
|
|
235
|
+
f"Series name: {self.series_name_in_} \n"
|
|
236
|
+
f"Training range: {self.training_range_.to_list() if self.is_fitted else None} \n"
|
|
237
|
+
f"Training index type: {str(self.index_type_).split('.')[-1][:-2] if self.is_fitted else None} \n"
|
|
238
|
+
f"Training index frequency: {self.index_freq_ if self.is_fitted else None} \n"
|
|
239
|
+
f"Creation date: {self.creation_date} \n"
|
|
240
|
+
f"Last fit date: {self.fit_date} \n"
|
|
241
|
+
f"spotforecast version: {self.spotforecast_version} \n"
|
|
242
|
+
f"Python version: {self.python_version} \n"
|
|
243
|
+
f"Forecaster id: {self.forecaster_id} \n"
|
|
244
|
+
)
|
|
245
|
+
|
|
246
|
+
return info
|
|
247
|
+
|
|
248
|
+
def _repr_html_(self) -> str:
    """
    HTML representation of the object for notebook front ends.

    The "General Information" section is expanded by default
    (`<details open>`); "Training Information" starts collapsed.
    """

    # Shared CSS plus a unique id so multiple forecasters rendered on the
    # same page do not collide on class names.
    style, unique_id = get_style_repr_html(self.is_fitted)

    # NOTE: leading whitespace inside this f-string only affects the HTML
    # source, not the rendered output.
    content = f"""
    <div class="container-{unique_id}">
        <p style="font-size: 1.5em; font-weight: bold; margin-block-start: 0.83em; margin-block-end: 0.83em;">{type(self).__name__}</p>
        <details open>
            <summary>General Information</summary>
            <ul>
                <li><strong>Estimator:</strong> {type(self.estimator).__name__}</li>
                <li><strong>Offset:</strong> {self.offset}</li>
                <li><strong>Number of offsets:</strong> {self.n_offsets}</li>
                <li><strong>Aggregation function:</strong> {self.agg_func.__name__}</li>
                <li><strong>Window size:</strong> {self.window_size}</li>
                <li><strong>Creation date:</strong> {self.creation_date}</li>
                <li><strong>Last fit date:</strong> {self.fit_date}</li>
                <li><strong>spotforecast version:</strong> {self.spotforecast_version}</li>
                <li><strong>Python version:</strong> {self.python_version}</li>
                <li><strong>Forecaster id:</strong> {self.forecaster_id}</li>
            </ul>
        </details>
        <details>
            <summary>Training Information</summary>
            <ul>
                <li><strong>Training range:</strong> {self.training_range_.to_list() if self.is_fitted else 'Not fitted'}</li>
                <li><strong>Training index type:</strong> {str(self.index_type_).split('.')[-1][:-2] if self.is_fitted else 'Not fitted'}</li>
                <li><strong>Training index frequency:</strong> {self.index_freq_ if self.is_fitted else 'Not fitted'}</li>
            </ul>
    </div>
    """

    return style + content
|
|
285
|
+
|
|
286
|
+
def fit(
    self,
    y: pd.Series,
    store_in_sample_residuals: bool = False,
    random_state: int = 123,
    exog: Any = None,
) -> None:
    """
    Training Forecaster.

    Args:
        y (pandas Series): Training time series.
        store_in_sample_residuals (bool, optional): If `True`, in-sample
            residuals will be stored in the forecaster object after fitting
            (`in_sample_residuals_` and `in_sample_residuals_by_bin_` attributes).
            If `False`, only the intervals of the bins are stored. Defaults to False.
        random_state (int, optional): Set a seed for the random generator so
            that the stored sample residuals are always deterministic. Defaults to 123.
        exog (Ignored): Not used, present here for API consistency by convention.

    Raises:
        TypeError: If `y` is not a pandas Series, or if `offset` is a
            DateOffset and `y` lacks a DatetimeIndex with a frequency.
        ValueError: If `y` is too short for the configured offset(s).

    Returns:
        None
    """

    if not isinstance(y, pd.Series):
        raise TypeError(
            f"`y` must be a pandas Series with a DatetimeIndex or a RangeIndex. "
            f"Found {type(y)}."
        )

    if isinstance(self.offset, pd.tseries.offsets.DateOffset):
        # A calendar offset can only be applied to a DatetimeIndex with a
        # known frequency. Try to infer the frequency when missing.
        if not isinstance(y.index, pd.DatetimeIndex):
            raise TypeError(
                "If `offset` is a pandas DateOffset, the index of `y` must be a "
                "pandas DatetimeIndex with frequency."
            )
        elif y.index.freq is None:
            # NOTE(review): this assigns the inferred frequency to the
            # caller's index in place (a side effect on the input `y`) —
            # confirm this mutation is intended.
            try:
                y.index.freq = pd.infer_freq(y.index)
            except (ValueError, TypeError):
                raise TypeError(
                    "If `offset` is a pandas DateOffset, the index of `y` must be a "
                    "pandas DatetimeIndex with frequency."
                )
            # `pd.infer_freq` may return None without raising.
            if y.index.freq is None:
                raise TypeError(
                    "If `offset` is a pandas DateOffset, the index of `y` must be a "
                    "pandas DatetimeIndex with frequency."
                )

    # Reset values in case the forecaster has already been fitted.
    self.last_window_ = None
    self.index_type_ = None
    self.index_freq_ = None
    self.training_range_ = None
    self.series_name_in_ = None
    self.is_fitted = False

    # Validate `y` and extract its index (values are not needed here).
    _, y_index = check_extract_values_and_index(
        data=y, data_label="`y`", return_values=False
    )

    if isinstance(self.offset, pd.tseries.offsets.DateOffset):
        # Calculate the window_size in steps for compatibility with the
        # check_predict_input function. This is not a exact calculation
        # because the offset follows the calendar rules and the distance
        # between two dates may not be constant.
        first_valid_index = y_index[-1] - self.offset * self.n_offsets

        try:
            window_size_idx_start = y_index.get_loc(first_valid_index)
            window_size_idx_end = y_index.get_loc(y_index[-1])
            self.window_size = window_size_idx_end - window_size_idx_start
        except KeyError:
            # The equivalent date falls before the start of `y`.
            raise ValueError(
                f"The length of `y` ({len(y)}), must be greater than or equal "
                f"to the window size ({self.window_size}). This is because "
                f"the offset ({self.offset}) is larger than the available "
                f"data. Try to decrease the size of the offset ({self.offset}), "
                f"the number of `n_offsets` ({self.n_offsets}) or increase the "
                f"size of `y`."
            )
    else:
        # Integer offsets: the window size was fixed at construction time.
        if len(y) <= self.window_size:
            raise ValueError(
                f"Length of `y` must be greater than the maximum window size "
                f"needed by the forecaster. This is because "
                f"the offset ({self.offset}) is larger than the available "
                f"data. Try to decrease the size of the offset ({self.offset}), "
                f"the number of `n_offsets` ({self.n_offsets}) or increase the "
                f"size of `y`.\n"
                f"    Length `y`: {len(y)}.\n"
                f"    Max window size: {self.window_size}.\n"
            )

    # Record training metadata.
    self.is_fitted = True
    self.series_name_in_ = y.name if y.name is not None else "y"
    self.fit_date = pd.Timestamp.today().strftime("%Y-%m-%d %H:%M:%S")
    self.training_range_ = y_index[[0, -1]]
    self.index_type_ = type(y_index)
    # For a RangeIndex the "frequency" is its step.
    self.index_freq_ = (
        y_index.freq if isinstance(y_index, pd.DatetimeIndex) else y_index.step
    )

    # NOTE: This is done to save time during fit in functions such as backtesting()
    if self._probabilistic_mode is not False:
        self._binning_in_sample_residuals(
            y=y,
            store_in_sample_residuals=store_in_sample_residuals,
            random_state=random_state,
        )

    # The last time window of training data is stored so that equivalent
    # dates are available when calling the `predict` method.
    # Store the whole series to avoid errors when the offset is larger
    # than the data available.
    self.last_window_ = y.copy()
|
|
403
|
+
|
|
404
|
+
def _binning_in_sample_residuals(
    self,
    y: pd.Series,
    store_in_sample_residuals: bool = False,
    random_state: int = 123,
) -> None:
    """
    Bin residuals according to the predicted value each residual is
    associated with. First a `spotforecast.preprocessing.QuantileBinner` object
    is fitted to the predicted values. Then, residuals are binned according
    to the predicted value each residual is associated with. Residuals are
    stored in the forecaster object as `in_sample_residuals_` and
    `in_sample_residuals_by_bin_`.

    The number of residuals stored per bin is limited to
    `10_000 // self.binner.n_bins_`. The total number of residuals stored is
    `10_000`.
    **New in version 0.17.0**

    Args:
        y (pandas Series): Training time series.
        store_in_sample_residuals (bool, optional): If `True`, in-sample
            residuals will be stored in the forecaster object after fitting
            (`in_sample_residuals_` and `in_sample_residuals_by_bin_` attributes).
            If `False`, only the intervals of the bins are stored. Defaults to False.
        random_state (int, optional): Set a seed for the random generator so
            that the stored sample residuals are always deterministic. Defaults to 123.

    Returns:
        None
    """

    if isinstance(self.offset, pd.tseries.offsets.DateOffset):
        # Build one column of in-sample "predictions" per offset multiple
        # by shifting the series backwards in calendar terms.
        y_preds = []
        for n_off in range(1, self.n_offsets + 1):
            idx = y.index - self.offset * n_off
            # Keep only shifted dates that still fall inside the series.
            mask = idx >= y.index[0]
            y_pred = y.loc[idx[mask]]
            # Re-align the shifted values with the tail of the original index.
            # NOTE(review): if `mask.sum()` were 0 this slice would be
            # `[-0:]`, i.e. the whole index; `fit()` appears to guard
            # against that case — confirm.
            y_pred.index = y.index[-mask.sum() :]
            y_preds.append(y_pred)

        y_preds = pd.concat(y_preds, axis=1).to_numpy()
        y_true = y.to_numpy()[-len(y_preds) :]

    else:
        # Integer offsets: positional shift per offset multiple; the first
        # `window_size` rows have no equivalent date and are dropped.
        y_preds = [
            y.shift(self.offset * n_off)[self.window_size :]
            for n_off in range(1, self.n_offsets + 1)
        ]
        y_preds = np.column_stack(y_preds)
        y_true = y.to_numpy()[self.window_size :]

    # Aggregate the equivalent-date values row-wise to obtain the
    # in-sample predictions.
    y_pred = np.apply_along_axis(self.agg_func, axis=1, arr=y_preds)

    residuals = y_true - y_pred

    if self._probabilistic_mode == "binned":
        # Drop rows where either side is NaN, then fit the binner on the
        # predictions so residuals can be grouped by predicted value.
        data = pd.DataFrame({"prediction": y_pred, "residuals": residuals}).dropna()
        y_pred = data["prediction"].to_numpy()
        residuals = data["residuals"].to_numpy()

        self.binner.fit(y_pred)
        self.binner_intervals_ = self.binner.intervals_

    if store_in_sample_residuals:
        rng = np.random.default_rng(seed=random_state)
        if self._probabilistic_mode == "binned":
            data["bin"] = self.binner.transform(y_pred).astype(int)
            self.in_sample_residuals_by_bin_ = (
                data.groupby("bin")["residuals"].apply(np.array).to_dict()
            )

            # Cap the number of residuals stored per bin (10_000 in total
            # across all bins).
            max_sample = 10_000 // self.binner.n_bins_
            for k, v in self.in_sample_residuals_by_bin_.items():
                if len(v) > max_sample:
                    # `rng.integers` samples WITH replacement.
                    sample = v[rng.integers(low=0, high=len(v), size=max_sample)]
                    self.in_sample_residuals_by_bin_[k] = sample

            # Ensure every bin of the fitted binner has an entry.
            for k in self.binner_intervals_.keys():
                if k not in self.in_sample_residuals_by_bin_:
                    self.in_sample_residuals_by_bin_[k] = np.array([])

            # Fill empty bins with a sample drawn from the global pool.
            empty_bins = [
                k
                for k, v in self.in_sample_residuals_by_bin_.items()
                if v.size == 0
            ]
            if empty_bins:
                empty_bin_size = min(max_sample, len(residuals))
                for k in empty_bins:
                    self.in_sample_residuals_by_bin_[k] = rng.choice(
                        a=residuals, size=empty_bin_size, replace=False
                    )

        # Global residual pool is capped at 10_000 values.
        if len(residuals) > 10_000:
            residuals = residuals[
                rng.integers(low=0, high=len(residuals), size=10_000)
            ]

        self.in_sample_residuals_ = residuals
|
|
504
|
+
|
|
505
|
+
def predict(
    self,
    steps: int,
    last_window: pd.Series | None = None,
    check_inputs: bool = True,
    exog: Any = None,
) -> pd.Series:
    """
    Predict n steps ahead.

    Args:
        steps (int): Number of steps to predict.
        last_window (pandas Series, optional): Past values needed to select the
            last equivalent dates according to the offset. If `last_window = None`,
            the values stored in `self.last_window_` are used and the predictions
            start immediately after the training data. Defaults to None.
        check_inputs (bool, optional): If `True`, the input is checked for
            possible warnings and errors with the `check_predict_input` function.
            This argument is created for internal use and is not recommended to
            be changed. Defaults to True.
        exog (Ignored): Not used, present here for API consistency by convention.

    Raises:
        ValueError: If `offset` is a DateOffset and every equivalent value
            falls outside the available `last_window`.

    Returns:
        pd.Series: Predicted values.
    """

    if last_window is None:
        last_window = self.last_window_

    if check_inputs:
        check_predict_input(
            forecaster_name=type(self).__name__,
            steps=steps,
            is_fitted=self.is_fitted,
            exog_in_=False,
            index_type_=self.index_type_,
            index_freq_=self.index_freq_,
            window_size=self.window_size,
            last_window=last_window,
        )

    # Future index the predictions will be aligned to.
    prediction_index = expand_index(index=last_window.index, steps=steps)

    if isinstance(self.offset, int):

        last_window_values = last_window.to_numpy(copy=True).ravel()

        # Cycle over the last `offset` observations so any number of steps
        # can be produced: step k reuses position -offset + (k % offset).
        equivalent_indexes = np.tile(
            np.arange(-self.offset, 0), int(np.ceil(steps / self.offset))
        )
        equivalent_indexes = equivalent_indexes[:steps]

        if self.n_offsets == 1:
            equivalent_values = last_window_values[equivalent_indexes]
            predictions = equivalent_values.ravel()

        if self.n_offsets > 1:
            # One row of indexes per offset multiple; aggregate column-wise.
            equivalent_indexes = [
                equivalent_indexes - n * self.offset
                for n in np.arange(self.n_offsets)
            ]
            equivalent_indexes = np.vstack(equivalent_indexes)
            equivalent_values = last_window_values[equivalent_indexes]
            predictions = np.apply_along_axis(
                self.agg_func, axis=0, arr=equivalent_values
            )

        predictions = pd.Series(
            data=predictions, index=prediction_index, name="pred"
        )

    if isinstance(self.offset, pd.tseries.offsets.DateOffset):

        last_window = last_window.copy()
        max_allowed_date = last_window.index[-1]

        # For every date in prediction_index, calculate the n offsets:
        # walk backwards one offset at a time, keeping only dates inside
        # the observed window, until `n_offsets` dates are collected.
        offset_dates = []
        for date in prediction_index:
            selected_offsets = []
            while len(selected_offsets) < self.n_offsets:
                offset_date = date - self.offset
                if offset_date <= max_allowed_date:
                    selected_offsets.append(offset_date)
                # NOTE(review): `date` must advance on every iteration (not
                # only when the candidate is accepted), otherwise the loop
                # would never terminate for dates more than one offset into
                # the future — confirm this matches the released layout.
                date = offset_date
            offset_dates.append(selected_offsets)

        offset_dates = np.array(offset_dates)

        # Select the values of the time series corresponding to the each
        # offset date. If the offset date is not in the time series, the
        # value is set to NaN.
        equivalent_values = (
            last_window.reindex(offset_dates.ravel())
            .to_numpy()
            .reshape(-1, self.n_offsets)
        )
        equivalent_values = pd.DataFrame(
            data=equivalent_values,
            index=prediction_index,
            columns=[f"offset_{i}" for i in range(self.n_offsets)],
        )

        # Error if all values are missing
        if equivalent_values.isnull().all().all():
            raise ValueError(
                f"All equivalent values are missing. This is caused by using "
                f"an offset ({self.offset}) larger than the available data. "
                f"Try to decrease the size of the offset ({self.offset}), "
                f"the number of `n_offsets` ({self.n_offsets}) or increase the "
                f"size of `last_window`. In backtesting, this error may be "
                f"caused by using an `initial_train_size` too small."
            )

        # Warning if equivalent values are missing
        incomplete_offsets = equivalent_values.isnull().any(axis=1)
        incomplete_offsets = incomplete_offsets[incomplete_offsets].index
        if not incomplete_offsets.empty:
            warnings.warn(
                f"Steps: {incomplete_offsets.strftime('%Y-%m-%d').to_list()} "
                f"are calculated with less than {self.n_offsets} `n_offsets`. "
                f"To avoid this, increase the `last_window` size or decrease "
                f"the number of `n_offsets`. The current configuration requires "
                f"a total offset of {self.offset * self.n_offsets}.",
                MissingValuesWarning,
            )

        # NaNs are ignored or propagated according to `agg_func` itself.
        aggregate_values = equivalent_values.apply(self.agg_func, axis=1)
        predictions = aggregate_values.rename("pred")

    return predictions
|
|
635
|
+
|
|
636
|
+
def predict_interval(
    self,
    steps: int,
    last_window: pd.Series | None = None,
    method: str = "conformal",
    interval: float | list[float] | tuple[float] = [5, 95],
    use_in_sample_residuals: bool = True,
    use_binned_residuals: bool = True,
    random_state: Any = None,
    exog: Any = None,
    n_boot: Any = None,
) -> pd.DataFrame:
    """
    Predict n steps ahead and estimate prediction intervals using conformal
    prediction method. Refer to the References section for additional
    details on this method.

    Args:
        steps (int): Number of steps to predict.
        last_window (pandas Series, optional): Past values needed to select the
            last equivalent dates according to the offset. If `last_window = None`,
            the values stored in `self.last_window_` are used and the predictions
            start immediately after the training data. Defaults to None.
        method (str, optional): Technique used to estimate prediction intervals.
            Available options:
            - 'conformal': Employs the conformal prediction split method for
            interval estimation [1]_. Defaults to 'conformal'.
        interval (float, list, tuple, optional): Confidence level of the
            prediction interval. Interpretation depends on the method used:
            - If `float`, represents the nominal (expected) coverage (between 0
            and 1). For instance, `interval=0.95` corresponds to `[2.5, 97.5]`
            percentiles.
            - If `list` or `tuple`, defines the exact percentiles to compute,
            which must be between 0 and 100 inclusive. For example, interval
            of 95% should be as `interval = [2.5, 97.5]`.
            - When using `method='conformal'`, the interval must be a float or
            a list/tuple defining a symmetric interval. Defaults to [5, 95].
        use_in_sample_residuals (bool, optional): If `True`, residuals from the
            training data are used as proxy of prediction error to create predictions.
            If `False`, out of sample residuals (calibration) are used.
            Out-of-sample residuals must be precomputed using Forecaster's
            `set_out_sample_residuals()` method. Defaults to True.
        use_binned_residuals (bool, optional): If `True`, residuals are selected
            based on the predicted values (binned selection).
            If `False`, residuals are selected randomly. Defaults to True.
        random_state (Ignored): Not used, present here for API consistency by convention.
        exog (Ignored): Not used, present here for API consistency by convention.
        n_boot (Ignored): Not used, present here for API consistency by convention.

    Returns:
        pd.DataFrame: Values predicted by the forecaster and their estimated interval.
        - pred: predictions.
        - lower_bound: lower bound of the interval.
        - upper_bound: upper bound of the interval.

    Raises:
        ValueError: If `method` is not 'conformal', or if (for a DateOffset
            offset) every equivalent value falls outside `last_window`.

    References:
        .. [1] MAPIE - Model Agnostic Prediction Interval Estimator.
            https://mapie.readthedocs.io/en/stable/theoretical_description_regression.html#the-split-method
    """

    # NOTE(review): `interval=[5, 95]` is a mutable default argument. It is
    # only read below, never mutated, so it is harmless in practice, but a
    # tuple default would be safer.

    # Only the split conformal method is implemented.
    if method != "conformal":
        raise ValueError(
            f"Method '{method}' is not supported. Only 'conformal' is available."
        )

    # Fall back to the window stored at fit time so predictions start right
    # after the training data.
    if last_window is None:
        last_window = self.last_window_

    check_predict_input(
        forecaster_name=type(self).__name__,
        steps=steps,
        is_fitted=self.is_fitted,
        exog_in_=False,
        index_type_=self.index_type_,
        index_freq_=self.index_freq_,
        window_size=self.window_size,
        last_window=last_window,
    )

    check_residuals_input(
        forecaster_name=type(self).__name__,
        use_in_sample_residuals=use_in_sample_residuals,
        in_sample_residuals_=self.in_sample_residuals_,
        out_sample_residuals_=self.out_sample_residuals_,
        use_binned_residuals=use_binned_residuals,
        in_sample_residuals_by_bin_=self.in_sample_residuals_by_bin_,
        out_sample_residuals_by_bin_=self.out_sample_residuals_by_bin_,
    )

    # Normalize `interval` into a single nominal coverage in (0, 1).
    # A [lo, hi] percentile pair must be symmetric; its width / 100 is the
    # coverage (e.g. [5, 95] -> 0.90).
    if isinstance(interval, (list, tuple)):
        check_interval(interval=interval, ensure_symmetric_intervals=True)
        nominal_coverage = (interval[1] - interval[0]) / 100
    else:
        check_interval(alpha=interval, alpha_literal="interval")
        nominal_coverage = interval

    # Select the residual pool used to calibrate the conformal correction.
    if use_in_sample_residuals:
        residuals = self.in_sample_residuals_
        residuals_by_bin = self.in_sample_residuals_by_bin_
    else:
        residuals = self.out_sample_residuals_
        residuals_by_bin = self.out_sample_residuals_by_bin_

    prediction_index = expand_index(index=last_window.index, steps=steps)

    if isinstance(self.offset, int):

        # Integer offset: equivalent values are taken by position. The last
        # `offset` values of the window are tiled (as negative indexes) to
        # cover `steps` predictions.
        last_window_values = last_window.to_numpy(copy=True).ravel()
        equivalent_indexes = np.tile(
            np.arange(-self.offset, 0), int(np.ceil(steps / self.offset))
        )
        equivalent_indexes = equivalent_indexes[:steps]

        if self.n_offsets == 1:
            equivalent_values = last_window_values[equivalent_indexes]
            predictions = equivalent_values.ravel()

        if self.n_offsets > 1:
            # Stack one row of indexes per offset (each shifted a further
            # `offset` back) and aggregate column-wise across offsets.
            equivalent_indexes = [
                equivalent_indexes - n * self.offset
                for n in np.arange(self.n_offsets)
            ]
            equivalent_indexes = np.vstack(equivalent_indexes)
            equivalent_values = last_window_values[equivalent_indexes]
            predictions = np.apply_along_axis(
                self.agg_func, axis=0, arr=equivalent_values
            )

    if isinstance(self.offset, pd.tseries.offsets.DateOffset):

        last_window = last_window.copy()
        max_allowed_date = last_window.index[-1]

        # For every date in prediction_index, calculate the n offsets
        # (walk backwards by `self.offset` until `n_offsets` dates at or
        # before the end of `last_window` have been collected).
        offset_dates = []
        for date in prediction_index:
            selected_offsets = []
            while len(selected_offsets) < self.n_offsets:
                offset_date = date - self.offset
                if offset_date <= max_allowed_date:
                    selected_offsets.append(offset_date)
                date = offset_date
            offset_dates.append(selected_offsets)

        offset_dates = np.array(offset_dates)

        # Select the values of the time series corresponding to the each
        # offset date. If the offset date is not in the time series, the
        # value is set to NaN.
        equivalent_values = (
            last_window.reindex(offset_dates.ravel())
            .to_numpy()
            .reshape(-1, self.n_offsets)
        )
        equivalent_values = pd.DataFrame(
            data=equivalent_values,
            index=prediction_index,
            columns=[f"offset_{i}" for i in range(self.n_offsets)],
        )

        # Error if all values are missing
        if equivalent_values.isnull().all().all():
            raise ValueError(
                f"All equivalent values are missing. This is caused by using "
                f"an offset ({self.offset}) larger than the available data. "
                f"Try to decrease the size of the offset ({self.offset}), "
                f"the number of `n_offsets` ({self.n_offsets}) or increase the "
                f"size of `last_window`. In backtesting, this error may be "
                f"caused by using an `initial_train_size` too small."
            )

        # Warning if equivalent values are missing
        incomplete_offsets = equivalent_values.isnull().any(axis=1)
        incomplete_offsets = incomplete_offsets[incomplete_offsets].index
        if not incomplete_offsets.empty:
            warnings.warn(
                f"Steps: {incomplete_offsets.strftime('%Y-%m-%d').to_list()} "
                f"are calculated with less than {self.n_offsets} `n_offsets`. "
                f"To avoid this, increase the `last_window` size or decrease "
                f"the number of `n_offsets`. The current configuration requires "
                f"a total offset of {self.offset * self.n_offsets}.",
                MissingValuesWarning,
            )

        # Row-wise aggregation across offsets (NaNs handled by `agg_func`).
        aggregate_values = equivalent_values.apply(self.agg_func, axis=1)
        predictions = aggregate_values.to_numpy()

    if use_binned_residuals:
        # One conformal correction factor per bin: the `nominal_coverage`
        # quantile of the absolute residuals belonging to that bin.
        correction_factor_by_bin = {
            k: np.quantile(np.abs(v), nominal_coverage)
            for k, v in residuals_by_bin.items()
        }
        # NOTE(review): assumes every bin produced by `self.binner.transform`
        # has an entry in `residuals_by_bin`; otherwise this raises KeyError.
        # TODO confirm the binner guarantees this.
        replace_func = np.vectorize(lambda x: correction_factor_by_bin[x])
        predictions_bin = self.binner.transform(predictions)
        correction_factor = replace_func(predictions_bin)
    else:
        correction_factor = np.quantile(np.abs(residuals), nominal_coverage)

    # Symmetric conformal interval: point prediction +/- correction factor.
    lower_bound = predictions - correction_factor
    upper_bound = predictions + correction_factor
    predictions = np.column_stack([predictions, lower_bound, upper_bound])

    predictions = pd.DataFrame(
        data=predictions,
        index=prediction_index,
        columns=["pred", "lower_bound", "upper_bound"],
    )

    return predictions
|
|
845
|
+
|
|
846
|
+
def set_in_sample_residuals(
    self, y: pd.Series, random_state: int = 123, exog: Any = None
) -> None:
    """
    Compute and store in-sample residuals when they were not calculated
    during the training process.

    In-sample residuals are the differences between the true training
    values and the forecaster's predictions over the training data. The
    following internal attributes are updated: `in_sample_residuals_`
    (residuals as a numpy ndarray), `binner_intervals_` (bin intervals
    derived from the quantiles of the predicted values) and
    `in_sample_residuals_by_bin_` (residuals grouped by the bin of their
    associated prediction). At most 10_000 residuals are kept overall and
    `10_000 // self.binner.n_bins_` per bin; excess residuals are randomly
    subsampled.

    Args:
        y (pandas Series): Training time series. Its index must span exactly
            the range used during training.
        random_state (int, optional): Seed for the random subsampling, for
            reproducible output. Defaults to 123.
        exog (Ignored): Not used, present here for API consistency by convention.

    Returns:
        None

    Raises:
        NotFittedError: If the forecaster has not been fitted yet.
        IndexError: If the index range of `y` differs from the training range.
    """

    # Binning relies on state created during `fit`, so refuse to run on an
    # unfitted forecaster.
    if not self.is_fitted:
        raise NotFittedError(
            "This forecaster is not fitted yet. Call `fit` with appropriate "
            "arguments before using `set_in_sample_residuals()`."
        )

    check_y(y=y)

    # Extract the index of `y` and keep only its first and last entries to
    # compare against the range stored at training time.
    extracted_index = check_extract_values_and_index(
        data=y, data_label="`y`", return_values=False
    )[1]
    y_index_range = extracted_index[[0, -1]]
    if not y_index_range.equals(self.training_range_):
        raise IndexError(
            f"The index range of `y` does not match the range "
            f"used during training. Please ensure the index is aligned "
            f"with the training data.\n"
            f"    Expected : {self.training_range_}\n"
            f"    Received : {y_index_range}"
        )

    # Delegate residual computation and binning to the shared helper.
    self._binning_in_sample_residuals(
        y=y, store_in_sample_residuals=True, random_state=random_state
    )
|
|
907
|
+
|
|
908
|
+
def set_out_sample_residuals(
    self,
    y_true: np.ndarray | pd.Series,
    y_pred: np.ndarray | pd.Series,
    append: bool = False,
    random_state: int = 123,
) -> None:
    """
    Set new values to the attribute `out_sample_residuals_`. Out of sample
    residuals are meant to be calculated using observations that did not
    participate in the training process. Two internal attributes are updated:

    + `out_sample_residuals_`: residuals stored in a numpy ndarray.
    + `out_sample_residuals_by_bin_`: residuals are binned according to the
    predicted value they are associated with and stored in a dictionary, where
    the keys are the intervals of the predicted values and the values are
    the residuals associated with that range. If a bin is empty, it
    is filled with a random sample of residuals from other bins. This is done
    to ensure that all bins have at least one residual and can be used in the
    prediction process.

    A total of 10_000 residuals are stored in the attribute `out_sample_residuals_`.
    If the number of residuals is greater than 10_000, a random sample of
    10_000 residuals is stored. The number of residuals stored per bin is
    limited to `10_000 // self.binner.n_bins_`.

    Parameters
    ----------
    y_true : numpy ndarray, pandas Series
        True values of the time series from which the residuals have been
        calculated.
    y_pred : numpy ndarray, pandas Series
        Predicted values of the time series.
    append : bool, default False
        If `True`, new residuals are added to the ones already stored in the
        forecaster. If after appending the new residuals, the limit of
        `10_000 // self.binner.n_bins_` values per bin is reached, a random
        sample of residuals is stored.
    random_state : int, default 123
        Sets a seed to the random sampling for reproducible output.

    Returns
    -------
    None

    Raises
    ------
    NotFittedError
        If the forecaster has not been fitted yet.
    TypeError
        If `y_true` or `y_pred` is neither a numpy ndarray nor a pandas Series.
    ValueError
        If `y_true` and `y_pred` have different lengths or misaligned indexes.

    """

    # The binner (and its fitted intervals) only exists after `fit`.
    if not self.is_fitted:
        raise NotFittedError(
            "This forecaster is not fitted yet. Call `fit` with appropriate "
            "arguments before using `set_out_sample_residuals()`."
        )

    if not isinstance(y_true, (np.ndarray, pd.Series)):
        raise TypeError(
            f"`y_true` argument must be `numpy ndarray` or `pandas Series`. "
            f"Got {type(y_true)}."
        )

    if not isinstance(y_pred, (np.ndarray, pd.Series)):
        raise TypeError(
            f"`y_pred` argument must be `numpy ndarray` or `pandas Series`. "
            f"Got {type(y_pred)}."
        )

    if len(y_true) != len(y_pred):
        raise ValueError(
            f"`y_true` and `y_pred` must have the same length. "
            f"Got {len(y_true)} and {len(y_pred)}."
        )

    # When both inputs are Series, their indexes must be aligned so that
    # residuals pair true and predicted values for the same timestamps.
    if isinstance(y_true, pd.Series) and isinstance(y_pred, pd.Series):
        if not y_true.index.equals(y_pred.index):
            raise ValueError("`y_true` and `y_pred` must have the same index.")

    # Work with plain numpy arrays from here on.
    if not isinstance(y_pred, np.ndarray):
        y_pred = y_pred.to_numpy()
    if not isinstance(y_true, np.ndarray):
        y_true = y_true.to_numpy()

    # Pair each prediction with its residual and drop rows containing NaN
    # in either column.
    data = pd.DataFrame(
        {"prediction": y_pred, "residuals": y_true - y_pred}
    ).dropna()
    y_pred = data["prediction"].to_numpy()
    residuals = data["residuals"].to_numpy()

    # Assign each residual to the bin of its associated predicted value.
    data["bin"] = self.binner.transform(y_pred).astype(int)
    residuals_by_bin = data.groupby("bin")["residuals"].apply(np.array).to_dict()

    # Start from the currently stored residuals (empty containers when none
    # have been set yet).
    out_sample_residuals = (
        np.array([])
        if self.out_sample_residuals_ is None
        else self.out_sample_residuals_
    )
    out_sample_residuals_by_bin = (
        {}
        if self.out_sample_residuals_by_bin_ is None
        else self.out_sample_residuals_by_bin_
    )
    if append:
        # Merge new residuals into the existing pool and per-bin groups.
        out_sample_residuals = np.concatenate([out_sample_residuals, residuals])
        for k, v in residuals_by_bin.items():
            if k in out_sample_residuals_by_bin:
                out_sample_residuals_by_bin[k] = np.concatenate(
                    (out_sample_residuals_by_bin[k], v)
                )
            else:
                out_sample_residuals_by_bin[k] = v
    else:
        # Replace any previously stored residuals.
        out_sample_residuals = residuals
        out_sample_residuals_by_bin = residuals_by_bin

    # Cap the number of residuals kept per bin.
    max_samples = 10_000 // self.binner.n_bins_
    rng = np.random.default_rng(seed=random_state)
    for k, v in out_sample_residuals_by_bin.items():
        if len(v) > max_samples:
            sample = rng.choice(a=v, size=max_samples, replace=False)
            out_sample_residuals_by_bin[k] = sample

    # Ensure every bin known to the binner has an entry, even if empty.
    bin_keys = (
        [] if self.binner_intervals_ is None else self.binner_intervals_.keys()
    )
    for k in bin_keys:
        if k not in out_sample_residuals_by_bin:
            out_sample_residuals_by_bin[k] = np.array([])

    # Fill empty bins with a random sample drawn from the whole pool so that
    # binned interval estimation never hits an empty bin.
    empty_bins = [k for k, v in out_sample_residuals_by_bin.items() if v.size == 0]
    if empty_bins:
        warnings.warn(
            f"The following bins have no out of sample residuals: {empty_bins}. "
            f"No predicted values fall in the interval "
            f"{[self.binner_intervals_[bin] for bin in empty_bins]}. "
            f"Empty bins will be filled with a random sample of residuals.",
            ResidualsUsageWarning,
        )
        # NOTE(review): if `out_sample_residuals` is itself empty (all rows
        # dropped as NaN), `rng.choice` below would fail — TODO confirm
        # upstream validation prevents that case.
        empty_bin_size = min(max_samples, len(out_sample_residuals))
        for k in empty_bins:
            out_sample_residuals_by_bin[k] = rng.choice(
                a=out_sample_residuals, size=empty_bin_size, replace=False
            )

    # Cap the overall pool at 10_000 residuals.
    if len(out_sample_residuals) > 10_000:
        out_sample_residuals = rng.choice(
            a=out_sample_residuals, size=10_000, replace=False
        )

    self.out_sample_residuals_ = out_sample_residuals
    self.out_sample_residuals_by_bin_ = out_sample_residuals_by_bin
|
|
1056
|
+
|
|
1057
|
+
def get_tags(self) -> dict[str, Any]:
    """
    Return the tags describing this forecaster's behavior.

    Returns:
        dict: Dictionary with forecaster tags.
    """

    # Tags are declared on the class as `__spotforecast_tags__`.
    tags = self.__spotforecast_tags__
    return tags
|
|
1066
|
+
|
|
1067
|
+
def summary(self) -> None:
    """
    Print the forecaster's information to standard output.

    Returns:
        None
    """

    # Delegates entirely to the forecaster's string representation.
    print(str(self))
|