spotforecast2-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spotforecast2/.DS_Store +0 -0
- spotforecast2/__init__.py +2 -0
- spotforecast2/data/__init__.py +0 -0
- spotforecast2/data/data.py +130 -0
- spotforecast2/data/fetch_data.py +209 -0
- spotforecast2/exceptions.py +681 -0
- spotforecast2/forecaster/.DS_Store +0 -0
- spotforecast2/forecaster/__init__.py +7 -0
- spotforecast2/forecaster/base.py +448 -0
- spotforecast2/forecaster/metrics.py +527 -0
- spotforecast2/forecaster/recursive/__init__.py +4 -0
- spotforecast2/forecaster/recursive/_forecaster_equivalent_date.py +1075 -0
- spotforecast2/forecaster/recursive/_forecaster_recursive.py +939 -0
- spotforecast2/forecaster/recursive/_warnings.py +15 -0
- spotforecast2/forecaster/utils.py +954 -0
- spotforecast2/model_selection/__init__.py +5 -0
- spotforecast2/model_selection/bayesian_search.py +453 -0
- spotforecast2/model_selection/grid_search.py +314 -0
- spotforecast2/model_selection/random_search.py +151 -0
- spotforecast2/model_selection/split_base.py +357 -0
- spotforecast2/model_selection/split_one_step.py +245 -0
- spotforecast2/model_selection/split_ts_cv.py +634 -0
- spotforecast2/model_selection/utils_common.py +718 -0
- spotforecast2/model_selection/utils_metrics.py +103 -0
- spotforecast2/model_selection/validation.py +685 -0
- spotforecast2/preprocessing/__init__.py +30 -0
- spotforecast2/preprocessing/_binner.py +378 -0
- spotforecast2/preprocessing/_common.py +123 -0
- spotforecast2/preprocessing/_differentiator.py +123 -0
- spotforecast2/preprocessing/_rolling.py +136 -0
- spotforecast2/preprocessing/curate_data.py +254 -0
- spotforecast2/preprocessing/imputation.py +92 -0
- spotforecast2/preprocessing/outlier.py +114 -0
- spotforecast2/preprocessing/split.py +139 -0
- spotforecast2/py.typed +0 -0
- spotforecast2/utils/__init__.py +43 -0
- spotforecast2/utils/convert_to_utc.py +44 -0
- spotforecast2/utils/data_transform.py +208 -0
- spotforecast2/utils/forecaster_config.py +344 -0
- spotforecast2/utils/generate_holiday.py +70 -0
- spotforecast2/utils/validation.py +569 -0
- spotforecast2/weather/__init__.py +0 -0
- spotforecast2/weather/weather_client.py +288 -0
- spotforecast2-0.0.1.dist-info/METADATA +47 -0
- spotforecast2-0.0.1.dist-info/RECORD +46 -0
- spotforecast2-0.0.1.dist-info/WHEEL +4 -0
@@ -0,0 +1,634 @@
+"""
+Time series cross-validation splitting.
+"""
+
+from __future__ import annotations
+import warnings
+import itertools
+import numpy as np
+import pandas as pd
+
+from spotforecast2.forecaster.utils import date_to_index_position, get_style_repr_html
+from spotforecast2.exceptions import IgnoredArgumentWarning
+from .split_base import BaseFold
+
+
+class TimeSeriesFold(BaseFold):
+    """
+    Class to split time series data into train and test folds.
+    When used within a backtesting or hyperparameter search, the arguments
+    'initial_train_size', 'window_size' and 'differentiation' are not required
+    as they are automatically set by the backtesting or hyperparameter search
+    functions.
+
+    Args:
+        steps (int): Number of observations to be predicted in each fold.
+            This is also commonly referred to as the forecast horizon or test size.
+        initial_train_size (int | str | pd.Timestamp, optional): Number of observations
+            used for initial training.
+
+            - If `None` or 0, the initial forecaster is not trained in the first fold.
+            - If an integer, the number of observations used for initial training.
+            - If a date string or pandas Timestamp, it is the last date included in
+              the initial training set.
+            Defaults to None.
+        fold_stride (int, optional): Number of observations that the start of the test
+            set advances between consecutive folds.
+
+            - If `None`, it defaults to the same value as `steps`, meaning that folds
+              are placed back-to-back without overlap.
+            - If `fold_stride < steps`, test sets overlap and multiple forecasts will
+              be generated for the same observations.
+            - If `fold_stride > steps`, gaps are left between consecutive test sets.
+            **New in version 0.18.0**
+            Defaults to None.
+        window_size (int, optional): Number of observations needed to generate the
+            autoregressive predictors. Defaults to None.
+        differentiation (int, optional): Number of observations to use for differentiation.
+            This is used to extend the `last_window` as many observations as the
+            differentiation order. Defaults to None.
+        refit (bool | int, optional): Whether to refit the forecaster in each fold.
+
+            - If `True`, the forecaster is refitted in each fold.
+            - If `False`, the forecaster is trained only in the first fold.
+            - If an integer, the forecaster is trained in the first fold and then refitted
+              every `refit` folds.
+            Defaults to False.
+        fixed_train_size (bool, optional): Whether the training size is fixed or increases
+            in each fold. Defaults to True.
+        gap (int, optional): Number of observations between the end of the training set
+            and the start of the test set. Defaults to 0.
+        skip_folds (int | list, optional): Number of folds to skip.
+
+            - If an integer, every 'skip_folds'-th fold is returned.
+            - If a list, the indexes of the folds to skip.
+
+            For example, if `skip_folds=3` and there are 10 folds, the returned folds are
+            0, 3, 6, and 9. If `skip_folds=[1, 2, 3]`, the returned folds are 0, 4, 5, 6, 7,
+            8, and 9. Defaults to None.
+        allow_incomplete_fold (bool, optional): Whether to allow the last fold to include
+            fewer observations than `steps`. If `False`, the last fold is excluded if it
+            is incomplete. Defaults to True.
+        return_all_indexes (bool, optional): Whether to return all indexes or only the
+            start and end indexes of each fold. Defaults to False.
+        verbose (bool, optional): Whether to print information about generated folds.
+            Defaults to True.
+
+    Attributes:
+        steps (int): Number of observations to be predicted in each fold.
+        initial_train_size (int): Number of observations used for initial training.
+            If `None` or 0, the initial forecaster is not trained in the first fold.
+        fold_stride (int): Number of observations that the start of the test set
+            advances between consecutive folds.
+        overlapping_folds (bool): Whether the folds overlap.
+        window_size (int): Number of observations needed to generate the
+            autoregressive predictors.
+        differentiation (int): Number of observations to use for differentiation.
+            This is used to extend the `last_window` as many observations as the
+            differentiation order.
+        refit (bool | int): Whether to refit the forecaster in each fold.
+        fixed_train_size (bool): Whether the training size is fixed or increases in each fold.
+        gap (int): Number of observations between the end of the training set and the
+            start of the test set.
+        skip_folds (int | list): Number of folds to skip.
+        allow_incomplete_fold (bool): Whether to allow the last fold to include fewer
+            observations than `steps`.
+        return_all_indexes (bool): Whether to return all indexes or only the start
+            and end indexes of each fold.
+        verbose (bool): Whether to print information about generated folds.
+
+    Note:
+        Returned values are the positions of the observations and not the actual values of
+        the index, so they can be used to slice the data directly using iloc. For example,
+        if the input series is `X = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]`, the
+        `initial_train_size = 3`, `window_size = 2`, `steps = 4`, and `gap = 1`,
+        the output of the first fold will be: [0, [0, 3], [1, 3], [3, 8], [4, 8], True].
+
+        The first element is the fold number, the first list `[0, 3]` indicates that
+        the training set goes from the first to the third observation. The second
+        list `[1, 3]` indicates that the last window seen by the forecaster during
+        training goes from the second to the third observation. The third list `[3, 8]`
+        indicates that the test set goes from the fourth to the eighth observation.
+        The fourth list `[4, 8]` indicates that the test set including the gap goes
+        from the fifth to the eighth observation. The boolean `True` indicates that
+        the forecaster should be trained in this fold.
+
+        Following the python convention, the start index is inclusive and the end index is
+        exclusive. This means that the last index is not included in the slice.
+
+        As an example, with `initial_train_size=50`, `steps=30`, and `fold_stride=7`,
+        the first test fold will cover observations [50, 80), the second fold [57, 87),
+        and the third fold [64, 94). This configuration produces multiple forecasts
+        for the same observations, which is often desirable in rolling-origin
+        evaluation.
+    """
+
+    def __init__(
+        self,
+        steps: int,
+        initial_train_size: int | str | pd.Timestamp | None = None,
+        fold_stride: int | None = None,
+        window_size: int | None = None,
+        differentiation: int | None = None,
+        refit: bool | int = False,
+        fixed_train_size: bool = True,
+        gap: int = 0,
+        skip_folds: int | list[int] | None = None,
+        allow_incomplete_fold: bool = True,
+        return_all_indexes: bool = False,
+        verbose: bool = True,
+    ) -> None:
+
+        super().__init__(
+            steps=steps,
+            initial_train_size=initial_train_size,
+            fold_stride=fold_stride,
+            window_size=window_size,
+            differentiation=differentiation,
+            refit=refit,
+            fixed_train_size=fixed_train_size,
+            gap=gap,
+            skip_folds=skip_folds,
+            allow_incomplete_fold=allow_incomplete_fold,
+            return_all_indexes=return_all_indexes,
+            verbose=verbose,
+        )
+
+        self.steps = steps
+        self.fold_stride = fold_stride if fold_stride is not None else steps
+        self.overlapping_folds = self.fold_stride < self.steps
+        self.refit = refit
+        self.fixed_train_size = fixed_train_size
+        self.gap = gap
+        self.skip_folds = skip_folds
+        self.allow_incomplete_fold = allow_incomplete_fold
+
+    def __repr__(self) -> str:
+        """
+        Information displayed when printed.
+        """
+
+        info = (
+            f"{'=' * len(type(self).__name__)} \n"
+            f"{type(self).__name__} \n"
+            f"{'=' * len(type(self).__name__)} \n"
+            f"Initial train size = {self.initial_train_size},\n"
+            f"Steps = {self.steps},\n"
+            f"Fold stride = {self.fold_stride},\n"
+            f"Overlapping folds = {self.overlapping_folds},\n"
+            f"Window size = {self.window_size},\n"
+            f"Differentiation = {self.differentiation},\n"
+            f"Refit = {self.refit},\n"
+            f"Fixed train size = {self.fixed_train_size},\n"
+            f"Gap = {self.gap},\n"
+            f"Skip folds = {self.skip_folds},\n"
+            f"Allow incomplete fold = {self.allow_incomplete_fold},\n"
+            f"Return all indexes = {self.return_all_indexes},\n"
+            f"Verbose = {self.verbose}\n"
+        )
+
+        return info
+
+    def _repr_html_(self) -> str:
+        """
+        HTML representation of the object.
+        The "General Information" section is expanded by default.
+        """
+
+        style, unique_id = get_style_repr_html()
+        content = f"""
+        <div class="container-{unique_id}">
+            <p style="font-size: 1.5em; font-weight: bold; margin-block-start: 0.83em; margin-block-end: 0.83em;">{type(self).__name__}</p>
+            <details open>
+                <summary>General Information</summary>
+                <ul>
+                    <li><strong>Initial train size:</strong> {self.initial_train_size}</li>
+                    <li><strong>Steps:</strong> {self.steps}</li>
+                    <li><strong>Fold stride:</strong> {self.fold_stride}</li>
+                    <li><strong>Overlapping folds:</strong> {self.overlapping_folds}</li>
+                    <li><strong>Window size:</strong> {self.window_size}</li>
+                    <li><strong>Differentiation:</strong> {self.differentiation}</li>
+                    <li><strong>Refit:</strong> {self.refit}</li>
+                    <li><strong>Fixed train size:</strong> {self.fixed_train_size}</li>
+                    <li><strong>Gap:</strong> {self.gap}</li>
+                    <li><strong>Skip folds:</strong> {self.skip_folds}</li>
+                    <li><strong>Allow incomplete fold:</strong> {self.allow_incomplete_fold}</li>
+                    <li><strong>Return all indexes:</strong> {self.return_all_indexes}</li>
+                </ul>
+            </details>
+        </div>
+        """
+
+        return style + content
+
+    def split(
+        self,
+        X: pd.Series | pd.DataFrame | pd.Index | dict[str, pd.Series | pd.DataFrame],
+        as_pandas: bool = False,
+    ) -> list | pd.DataFrame:
+        """
+        Split the time series data into train and test folds.
+
+        Args:
+            X (pd.Series | pd.DataFrame | pd.Index | dict): Time series data or index to split.
+            as_pandas (bool, optional): If True, the folds are returned as a DataFrame.
+                This is useful to visualize the folds in a more interpretable way.
+                Defaults to False.
+
+        Returns:
+            list | pd.DataFrame: A list of lists containing the indices (position) for
+            each fold. Each list contains the fold number, 4 lists and a boolean with the following
+            information:
+
+            - fold: fold number.
+            - [train_start, train_end]: list with the start and end positions of the
+              training set.
+            - [last_window_start, last_window_end]: list with the start and end positions
+              of the last window seen by the forecaster during training. The last window
+              is used to generate the lags used as predictors. If `differentiation` is
+              included, the interval is extended as many observations as the
+              differentiation order. If the argument `window_size` is `None`, this list is
+              empty.
+            - [test_start, test_end]: list with the start and end positions of the test
+              set. These are the observations used to evaluate the forecaster.
+            - [test_start_with_gap, test_end_with_gap]: list with the start and end
+              positions of the test set including the gap. The gap is the number of
+              observations between the end of the training set and the start of the test
+              set.
+            - fit_forecaster: boolean indicating whether the forecaster should be fitted
+              in this fold.
+
+            It is important to note that the returned values are the positions of the
+            observations and not the actual values of the index, so they can be used to
+            slice the data directly using iloc.
+
+            If `as_pandas` is `True`, the folds are returned as a DataFrame with the
+            following columns: 'fold', 'train_start', 'train_end', 'last_window_start',
+            'last_window_end', 'test_start', 'test_end', 'test_start_with_gap',
+            'test_end_with_gap', 'fit_forecaster'.
+
+            Following the python convention, the start index is inclusive and the end
+            index is exclusive. This means that the last index is not included in the
+            slice.
+        """
+
+        if not isinstance(X, (pd.Series, pd.DataFrame, pd.Index, dict)):
+            raise TypeError(
+                f"X must be a pandas Series, DataFrame, Index or a dictionary. "
+                f"Got {type(X)}."
+            )
+
+        window_size_as_date_offset = isinstance(
+            self.window_size, pd.tseries.offsets.DateOffset
+        )
+        if window_size_as_date_offset:
+            # Calculate the window_size in steps. This is not an exact calculation
+            # because the offset follows the calendar rules and the distance between
+            # two dates may not be constant.
+            first_valid_index = X.index[-1] - self.window_size
+            try:
+                window_size_idx_start = X.index.get_loc(first_valid_index)
+                window_size_idx_end = X.index.get_loc(X.index[-1])
+                self.window_size = window_size_idx_end - window_size_idx_start
+            except KeyError:
+                raise ValueError(
f"The length of `y` ({len(X)}), must be greater than or equal "
|
|
296
|
+
f"to the window size ({self.window_size}). This is because "
|
|
297
|
+
f"the offset (forecaster.offset) is larger than the available "
|
|
298
|
+
f"data. Try to decrease the size of the offset (forecaster.offset), "
|
|
299
|
+
f"the number of `n_offsets` (forecaster.n_offsets) or increase the "
|
|
300
|
+
f"size of `y`."
|
|
301
|
+
)
|
|
302
|
+
|
|
303
|
+
if self.initial_train_size is None:
|
|
304
|
+
if self.window_size is None:
|
|
305
|
+
raise ValueError(
|
|
306
|
+
"To use split method when `initial_train_size` is None, "
|
|
307
|
+
"`window_size` must be an integer greater than 0. "
|
|
308
|
+
"Although no initial training is done and all data is used to "
|
|
309
|
+
"evaluate the model, the first `window_size` observations are "
|
|
310
|
+
"needed to create the initial predictors. Got `window_size` = None."
|
|
311
|
+
)
|
|
312
|
+
if self.refit:
|
|
313
|
+
raise ValueError(
|
|
314
|
+
"`refit` is only allowed when `initial_train_size` is not `None`. "
|
|
315
|
+
"Set `refit` to `False` if you want to use `initial_train_size = None`."
|
|
316
|
+
)
|
|
317
|
+
externally_fitted = True
|
|
318
|
+
self.initial_train_size = self.window_size # Reset to None later
|
|
319
|
+
else:
|
|
320
|
+
if self.window_size is None:
|
|
321
|
+
warnings.warn(
|
|
322
|
+
"Last window cannot be calculated because `window_size` is None.",
|
|
323
|
+
IgnoredArgumentWarning,
|
|
324
|
+
)
|
|
325
|
+
externally_fitted = False
|
|
326
|
+
|
|
327
|
+
index = self._extract_index(X)
|
|
328
|
+
idx = range(len(index))
|
|
329
|
+
folds = []
|
|
330
|
+
i = 0
|
|
331
|
+
|
|
332
|
+
self.initial_train_size = date_to_index_position(
|
|
333
|
+
index=index,
|
|
334
|
+
date_input=self.initial_train_size,
|
|
335
|
+
method="validation",
|
|
336
|
+
date_literal="initial_train_size",
|
|
337
|
+
)
|
|
338
|
+
|
|
339
|
+
if window_size_as_date_offset:
|
|
340
|
+
if self.initial_train_size is not None:
|
|
341
|
+
if self.initial_train_size < self.window_size:
|
|
342
|
+
raise ValueError(
|
|
343
|
+
f"If `initial_train_size` is an integer, it must be greater than "
|
|
344
|
+
f"the `window_size` of the forecaster ({self.window_size}) "
|
|
345
|
+
f"and smaller than the length of the series ({len(X)}). If "
|
|
346
|
+
f"it is a date, it must be within this range of the index."
|
|
347
|
+
)
|
|
348
|
+
|
|
349
|
+
if self.allow_incomplete_fold:
|
|
350
|
+
# At least one observation after the gap to allow incomplete fold
|
|
351
|
+
if len(index) <= self.initial_train_size + self.gap:
|
|
352
|
+
raise ValueError(
|
|
353
|
+
f"The time series must have more than `initial_train_size + gap` "
|
|
354
|
+
f"observations to create at least one fold.\n"
|
|
355
|
+
f" Time series length: {len(index)}\n"
|
|
356
|
+
f" Required > {self.initial_train_size + self.gap}\n"
|
|
357
|
+
f" initial_train_size: {self.initial_train_size}\n"
|
|
358
|
+
f" gap: {self.gap}\n"
|
|
359
|
+
)
|
|
360
|
+
else:
|
|
361
|
+
# At least one complete fold
|
|
362
|
+
if len(index) < self.initial_train_size + self.gap + self.steps:
|
|
363
|
+
raise ValueError(
|
|
364
|
+
f"The time series must have at least `initial_train_size + gap + steps` "
|
|
365
|
+
f"observations to create a minimum of one complete fold "
|
|
366
|
+
f"(allow_incomplete_fold=False).\n"
|
|
367
|
+
f" Time series length: {len(index)}\n"
|
|
368
|
+
f" Required >= {self.initial_train_size + self.gap + self.steps}\n"
|
|
369
|
+
f" initial_train_size: {self.initial_train_size}\n"
|
|
370
|
+
f" gap: {self.gap}\n"
|
|
371
|
+
f" steps: {self.steps}\n"
|
|
372
|
+
)
|
|
373
|
+
|
|
374
|
+
while self.initial_train_size + (i * self.fold_stride) + self.gap < len(index):
|
|
375
|
+
|
|
376
|
+
if self.refit:
|
|
377
|
+
# NOTE: If `fixed_train_size` the train size doesn't increase but
|
|
378
|
+
# moves by `fold_stride` positions in each iteration. If `False`,
|
|
379
|
+
# the train size increases by `fold_stride` in each iteration.
|
|
380
|
+
train_iloc_start = (
|
|
381
|
+
i * (self.fold_stride) if self.fixed_train_size else 0
|
|
382
|
+
)
|
|
383
|
+
train_iloc_end = self.initial_train_size + i * (self.fold_stride)
|
|
384
|
+
test_iloc_start = train_iloc_end
|
|
385
|
+
else:
|
|
386
|
+
# NOTE: The train size doesn't increase and doesn't move.
|
|
387
|
+
train_iloc_start = 0
|
|
388
|
+
train_iloc_end = self.initial_train_size
|
|
389
|
+
test_iloc_start = self.initial_train_size + i * (self.fold_stride)
|
|
390
|
+
|
|
391
|
+
if self.window_size is not None:
|
|
392
|
+
last_window_iloc_start = test_iloc_start - self.window_size
|
|
393
|
+
|
|
394
|
+
test_iloc_end = test_iloc_start + self.gap + self.steps
|
|
395
|
+
|
|
396
|
+
partitions = [
|
|
397
|
+
idx[train_iloc_start:train_iloc_end],
|
|
398
|
+
(
|
|
399
|
+
idx[last_window_iloc_start:test_iloc_start]
|
|
400
|
+
if self.window_size is not None
|
|
401
|
+
else []
|
|
402
|
+
),
|
|
403
|
+
idx[test_iloc_start:test_iloc_end],
|
|
404
|
+
idx[test_iloc_start + self.gap : test_iloc_end],
|
|
405
|
+
]
|
|
406
|
+
folds.append(partitions)
|
|
407
|
+
i += 1
|
|
408
|
+
|
|
409
|
+
# NOTE: Delete all incomplete folds at the end if not allowed
|
|
410
|
+
n_removed_folds = 0
|
|
411
|
+
if not self.allow_incomplete_fold:
|
|
412
|
+
# NOTE: While folds and the last "test_index_with_gap" is incomplete,
|
|
413
|
+
# calculating len of range objects
|
|
414
|
+
while folds and len(folds[-1][3]) < self.steps:
|
|
415
|
+
folds.pop()
|
|
416
|
+
n_removed_folds += 1
|
|
417
|
+
|
|
418
|
+
# Replace partitions inside folds with length 0 with `None`
|
|
419
|
+
folds = [
|
|
420
|
+
[partition if len(partition) > 0 else None for partition in fold]
|
|
421
|
+
for fold in folds
|
|
422
|
+
]
|
|
423
|
+
|
|
424
|
+
# Create a flag to know whether to train the forecaster
|
|
425
|
+
if self.refit == 0:
|
|
426
|
+
self.refit = False
|
|
427
|
+
|
|
428
|
+
if isinstance(self.refit, bool):
|
|
429
|
+
fit_forecaster = [self.refit] * len(folds)
|
|
430
|
+
fit_forecaster[0] = True
|
|
431
|
+
else:
|
|
432
|
+
fit_forecaster = [False] * len(folds)
|
|
433
|
+
for i in range(0, len(fit_forecaster), self.refit):
|
|
434
|
+
fit_forecaster[i] = True
|
|
435
|
+
|
|
436
|
+
for i in range(len(folds)):
|
|
437
|
+
folds[i].insert(0, i)
|
|
438
|
+
folds[i].append(fit_forecaster[i])
|
|
439
|
+
if fit_forecaster[i] is False:
|
|
440
|
+
folds[i][1] = folds[i - 1][1]
|
|
441
|
+
|
|
442
|
+
index_to_skip = []
|
|
443
|
+
if self.skip_folds is not None:
|
|
444
|
+
if isinstance(self.skip_folds, (int, np.integer)) and self.skip_folds > 0:
|
|
445
|
+
index_to_keep = np.arange(0, len(folds), self.skip_folds)
|
|
446
|
+
index_to_skip = np.setdiff1d(
|
|
447
|
+
np.arange(0, len(folds)), index_to_keep, assume_unique=True
|
|
448
|
+
)
|
|
449
|
+
index_to_skip = [
|
|
450
|
+
int(x) for x in index_to_skip
|
|
451
|
+
] # Required since numpy 2.0
|
|
452
|
+
if isinstance(self.skip_folds, list):
|
|
453
|
+
index_to_skip = [i for i in self.skip_folds if i < len(folds)]
|
|
454
|
+
|
|
455
|
+
if self.verbose:
|
|
456
|
+
self._print_info(
|
|
457
|
+
index=index,
|
|
458
|
+
folds=folds,
|
|
459
|
+
externally_fitted=externally_fitted,
|
|
460
|
+
n_removed_folds=n_removed_folds,
|
|
461
|
+
index_to_skip=index_to_skip,
|
|
462
|
+
)
|
|
463
|
+
|
|
464
|
+
folds = [fold for i, fold in enumerate(folds) if i not in index_to_skip]
|
|
465
|
+
if not self.return_all_indexes:
|
|
466
|
+
# NOTE: +1 to prevent iloc pandas from deleting the last observation
|
|
467
|
+
folds = [
|
|
468
|
+
[
|
|
469
|
+
fold[0],
|
|
470
|
+
[fold[1][0], fold[1][-1] + 1],
|
|
471
|
+
(
|
|
472
|
+
[fold[2][0], fold[2][-1] + 1]
|
|
473
|
+
if self.window_size is not None
|
|
474
|
+
else []
|
|
475
|
+
),
|
|
476
|
+
[fold[3][0], fold[3][-1] + 1],
|
|
477
|
+
[fold[4][0], fold[4][-1] + 1],
|
|
478
|
+
fold[5],
|
|
479
|
+
]
|
|
480
|
+
for fold in folds
|
|
481
|
+
]
|
|
482
|
+
|
|
483
|
+
if externally_fitted:
|
|
484
|
+
self.initial_train_size = None
|
|
485
|
+
folds[0][5] = False
|
|
486
|
+
|
|
487
|
+
if as_pandas:
|
|
488
|
+
if self.window_size is None:
|
|
489
|
+
for fold in folds:
|
|
490
|
+
fold[2] = [None, None]
|
|
491
|
+
|
|
492
|
+
if not self.return_all_indexes:
|
|
493
|
+
folds = pd.DataFrame(
|
|
494
|
+
data=[
|
|
495
|
+
[fold[0]] + list(itertools.chain(*fold[1:-1])) + [fold[-1]]
|
|
496
|
+
for fold in folds
|
|
497
|
+
],
|
|
498
|
+
columns=[
|
|
499
|
+
"fold",
|
|
500
|
+
"train_start",
|
|
501
|
+
"train_end",
|
|
502
|
+
"last_window_start",
|
|
503
|
+
"last_window_end",
|
|
504
|
+
"test_start",
|
|
505
|
+
"test_end",
|
|
506
|
+
"test_start_with_gap",
|
|
507
|
+
"test_end_with_gap",
|
|
508
|
+
"fit_forecaster",
|
|
509
|
+
],
|
|
510
|
+
)
|
|
511
|
+
else:
|
|
512
|
+
folds = pd.DataFrame(
|
|
513
|
+
data=folds,
|
|
514
|
+
columns=[
|
|
515
|
+
"fold",
|
|
516
|
+
"train_index",
|
|
517
|
+
"last_window_index",
|
|
518
|
+
"test_index",
|
|
519
|
+
"test_index_with_gap",
|
|
520
|
+
"fit_forecaster",
|
|
521
|
+
],
|
|
522
|
+
)
|
|
523
|
+
|
|
524
|
+
return folds
|
|
525
|
+
|
|
526
|
+
def _print_info(
|
|
527
|
+
self,
|
|
528
|
+
index: pd.Index,
|
|
529
|
+
folds: list[list[int]],
|
|
530
|
+
externally_fitted: bool,
|
|
531
|
+
n_removed_folds: int,
|
|
532
|
+
index_to_skip: list[int],
|
|
533
|
+
) -> None:
|
|
534
|
+
"""
|
|
535
|
+
Print information about folds.
|
|
536
|
+
|
|
537
|
+
Args:
|
|
538
|
+
index (pd.Index): Index of the time series data.
|
|
539
|
+
folds (list): A list of lists containing the indices (position) for each fold.
|
|
540
|
+
externally_fitted (bool): Whether an already trained forecaster is to be used.
|
|
541
|
+
n_removed_folds (int): Number of folds removed.
|
|
542
|
+
+            index_to_skip (list): Indexes of the folds to skip.
"""
|
|
544
|
+
|
|
545
|
+
print("Information of folds")
|
|
546
|
+
print("--------------------")
|
|
547
|
+
if externally_fitted:
|
|
548
|
+
print(
|
|
549
|
+
f"An already trained forecaster is to be used. Window size: "
|
|
550
|
+
f"{self.window_size}"
|
|
551
|
+
)
|
|
552
|
+
else:
|
|
553
|
+
if self.differentiation is None:
|
|
554
|
+
print(
|
|
555
|
+
f"Number of observations used for initial training: "
|
|
556
|
+
f"{self.initial_train_size}"
|
|
557
|
+
)
|
|
558
|
+
else:
|
|
559
|
+
print(
|
|
560
|
+
f"Number of observations used for initial training: "
|
|
561
|
+
f"{self.initial_train_size - self.differentiation}"
|
|
562
|
+
)
|
|
563
|
+
print(
|
|
564
|
+
f" First {self.differentiation} observation/s in training sets "
|
|
565
|
+
f"are used for differentiation"
|
|
566
|
+
)
|
|
567
|
+
print(
|
|
568
|
+
f"Number of observations used for backtesting: "
|
|
569
|
+
f"{len(index) - self.initial_train_size}"
|
|
570
|
+
)
|
|
571
|
+
print(f" Number of folds: {len(folds)}")
|
|
572
|
+
print(
|
|
573
|
+
f" Number skipped folds: "
|
|
574
|
+
f"{len(index_to_skip)} {index_to_skip if index_to_skip else ''}"
|
|
575
|
+
)
|
|
576
|
+
print(f" Number of steps per fold: {self.steps}")
|
|
577
|
+
if self.steps != self.fold_stride:
|
|
578
|
+
print(
|
|
579
|
+
f" Number of steps to the next fold (fold stride): {self.fold_stride}"
|
|
580
|
+
)
|
|
581
|
+
print(
|
|
582
|
+
f" Number of steps to exclude between last observed data "
|
|
583
|
+
f"(last window) and predictions (gap): {self.gap}"
|
|
584
|
+
)
|
|
585
|
+
if n_removed_folds > 0:
|
|
586
|
+
print(
|
|
587
|
+
f" The last {n_removed_folds} fold(s) have been excluded "
|
|
588
|
+
f"because they were incomplete."
|
|
589
|
+
)
|
|
590
|
+
|
|
591
|
+
if len(folds[-1][4]) < self.steps:
|
|
592
|
+
print(f" Last fold only includes {len(folds[-1][4])} observations.")
|
|
593
|
+
|
|
594
|
+
print("")
|
|
595
|
+
|
|
596
|
+
if self.differentiation is None:
|
|
597
|
+
differentiation = 0
|
|
598
|
+
else:
|
|
599
|
+
differentiation = self.differentiation
|
|
600
|
+
|
|
601
|
+
for i, fold in enumerate(folds):
|
|
602
|
+
is_fold_skipped = i in index_to_skip
|
|
603
|
+
has_training = fold[-1] if i != 0 else True
|
|
604
|
+
training_start = (
|
|
605
|
+
index[fold[1][0] + differentiation] if fold[1] is not None else None
|
|
606
|
+
)
|
|
607
|
+
training_end = index[fold[1][-1]] if fold[1] is not None else None
|
|
608
|
+
training_length = (
|
|
609
|
+
len(fold[1]) - differentiation if fold[1] is not None else 0
|
|
610
|
+
)
|
|
611
|
+
validation_start = index[fold[4][0]]
|
|
612
|
+
validation_end = index[fold[4][-1]]
|
|
613
|
+
validation_length = len(fold[4])
|
|
614
|
+
|
|
615
|
+
print(f"Fold: {i}")
|
|
616
|
+
if is_fold_skipped:
|
|
617
|
+
print(" Fold skipped")
|
|
618
|
+
elif not externally_fitted and has_training:
|
|
619
|
+
print(
|
|
620
|
+
f" Training: {training_start} -- {training_end} "
|
|
621
|
+
f"(n={training_length})"
|
|
622
|
+
)
|
|
623
|
+
print(
|
|
624
|
+
f" Validation: {validation_start} -- {validation_end} "
|
|
625
|
+
f"(n={validation_length})"
|
|
626
|
+
)
|
|
627
|
+
else:
|
|
628
|
+
print(" Training: No training in this fold")
|
|
629
|
+
print(
|
|
630
|
+
f" Validation: {validation_start} -- {validation_end} "
|
|
631
|
+
f"(n={validation_length})"
|
|
632
|
+
)
|
|
633
|
+
|
|
634
|
+
print("")
|