spotforecast2-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spotforecast2/.DS_Store +0 -0
- spotforecast2/__init__.py +2 -0
- spotforecast2/data/__init__.py +0 -0
- spotforecast2/data/data.py +130 -0
- spotforecast2/data/fetch_data.py +209 -0
- spotforecast2/exceptions.py +681 -0
- spotforecast2/forecaster/.DS_Store +0 -0
- spotforecast2/forecaster/__init__.py +7 -0
- spotforecast2/forecaster/base.py +448 -0
- spotforecast2/forecaster/metrics.py +527 -0
- spotforecast2/forecaster/recursive/__init__.py +4 -0
- spotforecast2/forecaster/recursive/_forecaster_equivalent_date.py +1075 -0
- spotforecast2/forecaster/recursive/_forecaster_recursive.py +939 -0
- spotforecast2/forecaster/recursive/_warnings.py +15 -0
- spotforecast2/forecaster/utils.py +954 -0
- spotforecast2/model_selection/__init__.py +5 -0
- spotforecast2/model_selection/bayesian_search.py +453 -0
- spotforecast2/model_selection/grid_search.py +314 -0
- spotforecast2/model_selection/random_search.py +151 -0
- spotforecast2/model_selection/split_base.py +357 -0
- spotforecast2/model_selection/split_one_step.py +245 -0
- spotforecast2/model_selection/split_ts_cv.py +634 -0
- spotforecast2/model_selection/utils_common.py +718 -0
- spotforecast2/model_selection/utils_metrics.py +103 -0
- spotforecast2/model_selection/validation.py +685 -0
- spotforecast2/preprocessing/__init__.py +30 -0
- spotforecast2/preprocessing/_binner.py +378 -0
- spotforecast2/preprocessing/_common.py +123 -0
- spotforecast2/preprocessing/_differentiator.py +123 -0
- spotforecast2/preprocessing/_rolling.py +136 -0
- spotforecast2/preprocessing/curate_data.py +254 -0
- spotforecast2/preprocessing/imputation.py +92 -0
- spotforecast2/preprocessing/outlier.py +114 -0
- spotforecast2/preprocessing/split.py +139 -0
- spotforecast2/py.typed +0 -0
- spotforecast2/utils/__init__.py +43 -0
- spotforecast2/utils/convert_to_utc.py +44 -0
- spotforecast2/utils/data_transform.py +208 -0
- spotforecast2/utils/forecaster_config.py +344 -0
- spotforecast2/utils/generate_holiday.py +70 -0
- spotforecast2/utils/validation.py +569 -0
- spotforecast2/weather/__init__.py +0 -0
- spotforecast2/weather/weather_client.py +288 -0
- spotforecast2-0.0.1.dist-info/METADATA +47 -0
- spotforecast2-0.0.1.dist-info/RECORD +46 -0
- spotforecast2-0.0.1.dist-info/WHEEL +4 -0
spotforecast2/model_selection/split_base.py
@@ -0,0 +1,357 @@
"""
Base class for time series cross-validation splitting.
"""

from __future__ import annotations
import warnings
import numpy as np
import pandas as pd
from spotforecast2.exceptions import IgnoredArgumentWarning


class BaseFold:
    """
    Base class for all Fold classes in spotforecast. All fold classes should specify
    all the parameters that can be set at the class level in their ``__init__``.

    Args:
        steps (int, optional): Number of observations used to be predicted in each fold.
            This is also commonly referred to as the forecast horizon or test size.
            Defaults to None.
        initial_train_size (int | str | pd.Timestamp, optional): Number of observations
            used for initial training.

            - If an integer, the number of observations used for initial training.
            - If a date string or pandas Timestamp, it is the last date included in
              the initial training set.
            Defaults to None.
        fold_stride (int, optional): Number of observations that the start of the test
            set advances between consecutive folds.

            - If `None`, it defaults to the same value as `steps`, meaning that folds
              are placed back-to-back without overlap.
            - If `fold_stride < steps`, test sets overlap and multiple forecasts will
              be generated for the same observations.
            - If `fold_stride > steps`, gaps are left between consecutive test sets.
            Defaults to None.
        window_size (int, optional): Number of observations needed to generate the
            autoregressive predictors. Defaults to None.
        differentiation (int, optional): Number of observations to use for differentiation.
            This is used to extend the `last_window` as many observations as the
            differentiation order. Defaults to None.
        refit (bool | int, optional): Whether to refit the forecaster in each fold.

            - If `True`, the forecaster is refitted in each fold.
            - If `False`, the forecaster is trained only in the first fold.
            - If an integer, the forecaster is trained in the first fold and then refitted
              every `refit` folds.
            Defaults to False.
        fixed_train_size (bool, optional): Whether the training size is fixed or increases
            in each fold. Defaults to True.
        gap (int, optional): Number of observations between the end of the training set
            and the start of the test set. Defaults to 0.
        skip_folds (int | list, optional): Number of folds to skip.

            - If an integer, every 'skip_folds'-th fold is returned.
            - If a list, the indexes of the folds to skip.

            For example, if `skip_folds=3` and there are 10 folds, the returned folds are
            0, 3, 6, and 9. If `skip_folds=[1, 2, 3]`, the returned folds are 0, 4, 5, 6, 7,
            8, and 9. Defaults to None.
        allow_incomplete_fold (bool, optional): Whether to allow the last fold to include
            fewer observations than `steps`. If `False`, the last fold is excluded if it
            is incomplete. Defaults to True.
        return_all_indexes (bool, optional): Whether to return all indexes or only the
            start and end indexes of each fold. Defaults to False.
        verbose (bool, optional): Whether to print information about generated folds.
            Defaults to True.

    Attributes:
        initial_train_size (int): Number of observations used for initial training.
        window_size (int): Number of observations needed to generate the
            autoregressive predictors.
        differentiation (int): Number of observations to use for differentiation.
            This is used to extend the `last_window` as many observations as the
            differentiation order.
        return_all_indexes (bool): Whether to return all indexes or only the start
            and end indexes of each fold.
        verbose (bool): Whether to print information about generated folds.
    """

    def __init__(
        self,
        steps: int | None = None,
        initial_train_size: int | str | pd.Timestamp | None = None,
        fold_stride: int | None = None,
        window_size: int | None = None,
        differentiation: int | None = None,
        refit: bool | int = False,
        fixed_train_size: bool = True,
        gap: int = 0,
        skip_folds: int | list[int] | None = None,
        allow_incomplete_fold: bool = True,
        return_all_indexes: bool = False,
        verbose: bool = True,
    ) -> None:

        self._validate_params(
            cv_name=type(self).__name__,
            steps=steps,
            initial_train_size=initial_train_size,
            fold_stride=fold_stride,
            window_size=window_size,
            differentiation=differentiation,
            refit=refit,
            fixed_train_size=fixed_train_size,
            gap=gap,
            skip_folds=skip_folds,
            allow_incomplete_fold=allow_incomplete_fold,
            return_all_indexes=return_all_indexes,
            verbose=verbose,
        )

        self.initial_train_size = initial_train_size
        self.window_size = window_size
        self.differentiation = differentiation
        self.return_all_indexes = return_all_indexes
        self.verbose = verbose

    def _validate_params(
        self,
        cv_name: str,
        steps: int | None = None,
        initial_train_size: int | str | pd.Timestamp | None = None,
        fold_stride: int | None = None,
        window_size: int | None = None,
        differentiation: int | None = None,
        refit: bool | int = False,
        fixed_train_size: bool = True,
        gap: int = 0,
        skip_folds: int | list[int] | None = None,
        allow_incomplete_fold: bool = True,
        return_all_indexes: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> None:
        """
        Validate all input parameters to ensure correctness.
        """

        if cv_name == "TimeSeriesFold":
            if not isinstance(steps, (int, np.integer)) or steps < 1:
                raise ValueError(
                    f"`steps` must be an integer greater than 0. Got {steps}."
                )
            if not isinstance(
                initial_train_size, (int, np.integer, str, pd.Timestamp, type(None))
            ):
                raise ValueError(
                    f"`initial_train_size` must be an integer greater than 0, a date "
                    f"string, a pandas Timestamp, or None. Got {initial_train_size}."
                )
            if (
                isinstance(initial_train_size, (int, np.integer))
                and initial_train_size < 1
            ):
                raise ValueError(
                    f"`initial_train_size` must be an integer greater than 0, "
                    f"a date string, a pandas Timestamp, or None. Got {initial_train_size}."
                )
            if fold_stride is not None:
                if not isinstance(fold_stride, (int, np.integer)) or fold_stride < 1:
                    raise ValueError(
                        f"`fold_stride` must be an integer greater than 0. Got {fold_stride}."
                    )
            if not isinstance(refit, (bool, int, np.integer)):
                raise TypeError(
                    f"`refit` must be a boolean or an integer equal or greater than 0. "
                    f"Got {refit}."
                )
            if (
                isinstance(refit, (int, np.integer))
                and not isinstance(refit, bool)
                and refit < 0
            ):
                raise TypeError(
                    f"`refit` must be a boolean or an integer equal or greater than 0. "
                    f"Got {refit}."
                )
            if not isinstance(fixed_train_size, bool):
                raise TypeError(
                    f"`fixed_train_size` must be a boolean: `True`, `False`. "
                    f"Got {fixed_train_size}."
                )
            if not isinstance(gap, (int, np.integer)) or gap < 0:
                raise ValueError(
                    f"`gap` must be an integer greater than or equal to 0. Got {gap}."
                )
            if skip_folds is not None:
                if not isinstance(skip_folds, (int, np.integer, list, type(None))):
                    raise TypeError(
                        f"`skip_folds` must be an integer greater than 0, a list of "
                        f"integers or `None`. Got {skip_folds}."
                    )
                if isinstance(skip_folds, (int, np.integer)) and skip_folds < 1:
                    raise ValueError(
                        f"`skip_folds` must be an integer greater than 0, a list of "
                        f"integers or `None`. Got {skip_folds}."
                    )
                if isinstance(skip_folds, list) and any([x < 1 for x in skip_folds]):
                    raise ValueError(
                        f"`skip_folds` list must contain integers greater than or "
                        f"equal to 1. The first fold is always needed to train the "
                        f"forecaster. Got {skip_folds}."
                    )
            if not isinstance(allow_incomplete_fold, bool):
                raise TypeError(
                    f"`allow_incomplete_fold` must be a boolean: `True`, `False`. "
                    f"Got {allow_incomplete_fold}."
                )

        if cv_name == "OneStepAheadFold":
            if not isinstance(initial_train_size, (int, np.integer, str, pd.Timestamp)):
                raise ValueError(
                    f"`initial_train_size` must be an integer greater than 0, a date "
                    f"string, or a pandas Timestamp. Got {initial_train_size}."
                )
            if (
                isinstance(initial_train_size, (int, np.integer))
                and initial_train_size < 1
            ):
                raise ValueError(
                    f"`initial_train_size` must be an integer greater than 0, "
                    f"a date string, or a pandas Timestamp. Got {initial_train_size}."
                )

        if (
            not isinstance(window_size, (int, np.integer, pd.DateOffset, type(None)))
            or isinstance(window_size, (int, np.integer))
            and window_size < 1
        ):
            raise ValueError(
                f"`window_size` must be an integer greater than 0. Got {window_size}."
            )

        if differentiation is not None:
            if (
                not isinstance(differentiation, (int, np.integer))
                or differentiation < 0
            ):
                raise ValueError(
                    f"`differentiation` must be None or an integer greater than or "
                    f"equal to 0. Got {differentiation}."
                )

        if not isinstance(return_all_indexes, bool):
            raise TypeError(
                f"`return_all_indexes` must be a boolean: `True`, `False`. "
                f"Got {return_all_indexes}."
            )

        if not isinstance(verbose, bool):
            raise TypeError(
                f"`verbose` must be a boolean: `True`, `False`. " f"Got {verbose}."
            )

    def _extract_index(
        self,
        X: pd.Series | pd.DataFrame | pd.Index | dict[str, pd.Series | pd.DataFrame],
    ) -> pd.Index:
        """
        Extracts and returns the index from the input data X.

        Args:
            X (pd.Series | pd.DataFrame | pd.Index | dict): Time series data or
                index to split.

        Returns:
            pd.Index: Index extracted from the input data.
        """

        if isinstance(X, (pd.Series, pd.DataFrame)):
            idx = X.index
        elif isinstance(X, dict):
            indexes_freq = set()
            not_valid_index = []
            min_index = []
            max_index = []
            for k, v in X.items():
                if v is None:
                    continue

                idx = v.index
                if isinstance(idx, pd.DatetimeIndex):
                    indexes_freq.add(idx.freq)
                elif isinstance(idx, pd.RangeIndex):
                    indexes_freq.add(idx.step)
                else:
                    not_valid_index.append(k)

                min_index.append(idx[0])
                max_index.append(idx[-1])

            if not_valid_index:
                raise TypeError(
                    f"If `X` is a dictionary, all series must have a Pandas "
                    f"RangeIndex or DatetimeIndex with the same step/frequency. "
                    f"Review series: {not_valid_index}"
                )

            if None in indexes_freq:
                raise ValueError(
                    "If `X` is a dictionary, all series must have a Pandas "
                    "RangeIndex or DatetimeIndex with the same step/frequency. "
                    "Found series with no frequency or step."
                )
            if not len(indexes_freq) == 1:
                raise ValueError(
                    f"If `X` is a dictionary, all series must have a Pandas "
                    f"RangeIndex or DatetimeIndex with the same step/frequency. "
                    f"Found frequencies: {sorted(indexes_freq)}"
                )

            if isinstance(idx, pd.DatetimeIndex):
                idx = pd.date_range(
                    start=min(min_index), end=max(max_index), freq=indexes_freq.pop()
                )
            else:
                idx = pd.RangeIndex(
                    start=min(min_index),
                    stop=max(max_index) + 1,
                    step=indexes_freq.pop(),
                )
        else:
            idx = X

        return idx

    def set_params(self, params: dict) -> None:
        """
        Set the parameters of the Fold object. Before overwriting the current
        parameters, the input parameters are validated to ensure correctness.

        Args:
            params (dict): Dictionary with the parameters to set.
        """

        if not isinstance(params, dict):
            raise TypeError(f"`params` must be a dictionary. Got {type(params)}.")

        current_params = dict(vars(self))
        unknown_params = set(params.keys()) - set(current_params.keys())
        if unknown_params:
            warnings.warn(
                f"Unknown parameters: {unknown_params}. They have been ignored.",
                IgnoredArgumentWarning,
            )

        filtered_params = {k: v for k, v in params.items() if k in current_params}
        updated_params = {
            "cv_name": type(self).__name__,
            **current_params,
            **filtered_params,
        }

        self._validate_params(**updated_params)
        for key, value in updated_params.items():
            setattr(self, key, value)
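For orientation, here is a minimal usage sketch of the validation, `set_params`, and `_extract_index` behaviour defined above. It is an illustration only, not code shipped in the wheel: it assumes `BaseFold` can be instantiated directly (its subclasses are the intended entry points) and imports it from the module path shown in the file listing.

# Illustrative sketch, not part of spotforecast2.
import warnings
import pandas as pd
from spotforecast2.model_selection.split_base import BaseFold

cv = BaseFold(window_size=24, verbose=False)

# Unknown keys are dropped with an IgnoredArgumentWarning; known keys are
# re-validated through _validate_params before being set.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    cv.set_params({"window_size": 48, "not_a_parameter": 1})
print(cv.window_size)  # 48
print(len(caught))     # 1, the warning about 'not_a_parameter'

# _extract_index merges a dict of series into one index spanning the earliest
# start to the latest end, provided all frequencies/steps agree.
series = {
    "a": pd.Series(range(5), index=pd.date_range("2024-01-01", periods=5, freq="D")),
    "b": pd.Series(range(3), index=pd.date_range("2024-01-04", periods=3, freq="D")),
}
print(cv._extract_index(series))  # DatetimeIndex 2024-01-01 through 2024-01-06, freq="D"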
spotforecast2/model_selection/split_one_step.py
@@ -0,0 +1,245 @@
"""
One step ahead cross-validation splitting.
"""

from __future__ import annotations
from typing import Any
import itertools
import pandas as pd

from spotforecast2.forecaster.utils import date_to_index_position, get_style_repr_html
from .split_base import BaseFold


class OneStepAheadFold(BaseFold):
    """
    Class to split time series data into train and test folds for one-step-ahead
    forecasting.

    Args:
        initial_train_size (int | str | pd.Timestamp): Number of observations used
            for initial training.

            - If an integer, the number of observations used for initial training.
            - If a date string or pandas Timestamp, it is the last date included in
              the initial training set.
        window_size (int, optional): Number of observations needed to generate the
            autoregressive predictors. Defaults to None.
        differentiation (int, optional): Number of observations to use for differentiation.
            This is used to extend the `last_window` as many observations as the
            differentiation order. Defaults to None.
        return_all_indexes (bool, optional): Whether to return all indexes or only the
            start and end indexes of each fold. Defaults to False.
        verbose (bool, optional): Whether to print information about generated folds.
            Defaults to True.

    Attributes:
        initial_train_size (int): Number of observations used for initial training.
        window_size (int): Number of observations needed to generate the
            autoregressive predictors.
        differentiation (int): Number of observations to use for differentiation.
            This is used to extend the `last_window` as many observations as the
            differentiation order.
        return_all_indexes (bool): Whether to return all indexes or only the start
            and end indexes of each fold.
        verbose (bool): Whether to print information about generated folds.
    """

    def __init__(
        self,
        initial_train_size: int | str | pd.Timestamp,
        window_size: int | None = None,
        differentiation: int | None = None,
        return_all_indexes: bool = False,
        verbose: bool = True,
    ) -> None:

        super().__init__(
            initial_train_size=initial_train_size,
            window_size=window_size,
            differentiation=differentiation,
            return_all_indexes=return_all_indexes,
            verbose=verbose,
        )

    def __repr__(self) -> str:
        """
        Information displayed when printed.
        """

        info = (
            f"{'=' * len(type(self).__name__)} \n"
            f"{type(self).__name__} \n"
            f"{'=' * len(type(self).__name__)} \n"
            f"Initial train size = {self.initial_train_size},\n"
            f"Window size = {self.window_size},\n"
            f"Differentiation = {self.differentiation},\n"
            f"Return all indexes = {self.return_all_indexes},\n"
            f"Verbose = {self.verbose}\n"
        )

        return info

    def _repr_html_(self) -> str:
        """
        HTML representation of the object.
        The "General Information" section is expanded by default.
        """

        style, unique_id = get_style_repr_html()
        content = f"""
        <div class="container-{unique_id}">
            <p style="font-size: 1.5em; font-weight: bold; margin-block-start: 0.83em; margin-block-end: 0.83em;">{type(self).__name__}</p>
            <details open>
                <summary>General Information</summary>
                <ul>
                    <li><strong>Initial train size:</strong> {self.initial_train_size}</li>
                    <li><strong>Window size:</strong> {self.window_size}</li>
                    <li><strong>Differentiation:</strong> {self.differentiation}</li>
                    <li><strong>Return all indexes:</strong> {self.return_all_indexes}</li>
                </ul>
            </details>
        </div>
        """

        return style + content

    def split(
        self,
        X: pd.Series | pd.DataFrame | pd.Index | dict[str, pd.Series | pd.DataFrame],
        as_pandas: bool = False,
        externally_fitted: Any = None,
    ) -> list | pd.DataFrame:
        """
        Split the time series data into train and test folds.

        Args:
            X (pd.Series | pd.DataFrame | pd.Index | dict): Time series data or index to split.
            as_pandas (bool, optional): If True, the folds are returned as a DataFrame.
                This is useful to visualize the folds in a more interpretable way.
                Defaults to False.
            externally_fitted (Any, optional): This argument is not used in this class.
                It is included for API consistency. Defaults to None.

        Returns:
            list | pd.DataFrame: A list of lists containing the indices (position) of
            the fold. The list contains the following information:

            - fold: fold number.
            - [train_start, train_end]: list with the start and end positions of the
              training set.
            - [test_start, test_end]: list with the start and end positions of the test
              set. These are the observations used to evaluate the forecaster.
            - fit_forecaster: boolean indicating whether the forecaster should be fitted
              in this fold.

            It is important to note that the returned values are the positions of the
            observations and not the actual values of the index, so they can be used to
            slice the data directly using iloc.

            If `as_pandas` is `True`, the folds are returned as a DataFrame with the
            following columns: 'fold', 'train_start', 'train_end', 'test_start',
            'test_end', 'fit_forecaster'.

            Following the python convention, the start index is inclusive and the end
            index is exclusive. This means that the last index is not included in the
            slice.
        """

        if not isinstance(X, (pd.Series, pd.DataFrame, pd.Index, dict)):
            raise TypeError(
                f"X must be a pandas Series, DataFrame, Index or a dictionary. "
                f"Got {type(X)}."
            )

        index = self._extract_index(X)

        self.initial_train_size = date_to_index_position(
            index=index,
            date_input=self.initial_train_size,
            method="validation",
            date_literal="initial_train_size",
        )

        fold = [
            0,
            [0, self.initial_train_size - 1],
            [self.initial_train_size, len(X)],
            True,
        ]

        if self.verbose:
            self._print_info(index=index, fold=fold)

        # NOTE: +1 to prevent iloc pandas from deleting the last observation
        if self.return_all_indexes:
            fold = [
                fold[0],
                [range(fold[1][0], fold[1][1] + 1)],
                [range(fold[2][0], fold[2][1])],
                fold[3],
            ]
        else:
            fold = [
                fold[0],
                [fold[1][0], fold[1][1] + 1],
                [fold[2][0], fold[2][1]],
                fold[3],
            ]

        if as_pandas:
            if not self.return_all_indexes:
                fold = pd.DataFrame(
                    data=[[fold[0]] + list(itertools.chain(*fold[1:-1])) + [fold[-1]]],
                    columns=[
                        "fold",
                        "train_start",
                        "train_end",
                        "test_start",
                        "test_end",
                        "fit_forecaster",
                    ],
                )
            else:
                fold = pd.DataFrame(
                    data=[fold],
                    columns=["fold", "train_index", "test_index", "fit_forecaster"],
                )

        return fold

    def _print_info(self, index: pd.Index, fold: list[list[int]]) -> None:
        """
        Print information about folds.

        Args:
            index (pd.Index): Index of the time series data.
            fold (list): A list of lists containing the indices (position) of the fold.
        """

        if self.differentiation is None:
            differentiation = 0
        else:
            differentiation = self.differentiation

        initial_train_size = self.initial_train_size - differentiation
        test_length = len(index) - (initial_train_size + differentiation)

        print("Information of folds")
        print("--------------------")
        print(f"Number of observations in train: {initial_train_size}")
        if self.differentiation is not None:
            print(
                f" First {differentiation} observation/s in training set "
                f"are used for differentiation"
            )
        print(f"Number of observations in test: {test_length}")

        training_start = index[fold[1][0] + differentiation]
        training_end = index[fold[1][-1]]
        test_start = index[fold[2][0]]
        test_end = index[fold[2][-1] - 1]

        print(f"Training : {training_start} -- {training_end} (n={initial_train_size})")
        print(f"Test : {test_start} -- {test_end} (n={test_length})")
        print("")