spotforecast2 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spotforecast2/.DS_Store +0 -0
- spotforecast2/__init__.py +2 -0
- spotforecast2/data/__init__.py +0 -0
- spotforecast2/data/data.py +130 -0
- spotforecast2/data/fetch_data.py +209 -0
- spotforecast2/exceptions.py +681 -0
- spotforecast2/forecaster/.DS_Store +0 -0
- spotforecast2/forecaster/__init__.py +7 -0
- spotforecast2/forecaster/base.py +448 -0
- spotforecast2/forecaster/metrics.py +527 -0
- spotforecast2/forecaster/recursive/__init__.py +4 -0
- spotforecast2/forecaster/recursive/_forecaster_equivalent_date.py +1075 -0
- spotforecast2/forecaster/recursive/_forecaster_recursive.py +939 -0
- spotforecast2/forecaster/recursive/_warnings.py +15 -0
- spotforecast2/forecaster/utils.py +954 -0
- spotforecast2/model_selection/__init__.py +5 -0
- spotforecast2/model_selection/bayesian_search.py +453 -0
- spotforecast2/model_selection/grid_search.py +314 -0
- spotforecast2/model_selection/random_search.py +151 -0
- spotforecast2/model_selection/split_base.py +357 -0
- spotforecast2/model_selection/split_one_step.py +245 -0
- spotforecast2/model_selection/split_ts_cv.py +634 -0
- spotforecast2/model_selection/utils_common.py +718 -0
- spotforecast2/model_selection/utils_metrics.py +103 -0
- spotforecast2/model_selection/validation.py +685 -0
- spotforecast2/preprocessing/__init__.py +30 -0
- spotforecast2/preprocessing/_binner.py +378 -0
- spotforecast2/preprocessing/_common.py +123 -0
- spotforecast2/preprocessing/_differentiator.py +123 -0
- spotforecast2/preprocessing/_rolling.py +136 -0
- spotforecast2/preprocessing/curate_data.py +254 -0
- spotforecast2/preprocessing/imputation.py +92 -0
- spotforecast2/preprocessing/outlier.py +114 -0
- spotforecast2/preprocessing/split.py +139 -0
- spotforecast2/py.typed +0 -0
- spotforecast2/utils/__init__.py +43 -0
- spotforecast2/utils/convert_to_utc.py +44 -0
- spotforecast2/utils/data_transform.py +208 -0
- spotforecast2/utils/forecaster_config.py +344 -0
- spotforecast2/utils/generate_holiday.py +70 -0
- spotforecast2/utils/validation.py +569 -0
- spotforecast2/weather/__init__.py +0 -0
- spotforecast2/weather/weather_client.py +288 -0
- spotforecast2-0.0.1.dist-info/METADATA +47 -0
- spotforecast2-0.0.1.dist-info/RECORD +46 -0
- spotforecast2-0.0.1.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,718 @@
|
|
|
1
|
+
"""Common validation and initialization utilities for model selection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
from typing import Callable
|
|
5
|
+
import warnings
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from joblib import cpu_count
|
|
9
|
+
from sklearn.exceptions import NotFittedError
|
|
10
|
+
from sklearn.linear_model._base import LinearModel, LinearClassifierMixin
|
|
11
|
+
from sklearn.pipeline import Pipeline
|
|
12
|
+
|
|
13
|
+
from spotforecast2.forecaster.utils import check_interval, date_to_index_position
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class OneStepAheadValidationWarning(UserWarning):
    """Warning emitted when model validation relies on one-step-ahead predictions
    instead of full multi-step backtesting."""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def initialize_lags_grid(
    forecaster: object,
    lags_grid: (
        list[int | list[int] | np.ndarray[int] | range[int]]
        | dict[str, list[int | list[int] | np.ndarray[int] | range[int]]]
        | None
    ) = None,
) -> tuple[dict[str, int], str]:
    """
    Build the dictionary of lag configurations to explore during model selection.

    Args:
        forecaster: Forecaster model. ForecasterRecursive, ForecasterDirect,
            ForecasterRecursiveMultiSeries, ForecasterDirectMultiVariate.
        lags_grid: Candidate lags, each entry an int, list, numpy ndarray, or
            range. When a `dict` is given, its keys become the labels shown in
            the `results` DataFrame and its values are the lag candidates.

    Returns:
        tuple: (lags_grid, lags_label)
            - lags_grid (dict): Lags configuration for each search iteration.
            - lags_label (str): 'keys' when the user supplied a dict (labels come
              from its keys), otherwise 'values' (labels come from the lag values).

    Examples:
        >>> from spotforecast2.model_selection.utils_common import initialize_lags_grid
        >>> from spotforecast2.forecaster.recursive import ForecasterRecursive
        >>> from sklearn.linear_model import LinearRegression
        >>> forecaster = ForecasterRecursive(LinearRegression(), lags=2)
        >>> lags_grid, lags_label = initialize_lags_grid(forecaster, [2, 4])
        >>> print(lags_grid)
        {'2': 2, '4': 4}
        >>> print(lags_label)
        values
    """
    # Dict input: the caller chose the labels; use them verbatim.
    if isinstance(lags_grid, dict):
        return lags_grid, "keys"

    # List input: label each candidate with its own string representation.
    if isinstance(lags_grid, list):
        return {f"{candidate}": candidate for candidate in lags_grid}, "values"

    # No grid supplied: fall back to the forecaster's current lags.
    if lags_grid is None:
        # Cast to plain ints, required since numpy 2.0 changed scalar repr.
        lags = [int(lag) for lag in forecaster.lags]
        return {f"{lags}": lags}, "values"

    raise TypeError(
        f"`lags_grid` argument must be a list, dict or None. "
        f"Got {type(lags_grid)}."
    )
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def check_backtesting_input(
    forecaster: object,
    cv: object,
    metric: str | Callable | list[str | Callable],
    add_aggregated_metric: bool = True,
    y: pd.Series | None = None,
    series: pd.DataFrame | dict[str, pd.Series | pd.DataFrame] = None,
    exog: pd.Series | pd.DataFrame | dict[str, pd.Series | pd.DataFrame] | None = None,
    interval: float | list[float] | tuple[float] | str | object | None = None,
    interval_method: str = "bootstrapping",
    alpha: float | None = None,
    n_boot: int = 250,
    use_in_sample_residuals: bool = True,
    use_binned_residuals: bool = True,
    random_state: int = 123,
    return_predictors: bool = False,
    freeze_params: bool = True,
    n_jobs: int | str = "auto",
    show_progress: bool = True,
    suppress_warnings: bool = False,
) -> None:
    """
    This is a helper function to check most inputs of backtesting functions in
    modules `model_selection`.

    Args:
        forecaster: Forecaster model.
        cv: TimeSeriesFold object with the information needed to split the data into folds.
        metric: Metric used to quantify the goodness of fit of the model.
        add_aggregated_metric: If `True`, the aggregated metrics (average, weighted average and pooling)
            over all levels are also returned (only multiseries).
        y: Training time series for uni-series forecasters.
        series: Training time series for multi-series forecasters.
        exog: Exogenous variables.
        interval: Specifies whether probabilistic predictions should be estimated and the
            method to use. The following options are supported:

            - If `float`, represents the nominal (expected) coverage (between 0 and 1).
              For instance, `interval=0.95` corresponds to `[2.5, 97.5]` percentiles.
            - If `list` or `tuple`: Sequence of percentiles to compute, each value must
              be between 0 and 100 inclusive. For example, a 95% confidence interval can
              be specified as `interval = [2.5, 97.5]` or multiple percentiles (e.g. 10,
              50 and 90) as `interval = [10, 50, 90]`.
            - If 'bootstrapping' (str): `n_boot` bootstrapping predictions will be generated.
            - If scipy.stats distribution object, the distribution parameters will
              be estimated for each prediction.
            - If None, no probabilistic predictions are estimated.
        interval_method: Technique used to estimate prediction intervals. Available options:

            - 'bootstrapping': Bootstrapping is used to generate prediction
              intervals.
            - 'conformal': Employs the conformal prediction split method for
              interval estimation.
        alpha: The confidence intervals used in ForecasterStats are (1 - alpha) %.
        n_boot: Number of bootstrapping iterations to perform when estimating prediction
            intervals.
        use_in_sample_residuals: If `True`, residuals from the training data are used as proxy of prediction
            error to create prediction intervals. If `False`, out_sample_residuals
            are used if they are already stored inside the forecaster.
        use_binned_residuals: If `True`, residuals are selected based on the predicted values
            (binned selection).
            If `False`, residuals are selected randomly.
        random_state: Seed for the random number generator to ensure reproducibility.
        return_predictors: If `True`, the predictors used to make the predictions are also returned.
        n_jobs: The number of jobs to run in parallel. If `-1`, then the number of jobs is
            set to the number of cores. If 'auto', `n_jobs` is set using the function
            select_n_jobs_fit_forecaster.
        freeze_params: Determines whether to freeze the model parameters after the first fit
            for estimators that perform automatic model selection.

            - If `True`, the model parameters found during the first fit (e.g., order
              and seasonal_order for Arima, or smoothing parameters for Ets) are reused
              in all subsequent refits. This avoids re-running the automatic selection
              procedure in each fold and reduces runtime.
            - If `False`, automatic model selection is performed independently in each
              refit, allowing parameters to adapt across folds. This increases runtime
              and adds a `params` column to the output with the parameters selected per
              fold.
        show_progress: Whether to show a progress bar.
        suppress_warnings: If `True`, spotforecast warnings will be suppressed during the backtesting
            process.

    Returns:
        None

    Raises:
        TypeError: If any argument has an invalid type.
        ValueError: If any argument has an invalid value or the fold configuration
            is incompatible with the data length.
        NotFittedError: If `initial_train_size` is None and the forecaster is not fitted.

    Examples:
        >>> import pandas as pd
        >>> from spotforecast2.model_selection.utils_common import check_backtesting_input
        >>> from spotforecast2.forecaster.recursive import ForecasterRecursive
        >>> from spotforecast2.model_selection import TimeSeriesFold
        >>> from sklearn.linear_model import LinearRegression
        >>> from sklearn.metrics import mean_squared_error
        >>> y = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        >>> forecaster = ForecasterRecursive(LinearRegression(), lags=2)
        >>> cv = TimeSeriesFold(
        ...     steps=3,
        ...     initial_train_size=5,
        ...     gap=0,
        ...     refit=False,
        ...     fixed_train_size=False,
        ...     allow_incomplete_fold=True
        ... )
        >>> check_backtesting_input(
        ...     forecaster=forecaster,
        ...     cv=cv,
        ...     metric=mean_squared_error,
        ...     y=y
        ... )
    """

    forecaster_name = type(forecaster).__name__
    cv_name = type(cv).__name__

    if cv_name != "TimeSeriesFold":
        raise TypeError(f"`cv` must be a 'TimeSeriesFold' object. Got '{cv_name}'.")

    steps = cv.steps
    initial_train_size = cv.initial_train_size
    gap = cv.gap
    allow_incomplete_fold = cv.allow_incomplete_fold
    refit = cv.refit

    # Forecaster families, used to select which validations apply below.
    forecasters_uni = [
        "ForecasterRecursive",
        "ForecasterDirect",
        "ForecasterStats",
        "ForecasterEquivalentDate",
        "ForecasterRecursiveClassifier",
    ]
    forecasters_direct = [
        "ForecasterDirect",
        "ForecasterDirectMultiVariate",
        "ForecasterRnn",
    ]
    forecasters_multi_no_dict = [
        "ForecasterDirectMultiVariate",
        "ForecasterRnn",
    ]
    forecasters_multi_dict = ["ForecasterRecursiveMultiSeries"]
    # NOTE: ForecasterStats has interval but not with bootstrapping or conformal
    forecasters_boot_conformal = [
        "ForecasterRecursive",
        "ForecasterDirect",
        "ForecasterRecursiveMultiSeries",
        "ForecasterDirectMultiVariate",
        "ForecasterEquivalentDate",
    ]
    forecasters_return_predictors = [
        "ForecasterRecursive",
        "ForecasterDirect",
        "ForecasterRecursiveMultiSeries",
        "ForecasterDirectMultiVariate",
        "ForecasterRecursiveClassifier",
    ]

    if forecaster_name in forecasters_uni:
        if not isinstance(y, pd.Series):
            raise TypeError("`y` must be a pandas Series.")
        data_name = "y"
        data_length = len(y)

    elif forecaster_name in forecasters_multi_no_dict:
        if not isinstance(series, pd.DataFrame):
            raise TypeError("`series` must be a pandas DataFrame.")
        data_name = "series"
        data_length = len(series)

    elif forecaster_name in forecasters_multi_dict:

        # NOTE: Checks are not needed as they are done in the function
        # `check_preprocess_series` that is used before `check_backtesting_input`
        # in the backtesting function.

        data_name = "series"
        # Series may have different lengths; validate against the longest one.
        data_length = max([len(series[serie]) for serie in series])

    if exog is not None:
        if forecaster_name in forecasters_multi_dict:
            # NOTE: Checks are not needed as they are done in the function
            # `check_preprocess_exog_multiseries` that is used before
            # `check_backtesting_input` in the backtesting function.
            pass
        else:
            if not isinstance(exog, (pd.Series, pd.DataFrame)):
                raise TypeError(
                    f"`exog` must be a pandas Series, DataFrame or None. Got {type(exog)}."
                )

    # The forecaster and the cv must agree on the differentiation order, otherwise
    # the fold boundaries would not line up with the differentiated series.
    if hasattr(forecaster, "differentiation"):
        if forecaster.differentiation_max != cv.differentiation:
            if forecaster_name == "ForecasterRecursiveMultiSeries" and isinstance(
                forecaster.differentiation, dict
            ):
                raise ValueError(
                    f"When using a dict as `differentiation` in ForecasterRecursiveMultiSeries, "
                    f"the `differentiation` included in the cv ({cv.differentiation}) must be "
                    f"the same as the maximum `differentiation` included in the forecaster "
                    f"({forecaster.differentiation_max}). Set the same value "
                    f"for both using the `differentiation` argument."
                )
            else:
                raise ValueError(
                    f"The differentiation included in the forecaster "
                    f"({forecaster.differentiation_max}) differs from the differentiation "
                    f"included in the cv ({cv.differentiation}). Set the same value "
                    f"for both using the `differentiation` argument."
                )

    if not isinstance(metric, (str, Callable, list)):
        raise TypeError(
            f"`metric` must be a string, a callable function, or a list containing "
            f"multiple strings and/or callables. Got {type(metric)}."
        )

    if forecaster_name == "ForecasterEquivalentDate" and isinstance(
        forecaster.offset, pd.tseries.offsets.DateOffset
    ):
        # NOTE: Checks when initial_train_size is not None cannot be done here
        # because the forecaster is not fitted yet and we don't know the
        # window_size since pd.DateOffset is not a fixed window size.
        if initial_train_size is None:
            raise ValueError(
                f"`initial_train_size` must be an integer greater than "
                f"the `window_size` of the forecaster ({forecaster.window_size}) "
                f"and smaller than the length of `{data_name}` ({data_length}) or "
                f"a date within this range of the index."
            )
    elif initial_train_size is not None:
        if forecaster_name in forecasters_uni:
            index = cv._extract_index(y)
        else:
            index = cv._extract_index(series)

        # Resolve dates/labels to an integer position in the index.
        initial_train_size = date_to_index_position(
            index=index,
            date_input=initial_train_size,
            method="validation",
            date_literal="initial_train_size",
        )
        if (
            initial_train_size < forecaster.window_size
            or initial_train_size >= data_length
        ):
            raise ValueError(
                f"If `initial_train_size` is an integer, it must be greater than "
                f"the `window_size` of the forecaster ({forecaster.window_size}) "
                f"and smaller than the length of `{data_name}` ({data_length}). If "
                f"it is a date, it must be within this range of the index."
            )
        if allow_incomplete_fold:
            # At least one observation after the gap to allow incomplete fold
            if data_length <= initial_train_size + gap:
                raise ValueError(
                    f"`{data_name}` must have more than `initial_train_size + gap` "
                    f"observations to create at least one fold.\n"
                    f"    Time series length: {data_length}\n"
                    f"    Required > {initial_train_size + gap}\n"
                    f"    initial_train_size: {initial_train_size}\n"
                    f"    gap: {gap}\n"
                )
        else:
            # At least one complete fold
            if data_length < initial_train_size + gap + steps:
                raise ValueError(
                    f"`{data_name}` must have at least `initial_train_size + gap + steps` "
                    f"observations to create a minimum of one complete fold "
                    f"(allow_incomplete_fold=False).\n"
                    f"    Time series length: {data_length}\n"
                    f"    Required >= {initial_train_size + gap + steps}\n"
                    f"    initial_train_size: {initial_train_size}\n"
                    f"    gap: {gap}\n"
                    f"    steps: {steps}\n"
                )
    else:
        # initial_train_size is None: only valid for already-fitted forecasters
        # that do not require an explicit training window.
        if forecaster_name in ["ForecasterStats", "ForecasterEquivalentDate"]:
            raise ValueError(
                f"When using {forecaster_name}, `initial_train_size` must be an "
                f"integer smaller than the length of `{data_name}` ({data_length})."
            )
        else:
            if not forecaster.is_fitted:
                raise NotFittedError(
                    "`forecaster` must be already trained if no `initial_train_size` "
                    "is provided."
                )
            if refit:
                raise ValueError(
                    "`refit` is only allowed when `initial_train_size` is not `None`."
                )

    if forecaster_name == "ForecasterStats" and cv.skip_folds is not None:
        raise ValueError(
            "`skip_folds` is not allowed for ForecasterStats. Set it to `None`."
        )

    if not isinstance(add_aggregated_metric, bool):
        raise TypeError("`add_aggregated_metric` must be a boolean: `True`, `False`.")
    if not isinstance(n_boot, (int, np.integer)) or n_boot < 0:
        raise TypeError(f"`n_boot` must be an integer greater than 0. Got {n_boot}.")
    if not isinstance(use_in_sample_residuals, bool):
        raise TypeError("`use_in_sample_residuals` must be a boolean: `True`, `False`.")
    if not isinstance(use_binned_residuals, bool):
        raise TypeError("`use_binned_residuals` must be a boolean: `True`, `False`.")
    if not isinstance(random_state, (int, np.integer)) or random_state < 0:
        raise TypeError(
            f"`random_state` must be an integer greater than 0. Got {random_state}."
        )
    if not isinstance(return_predictors, bool):
        raise TypeError("`return_predictors` must be a boolean: `True`, `False`.")
    if not isinstance(freeze_params, bool):
        raise TypeError("`freeze_params` must be a boolean: `True`, `False`.")
    if not isinstance(n_jobs, int) and n_jobs != "auto":
        raise TypeError(f"`n_jobs` must be an integer or `'auto'`. Got {n_jobs}.")
    if not isinstance(show_progress, bool):
        raise TypeError("`show_progress` must be a boolean: `True`, `False`.")
    if not isinstance(suppress_warnings, bool):
        raise TypeError("`suppress_warnings` must be a boolean: `True`, `False`.")

    if interval is not None or alpha is not None:

        if forecaster_name in forecasters_boot_conformal:

            if interval_method == "conformal":
                if not isinstance(interval, (float, list, tuple)):
                    raise TypeError(
                        f"When `interval_method` is 'conformal', `interval` must "
                        f"be a float or a list/tuple defining a symmetric interval. "
                        f"Got {type(interval)}."
                    )
            elif interval_method == "bootstrapping":
                if not isinstance(interval, (float, list, tuple, str)) and (
                    not hasattr(interval, "_pdf")
                    or not callable(getattr(interval, "fit", None))
                ):
                    raise TypeError(
                        f"When `interval_method` is 'bootstrapping', `interval` "
                        f"must be a float, a list or tuple of floats, a "
                        f"scipy.stats distribution object (with methods `_pdf` and "
                        f"`fit`) or the string 'bootstrapping'. Got {type(interval)}."
                    )
                if isinstance(interval, (list, tuple)):
                    for i in interval:
                        if not isinstance(i, (int, float)):
                            raise TypeError(
                                f"`interval` must be a list or tuple of floats. "
                                f"Got {type(i)} in {interval}."
                            )
                    if len(interval) == 2:
                        # A two-element interval is interpreted as [lower, upper]
                        # percentile bounds and gets the full pairwise validation.
                        check_interval(interval=interval)
                    else:
                        for q in interval:
                            if (q < 0.0) or (q > 100.0):
                                raise ValueError(
                                    "When `interval` is a list or tuple, all values must be "
                                    "between 0 and 100 inclusive."
                                )
                elif isinstance(interval, str):
                    if interval != "bootstrapping":
                        # FIX: the two f-string fragments previously concatenated
                        # without a separating space ("'bootstrapping'.Got ...").
                        raise ValueError(
                            f"When `interval` is a string, it must be 'bootstrapping'. "
                            f"Got {interval}."
                        )
            else:
                raise ValueError(
                    f"`interval_method` must be 'bootstrapping' or 'conformal'. "
                    f"Got {interval_method}."
                )
        else:
            if forecaster_name == "ForecasterRecursiveClassifier":
                raise ValueError(
                    f"`interval` is not supported for {forecaster_name}. Class "
                    f"probabilities are returned by default during backtesting, "
                    f"set `interval=None`."
                )
            check_interval(interval=interval, alpha=alpha)

    if return_predictors and forecaster_name not in forecasters_return_predictors:
        raise ValueError(
            f"`return_predictors` is only allowed for forecasters of type "
            f"{forecasters_return_predictors}. Got {forecaster_name}."
        )

    if forecaster_name in forecasters_direct and forecaster.max_step < steps + gap:
        raise ValueError(
            f"When using a {forecaster_name}, the combination of steps "
            f"+ gap ({steps + gap}) cannot be greater than the `steps` parameter "
            f"declared when the forecaster is initialized ({forecaster.max_step})."
        )
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def check_one_step_ahead_input(
    forecaster: object,
    cv: object,
    metric: str | Callable | list[str | Callable],
    y: pd.Series | None = None,
    series: pd.DataFrame | dict[str, pd.Series | pd.DataFrame] = None,
    exog: pd.Series | pd.DataFrame | dict[str, pd.Series | pd.DataFrame] | None = None,
    show_progress: bool = True,
    suppress_warnings: bool = False,
) -> None:
    """
    This is a helper function to check most inputs of hyperparameter tuning
    functions in modules `model_selection` when using a `OneStepAheadFold`.

    Args:
        forecaster: Forecaster model.
        cv: OneStepAheadFold object with the information needed to split the data into folds.
        metric: Metric used to quantify the goodness of fit of the model.
        y: Training time series for uni-series forecasters.
        series: Training time series for multi-series forecasters.
        exog: Exogenous variables.
        show_progress: Whether to show a progress bar.
        suppress_warnings: If `True`, spotforecast warnings will be suppressed during the hyperparameter
            search.

    Returns:
        None

    Raises:
        TypeError: If `cv`, the forecaster type, the data, `exog`, `metric` or a
            boolean flag has an invalid type.
        ValueError: If the differentiation configured in the forecaster and the cv
            disagree, or `initial_train_size` is out of the valid range.

    Examples:
        >>> import pandas as pd
        >>> from spotforecast2.model_selection.utils_common import check_one_step_ahead_input
        >>> from spotforecast2.forecaster.recursive import ForecasterRecursive
        >>> from spotforecast2.model_selection import OneStepAheadFold
        >>> from sklearn.linear_model import LinearRegression
        >>> from sklearn.metrics import mean_squared_error
        >>> y = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        >>> forecaster = ForecasterRecursive(LinearRegression(), lags=2)
        >>> cv = OneStepAheadFold(
        ...     initial_train_size=5,
        ...     return_all_predictions=False
        ... )
        >>> check_one_step_ahead_input(
        ...     forecaster=forecaster,
        ...     cv=cv,
        ...     metric=mean_squared_error,
        ...     y=y
        ... )
    """

    forecaster_name = type(forecaster).__name__
    cv_name = type(cv).__name__

    if cv_name != "OneStepAheadFold":
        raise TypeError(f"`cv` must be a 'OneStepAheadFold' object. Got '{cv_name}'.")

    initial_train_size = cv.initial_train_size

    # Only these forecaster types support one-step-ahead evaluation.
    forecasters_one_step_ahead = [
        "ForecasterRecursive",
        "ForecasterDirect",
        "ForecasterRecursiveClassifier",
        "ForecasterRecursiveMultiSeries",
        "ForecasterDirectMultiVariate",
    ]
    if forecaster_name not in forecasters_one_step_ahead:
        raise TypeError(
            f"Only forecasters of type {forecasters_one_step_ahead} are allowed "
            f"when using `cv` of type `OneStepAheadFold`. Got {forecaster_name}."
        )

    # Forecaster families, used to select which data validations apply below.
    forecasters_uni = [
        "ForecasterRecursive",
        "ForecasterDirect",
        "ForecasterRecursiveClassifier",
    ]
    forecasters_multi_no_dict = [
        "ForecasterDirectMultiVariate",
    ]
    forecasters_multi_dict = ["ForecasterRecursiveMultiSeries"]

    if forecaster_name in forecasters_uni:
        if not isinstance(y, pd.Series):
            raise TypeError(f"`y` must be a pandas Series. Got {type(y)}")
        data_name = "y"
        data_length = len(y)

    elif forecaster_name in forecasters_multi_no_dict:
        if not isinstance(series, pd.DataFrame):
            raise TypeError(f"`series` must be a pandas DataFrame. Got {type(series)}")
        data_name = "series"
        data_length = len(series)

    elif forecaster_name in forecasters_multi_dict:

        # NOTE: Checks are not needed as they are done in the function
        # `check_preprocess_series` that is used before `check_one_step_ahead_input`
        # in the backtesting function.

        data_name = "series"
        # Series may have different lengths; validate against the longest one.
        data_length = max([len(series[serie]) for serie in series])

    if exog is not None:
        if forecaster_name in forecasters_multi_dict:
            # NOTE: Checks are not needed as they are done in the function
            # `check_preprocess_exog_multiseries` that is used before
            # `check_one_step_ahead_input` in the backtesting function.
            pass
        else:
            if not isinstance(exog, (pd.Series, pd.DataFrame)):
                raise TypeError(
                    f"`exog` must be a pandas Series, DataFrame or None. Got {type(exog)}."
                )

    # The forecaster and the cv must agree on the differentiation order, otherwise
    # the fold boundaries would not line up with the differentiated series.
    if hasattr(forecaster, "differentiation"):
        if forecaster.differentiation_max != cv.differentiation:
            if forecaster_name == "ForecasterRecursiveMultiSeries" and isinstance(
                forecaster.differentiation, dict
            ):
                raise ValueError(
                    f"When using a dict as `differentiation` in ForecasterRecursiveMultiSeries, "
                    f"the `differentiation` included in the cv ({cv.differentiation}) must be "
                    f"the same as the maximum `differentiation` included in the forecaster "
                    f"({forecaster.differentiation_max}). Set the same value "
                    f"for both using the `differentiation` argument."
                )
            else:
                raise ValueError(
                    f"The differentiation included in the forecaster "
                    f"({forecaster.differentiation_max}) differs from the differentiation "
                    f"included in the cv ({cv.differentiation}). Set the same value "
                    f"for both using the `differentiation` argument."
                )

    if not isinstance(metric, (str, Callable, list)):
        raise TypeError(
            f"`metric` must be a string, a callable function, or a list containing "
            f"multiple strings and/or callables. Got {type(metric)}."
        )

    if forecaster_name in forecasters_uni:
        index = cv._extract_index(y)
    else:
        index = cv._extract_index(series)

    # Resolve dates/labels to an integer position in the index.
    initial_train_size = date_to_index_position(
        index=index,
        date_input=initial_train_size,
        method="validation",
        date_literal="initial_train_size",
    )
    if initial_train_size < forecaster.window_size or initial_train_size >= data_length:
        raise ValueError(
            f"If `initial_train_size` is an integer, it must be greater than "
            f"the `window_size` of the forecaster ({forecaster.window_size}) "
            f"and smaller than the length of `{data_name}` ({data_length}). If "
            f"it is a date, it must be within this range of the index."
        )

    if not isinstance(show_progress, bool):
        raise TypeError("`show_progress` must be a boolean: `True`, `False`.")
    if not isinstance(suppress_warnings, bool):
        raise TypeError("`suppress_warnings` must be a boolean: `True`, `False`.")

    # One-step-ahead validation trades fidelity for speed; make sure the user
    # knows the results may not generalize to multi-step forecasting.
    if not suppress_warnings:
        warnings.warn(
            "One-step-ahead predictions are used for faster model comparison, but they "
            "may not fully represent multi-step prediction performance. It is recommended "
            "to backtest the final model for a more accurate multi-step performance "
            "estimate.",
            OneStepAheadValidationWarning,
        )
|
|
639
|
+
|
|
640
|
+
|
|
641
|
+
def select_n_jobs_backtesting(forecaster: object, refit: bool | int) -> int:
|
|
642
|
+
"""
|
|
643
|
+
Select the optimal number of jobs to use in the backtesting process. This
|
|
644
|
+
selection is based on heuristics and is not guaranteed to be optimal.
|
|
645
|
+
|
|
646
|
+
The number of jobs is chosen as follows:
|
|
647
|
+
|
|
648
|
+
- If `refit` is an integer, then `n_jobs = 1`. This is because parallelization doesn't
|
|
649
|
+
work with intermittent refit.
|
|
650
|
+
- If forecaster is 'ForecasterRecursive' and estimator is a linear estimator,
|
|
651
|
+
then `n_jobs = 1`.
|
|
652
|
+
- If forecaster is 'ForecasterRecursive' and estimator is not a linear
|
|
653
|
+
estimator then `n_jobs = cpu_count() - 1`.
|
|
654
|
+
- If forecaster is 'ForecasterDirect' or 'ForecasterDirectMultiVariate'
|
|
655
|
+
and `refit = True`, then `n_jobs = cpu_count() - 1`.
|
|
656
|
+
- If forecaster is 'ForecasterDirect' or 'ForecasterDirectMultiVariate'
|
|
657
|
+
and `refit = False`, then `n_jobs = 1`.
|
|
658
|
+
- If forecaster is 'ForecasterRecursiveMultiSeries', then `n_jobs = cpu_count() - 1`.
|
|
659
|
+
- If forecaster is 'ForecasterStats' or 'ForecasterEquivalentDate',
|
|
660
|
+
then `n_jobs = 1`.
|
|
661
|
+
- If estimator is a `LGBMRegressor(n_jobs=1)`, then `n_jobs = cpu_count() - 1`.
|
|
662
|
+
- If estimator is a `LGBMRegressor` with internal n_jobs != 1, then `n_jobs = 1`.
|
|
663
|
+
This is because `lightgbm` is highly optimized for gradient boosting and
|
|
664
|
+
parallelizes operations at a very fine-grained level, making additional
|
|
665
|
+
parallelization unnecessary and potentially harmful due to resource contention.
|
|
666
|
+
|
|
667
|
+
Args:
|
|
668
|
+
forecaster: Forecaster model.
|
|
669
|
+
refit: If the forecaster is refitted during the backtesting process.
|
|
670
|
+
|
|
671
|
+
Returns:
|
|
672
|
+
int: The number of jobs to run in parallel.
|
|
673
|
+
|
|
674
|
+
Examples:
|
|
675
|
+
>>> from spotforecast2.model_selection.utils_common import select_n_jobs_backtesting
|
|
676
|
+
>>> from spotforecast2.forecaster.recursive import ForecasterRecursive
|
|
677
|
+
>>> from sklearn.linear_model import LinearRegression
|
|
678
|
+
>>> forecaster = ForecasterRecursive(LinearRegression(), lags=2)
|
|
679
|
+
>>> select_n_jobs_backtesting(forecaster, refit=True)
|
|
680
|
+
1
|
|
681
|
+
"""
|
|
682
|
+
|
|
683
|
+
forecaster_name = type(forecaster).__name__
|
|
684
|
+
|
|
685
|
+
if forecaster_name == "ForecasterStats":
|
|
686
|
+
n_jobs = 1
|
|
687
|
+
return n_jobs
|
|
688
|
+
|
|
689
|
+
if isinstance(forecaster.estimator, Pipeline):
|
|
690
|
+
estimator = forecaster.estimator[-1]
|
|
691
|
+
else:
|
|
692
|
+
estimator = forecaster.estimator
|
|
693
|
+
|
|
694
|
+
refit = False if refit == 0 else refit
|
|
695
|
+
if not isinstance(refit, bool) and refit != 1:
|
|
696
|
+
n_jobs = 1
|
|
697
|
+
else:
|
|
698
|
+
if forecaster_name in {"ForecasterRecursive", "ForecasterRecursiveClassifier"}:
|
|
699
|
+
if isinstance(estimator, (LinearModel, LinearClassifierMixin)):
|
|
700
|
+
n_jobs = 1
|
|
701
|
+
elif type(estimator).__name__ in {"LGBMRegressor", "LGBMClassifier"}:
|
|
702
|
+
n_jobs = cpu_count() - 1 if estimator.n_jobs == 1 else 1
|
|
703
|
+
else:
|
|
704
|
+
n_jobs = cpu_count() - 1
|
|
705
|
+
elif forecaster_name in {"ForecasterDirect", "ForecasterDirectMultiVariate"}:
|
|
706
|
+
# Parallelization is applied during the fitting process.
|
|
707
|
+
n_jobs = 1
|
|
708
|
+
elif forecaster_name in {"ForecasterRecursiveMultiSeries"}:
|
|
709
|
+
if type(estimator).__name__ == "LGBMRegressor":
|
|
710
|
+
n_jobs = cpu_count() - 1 if estimator.n_jobs == 1 else 1
|
|
711
|
+
else:
|
|
712
|
+
n_jobs = cpu_count() - 1
|
|
713
|
+
elif forecaster_name in {"ForecasterEquivalentDate"}:
|
|
714
|
+
n_jobs = 1
|
|
715
|
+
else:
|
|
716
|
+
n_jobs = 1
|
|
717
|
+
|
|
718
|
+
return n_jobs
|