spotforecast2 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. spotforecast2/.DS_Store +0 -0
  2. spotforecast2/__init__.py +2 -0
  3. spotforecast2/data/__init__.py +0 -0
  4. spotforecast2/data/data.py +130 -0
  5. spotforecast2/data/fetch_data.py +209 -0
  6. spotforecast2/exceptions.py +681 -0
  7. spotforecast2/forecaster/.DS_Store +0 -0
  8. spotforecast2/forecaster/__init__.py +7 -0
  9. spotforecast2/forecaster/base.py +448 -0
  10. spotforecast2/forecaster/metrics.py +527 -0
  11. spotforecast2/forecaster/recursive/__init__.py +4 -0
  12. spotforecast2/forecaster/recursive/_forecaster_equivalent_date.py +1075 -0
  13. spotforecast2/forecaster/recursive/_forecaster_recursive.py +939 -0
  14. spotforecast2/forecaster/recursive/_warnings.py +15 -0
  15. spotforecast2/forecaster/utils.py +954 -0
  16. spotforecast2/model_selection/__init__.py +5 -0
  17. spotforecast2/model_selection/bayesian_search.py +453 -0
  18. spotforecast2/model_selection/grid_search.py +314 -0
  19. spotforecast2/model_selection/random_search.py +151 -0
  20. spotforecast2/model_selection/split_base.py +357 -0
  21. spotforecast2/model_selection/split_one_step.py +245 -0
  22. spotforecast2/model_selection/split_ts_cv.py +634 -0
  23. spotforecast2/model_selection/utils_common.py +718 -0
  24. spotforecast2/model_selection/utils_metrics.py +103 -0
  25. spotforecast2/model_selection/validation.py +685 -0
  26. spotforecast2/preprocessing/__init__.py +30 -0
  27. spotforecast2/preprocessing/_binner.py +378 -0
  28. spotforecast2/preprocessing/_common.py +123 -0
  29. spotforecast2/preprocessing/_differentiator.py +123 -0
  30. spotforecast2/preprocessing/_rolling.py +136 -0
  31. spotforecast2/preprocessing/curate_data.py +254 -0
  32. spotforecast2/preprocessing/imputation.py +92 -0
  33. spotforecast2/preprocessing/outlier.py +114 -0
  34. spotforecast2/preprocessing/split.py +139 -0
  35. spotforecast2/py.typed +0 -0
  36. spotforecast2/utils/__init__.py +43 -0
  37. spotforecast2/utils/convert_to_utc.py +44 -0
  38. spotforecast2/utils/data_transform.py +208 -0
  39. spotforecast2/utils/forecaster_config.py +344 -0
  40. spotforecast2/utils/generate_holiday.py +70 -0
  41. spotforecast2/utils/validation.py +569 -0
  42. spotforecast2/weather/__init__.py +0 -0
  43. spotforecast2/weather/weather_client.py +288 -0
  44. spotforecast2-0.0.1.dist-info/METADATA +47 -0
  45. spotforecast2-0.0.1.dist-info/RECORD +46 -0
  46. spotforecast2-0.0.1.dist-info/WHEEL +4 -0
@@ -0,0 +1,718 @@
1
+ """Common validation and initialization utilities for model selection."""
2
+
3
+ from __future__ import annotations
4
+ from typing import Callable
5
+ import warnings
6
+ import numpy as np
7
+ import pandas as pd
8
+ from joblib import cpu_count
9
+ from sklearn.exceptions import NotFittedError
10
+ from sklearn.linear_model._base import LinearModel, LinearClassifierMixin
11
+ from sklearn.pipeline import Pipeline
12
+
13
+ from spotforecast2.forecaster.utils import check_interval, date_to_index_position
14
+
15
+
16
class OneStepAheadValidationWarning(UserWarning):
    """
    Warning category raised when model validation relies on one-step-ahead
    predictions instead of full multi-step backtesting.
    """
22
+
23
+
24
def initialize_lags_grid(
    forecaster: object,
    lags_grid: (
        list[int | list[int] | np.ndarray[int] | range[int]]
        | dict[str, list[int | list[int] | np.ndarray[int] | range[int]]]
        | None
    ) = None,
) -> tuple[dict[str, int], str]:
    """
    Build the grid of lag configurations used during model selection.

    Args:
        forecaster: Forecaster model. ForecasterRecursive, ForecasterDirect,
            ForecasterRecursiveMultiSeries, ForecasterDirectMultiVariate. Only
            consulted when `lags_grid` is None (its current `lags` are reused).
        lags_grid: Candidate lags. A list of int, lists, numpy ndarrays or
            range objects; or a dict whose keys are used as labels in the
            `results` DataFrame and whose values are the lag candidates; or
            None to fall back to the forecaster's current lags.

    Returns:
        tuple: (lags_grid, lags_label)
            - lags_grid (dict): Mapping from label to lags configuration.
            - lags_label (str): 'values' when labels come from the lag values
              themselves, 'keys' when they come from user-provided dict keys.

    Examples:
        >>> from spotforecast2.model_selection.utils_common import initialize_lags_grid
        >>> from spotforecast2.forecaster.recursive import ForecasterRecursive
        >>> from sklearn.linear_model import LinearRegression
        >>> forecaster = ForecasterRecursive(LinearRegression(), lags=2)
        >>> lags_grid, lags_label = initialize_lags_grid(forecaster, [2, 4])
        >>> print(lags_grid)
        {'2': 2, '4': 4}
        >>> print(lags_label)
        values
    """

    if not isinstance(lags_grid, (list, dict, type(None))):
        raise TypeError(
            f"`lags_grid` argument must be a list, dict or None. "
            f"Got {type(lags_grid)}."
        )

    if isinstance(lags_grid, dict):
        # User-supplied keys become the labels shown in the results.
        return lags_grid, "keys"

    if lags_grid is None:
        # Reuse the lags already configured in the forecaster.
        current_lags = [int(lag) for lag in forecaster.lags]  # Required since numpy 2.0
        return {f"{current_lags}": current_lags}, "values"

    # Plain list: label each candidate with its string representation.
    return {f"{candidate}": candidate for candidate in lags_grid}, "values"
76
+
77
+
78
def check_backtesting_input(
    forecaster: object,
    cv: object,
    metric: str | Callable | list[str | Callable],
    add_aggregated_metric: bool = True,
    y: pd.Series | None = None,
    series: pd.DataFrame | dict[str, pd.Series | pd.DataFrame] = None,
    exog: pd.Series | pd.DataFrame | dict[str, pd.Series | pd.DataFrame] | None = None,
    interval: float | list[float] | tuple[float] | str | object | None = None,
    interval_method: str = "bootstrapping",
    alpha: float | None = None,
    n_boot: int = 250,
    use_in_sample_residuals: bool = True,
    use_binned_residuals: bool = True,
    random_state: int = 123,
    return_predictors: bool = False,
    freeze_params: bool = True,
    n_jobs: int | str = "auto",
    show_progress: bool = True,
    suppress_warnings: bool = False,
) -> None:
    """
    This is a helper function to check most inputs of backtesting functions in
    modules `model_selection`.

    Args:
        forecaster: Forecaster model.
        cv: TimeSeriesFold object with the information needed to split the data into folds.
        metric: Metric used to quantify the goodness of fit of the model.
        add_aggregated_metric: If `True`, the aggregated metrics (average, weighted average and pooling)
            over all levels are also returned (only multiseries).
        y: Training time series for uni-series forecasters.
        series: Training time series for multi-series forecasters.
        exog: Exogenous variables.
        interval: Specifies whether probabilistic predictions should be estimated and the
            method to use. The following options are supported:

            - If `float`, represents the nominal (expected) coverage (between 0 and 1).
              For instance, `interval=0.95` corresponds to `[2.5, 97.5]` percentiles.
            - If `list` or `tuple`: Sequence of percentiles to compute, each value must
              be between 0 and 100 inclusive. For example, a 95% confidence interval can
              be specified as `interval = [2.5, 97.5]` or multiple percentiles (e.g. 10,
              50 and 90) as `interval = [10, 50, 90]`.
            - If 'bootstrapping' (str): `n_boot` bootstrapping predictions will be generated.
            - If scipy.stats distribution object, the distribution parameters will
              be estimated for each prediction.
            - If None, no probabilistic predictions are estimated.
        interval_method: Technique used to estimate prediction intervals. Available options:

            - 'bootstrapping': Bootstrapping is used to generate prediction
              intervals.
            - 'conformal': Employs the conformal prediction split method for
              interval estimation.
        alpha: The confidence intervals used in ForecasterStats are (1 - alpha) %.
        n_boot: Number of bootstrapping iterations to perform when estimating prediction
            intervals.
        use_in_sample_residuals: If `True`, residuals from the training data are used as proxy of prediction
            error to create prediction intervals. If `False`, out_sample_residuals
            are used if they are already stored inside the forecaster.
        use_binned_residuals: If `True`, residuals are selected based on the predicted values
            (binned selection).
            If `False`, residuals are selected randomly.
        random_state: Seed for the random number generator to ensure reproducibility.
        return_predictors: If `True`, the predictors used to make the predictions are also returned.
        freeze_params: Determines whether to freeze the model parameters after the first fit
            for estimators that perform automatic model selection.

            - If `True`, the model parameters found during the first fit (e.g., order
              and seasonal_order for Arima, or smoothing parameters for Ets) are reused
              in all subsequent refits. This avoids re-running the automatic selection
              procedure in each fold and reduces runtime.
            - If `False`, automatic model selection is performed independently in each
              refit, allowing parameters to adapt across folds. This increases runtime
              and adds a `params` column to the output with the parameters selected per
              fold.
        n_jobs: The number of jobs to run in parallel. If `-1`, then the number of jobs is
            set to the number of cores. If 'auto', `n_jobs` is set using the function
            select_n_jobs_fit_forecaster.
        show_progress: Whether to show a progress bar.
        suppress_warnings: If `True`, spotforecast warnings will be suppressed during the backtesting
            process.

    Returns:
        None

    Examples:
        >>> import pandas as pd
        >>> from spotforecast2.model_selection.utils_common import check_backtesting_input
        >>> from spotforecast2.forecaster.recursive import ForecasterRecursive
        >>> from spotforecast2.model_selection import TimeSeriesFold
        >>> from sklearn.linear_model import LinearRegression
        >>> from sklearn.metrics import mean_squared_error
        >>> y = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        >>> forecaster = ForecasterRecursive(LinearRegression(), lags=2)
        >>> cv = TimeSeriesFold(
        ...     steps=3,
        ...     initial_train_size=5,
        ...     gap=0,
        ...     refit=False,
        ...     fixed_train_size=False,
        ...     allow_incomplete_fold=True
        ... )
        >>> check_backtesting_input(
        ...     forecaster=forecaster,
        ...     cv=cv,
        ...     metric=mean_squared_error,
        ...     y=y
        ... )
    """

    forecaster_name = type(forecaster).__name__
    cv_name = type(cv).__name__

    if cv_name != "TimeSeriesFold":
        raise TypeError(f"`cv` must be a 'TimeSeriesFold' object. Got '{cv_name}'.")

    steps = cv.steps
    initial_train_size = cv.initial_train_size
    gap = cv.gap
    allow_incomplete_fold = cv.allow_incomplete_fold
    refit = cv.refit

    # Forecaster families used to route the checks below.
    forecasters_uni = [
        "ForecasterRecursive",
        "ForecasterDirect",
        "ForecasterStats",
        "ForecasterEquivalentDate",
        "ForecasterRecursiveClassifier",
    ]
    forecasters_direct = [
        "ForecasterDirect",
        "ForecasterDirectMultiVariate",
        "ForecasterRnn",
    ]
    forecasters_multi_no_dict = [
        "ForecasterDirectMultiVariate",
        "ForecasterRnn",
    ]
    forecasters_multi_dict = ["ForecasterRecursiveMultiSeries"]
    # NOTE: ForecasterStats has interval but not with bootstrapping or conformal
    forecasters_boot_conformal = [
        "ForecasterRecursive",
        "ForecasterDirect",
        "ForecasterRecursiveMultiSeries",
        "ForecasterDirectMultiVariate",
        "ForecasterEquivalentDate",
    ]
    forecasters_return_predictors = [
        "ForecasterRecursive",
        "ForecasterDirect",
        "ForecasterRecursiveMultiSeries",
        "ForecasterDirectMultiVariate",
        "ForecasterRecursiveClassifier",
    ]

    # Determine which data argument applies and its length.
    if forecaster_name in forecasters_uni:
        if not isinstance(y, pd.Series):
            raise TypeError("`y` must be a pandas Series.")
        data_name = "y"
        data_length = len(y)

    elif forecaster_name in forecasters_multi_no_dict:
        if not isinstance(series, pd.DataFrame):
            raise TypeError("`series` must be a pandas DataFrame.")
        data_name = "series"
        data_length = len(series)

    elif forecaster_name in forecasters_multi_dict:

        # NOTE: Checks are not needed as they are done in the function
        # `check_preprocess_series` that is used before `check_backtesting_input`
        # in the backtesting function.

        data_name = "series"
        data_length = max([len(series[serie]) for serie in series])

    if exog is not None:
        if forecaster_name in forecasters_multi_dict:
            # NOTE: Checks are not needed as they are done in the function
            # `check_preprocess_exog_multiseries` that is used before
            # `check_backtesting_input` in the backtesting function.
            pass
        else:
            if not isinstance(exog, (pd.Series, pd.DataFrame)):
                raise TypeError(
                    f"`exog` must be a pandas Series, DataFrame or None. Got {type(exog)}."
                )

    # The forecaster and the cv must agree on the differentiation applied.
    if hasattr(forecaster, "differentiation"):
        if forecaster.differentiation_max != cv.differentiation:
            if forecaster_name == "ForecasterRecursiveMultiSeries" and isinstance(
                forecaster.differentiation, dict
            ):
                raise ValueError(
                    f"When using a dict as `differentiation` in ForecasterRecursiveMultiSeries, "
                    f"the `differentiation` included in the cv ({cv.differentiation}) must be "
                    f"the same as the maximum `differentiation` included in the forecaster "
                    f"({forecaster.differentiation_max}). Set the same value "
                    f"for both using the `differentiation` argument."
                )
            else:
                raise ValueError(
                    f"The differentiation included in the forecaster "
                    f"({forecaster.differentiation_max}) differs from the differentiation "
                    f"included in the cv ({cv.differentiation}). Set the same value "
                    f"for both using the `differentiation` argument."
                )

    if not isinstance(metric, (str, Callable, list)):
        raise TypeError(
            f"`metric` must be a string, a callable function, or a list containing "
            f"multiple strings and/or callables. Got {type(metric)}."
        )

    if forecaster_name == "ForecasterEquivalentDate" and isinstance(
        forecaster.offset, pd.tseries.offsets.DateOffset
    ):
        # NOTE: Checks when initial_train_size is not None cannot be done here
        # because the forecaster is not fitted yet and we don't know the
        # window_size since pd.DateOffset is not a fixed window size.
        if initial_train_size is None:
            raise ValueError(
                f"`initial_train_size` must be an integer greater than "
                f"the `window_size` of the forecaster ({forecaster.window_size}) "
                f"and smaller than the length of `{data_name}` ({data_length}) or "
                f"a date within this range of the index."
            )
    elif initial_train_size is not None:
        if forecaster_name in forecasters_uni:
            index = cv._extract_index(y)
        else:
            index = cv._extract_index(series)

        # Resolve dates (or validate integers) against the data index.
        initial_train_size = date_to_index_position(
            index=index,
            date_input=initial_train_size,
            method="validation",
            date_literal="initial_train_size",
        )
        if (
            initial_train_size < forecaster.window_size
            or initial_train_size >= data_length
        ):
            raise ValueError(
                f"If `initial_train_size` is an integer, it must be greater than "
                f"the `window_size` of the forecaster ({forecaster.window_size}) "
                f"and smaller than the length of `{data_name}` ({data_length}). If "
                f"it is a date, it must be within this range of the index."
            )
        if allow_incomplete_fold:
            # At least one observation after the gap to allow incomplete fold
            if data_length <= initial_train_size + gap:
                raise ValueError(
                    f"`{data_name}` must have more than `initial_train_size + gap` "
                    f"observations to create at least one fold.\n"
                    f"    Time series length: {data_length}\n"
                    f"    Required > {initial_train_size + gap}\n"
                    f"    initial_train_size: {initial_train_size}\n"
                    f"    gap: {gap}\n"
                )
        else:
            # At least one complete fold
            if data_length < initial_train_size + gap + steps:
                raise ValueError(
                    f"`{data_name}` must have at least `initial_train_size + gap + steps` "
                    f"observations to create a minimum of one complete fold "
                    f"(allow_incomplete_fold=False).\n"
                    f"    Time series length: {data_length}\n"
                    f"    Required >= {initial_train_size + gap + steps}\n"
                    f"    initial_train_size: {initial_train_size}\n"
                    f"    gap: {gap}\n"
                    f"    steps: {steps}\n"
                )
    else:
        # No initial_train_size: only valid for already-fitted forecasters.
        if forecaster_name in ["ForecasterStats", "ForecasterEquivalentDate"]:
            raise ValueError(
                f"When using {forecaster_name}, `initial_train_size` must be an "
                f"integer smaller than the length of `{data_name}` ({data_length})."
            )
        else:
            if not forecaster.is_fitted:
                raise NotFittedError(
                    "`forecaster` must be already trained if no `initial_train_size` "
                    "is provided."
                )
            if refit:
                raise ValueError(
                    "`refit` is only allowed when `initial_train_size` is not `None`."
                )

    if forecaster_name == "ForecasterStats" and cv.skip_folds is not None:
        raise ValueError(
            "`skip_folds` is not allowed for ForecasterStats. Set it to `None`."
        )

    # Simple type checks for flags and numeric arguments.
    if not isinstance(add_aggregated_metric, bool):
        raise TypeError("`add_aggregated_metric` must be a boolean: `True`, `False`.")
    if not isinstance(n_boot, (int, np.integer)) or n_boot < 0:
        # Message matches the condition: negative values are rejected, 0 is allowed.
        raise TypeError(
            f"`n_boot` must be an integer equal to or greater than 0. Got {n_boot}."
        )
    if not isinstance(use_in_sample_residuals, bool):
        raise TypeError("`use_in_sample_residuals` must be a boolean: `True`, `False`.")
    if not isinstance(use_binned_residuals, bool):
        raise TypeError("`use_binned_residuals` must be a boolean: `True`, `False`.")
    if not isinstance(random_state, (int, np.integer)) or random_state < 0:
        # Message matches the condition: 0 is a valid seed.
        raise TypeError(
            f"`random_state` must be an integer equal to or greater than 0. "
            f"Got {random_state}."
        )
    if not isinstance(return_predictors, bool):
        raise TypeError("`return_predictors` must be a boolean: `True`, `False`.")
    if not isinstance(freeze_params, bool):
        raise TypeError("`freeze_params` must be a boolean: `True`, `False`.")
    if not isinstance(n_jobs, int) and n_jobs != "auto":
        raise TypeError(f"`n_jobs` must be an integer or `'auto'`. Got {n_jobs}.")
    if not isinstance(show_progress, bool):
        raise TypeError("`show_progress` must be a boolean: `True`, `False`.")
    if not isinstance(suppress_warnings, bool):
        raise TypeError("`suppress_warnings` must be a boolean: `True`, `False`.")

    # Probabilistic-prediction arguments.
    if interval is not None or alpha is not None:

        if forecaster_name in forecasters_boot_conformal:

            if interval_method == "conformal":
                if not isinstance(interval, (float, list, tuple)):
                    raise TypeError(
                        f"When `interval_method` is 'conformal', `interval` must "
                        f"be a float or a list/tuple defining a symmetric interval. "
                        f"Got {type(interval)}."
                    )
            elif interval_method == "bootstrapping":
                if not isinstance(interval, (float, list, tuple, str)) and (
                    not hasattr(interval, "_pdf")
                    or not callable(getattr(interval, "fit", None))
                ):
                    raise TypeError(
                        f"When `interval_method` is 'bootstrapping', `interval` "
                        f"must be a float, a list or tuple of floats, a "
                        f"scipy.stats distribution object (with methods `_pdf` and "
                        f"`fit`) or the string 'bootstrapping'. Got {type(interval)}."
                    )
                if isinstance(interval, (list, tuple)):
                    for i in interval:
                        if not isinstance(i, (int, float)):
                            raise TypeError(
                                f"`interval` must be a list or tuple of floats. "
                                f"Got {type(i)} in {interval}."
                            )
                    if len(interval) == 2:
                        # A pair is interpreted as [lower, upper] percentiles.
                        check_interval(interval=interval)
                    else:
                        for q in interval:
                            if (q < 0.0) or (q > 100.0):
                                raise ValueError(
                                    "When `interval` is a list or tuple, all values must be "
                                    "between 0 and 100 inclusive."
                                )
                elif isinstance(interval, str):
                    if interval != "bootstrapping":
                        # Fixed: space added between sentences in the message.
                        raise ValueError(
                            f"When `interval` is a string, it must be 'bootstrapping'. "
                            f"Got {interval}."
                        )
            else:
                raise ValueError(
                    f"`interval_method` must be 'bootstrapping' or 'conformal'. "
                    f"Got {interval_method}."
                )
        else:
            if forecaster_name == "ForecasterRecursiveClassifier":
                raise ValueError(
                    f"`interval` is not supported for {forecaster_name}. Class "
                    f"probabilities are returned by default during backtesting, "
                    f"set `interval=None`."
                )
            check_interval(interval=interval, alpha=alpha)

    if return_predictors and forecaster_name not in forecasters_return_predictors:
        raise ValueError(
            f"`return_predictors` is only allowed for forecasters of type "
            f"{forecasters_return_predictors}. Got {forecaster_name}."
        )

    # Direct forecasters cannot predict beyond the steps they were built for.
    if forecaster_name in forecasters_direct and forecaster.max_step < steps + gap:
        raise ValueError(
            f"When using a {forecaster_name}, the combination of steps "
            f"+ gap ({steps + gap}) cannot be greater than the `steps` parameter "
            f"declared when the forecaster is initialized ({forecaster.max_step})."
        )
466
+
467
+
468
def check_one_step_ahead_input(
    forecaster: object,
    cv: object,
    metric: str | Callable | list[str | Callable],
    y: pd.Series | None = None,
    series: pd.DataFrame | dict[str, pd.Series | pd.DataFrame] = None,
    exog: pd.Series | pd.DataFrame | dict[str, pd.Series | pd.DataFrame] | None = None,
    show_progress: bool = True,
    suppress_warnings: bool = False,
) -> None:
    """
    This is a helper function to check most inputs of hyperparameter tuning
    functions in modules `model_selection` when using a `OneStepAheadFold`.

    Args:
        forecaster: Forecaster model.
        cv: OneStepAheadFold object with the information needed to split the data into folds.
        metric: Metric used to quantify the goodness of fit of the model.
        y: Training time series for uni-series forecasters.
        series: Training time series for multi-series forecasters.
        exog: Exogenous variables.
        show_progress: Whether to show a progress bar.
        suppress_warnings: If `True`, spotforecast warnings will be suppressed during the hyperparameter
            search.

    Returns:
        None

    Examples:
        >>> import pandas as pd
        >>> from spotforecast2.model_selection.utils_common import check_one_step_ahead_input
        >>> from spotforecast2.forecaster.recursive import ForecasterRecursive
        >>> from spotforecast2.model_selection import OneStepAheadFold
        >>> from sklearn.linear_model import LinearRegression
        >>> from sklearn.metrics import mean_squared_error
        >>> y = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        >>> forecaster = ForecasterRecursive(LinearRegression(), lags=2)
        >>> cv = OneStepAheadFold(
        ...     initial_train_size=5,
        ...     return_all_predictions=False
        ... )
        >>> check_one_step_ahead_input(
        ...     forecaster=forecaster,
        ...     cv=cv,
        ...     metric=mean_squared_error,
        ...     y=y
        ... )
    """

    forecaster_name = type(forecaster).__name__
    cv_name = type(cv).__name__

    # Only OneStepAheadFold splitters are valid for one-step-ahead tuning.
    if cv_name != "OneStepAheadFold":
        raise TypeError(f"`cv` must be a 'OneStepAheadFold' object. Got '{cv_name}'.")

    initial_train_size = cv.initial_train_size

    # Forecaster types that support one-step-ahead evaluation.
    forecasters_one_step_ahead = [
        "ForecasterRecursive",
        "ForecasterDirect",
        "ForecasterRecursiveClassifier",
        "ForecasterRecursiveMultiSeries",
        "ForecasterDirectMultiVariate",
    ]
    if forecaster_name not in forecasters_one_step_ahead:
        raise TypeError(
            f"Only forecasters of type {forecasters_one_step_ahead} are allowed "
            f"when using `cv` of type `OneStepAheadFold`. Got {forecaster_name}."
        )

    # Families used to decide whether `y` or `series` carries the data.
    forecasters_uni = [
        "ForecasterRecursive",
        "ForecasterDirect",
        "ForecasterRecursiveClassifier",
    ]
    forecasters_multi_no_dict = [
        "ForecasterDirectMultiVariate",
    ]
    forecasters_multi_dict = ["ForecasterRecursiveMultiSeries"]

    if forecaster_name in forecasters_uni:
        if not isinstance(y, pd.Series):
            raise TypeError(f"`y` must be a pandas Series. Got {type(y)}")
        data_name = "y"
        data_length = len(y)

    elif forecaster_name in forecasters_multi_no_dict:
        if not isinstance(series, pd.DataFrame):
            raise TypeError(f"`series` must be a pandas DataFrame. Got {type(series)}")
        data_name = "series"
        data_length = len(series)

    elif forecaster_name in forecasters_multi_dict:

        # NOTE: Checks are not needed as they are done in the function
        # `check_preprocess_series` that is used before `check_one_step_ahead_input`
        # in the backtesting function.

        data_name = "series"
        # Longest series determines the usable data length.
        data_length = max([len(series[serie]) for serie in series])

    if exog is not None:
        if forecaster_name in forecasters_multi_dict:
            # NOTE: Checks are not needed as they are done in the function
            # `check_preprocess_exog_multiseries` that is used before
            # `check_one_step_ahead_input` in the backtesting function.
            pass
        else:
            if not isinstance(exog, (pd.Series, pd.DataFrame)):
                raise TypeError(
                    f"`exog` must be a pandas Series, DataFrame or None. Got {type(exog)}."
                )

    # The forecaster and the cv must agree on the differentiation applied.
    if hasattr(forecaster, "differentiation"):
        if forecaster.differentiation_max != cv.differentiation:
            if forecaster_name == "ForecasterRecursiveMultiSeries" and isinstance(
                forecaster.differentiation, dict
            ):
                raise ValueError(
                    f"When using a dict as `differentiation` in ForecasterRecursiveMultiSeries, "
                    f"the `differentiation` included in the cv ({cv.differentiation}) must be "
                    f"the same as the maximum `differentiation` included in the forecaster "
                    f"({forecaster.differentiation_max}). Set the same value "
                    f"for both using the `differentiation` argument."
                )
            else:
                raise ValueError(
                    f"The differentiation included in the forecaster "
                    f"({forecaster.differentiation_max}) differs from the differentiation "
                    f"included in the cv ({cv.differentiation}). Set the same value "
                    f"for both using the `differentiation` argument."
                )

    if not isinstance(metric, (str, Callable, list)):
        raise TypeError(
            f"`metric` must be a string, a callable function, or a list containing "
            f"multiple strings and/or callables. Got {type(metric)}."
        )

    if forecaster_name in forecasters_uni:
        index = cv._extract_index(y)
    else:
        index = cv._extract_index(series)

    # Resolve a date-typed initial_train_size (or validate an integer one)
    # against the data index.
    initial_train_size = date_to_index_position(
        index=index,
        date_input=initial_train_size,
        method="validation",
        date_literal="initial_train_size",
    )
    if initial_train_size < forecaster.window_size or initial_train_size >= data_length:
        raise ValueError(
            f"If `initial_train_size` is an integer, it must be greater than "
            f"the `window_size` of the forecaster ({forecaster.window_size}) "
            f"and smaller than the length of `{data_name}` ({data_length}). If "
            f"it is a date, it must be within this range of the index."
        )

    if not isinstance(show_progress, bool):
        raise TypeError("`show_progress` must be a boolean: `True`, `False`.")
    if not isinstance(suppress_warnings, bool):
        raise TypeError("`suppress_warnings` must be a boolean: `True`, `False`.")

    # Warn that one-step-ahead metrics can over-estimate multi-step performance.
    if not suppress_warnings:
        warnings.warn(
            "One-step-ahead predictions are used for faster model comparison, but they "
            "may not fully represent multi-step prediction performance. It is recommended "
            "to backtest the final model for a more accurate multi-step performance "
            "estimate.",
            OneStepAheadValidationWarning,
        )
639
+
640
+
641
def select_n_jobs_backtesting(forecaster: object, refit: bool | int) -> int:
    """
    Select the number of jobs to use in the backtesting process. This
    selection is based on heuristics and is not guaranteed to be optimal.

    The number of jobs is chosen as follows:

    - If `refit` is an integer other than 0 or 1 (intermittent refit), then
      `n_jobs = 1`. This is because parallelization doesn't work with
      intermittent refit.
    - If forecaster is 'ForecasterStats', then `n_jobs = 1`.
    - If forecaster is 'ForecasterRecursive' or 'ForecasterRecursiveClassifier'
      and the estimator is a linear model, then `n_jobs = 1`.
    - If forecaster is 'ForecasterRecursive' or 'ForecasterRecursiveClassifier'
      and the estimator is a `LGBMRegressor`/`LGBMClassifier`, then
      `n_jobs = cpu_count() - 1` if the estimator's internal `n_jobs` is 1,
      otherwise `n_jobs = 1`. This is because `lightgbm` is highly optimized
      for gradient boosting and parallelizes operations at a very fine-grained
      level, making additional parallelization unnecessary and potentially
      harmful due to resource contention.
    - If forecaster is 'ForecasterRecursive' or 'ForecasterRecursiveClassifier'
      with any other estimator, then `n_jobs = cpu_count() - 1`.
    - If forecaster is 'ForecasterDirect' or 'ForecasterDirectMultiVariate',
      then `n_jobs = 1`, since parallelization is already applied during the
      fitting process.
    - If forecaster is 'ForecasterRecursiveMultiSeries', the `LGBMRegressor`
      rule above applies; with any other estimator, `n_jobs = cpu_count() - 1`.
    - If forecaster is 'ForecasterEquivalentDate' or any other type,
      then `n_jobs = 1`.

    Args:
        forecaster: Forecaster model.
        refit: If the forecaster is refitted during the backtesting process.

    Returns:
        int: The number of jobs to run in parallel.

    Examples:
        >>> from spotforecast2.model_selection.utils_common import select_n_jobs_backtesting
        >>> from spotforecast2.forecaster.recursive import ForecasterRecursive
        >>> from sklearn.linear_model import LinearRegression
        >>> forecaster = ForecasterRecursive(LinearRegression(), lags=2)
        >>> select_n_jobs_backtesting(forecaster, refit=True)
        1
    """

    forecaster_name = type(forecaster).__name__

    # ForecasterStats always runs with a single job (checked before touching
    # `forecaster.estimator`, which it may not expose).
    if forecaster_name == "ForecasterStats":
        n_jobs = 1
        return n_jobs

    # When the estimator is a Pipeline, the heuristics apply to its final step.
    if isinstance(forecaster.estimator, Pipeline):
        estimator = forecaster.estimator[-1]
    else:
        estimator = forecaster.estimator

    # `refit=0` means no refit at all; normalize it to False so the check below
    # only flags genuine intermittent refit (integers > 1).
    refit = False if refit == 0 else refit
    if not isinstance(refit, bool) and refit != 1:
        # Intermittent refit: parallelization is not supported.
        n_jobs = 1
    else:
        if forecaster_name in {"ForecasterRecursive", "ForecasterRecursiveClassifier"}:
            if isinstance(estimator, (LinearModel, LinearClassifierMixin)):
                n_jobs = 1
            elif type(estimator).__name__ in {"LGBMRegressor", "LGBMClassifier"}:
                # LightGBM parallelizes internally; only parallelize folds when
                # its own n_jobs is 1.
                n_jobs = cpu_count() - 1 if estimator.n_jobs == 1 else 1
            else:
                n_jobs = cpu_count() - 1
        elif forecaster_name in {"ForecasterDirect", "ForecasterDirectMultiVariate"}:
            # Parallelization is applied during the fitting process.
            n_jobs = 1
        elif forecaster_name in {"ForecasterRecursiveMultiSeries"}:
            if type(estimator).__name__ == "LGBMRegressor":
                n_jobs = cpu_count() - 1 if estimator.n_jobs == 1 else 1
            else:
                n_jobs = cpu_count() - 1
        elif forecaster_name in {"ForecasterEquivalentDate"}:
            n_jobs = 1
        else:
            n_jobs = 1

    return n_jobs