spotforecast2 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. spotforecast2/.DS_Store +0 -0
  2. spotforecast2/__init__.py +2 -0
  3. spotforecast2/data/__init__.py +0 -0
  4. spotforecast2/data/data.py +130 -0
  5. spotforecast2/data/fetch_data.py +209 -0
  6. spotforecast2/exceptions.py +681 -0
  7. spotforecast2/forecaster/.DS_Store +0 -0
  8. spotforecast2/forecaster/__init__.py +7 -0
  9. spotforecast2/forecaster/base.py +448 -0
  10. spotforecast2/forecaster/metrics.py +527 -0
  11. spotforecast2/forecaster/recursive/__init__.py +4 -0
  12. spotforecast2/forecaster/recursive/_forecaster_equivalent_date.py +1075 -0
  13. spotforecast2/forecaster/recursive/_forecaster_recursive.py +939 -0
  14. spotforecast2/forecaster/recursive/_warnings.py +15 -0
  15. spotforecast2/forecaster/utils.py +954 -0
  16. spotforecast2/model_selection/__init__.py +5 -0
  17. spotforecast2/model_selection/bayesian_search.py +453 -0
  18. spotforecast2/model_selection/grid_search.py +314 -0
  19. spotforecast2/model_selection/random_search.py +151 -0
  20. spotforecast2/model_selection/split_base.py +357 -0
  21. spotforecast2/model_selection/split_one_step.py +245 -0
  22. spotforecast2/model_selection/split_ts_cv.py +634 -0
  23. spotforecast2/model_selection/utils_common.py +718 -0
  24. spotforecast2/model_selection/utils_metrics.py +103 -0
  25. spotforecast2/model_selection/validation.py +685 -0
  26. spotforecast2/preprocessing/__init__.py +30 -0
  27. spotforecast2/preprocessing/_binner.py +378 -0
  28. spotforecast2/preprocessing/_common.py +123 -0
  29. spotforecast2/preprocessing/_differentiator.py +123 -0
  30. spotforecast2/preprocessing/_rolling.py +136 -0
  31. spotforecast2/preprocessing/curate_data.py +254 -0
  32. spotforecast2/preprocessing/imputation.py +92 -0
  33. spotforecast2/preprocessing/outlier.py +114 -0
  34. spotforecast2/preprocessing/split.py +139 -0
  35. spotforecast2/py.typed +0 -0
  36. spotforecast2/utils/__init__.py +43 -0
  37. spotforecast2/utils/convert_to_utc.py +44 -0
  38. spotforecast2/utils/data_transform.py +208 -0
  39. spotforecast2/utils/forecaster_config.py +344 -0
  40. spotforecast2/utils/generate_holiday.py +70 -0
  41. spotforecast2/utils/validation.py +569 -0
  42. spotforecast2/weather/__init__.py +0 -0
  43. spotforecast2/weather/weather_client.py +288 -0
  44. spotforecast2-0.0.1.dist-info/METADATA +47 -0
  45. spotforecast2-0.0.1.dist-info/RECORD +46 -0
  46. spotforecast2-0.0.1.dist-info/WHEEL +4 -0
spotforecast2/model_selection/grid_search.py
@@ -0,0 +1,314 @@
+ from __future__ import annotations
+ import os
+ import numpy as np
+ import warnings
+ from typing import Callable
+ from copy import deepcopy
+ import pandas as pd
+ from joblib import cpu_count
+ from tqdm.auto import tqdm
+ from sklearn.model_selection import ParameterGrid
+
+ from spotforecast2.exceptions import (
+     IgnoredArgumentWarning,
+ )
+ from spotforecast2.model_selection.split_ts_cv import TimeSeriesFold
+ from spotforecast2.model_selection.split_one_step import OneStepAheadFold
+ from spotforecast2.model_selection.utils_common import (
+     initialize_lags_grid,
+     check_backtesting_input,
+     check_one_step_ahead_input,
+     select_n_jobs_backtesting,
+ )
+ from spotforecast2.forecaster.metrics import add_y_train_argument, _get_metric
+ from spotforecast2.model_selection.utils_metrics import (
+     _calculate_metrics_one_step_ahead,
+ )
+ from spotforecast2.model_selection.validation import _backtesting_forecaster
+ from spotforecast2.forecaster.utils import set_skforecast_warnings
+
+
+ def _evaluate_grid_hyperparameters(
+     forecaster: object,
+     y: pd.Series,
+     cv: TimeSeriesFold | OneStepAheadFold,
+     param_grid: dict[str, object],
+     metric: str | Callable | list[str | Callable],
+     exog: pd.Series | pd.DataFrame | None = None,
+     lags_grid: (
+         list[int | list[int] | np.ndarray[int] | range[int]]
+         | dict[str, list[int | list[int] | np.ndarray[int] | range[int]]]
+         | None
+     ) = None,
+     return_best: bool = True,
+     n_jobs: int | str = "auto",
+     verbose: bool = False,
+     show_progress: bool = True,
+     suppress_warnings: bool = False,
+     output_file: str | None = None,
+ ) -> pd.DataFrame:
+     """
+     Evaluate combinations of hyperparameters and lags for a given forecaster.
+     """
+
+     set_skforecast_warnings(suppress_warnings, action="ignore")
+
+     # Run the search on a copy so that trying parameter combinations does not
+     # modify the forecaster passed by the user; the original is only refitted
+     # at the end when `return_best=True`.
+     forecaster_search = deepcopy(forecaster)
+     is_regression = (
+         forecaster_search.__spotforecast_tags__["forecaster_task"] == "regression"
+     )
+
+     if isinstance(cv, TimeSeriesFold):
+         check_backtesting_input(
+             forecaster=forecaster,
+             cv=cv,
+             y=y,
+             metric=metric,
+             exog=exog,
+             n_jobs=n_jobs,
+             show_progress=show_progress,
+             suppress_warnings=suppress_warnings,
+         )
+     else:
+         # OneStepAheadFold
+         check_one_step_ahead_input(
+             forecaster=forecaster,
+             cv=cv,
+             y=y,
+             metric=metric,
+             exog=exog,
+             show_progress=show_progress,
+             suppress_warnings=suppress_warnings,
+         )
+         # `OneStepAheadFold` is only used here for its `initial_train_size`;
+         # the actual one-step-ahead split is performed per lags configuration
+         # further below. Resolving date-like `initial_train_size` values is
+         # left to the fold object itself.
+
+     if not isinstance(metric, list):
+         metric = [metric]
+     metric = [
+         _get_metric(metric=m) if isinstance(m, str) else add_y_train_argument(m)
+         for m in metric
+     ]
+     metric_dict = {(m if isinstance(m, str) else m.__name__): [] for m in metric}
+
+     if len(metric_dict) != len(metric):
+         raise ValueError("When `metric` is a `list`, each metric name must be unique.")
+
+     lags_grid, lags_label = initialize_lags_grid(forecaster, lags_grid)
+     cv = deepcopy(cv)
+
+     if n_jobs == "auto":
+         refit = cv.refit if isinstance(cv, TimeSeriesFold) else False
+         n_jobs = select_n_jobs_backtesting(forecaster=forecaster, refit=refit)
+     elif isinstance(cv, TimeSeriesFold) and cv.refit != 1 and n_jobs != 1:
+         warnings.warn(
+             "If `refit` is an integer other than 1 (intermittent refit), `n_jobs` "
+             "is set to 1 to avoid unexpected results during parallelization.",
+             IgnoredArgumentWarning,
+         )
+         n_jobs = 1
+     else:
+         n_jobs = n_jobs if n_jobs > 0 else cpu_count()
+
+     print(
+         f"Number of models compared: {len(param_grid) * len(lags_grid)}. "
+         "Training models..."
+     )
+
+     if show_progress:
+         lags_grid_tqdm = tqdm(lags_grid.items(), desc="Lags grid", position=0)
+     else:
+         lags_grid_tqdm = lags_grid.items()
+
+     if output_file is not None and os.path.isfile(output_file):
+         os.remove(output_file)
+
+     lags_list = []
+     lags_label_list = []
+     params_list = []
+
+     for lags_k, lags_v in lags_grid_tqdm:
+
+         forecaster_search.set_lags(lags_v)
+         lags_v = forecaster_search.lags.copy()
+         if lags_label == "values":
+             lags_k = lags_v
+
+         # The one-step-ahead split depends only on the lags configuration, not
+         # on the regressor hyperparameters, so it is computed once per lags
+         # value before looping over the parameter grid.
+         if isinstance(cv, OneStepAheadFold):
+             X_train, y_train, X_test, y_test = (
+                 forecaster_search._train_test_split_one_step_ahead(
+                     y=y, initial_train_size=cv.initial_train_size, exog=exog
+                 )
+             )
+
+         if show_progress:
+             param_grid_tqdm = tqdm(
+                 param_grid, desc="Parameters grid", position=1, leave=False
+             )
+         else:
+             param_grid_tqdm = param_grid
+
+         for params in param_grid_tqdm:
+             try:
+                 forecaster_search.set_params(**params)
+
+                 if isinstance(cv, TimeSeriesFold):
+                     metric_values = _backtesting_forecaster(
+                         forecaster=forecaster_search,
+                         y=y,
+                         cv=cv,
+                         metric=metric,
+                         exog=exog,
+                         n_jobs=n_jobs,
+                         verbose=verbose,
+                         show_progress=False,
+                         suppress_warnings=suppress_warnings,
+                     )[0]
+                     # `metric_values` is a DataFrame; keep the values of its first row as a list.
+                     metric_values = metric_values.iloc[0, :].to_list()
+                 else:
+                     # One-step-ahead evaluation
+                     metric_values = _calculate_metrics_one_step_ahead(
+                         forecaster=forecaster_search,
+                         metrics=metric,
+                         X_train=X_train,
+                         y_train=y_train,
+                         X_test=X_test,
+                         y_test=y_test,
+                     )
+             except Exception as e:
+                 warnings.warn(f"Parameters skipped: {params}. {e}", RuntimeWarning)
+                 continue
+
+             # Silence repeated "The forecaster will be fit" warnings for the
+             # remaining iterations.
+             warnings.filterwarnings(
+                 "ignore",
+                 category=RuntimeWarning,
+                 message="The forecaster will be fit.*",
+             )
+
+             lags_list.append(lags_v)
+             lags_label_list.append(lags_k)
+             params_list.append(params)
+             for m, m_value in zip(metric, metric_values):
+                 m_name = m if isinstance(m, str) else m.__name__
+                 metric_dict[m_name].append(m_value)
+
+             if output_file is not None:
+                 header = [
+                     "lags",
+                     "lags_label",
+                     "params",
+                     *metric_dict.keys(),
+                     *params.keys(),
+                 ]
+                 row = [lags_v, lags_k, params, *metric_values, *params.values()]
+                 if not os.path.isfile(output_file):
+                     with open(output_file, "w", newline="") as f:
+                         f.write("\t".join(header) + "\n")
+                         f.write("\t".join([str(r) for r in row]) + "\n")
+                 else:
+                     with open(output_file, "a", newline="") as f:
+                         f.write("\t".join([str(r) for r in row]) + "\n")
+
+     results = pd.DataFrame(
+         {
+             "lags": lags_list,
+             "lags_label": lags_label_list,
+             "params": params_list,
+             **metric_dict,
+         }
+     )
+
+     if results.empty:
+         warnings.warn(
+             "All models failed to train. Check the parameters and data.",
+             RuntimeWarning,
+         )
+         return results
+
+     # For regression tasks lower metric values are better, so sort ascending.
+     results = results.sort_values(
+         by=list(metric_dict.keys())[0], ascending=is_regression
+     ).reset_index(drop=True)
+     results = pd.concat([results, results["params"].apply(pd.Series)], axis=1)
+
+     if return_best:
+         best_lags = results.loc[0, "lags"]
+         best_params = results.loc[0, "params"]
+         best_metric = results.loc[0, list(metric_dict.keys())[0]]
+
+         # NOTE: the forecaster passed by the user (not the search copy) is refitted here.
+         forecaster.set_lags(best_lags)
+         forecaster.set_params(**best_params)
+
+         forecaster.fit(y=y, exog=exog, store_in_sample_residuals=True)
+
+         print(
+             f"`Forecaster` refitted using the best-found lags and parameters, "
+             f"and the whole data set: \n"
+             f" Lags: {best_lags} \n"
+             f" Parameters: {best_params}\n"
+             f" {'Backtesting' if isinstance(cv, TimeSeriesFold) else 'One-step-ahead'} "
+             f"metric: {best_metric}"
+         )
+
+     set_skforecast_warnings(suppress_warnings, action="default")
+
+     return results
+
+
+ def grid_search_forecaster(
+     forecaster: object,
+     y: pd.Series,
+     cv: TimeSeriesFold | OneStepAheadFold,
+     param_grid: dict,
+     metric: str | Callable | list[str | Callable],
+     exog: pd.Series | pd.DataFrame | None = None,
+     lags_grid: (
+         list[int | list[int] | np.ndarray[int] | range[int]]
+         | dict[str, list[int | list[int] | np.ndarray[int] | range[int]]]
+         | None
+     ) = None,
+     return_best: bool = True,
+     n_jobs: int | str = "auto",
+     verbose: bool = False,
+     show_progress: bool = True,
+     suppress_warnings: bool = False,
+     output_file: str | None = None,
+ ) -> pd.DataFrame:
+     """Exhaustive grid search over parameter values for a Forecaster.
+
+     Every combination of the lags in `lags_grid` and the parameters in
+     `param_grid` is evaluated with the provided cross-validation strategy
+     (time series backtesting or one-step-ahead evaluation), and the results
+     are returned as a pandas DataFrame, one row per evaluated combination.
+     """
+
+     param_grid = list(ParameterGrid(param_grid))
+
+     results = _evaluate_grid_hyperparameters(
+         forecaster=forecaster,
+         y=y,
+         cv=cv,
+         param_grid=param_grid,
+         metric=metric,
+         exog=exog,
+         lags_grid=lags_grid,
+         return_best=return_best,
+         n_jobs=n_jobs,
+         verbose=verbose,
+         show_progress=show_progress,
+         suppress_warnings=suppress_warnings,
+         output_file=output_file,
+     )
+
+     return results
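
For reference, a minimal usage sketch of `grid_search_forecaster`, mirroring the doctest shipped in `random_search.py` below (the `ForecasterRecursive`/`TimeSeriesFold` setup and the `estimator__alpha` parameter name are taken from that example; exact metric values depend on the data):

>>> import numpy as np
>>> import pandas as pd
>>> from sklearn.linear_model import Ridge
>>> from spotforecast2.forecaster.recursive import ForecasterRecursive
>>> from spotforecast2.model_selection import TimeSeriesFold
>>> from spotforecast2.model_selection.grid_search import grid_search_forecaster
>>>
>>> np.random.seed(123)
>>> y = pd.Series(np.random.randn(50), name='y')
>>> forecaster = ForecasterRecursive(estimator=Ridge(), lags=3)
>>> cv = TimeSeriesFold(steps=3, initial_train_size=20, refit=False)
>>> results = grid_search_forecaster(
...     forecaster=forecaster,
...     y=y,
...     cv=cv,
...     param_grid={'estimator__alpha': [0.01, 0.1, 1.0]},
...     lags_grid=[3, 5],
...     metric='mean_squared_error',
...     return_best=False,
...     show_progress=False
... )
>>> results.shape[0]  # 2 lags configurations x 3 alpha values, assuming all fits succeed
6

With `return_best=True` (the default), the forecaster passed in is refitted on the whole series using the best lags and parameters found.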
spotforecast2/model_selection/random_search.py
@@ -0,0 +1,151 @@
+ """Random search hyperparameter optimization for forecasters."""
+
+ from __future__ import annotations
+ from typing import Callable
+ import numpy as np
+ import pandas as pd
+ from sklearn.model_selection import ParameterSampler
+ from spotforecast2.model_selection.split_ts_cv import TimeSeriesFold
+ from spotforecast2.model_selection.split_one_step import OneStepAheadFold
+ from spotforecast2.model_selection.grid_search import (
+     _evaluate_grid_hyperparameters,
+ )
+
+
+ def random_search_forecaster(
+     forecaster: object,
+     y: pd.Series,
+     cv: TimeSeriesFold | OneStepAheadFold,
+     param_distributions: dict,
+     metric: str | Callable | list[str | Callable],
+     exog: pd.Series | pd.DataFrame | None = None,
+     lags_grid: (
+         list[int | list[int] | np.ndarray[int] | range[int]]
+         | dict[str, list[int | list[int] | np.ndarray[int] | range[int]]]
+         | None
+     ) = None,
+     n_iter: int = 10,
+     random_state: int = 123,
+     return_best: bool = True,
+     n_jobs: int | str = "auto",
+     verbose: bool = False,
+     show_progress: bool = True,
+     suppress_warnings: bool = False,
+     output_file: str | None = None,
+ ) -> pd.DataFrame:
+     """Random search over parameter distributions for a Forecaster.
+
+     Samples parameter settings at random from the given distributions and
+     evaluates each sample with the provided cross-validation strategy (time
+     series backtesting or one-step-ahead evaluation). This is more efficient
+     than an exhaustive grid search when exploring large parameter spaces.
+
+     Args:
+         forecaster: Forecaster model (ForecasterRecursive or ForecasterDirect).
+         y: Training time series.
+         cv: Cross-validation strategy (TimeSeriesFold or OneStepAheadFold)
+             with information needed to split the data into folds.
+         param_distributions: Dictionary with parameter names (str) as keys
+             and distributions or lists of parameters to try as values.
+             Use scipy.stats distributions for continuous parameters.
+         metric: Metric(s) to quantify model goodness of fit. If str:
+             'mean_squared_error', 'mean_absolute_error',
+             'mean_absolute_percentage_error', 'mean_squared_log_error',
+             'mean_absolute_scaled_error', 'root_mean_squared_scaled_error'.
+             If Callable: Function with arguments (y_true, y_pred, y_train)
+             that returns a float. If list: Multiple strings and/or Callables.
+         exog: Exogenous variable(s) included as predictors. Must have the
+             same number of observations as y and aligned so that y[i] is
+             regressed on exog[i]. Default is None.
+         lags_grid: Lists of lags to try. Can be int, lists, numpy ndarray,
+             or range objects. If dict, keys are used as labels in results
+             DataFrame. Default is None.
+         n_iter: Number of parameter settings sampled per lags configuration.
+             Trades off runtime vs solution quality. Default is 10.
+         random_state: Seed for random sampling for reproducible output.
+             Default is 123.
+         return_best: If True, refit the forecaster using best parameters
+             on the whole dataset. Default is True.
+         n_jobs: Number of jobs to run in parallel. If -1, uses all cores.
+             If 'auto', uses select_n_jobs_backtesting. Default is 'auto'.
+         verbose: If True, print number of folds used for cv. Default is False.
+         show_progress: Whether to show a progress bar. Default is True.
+         suppress_warnings: If True, suppress spotforecast warnings during
+             hyperparameter search. Default is False.
+         output_file: Filename or full path to save results as TSV. If None,
+             results are not saved to file. Default is None.
+
+     Returns:
+         Results for each parameter combination with columns: lags (lags
+         configuration), lags_label (descriptive label), params (parameters
+         configuration), metric (metric value), and additional columns with
+         param=value pairs.
+
+     Examples:
+         Basic random search with continuous parameter distributions:
+
+         >>> import pandas as pd
+         >>> import numpy as np
+         >>> from sklearn.linear_model import Ridge
+         >>> from scipy.stats import uniform
+         >>> from spotforecast2.forecaster.recursive import ForecasterRecursive
+         >>> from spotforecast2.model_selection import TimeSeriesFold
+         >>> from spotforecast2.model_selection.random_search import random_search_forecaster
+         >>>
+         >>> # Create sample data
+         >>> np.random.seed(123)
+         >>> y = pd.Series(np.random.randn(50), name='y')
+         >>>
+         >>> # Set up forecaster and cross-validation
+         >>> forecaster = ForecasterRecursive(estimator=Ridge(), lags=3)
+         >>> cv = TimeSeriesFold(steps=3, initial_train_size=20, refit=False)
+         >>>
+         >>> # Define parameter distributions with scipy.stats
+         >>> param_distributions = {
+         ...     'estimator__alpha': uniform(0.1, 10.0)  # Uniform between 0.1 and 10.1
+         ... }
+         >>>
+         >>> # Run random search
+         >>> results = random_search_forecaster(
+         ...     forecaster=forecaster,
+         ...     y=y,
+         ...     cv=cv,
+         ...     param_distributions=param_distributions,
+         ...     metric='mean_squared_error',
+         ...     n_iter=5,
+         ...     random_state=42,
+         ...     return_best=False,
+         ...     verbose=False,
+         ...     show_progress=False
+         ... )
+         >>>
+         >>> # Check results
+         >>> print(results.shape[0])
+         5
+         >>> print('estimator__alpha' in results.columns)
+         True
+         >>> print('mean_squared_error' in results.columns)
+         True
+     """
+
+     # Draw `n_iter` parameter settings and evaluate them with the shared
+     # grid-evaluation helper.
+     param_grid = list(
+         ParameterSampler(param_distributions, n_iter=n_iter, random_state=random_state)
+     )
+
+     results = _evaluate_grid_hyperparameters(
+         forecaster=forecaster,
+         y=y,
+         cv=cv,
+         param_grid=param_grid,
+         metric=metric,
+         exog=exog,
+         lags_grid=lags_grid,
+         return_best=return_best,
+         n_jobs=n_jobs,
+         verbose=verbose,
+         show_progress=show_progress,
+         suppress_warnings=suppress_warnings,
+         output_file=output_file,
+     )
+
+     return results
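
`output_file` results are written as tab-separated text (a header row, then one row per evaluated combination; see the file-writing block in `_evaluate_grid_hyperparameters` above), so a long search can be inspected while it runs. A minimal sketch, assuming the hypothetical file name `search_results.tsv` was passed as `output_file`:

>>> import pandas as pd
>>> partial = pd.read_csv('search_results.tsv', sep='\t')  # hypothetical file written by a running search
>>> partial.columns[:3].tolist()
['lags', 'lags_label', 'params']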