spotforecast2 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spotforecast2/.DS_Store +0 -0
- spotforecast2/__init__.py +2 -0
- spotforecast2/data/__init__.py +0 -0
- spotforecast2/data/data.py +130 -0
- spotforecast2/data/fetch_data.py +209 -0
- spotforecast2/exceptions.py +681 -0
- spotforecast2/forecaster/.DS_Store +0 -0
- spotforecast2/forecaster/__init__.py +7 -0
- spotforecast2/forecaster/base.py +448 -0
- spotforecast2/forecaster/metrics.py +527 -0
- spotforecast2/forecaster/recursive/__init__.py +4 -0
- spotforecast2/forecaster/recursive/_forecaster_equivalent_date.py +1075 -0
- spotforecast2/forecaster/recursive/_forecaster_recursive.py +939 -0
- spotforecast2/forecaster/recursive/_warnings.py +15 -0
- spotforecast2/forecaster/utils.py +954 -0
- spotforecast2/model_selection/__init__.py +5 -0
- spotforecast2/model_selection/bayesian_search.py +453 -0
- spotforecast2/model_selection/grid_search.py +314 -0
- spotforecast2/model_selection/random_search.py +151 -0
- spotforecast2/model_selection/split_base.py +357 -0
- spotforecast2/model_selection/split_one_step.py +245 -0
- spotforecast2/model_selection/split_ts_cv.py +634 -0
- spotforecast2/model_selection/utils_common.py +718 -0
- spotforecast2/model_selection/utils_metrics.py +103 -0
- spotforecast2/model_selection/validation.py +685 -0
- spotforecast2/preprocessing/__init__.py +30 -0
- spotforecast2/preprocessing/_binner.py +378 -0
- spotforecast2/preprocessing/_common.py +123 -0
- spotforecast2/preprocessing/_differentiator.py +123 -0
- spotforecast2/preprocessing/_rolling.py +136 -0
- spotforecast2/preprocessing/curate_data.py +254 -0
- spotforecast2/preprocessing/imputation.py +92 -0
- spotforecast2/preprocessing/outlier.py +114 -0
- spotforecast2/preprocessing/split.py +139 -0
- spotforecast2/py.typed +0 -0
- spotforecast2/utils/__init__.py +43 -0
- spotforecast2/utils/convert_to_utc.py +44 -0
- spotforecast2/utils/data_transform.py +208 -0
- spotforecast2/utils/forecaster_config.py +344 -0
- spotforecast2/utils/generate_holiday.py +70 -0
- spotforecast2/utils/validation.py +569 -0
- spotforecast2/weather/__init__.py +0 -0
- spotforecast2/weather/weather_client.py +288 -0
- spotforecast2-0.0.1.dist-info/METADATA +47 -0
- spotforecast2-0.0.1.dist-info/RECORD +46 -0
- spotforecast2-0.0.1.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Public API of the spotforecast2.preprocessing subpackage.
# Data-curation helpers.
from .curate_data import (
    get_start_end,
    curate_holidays,
    curate_weather,
    basic_ts_checks,
    agg_and_resample_data,
)
# Outlier handling, imputation weighting, and train/val/test splitting.
from .outlier import mark_outliers, manual_outlier_removal
from .imputation import custom_weights, get_missing_weights
from .split import split_abs_train_val_test, split_rel_train_val_test
# Transformer classes (scikit-learn style).
from ._differentiator import TimeSeriesDifferentiator
from ._binner import QuantileBinner
from ._rolling import RollingFeatures

# Names exported by `from spotforecast2.preprocessing import *`.
__all__ = [
    "get_start_end",
    "curate_holidays",
    "curate_weather",
    "basic_ts_checks",
    "agg_and_resample_data",
    "mark_outliers",
    "manual_outlier_removal",
    "custom_weights",
    "get_missing_weights",
    "split_abs_train_val_test",
    "split_rel_train_val_test",
    "TimeSeriesDifferentiator",
    "QuantileBinner",
    "RollingFeatures",
]
|
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
"""
|
|
2
|
+
QuantileBinner class for binning data into quantile-based bins.
|
|
3
|
+
|
|
4
|
+
This module contains the QuantileBinner class which bins data into quantile-based bins
|
|
5
|
+
using numpy.percentile with optimized performance using numpy.searchsorted.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
import warnings
|
|
10
|
+
import numpy as np
|
|
11
|
+
from sklearn.base import BaseEstimator, TransformerMixin
|
|
12
|
+
from sklearn.exceptions import NotFittedError
|
|
13
|
+
|
|
14
|
+
from spotforecast2.exceptions import IgnoredArgumentWarning
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class QuantileBinner(BaseEstimator, TransformerMixin):
    """
    Bin data into quantile-based bins using numpy.percentile.

    This class is similar to sklearn's KBinsDiscretizer but optimized for
    performance using numpy.searchsorted for fast bin assignment. Bin intervals
    are defined following the convention: bins[i-1] <= x < bins[i]. Values
    outside the range are clipped to the first or last bin.

    Args:
        n_bins: The number of quantile-based bins to create. Must be >= 2.
        method: The method used to compute quantiles, passed to numpy.percentile.
            Default is 'linear'. Valid values: "inverse_cdf",
            "averaged_inverse_cdf", "closest_observation",
            "interpolated_inverse_cdf", "hazen", "weibull", "linear",
            "median_unbiased", "normal_unbiased".
        subsample: Maximum number of samples used for computing quantiles. If
            the dataset has more samples, a random draw of this size (with
            replacement) is used. Default 200000.
        dtype: Data type for bin indices. Default is numpy.float64.
        random_state: Random seed for the subsampling draw. Default 789654.

    Attributes:
        n_bins (int): Number of bins requested.
        method (str): Quantile computation method.
        subsample (int): Maximum samples for quantile computation.
        dtype (type): Data type for bin indices.
        random_state (int): Random seed.
        n_bins_ (int): Actual number of bins after fitting (may differ from
            n_bins if duplicate edges are found).
        bin_edges_ (np.ndarray): Edges of the bins learned during fitting.
        internal_edges_ (np.ndarray): Edges without the two outer bounds,
            used by searchsorted for optimized bin assignment.
        intervals_ (dict): Mapping from bin index to (lower, upper) interval bounds.

    Examples:
        >>> import numpy as np
        >>> from spotforecast2.preprocessing import QuantileBinner
        >>>
        >>> X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        >>> binner = QuantileBinner(n_bins=3)
        >>> _ = binner.fit(X)
        >>> print(binner.transform(np.array([1.5, 5.5, 9.5])))
        [0. 1. 2.]
        >>> print(binner.n_bins_)
        3
        >>>
        >>> # One-step fit and transform
        >>> binner2 = QuantileBinner(n_bins=2)
        >>> print(binner2.fit_transform(np.array([10, 20, 30, 40, 50])))
        [0. 0. 1. 1. 1.]
    """

    def __init__(
        self,
        n_bins: int,
        method: str = "linear",
        subsample: int = 200000,
        dtype: type = np.float64,
        random_state: int = 789654,
    ) -> None:

        self._validate_params(n_bins, method, subsample, dtype, random_state)

        self.n_bins = n_bins
        self.method = method
        self.subsample = subsample
        self.dtype = dtype
        self.random_state = random_state
        # Learned attributes; None marks the unfitted state (checked in transform).
        self.n_bins_ = None
        self.bin_edges_ = None
        self.internal_edges_ = None
        self.intervals_ = None

    def _validate_params(
        self, n_bins: int, method: str, subsample: int, dtype: type, random_state: int
    ):
        """
        Validate constructor parameters.

        Args:
            n_bins: Number of quantile-based bins. Must be int >= 2.
            method: Quantile computation method for numpy.percentile.
            subsample: Number of samples for computing quantiles. Must be int >= 1.
            dtype: Data type for bin indices. Must be a valid numpy dtype.
            random_state: Random seed for subset generation. Must be int >= 0.

        Raises:
            ValueError: If n_bins < 2, method is invalid, subsample < 1,
                random_state < 0, or dtype is not a valid type.
        """
        if not isinstance(n_bins, int) or n_bins < 2:
            raise ValueError(f"`n_bins` must be an int greater than 1. Got {n_bins}.")

        valid_methods = [
            "inverse_cdf",
            "averaged_inverse_cdf",
            "closest_observation",
            "interpolated_inverse_cdf",
            "hazen",
            "weibull",
            "linear",
            "median_unbiased",
            "normal_unbiased",
        ]
        if method not in valid_methods:
            raise ValueError(f"`method` must be one of {valid_methods}. Got {method}.")
        if not isinstance(subsample, int) or subsample < 1:
            raise ValueError(
                f"`subsample` must be an integer greater than or equal to 1. "
                f"Got {subsample}."
            )
        if not isinstance(random_state, int) or random_state < 0:
            raise ValueError(
                f"`random_state` must be an integer greater than or equal to 0. "
                f"Got {random_state}."
            )
        if not isinstance(dtype, type):
            raise ValueError(f"`dtype` must be a valid numpy dtype. Got {dtype}.")

    def fit(self, X: np.ndarray, y: object = None) -> object:
        """
        Learn bin edges based on quantiles from training data.

        Computes quantile-based bin edges using numpy.percentile. If the
        dataset contains more samples than `subsample`, a random draw of
        `subsample` points is used. Duplicate edges (which can occur with
        repeated values) are removed automatically.

        Args:
            X: Training data (1D numpy array) for computing quantiles.
            y: Ignored. Present for scikit-learn API compatibility.

        Returns:
            Self for method chaining.

        Raises:
            ValueError: If input data X is empty.
        """
        if X.size == 0:
            raise ValueError("Input data `X` cannot be empty.")
        if len(X) > self.subsample:
            # NOTE(review): `rng.integers` draws indices with replacement,
            # so this is a bootstrap draw rather than a strict subset.
            rng = np.random.default_rng(self.random_state)
            X = X[rng.integers(0, len(X), self.subsample)]

        bin_edges = np.percentile(
            a=X, q=np.linspace(0, 100, self.n_bins + 1), method=self.method
        )

        # Remove duplicate edges (can happen when data has many repeated
        # values) to ensure bins are always numbered 0 to n_bins_-1.
        self.bin_edges_ = np.unique(bin_edges)

        # Degenerate case: all values identical -> one zero-width bin.
        if len(self.bin_edges_) == 1:
            self.bin_edges_ = np.array([self.bin_edges_.item(), self.bin_edges_.item()])

        self.n_bins_ = len(self.bin_edges_) - 1

        if self.n_bins_ != self.n_bins:
            warnings.warn(
                f"The number of bins has been reduced from {self.n_bins} to "
                f"{self.n_bins_} due to duplicated edges caused by repeated predicted "
                f"values.",
                IgnoredArgumentWarning,
            )

        # Dropping the outer bounds means searchsorted maps out-of-range
        # values to the first/last bin, i.e. clipping comes for free.
        self.internal_edges_ = self.bin_edges_[1:-1]
        self.intervals_ = {
            int(i): (float(self.bin_edges_[i]), float(self.bin_edges_[i + 1]))
            for i in range(self.n_bins_)
        }

        return self

    def transform(self, X: np.ndarray, y: object = None) -> np.ndarray:
        """
        Assign new data to the learned bins.

        Uses numpy.searchsorted for efficient bin assignment. Values are
        assigned to bins following the convention: bins[i-1] <= x < bins[i].
        Values outside the fitted range are clipped to the first or last bin.

        Args:
            X: Data to assign to bins (1D numpy array).
            y: Ignored. Present for scikit-learn API compatibility.

        Returns:
            Bin indices as numpy array with the dtype given at construction.

        Raises:
            NotFittedError: If fit() has not been called yet.
        """
        if self.bin_edges_ is None:
            raise NotFittedError(
                "The model has not been fitted yet. Call 'fit' with training data first."
            )

        bin_indices = np.searchsorted(self.internal_edges_, X, side="right").astype(
            self.dtype
        )

        return bin_indices

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fit to data, then transform it.

        Args:
            X: Input samples (1D numpy array).
            y: Ignored. Present for scikit-learn API compatibility.
            **fit_params: Additional fit parameters (ignored by `fit`).

        Returns:
            Bin indices for X as a numpy array.
        """
        self.fit(X, y)
        return self.transform(X, y)

    def get_params(self, deep=True):
        """
        Get parameters of the quantile binner.

        Args:
            deep: Ignored. Present for scikit-learn API compatibility.

        Returns:
            Dictionary containing n_bins, method, subsample, dtype, and
            random_state parameters.
        """
        return {
            "n_bins": self.n_bins,
            "method": self.method,
            "subsample": self.subsample,
            "dtype": self.dtype,
            "random_state": self.random_state,
        }

    def set_params(self, **params):
        """
        Set parameters of the QuantileBinner.

        Args:
            **params: Parameter names and values to set as keyword arguments.

        Returns:
            Self for method chaining, following the scikit-learn convention.

        Raises:
            ValueError: If the resulting parameter values are invalid
                (same constraints as `__init__`).
        """
        for param, value in params.items():
            setattr(self, param, value)
        # Re-validate so set_params enforces the same constraints as
        # __init__; previously invalid values were accepted silently.
        self._validate_params(
            self.n_bins, self.method, self.subsample, self.dtype, self.random_state
        )
        return self
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Common preprocessing functions and utilities.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import functools
|
|
6
|
+
from typing import Callable, Any
|
|
7
|
+
import numpy as np
|
|
8
|
+
from numba import njit
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _check_X_numpy_ndarray_1d(ensure_1d: bool = True):
|
|
12
|
+
"""
|
|
13
|
+
Decorator to check if argument `X` is a 1D numpy ndarray.
|
|
14
|
+
|
|
15
|
+
Args:
|
|
16
|
+
ensure_1d : bool, default True
|
|
17
|
+
If True, ensure X is a 1D array.
|
|
18
|
+
|
|
19
|
+
Returns:
|
|
20
|
+
wrapper : Callable
|
|
21
|
+
Decorated function.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
def decorator(func: Callable):
|
|
25
|
+
@functools.wraps(func)
|
|
26
|
+
def wrapper(self, *args, **kwargs):
|
|
27
|
+
# args[0] is self, args[1] is X (if passed positional)
|
|
28
|
+
# kwargs might contain X
|
|
29
|
+
X = kwargs.get("X")
|
|
30
|
+
if X is None and len(args) > 0:
|
|
31
|
+
X = args[0]
|
|
32
|
+
|
|
33
|
+
if X is not None:
|
|
34
|
+
if not isinstance(X, np.ndarray):
|
|
35
|
+
raise TypeError(f"`X` must be a numpy ndarray. Got {type(X)}.")
|
|
36
|
+
if ensure_1d and X.ndim != 1:
|
|
37
|
+
raise ValueError(f"`X` must be a 1D numpy ndarray. Got {X.ndim}D.")
|
|
38
|
+
|
|
39
|
+
return func(self, *args, **kwargs)
|
|
40
|
+
|
|
41
|
+
return wrapper
|
|
42
|
+
|
|
43
|
+
return decorator
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@njit(cache=True)
def _np_mean_jit(x: np.ndarray) -> float:
    """
    Numba-compiled mean of `x`, ignoring NaNs (wraps numpy.nanmean).

    Compiled lazily on first call; ``cache=True`` persists the compiled
    function to disk across processes.
    """
    return np.nanmean(x)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@njit(cache=True)
def _np_std_jit(x: np.ndarray) -> float:
    """
    Numba-compiled standard deviation of `x`, ignoring NaNs.

    Note: ``np.nanstd`` is called with its default ``ddof=0`` (population
    standard deviation) — the previous docstring incorrectly claimed
    ``ddof=1``. Callers needing the sample standard deviation must account
    for this themselves.
    """
    return np.nanstd(x)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@njit(cache=True)
def _np_min_jit(x: np.ndarray) -> float:
    """
    Numba-compiled minimum of `x`, ignoring NaNs (wraps numpy.nanmin).
    """
    return np.nanmin(x)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@njit(cache=True)
def _np_max_jit(x: np.ndarray) -> float:
    """
    Numba-compiled maximum of `x`, ignoring NaNs (wraps numpy.nanmax).
    """
    return np.nanmax(x)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@njit(cache=True)
def _np_sum_jit(x: np.ndarray) -> float:
    """
    Numba-compiled sum of `x`, treating NaNs as zero (wraps numpy.nansum).
    """
    return np.nansum(x)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@njit(cache=True)
def _np_median_jit(x: np.ndarray) -> float:
    """
    Numba-compiled median of `x`, ignoring NaNs (wraps numpy.nanmedian).
    """
    return np.nanmedian(x)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def check_valid_quantile(
    quantile: float | int | list[float] | tuple[float, ...] | np.ndarray,
) -> None:
    """
    Validate that a quantile (or collection of quantiles) lies in [0, 1].

    The annotation now reflects what the code accepts: the original hint
    omitted `int` and `np.ndarray` (both handled below) and used
    ``tuple[float]``, which denotes a 1-tuple rather than any-length tuple.

    Args:
        quantile: A single quantile, or a list/tuple/numpy array of
            quantiles. Every value must satisfy 0 <= q <= 1.

    Raises:
        ValueError: If any quantile is outside the closed interval [0, 1].
        TypeError: If `quantile` is not a float, int, list, tuple or
            numpy array.
    """
    if isinstance(quantile, (float, int)):
        if not (0 <= quantile <= 1):
            raise ValueError(f"Quantile must be between 0 and 1. Got {quantile}.")
    elif isinstance(quantile, (list, tuple, np.ndarray)):
        for q in quantile:
            if not (0 <= q <= 1):
                raise ValueError(f"Quantiles must be between 0 and 1. Got {q}.")
    else:
        raise TypeError(
            f"Quantile must be a float, list, tuple or numpy array. Got {type(quantile)}."
        )
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def check_is_fitted(estimator: Any, attributes: list[str] | None = None) -> None:
    """
    Verify that `estimator` exposes every attribute named in `attributes`.

    This is a lightweight fitted-state check: the absence of any listed
    attribute is interpreted as "fit was never called".

    Args:
        estimator: Object whose attributes are checked.
        attributes: Attribute names that must exist. None (the default)
            is treated as an empty list, so no check is performed.

    Raises:
        ValueError: If any required attribute is missing from `estimator`.
    """
    required = attributes if attributes is not None else []
    if not all(hasattr(estimator, attr) for attr in required):
        raise ValueError(
            f"This {type(estimator).__name__} instance is not fitted yet. "
            f"Call 'fit' with appropriate arguments before using this estimator."
        )
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from sklearn.base import BaseEstimator, TransformerMixin
|
|
3
|
+
from sklearn.utils.validation import check_is_fitted
|
|
4
|
+
from ._common import _check_X_numpy_ndarray_1d
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class TimeSeriesDifferentiator(BaseEstimator, TransformerMixin):
    """
    Transforms a time series into a differenced time series.

    Differencing of order ``n`` applies ``numpy.diff(X, n=n)`` and pads the
    result with ``n`` leading NaNs so the output keeps the input length.
    Only order 1 is currently supported for the inverse operations; higher
    orders raise NotImplementedError there.

    Args:
        order (int, optional): Order of differentiation. Defaults to 1.
        initial_values (list, numpy ndarray, optional): Values to be used for
            the inverse transformation (reverting differentiation).
            If None, the first `order` values of the training data `X` are
            stored during `fit`.

    Attributes:
        initial_values_ (list): Values stored for inverse transformation.
        last_values_ (np.ndarray): Last `order` values of the most recent
            series seen by `fit`/`transform`; used by
            `inverse_transform_next_window`. (Note: this is an ndarray
            slice, not a list.)
    """

    def __init__(self, order: int = 1, initial_values: list | np.ndarray | None = None):
        self.order = order
        self.initial_values = initial_values

    @_check_X_numpy_ndarray_1d(ensure_1d=True)
    def fit(self, X: np.ndarray, y: object = None) -> object:
        """
        Store the values needed to later revert the differentiation.

        If `initial_values` was not supplied at construction, the first
        `order` values of `X` are stored; otherwise the supplied values are
        used (their length must equal `order`).

        Args:
            X: Training time series (1D numpy array).
            y: Ignored. Present for scikit-learn API compatibility.

        Returns:
            Self for method chaining.

        Raises:
            ValueError: If `order` < 1, if `X` has fewer than `order`
                values, or if `initial_values` has a length different
                from `order`.
        """
        if self.order < 1:
            raise ValueError("`order` must be a positive integer.")

        if self.initial_values is None:
            if len(X) < self.order:
                raise ValueError(
                    f"The time series must have at least {self.order} values "
                    f"to compute the differentiation of order {self.order}."
                )
            self.initial_values_ = list(X[: self.order])
        else:
            if len(self.initial_values) != self.order:
                raise ValueError(
                    f"The length of `initial_values` must be equal to the order "
                    f"of differentiation ({self.order})."
                )
            self.initial_values_ = list(self.initial_values)

        # Keep the series tail so a future window of differenced
        # predictions can be anchored back onto the original scale.
        self.last_values_ = X[-self.order :]

        return self

    @_check_X_numpy_ndarray_1d(ensure_1d=True)
    def transform(self, X: np.ndarray, y: object = None) -> np.ndarray:
        """
        Compute the differenced series, padded with leading NaNs.

        Args:
            X: Time series to differentiate (1D numpy array).
            y: Ignored. Present for scikit-learn API compatibility.

        Returns:
            Differenced series with the same length as `X`; the first
            `order` positions are NaN.
        """
        # Auto-fit only when explicit `initial_values` were given at
        # construction; otherwise an unfitted transformer is an error
        # (sklearn's check_is_fitted raises NotFittedError here).
        if not hasattr(self, "initial_values_") and self.initial_values is not None:
            self.fit(X)
        elif not hasattr(self, "initial_values_"):
            check_is_fitted(self, ["initial_values_"])

        X_diff = np.diff(X, n=self.order)
        # Pad with NaNs to keep same length
        X_diff = np.concatenate([np.full(self.order, np.nan), X_diff])

        # Update last values seen (for next window inverse)
        self.last_values_ = X[-self.order :]

        return X_diff

    def inverse_transform_next_window(self, X: np.ndarray) -> np.ndarray:
        """
        Revert differentiation for a window that directly follows the last
        series seen by `fit`/`transform` (e.g. differenced predictions).

        Args:
            X: Differenced values of the next window (1D numpy array).

        Returns:
            The reconstructed (un-differenced) values.

        Raises:
            NotImplementedError: If `order` > 1.
        """
        check_is_fitted(self, ["initial_values_", "last_values_"])

        if self.order == 1:
            # Cumulative sum anchored at the last observed original value.
            result = np.cumsum(X) + self.last_values_[-1]
        else:
            # Recursive or iterative approach for higher orders
            # Simplified: Assuming order 1 is sufficient for now or throwing error
            raise NotImplementedError(
                "inverse_transform_next_window not implemented for order > 1"
            )

        return result

    @_check_X_numpy_ndarray_1d(ensure_1d=True)
    def inverse_transform(self, X: np.ndarray, y: object = None) -> np.ndarray:
        """
        Revert the differentiation of a full series produced by `transform`.

        Args:
            X: Differenced series (1D numpy array); the first `order`
                positions are assumed to be the NaN padding added by
                `transform` and are discarded.
            y: Ignored. Present for scikit-learn API compatibility.

        Returns:
            The reconstructed series, starting with `initial_values_`.

        Raises:
            NotImplementedError: If `order` > 1.
        """
        check_is_fitted(self, ["initial_values_"])

        # X contains the differenced series (with NaNs at the beginning potentially)
        # remove NaNs at the start corresponding to order
        X_clean = X[self.order :]

        if len(X_clean) == 0:
            # Just return initial values if only NaNs were passed
            return np.array(self.initial_values_)

        result = list(self.initial_values_)

        if self.order == 1:
            # Running sum: each restored value is the previous restored
            # value plus the current difference.
            current_value = result[-1]
            restored = []
            for diff_val in X_clean:
                current_value += diff_val
                restored.append(current_value)
            result.extend(restored)
        else:
            # Recursive reconstruction for higher orders logic check
            # For order > 1, np.diff does repeated diffs.
            # To invert, we need to do repeated cumsum.
            # But we need appropriate initial values for each level of integration.
            # This is a simplified version.

            raise NotImplementedError(
                "Inverse transform for order > 1 is currently not fully implemented in this port."
            )

        return np.array(result)
|