tabpfn-time-series 0.1.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tabpfn_time_series/__init__.py +3 -2
- tabpfn_time_series/features/__init__.py +17 -0
- tabpfn_time_series/features/auto_features.py +307 -0
- tabpfn_time_series/features/basic_features.py +88 -0
- tabpfn_time_series/features/feature_generator_base.py +21 -0
- tabpfn_time_series/features/feature_transformer.py +53 -0
- tabpfn_time_series/predictor.py +5 -7
- tabpfn_time_series/tabpfn_worker.py +119 -17
- {tabpfn_time_series-0.1.2.dist-info → tabpfn_time_series-1.0.0.dist-info}/METADATA +30 -14
- tabpfn_time_series-1.0.0.dist-info/RECORD +15 -0
- tabpfn_time_series/feature.py +0 -78
- tabpfn_time_series-0.1.2.dist-info/RECORD +0 -11
- {tabpfn_time_series-0.1.2.dist-info → tabpfn_time_series-1.0.0.dist-info}/WHEEL +0 -0
- {tabpfn_time_series-0.1.2.dist-info → tabpfn_time_series-1.0.0.dist-info}/licenses/LICENSE.txt +0 -0
tabpfn_time_series/__init__.py
CHANGED
@@ -1,11 +1,12 @@
|
|
1
|
-
from .
|
1
|
+
from .features import FeatureTransformer
|
2
2
|
from .predictor import TabPFNTimeSeriesPredictor, TabPFNMode
|
3
|
+
from .defaults import TABPFN_TS_DEFAULT_QUANTILE_CONFIG
|
3
4
|
|
4
5
|
__version__ = "0.1.0"
|
5
6
|
|
6
7
|
__all__ = [
|
7
|
-
"DefaultFeatures",
|
8
8
|
"FeatureTransformer",
|
9
9
|
"TabPFNTimeSeriesPredictor",
|
10
10
|
"TabPFNMode",
|
11
|
+
"TABPFN_TS_DEFAULT_QUANTILE_CONFIG",
|
11
12
|
]
|
@@ -0,0 +1,17 @@
|
|
1
|
+
from .basic_features import (
|
2
|
+
RunningIndexFeature,
|
3
|
+
CalendarFeature,
|
4
|
+
AdditionalCalendarFeature,
|
5
|
+
PeriodicSinCosineFeature,
|
6
|
+
)
|
7
|
+
from .auto_features import AutoSeasonalFeature
|
8
|
+
from .feature_transformer import FeatureTransformer
|
9
|
+
|
10
|
+
__all__ = [
|
11
|
+
"RunningIndexFeature",
|
12
|
+
"CalendarFeature",
|
13
|
+
"AdditionalCalendarFeature",
|
14
|
+
"AutoSeasonalFeature",
|
15
|
+
"PeriodicSinCosineFeature",
|
16
|
+
"FeatureTransformer",
|
17
|
+
]
|
@@ -0,0 +1,307 @@
|
|
1
|
+
import numpy as np
|
2
|
+
import pandas as pd
|
3
|
+
from typing import List, Optional, Tuple, Literal
|
4
|
+
|
5
|
+
import logging
|
6
|
+
|
7
|
+
from scipy import fft
|
8
|
+
from scipy.signal import find_peaks
|
9
|
+
from statsmodels.tsa.stattools import acf
|
10
|
+
|
11
|
+
from tabpfn_time_series.features.feature_generator_base import (
|
12
|
+
FeatureGenerator,
|
13
|
+
)
|
14
|
+
from tabpfn_time_series.features.basic_features import (
|
15
|
+
PeriodicSinCosineFeature,
|
16
|
+
)
|
17
|
+
|
18
|
+
|
19
|
+
logger = logging.getLogger(__name__)
|
20
|
+
|
21
|
+
|
22
|
+
class AutoSeasonalFeature(FeatureGenerator):
|
23
|
+
class Config:
|
24
|
+
max_top_k: int = 5
|
25
|
+
do_detrend: bool = True
|
26
|
+
detrend_type: Literal["first_diff", "loess", "linear", "constant"] = "linear"
|
27
|
+
use_peaks_only: bool = True
|
28
|
+
apply_hann_window: bool = True
|
29
|
+
zero_padding_factor: int = 2
|
30
|
+
round_to_closest_integer: bool = True
|
31
|
+
validate_with_acf: bool = False
|
32
|
+
sampling_interval: float = 1.0
|
33
|
+
magnitude_threshold: Optional[float] = 0.05
|
34
|
+
relative_threshold: bool = True
|
35
|
+
exclude_zero: bool = True
|
36
|
+
|
37
|
+
def __init__(self, config: Optional[dict] = None):
|
38
|
+
# Create default config from Config class
|
39
|
+
default_config = {
|
40
|
+
k: v for k, v in vars(self.Config).items() if not k.startswith("__")
|
41
|
+
}
|
42
|
+
|
43
|
+
# Initialize config with defaults
|
44
|
+
self.config = default_config.copy()
|
45
|
+
|
46
|
+
# Update with user-provided config if any
|
47
|
+
if config is not None:
|
48
|
+
self.config.update(config)
|
49
|
+
|
50
|
+
# Validate config parameters
|
51
|
+
self._validate_config()
|
52
|
+
|
53
|
+
logger.debug(f"Initialized AutoSeasonalFeature with config: {self.config}")
|
54
|
+
|
55
|
+
def _validate_config(self):
|
56
|
+
"""Validate configuration parameters"""
|
57
|
+
if self.config["max_top_k"] < 1:
|
58
|
+
logger.warning("max_top_k must be at least 1, setting to 1")
|
59
|
+
self.config["max_top_k"] = 1
|
60
|
+
|
61
|
+
if self.config["zero_padding_factor"] < 1:
|
62
|
+
logger.warning("zero_padding_factor must be at least 1, setting to 1")
|
63
|
+
self.config["zero_padding_factor"] = 1
|
64
|
+
|
65
|
+
if self.config["detrend_type"] not in [
|
66
|
+
"first_diff",
|
67
|
+
"loess",
|
68
|
+
"linear",
|
69
|
+
"constant",
|
70
|
+
]:
|
71
|
+
logger.warning(
|
72
|
+
f"Invalid detrend_type: {self.config['detrend_type']}, using 'linear'"
|
73
|
+
)
|
74
|
+
self.config["detrend_type"] = "linear"
|
75
|
+
|
76
|
+
def generate(self, df: pd.DataFrame) -> pd.DataFrame:
|
77
|
+
df = df.copy()
|
78
|
+
|
79
|
+
# Detect seasonal periods from target data
|
80
|
+
detected_periods_and_magnitudes = self.find_seasonal_periods(
|
81
|
+
df.target, **self.config
|
82
|
+
)
|
83
|
+
logger.debug(
|
84
|
+
f"Found {len(detected_periods_and_magnitudes)} seasonal periods: {detected_periods_and_magnitudes}"
|
85
|
+
)
|
86
|
+
|
87
|
+
# Extract just the periods (without magnitudes)
|
88
|
+
periods = [period for period, _ in detected_periods_and_magnitudes]
|
89
|
+
|
90
|
+
# Generate features for detected periods using PeriodicSinCosineFeature
|
91
|
+
if periods:
|
92
|
+
feature_generator = PeriodicSinCosineFeature(periods=periods)
|
93
|
+
df = feature_generator.generate(df)
|
94
|
+
|
95
|
+
# Standardize column names for consistency across time series
|
96
|
+
renamed_columns = {}
|
97
|
+
for i, period in enumerate(periods):
|
98
|
+
renamed_columns[f"sin_{period}"] = f"sin_#{i}"
|
99
|
+
renamed_columns[f"cos_{period}"] = f"cos_#{i}"
|
100
|
+
|
101
|
+
df = df.rename(columns=renamed_columns)
|
102
|
+
|
103
|
+
# Add placeholder zero columns for missing periods up to max_top_k
|
104
|
+
for i in range(len(periods), self.config["max_top_k"]):
|
105
|
+
df[f"sin_#{i}"] = 0.0
|
106
|
+
df[f"cos_#{i}"] = 0.0
|
107
|
+
|
108
|
+
return df
|
109
|
+
|
110
|
+
@staticmethod
|
111
|
+
def find_seasonal_periods(
|
112
|
+
target_values: pd.Series,
|
113
|
+
max_top_k: int = 10,
|
114
|
+
do_detrend: bool = True,
|
115
|
+
detrend_type: Literal[
|
116
|
+
"first_diff", "loess", "linear", "constant"
|
117
|
+
] = "first_diff",
|
118
|
+
use_peaks_only: bool = True,
|
119
|
+
apply_hann_window: bool = True,
|
120
|
+
zero_padding_factor: int = 2,
|
121
|
+
round_to_closest_integer: bool = True,
|
122
|
+
validate_with_acf: bool = False,
|
123
|
+
sampling_interval: float = 1.0,
|
124
|
+
magnitude_threshold: Optional[
|
125
|
+
float
|
126
|
+
] = 0.05, # Default relative threshold (5% of max)
|
127
|
+
relative_threshold: bool = True, # Interpret threshold as a fraction of max FFT magnitude
|
128
|
+
exclude_zero: bool = False,
|
129
|
+
) -> List[Tuple[float, float]]:
|
130
|
+
"""
|
131
|
+
Identify dominant seasonal periods in a time series using FFT.
|
132
|
+
|
133
|
+
Parameters:
|
134
|
+
- target_values: pd.Series
|
135
|
+
Input time series data.
|
136
|
+
- max_top_k: int
|
137
|
+
Maximum number of dominant periods to return.
|
138
|
+
- do_detrend: bool
|
139
|
+
If True, remove the linear trend from the signal.
|
140
|
+
- use_peaks_only: bool
|
141
|
+
If True, consider only local peaks in the FFT magnitude spectrum.
|
142
|
+
- apply_hann_window: bool
|
143
|
+
If True, apply a Hann window to reduce spectral leakage.
|
144
|
+
- zero_padding_factor: int
|
145
|
+
Factor by which to zero-pad the signal for finer frequency resolution.
|
146
|
+
- round_to_closest_integer: bool
|
147
|
+
If True, round the detected periods to the nearest integer.
|
148
|
+
- validate_with_acf: bool
|
149
|
+
If True, validate detected periods against the autocorrelation function.
|
150
|
+
- sampling_interval: float
|
151
|
+
Time interval between consecutive samples.
|
152
|
+
- magnitude_threshold: Optional[float]
|
153
|
+
Threshold to filter out less significant frequency components.
|
154
|
+
Default is 0.05, interpreted as 5% of the maximum FFT magnitude if relative_threshold is True.
|
155
|
+
- relative_threshold: bool
|
156
|
+
If True, the `magnitude_threshold` is interpreted as a fraction of the maximum FFT magnitude.
|
157
|
+
Otherwise, it is treated as an absolute threshold value.
|
158
|
+
- exclude_zero: bool
|
159
|
+
If True, exclude periods of 0 from the results.
|
160
|
+
|
161
|
+
Returns:
|
162
|
+
- List[Tuple[float, float]]:
|
163
|
+
A list of (period, magnitude) tuples, sorted in descending order by magnitude.
|
164
|
+
"""
|
165
|
+
# Convert the Pandas Series to a NumPy array
|
166
|
+
values = np.array(target_values, dtype=float)
|
167
|
+
|
168
|
+
# Quick hack to ignore the test_X
|
169
|
+
# (Assuming train_X target is not NaN, and test_X target is NaN)
|
170
|
+
# Dropping all the NaN values
|
171
|
+
values = values[~np.isnan(values)]
|
172
|
+
|
173
|
+
N_original = len(values)
|
174
|
+
|
175
|
+
# Detrend the signal with the configured detrend_type if requested
|
176
|
+
if do_detrend:
|
177
|
+
values = detrend(values, detrend_type)
|
178
|
+
|
179
|
+
# Apply a Hann window to reduce spectral leakage
|
180
|
+
if apply_hann_window:
|
181
|
+
window = np.hanning(N_original)
|
182
|
+
values = values * window
|
183
|
+
|
184
|
+
# Zero-pad the signal for improved frequency resolution
|
185
|
+
if zero_padding_factor > 1:
|
186
|
+
padded_length = int(N_original * zero_padding_factor)
|
187
|
+
padded_values = np.zeros(padded_length)
|
188
|
+
padded_values[:N_original] = values
|
189
|
+
values = padded_values
|
190
|
+
N = padded_length
|
191
|
+
else:
|
192
|
+
N = N_original
|
193
|
+
|
194
|
+
# Compute the FFT (using rfft) and obtain frequency bins
|
195
|
+
fft_values = fft.rfft(values)
|
196
|
+
fft_magnitudes = np.abs(fft_values)
|
197
|
+
freqs = np.fft.rfftfreq(N, d=sampling_interval)
|
198
|
+
|
199
|
+
# Exclude the DC component (0 Hz) to avoid bias from the signal's mean
|
200
|
+
fft_magnitudes[0] = 0.0
|
201
|
+
|
202
|
+
# Determine the threshold (absolute value)
|
203
|
+
if magnitude_threshold is not None and relative_threshold:
|
204
|
+
threshold_value = magnitude_threshold * np.max(fft_magnitudes)
|
205
|
+
else:
|
206
|
+
threshold_value = magnitude_threshold
|
207
|
+
|
208
|
+
# Identify dominant frequencies
|
209
|
+
if use_peaks_only:
|
210
|
+
if threshold_value is not None:
|
211
|
+
peak_indices, _ = find_peaks(fft_magnitudes, height=threshold_value)
|
212
|
+
else:
|
213
|
+
peak_indices, _ = find_peaks(fft_magnitudes)
|
214
|
+
if len(peak_indices) == 0:
|
215
|
+
# Fallback to considering all frequency bins if no peaks are found
|
216
|
+
peak_indices = np.arange(len(fft_magnitudes))
|
217
|
+
# Sort the peak indices by magnitude in descending order
|
218
|
+
sorted_peak_indices = peak_indices[
|
219
|
+
np.argsort(fft_magnitudes[peak_indices])[::-1]
|
220
|
+
]
|
221
|
+
top_indices = sorted_peak_indices[:max_top_k]
|
222
|
+
else:
|
223
|
+
sorted_indices = np.argsort(fft_magnitudes)[::-1]
|
224
|
+
if threshold_value is not None:
|
225
|
+
sorted_indices = [
|
226
|
+
i for i in sorted_indices if fft_magnitudes[i] >= threshold_value
|
227
|
+
]
|
228
|
+
top_indices = sorted_indices[:max_top_k]
|
229
|
+
|
230
|
+
# Convert frequencies to periods (avoiding division by zero)
|
231
|
+
periods = np.zeros_like(freqs)
|
232
|
+
non_zero = freqs > 0
|
233
|
+
periods[non_zero] = 1.0 / freqs[non_zero]
|
234
|
+
top_periods = periods[top_indices]
|
235
|
+
|
236
|
+
logger.debug(f"Top periods: {top_periods}")
|
237
|
+
|
238
|
+
# Optionally round the periods to the nearest integer
|
239
|
+
if round_to_closest_integer:
|
240
|
+
top_periods = np.round(top_periods)
|
241
|
+
|
242
|
+
# Filter out zero periods if requested
|
243
|
+
if exclude_zero:
|
244
|
+
non_zero_mask = top_periods != 0
|
245
|
+
top_periods = top_periods[non_zero_mask]
|
246
|
+
top_indices = top_indices[non_zero_mask]
|
247
|
+
|
248
|
+
# Keep unique periods only
|
249
|
+
if len(top_periods) > 0:
|
250
|
+
unique_period_indices = np.unique(top_periods, return_index=True)[1]
|
251
|
+
top_periods = top_periods[unique_period_indices]
|
252
|
+
top_indices = top_indices[unique_period_indices]
|
253
|
+
|
254
|
+
# Pair each period with its corresponding magnitude
|
255
|
+
results = [
|
256
|
+
(top_periods[i], fft_magnitudes[top_indices[i]])
|
257
|
+
for i in range(len(top_indices))
|
258
|
+
]
|
259
|
+
|
260
|
+
# Validate with ACF if requested and filter the results accordingly
|
261
|
+
if validate_with_acf:
|
262
|
+
# Compute ACF on the original (non-padded) detrended signal
|
263
|
+
acf_values = acf(
|
264
|
+
np.array(target_values, dtype=float)[:N_original],
|
265
|
+
nlags=N_original,
|
266
|
+
fft=True,
|
267
|
+
)
|
268
|
+
acf_peak_indices, _ = find_peaks(
|
269
|
+
acf_values, height=1.96 / np.sqrt(N_original)
|
270
|
+
)
|
271
|
+
validated_results = []
|
272
|
+
for period, mag in results:
|
273
|
+
period_int = int(round(period))
|
274
|
+
if period_int < len(acf_values) and any(
|
275
|
+
abs(period_int - peak) <= 1 for peak in acf_peak_indices
|
276
|
+
):
|
277
|
+
validated_results.append((period, mag))
|
278
|
+
if validated_results:
|
279
|
+
results = validated_results
|
280
|
+
|
281
|
+
# Ensure the final results are sorted in descending order by magnitude
|
282
|
+
results.sort(key=lambda x: x[1], reverse=True)
|
283
|
+
|
284
|
+
return results
|
285
|
+
|
286
|
+
|
287
|
+
def detrend(
|
288
|
+
x: np.ndarray, detrend_type: Literal["first_diff", "loess", "linear"]
|
289
|
+
) -> np.ndarray:
|
290
|
+
if detrend_type == "first_diff":
|
291
|
+
return np.diff(x, prepend=x[0])
|
292
|
+
|
293
|
+
elif detrend_type == "loess":
|
294
|
+
from statsmodels.api import nonparametric
|
295
|
+
|
296
|
+
indices = np.arange(len(x))
|
297
|
+
lowess = nonparametric.lowess(x, indices, frac=0.1)
|
298
|
+
trend = lowess[:, 1]
|
299
|
+
return x - trend
|
300
|
+
|
301
|
+
elif detrend_type in ["linear", "constant"]:
|
302
|
+
from scipy.signal import detrend as scipy_detrend
|
303
|
+
|
304
|
+
return scipy_detrend(x, type=detrend_type)
|
305
|
+
|
306
|
+
else:
|
307
|
+
raise ValueError(f"Invalid detrend method: {detrend_type}")
|
@@ -0,0 +1,88 @@
|
|
1
|
+
import numpy as np
|
2
|
+
import pandas as pd
|
3
|
+
from typing import List, Dict, Optional
|
4
|
+
|
5
|
+
import gluonts.time_feature
|
6
|
+
|
7
|
+
from tabpfn_time_series.features.feature_generator_base import (
|
8
|
+
FeatureGenerator,
|
9
|
+
)
|
10
|
+
|
11
|
+
|
12
|
+
class RunningIndexFeature(FeatureGenerator):
|
13
|
+
def generate(self, df: pd.DataFrame) -> pd.DataFrame:
|
14
|
+
df = df.copy()
|
15
|
+
df["running_index"] = range(len(df))
|
16
|
+
return df
|
17
|
+
|
18
|
+
|
19
|
+
class CalendarFeature(FeatureGenerator):
|
20
|
+
def __init__(
|
21
|
+
self,
|
22
|
+
components: Optional[List[str]] = None,
|
23
|
+
seasonal_features: Optional[Dict[str, List[float]]] = None,
|
24
|
+
):
|
25
|
+
self.components = components or ["year"]
|
26
|
+
self.seasonal_features = seasonal_features or {
|
27
|
+
# (feature, natural seasonality)
|
28
|
+
"second_of_minute": [60],
|
29
|
+
"minute_of_hour": [60],
|
30
|
+
"hour_of_day": [24],
|
31
|
+
"day_of_week": [7],
|
32
|
+
"day_of_month": [30.5],
|
33
|
+
"day_of_year": [365],
|
34
|
+
"week_of_year": [52],
|
35
|
+
"month_of_year": [12],
|
36
|
+
}
|
37
|
+
|
38
|
+
def generate(self, df: pd.DataFrame) -> pd.DataFrame:
|
39
|
+
df = df.copy()
|
40
|
+
timestamps = df.index.get_level_values("timestamp")
|
41
|
+
|
42
|
+
# Add basic calendar components
|
43
|
+
for component in self.components:
|
44
|
+
df[component] = getattr(timestamps, component)
|
45
|
+
|
46
|
+
# Add seasonal features
|
47
|
+
for feature_name, periods in self.seasonal_features.items():
|
48
|
+
feature_func = getattr(gluonts.time_feature, f"{feature_name}_index")
|
49
|
+
feature = feature_func(timestamps).astype(np.int32)
|
50
|
+
|
51
|
+
if periods is not None:
|
52
|
+
for period in periods:
|
53
|
+
period = period - 1 # Adjust for 0-based indexing
|
54
|
+
df[f"{feature_name}_sin"] = np.sin(2 * np.pi * feature / period)
|
55
|
+
df[f"{feature_name}_cos"] = np.cos(2 * np.pi * feature / period)
|
56
|
+
else:
|
57
|
+
df[feature_name] = feature
|
58
|
+
|
59
|
+
return df
|
60
|
+
|
61
|
+
|
62
|
+
class AdditionalCalendarFeature(CalendarFeature):
|
63
|
+
def __init__(
|
64
|
+
self,
|
65
|
+
components: Optional[List[str]] = None,
|
66
|
+
additional_seasonal_features: Optional[Dict[str, List[float]]] = None,
|
67
|
+
):
|
68
|
+
super().__init__(components=components)
|
69
|
+
|
70
|
+
self.seasonal_features = {
|
71
|
+
**additional_seasonal_features,
|
72
|
+
**self.seasonal_features,
|
73
|
+
}
|
74
|
+
|
75
|
+
|
76
|
+
class PeriodicSinCosineFeature(FeatureGenerator):
|
77
|
+
def __init__(self, periods: List[float], name_suffix: str = None):
|
78
|
+
self.periods = periods
|
79
|
+
self.name_suffix = name_suffix
|
80
|
+
|
81
|
+
def generate(self, df: pd.DataFrame) -> pd.DataFrame:
|
82
|
+
df = df.copy()
|
83
|
+
for i, period in enumerate(self.periods):
|
84
|
+
name_suffix = f"{self.name_suffix}_{i}" if self.name_suffix else f"{period}"
|
85
|
+
df[f"sin_{name_suffix}"] = np.sin(2 * np.pi * np.arange(len(df)) / period)
|
86
|
+
df[f"cos_{name_suffix}"] = np.cos(2 * np.pi * np.arange(len(df)) / period)
|
87
|
+
|
88
|
+
return df
|
@@ -0,0 +1,21 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
|
3
|
+
import pandas as pd
|
4
|
+
|
5
|
+
|
6
|
+
class FeatureGenerator(ABC):
|
7
|
+
"""Abstract base class for feature generators"""
|
8
|
+
|
9
|
+
@abstractmethod
|
10
|
+
def generate(self, df: pd.DataFrame) -> pd.DataFrame:
|
11
|
+
"""Generate features for the given dataframe"""
|
12
|
+
pass
|
13
|
+
|
14
|
+
def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
|
15
|
+
return self.generate(df)
|
16
|
+
|
17
|
+
def __str__(self) -> str:
|
18
|
+
return f"{self.__class__.__name__}_{self.__dict__}"
|
19
|
+
|
20
|
+
def __repr__(self) -> str:
|
21
|
+
return self.__str__()
|
@@ -0,0 +1,53 @@
|
|
1
|
+
from typing import List, Tuple
|
2
|
+
|
3
|
+
import pandas as pd
|
4
|
+
|
5
|
+
from autogluon.timeseries import TimeSeriesDataFrame
|
6
|
+
from tabpfn_time_series.features.feature_generator_base import (
|
7
|
+
FeatureGenerator,
|
8
|
+
)
|
9
|
+
|
10
|
+
|
11
|
+
class FeatureTransformer:
|
12
|
+
def __init__(self, feature_generators: List[FeatureGenerator]):
|
13
|
+
self.feature_generators = feature_generators
|
14
|
+
|
15
|
+
def transform(
|
16
|
+
self,
|
17
|
+
train_tsdf: TimeSeriesDataFrame,
|
18
|
+
test_tsdf: TimeSeriesDataFrame,
|
19
|
+
target_column: str = "target",
|
20
|
+
) -> Tuple[TimeSeriesDataFrame, TimeSeriesDataFrame]:
|
21
|
+
"""Transform both train and test data with the configured feature generators"""
|
22
|
+
|
23
|
+
self._validate_input(train_tsdf, test_tsdf, target_column)
|
24
|
+
tsdf = pd.concat([train_tsdf, test_tsdf])
|
25
|
+
|
26
|
+
# Apply all feature generators
|
27
|
+
for generator in self.feature_generators:
|
28
|
+
tsdf = tsdf.groupby(level="item_id", group_keys=False).apply(generator)
|
29
|
+
|
30
|
+
# Split train and test tsdf
|
31
|
+
train_tsdf = tsdf.iloc[: len(train_tsdf)]
|
32
|
+
test_tsdf = tsdf.iloc[len(train_tsdf) :]
|
33
|
+
|
34
|
+
assert (
|
35
|
+
not train_tsdf[target_column].isna().any()
|
36
|
+
), "All target values in train_tsdf should be non-NaN"
|
37
|
+
assert test_tsdf[target_column].isna().all()
|
38
|
+
|
39
|
+
return train_tsdf, test_tsdf
|
40
|
+
|
41
|
+
@staticmethod
|
42
|
+
def _validate_input(
|
43
|
+
train_tsdf: TimeSeriesDataFrame,
|
44
|
+
test_tsdf: TimeSeriesDataFrame,
|
45
|
+
target_column: str,
|
46
|
+
):
|
47
|
+
if target_column not in train_tsdf.columns:
|
48
|
+
raise ValueError(
|
49
|
+
f"Target column '{target_column}' not found in training data"
|
50
|
+
)
|
51
|
+
|
52
|
+
if not test_tsdf[target_column].isna().all():
|
53
|
+
raise ValueError("Test data should not contain target values")
|
tabpfn_time_series/predictor.py
CHANGED
@@ -3,11 +3,8 @@ from enum import Enum
|
|
3
3
|
|
4
4
|
from autogluon.timeseries import TimeSeriesDataFrame
|
5
5
|
|
6
|
-
from tabpfn_time_series.tabpfn_worker import TabPFNClient, LocalTabPFN
|
7
|
-
from tabpfn_time_series.defaults import
|
8
|
-
TABPFN_TS_DEFAULT_QUANTILE_CONFIG,
|
9
|
-
TABPFN_TS_DEFAULT_CONFIG,
|
10
|
-
)
|
6
|
+
from tabpfn_time_series.tabpfn_worker import TabPFNClient, LocalTabPFN, MockTabPFN
|
7
|
+
from tabpfn_time_series.defaults import TABPFN_TS_DEFAULT_CONFIG
|
11
8
|
|
12
9
|
logger = logging.getLogger(__name__)
|
13
10
|
|
@@ -15,6 +12,7 @@ logger = logging.getLogger(__name__)
|
|
15
12
|
class TabPFNMode(Enum):
|
16
13
|
LOCAL = "tabpfn-local"
|
17
14
|
CLIENT = "tabpfn-client"
|
15
|
+
MOCK = "tabpfn-mock"
|
18
16
|
|
19
17
|
|
20
18
|
class TabPFNTimeSeriesPredictor:
|
@@ -30,6 +28,7 @@ class TabPFNTimeSeriesPredictor:
|
|
30
28
|
worker_mapping = {
|
31
29
|
TabPFNMode.CLIENT: lambda: TabPFNClient(config),
|
32
30
|
TabPFNMode.LOCAL: lambda: LocalTabPFN(config),
|
31
|
+
TabPFNMode.MOCK: lambda: MockTabPFN(config),
|
33
32
|
}
|
34
33
|
self.tabpfn_worker = worker_mapping[tabpfn_mode]()
|
35
34
|
|
@@ -37,7 +36,6 @@ class TabPFNTimeSeriesPredictor:
|
|
37
36
|
self,
|
38
37
|
train_tsdf: TimeSeriesDataFrame, # with features and target
|
39
38
|
test_tsdf: TimeSeriesDataFrame, # with features only
|
40
|
-
quantile_config: list[float] = TABPFN_TS_DEFAULT_QUANTILE_CONFIG,
|
41
39
|
) -> TimeSeriesDataFrame:
|
42
40
|
"""
|
43
41
|
Predict on each time series individually (local forecasting).
|
@@ -47,4 +45,4 @@ class TabPFNTimeSeriesPredictor:
|
|
47
45
|
f"Predicting {len(train_tsdf.item_ids)} time series with config{self.tabpfn_worker.config}"
|
48
46
|
)
|
49
47
|
|
50
|
-
return self.tabpfn_worker.predict(train_tsdf, test_tsdf
|
48
|
+
return self.tabpfn_worker.predict(train_tsdf, test_tsdf)
|
@@ -2,8 +2,10 @@ import logging
|
|
2
2
|
from abc import ABC, abstractmethod
|
3
3
|
from joblib import Parallel, delayed
|
4
4
|
|
5
|
+
from tqdm import tqdm
|
5
6
|
import pandas as pd
|
6
7
|
import numpy as np
|
8
|
+
import torch
|
7
9
|
from scipy.stats import norm
|
8
10
|
from autogluon.timeseries import TimeSeriesDataFrame
|
9
11
|
|
@@ -26,14 +28,7 @@ class TabPFNWorker(ABC):
|
|
26
28
|
self,
|
27
29
|
train_tsdf: TimeSeriesDataFrame,
|
28
30
|
test_tsdf: TimeSeriesDataFrame,
|
29
|
-
quantile_config: list[float],
|
30
31
|
):
|
31
|
-
if not set(quantile_config).issubset(set(TABPFN_TS_DEFAULT_QUANTILE_CONFIG)):
|
32
|
-
raise NotImplementedError(
|
33
|
-
f"We currently only supports {TABPFN_TS_DEFAULT_QUANTILE_CONFIG} for quantile prediction,"
|
34
|
-
f" but got {quantile_config}."
|
35
|
-
)
|
36
|
-
|
37
32
|
predictions = Parallel(
|
38
33
|
n_jobs=self.num_workers,
|
39
34
|
backend="loky",
|
@@ -42,9 +37,8 @@ class TabPFNWorker(ABC):
|
|
42
37
|
item_id,
|
43
38
|
train_tsdf.loc[item_id],
|
44
39
|
test_tsdf.loc[item_id],
|
45
|
-
quantile_config,
|
46
40
|
)
|
47
|
-
for item_id in train_tsdf.item_ids
|
41
|
+
for item_id in tqdm(train_tsdf.item_ids, desc="Predicting time series")
|
48
42
|
)
|
49
43
|
|
50
44
|
predictions = pd.concat(predictions)
|
@@ -59,8 +53,9 @@ class TabPFNWorker(ABC):
|
|
59
53
|
item_id: str,
|
60
54
|
single_train_tsdf: TimeSeriesDataFrame,
|
61
55
|
single_test_tsdf: TimeSeriesDataFrame,
|
62
|
-
quantile_config: list[float],
|
63
56
|
) -> pd.DataFrame:
|
57
|
+
# logger.debug(f"Predicting on item_id: {item_id}")
|
58
|
+
|
64
59
|
test_index = single_test_tsdf.index
|
65
60
|
train_X, train_y = split_time_series_to_X_y(single_train_tsdf.copy())
|
66
61
|
test_X, _ = split_time_series_to_X_y(single_test_tsdf.copy())
|
@@ -70,7 +65,7 @@ class TabPFNWorker(ABC):
|
|
70
65
|
if train_y_has_constant_value:
|
71
66
|
logger.info("Found time-series with constant target")
|
72
67
|
result = self._predict_on_constant_train_target(
|
73
|
-
single_train_tsdf, single_test_tsdf
|
68
|
+
single_train_tsdf, single_test_tsdf
|
74
69
|
)
|
75
70
|
else:
|
76
71
|
tabpfn = self._get_tabpfn_engine()
|
@@ -81,7 +76,9 @@ class TabPFNWorker(ABC):
|
|
81
76
|
result.update(
|
82
77
|
{
|
83
78
|
q: q_pred
|
84
|
-
for q, q_pred in zip(
|
79
|
+
for q, q_pred in zip(
|
80
|
+
TABPFN_TS_DEFAULT_QUANTILE_CONFIG, full_pred["quantiles"]
|
81
|
+
)
|
85
82
|
}
|
86
83
|
)
|
87
84
|
|
@@ -98,7 +95,6 @@ class TabPFNWorker(ABC):
|
|
98
95
|
self,
|
99
96
|
single_train_tsdf: TimeSeriesDataFrame,
|
100
97
|
single_test_tsdf: TimeSeriesDataFrame,
|
101
|
-
quantile_config: list[float],
|
102
98
|
) -> pd.DataFrame:
|
103
99
|
# If train_y is constant, we return the constant value from the training set
|
104
100
|
mean_constant = single_train_tsdf.target.iloc[0]
|
@@ -106,12 +102,14 @@ class TabPFNWorker(ABC):
|
|
106
102
|
|
107
103
|
# For quantile prediction, we assume that the uncertainty follows a normal distribution centered on the constant value with unit standard deviation
|
108
104
|
quantile_pred_with_uncertainty = norm.ppf(
|
109
|
-
|
105
|
+
TABPFN_TS_DEFAULT_QUANTILE_CONFIG, loc=mean_constant, scale=1
|
110
106
|
)
|
111
107
|
result.update(
|
112
108
|
{
|
113
109
|
q: np.full(len(single_test_tsdf), v)
|
114
|
-
for q, v in zip(
|
110
|
+
for q, v in zip(
|
111
|
+
TABPFN_TS_DEFAULT_QUANTILE_CONFIG, quantile_pred_with_uncertainty
|
112
|
+
)
|
115
113
|
}
|
116
114
|
)
|
117
115
|
|
@@ -141,8 +139,52 @@ class LocalTabPFN(TabPFNWorker):
|
|
141
139
|
def __init__(
|
142
140
|
self,
|
143
141
|
config: dict = {},
|
142
|
+
num_workers_per_gpu: int = 4, # per GPU
|
143
|
+
):
|
144
|
+
self.num_workers_per_gpu = num_workers_per_gpu
|
145
|
+
|
146
|
+
# Only support GPU for now (inference on CPU takes too long)
|
147
|
+
if not torch.cuda.is_available():
|
148
|
+
raise ValueError("GPU is required for local TabPFN inference")
|
149
|
+
|
150
|
+
super().__init__(
|
151
|
+
config, num_workers=torch.cuda.device_count() * self.num_workers_per_gpu
|
152
|
+
)
|
153
|
+
|
154
|
+
def predict(
|
155
|
+
self,
|
156
|
+
train_tsdf: TimeSeriesDataFrame,
|
157
|
+
test_tsdf: TimeSeriesDataFrame,
|
144
158
|
):
|
145
|
-
|
159
|
+
total_num_workers = torch.cuda.device_count() * self.num_workers_per_gpu
|
160
|
+
|
161
|
+
# Split data into chunks for parallel inference on each GPU
|
162
|
+
# since the time series are of different lengths, we shuffle
|
163
|
+
# the item_ids s.t. the workload is distributed evenly across GPUs
|
164
|
+
# Also, using 'min' since num_workers could be larger than the number of time series
|
165
|
+
np.random.seed(0)
|
166
|
+
item_ids_chunks = np.array_split(
|
167
|
+
np.random.permutation(train_tsdf.item_ids),
|
168
|
+
min(total_num_workers, len(train_tsdf.item_ids)),
|
169
|
+
)
|
170
|
+
|
171
|
+
# Run predictions in parallel
|
172
|
+
predictions = Parallel(n_jobs=len(item_ids_chunks), backend="loky")(
|
173
|
+
delayed(self._prediction_routine_per_gpu)(
|
174
|
+
train_tsdf.loc[chunk],
|
175
|
+
test_tsdf.loc[chunk],
|
176
|
+
gpu_id=i
|
177
|
+
% torch.cuda.device_count(), # Alternate between available GPUs
|
178
|
+
)
|
179
|
+
for i, chunk in enumerate(item_ids_chunks)
|
180
|
+
)
|
181
|
+
|
182
|
+
predictions = pd.concat(predictions)
|
183
|
+
|
184
|
+
# Sort predictions according to original item_ids order
|
185
|
+
predictions = predictions.loc[train_tsdf.item_ids]
|
186
|
+
|
187
|
+
return TimeSeriesDataFrame(predictions)
|
146
188
|
|
147
189
|
def _get_tabpfn_engine(self):
|
148
190
|
from tabpfn import TabPFNRegressor
|
@@ -151,7 +193,67 @@ class LocalTabPFN(TabPFNWorker):
|
|
151
193
|
config = self.config["tabpfn_internal"].copy()
|
152
194
|
config["model_path"] = self._parse_model_path(config["model_path"])
|
153
195
|
|
154
|
-
return TabPFNRegressor(**config)
|
196
|
+
return TabPFNRegressor(**config, random_state=0)
|
155
197
|
|
156
198
|
def _parse_model_path(self, model_name: str) -> str:
|
157
199
|
return f"tabpfn-v2-regressor-{model_name}.ckpt"
|
200
|
+
|
201
|
+
def _prediction_routine_per_gpu(
|
202
|
+
self,
|
203
|
+
train_tsdf: TimeSeriesDataFrame,
|
204
|
+
test_tsdf: TimeSeriesDataFrame,
|
205
|
+
gpu_id: int,
|
206
|
+
):
|
207
|
+
# Set GPU
|
208
|
+
torch.cuda.set_device(gpu_id)
|
209
|
+
|
210
|
+
all_pred = []
|
211
|
+
for item_id in tqdm(train_tsdf.item_ids, desc=f"GPU {gpu_id}:"):
|
212
|
+
predictions = self._prediction_routine(
|
213
|
+
item_id,
|
214
|
+
train_tsdf.loc[item_id],
|
215
|
+
test_tsdf.loc[item_id],
|
216
|
+
)
|
217
|
+
all_pred.append(predictions)
|
218
|
+
|
219
|
+
# Clear GPU cache
|
220
|
+
torch.cuda.empty_cache()
|
221
|
+
|
222
|
+
return pd.concat(all_pred)
|
223
|
+
|
224
|
+
|
225
|
+
class MockTabPFN(TabPFNWorker):
|
226
|
+
"""
|
227
|
+
Mock TabPFN worker that returns random values for predictions.
|
228
|
+
Can be used for testing or debugging.
|
229
|
+
"""
|
230
|
+
|
231
|
+
class MockTabPFNRegressor:
|
232
|
+
TABPFN_QUANTILE = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
|
233
|
+
|
234
|
+
def __init__(self, *args, **kwargs):
|
235
|
+
pass
|
236
|
+
|
237
|
+
def fit(self, *args, **kwargs):
|
238
|
+
pass
|
239
|
+
|
240
|
+
def predict(self, test_X, output_type="main", **kwargs):
|
241
|
+
if output_type != "main":
|
242
|
+
raise NotImplementedError(
|
243
|
+
"Only main output is supported for mock TabPFN"
|
244
|
+
)
|
245
|
+
|
246
|
+
return {
|
247
|
+
"mean": np.random.rand(len(test_X)),
|
248
|
+
"median": np.random.rand(len(test_X)),
|
249
|
+
"mode": np.random.rand(len(test_X)),
|
250
|
+
"quantiles": [
|
251
|
+
np.random.rand(len(test_X)) for _ in self.TABPFN_QUANTILE
|
252
|
+
],
|
253
|
+
}
|
254
|
+
|
255
|
+
def __init__(self, *args, **kwargs):
|
256
|
+
super().__init__(*args, **kwargs)
|
257
|
+
|
258
|
+
def _get_tabpfn_engine(self):
|
259
|
+
return self.MockTabPFNRegressor()
|
@@ -1,7 +1,7 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: tabpfn_time_series
|
3
|
-
Version: 0.
|
4
|
-
Summary: Zero-shot time series forecasting with
|
3
|
+
Version: 1.0.0
|
4
|
+
Summary: Zero-shot time series forecasting with TabPFNv2
|
5
5
|
Project-URL: Homepage, https://github.com/liam-sbhoo/tabpfn-time-series
|
6
6
|
Project-URL: Bug Tracker, https://github.com/liam-sbhoo/tabpfn-time-series/issues
|
7
7
|
Author-email: Liam Shi Bin Hoo <hoos@tf.uni-freiburg.de>
|
@@ -10,28 +10,43 @@ Classifier: License :: OSI Approved :: Apache Software License
|
|
10
10
|
Classifier: Operating System :: OS Independent
|
11
11
|
Classifier: Programming Language :: Python :: 3
|
12
12
|
Requires-Python: >=3.10
|
13
|
-
Requires-Dist: autogluon-timeseries
|
14
|
-
Requires-Dist:
|
15
|
-
Requires-Dist:
|
16
|
-
Requires-Dist:
|
17
|
-
Requires-Dist:
|
13
|
+
Requires-Dist: autogluon-timeseries>=1.2
|
14
|
+
Requires-Dist: datasets>=3.3.2
|
15
|
+
Requires-Dist: gluonts>=0.16.0
|
16
|
+
Requires-Dist: pandas<2.2.0,>=2.1.2
|
17
|
+
Requires-Dist: python-dotenv>=1.1.0
|
18
|
+
Requires-Dist: pyyaml>=6.0.1
|
19
|
+
Requires-Dist: tabpfn-client>=0.1.7
|
20
|
+
Requires-Dist: tabpfn>=2.0.9
|
18
21
|
Requires-Dist: tqdm
|
19
22
|
Provides-Extra: dev
|
20
23
|
Requires-Dist: build; extra == 'dev'
|
21
24
|
Requires-Dist: jupyter; extra == 'dev'
|
22
25
|
Requires-Dist: pre-commit; extra == 'dev'
|
23
26
|
Requires-Dist: ruff; extra == 'dev'
|
27
|
+
Requires-Dist: submitit>=1.5.2; extra == 'dev'
|
24
28
|
Requires-Dist: twine; extra == 'dev'
|
29
|
+
Requires-Dist: wandb>=0.19.8; extra == 'dev'
|
25
30
|
Description-Content-Type: text/markdown
|
26
31
|
|
27
|
-
#
|
32
|
+
# TabPFN-TS
|
28
33
|
|
34
|
+
> Zero-Shot Time Series Forecasting with TabPFNv2
|
35
|
+
|
36
|
+
[](https://badge.fury.io/py/tabpfn-time-series)
|
29
37
|
[](https://colab.research.google.com/github/liam-sbhoo/tabpfn-time-series/blob/main/demo.ipynb)
|
30
38
|
[](https://discord.com/channels/1285598202732482621/)
|
31
|
-
[](https://arxiv.org/abs/2501.
|
39
|
+
[](https://arxiv.org/abs/2501.02945v3)
|
40
|
+
|
41
|
+
## 📌 News
|
42
|
+
- **27-05-2025**: 📝 New **[paper](https://arxiv.org/abs/2501.02945v3)** version and **v1.0.0** release! Strong [GIFT-EVAL](https://huggingface.co/spaces/Salesforce/GIFT-Eval) results, new AutoSeasonalFeatures, improved CalendarFeatures.
|
43
|
+
- **27-01-2025**: 🚀 Ranked _**1st**_ on [GIFT-EVAL](https://huggingface.co/spaces/Salesforce/GIFT-Eval) benchmark<sup>[1]</sup>!
|
44
|
+
- **10-10-2024**: 🚀 TabPFN-TS [paper](https://arxiv.org/abs/2501.02945v2) accepted to NeurIPS 2024 [TRL](https://table-representation-learning.github.io/NeurIPS2024/) and [TSALM](https://neurips-time-series-workshop.github.io/) workshops!
|
32
45
|
|
46
|
+
_[1] Last checked on: 10/03/2025_
|
33
47
|
|
34
|
-
|
48
|
+
## ✨ Introduction
|
49
|
+
We demonstrate that the tabular foundation model **[TabPFNv2](https://github.com/PriorLabs/TabPFN)**, combined with lightweight feature engineering, enables zero-shot time series forecasting for both point and probabilistic tasks. On the **[GIFT-EVAL](https://huggingface.co/spaces/Salesforce/GIFT-Eval)** benchmark, our method achieves performance on par with top-tier models across both evaluation metrics.
|
35
50
|
|
36
51
|
## 📖 How does it work?
|
37
52
|
|
@@ -41,18 +56,19 @@ Our work proposes to frame **univariate time series forecasting** as a **tabular
|
|
41
56
|
|
42
57
|
Concretely, we:
|
43
58
|
1. Transform a time series into a table
|
44
|
-
2. Extract features
|
45
|
-
3. Perform regression on the table using
|
59
|
+
2. Extract features and add them to the table
|
60
|
+
3. Perform regression on the table using TabPFNv2
|
46
61
|
4. Use regression results as time series forecasting outputs
|
47
62
|
|
48
|
-
For more details, please refer to our [paper](https://arxiv.org/abs/2501.
|
63
|
+
For more details, please refer to our [paper](https://arxiv.org/abs/2501.02945v3).
|
64
|
+
<!-- and our [poster](docs/tabpfn-ts-neurips-poster.pdf) (presented at NeurIPS 2024 TRL and TSALM workshops). -->
|
49
65
|
|
50
66
|
## 👉 **Why give us a try?**
|
51
67
|
- **Zero-shot forecasting**: this method is extremely fast and requires no training, making it highly accessible for experimenting with your own problems.
|
52
68
|
- **Point and probabilistic forecasting**: it provides accurate point forecasts as well as probabilistic forecasts.
|
53
69
|
- **Support for exogenous variables**: if you have exogenous variables, this method can seamlessly incorporate them into the forecasting model.
|
54
70
|
|
55
|
-
On top of that, thanks to **[tabpfn-client](https://github.com/automl/tabpfn-client)** from **[Prior Labs](https://priorlabs.ai)**, you won’t even need your own GPU to run fast inference with
|
71
|
+
On top of that, thanks to **[tabpfn-client](https://github.com/automl/tabpfn-client)** from **[Prior Labs](https://priorlabs.ai)**, you won’t even need your own GPU to run fast inference with TabPFNv2. 😉 We have included `tabpfn-client` as the default engine in our implementation.
|
56
72
|
|
57
73
|
## How to use it?
|
58
74
|
|
@@ -0,0 +1,15 @@
|
|
1
|
+
tabpfn_time_series/__init__.py,sha256=3XGvQieVbONwhVtn1rITet6HNiTMWQTxHm2xLlGI5ew,314
|
2
|
+
tabpfn_time_series/data_preparation.py,sha256=iNW7sAnRkTgmzzOEHBhkkTwm_lQ3p_Q9xgAQ5PbkOts,5416
|
3
|
+
tabpfn_time_series/defaults.py,sha256=u2_JnwxiZ5NNibzyNpsE63KuP3TcmOL1iAP8llZ2rJk,238
|
4
|
+
tabpfn_time_series/plot.py,sha256=bwSYcWBanzPrUxXKFsbqG8fyGsOJZfgU2v3NsxzTSXo,6571
|
5
|
+
tabpfn_time_series/predictor.py,sha256=JzuV34zERf1XDLacGzSFJb-o077qd7GlKC6lvD62EPk,1457
|
6
|
+
tabpfn_time_series/tabpfn_worker.py,sha256=zvFwg4Dc01_m5emqmVITBr6W_cNZ04tMyntmj40pyPE,8299
|
7
|
+
tabpfn_time_series/features/__init__.py,sha256=lzdZWkEfntfg3ZHqNNbfbg-3o_VIzju0tebdRu3AzF4,421
|
8
|
+
tabpfn_time_series/features/auto_features.py,sha256=3OqqY2h7umcoLjLx4hOXypLTjwzrMtd6cQKTNi83vrU,11561
|
9
|
+
tabpfn_time_series/features/basic_features.py,sha256=OV3B__S30-CX88vGjwYQDWqAbJajQw80PxcnvJVUbm4,2955
|
10
|
+
tabpfn_time_series/features/feature_generator_base.py,sha256=jtySWLJyX4E31v6CbX44EHa8cdz7OMyauf4ltNEQeAQ,534
|
11
|
+
tabpfn_time_series/features/feature_transformer.py,sha256=mUsbnPUhJ4lPcnGWk8Ag1hgCOE1V5I0iQRT4VFgQEso,1763
|
12
|
+
tabpfn_time_series-1.0.0.dist-info/METADATA,sha256=CvXqIOHNTKyd-zpCednsqa3FloPk6lFJ4ISG0eSEWx4,4434
|
13
|
+
tabpfn_time_series-1.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
14
|
+
tabpfn_time_series-1.0.0.dist-info/licenses/LICENSE.txt,sha256=iwhPL7kIWQG6gyLZZwIMDItGrNgxMDIq9itxkUSMapY,11345
|
15
|
+
tabpfn_time_series-1.0.0.dist-info/RECORD,,
|
tabpfn_time_series/feature.py
DELETED
@@ -1,78 +0,0 @@
|
|
1
|
-
import numpy as np
|
2
|
-
import pandas as pd
|
3
|
-
from typing import Tuple, List, Callable
|
4
|
-
|
5
|
-
import gluonts.time_feature
|
6
|
-
from autogluon.timeseries import TimeSeriesDataFrame
|
7
|
-
|
8
|
-
|
9
|
-
class DefaultFeatures:
|
10
|
-
@staticmethod
|
11
|
-
def add_running_index(df: pd.DataFrame) -> pd.Series:
|
12
|
-
df["running_index"] = range(len(df))
|
13
|
-
return df
|
14
|
-
|
15
|
-
@staticmethod
|
16
|
-
def add_calendar_features(df: pd.DataFrame) -> pd.DataFrame:
|
17
|
-
CALENDAR_COMPONENT = [
|
18
|
-
"year",
|
19
|
-
# "month",
|
20
|
-
# "day",
|
21
|
-
]
|
22
|
-
|
23
|
-
CALENDAR_FEATURES = [
|
24
|
-
# (feature, natural seasonality)
|
25
|
-
("hour_of_day", 24),
|
26
|
-
("day_of_week", 7),
|
27
|
-
("day_of_month", 30.5),
|
28
|
-
("day_of_year", 365),
|
29
|
-
("week_of_year", 52),
|
30
|
-
("month_of_year", 12),
|
31
|
-
]
|
32
|
-
|
33
|
-
timestamps = df.index.get_level_values("timestamp")
|
34
|
-
|
35
|
-
for component_name in CALENDAR_COMPONENT:
|
36
|
-
df[component_name] = getattr(timestamps, component_name)
|
37
|
-
|
38
|
-
for feature_name, seasonality in CALENDAR_FEATURES:
|
39
|
-
feature_func = getattr(gluonts.time_feature, f"{feature_name}_index")
|
40
|
-
feature = feature_func(timestamps).astype(np.int32)
|
41
|
-
if seasonality is not None:
|
42
|
-
df[f"{feature_name}_sin"] = np.sin(
|
43
|
-
2 * np.pi * feature / (seasonality - 1)
|
44
|
-
) # seasonality - 1 because the value starts from 0
|
45
|
-
df[f"{feature_name}_cos"] = np.cos(
|
46
|
-
2 * np.pi * feature / (seasonality - 1)
|
47
|
-
)
|
48
|
-
else:
|
49
|
-
df[feature_name] = feature
|
50
|
-
|
51
|
-
return df
|
52
|
-
|
53
|
-
|
54
|
-
class FeatureTransformer:
|
55
|
-
@staticmethod
|
56
|
-
def add_features(
|
57
|
-
train_tsdf: TimeSeriesDataFrame,
|
58
|
-
test_tsdf: TimeSeriesDataFrame,
|
59
|
-
feature_generators: List[Callable[[TimeSeriesDataFrame], TimeSeriesDataFrame]],
|
60
|
-
target_column: str = "target",
|
61
|
-
) -> Tuple[TimeSeriesDataFrame, TimeSeriesDataFrame]:
|
62
|
-
assert target_column in train_tsdf.columns
|
63
|
-
assert test_tsdf[target_column].isna().all()
|
64
|
-
|
65
|
-
# Join train and test tsdf
|
66
|
-
tsdf = pd.concat([train_tsdf, test_tsdf])
|
67
|
-
|
68
|
-
# Apply feature generators
|
69
|
-
for func in feature_generators:
|
70
|
-
tsdf = tsdf.groupby(level="item_id", group_keys=False).apply(func)
|
71
|
-
|
72
|
-
# Split train and test tsdf
|
73
|
-
train_tsdf = tsdf.iloc[: len(train_tsdf)]
|
74
|
-
test_tsdf = tsdf.iloc[len(train_tsdf) :]
|
75
|
-
|
76
|
-
assert test_tsdf[target_column].isna().all()
|
77
|
-
|
78
|
-
return train_tsdf, test_tsdf
|
@@ -1,11 +0,0 @@
|
|
1
|
-
tabpfn_time_series/__init__.py,sha256=5ruHrmKBQRIZ3WXLA8du4JKttF55ntnI74hkRsHThQ8,256
|
2
|
-
tabpfn_time_series/data_preparation.py,sha256=iNW7sAnRkTgmzzOEHBhkkTwm_lQ3p_Q9xgAQ5PbkOts,5416
|
3
|
-
tabpfn_time_series/defaults.py,sha256=u2_JnwxiZ5NNibzyNpsE63KuP3TcmOL1iAP8llZ2rJk,238
|
4
|
-
tabpfn_time_series/feature.py,sha256=_9FxfQfgPOOO1MiT8hB8523eZ3Nc5oKuoY7vcohKZZc,2531
|
5
|
-
tabpfn_time_series/plot.py,sha256=bwSYcWBanzPrUxXKFsbqG8fyGsOJZfgU2v3NsxzTSXo,6571
|
6
|
-
tabpfn_time_series/predictor.py,sha256=W9JijaxFaR0chfiW7m4RuDQ0wrRcJezDWVwCBEOQDFk,1502
|
7
|
-
tabpfn_time_series/tabpfn_worker.py,sha256=XNpqLEW51PgzrEopNNdtGdYArMCHT4yeBK3BS3z25K0,5021
|
8
|
-
tabpfn_time_series-0.1.2.dist-info/METADATA,sha256=hO69b8GN3GDRIetG4DGtxpdMubc8sm8h_aI2RwEto2U,3285
|
9
|
-
tabpfn_time_series-0.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
10
|
-
tabpfn_time_series-0.1.2.dist-info/licenses/LICENSE.txt,sha256=iwhPL7kIWQG6gyLZZwIMDItGrNgxMDIq9itxkUSMapY,11345
|
11
|
-
tabpfn_time_series-0.1.2.dist-info/RECORD,,
|
File without changes
|
{tabpfn_time_series-0.1.2.dist-info → tabpfn_time_series-1.0.0.dist-info}/licenses/LICENSE.txt
RENAMED
File without changes
|