nanoforecast 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nanoforecast/__init__.py +5 -0
- nanoforecast/config.py +44 -0
- nanoforecast/data/__init__.py +2 -0
- nanoforecast/data/generator.py +164 -0
- nanoforecast/data/pipeline.py +168 -0
- nanoforecast/data/real_datasets.py +213 -0
- nanoforecast/evaluation/__init__.py +1 -0
- nanoforecast/evaluation/benchmark.py +145 -0
- nanoforecast/export/__init__.py +1 -0
- nanoforecast/export/onnx_export.py +156 -0
- nanoforecast/hub.py +271 -0
- nanoforecast/model/__init__.py +15 -0
- nanoforecast/model/blocks.py +254 -0
- nanoforecast/model/core.py +250 -0
- nanoforecast/model/heads.py +123 -0
- nanoforecast/model/utils.py +139 -0
- nanoforecast/train/__init__.py +2 -0
- nanoforecast/train/loss.py +109 -0
- nanoforecast/train/trainer.py +186 -0
- nanoforecast-0.2.0.dist-info/METADATA +359 -0
- nanoforecast-0.2.0.dist-info/RECORD +25 -0
- nanoforecast-0.2.0.dist-info/WHEEL +5 -0
- nanoforecast-0.2.0.dist-info/entry_points.txt +3 -0
- nanoforecast-0.2.0.dist-info/licenses/LICENSE +201 -0
- nanoforecast-0.2.0.dist-info/top_level.txt +1 -0
nanoforecast/__init__.py
ADDED
nanoforecast/config.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import List
|
|
3
|
+
|
|
4
|
+
@dataclass
class NanoForecastConfig:
    """Container for the hyper-parameters of a NanoForecast model.

    The fields are plain dataclass attributes, so their declaration order is
    also the positional-argument order of the constructor. Two presets are
    available through :meth:`nano_200k` and :meth:`nano_500k`; they only
    override the architecture fields and leave everything else at its default.
    """

    # --- Context and prediction windows ---
    context_length: int = 512
    prediction_length: int = 96

    # --- Architecture dimensions ---
    d_model: int = 32
    num_layers: int = 4
    patch_size: int = 8
    dropout: float = 0.1
    expansion_factor: int = 2

    # --- Model capabilities ---
    # A factory is required so that every instance owns its own list.
    quantiles: List[float] = field(
        default_factory=lambda: [0.1, 0.25, 0.5, 0.75, 0.9]
    )
    # Learned embeddings for hourly, daily, weekly, etc.
    num_frequencies: int = 10
    # Maximum multivariate channels supported natively
    num_channels: int = 16
    # Dimensionality of exogenous covariates
    covariate_dim: int = 4

    # --- Gating and features ---
    use_gated_router: bool = True

    @classmethod
    def nano_200k(cls) -> "NanoForecastConfig":
        """Return the smaller preset (``d_model=32``, ``num_layers=4``).

        NOTE(review): the name suggests roughly 200k parameters; that count
        is not derivable from this class alone.
        """
        overrides = {
            "d_model": 32,
            "num_layers": 4,
            "patch_size": 8,
            "dropout": 0.1,
            "expansion_factor": 2,
        }
        return cls(**overrides)

    @classmethod
    def nano_500k(cls) -> "NanoForecastConfig":
        """Return the larger preset (``d_model=64``, ``num_layers=8``).

        NOTE(review): the name suggests roughly 500k parameters; that count
        is not derivable from this class alone.
        """
        overrides = {
            "d_model": 64,
            "num_layers": 8,
            "patch_size": 8,
            "dropout": 0.1,
            "expansion_factor": 2,
        }
        return cls(**overrides)
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from typing import Dict, Tuple, List
|
|
3
|
+
|
|
4
|
+
class SyntheticTimeSeriesGenerator:
    """
    High-speed, vectorized generator for realistic synthetic time series.
    Generates various patterns: linear/exponential trends, multi-seasonal cycles,
    outlier spikes, level shifts, and non-Gaussian noise.

    All randomness comes from one ``numpy.random.Generator`` created in
    ``__init__``. The private helpers below draw from it in a fixed order, so
    reordering the helper calls changes the series produced by a given seed.
    """

    # Seasonal periods (in time steps) for the frequency ids that have a fixed
    # calendar meaning. Any other id gets one randomly chosen period.
    _SEASONAL_PERIODS = {
        0: (288,),      # 5-minutely: 288 steps/day
        1: (24, 168),   # hourly: 24 steps/day, 168 steps/week
        2: (7,),        # daily: 7 steps/week
        3: (52,),       # weekly: 52 steps/year
        4: (12,),       # monthly: 12 steps/year
    }

    def __init__(self, seed: int = 42):
        """Create the generator with its own seeded random stream."""
        self.rng = np.random.default_rng(seed)

    def generate_single_series(
        self,
        length: int,
        freq_id: int
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Generates a single synthetic time series vector along with a covariate matrix.
        Args:
            length: Number of time steps to generate
            freq_id: Integer mapping to a specific frequency (0-9)
        Returns:
            series: float32 array of shape [length]
            covariates: float32 array of shape [4, length]
                (numeric indicators like holiday, time-of-day)
        """
        t = np.arange(length)

        # Deterministic part of the signal.
        trend = self._trend(t)
        season = self._seasonality(t, freq_id)
        shift = self._level_shift(length)
        signal = trend + season + shift

        # Stochastic part.
        outliers = self._outliers(length)
        noise = self._noise(t)
        series = signal + outliers + noise

        # Scale to a random physical range (log-uniform multiplier).
        series_scale = np.exp(self.rng.uniform(-2.0, 5.0))
        series = series * series_scale

        covariates = self._covariates(t, freq_id)
        return series.astype(np.float32), covariates.astype(np.float32)

    def _trend(self, t: np.ndarray) -> np.ndarray:
        """Return a linear, exponential, piecewise-linear or flat trend plus a random base level."""
        length = t.shape[0]
        trend_type = self.rng.choice(["linear", "exponential", "piecewise", "flat"])
        if trend_type == "linear":
            slope = self.rng.uniform(-0.05, 0.05)
            trend = slope * t
        elif trend_type == "exponential":
            growth = self.rng.uniform(0.001, 0.005)
            trend = 0.1 * np.exp(growth * t)
        elif trend_type == "piecewise":
            changepoint = length // 2
            slope1 = self.rng.uniform(-0.05, 0.05)
            slope2 = self.rng.uniform(-0.05, 0.05)
            trend = np.zeros(length)
            trend[:changepoint] = slope1 * t[:changepoint]
            # Continue from the last value of the first segment so the trend
            # has a kink but no jump at the changepoint.
            trend[changepoint:] = trend[changepoint - 1] + slope2 * (t[changepoint:] - changepoint)
        else:
            trend = np.zeros(length)

        # Add random base level shift
        trend += self.rng.uniform(-10.0, 10.0)
        return trend

    def _seasonality(self, t: np.ndarray, freq_id: int) -> np.ndarray:
        """Return the sum of sinusoids for the periods implied by ``freq_id``."""
        periods = self._SEASONAL_PERIODS.get(freq_id)
        if periods is None:  # Other/Unknown frequency
            periods = (self.rng.choice([10, 20, 30]),)

        season = np.zeros(t.shape[0])
        for period in periods:
            amplitude = self.rng.uniform(0.5, 3.0)
            phase = self.rng.uniform(0, 2 * np.pi)
            season += amplitude * np.sin(2 * np.pi * t / period + phase)

            # Half of the time, add a second harmonic at 30% amplitude.
            if self.rng.random() > 0.5:
                season += (amplitude * 0.3) * np.sin(4 * np.pi * t / period + phase)
        return season

    def _level_shift(self, length: int) -> np.ndarray:
        """Return a step change placed in the middle 60% of the series (30% of the time)."""
        shift = np.zeros(length)
        if self.rng.random() > 0.7:
            low, high = int(length * 0.2), int(length * 0.8)
            # For very short series the candidate range is empty and
            # ``rng.integers`` would raise, so no shift is applied.
            if high > low:
                shift_idx = self.rng.integers(low, high)
                shift_magnitude = self.rng.uniform(-5.0, 5.0)
                shift[shift_idx:] = shift_magnitude
        return shift

    def _outliers(self, length: int) -> np.ndarray:
        """Return 1-3 isolated spikes of random sign (20% of the time), zeros otherwise."""
        outliers = np.zeros(length)
        if self.rng.random() > 0.8:
            num_outliers = self.rng.integers(1, 4)
            for _ in range(num_outliers):
                idx = self.rng.integers(0, length)
                outliers[idx] = self.rng.choice([-1.0, 1.0]) * self.rng.uniform(5.0, 15.0)
        return outliers

    def _noise(self, t: np.ndarray) -> np.ndarray:
        """Return Gaussian, Student-t (df=3) or heteroscedastic Gaussian noise."""
        length = t.shape[0]
        noise_type = self.rng.choice(["gaussian", "student-t", "heteroscedastic"])
        if noise_type == "gaussian":
            return self.rng.normal(0, 0.5, size=length)
        if noise_type == "student-t":
            return self.rng.standard_t(df=3, size=length) * 0.3
        # Variance increases over time
        scale = 0.1 + 0.9 * (t / length)
        return self.rng.normal(0, scale)

    def _covariates(self, t: np.ndarray, freq_id: int) -> np.ndarray:
        """Return the 4 exogenous channels.

        Cov 0: Sine wave with a fixed period of 24 steps
        Cov 1: Binary holiday/promotion indicator (random spikes on 5% of steps)
        Cov 2: Binary weekend indicator (hourly and daily series only)
        Cov 3: Linear ramp from 0 towards 1
        """
        length = t.shape[0]
        covariates = np.zeros((4, length))

        # Periodic covariate
        covariates[0, :] = np.sin(2 * np.pi * t / 24)

        # Binary event indicator (promotions/holidays)
        event_indices = self.rng.choice(length, size=int(length * 0.05), replace=False)
        covariates[1, event_indices] = 1.0

        # Weekend indicator. The day index is the step itself for daily data,
        # but for hourly data 24 consecutive steps belong to the same day.
        if freq_id == 1:
            day_index = t // 24
            covariates[2, :] = (day_index % 7 >= 5).astype(float)
        elif freq_id == 2:
            covariates[2, :] = (t % 7 >= 5).astype(float)

        # Generic trend covariate
        covariates[3, :] = t / length
        return covariates

    def generate_dataset(
        self,
        num_series: int,
        context_len: int,
        prediction_len: int
    ) -> List[Dict]:
        """
        Generates a list of dictionaries representing a dataset.

        Each record holds the keys ``context``, ``prediction``, ``freq_id``,
        ``context_covariates`` and ``prediction_covariates``. A series of
        ``context_len + prediction_len`` steps is generated and split at
        ``context_len``. The frequency id is drawn uniformly from 0-4.
        """
        dataset = []
        total_len = context_len + prediction_len

        for _ in range(num_series):
            # Select random frequency
            freq_id = int(self.rng.integers(0, 5))
            series, covariates = self.generate_single_series(total_len, freq_id)

            dataset.append({
                "context": series[:context_len],
                "prediction": series[context_len:],
                "freq_id": freq_id,
                "context_covariates": covariates[:, :context_len],
                "prediction_covariates": covariates[:, context_len:]
            })

        return dataset
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import torch
|
|
3
|
+
from torch.utils.data import Dataset, Sampler
|
|
4
|
+
from typing import List, Dict, Tuple, Optional
|
|
5
|
+
|
|
6
|
+
class TimeSeriesDataset(Dataset):
    """
    PyTorch Dataset wrapping time series records.
    Provides option for real-time data augmentations.

    Each record is a dict providing at least ``context``, ``prediction``,
    ``freq_id`` and ``context_covariates``. Augmentation draws from NumPy's
    global random state (and from torch's for the jitter noise).
    """
    def __init__(
        self,
        records: List[Dict],
        augment: bool = False,
        augment_prob: float = 0.5
    ):
        """
        Args:
            records: list of record dicts (kept by reference, not copied).
            augment: whether ``__getitem__`` may augment a sample.
            augment_prob: probability that a sample is augmented when
                ``augment`` is True.
        """
        self.records = records
        self.augment = augment
        self.augment_prob = augment_prob

    def __len__(self) -> int:
        return len(self.records)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Return one sample as a dict of tensors ``x``, ``y``, ``freq_id``, ``covariates``."""
        rec = self.records[idx]

        # Add a leading channel dimension of size 1 (univariate targets).
        x = torch.tensor(rec["context"], dtype=torch.float32).unsqueeze(0)
        y = torch.tensor(rec["prediction"], dtype=torch.float32).unsqueeze(0)

        freq_id = torch.tensor(rec["freq_id"], dtype=torch.long)

        # Handle covariates
        covariates = torch.tensor(rec["context_covariates"], dtype=torch.float32)

        # Apply data augmentation if requested
        if self.augment and np.random.rand() < self.augment_prob:
            # The random rescale/offset must hit context and target alike;
            # otherwise the target no longer continues the context it is
            # paired with. Jitter and masking only corrupt the input.
            scale, shift = self._sample_affine()
            x = self._corrupt_context(x * scale + shift)
            y = y * scale + shift

        return {
            "x": x,
            "y": y,
            "freq_id": freq_id,
            "covariates": covariates
        }

    def _sample_affine(self) -> Tuple[float, float]:
        """Draw the random multiplier in [0.5, 2.0] and offset in [-2.0, 2.0]."""
        scale = float(np.random.uniform(0.5, 2.0))
        shift = float(np.random.uniform(-2.0, 2.0))
        return scale, shift

    def _corrupt_context(self, x: torch.Tensor) -> torch.Tensor:
        """Return a copy of ``x`` with optional jitter noise and a zeroed span."""
        x = x.clone()  # never write into the caller's tensor

        # Add random noise jitter (50% of the time)
        if np.random.rand() > 0.5:
            x = x + torch.randn_like(x) * 0.05

        # Zero out one contiguous span covering 5-15% of the steps
        # (30% of the time) to simulate missing values.
        if np.random.rand() > 0.7:
            seq_len = x.shape[-1]
            mask_len = int(np.random.uniform(0.05, 0.15) * seq_len)
            if mask_len > 0:
                # ``randint`` excludes its upper bound, so +1 lets the span
                # end exactly on the last step.
                mask_start = np.random.randint(0, seq_len - mask_len + 1)
                x[..., mask_start:mask_start + mask_len] = 0.0

        return x

    def _apply_augmentations(
        self,
        x: torch.Tensor,
        covariates: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Applies random scale adjustments, shifting, and jitter noise.

        Kept for callers that only hold a context tensor. ``__getitem__`` no
        longer uses it, because it must apply the same scale and shift to the
        target as well. ``covariates`` is returned unchanged.
        """
        scale, shift = self._sample_affine()
        return self._corrupt_context(x * scale + shift), covariates
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class ResolutionBatchSampler(Sampler):
    """
    Resolution-Aware Batch Sampler.
    Groups indices by their frequency ID and yields batches containing
    series of ONLY one frequency. This allows the model to learn frequency-specific
    priors cleanly within a batch.

    By default `drop_last=False` and `min_batch_size=1`, so partial remainder
    batches are emitted rather than dropped silently. Set `drop_last=True` to
    recover the original strict behaviour.

    ``__len__`` and ``__iter__`` use the same keep/drop rule, so ``len(sampler)``
    always equals the number of batches one pass yields.
    """
    def __init__(
        self,
        freq_ids: List[int],
        batch_size: int,
        shuffle: bool = True,
        drop_last: bool = False,
        min_batch_size: int = 1,
    ):
        """
        Args:
            freq_ids: frequency id of every dataset item, in dataset order.
            batch_size: maximum number of indices per batch (must be >= 1).
            shuffle: shuffle indices within each frequency group and shuffle
                the order of the batches (uses NumPy's global random state).
            drop_last: drop batches that are not exactly ``batch_size`` long.
            min_batch_size: drop batches shorter than this; values below 1
                are treated as 1.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last
        self.min_batch_size = max(1, min_batch_size)

        # Maps frequency id -> dataset indices, in first-seen order.
        self.freq_to_indices = {}
        for idx, freq_id in enumerate(freq_ids):
            self.freq_to_indices.setdefault(freq_id, []).append(idx)

    def _keeps(self, size: int) -> bool:
        """Return True if a batch holding ``size`` indices is emitted."""
        if size < self.min_batch_size:
            return False
        return not (self.drop_last and size != self.batch_size)

    def __iter__(self):
        """Yield single-frequency index batches (lists of ints)."""
        batches = []
        for indices in self.freq_to_indices.values():
            indices_copy = list(indices)  # keep the stored order untouched
            if self.shuffle:
                np.random.shuffle(indices_copy)

            for i in range(0, len(indices_copy), self.batch_size):
                batch = indices_copy[i:i + self.batch_size]
                if self._keeps(len(batch)):
                    batches.append(batch)

        if self.shuffle:
            # Mix the frequencies so that one pass does not visit them in blocks.
            np.random.shuffle(batches)

        return iter(batches)

    def __len__(self) -> int:
        """Return the number of batches one pass of ``__iter__`` yields."""
        full_batches_kept = self._keeps(self.batch_size)
        total = 0
        for indices in self.freq_to_indices.values():
            full, rem = divmod(len(indices), self.batch_size)
            if full_batches_kept:
                total += full
            if rem and self._keeps(rem):
                total += 1
        return total
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def create_dataloader(
    records: List[Dict],
    batch_size: int,
    augment: bool = False,
    shuffle: bool = True,
    drop_last: bool = False,
    min_batch_size: int = 1,
    augment_prob: float = 0.5,
) -> torch.utils.data.DataLoader:
    """
    Helper function to wrap dataset in a DataLoader using ResolutionBatchSampler.

    Args:
        records: record dicts; each must provide ``freq_id`` plus the keys
            ``TimeSeriesDataset`` reads.
        batch_size: maximum number of series per batch.
        augment: enable on-the-fly augmentation in the dataset.
        shuffle: forwarded to ``ResolutionBatchSampler``.
        drop_last: forwarded to ``ResolutionBatchSampler``.
        min_batch_size: forwarded to ``ResolutionBatchSampler``.
        augment_prob: probability that a sample is augmented when ``augment``
            is True (forwarded to ``TimeSeriesDataset``).

    Returns:
        A DataLoader whose batches each contain series of one frequency id.
    """
    dataset = TimeSeriesDataset(records, augment=augment, augment_prob=augment_prob)
    freq_ids = [rec["freq_id"] for rec in records]

    sampler = ResolutionBatchSampler(
        freq_ids,
        batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        min_batch_size=min_batch_size,
    )

    loader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=sampler,
        num_workers=0,
        # Pinned host memory only helps host-to-GPU copies; requesting it
        # without CUDA makes PyTorch emit a warning.
        pin_memory=torch.cuda.is_available(),
    )
    return loader
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
"""Real time series dataset loaders for NanoForecast pretraining.
|
|
2
|
+
|
|
3
|
+
Loads canonical Monash / Informer / Autoformer benchmark datasets from
|
|
4
|
+
public mirrors. Supports univariate slicing so the same loader can feed
|
|
5
|
+
both the univariate NanoForecast core and the multivariate cases.
|
|
6
|
+
|
|
7
|
+
Datasets supported:
|
|
8
|
+
- ETTh1, ETTh2, ETTm1 (hourly / 15-min oil temperature, 7 features)
|
|
9
|
+
- exchange_rate (daily FX, 8 channels)
|
|
10
|
+
- electricity (hourly household consumption, 321 channels)
|
|
11
|
+
- traffic (hourly road occupancy, 862 channels)
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import gzip
|
|
16
|
+
import io
|
|
17
|
+
import os
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
from typing import Dict, List, Optional, Tuple
|
|
20
|
+
|
|
21
|
+
import numpy as np
|
|
22
|
+
import pandas as pd
|
|
23
|
+
import requests
|
|
24
|
+
|
|
25
|
+
# Per-user download cache. NOTE: the directory is created as a side effect of
# importing this module.
CACHE_DIR = os.path.expanduser("~/.cache/nanoforecast/datasets")
os.makedirs(CACHE_DIR, exist_ok=True)

# freq_id convention from synthetic generator:
#   0 -> 5-min, 1 -> hourly, 2 -> daily, 3 -> weekly, 4 -> monthly, 5+ -> other
# Maps each dataset name to the freq_id attached to its records.
FREQ_MAP = {
    "ETTh1": 1,
    "ETTh2": 1,
    "ETTm1": 0,  # 15-min intervals (mapped onto the id labelled 5-min above)
    "exchange_rate": 2,  # daily
    "electricity": 1,  # hourly
    "traffic": 1,  # hourly
}

# Download location of every supported dataset. The ETT files are plain CSV;
# the remaining entries point at gzip-compressed text files.
DATASET_URLS: Dict[str, str] = {
    "ETTh1": "https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/ETTh1.csv",
    "ETTh2": "https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/ETTh2.csv",
    "ETTm1": "https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/ETTm1.csv",
    "exchange_rate": "https://raw.githubusercontent.com/laiguokun/multivariate-time-series-data/master/exchange_rate/exchange_rate.txt.gz",
    "electricity": "https://raw.githubusercontent.com/laiguokun/multivariate-time-series-data/master/electricity/electricity.txt.gz",
    "traffic": "https://raw.githubusercontent.com/laiguokun/multivariate-time-series-data/master/traffic/traffic.txt.gz",
}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class WindowSpec:
    """Sliding-window specification for turning long series into pretraining records.

    Raises:
        ValueError: on construction if ``context_len`` or ``stride`` is smaller
            than 1, or if ``prediction_len`` is negative. Without this check a
            negative stride silently produces no windows and a zero stride
            only fails later inside ``range()``.
    """
    context_len: int = 256      # steps given to the model as input
    prediction_len: int = 48    # steps that follow the context, used as target
    stride: int = 64            # distance between consecutive window starts

    def __post_init__(self) -> None:
        if self.context_len < 1:
            raise ValueError(f"context_len must be >= 1, got {self.context_len}")
        if self.prediction_len < 0:
            raise ValueError(f"prediction_len must be >= 0, got {self.prediction_len}")
        if self.stride < 1:
            raise ValueError(f"stride must be >= 1, got {self.stride}")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _cache_path(dataset: str, suffix: str = "") -> str:
    """Return the file path inside ``CACHE_DIR`` used to cache ``dataset``.

    Forward slashes in the dataset name are replaced by underscores so the
    name maps to a single file name; ``suffix`` is appended verbatim.
    """
    file_name = "_".join(dataset.split("/")) + suffix
    return os.path.join(CACHE_DIR, file_name)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _is_gzip(path: str) -> bool:
|
|
63
|
+
with open(path, "rb") as fh:
|
|
64
|
+
return fh.read(2) == b"\x1f\x8b"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _download(dataset: str, url: str) -> str:
    """Return the local cache path of ``dataset``, downloading it if needed.

    A non-empty cached file is reused as-is; an empty one is deleted and
    fetched again. The download is streamed into a ``.part`` file next to the
    target and moved into place only after it has completed. An interrupted
    transfer therefore never leaves a partial file that a later call would
    mistake for a valid cache entry.

    Args:
        dataset: dataset name, used to derive the cache file name.
        url: location to download from; a ``.gz`` ending is kept in the
            cache file name.

    Returns:
        Path of the cached file.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        RuntimeError: if the downloaded file is empty.
    """
    path = _cache_path(dataset, suffix=".gz" if url.endswith(".gz") else "")
    if os.path.exists(path):
        if os.path.getsize(path) > 0:
            return path
        print(f"[datasets] removing empty cache for {dataset} ...")
        os.remove(path)

    print(f"[datasets] downloading {dataset} from {url} ...")
    tmp_path = path + ".part"
    try:
        # The context manager releases the connection even when the transfer
        # or the status check fails.
        with requests.get(url, stream=True, timeout=300) as resp:
            resp.raise_for_status()
            with open(tmp_path, "wb") as fh:
                for chunk in resp.iter_content(chunk_size=1 << 20):
                    if chunk:
                        fh.write(chunk)
        if os.path.getsize(tmp_path) == 0:
            raise RuntimeError(f"Downloaded file for {dataset} is empty: {url}")
        os.replace(tmp_path, path)
    finally:
        # Still present only if something above failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return path
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _clear_cache(dataset: str) -> None:
    """Delete the cached files of ``dataset``, if any exist.

    Both possible cache names are checked: the plain one and the ``.gz`` one.
    """
    candidates = (_cache_path(dataset, suffix=ext) for ext in ("", ".gz"))
    for cached_file in candidates:
        if not os.path.exists(cached_file):
            continue
        os.remove(cached_file)
|
|
91
|
+
|
|
92
|
+
def _load_dataframe(dataset: str) -> pd.DataFrame:
    """Download (or reuse from cache) ``dataset`` and parse it into a DataFrame.

    Gzip files (detected by content, not by name) are decompressed and parsed
    as a headerless comma-separated float matrix. Anything else is read with
    ``pandas.read_csv``. If the cached file cannot be parsed, the cache is
    cleared and the download is retried once.

    Args:
        dataset: one of the keys in ``DATASET_URLS``.

    Raises:
        KeyError: if ``dataset`` is not a known dataset name.
        Exception: the parsing error of the second attempt, if the freshly
            downloaded file is unreadable as well.
    """
    import zlib  # local import: only needed for the exception type below

    url = DATASET_URLS[dataset]
    # A truncated gzip stream raises EOFError and a damaged one zlib.error.
    # Neither is an OSError, so both are listed explicitly; otherwise a
    # half-downloaded archive would never trigger the re-download.
    corrupt_cache_errors = (
        pd.errors.EmptyDataError,
        OSError,
        ValueError,
        EOFError,
        zlib.error,
    )
    for attempt in range(2):
        path = _download(dataset, url)
        try:
            if _is_gzip(path):
                with gzip.open(path, "rb") as fh:
                    data = fh.read()
                arr = np.loadtxt(io.BytesIO(data), delimiter=",", dtype=np.float32)
                return pd.DataFrame(arr)
            return pd.read_csv(path)
        except corrupt_cache_errors as e:
            if attempt == 0:
                print(f"[datasets] corrupt cache for {dataset}, re-downloading ... ({e})")
                _clear_cache(dataset)
                continue
            raise
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def list_datasets() -> List[str]:
    """Return the names of all datasets that have a download URL."""
    return [name for name in DATASET_URLS]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def load_univariate_series(
    dataset: str,
    channels: Optional[List[int]] = None,
    max_channels: int = 4,
) -> Tuple[List[np.ndarray], int]:
    """Split a dataset into independent 1-D float32 series, one per channel.

    Args:
        dataset: one of the keys in DATASET_URLS.
        channels: explicit 0-based channel indices (for ETT* datasets the
            leading date column is not counted). Duplicates are removed and
            the indices are processed in ascending order. If None, up to
            ``max_channels`` channels spaced evenly across the file are used.
        max_channels: upper bound on the channel count when ``channels`` is None.

    Returns:
        series_list: list of 1-D float32 arrays, one per selected channel.
        freq_id: integer frequency identifier looked up in FREQ_MAP.
    """
    frame = _load_dataframe(dataset)

    # ETT files carry the date in their first column; the other datasets are
    # dense comma-separated matrices with one row per timestep.
    feature_frame = frame.iloc[:, 1:] if dataset.startswith("ETT") else frame
    matrix = feature_frame.to_numpy(dtype=np.float32)

    available = matrix.shape[1]
    if channels is None:
        if available > max_channels:
            evenly_spaced = np.linspace(0, available - 1, max_channels)
            channels = evenly_spaced.round().astype(int).tolist()
        else:
            channels = list(range(available))
    selected = sorted(set(channels))

    series_list = []
    for column in selected:
        series_list.append(matrix[:, column].astype(np.float32))
    return series_list, FREQ_MAP[dataset]
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def build_pretraining_records(
    dataset: str,
    spec: WindowSpec,
    channels: Optional[List[int]] = None,
    max_channels: int = 4,
) -> List[Dict]:
    """Cut the selected channels of a dataset into NanoForecast-format records.

    Every channel is treated as an independent series and sliced with a
    sliding window of ``spec.context_len + spec.prediction_len`` steps that
    advances by ``spec.stride``. Series shorter than one window are skipped.

    Each record is a dict with the keys ``context``, ``prediction``,
    ``freq_id``, ``context_covariates`` (all zeros, 4 rows to match the model
    config) and ``source`` (the dataset name).
    """
    series_list, freq_id = load_univariate_series(dataset, channels, max_channels)

    context_len = spec.context_len
    window_len = context_len + spec.prediction_len

    records: List[Dict] = []
    for raw_series in series_list:
        series = np.asarray(raw_series, dtype=np.float32)
        last_start = series.shape[0] - window_len
        if last_start < 0:
            # Not even one full window fits into this series.
            continue
        for offset in range(0, last_start + 1, spec.stride):
            window = series[offset:offset + window_len]
            record = {
                "context": window[:context_len],
                "prediction": window[context_len:window_len],
                "freq_id": freq_id,
                "context_covariates": np.zeros((4, context_len), dtype=np.float32),
                "source": dataset,
            }
            records.append(record)
    return records
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def build_mixed_pretraining_corpus(
    spec: WindowSpec,
    datasets: Optional[List[str]] = None,
    max_channels_per_dataset: int = 4,
) -> List[Dict]:
    """Concatenate the pretraining records of several datasets.

    Records are appended dataset by dataset in the order given, and the
    window count of each dataset and of the whole corpus is printed. When
    ``datasets`` is None, ``ETTh1`` and ``exchange_rate`` are used.
    """
    # small + reliable default
    names = ["ETTh1", "exchange_rate"] if datasets is None else datasets

    corpus: List[Dict] = []
    for name in names:
        windows = build_pretraining_records(
            name, spec, max_channels=max_channels_per_dataset
        )
        print(f"[datasets] {name}: {len(windows)} windows")
        corpus += windows
    print(f"[datasets] total: {len(corpus)} windows")
    return corpus
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def time_based_split(
    records: List[Dict],
    val_fraction: float = 0.2,
) -> Tuple[List[Dict], List[Dict]]:
    """Positional train/val split: hold out the last ``val_fraction`` of records.

    The split looks only at list position. It is a time-based split, and
    avoids look-ahead leakage, only if ``records`` is in chronological order.
    NOTE(review): a corpus concatenated from several channels or datasets is
    not globally chronological, and windows that overlap in time can end up
    on both sides of the cut. Confirm the ordering at the call site.

    Args:
        records: records to split; the list itself is not modified.
        val_fraction: share of records, between 0 and 1 inclusive, placed in
            the validation part. The train size is rounded down.

    Returns:
        ``(train_records, val_records)`` as two new lists.

    Raises:
        ValueError: if ``val_fraction`` is outside ``[0, 1]``. Such values
            would otherwise yield a negative or oversized cut index and a
            meaningless split.
    """
    if not 0.0 <= val_fraction <= 1.0:
        raise ValueError(f"val_fraction must be within [0, 1], got {val_fraction}")
    if not records:
        return [], []
    cut = int(len(records) * (1.0 - val_fraction))
    return records[:cut], records[cut:]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from nanoforecast.evaluation.benchmark import TimeSeriesEvaluator
|