uchi-python 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
uchi/timeseries.py ADDED
@@ -0,0 +1,445 @@
1
+ """
2
+ timeseries.py
3
+ =============
4
+ Time series modeling via the Universal Sequence Predictor.
5
+
6
+ Each multivariate time step is encoded as a compound token
7
+ (bin_0, bin_1, ..., bin_{M-1})
8
+ which is directly hashable and exact-matchable in the trie. Vocabulary is
9
+ sparse — only observed transitions are stored.
10
+
11
+ For large M, keep n_bins small (e.g. 4–6); theoretical vocab = n_bins^M.
12
+
13
+ Classes
14
+ -------
15
+ MultivariateTSPredictor — online step-ahead prediction for multivariate series
16
+ TimeSeriesClassifier — classify fixed-length windows (e.g. ECG, HAR)
17
+ AnomalyDetector — online anomaly scoring via prediction log-loss
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import math
23
+ import random
24
+ import warnings
25
+
26
+ from .predictor import UniversalPredictor
27
+ from .discretize import FeatureDiscretizer, LabelEncoder, _to_rows
28
+ from .tabular import _set_history, _infer_dist, _train_one
29
+
30
# scikit-learn is an optional dependency.  When it cannot be imported, empty
# stand-in base classes are defined under the same names so the estimator
# classes in this module can still be declared.
try:
    from sklearn.base import BaseEstimator, ClassifierMixin, OutlierMixin
except ImportError:
    _SKLEARN = False

    class BaseEstimator:
        """Empty placeholder used when scikit-learn is not installed."""

    class ClassifierMixin:
        """Empty placeholder used when scikit-learn is not installed."""

    class OutlierMixin:
        """Empty placeholder used when scikit-learn is not installed."""
else:
    _SKLEARN = True
38
+
39
+ _LABEL_NS_TS = '__ts_label__'
40
+
41
+
42
+ # ══════════════════════════════════════════════════════════════════════════════
43
+ # Shared internals
44
+ # ══════════════════════════════════════════════════════════════════════════════
45
+
46
def _make_predictor(k: int, lr: float, cred_max: float, lp: float) -> UniversalPredictor:
    """
    Build a ``UniversalPredictor`` with this module's fixed shared settings.

    Parameters
    ----------
    k : int
        Passed as the predictor's first positional argument (callers in this
        module supply their context length here).
    lr : float
        Forwarded as ``learning_rate``.
    cred_max : float
        Forwarded as ``cred_max``.
    lp : float
        Forwarded as ``lambda_power``.

    Returns
    -------
    UniversalPredictor
        A freshly constructed predictor; the second positional argument is
        always ``None`` and the remaining keywords are hard-coded below.
    """
    settings = dict(
        learning_rate=lr,
        vigilance=0.3,
        adaptive_cap=True,
        binary_correction_scale=0.05,
        cred_max=cred_max,
        lambda_power=lp,
        cont_count_min_vocab=4,
    )
    return UniversalPredictor(k, None, **settings)
57
+
58
+
59
def _compound_token(token_row: list) -> tuple:
    """
    Collapse a row of two-item pairs into one hashable tuple.

    The pairs are ordered by their first item (ties keep input order, since
    the sort is stable) and the second item of each pair is collected.
    Presumably each pair is (dimension index, bin id) as produced by the
    discretizer — confirm against ``FeatureDiscretizer._encode_row``.
    """
    ordered = sorted(token_row, key=lambda pair: pair[0])
    return tuple(value for _key, value in ordered)
61
+
62
+
63
def _to_window_rows(w) -> list:
    """
    Convert one window into a list of per-timestep rows.

    The window is first passed through ``_to_rows``.  If the first resulting
    element is not a list or tuple, every element is wrapped in a one-item
    list so that a univariate window looks like a multivariate one with a
    single dimension.  Empty results are returned unchanged.
    """
    steps = _to_rows(w)
    if not steps or isinstance(steps[0], (list, tuple)):
        return steps
    return [[value] for value in steps]
68
+
69
+
70
+ # ══════════════════════════════════════════════════════════════════════════════
71
+ # MultivariateTSPredictor
72
+ # ══════════════════════════════════════════════════════════════════════════════
73
+
74
class MultivariateTSPredictor(BaseEstimator):
    """
    Online step-ahead predictor for multivariate (or univariate) time series.

    Learns P(x_{t+1} | x_{t-k+1}, ..., x_t) via the trie. Each timestep
    becomes one compound token; the context window is k steps.

    For large M, set n_bins small (e.g. 4–6) to bound vocabulary.

    Parameters
    ----------
    n_bins : int
        Quantile bins per dimension.
    context_length : int
        Number of prior steps k used as context.
    learning_rate, cred_max, lambda_power : float
        Forwarded unchanged to the underlying predictor.

    Streaming API
    -------------
    pred.fit(X)       — fit bins and warm-up trie on historical data
    pred.predict()    — float vector of per-dimension bin-center means
    pred.observe(x)   — consume one true timestep (advances history)
    pred.feedback(x)  — update trie with true value
    pred.forecast(n)  — auto-regressive multi-step ahead forecast
    pred.score(X)     — average bits-per-step (lower = better)
    """

    def __init__(
        self,
        n_bins: int = 8,
        context_length: int = 5,
        learning_rate: float = 0.08,
        cred_max: float = 6.05,
        lambda_power: float = 0.65,
    ):
        self.n_bins = n_bins
        self.context_length = context_length
        self.learning_rate = learning_rate
        self.cred_max = cred_max
        self.lambda_power = lambda_power

    # ── public API ────────────────────────────────────────────────────────────

    def fit(self, X, y=None) -> 'MultivariateTSPredictor':
        """
        Fit the discretizer on X and run every row through the predictor.

        ``y`` is ignored.  If X yields no rows the estimator is returned
        untouched (and therefore still unfitted).  A warning is emitted when
        n_bins ** n_dims exceeds 100,000.
        """
        rows = _to_rows(X)
        if not rows:
            return self
        if not isinstance(rows[0], (list, tuple)):
            # Univariate input: treat each scalar as a one-dimensional row.
            rows = [[v] for v in rows]

        self._n_dims = len(rows[0])
        B, M = self.n_bins, self._n_dims
        if B ** M > 100_000:
            warnings.warn(
                f"Trie vocab ≈ n_bins^M = {B}^{M} = {B**M:,}. "
                f"Reduce n_bins for high-dimensional series.",
                stacklevel=2,
            )

        self._disc = FeatureDiscretizer(n_bins=self.n_bins)
        self._disc.fit(rows)
        self._pred = _make_predictor(
            self.context_length, self.learning_rate, self.cred_max, self.lambda_power,
        )

        # Warm-up pass: the call order predict → observe → feedback is kept
        # exactly as the underlying predictor is driven elsewhere in this file.
        for row in rows:
            token = _compound_token(self._disc._encode_row(row))
            self._pred.predict()
            self._pred.observe(token)
            self._pred.feedback(token)

        self.is_fitted_ = True
        return self

    def observe(self, x) -> 'MultivariateTSPredictor':
        """Tokenize one timestep and pass it to the predictor's ``observe``."""
        self._check_fitted()
        self._pred.observe(self._tokenize(x))
        return self

    def predict(self, X=None) -> list:
        """
        Predict next timestep as per-dimension float means.
        X is ignored (streaming API uses internal state); present for sklearn compat.
        """
        self._check_fitted()
        self._pred.predict()
        return self._decode_dist(dict(self._pred._last_distribution))

    def predict_distribution(self) -> dict:
        """Run one prediction and return a copy of the token → weight mapping."""
        self._check_fitted()
        self._pred.predict()
        return dict(self._pred._last_distribution)

    def feedback(self, x) -> 'MultivariateTSPredictor':
        """Tokenize one timestep and pass it to the predictor's ``feedback``."""
        self._check_fitted()
        self._pred.feedback(self._tokenize(x))
        return self

    def forecast(self, n_steps: int) -> list:
        """
        Auto-regressive multi-step ahead forecast.

        Returns a list of n_steps float vectors.  Each predicted vector is
        re-tokenized and observed so it becomes context for the next step.
        The predictor's history is snapshotted first and restored afterwards,
        even if a step raises; ``feedback`` is never called here.
        """
        self._check_fitted()
        saved = self._pred.history[:]
        results = []
        try:
            for _ in range(n_steps):
                self._pred.predict()
                means = self._decode_dist(dict(self._pred._last_distribution))
                results.append(means)
                self._pred.observe(self._tokenize(means))
        finally:
            # Always undo the temporary context, otherwise a failure mid-way
            # would leave forecasted values in the live streaming history.
            self._pred.history = saved
        return results

    def score(self, X, y=None) -> float:
        """
        Average bits-per-step on held-out data (lower = better).

        Returns ``inf`` when X yields no rows.  Tokens missing from the
        predicted distribution are scored with probability 1e-12.  History is
        temporarily advanced for context and restored afterwards, even if a
        row fails to encode; ``feedback`` is never called here.
        """
        self._check_fitted()
        rows = _to_rows(X)
        if not rows:
            return float('inf')
        if not isinstance(rows[0], (list, tuple)):
            rows = [[v] for v in rows]

        saved = self._pred.history[:]
        total = 0.0
        try:
            for row in rows:
                token = _compound_token(self._disc._encode_row(row))
                self._pred.predict()
                prob = max(self._pred._last_distribution.get(token, 1e-12), 1e-12)
                total += -math.log2(prob)
                self._pred.observe(token)
        finally:
            self._pred.history = saved
        return total / len(rows)

    # ── internal ──────────────────────────────────────────────────────────────

    def _tokenize(self, x) -> tuple:
        """Encode one timestep (scalar or iterable of values) as a compound token."""
        row = [x] if isinstance(x, (int, float)) else list(x)
        return _compound_token(self._disc._encode_row(row))

    def _decode_dist(self, dist: dict) -> list:
        """
        Turn a token → weight mapping into one float per dimension.

        Each dimension's value is the weight-normalized sum of
        ``bin_center(d, bin)`` over the tokens in ``dist``.  Tokens that are
        not tuples of length ``_n_dims`` and bins that are not ints are
        skipped (their weight still counts toward the normalizer).  When
        ``dist`` is empty or its total weight is below 1e-12, the center of
        bin ``n_bins // 2`` is returned for every dimension instead.
        """
        total = sum(dist.values()) if dist else 0.0
        if total < 1e-12:
            # Nothing usable to average over: fall back to the middle bin.
            mid = self.n_bins // 2
            return [self._disc.bin_center(d, mid) for d in range(self._n_dims)]

        means = [0.0] * self._n_dims
        for token, prob in dist.items():
            if not isinstance(token, tuple) or len(token) != self._n_dims:
                continue
            w = prob / total
            for d, b in enumerate(token):
                if isinstance(b, int):
                    means[d] += w * self._disc.bin_center(d, b)
        return means

    def _check_fitted(self):
        """Raise RuntimeError unless ``fit`` has created the predictor."""
        if not hasattr(self, '_pred'):
            raise RuntimeError("Call fit() first.")
239
+
240
+
241
+ # ══════════════════════════════════════════════════════════════════════════════
242
+ # TimeSeriesClassifier
243
+ # ══════════════════════════════════════════════════════════════════════════════
244
+
245
class TimeSeriesClassifier(BaseEstimator, ClassifierMixin):
    """
    Classify fixed-length time series windows.

    Each window of T timesteps is encoded as T compound tokens; the class label
    is the next token after the full window.

    sklearn-compatible: works in Pipeline, GridSearchCV, cross_val_score.

    Parameters
    ----------
    n_bins : int
        Bins per dimension, passed to the discretizer.
    window_size : int | None
        Expected window length (inferred from first fit call if None).
    n_epochs : int
        Number of shuffled passes over the training windows in ``fit``.
    learning_rate, cred_max, lambda_power : float
        Forwarded unchanged to the underlying predictor.
    random_seed : int
        Seed for the RNG that shuffles the training order in ``fit``.
    """

    def __init__(
        self,
        n_bins: int = 8,
        window_size: int | None = None,
        n_epochs: int = 1,
        learning_rate: float = 0.08,
        cred_max: float = 6.05,
        lambda_power: float = 0.65,
        random_seed: int = 42,
    ):
        self.n_bins = n_bins
        self.window_size = window_size
        self.n_epochs = n_epochs
        self.learning_rate = learning_rate
        self.cred_max = cred_max
        self.lambda_power = lambda_power
        self.random_seed = random_seed

    # ── public API ────────────────────────────────────────────────────────────

    def fit(self, X, y) -> 'TimeSeriesClassifier':
        """
        Fit the discretizer and label encoder, then train on every window.

        The window length is taken from the first window.  If X is empty the
        estimator is returned untouched.

        Raises
        ------
        ValueError
            If ``window_size`` was given and differs from the first window's
            length.
        """
        windows = [_to_window_rows(w) for w in X]
        labels = list(y)

        if not windows:
            return self

        T = len(windows[0])
        if self.window_size is not None and T != self.window_size:
            raise ValueError(f"window_size mismatch: expected {self.window_size}, got {T}")
        self._T = T

        # Bin edges are learned from all timesteps of all windows pooled together.
        all_steps = [step for w in windows for step in w]
        self._disc = FeatureDiscretizer(n_bins=self.n_bins)
        self._disc.fit(all_steps)
        self._lenc = LabelEncoder()
        self._lenc.fit(labels)
        self._rng = random.Random(self.random_seed)

        self._pred = _make_predictor(self._T, self.learning_rate,
                                     self.cred_max, self.lambda_power)

        for _ in range(self.n_epochs):
            # Rebuilt every epoch on purpose: each shuffle starts from the
            # original order, so hoisting this would change the visit order.
            pairs = list(zip(windows, labels))
            self._rng.shuffle(pairs)
            for window, label in pairs:
                self._train_window(window, label)

        self.is_fitted_ = True
        return self

    def partial_fit(self, X, y, classes=None) -> 'TimeSeriesClassifier':
        """
        Train incrementally on more windows without refitting the bins.

        Falls back to a full ``fit`` when no discretizer exists yet.
        ``classes`` is accepted for sklearn signature compatibility and is
        not used.
        """
        if not hasattr(self, '_disc'):
            return self.fit(X, y)
        windows = [_to_window_rows(w) for w in X]
        labels = list(y)
        self._lenc.partial_fit(labels)
        for window, label in zip(windows, labels):
            self._train_window(window, label)
        return self

    def predict(self, X) -> list:
        """Return, for each window, the class with the highest probability."""
        proba = self.predict_proba(X)
        return [max(d, key=d.get) for d in proba]

    def predict_proba(self, X) -> list:
        """Return one ``{class: probability}`` dict per window in X."""
        return [self._infer_window(_to_window_rows(w)) for w in X]

    def score(self, X, y) -> float:
        """
        Fraction of windows whose predicted class equals the true label.

        ``y`` is materialized once so that one-shot iterables (generators,
        iterators) are not exhausted before the denominator is computed.
        """
        labels = list(y)
        preds = self.predict(X)
        return sum(p == t for p, t in zip(preds, labels)) / max(len(labels), 1)

    @property
    def classes_(self) -> list:
        """Classes known to the label encoder, or ``[]`` before fitting."""
        return self._lenc.classes_ if hasattr(self, '_lenc') else []

    # ── internal ──────────────────────────────────────────────────────────────

    def _label_token(self, label) -> tuple:
        """Wrap an encoded label in a namespaced tuple so it is distinct from step tokens."""
        return (_LABEL_NS_TS, self._lenc.encode(label))

    def _window_to_tokens(self, window: list) -> list:
        """Encode every timestep of a window as a compound token."""
        return [_compound_token(self._disc._encode_row(step)) for step in window]

    def _train_window(self, window: list, label) -> None:
        """Train the predictor on one (window tokens, label token) example."""
        _train_one(self._pred, self._window_to_tokens(window), self._label_token(label))

    def _infer_window(self, window: list) -> dict:
        """
        Return normalized class probabilities for one window.

        Only the weights of label tokens are kept and renormalized.  With no
        known classes an empty dict is returned; when the label tokens carry
        less than 1e-12 total weight a uniform distribution is returned.
        """
        tokens = self._window_to_tokens(window)
        dist = _infer_dist(self._pred, tokens)
        classes = self._lenc.classes_

        if not classes:
            return {}

        totals = {c: dist.get(self._label_token(c), 0.0) for c in classes}
        total = sum(totals.values())
        if total < 1e-12:
            u = 1.0 / len(classes)
            return {c: u for c in classes}
        return {c: v / total for c, v in totals.items()}
364
+
365
+
366
+ # ══════════════════════════════════════════════════════════════════════════════
367
+ # AnomalyDetector
368
+ # ══════════════════════════════════════════════════════════════════════════════
369
+
370
class AnomalyDetector(BaseEstimator, OutlierMixin):
    """
    Online anomaly detection via prediction surprise.

    Trains a MultivariateTSPredictor on normal data. At inference each
    timestep receives score = -log2 P(actual | context). High score = anomalous.
    Scoring never calls the predictor's ``feedback`` and restores its history
    afterwards.

    sklearn-compatible: works in Pipeline.

    Parameters
    ----------
    All parameters forwarded to MultivariateTSPredictor.
    """

    def __init__(
        self,
        n_bins: int = 8,
        context_length: int = 5,
        learning_rate: float = 0.08,
        cred_max: float = 6.05,
        lambda_power: float = 0.65,
    ):
        self.n_bins = n_bins
        self.context_length = context_length
        self.learning_rate = learning_rate
        self.cred_max = cred_max
        self.lambda_power = lambda_power

    def fit(self, X, y=None) -> 'AnomalyDetector':
        """Create a fresh MultivariateTSPredictor and fit it on X; ``y`` is ignored."""
        self._ts = MultivariateTSPredictor(
            n_bins=self.n_bins, context_length=self.context_length,
            learning_rate=self.learning_rate, cred_max=self.cred_max,
            lambda_power=self.lambda_power,
        )
        self._ts.fit(X)
        self.is_fitted_ = True
        return self

    def score_samples(self, X) -> list:
        """
        Anomaly score per timestep (higher = more anomalous).

        Each score is -log2 of the probability the predictor assigned to the
        observed token, floored at 1e-12.  Returns ``[]`` when X yields no
        rows.  The predictor's history is advanced for context while scoring
        and restored afterwards, even if a row fails to encode.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called, or was called on empty data so
            the inner predictor was never built.
        """
        self._check_fitted()
        rows = _to_rows(X)
        if not rows:
            return []
        if not isinstance(rows[0], (list, tuple)):
            rows = [[v] for v in rows]

        # fit() on empty data leaves the inner estimator without a predictor;
        # surface that as the usual "Call fit() first." error rather than an
        # AttributeError on ``_pred``.
        self._ts._check_fitted()

        pred = self._ts._pred
        saved = pred.history[:]
        scores = []
        try:
            for row in rows:
                token = _compound_token(self._ts._disc._encode_row(row))
                pred.predict()
                prob = max(pred._last_distribution.get(token, 1e-12), 1e-12)
                scores.append(-math.log2(prob))
                pred.observe(token)
        finally:
            # Always undo the temporary context so a failure mid-way cannot
            # leak scored rows into the detector's history.
            pred.history = saved
        return scores

    def predict(self, X) -> list:
        """
        Label each timestep: ``1`` = anomaly, ``-1`` = normal.

        A timestep is an anomaly when its score exceeds mean + 2 * std of the
        scores of this same batch X (population std).  Returns ``[]`` for
        empty input.

        NOTE: this sign convention (``1`` marks the anomaly) is the reverse
        of scikit-learn's outlier detectors, which use ``-1`` for outliers.
        """
        scores = self.score_samples(X)
        if not scores:
            return []
        mu = sum(scores) / len(scores)
        var = sum((s - mu) ** 2 for s in scores) / len(scores)
        threshold = mu + 2.0 * math.sqrt(max(var, 0.0))
        return [1 if s > threshold else -1 for s in scores]

    def decision_function(self, X) -> list:
        """sklearn OutlierMixin: negative anomaly score (higher = more normal)."""
        return [-s for s in self.score_samples(X)]

    def _check_fitted(self):
        """Raise RuntimeError unless ``fit`` has created the inner estimator."""
        if not hasattr(self, '_ts'):
            raise RuntimeError("Call fit() first.")