uchi-python 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uchi/__init__.py +57 -0
- uchi/discretize.py +307 -0
- uchi/distributional.py +105 -0
- uchi/dual_predictor.py +172 -0
- uchi/forest.py +410 -0
- uchi/generative.py +910 -0
- uchi/hoeffding.py +225 -0
- uchi/long_term_store.py +345 -0
- uchi/node_compressor.py +492 -0
- uchi/online_tokenizer.py +349 -0
- uchi/predictor.py +578 -0
- uchi/semantic_tokenizer.py +48 -0
- uchi/tabular.py +401 -0
- uchi/timeseries.py +445 -0
- uchi_python-0.1.0.dist-info/METADATA +468 -0
- uchi_python-0.1.0.dist-info/RECORD +19 -0
- uchi_python-0.1.0.dist-info/WHEEL +5 -0
- uchi_python-0.1.0.dist-info/licenses/LICENSE +21 -0
- uchi_python-0.1.0.dist-info/top_level.txt +1 -0
uchi/timeseries.py
ADDED
|
@@ -0,0 +1,445 @@
|
|
|
1
|
+
"""
|
|
2
|
+
timeseries.py
|
|
3
|
+
=============
|
|
4
|
+
Time series modeling via the Universal Sequence Predictor.
|
|
5
|
+
|
|
6
|
+
Each multivariate time step is encoded as a compound token
|
|
7
|
+
(bin_0, bin_1, ..., bin_{M-1})
|
|
8
|
+
which is directly hashable and exact-matchable in the trie. Vocabulary is
|
|
9
|
+
sparse — only observed transitions are stored.
|
|
10
|
+
|
|
11
|
+
For large M, keep n_bins small (e.g. 4–6); theoretical vocab = n_bins^M.
|
|
12
|
+
|
|
13
|
+
Classes
|
|
14
|
+
-------
|
|
15
|
+
MultivariateTSPredictor — online step-ahead prediction for multivariate series
|
|
16
|
+
TimeSeriesClassifier — classify fixed-length windows (e.g. ECG, HAR)
|
|
17
|
+
AnomalyDetector — online anomaly scoring via prediction log-loss
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import math
|
|
23
|
+
import random
|
|
24
|
+
import warnings
|
|
25
|
+
|
|
26
|
+
from .predictor import UniversalPredictor
|
|
27
|
+
from .discretize import FeatureDiscretizer, LabelEncoder, _to_rows
|
|
28
|
+
from .tabular import _set_history, _infer_dist, _train_one
|
|
29
|
+
|
|
30
|
+
# scikit-learn is optional.  When it cannot be imported, empty placeholder
# classes with the same names are defined so the class statements further
# down can still name them as bases.
try:
    from sklearn.base import BaseEstimator, ClassifierMixin, OutlierMixin
except ImportError:
    _SKLEARN = False

    class BaseEstimator:
        pass

    class ClassifierMixin:
        pass

    class OutlierMixin:
        pass
else:
    _SKLEARN = True

# First element of the 2-tuple tokens TimeSeriesClassifier uses for labels.
_LABEL_NS_TS = '__ts_label__'
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
43
|
+
# Shared internals
|
|
44
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
45
|
+
|
|
46
|
+
def _make_predictor(k: int, lr: float, cred_max: float, lp: float) -> UniversalPredictor:
    """
    Build the UniversalPredictor configuration shared by the models in this file.

    Parameters
    ----------
    k : int
        First positional argument of UniversalPredictor; callers in this
        module pass their context length (or window length) here.
    lr : float
        Forwarded as ``learning_rate``.
    cred_max : float
        Forwarded as ``cred_max``.
    lp : float
        Forwarded as ``lambda_power``.
    """
    # Caller-tunable settings.
    tunable = dict(learning_rate=lr, cred_max=cred_max, lambda_power=lp)
    # Settings fixed for every time-series model in this module.
    fixed = dict(
        vigilance=0.3,
        adaptive_cap=True,
        binary_correction_scale=0.05,
        cont_count_min_vocab=4,
    )
    # NOTE(review): the meaning of the second positional argument (None) is
    # defined by UniversalPredictor and is not visible here.
    return UniversalPredictor(k, None, **tunable, **fixed)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _compound_token(token_row: list) -> tuple:
    """
    Collapse a row of ``(key, bin)`` pairs into one hashable tuple of bins.

    Pairs are ordered by their first element (stable for equal keys) and only
    the second element of each pair is kept.
    """
    ordered = sorted(token_row, key=lambda pair: pair[0])
    return tuple(bin_id for _key, bin_id in ordered)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _to_window_rows(w) -> list:
    """
    Convert ``w`` to a list of rows via ``_to_rows``, one row per timestep.

    If the first converted element is not a list or tuple, every element is
    wrapped in a one-item list so a flat sequence becomes a univariate series.
    Empty results are returned unchanged.
    """
    steps = _to_rows(w)
    if not steps:
        return steps
    if isinstance(steps[0], (list, tuple)):
        return steps
    return [[value] for value in steps]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
71
|
+
# MultivariateTSPredictor
|
|
72
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
73
|
+
|
|
74
|
+
class MultivariateTSPredictor(BaseEstimator):
    """
    Online step-ahead predictor for multivariate (or univariate) time series.

    Learns P(x_{t+1} | x_{t-k+1}, ..., x_t) via the trie. Each timestep
    becomes one compound token; the context window is k steps.

    For large M, set n_bins small (e.g. 4–6) to bound vocabulary.

    Parameters
    ----------
    n_bins : int
        Quantile bins per dimension.
    context_length : int
        Number of prior steps k used as context.
    learning_rate, cred_max, lambda_power : float
        Passed through to the underlying UniversalPredictor.

    Streaming API
    -------------
    pred.fit(X)          — fit bins and warm-up trie on historical data
    pred.predict()       — float vector of per-dimension bin-center means
    pred.observe(x)      — consume one true timestep (advances history)
    pred.feedback(x)     — update trie with true value
    pred.forecast(n)     — auto-regressive multi-step ahead forecast
    pred.score(X)        — average bits-per-step (lower = better)
    """

    def __init__(
        self,
        n_bins: int = 8,
        context_length: int = 5,
        learning_rate: float = 0.08,
        cred_max: float = 6.05,
        lambda_power: float = 0.65,
    ):
        self.n_bins = n_bins
        self.context_length = context_length
        self.learning_rate = learning_rate
        self.cred_max = cred_max
        self.lambda_power = lambda_power

    # ── public API ────────────────────────────────────────────────────────────

    def fit(self, X, y=None) -> 'MultivariateTSPredictor':
        """
        Fit the discretizer on X and warm up the predictor on the same rows.

        A flat sequence of scalars is treated as a univariate series. ``y`` is
        ignored. Empty input returns immediately and leaves the estimator
        unfitted. Emits a warning when n_bins ** n_dims exceeds 100,000.
        """
        # Same normalisation the classifier uses: rows of lists, scalars wrapped.
        rows = _to_window_rows(X)
        if not rows:
            return self

        self._n_dims = len(rows[0])
        B, M = self.n_bins, self._n_dims
        if B ** M > 100_000:
            warnings.warn(
                f"Trie vocab ≈ n_bins^M = {B}^{M} = {B**M:,}. "
                f"Reduce n_bins for high-dimensional series.",
                stacklevel=2,
            )

        self._disc = FeatureDiscretizer(n_bins=self.n_bins)
        self._disc.fit(rows)
        self._pred = _make_predictor(
            self.context_length, self.learning_rate, self.cred_max, self.lambda_power,
        )

        # Replay the training series through the predict/observe/feedback cycle.
        for row in rows:
            token = _compound_token(self._disc._encode_row(row))
            self._pred.predict()
            self._pred.observe(token)
            self._pred.feedback(token)

        self.is_fitted_ = True
        return self

    def observe(self, x) -> 'MultivariateTSPredictor':
        """Tokenize ``x`` and pass it to the underlying predictor's observe()."""
        self._check_fitted()
        self._pred.observe(self._tokenize(x))
        return self

    def predict(self, X=None) -> list:
        """
        Predict next timestep as per-dimension float means.
        X is ignored (streaming API uses internal state); present for sklearn compat.
        """
        self._check_fitted()
        self._pred.predict()
        return self._decode_dist(dict(self._pred._last_distribution))

    def predict_distribution(self) -> dict:
        """Run one prediction and return a copy of the predictor's last distribution."""
        self._check_fitted()
        self._pred.predict()
        return dict(self._pred._last_distribution)

    def feedback(self, x) -> 'MultivariateTSPredictor':
        """Tokenize ``x`` and pass it to the underlying predictor's feedback()."""
        self._check_fitted()
        self._pred.feedback(self._tokenize(x))
        return self

    def forecast(self, n_steps: int) -> list:
        """
        Auto-regressive multi-step ahead forecast.

        Returns list of n_steps float vectors. Each predicted vector is
        re-tokenized and observed so it becomes context for the next step.
        The predictor's history is restored afterwards, including when an
        error interrupts the loop; feedback() is never called here.
        """
        self._check_fitted()
        saved = self._pred.history[:]
        results = []
        try:
            for _ in range(n_steps):
                self._pred.predict()
                means = self._decode_dist(dict(self._pred._last_distribution))
                results.append(means)
                self._pred.observe(self._tokenize(means))
        finally:
            # Always undo the temporary history extension.
            self._pred.history = saved
        return results

    def score(self, X, y=None) -> float:
        """
        Average bits-per-step on held-out data (lower = better).

        Returns ``float('inf')`` for empty input. A token missing from the
        predicted distribution is scored with probability 1e-12. History is
        temporarily advanced for context and restored afterwards, including
        when an error interrupts the loop; feedback() is never called here.
        """
        self._check_fitted()
        rows = _to_window_rows(X)
        if not rows:
            return float('inf')

        saved = self._pred.history[:]
        total = 0.0
        try:
            for row in rows:
                token = _compound_token(self._disc._encode_row(row))
                self._pred.predict()
                prob = max(self._pred._last_distribution.get(token, 1e-12), 1e-12)
                total += -math.log2(prob)
                self._pred.observe(token)
        finally:
            self._pred.history = saved
        return total / len(rows)

    # ── internal ──────────────────────────────────────────────────────────────

    def _tokenize(self, x) -> tuple:
        """
        Encode one timestep as a compound token.

        A bare int or float is treated as a one-dimensional timestep; any
        other value is materialised with ``list(x)``.
        """
        row = [x] if isinstance(x, (int, float)) else list(x)
        return _compound_token(self._disc._encode_row(row))

    def _decode_dist(self, dist: dict) -> list:
        """
        Turn a token distribution into per-dimension expected values.

        Each dimension's mean is the probability-weighted average of the bin
        centers of the tokens that contribute to it. Tokens that are not
        tuples of length n_dims, and bins that are not ints, are skipped.
        The average is normalised by the mass actually used, so skipped
        entries do not pull the result toward zero. A dimension with no
        usable mass, or an empty / near-zero distribution, falls back to the
        center of the middle bin.
        """
        n_dims = self._n_dims
        mid = self.n_bins // 2
        fallback = [self._disc.bin_center(d, mid) for d in range(n_dims)]
        if not dist:
            return fallback
        if sum(dist.values()) < 1e-12:
            return fallback

        weighted = [0.0] * n_dims   # sum of prob * bin_center per dimension
        mass = [0.0] * n_dims       # sum of prob actually used per dimension
        for token, prob in dist.items():
            if not isinstance(token, tuple) or len(token) != n_dims:
                continue
            for d, b in enumerate(token):
                if isinstance(b, int):
                    weighted[d] += prob * self._disc.bin_center(d, b)
                    mass[d] += prob

        return [
            weighted[d] / mass[d] if mass[d] >= 1e-12 else fallback[d]
            for d in range(n_dims)
        ]

    def _check_fitted(self):
        """Raise RuntimeError unless fit() has created the underlying predictor."""
        if not hasattr(self, '_pred'):
            raise RuntimeError("Call fit() first.")
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
242
|
+
# TimeSeriesClassifier
|
|
243
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
244
|
+
|
|
245
|
+
class TimeSeriesClassifier(BaseEstimator, ClassifierMixin):
    """
    Classify fixed-length time series windows.

    Each window of T timesteps is encoded as T compound tokens; the class label
    is the next token after the full window.

    sklearn-compatible: works in Pipeline, GridSearchCV, cross_val_score.

    Parameters
    ----------
    n_bins : int
        Bin count handed to the FeatureDiscretizer.
    window_size : int | None
        Expected window length (inferred from first fit call if None).
    n_epochs : int
        Number of shuffled passes over the training windows in fit().
    learning_rate, cred_max, lambda_power : float
        Passed through to the underlying UniversalPredictor.
    random_seed : int
        Seed for the ``random.Random`` instance that shuffles training pairs.
    """

    def __init__(
        self,
        n_bins: int = 8,
        window_size: int | None = None,
        n_epochs: int = 1,
        learning_rate: float = 0.08,
        cred_max: float = 6.05,
        lambda_power: float = 0.65,
        random_seed: int = 42,
    ):
        self.n_bins = n_bins
        self.window_size = window_size
        self.n_epochs = n_epochs
        self.learning_rate = learning_rate
        self.cred_max = cred_max
        self.lambda_power = lambda_power
        self.random_seed = random_seed

    # ── public API ────────────────────────────────────────────────────────────

    def fit(self, X, y) -> 'TimeSeriesClassifier':
        """
        Fit the discretizer and label encoder, then train on every window.

        The window length T is taken from the first window; only that first
        window is compared against ``window_size``. Empty X returns
        immediately and leaves the estimator unfitted.

        Raises
        ------
        ValueError
            If ``window_size`` is set and differs from the first window's length.
        """
        windows = [_to_window_rows(w) for w in X]
        labels = list(y)

        if not windows:
            return self

        T = len(windows[0])
        if self.window_size is not None and T != self.window_size:
            raise ValueError(f"window_size mismatch: expected {self.window_size}, got {T}")
        self._T = T

        # Bin edges are learned from every timestep of every window.
        all_steps = [step for w in windows for step in w]
        self._disc = FeatureDiscretizer(n_bins=self.n_bins)
        self._disc.fit(all_steps)
        self._lenc = LabelEncoder()
        self._lenc.fit(labels)
        self._rng = random.Random(self.random_seed)

        self._pred = _make_predictor(self._T, self.learning_rate,
                                     self.cred_max, self.lambda_power)

        for _ in range(self.n_epochs):
            # Rebuilt in the original order each epoch, then shuffled.
            pairs = list(zip(windows, labels))
            self._rng.shuffle(pairs)
            for window, label in pairs:
                self._train_window(window, label)

        self.is_fitted_ = True
        return self

    def partial_fit(self, X, y, classes=None) -> 'TimeSeriesClassifier':
        """
        Train on additional windows without refitting the discretizer.

        Delegates to fit() when no discretizer exists yet. ``classes`` is
        accepted but not used by this implementation.
        """
        if not hasattr(self, '_disc'):
            return self.fit(X, y)
        windows = [_to_window_rows(w) for w in X]
        labels = list(y)
        self._lenc.partial_fit(labels)
        for window, label in zip(windows, labels):
            self._train_window(window, label)
        return self

    def predict(self, X) -> list:
        """Return, for each window, the class with the highest predicted probability."""
        proba = self.predict_proba(X)
        return [max(d, key=d.get) for d in proba]

    def predict_proba(self, X) -> list:
        """Return one ``{class: probability}`` dict per window in X."""
        return [self._infer_window(_to_window_rows(w)) for w in X]

    def score(self, X, y) -> float:
        """
        Fraction of windows whose predicted class equals the true label.

        ``y`` is materialised once up front so a one-shot iterator is not
        exhausted before its length is taken. Returns 0 for empty ``y``.
        """
        truth = list(y)
        preds = self.predict(X)
        hits = sum(p == t for p, t in zip(preds, truth))
        return hits / max(len(truth), 1)

    @property
    def classes_(self) -> list:
        """Classes known to the label encoder, or ``[]`` before fitting."""
        return self._lenc.classes_ if hasattr(self, '_lenc') else []

    # ── internal ──────────────────────────────────────────────────────────────

    def _label_token(self, label) -> tuple:
        """Wrap an encoded label in a namespaced 2-tuple token."""
        return (_LABEL_NS_TS, self._lenc.encode(label))

    def _window_to_tokens(self, window: list) -> list:
        """Encode every timestep of ``window`` as a compound token."""
        return [_compound_token(self._disc._encode_row(step)) for step in window]

    def _train_window(self, window: list, label) -> None:
        """Train the predictor on one (window tokens, label token) pair via ``_train_one``."""
        _train_one(self._pred, self._window_to_tokens(window), self._label_token(label))

    def _infer_window(self, window: list) -> dict:
        """
        Return class probabilities for one window.

        The distribution from ``_infer_dist`` is restricted to the label
        tokens of the known classes and renormalised. If that mass is below
        1e-12 a uniform distribution is returned; with no known classes the
        result is ``{}``.
        """
        tokens = self._window_to_tokens(window)
        dist = _infer_dist(self._pred, tokens)
        classes = self._lenc.classes_

        if not classes:
            return {}

        totals = {c: dist.get(self._label_token(c), 0.0) for c in classes}
        total = sum(totals.values())
        if total < 1e-12:
            u = 1.0 / len(classes)
            return {c: u for c in classes}
        return {c: v / total for c, v in totals.items()}
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
367
|
+
# AnomalyDetector
|
|
368
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
369
|
+
|
|
370
|
+
class AnomalyDetector(BaseEstimator, OutlierMixin):
    """
    Online anomaly detection via prediction surprise.

    Trains a MultivariateTSPredictor on normal data. At inference each
    timestep receives score = -log2 P(actual | context). High score = anomalous.
    The trie is NOT updated during scoring.

    sklearn-compatible: works in Pipeline.

    Parameters
    ----------
    All parameters forwarded to MultivariateTSPredictor.
    """

    def __init__(
        self,
        n_bins: int = 8,
        context_length: int = 5,
        learning_rate: float = 0.08,
        cred_max: float = 6.05,
        lambda_power: float = 0.65,
    ):
        self.n_bins = n_bins
        self.context_length = context_length
        self.learning_rate = learning_rate
        self.cred_max = cred_max
        self.lambda_power = lambda_power

    def fit(self, X, y=None) -> 'AnomalyDetector':
        """Create a fresh MultivariateTSPredictor and fit it on X; ``y`` is ignored."""
        self._ts = MultivariateTSPredictor(
            n_bins=self.n_bins, context_length=self.context_length,
            learning_rate=self.learning_rate, cred_max=self.cred_max,
            lambda_power=self.lambda_power,
        )
        self._ts.fit(X)
        self.is_fitted_ = True
        return self

    def score_samples(self, X) -> list:
        """
        Anomaly score per timestep (higher = more anomalous).

        Each score is -log2 of the probability the predictor assigned to the
        observed token; a token missing from the distribution is scored with
        probability 1e-12. The predictor's history is advanced while scoring
        and restored afterwards, including when an error interrupts the loop.
        Returns ``[]`` for empty input.
        """
        self._check_fitted()
        rows = _to_window_rows(X)
        if not rows:
            return []

        predictor = self._ts._pred
        saved = predictor.history[:]
        scores = []
        try:
            for row in rows:
                token = _compound_token(self._ts._disc._encode_row(row))
                predictor.predict()
                prob = max(predictor._last_distribution.get(token, 1e-12), 1e-12)
                scores.append(-math.log2(prob))
                predictor.observe(token)
        finally:
            # Always undo the temporary history extension.
            predictor.history = saved
        return scores

    def predict(self, X) -> list:
        """
        Label each timestep: 1 = anomaly, -1 = normal.

        A timestep is an anomaly when its score exceeds mean + 2*std, where
        mean and (population) std are computed over the scores of X itself.

        NOTE(review): scikit-learn's OutlierMixin convention is the opposite
        (-1 = outlier, 1 = inlier). The sign is left as-is so existing
        callers keep working — confirm which convention is intended.
        """
        scores = self.score_samples(X)
        if not scores:
            return []
        mu = sum(scores) / len(scores)
        var = sum((s - mu) ** 2 for s in scores) / len(scores)
        threshold = mu + 2.0 * math.sqrt(max(var, 0.0))
        return [1 if s > threshold else -1 for s in scores]

    def decision_function(self, X) -> list:
        """sklearn OutlierMixin: negative anomaly score (higher = more normal)."""
        return [-s for s in self.score_samples(X)]

    def _check_fitted(self):
        """Raise RuntimeError unless fit() has created the wrapped predictor."""
        if not hasattr(self, '_ts'):
            raise RuntimeError("Call fit() first.")
|