uchi-python 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uchi/__init__.py +57 -0
- uchi/discretize.py +307 -0
- uchi/distributional.py +105 -0
- uchi/dual_predictor.py +172 -0
- uchi/forest.py +410 -0
- uchi/generative.py +910 -0
- uchi/hoeffding.py +225 -0
- uchi/long_term_store.py +345 -0
- uchi/node_compressor.py +492 -0
- uchi/online_tokenizer.py +349 -0
- uchi/predictor.py +578 -0
- uchi/semantic_tokenizer.py +48 -0
- uchi/tabular.py +401 -0
- uchi/timeseries.py +445 -0
- uchi_python-0.1.0.dist-info/METADATA +468 -0
- uchi_python-0.1.0.dist-info/RECORD +19 -0
- uchi_python-0.1.0.dist-info/WHEEL +5 -0
- uchi_python-0.1.0.dist-info/licenses/LICENSE +21 -0
- uchi_python-0.1.0.dist-info/top_level.txt +1 -0
uchi/__init__.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""
|
|
2
|
+
uchi
|
|
3
|
+
===============
|
|
4
|
+
Online credibility-weighted sequence predictor for tabular, time series,
|
|
5
|
+
and generative machine learning tasks.
|
|
6
|
+
|
|
7
|
+
Quick start
|
|
8
|
+
-----------
|
|
9
|
+
from uchi import TabularPredictor, TabularRegressor
|
|
10
|
+
from uchi import MultivariateTSPredictor, TimeSeriesClassifier
|
|
11
|
+
from uchi import AnomalyDetector
|
|
12
|
+
from uchi import UniversalPredictor, PredictorForest
|
|
13
|
+
|
|
14
|
+
All classes are sklearn-compatible (Pipeline, GridSearchCV, cross_val_score).
|
|
15
|
+
TabularPredictor / TabularRegressor / TimeSeriesClassifier all support
|
|
16
|
+
partial_fit() for online / incremental learning.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from .predictor import UniversalPredictor
|
|
20
|
+
from .forest import PredictorForest
|
|
21
|
+
from .discretize import FeatureDiscretizer, LabelEncoder
|
|
22
|
+
from .tabular import TabularPredictor, TabularRegressor
|
|
23
|
+
from .timeseries import MultivariateTSPredictor, TimeSeriesClassifier, AnomalyDetector
|
|
24
|
+
from .generative import SequenceGenerator, TabularGenerator, TimeSeriesGenerator
|
|
25
|
+
|
|
26
|
+
# Generative services fixes
|
|
27
|
+
from .long_term_store import LongTermStore
|
|
28
|
+
from .dual_predictor import DualPredictor
|
|
29
|
+
from .online_tokenizer import OnlineTokenizer
|
|
30
|
+
from .node_compressor import NodeCompressor
|
|
31
|
+
|
|
32
|
+
__version__ = "0.1.0"

# Names re-exported by the package, in the same order and grouping as the
# submodule imports above.
__all__ = [
    # Core engine
    "UniversalPredictor", "PredictorForest",
    # Feature engineering
    "FeatureDiscretizer", "LabelEncoder",
    # Tabular ML
    "TabularPredictor", "TabularRegressor",
    # Time series
    "MultivariateTSPredictor", "TimeSeriesClassifier", "AnomalyDetector",
    # Generative
    "SequenceGenerator", "TabularGenerator", "TimeSeriesGenerator",
    # Generative services fixes
    "LongTermStore", "DualPredictor", "OnlineTokenizer", "NodeCompressor",
]
|
uchi/discretize.py
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
"""
|
|
2
|
+
discretize.py
|
|
3
|
+
=============
|
|
4
|
+
Feature discretization for the Universal Sequence Predictor.
|
|
5
|
+
|
|
6
|
+
Converts continuous and categorical features into discrete symbol tokens
|
|
7
|
+
compatible with UniversalPredictor / PredictorForest.
|
|
8
|
+
|
|
9
|
+
FeatureDiscretizer — fits bins per-column, transforms rows to token lists
|
|
10
|
+
LabelEncoder — encodes classification targets to/from integers
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import bisect
import math
import random
from typing import Any
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# ── sentinel for missing / unseen values ─────────────────────────────────────
|
|
18
|
+
|
|
19
|
+
# Identity sentinel: it is only ever tested with `is`, so it cannot collide
# with a real feature value.
MISSING = object()
# Token emitted by the discretizer in place of missing or unseen values.
MISSING_STR = '__MISSING__'
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
24
|
+
# FeatureDiscretizer
|
|
25
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
26
|
+
|
|
27
|
+
class FeatureDiscretizer:
|
|
28
|
+
"""
|
|
29
|
+
Transforms a feature matrix into sequences of (feature_index, bin) tokens.
|
|
30
|
+
|
|
31
|
+
Column types detected automatically:
|
|
32
|
+
• numeric → equal-frequency (quantile) bins, labelled 0..n_bins-1
|
|
33
|
+
• other → ordinal integer encoding of unique values seen at fit time
|
|
34
|
+
|
|
35
|
+
Missing values (None, NaN, empty string) map to MISSING_STR so they form
|
|
36
|
+
their own trie branch rather than crashing or biasing bin counts.
|
|
37
|
+
|
|
38
|
+
Parameters
|
|
39
|
+
----------
|
|
40
|
+
n_bins : int
|
|
41
|
+
Number of quantile bins for numeric columns.
|
|
42
|
+
feature_names : list[str] | None
|
|
43
|
+
Optional column names, used only for repr/debugging.
|
|
44
|
+
|
|
45
|
+
Usage
|
|
46
|
+
-----
|
|
47
|
+
disc = FeatureDiscretizer(n_bins=10)
|
|
48
|
+
token_rows = disc.fit_transform(X_train) # list of [(col, bin), ...]
|
|
49
|
+
test_rows = disc.transform(X_test)
|
|
50
|
+
"""
|
|
51
|
+
|
|
52
|
+
def __init__(self, n_bins: int = 10, feature_names: list | None = None):
|
|
53
|
+
self.n_bins = n_bins
|
|
54
|
+
self.feature_names = feature_names
|
|
55
|
+
self._n_features: int = 0
|
|
56
|
+
self._types: list = [] # 'numeric' | 'categorical' per column
|
|
57
|
+
self._edges: dict = {} # col → sorted list of quantile cut-points
|
|
58
|
+
self._cat_maps: dict = {} # col → {value: int}
|
|
59
|
+
self._bin_centers: dict = {} # col → list of float centers (numeric only)
|
|
60
|
+
|
|
61
|
+
# Reservoir sampling state for dynamic online splitting
|
|
62
|
+
self._reservoirs: dict = {} # col -> list of sampled float values
|
|
63
|
+
self._reservoir_max: int = 2000
|
|
64
|
+
|
|
65
|
+
# ── public API ────────────────────────────────────────────────────────────
|
|
66
|
+
|
|
67
|
+
def fit(self, X) -> 'FeatureDiscretizer':
|
|
68
|
+
X = _to_rows(X)
|
|
69
|
+
if not X:
|
|
70
|
+
return self
|
|
71
|
+
self._n_features = len(X[0])
|
|
72
|
+
self._types = []
|
|
73
|
+
self._edges = {}
|
|
74
|
+
self._cat_maps = {}
|
|
75
|
+
self._bin_centers = {}
|
|
76
|
+
|
|
77
|
+
for j in range(self._n_features):
|
|
78
|
+
col = [row[j] for row in X]
|
|
79
|
+
if _is_numeric_col(col):
|
|
80
|
+
self._types.append('numeric')
|
|
81
|
+
self._reservoirs[j] = [float(v) for v in col if not _is_missing(v) and not (isinstance(v, float) and math.isnan(v))]
|
|
82
|
+
edges, centers = _quantile_edges(col, self.n_bins)
|
|
83
|
+
self._edges[j] = edges
|
|
84
|
+
self._bin_centers[j] = centers
|
|
85
|
+
else:
|
|
86
|
+
self._types.append('categorical')
|
|
87
|
+
unique = sorted(
|
|
88
|
+
{_safe_str(v) for v in col if not _is_missing(v)})
|
|
89
|
+
self._cat_maps[j] = {v: i for i, v in enumerate(unique)}
|
|
90
|
+
self._total_seen = len(X)
|
|
91
|
+
return self
|
|
92
|
+
|
|
93
|
+
def partial_fit(self, X) -> 'FeatureDiscretizer':
|
|
94
|
+
"""
|
|
95
|
+
Online dynamic splitting: Update the reservoir sample of numeric columns
|
|
96
|
+
and re-calculate quantile bins if enough new data has arrived.
|
|
97
|
+
"""
|
|
98
|
+
if self._n_features == 0:
|
|
99
|
+
return self.fit(X)
|
|
100
|
+
|
|
101
|
+
rows = _to_rows(X)
|
|
102
|
+
if not rows:
|
|
103
|
+
return self
|
|
104
|
+
|
|
105
|
+
import random
|
|
106
|
+
for row in rows:
|
|
107
|
+
self._total_seen += 1
|
|
108
|
+
for j in range(self._n_features):
|
|
109
|
+
v = row[j]
|
|
110
|
+
if self._types[j] == 'numeric':
|
|
111
|
+
if _is_missing(v):
|
|
112
|
+
continue
|
|
113
|
+
v = float(v)
|
|
114
|
+
# Reservoir sampling
|
|
115
|
+
if len(self._reservoirs[j]) < self._reservoir_max:
|
|
116
|
+
self._reservoirs[j].append(v)
|
|
117
|
+
else:
|
|
118
|
+
idx = random.randint(0, self._total_seen - 1)
|
|
119
|
+
if idx < self._reservoir_max:
|
|
120
|
+
self._reservoirs[j][idx] = v
|
|
121
|
+
else:
|
|
122
|
+
# Categorical: dynamically add unseen categories
|
|
123
|
+
if not _is_missing(v):
|
|
124
|
+
s = _safe_str(v)
|
|
125
|
+
if s not in self._cat_maps[j]:
|
|
126
|
+
self._cat_maps[j][s] = len(self._cat_maps[j])
|
|
127
|
+
|
|
128
|
+
# Re-compute bins periodically (e.g., every time we process a batch)
|
|
129
|
+
for j in range(self._n_features):
|
|
130
|
+
if self._types[j] == 'numeric' and self._reservoirs[j]:
|
|
131
|
+
edges, centers = _quantile_edges(self._reservoirs[j], self.n_bins)
|
|
132
|
+
self._edges[j] = edges
|
|
133
|
+
self._bin_centers[j] = centers
|
|
134
|
+
|
|
135
|
+
return self
|
|
136
|
+
|
|
137
|
+
def transform(self, X) -> list:
|
|
138
|
+
"""Return list-of-lists of (col_idx, bin_or_code) tokens."""
|
|
139
|
+
rows = _to_rows(X)
|
|
140
|
+
return [self._encode_row(row) for row in rows]
|
|
141
|
+
|
|
142
|
+
def fit_transform(self, X) -> list:
|
|
143
|
+
return self.fit(X).transform(X)
|
|
144
|
+
|
|
145
|
+
def bin_center(self, col: int, bin_idx: int) -> float:
|
|
146
|
+
"""Inverse of numeric binning: approximate value at bin centre."""
|
|
147
|
+
centers = self._bin_centers.get(col, [])
|
|
148
|
+
if not centers:
|
|
149
|
+
return 0.0
|
|
150
|
+
return centers[min(bin_idx, len(centers) - 1)]
|
|
151
|
+
|
|
152
|
+
@property
|
|
153
|
+
def n_features(self) -> int:
|
|
154
|
+
return self._n_features
|
|
155
|
+
|
|
156
|
+
# ── internal ──────────────────────────────────────────────────────────────
|
|
157
|
+
|
|
158
|
+
def _encode_row(self, row: list) -> list:
|
|
159
|
+
tokens = []
|
|
160
|
+
for j, v in enumerate(row):
|
|
161
|
+
tokens.append((j, self._encode_val(j, v)))
|
|
162
|
+
return tokens
|
|
163
|
+
|
|
164
|
+
def _encode_val(self, j: int, v) -> Any:
|
|
165
|
+
if _is_missing(v):
|
|
166
|
+
return MISSING_STR
|
|
167
|
+
if self._types[j] == 'numeric':
|
|
168
|
+
return _bin_search(float(v), self._edges[j])
|
|
169
|
+
else:
|
|
170
|
+
return self._cat_maps[j].get(_safe_str(v), MISSING_STR)
|
|
171
|
+
|
|
172
|
+
def __repr__(self) -> str:
|
|
173
|
+
return (f'FeatureDiscretizer(n_bins={self.n_bins}, '
|
|
174
|
+
f'n_features={self._n_features})')
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
178
|
+
# LabelEncoder
|
|
179
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
180
|
+
|
|
181
|
+
class LabelEncoder:
    """
    Two-way mapping between raw class labels and consecutive integer codes.

    Codes are handed out in first-seen order starting at 0; labels that show
    up later (through fit, partial_fit or encode) receive the next free code.
    """

    def __init__(self):
        self._enc: dict = {}      # label → int
        self._dec: dict = {}      # int → label
        self.classes_: list = []  # labels in code order

    def fit(self, y) -> 'LabelEncoder':
        """Register every label of y (existing codes are kept). Returns self."""
        for item in y:
            self._add(item)
        return self

    def partial_fit(self, y) -> 'LabelEncoder':
        """Incremental alias of fit()."""
        return self.fit(y)

    def encode(self, label) -> int:
        """Return the code of label, assigning a new one if it is unknown."""
        try:
            return self._enc[label]
        except KeyError:
            self._add(label)
            return self._enc[label]

    def decode(self, code: int):
        """Return the label stored for code (KeyError if unknown)."""
        return self._dec[code]

    def __len__(self) -> int:
        return len(self._enc)

    def _add(self, label):
        """Assign the next free code to label unless it already has one."""
        if label in self._enc:
            return
        next_code = len(self._enc)
        self._enc[label] = next_code
        self._dec[next_code] = label
        self.classes_.append(label)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
220
|
+
# Helpers
|
|
221
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
222
|
+
|
|
223
|
+
def _to_rows(X) -> list:
|
|
224
|
+
"""Accept numpy array, pandas DataFrame, or list-of-lists."""
|
|
225
|
+
try:
|
|
226
|
+
import numpy as np
|
|
227
|
+
if isinstance(X, np.ndarray):
|
|
228
|
+
return X.tolist()
|
|
229
|
+
except ImportError:
|
|
230
|
+
pass
|
|
231
|
+
try:
|
|
232
|
+
import pandas as pd
|
|
233
|
+
if isinstance(X, (pd.DataFrame, pd.Series)):
|
|
234
|
+
return X.values.tolist()
|
|
235
|
+
except ImportError:
|
|
236
|
+
pass
|
|
237
|
+
# Already a list; wrap scalars (1-D input → single-column matrix)
|
|
238
|
+
out = list(X)
|
|
239
|
+
if out and not isinstance(out[0], (list, tuple)):
|
|
240
|
+
out = [[v] for v in out]
|
|
241
|
+
return out
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _is_missing(v) -> bool:
|
|
245
|
+
if v is None or v is MISSING:
|
|
246
|
+
return True
|
|
247
|
+
if isinstance(v, float) and math.isnan(v):
|
|
248
|
+
return True
|
|
249
|
+
if isinstance(v, str) and v.strip() == '':
|
|
250
|
+
return True
|
|
251
|
+
return False
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _is_numeric_col(col: list) -> bool:
|
|
255
|
+
"""True if any non-missing value is a float/int (non-bool)."""
|
|
256
|
+
for v in col:
|
|
257
|
+
if _is_missing(v):
|
|
258
|
+
continue
|
|
259
|
+
if isinstance(v, bool):
|
|
260
|
+
return False
|
|
261
|
+
if isinstance(v, (int, float)):
|
|
262
|
+
return True
|
|
263
|
+
return False
|
|
264
|
+
return False
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _safe_str(v) -> str:
|
|
268
|
+
return str(v)
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _quantile_edges(col: list, n_bins: int) -> tuple:
|
|
272
|
+
"""
|
|
273
|
+
Compute (n_bins-1) quantile cut-points and n_bins bin centres.
|
|
274
|
+
Values beyond edges map to the first or last bin (clamped).
|
|
275
|
+
"""
|
|
276
|
+
valid = sorted(float(v) for v in col if not _is_missing(v)
|
|
277
|
+
and not (isinstance(v, float) and math.isnan(v)))
|
|
278
|
+
if not valid:
|
|
279
|
+
return [], [0.0]
|
|
280
|
+
|
|
281
|
+
n = len(valid)
|
|
282
|
+
edges = []
|
|
283
|
+
for i in range(1, n_bins):
|
|
284
|
+
idx = int(i * n / n_bins)
|
|
285
|
+
edges.append(valid[min(idx, n - 1)])
|
|
286
|
+
# Deduplicate while keeping order
|
|
287
|
+
edges = sorted(set(edges))
|
|
288
|
+
|
|
289
|
+
# Bin centres: midpoint between consecutive edges
|
|
290
|
+
boundaries = [valid[0] - 1e-9] + edges + [valid[-1] + 1e-9]
|
|
291
|
+
centers = [
|
|
292
|
+
(boundaries[i] + boundaries[i + 1]) / 2.0
|
|
293
|
+
for i in range(len(boundaries) - 1)
|
|
294
|
+
]
|
|
295
|
+
return edges, centers
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _bin_search(v: float, edges: list) -> int:
|
|
299
|
+
"""Binary-search bin index for value v given sorted cut-points."""
|
|
300
|
+
lo, hi = 0, len(edges)
|
|
301
|
+
while lo < hi:
|
|
302
|
+
mid = (lo + hi) // 2
|
|
303
|
+
if v <= edges[mid]:
|
|
304
|
+
hi = mid
|
|
305
|
+
else:
|
|
306
|
+
lo = mid + 1
|
|
307
|
+
return lo
|
uchi/distributional.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import math
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
def js_divergence(p: dict, q: dict) -> float:
    """
    Calculate the Jensen-Shannon divergence (base 2) between two
    probability dicts.

    Keys absent from one dict count as probability 0 there.  Two empty dicts
    give 0.0.  The result is never negative: float round-off that would push
    it below zero is clamped.
    """
    vocab = set(p) | set(q)
    if not vocab:
        return 0.0

    total = 0.0
    for k in vocab:
        pk = p.get(k, 0.0)
        qk = q.get(k, 0.0)
        mk = 0.5 * (pk + qk)  # mixture distribution M = (P + Q) / 2
        for mass in (pk, qk):
            if mass > 0:
                if mk == 0:
                    # only possible with invalid (negative) input
                    return float('inf')
                # contribution of this key to 0.5 * KL(· || M)
                total += 0.5 * mass * math.log2(mass / mk)
    return max(0.0, total)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DistributionalTokenizer:
    """
    Online semantic clustering via Jensen-Shannon divergence.
    Replaces NLTK WordNet with pure mathematical distributions.
    If two tokens are followed by similar distributions, they are merged.

    Parameters
    ----------
    merge_threshold_jsd : float
        Two tokens are merged when the divergence between their successor
        distributions is below this value.
    min_obs : int
        Minimum number of recorded successors a token needs before it can
        take part in clustering.
    cluster_interval : int
        A clustering attempt is made for a token each time its successor
        count reaches a multiple of this value (default 100).
    """

    def __init__(self, merge_threshold_jsd: float = 0.1, min_obs: int = 50,
                 cluster_interval: int = 100):
        if cluster_interval < 1:
            raise ValueError('cluster_interval must be >= 1')
        self.merge_threshold = merge_threshold_jsd
        self.min_obs = min_obs
        self.cluster_interval = cluster_interval

        self.successor_counts = {}  # token -> dict of next_token -> count
        self.token_totals = {}      # token -> int

        self.clusters = {}          # token -> cluster_id
        self.next_cluster_id = 0

        self._last_token = None

    def _normalize(self, token: Any) -> dict:
        """Return the successor distribution of token ({} if none recorded)."""
        counts = self.successor_counts.get(token, {})
        total = self.token_totals.get(token, 0)
        if total == 0:
            return {}
        return {k: v / total for k, v in counts.items()}

    def observe(self, token: Any) -> None:
        """
        Record token as the successor of the previously observed token and
        periodically try to cluster that previous token.
        """
        prev = self._last_token
        if prev is not None:
            successors = self.successor_counts.setdefault(prev, {})
            successors[token] = successors.get(token, 0) + 1
            total = self.token_totals.get(prev, 0) + 1
            self.token_totals[prev] = total

            # Periodically attempt clustering
            if total % self.cluster_interval == 0 and total >= self.min_obs:
                self._attempt_cluster(prev)

        self._last_token = token

    def _attempt_cluster(self, target_token: Any):
        """
        Merge target_token with the token whose successor distribution is
        closest, provided the divergence is below the merge threshold.
        Tokens that already belong to a cluster are left alone.
        """
        if target_token in self.clusters:
            return

        target_dist = self._normalize(target_token)

        best_candidate = None
        lowest_jsd = float('inf')

        for candidate, total in self.token_totals.items():
            if candidate == target_token or total < self.min_obs:
                continue

            jsd = js_divergence(target_dist, self._normalize(candidate))
            if jsd < lowest_jsd:
                lowest_jsd = jsd
                best_candidate = candidate

        if best_candidate is None or not lowest_jsd < self.merge_threshold:
            return

        # Merge: join the candidate's cluster, creating it if necessary
        if best_candidate in self.clusters:
            c_id = self.clusters[best_candidate]
        else:
            c_id = f"__CLUSTER_{self.next_cluster_id}__"
            self.clusters[best_candidate] = c_id
            self.next_cluster_id += 1

        self.clusters[target_token] = c_id

    def tokenize(self, token: Any) -> Any:
        """Tokenize a single token and update distributions online."""
        self.observe(token)
        return self.clusters.get(token, token)
|
uchi/dual_predictor.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DualPredictor
|
|
3
|
+
=============
|
|
4
|
+
Solves Problem 7 — Stationary/drift tradeoff.
|
|
5
|
+
|
|
6
|
+
Two UniversalPredictors run in parallel:
|
|
7
|
+
stable — high cred_max, high lambda_power: best on stationary data
|
|
8
|
+
drift — low cred_max, low lambda_power: fastest drift recovery
|
|
9
|
+
|
|
10
|
+
Their output distributions are blended by a weight that tracks the recent
|
|
11
|
+
error rate on a rolling window. When the error rate is high (something
|
|
12
|
+
changed), weight shifts toward the drift predictor. When low and stable,
|
|
13
|
+
weight shifts toward the stability predictor.
|
|
14
|
+
|
|
15
|
+
The routing is fully automatic and requires no configuration beyond the
|
|
16
|
+
window size.
|
|
17
|
+
|
|
18
|
+
API
|
|
19
|
+
---
|
|
20
|
+
dp = DualPredictor(context_length=4)
|
|
21
|
+
dp.observe(token)
|
|
22
|
+
pred, conf = dp.predict()
|
|
23
|
+
dp.feedback(actual)
|
|
24
|
+
dp.error_rate # current rolling error rate
|
|
25
|
+
dp.active_predictor # 'stable' | 'drift' | 'blend'
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from collections import deque
|
|
29
|
+
from .predictor import UniversalPredictor
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class DualPredictor:
    """
    Blend of a stability-tuned and a drift-tuned UniversalPredictor, routed
    by the rolling error rate of the blended prediction.

    Parameters
    ----------
    context_length : int
        Passed as the first argument to both predictors.
    window : int
        Rolling window size for error rate tracking (default 50).
    drift_threshold : float
        Error rate at or above which only the drift predictor is used
        (default 0.4).
    stable_threshold : float
        Error rate at or below which only the stability predictor is used
        (default 0.15).
    stable_cred_max, stable_lambda_power : float
        Hyperparameters for the stability-tuned predictor.
    drift_cred_max, drift_lambda_power : float
        Hyperparameters for the drift-tuned predictor.
    learning_rate : float
        Shared learning rate for both predictors.
    **kwargs
        Forwarded unchanged to both predictors.
    """

    def __init__(
        self,
        context_length: int,
        window: int = 50,
        drift_threshold: float = 0.4,
        stable_threshold: float = 0.15,
        stable_cred_max: float = 8.0,
        stable_lambda_power: float = 0.8,
        drift_cred_max: float = 3.0,
        drift_lambda_power: float = 0.4,
        learning_rate: float = 0.08,
        **kwargs,
    ):
        def build(cred_max, lambda_power):
            # both predictors differ only in these two hyperparameters
            return UniversalPredictor(
                context_length,
                learning_rate=learning_rate,
                cred_max=cred_max,
                lambda_power=lambda_power,
                **kwargs,
            )

        self._stable = build(stable_cred_max, stable_lambda_power)
        self._drift = build(drift_cred_max, drift_lambda_power)
        self._window = deque(maxlen=window)   # 1 = miss, 0 = hit
        self._drift_threshold = drift_threshold
        self._stable_threshold = stable_threshold
        self._error_rate: float = 0.5         # neutral until feedback arrives
        self._last_pred = None
        self._last_dist: dict = {}
        self._history: list[dict] = []        # per-step routing log

    # ── public API ────────────────────────────────────────────────────────────

    def observe(self, token) -> 'DualPredictor':
        """Feed token to both predictors. Returns self."""
        for predictor in (self._stable, self._drift):
            predictor.observe(token)
        return self

    def predict(self):
        """
        Blend the stable and drift distributions with the current routing
        weights and return (predicted_token, confidence).

        Returns (None, 0.0) when neither predictor offers a distribution.
        """
        self._stable.predict()
        self._drift.predict()
        stable_dist = self._stable._last_distribution
        drift_dist = self._drift._last_distribution

        w_drift, w_stable = self._routing_weights()

        all_keys = set(stable_dist) | set(drift_dist)
        if not all_keys:
            self._last_pred, self._last_dist = None, {}
            return None, 0.0

        blended = {}
        for key in all_keys:
            blended[key] = (w_stable * stable_dist.get(key, 0.0)
                            + w_drift * drift_dist.get(key, 0.0))

        # Renormalise; an all-zero blend is left as is (divide by 1.0)
        norm = sum(blended.values()) or 1.0
        self._last_dist = {key: mass / norm for key, mass in blended.items()}
        best = max(self._last_dist, key=self._last_dist.get)
        self._last_pred = best
        return best, self._last_dist[best]

    def feedback(self, actual) -> None:
        """Score the last prediction against actual, then update both predictors."""
        self._window.append(1 if self._last_pred != actual else 0)
        if self._window:
            self._error_rate = sum(self._window) / len(self._window)
        else:
            self._error_rate = 0.5
        self._stable.feedback(actual)
        self._drift.feedback(actual)

    # ── diagnostics ───────────────────────────────────────────────────────────

    @property
    def error_rate(self) -> float:
        """Current rolling error rate."""
        return self._error_rate

    @property
    def active_predictor(self) -> str:
        """'stable' or 'drift' when that weight exceeds 0.8, else 'blend'."""
        drift_weight, stable_weight = self._routing_weights()
        if stable_weight > 0.8:
            return 'stable'
        if drift_weight > 0.8:
            return 'drift'
        return 'blend'

    @property
    def _last_distribution(self) -> dict:
        return self._last_dist

    @property
    def _vocab(self) -> set:
        return self._stable._vocab

    @property
    def history(self) -> list:
        """History of the stability predictor."""
        return self._stable.history

    # ── internal ──────────────────────────────────────────────────────────────

    def _routing_weights(self) -> tuple[float, float]:
        """
        Returns (w_drift, w_stable).

        error_rate >= drift_threshold   → (1.0, 0.0)
        error_rate <= stable_threshold  → (0.0, 1.0)
        in between                      → linear interpolation
        """
        rate = self._error_rate
        upper = self._drift_threshold
        lower = self._stable_threshold
        if rate >= upper:
            return 1.0, 0.0
        if rate <= lower:
            return 0.0, 1.0
        drift_weight = (rate - lower) / (upper - lower)
        return drift_weight, 1.0 - drift_weight
|