uchi-python 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uchi/__init__.py +57 -0
- uchi/discretize.py +307 -0
- uchi/distributional.py +105 -0
- uchi/dual_predictor.py +172 -0
- uchi/forest.py +410 -0
- uchi/generative.py +910 -0
- uchi/hoeffding.py +225 -0
- uchi/long_term_store.py +345 -0
- uchi/node_compressor.py +492 -0
- uchi/online_tokenizer.py +349 -0
- uchi/predictor.py +578 -0
- uchi/semantic_tokenizer.py +48 -0
- uchi/tabular.py +401 -0
- uchi/timeseries.py +445 -0
- uchi_python-0.1.0.dist-info/METADATA +468 -0
- uchi_python-0.1.0.dist-info/RECORD +19 -0
- uchi_python-0.1.0.dist-info/WHEEL +5 -0
- uchi_python-0.1.0.dist-info/licenses/LICENSE +21 -0
- uchi_python-0.1.0.dist-info/top_level.txt +1 -0
uchi/tabular.py
ADDED
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
"""
|
|
2
|
+
tabular.py
|
|
3
|
+
==========
|
|
4
|
+
Tabular classification and regression via the Universal Sequence Predictor.
|
|
5
|
+
|
|
6
|
+
How it works
|
|
7
|
+
------------
|
|
8
|
+
Each row is encoded as an ordered sequence of (feature_index, bin) tokens.
|
|
9
|
+
The class label (or regression bin) is the next token after all feature tokens.
|
|
10
|
+
The predictor learns the conditional distribution P(label | feature_sequence).
|
|
11
|
+
|
|
12
|
+
Diversity comes from running multiple predictors with different feature orderings:
|
|
13
|
+
• MI-ascending — least informative feature first, most informative last
|
|
14
|
+
• MI-descending — most informative feature first
|
|
15
|
+
• natural — as supplied
|
|
16
|
+
• shuffled — random permutation (adds variety)
|
|
17
|
+
|
|
18
|
+
At inference, label probability distributions are averaged across all predictors.
|
|
19
|
+
|
|
20
|
+
Classes
|
|
21
|
+
-------
|
|
22
|
+
TabularPredictor — classification (sklearn-compatible)
|
|
23
|
+
TabularRegressor — regression (sklearn-compatible)
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import math
|
|
29
|
+
import random
|
|
30
|
+
from typing import Any
|
|
31
|
+
|
|
32
|
+
from .predictor import UniversalPredictor
|
|
33
|
+
from .discretize import FeatureDiscretizer, LabelEncoder, _to_rows
|
|
34
|
+
|
|
35
|
+
try:
|
|
36
|
+
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
|
|
37
|
+
_SKLEARN = True
|
|
38
|
+
except ImportError:
|
|
39
|
+
class BaseEstimator: pass
|
|
40
|
+
class ClassifierMixin: pass
|
|
41
|
+
class RegressorMixin: pass
|
|
42
|
+
_SKLEARN = False
|
|
43
|
+
|
|
44
|
+
# Namespace tags used as the first element of the 2-tuple "answer" tokens:
# TabularPredictor._label_token builds (_LABEL_NS, encoded_label) and
# TabularRegressor._target_token builds (_TARGET_NS, target_bin).
# NOTE(review): presumably the string tag keeps these tokens distinct from the
# (feature_index, bin) feature tokens, whose first element is an index.
_LABEL_NS = '__label__'
_TARGET_NS = '__target__'
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
49
|
+
# Shared internals
|
|
50
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
51
|
+
|
|
52
|
+
def _make_predictor(
    k: int, lr: float, cred_max: float, lp: float,
    use_similarity_fallback: bool = False,
    use_positional_weights: bool = False,
) -> UniversalPredictor:
    """Create one ensemble member with the settings shared by all tabular models.

    Parameters
    ----------
    k : int
        First positional argument of ``UniversalPredictor``; the estimators in
        this module pass their context length here.
    lr : float
        Forwarded as ``learning_rate``.
    cred_max : float
        Forwarded as ``cred_max``.
    lp : float
        Forwarded as ``lambda_power``.
    use_similarity_fallback, use_positional_weights : bool
        Forwarded unchanged under the same keyword names.

    Returns
    -------
    UniversalPredictor
        A freshly constructed predictor.
    """
    # Settings that are hard-wired for tabular use.
    fixed = {
        'vigilance': 0.3,
        'adaptive_cap': True,
        'binary_correction_scale': 0.05,
        'cont_count_min_vocab': 4,
    }
    # Settings supplied by the caller.
    tunable = {
        'learning_rate': lr,
        'cred_max': cred_max,
        'lambda_power': lp,
        'use_similarity_fallback': use_similarity_fallback,
        'use_positional_weights': use_positional_weights,
    }
    # The second positional argument is always None here.
    # NOTE(review): its meaning is defined by UniversalPredictor — confirm there.
    return UniversalPredictor(k, None, **fixed, **tunable)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _mi_order(token_rows: list, y_encoded: list, n_features: int) -> list:
|
|
72
|
+
"""Feature indices sorted by MI with y (ascending). Falls back to natural order."""
|
|
73
|
+
try:
|
|
74
|
+
import numpy as np
|
|
75
|
+
from sklearn.feature_selection import mutual_info_classif
|
|
76
|
+
|
|
77
|
+
X_arr = np.zeros((len(token_rows), n_features), dtype=float)
|
|
78
|
+
for i, row in enumerate(token_rows):
|
|
79
|
+
for j, val in row:
|
|
80
|
+
X_arr[i, j] = -1 if val == '__MISSING__' else (
|
|
81
|
+
float(val) if isinstance(val, int) else hash(val) % 100)
|
|
82
|
+
y_arr = np.array(y_encoded)
|
|
83
|
+
mi = mutual_info_classif(X_arr, y_arr, discrete_features=True, random_state=0)
|
|
84
|
+
return list(np.argsort(mi))
|
|
85
|
+
except (ImportError, Exception):
|
|
86
|
+
return list(range(n_features))
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _apply_order(token_row: list, order: list) -> list:
|
|
90
|
+
idx_map = {tok[0]: tok for tok in token_row}
|
|
91
|
+
return [idx_map[i] for i in order if i in idx_map]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _set_history(p: UniversalPredictor, tokens: list) -> None:
|
|
95
|
+
p.history = list(tokens)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _train_one(p: UniversalPredictor, ordered_tokens: list, next_token: Any) -> None:
    """Run one training step: context ``ordered_tokens`` is followed by ``next_token``.

    The predictor's history is overwritten with the context, a prediction is
    made, and ``next_token`` is passed first to ``observe`` and then to
    ``feedback``.  The history is emptied afterwards, so nothing carries over
    to the next call.
    """
    _set_history(p, ordered_tokens)
    p.predict()
    # Order matters: observe() runs before feedback(), both on the same token.
    for consume in (p.observe, p.feedback):
        consume(next_token)
    p.history.clear()
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _infer_dist(p: UniversalPredictor, ordered_tokens: list) -> dict:
|
|
107
|
+
saved = p.history[:]
|
|
108
|
+
_set_history(p, ordered_tokens)
|
|
109
|
+
p.predict()
|
|
110
|
+
dist = dict(p._last_distribution)
|
|
111
|
+
p.history = saved
|
|
112
|
+
return dist
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _build_orders(rows, y_enc, n_feat, n_orderings, rng) -> list:
    """Build the feature orderings used by the ensemble.

    Candidates are produced in this priority: MI-ascending, MI-descending,
    natural order, then as many shuffled permutations of the natural order as
    are needed to reach ``n_orderings``.

    Parameters
    ----------
    rows, y_enc : list
        Discretised rows and discrete targets, passed to ``_mi_order``.
    n_feat : int
        Number of features.
    n_orderings : int
        Number of orderings wanted; values below zero are treated as zero.
    rng : random.Random | None
        Generator used for the shuffled permutations, so they follow the
        estimator's ``random_seed``.  When ``None``, each permutation falls
        back to its own ``random.Random(i)`` as before.

    Returns
    -------
    list
        At most ``n_orderings`` lists of feature indices.
    """
    # A negative count would otherwise slice from the end (e.g. -1 -> 2 items).
    wanted = max(n_orderings, 0)

    natural = list(range(n_feat))
    mi_asc = _mi_order(rows, y_enc, n_feat)
    candidates = [mi_asc, list(reversed(mi_asc)), natural]

    for seed in range(wanted - len(candidates)):
        perm = natural[:]
        # Previously `rng` was accepted but never used, so the shuffled
        # orderings were identical for every random_seed.
        shuffler = rng if rng is not None else random.Random(seed)
        shuffler.shuffle(perm)
        candidates.append(perm)
    return candidates[:wanted]
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
128
|
+
# TabularPredictor
|
|
129
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
130
|
+
|
|
131
|
+
class TabularPredictor(BaseEstimator, ClassifierMixin):
    """
    Tabular classification via feature-as-sequence encoding.

    Each discretised row is fed to ``n_orderings`` predictors, every one seeing
    the feature tokens in its own order followed by a label token.  At
    inference the per-class probabilities of all predictors are summed and
    renormalised.

    Designed to be sklearn-compatible (Pipeline, GridSearchCV, cross_val_score).
    Supports partial_fit for online / incremental learning.
    Note that ``predict_proba`` returns one ``{class: probability}`` dict per
    row rather than an array.

    Parameters
    ----------
    n_bins : int
        Quantile bins for continuous features (default 10).
    context_length : int | None
        Trie depth k.  None = number of features (recommended).
    n_orderings : int
        Number of feature orderings to ensemble (default 3).
    n_epochs : int
        Training passes over the data (default 1).
    learning_rate, cred_max, lambda_power : float
        Forwarded to every underlying predictor.
    random_seed : int
        Seed of the ``random.Random`` used to shuffle the training order.
    """

    def __init__(
        self,
        n_bins: int = 10,
        context_length: int | None = None,
        n_orderings: int = 3,
        n_epochs: int = 1,
        learning_rate: float = 0.08,
        cred_max: float = 6.05,
        lambda_power: float = 0.65,
        random_seed: int = 42,
    ):
        self.n_bins = n_bins
        self.context_length = context_length
        self.n_orderings = n_orderings
        self.n_epochs = n_epochs
        self.learning_rate = learning_rate
        self.cred_max = cred_max
        self.lambda_power = lambda_power
        self.random_seed = random_seed
        # Samples received through partial_fit that have not been flushed yet.
        self._replay_buffer = []
        # Buffer size at which partial_fit actually trains.
        self._replay_batch_size = 100

    # ── public API ────────────────────────────────────────────────────────────

    def fit(self, X, y) -> 'TabularPredictor':
        """Fit from scratch on ``X`` / ``y`` and return ``self``.

        Discretiser, label encoder, random generator, feature orderings and
        predictors are all rebuilt, so any previously learned state is dropped.
        """
        self._disc = FeatureDiscretizer(n_bins=self.n_bins)
        self._lenc = LabelEncoder()
        self._rng = random.Random(self.random_seed)
        self._preds = []
        self._orders = []
        # Buffered partial_fit samples were tokenised by the old discretiser;
        # they must not leak into the freshly fitted model.
        self._replay_buffer = []

        rows = self._disc.fit_transform(X)
        labels = list(y)
        self._lenc.fit(labels)
        y_enc = [self._lenc.encode(l) for l in labels]

        n_feat = self._disc.n_features
        k = n_feat if self.context_length is None else self.context_length
        self._orders = _build_orders(rows, y_enc, n_feat, self.n_orderings, self._rng)
        self._preds = [_make_predictor(k, self.learning_rate, self.cred_max, self.lambda_power)
                       for _ in self._orders]

        for _ in range(self.n_epochs):
            # Rebuilt every epoch so each shuffle starts from the input order.
            pairs = list(zip(rows, labels))
            self._rng.shuffle(pairs)
            for tok_row, label in pairs:
                self._train_row(tok_row, label)

        self.is_fitted_ = True
        return self

    def partial_fit(self, X, y, classes=None) -> 'TabularPredictor':
        """Incrementally update the model with a new batch and return ``self``.

        The first call on an unfitted estimator is a plain ``fit``.  Later
        calls update the discretiser and label encoder, then append the
        samples to a replay buffer.  Training only happens once the buffer
        holds at least ``_replay_batch_size`` samples: the whole buffer is
        replayed ``n_epochs`` times in shuffled order, after which the newest
        20 % of a batch is kept to overlap with the next flush.

        ``classes`` is accepted for scikit-learn API compatibility and is
        currently not used.
        """
        if not hasattr(self, '_disc'):
            return self.fit(X, y)
        self._disc.partial_fit(X)
        rows = self._disc.transform(X)
        labels = list(y)
        self._lenc.partial_fit(labels)

        # Experience replay: queue the new samples.
        self._replay_buffer.extend(zip(rows, labels))

        if len(self._replay_buffer) >= self._replay_batch_size:
            # Train on the buffer multiple times to stabilize.
            for _ in range(self.n_epochs):
                batch = self._replay_buffer[:]
                self._rng.shuffle(batch)
                for tok_row, label in batch:
                    self._train_row(tok_row, label)

            # Keep the last 20% to mix with incoming data (sliding window overlap).
            keep = int(self._replay_batch_size * 0.2)
            # `[-0:]` would keep the entire buffer, hence the explicit guard.
            self._replay_buffer = self._replay_buffer[-keep:] if keep > 0 else []

        return self

    def predict(self, X) -> list:
        """Return the most probable class for every row of ``X``."""
        proba = self.predict_proba(X)
        return [max(d, key=d.get) for d in proba]

    def predict_proba(self, X) -> list:
        """Return one ``{class: probability}`` dict per row of ``X``."""
        rows = self._disc.transform(X)
        return [self._infer_row(r) for r in rows]

    def score(self, X, y) -> float:
        """Return the accuracy on ``X`` / ``y`` (0.0 when ``y`` is empty)."""
        preds = self.predict(X)
        # Materialise y once: it is needed for both the comparison and the
        # count, and a one-shot iterable would be exhausted after the first.
        y_true = list(y)
        hits = sum(p == t for p, t in zip(preds, y_true))
        return hits / max(len(y_true), 1)

    @property
    def classes_(self) -> list:
        """Classes known to the label encoder; empty before fitting."""
        return self._lenc.classes_ if hasattr(self, '_lenc') else []

    # ── internal ──────────────────────────────────────────────────────────────

    def _label_token(self, label) -> tuple:
        """Return the ``(_LABEL_NS, encoded_label)`` token for ``label``."""
        return (_LABEL_NS, self._lenc.encode(label))

    def _train_row(self, tok_row: list, label) -> None:
        """Train every ensemble member on one row using its own feature order."""
        lt = self._label_token(label)
        for p, order in zip(self._preds, self._orders):
            _train_one(p, _apply_order(tok_row, order), lt)

    def _infer_row(self, tok_row: list) -> dict:
        """Return the ensemble class distribution for one tokenised row.

        Per-class probabilities are summed over all predictors and
        renormalised.  If no predictor assigns any mass to a known class the
        result is uniform; with no known classes it is an empty dict.
        """
        classes = self._lenc.classes_
        if not classes:
            return {}
        # Label tokens do not depend on the predictor, so build them once.
        tokens = {c: self._label_token(c) for c in classes}
        totals = {c: 0.0 for c in classes}
        for p, order in zip(self._preds, self._orders):
            dist = _infer_dist(p, _apply_order(tok_row, order))
            for c, tok in tokens.items():
                totals[c] += dist.get(tok, 0.0)
        total = sum(totals.values())
        if total < 1e-12:
            u = 1.0 / len(classes)
            return {c: u for c in classes}
        return {c: v / total for c, v in totals.items()}
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
272
|
+
# TabularRegressor
|
|
273
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
274
|
+
|
|
275
|
+
class TabularRegressor(BaseEstimator, RegressorMixin):
    """
    Tabular regression via binned-target sequence encoding.

    The continuous target is discretised into bins at fit time.
    Prediction returns the probability-weighted mean of the bin centres.
    predict_interval() also returns the standard deviation of that bin
    distribution as an uncertainty estimate.

    Designed to be sklearn-compatible (Pipeline, GridSearchCV, cross_val_score).

    Parameters
    ----------
    n_bins : int
        Bins for continuous features; also used for the regression target
        when ``n_target_bins`` is None.
    n_target_bins : int | None
        Bins specifically for the target (defaults to n_bins).
    All other parameters: same as TabularPredictor.
    """

    def __init__(
        self,
        n_bins: int = 10,
        n_target_bins: int | None = None,
        context_length: int | None = None,
        n_orderings: int = 3,
        n_epochs: int = 1,
        learning_rate: float = 0.08,
        cred_max: float = 6.05,
        lambda_power: float = 0.65,
        random_seed: int = 42,
    ):
        self.n_bins = n_bins
        self.n_target_bins = n_target_bins
        self.context_length = context_length
        self.n_orderings = n_orderings
        self.n_epochs = n_epochs
        self.learning_rate = learning_rate
        self.cred_max = cred_max
        self.lambda_power = lambda_power
        self.random_seed = random_seed

    # ── public API ────────────────────────────────────────────────────────────

    def fit(self, X, y) -> 'TabularRegressor':
        """Fit from scratch on ``X`` / ``y`` and return ``self``.

        Features and target get separate discretisers; the target is
        discretised as a single-column table and its bin becomes the token
        each predictor learns to emit after the feature tokens.
        """
        n_tgt = self.n_target_bins if self.n_target_bins is not None else self.n_bins
        self._n_tgt_bins = n_tgt
        self._disc = FeatureDiscretizer(n_bins=self.n_bins)
        self._tgt_disc = FeatureDiscretizer(n_bins=n_tgt)
        self._rng = random.Random(self.random_seed)
        self._preds = []
        self._orders = []

        rows = self._disc.fit_transform(X)
        y_list = list(y)
        y_rows = self._tgt_disc.fit_transform([[v] for v in y_list])
        # Each target row holds one (column, bin) token; keep only the bin.
        y_bins = [r[0][1] for r in y_rows]

        n_feat = self._disc.n_features
        # Non-integer bins are mapped to 0 for the MI ranking only.
        y_enc = [b if isinstance(b, int) else 0 for b in y_bins]
        self._orders = _build_orders(rows, y_enc, n_feat, self.n_orderings, self._rng)

        k = n_feat if self.context_length is None else self.context_length
        self._preds = [_make_predictor(k, self.learning_rate, self.cred_max, self.lambda_power)
                       for _ in self._orders]

        for _ in range(self.n_epochs):
            # Rebuilt every epoch so each shuffle starts from the input order.
            pairs = list(zip(rows, y_bins))
            self._rng.shuffle(pairs)
            for tok_row, y_bin in pairs:
                self._train_row(tok_row, y_bin)

        self.is_fitted_ = True
        return self

    def partial_fit(self, X, y) -> 'TabularRegressor':
        """Train on an additional batch and return ``self``.

        The first call on an unfitted estimator is a plain ``fit``.  Later
        calls reuse the existing discretisers (they are not re-fitted) and
        train once on every sample, in the order given.
        """
        if not hasattr(self, '_disc'):
            return self.fit(X, y)
        rows = self._disc.transform(X)
        y_rows = self._tgt_disc.transform([[v] for v in y])
        y_bins = [r[0][1] for r in y_rows]
        for tok_row, y_bin in zip(rows, y_bins):
            self._train_row(tok_row, y_bin)
        return self

    def predict(self, X) -> list:
        """Return the predicted mean for every row of ``X``."""
        return [mu for mu, _ in self.predict_interval(X)]

    def predict_interval(self, X) -> list:
        """Return list of (mean, std) tuples."""
        rows = self._disc.transform(X)
        return [self._infer_row(r) for r in rows]

    def score(self, X, y) -> float:
        """Return the R² of the predictions on ``X`` / ``y``.

        Returns 0.0 when ``y`` is empty or has zero variance, since R² is
        undefined in both cases.
        """
        preds = self.predict(X)
        y_list = list(y)
        if not y_list:
            # Avoid dividing by len(y_list) == 0; mirrors the classifier's 0.0.
            return 0.0
        y_mean = sum(y_list) / len(y_list)
        ss_res = sum((p - t) ** 2 for p, t in zip(preds, y_list))
        ss_tot = sum((t - y_mean) ** 2 for t in y_list)
        if ss_tot > 0:
            return 1.0 - ss_res / ss_tot
        return 0.0

    # ── internal ──────────────────────────────────────────────────────────────

    def _target_token(self, y_bin) -> tuple:
        """Return the ``(_TARGET_NS, y_bin)`` token for a target bin."""
        return (_TARGET_NS, y_bin)

    def _train_row(self, tok_row: list, y_bin) -> None:
        """Train every ensemble member on one row using its own feature order."""
        tt = self._target_token(y_bin)
        for p, order in zip(self._preds, self._orders):
            _train_one(p, _apply_order(tok_row, order), tt)

    def _infer_row(self, tok_row: list) -> tuple:
        """Return ``(mean, std)`` of the predicted target for one tokenised row.

        Bin probabilities are summed over all predictors and renormalised
        (uniform if no bin received any mass).  Mean and standard deviation
        are then taken over the bin centres under that distribution.
        """
        n_bins = self._n_tgt_bins
        # Target tokens do not depend on the predictor, so build them once.
        tokens = [self._target_token(b) for b in range(n_bins)]
        bin_probs = [0.0] * n_bins
        for p, order in zip(self._preds, self._orders):
            dist = _infer_dist(p, _apply_order(tok_row, order))
            for b, tok in enumerate(tokens):
                bin_probs[b] += dist.get(tok, 0.0)
        total = sum(bin_probs)
        if total < 1e-12:
            probs = [1.0 / n_bins] * n_bins
        else:
            probs = [v / total for v in bin_probs]
        # Column 0 is the only column of the target discretiser.
        centers = [self._tgt_disc.bin_center(0, b) for b in range(n_bins)]
        mean = sum(p * c for p, c in zip(probs, centers))
        var = sum(p * (c - mean) ** 2 for p, c in zip(probs, centers))
        # Clamp tiny negative values caused by rounding before the sqrt.
        return mean, math.sqrt(max(var, 0.0))
|