uchi-python 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
uchi/tabular.py ADDED
@@ -0,0 +1,401 @@
1
+ """
2
+ tabular.py
3
+ ==========
4
+ Tabular classification and regression via the Universal Sequence Predictor.
5
+
6
+ How it works
7
+ ------------
8
+ Each row is encoded as an ordered sequence of (feature_index, bin) tokens.
9
+ The class label (or regression bin) is the next token after all feature tokens.
10
+ The predictor learns the conditional distribution P(label | feature_sequence).
11
+
12
+ Diversity comes from running multiple predictors with different feature orderings:
13
+ • MI-ascending — least informative feature first, most informative last
14
+ • MI-descending — most informative feature first
15
+ • natural — as supplied
16
+ • shuffled — random permutation (adds variety)
17
+
18
+ At inference, label probability distributions are averaged across all predictors.
19
+
20
+ Classes
21
+ -------
22
+ TabularPredictor — classification (sklearn-compatible)
23
+ TabularRegressor — regression (sklearn-compatible)
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import math
29
+ import random
30
+ from typing import Any
31
+
32
+ from .predictor import UniversalPredictor
33
+ from .discretize import FeatureDiscretizer, LabelEncoder, _to_rows
34
+
35
+ try:
36
+ from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
37
+ _SKLEARN = True
38
+ except ImportError:
39
+ class BaseEstimator: pass
40
+ class ClassifierMixin: pass
41
+ class RegressorMixin: pass
42
+ _SKLEARN = False
43
+
44
+ _LABEL_NS = '__label__'
45
+ _TARGET_NS = '__target__'
46
+
47
+
48
+ # ══════════════════════════════════════════════════════════════════════════════
49
+ # Shared internals
50
+ # ══════════════════════════════════════════════════════════════════════════════
51
+
52
def _make_predictor(
    k: int, lr: float, cred_max: float, lp: float,
    use_similarity_fallback: bool = False,
    use_positional_weights: bool = False,
) -> UniversalPredictor:
    """Create one ensemble member with this module's fixed tabular settings.

    Parameters
    ----------
    k : int
        First positional argument of ``UniversalPredictor`` (the context
        length / trie depth chosen by the caller).
    lr : float
        Forwarded as ``learning_rate``.
    cred_max : float
        Forwarded as ``cred_max``.
    lp : float
        Forwarded as ``lambda_power``.
    use_similarity_fallback, use_positional_weights : bool
        Forwarded unchanged under the same keyword names.

    Returns
    -------
    UniversalPredictor
        A freshly constructed predictor.
    """
    # Every tabular predictor shares these constants; only the values above
    # are caller-tunable.
    settings = {
        'learning_rate': lr,
        'vigilance': 0.3,
        'adaptive_cap': True,
        'binary_correction_scale': 0.05,
        'cred_max': cred_max,
        'lambda_power': lp,
        'cont_count_min_vocab': 4,
        'use_similarity_fallback': use_similarity_fallback,
        'use_positional_weights': use_positional_weights,
    }
    # NOTE(review): the second positional argument is passed as None; its
    # meaning is defined by UniversalPredictor — confirm against that class.
    return UniversalPredictor(k, None, **settings)
69
+
70
+
71
+ def _mi_order(token_rows: list, y_encoded: list, n_features: int) -> list:
72
+ """Feature indices sorted by MI with y (ascending). Falls back to natural order."""
73
+ try:
74
+ import numpy as np
75
+ from sklearn.feature_selection import mutual_info_classif
76
+
77
+ X_arr = np.zeros((len(token_rows), n_features), dtype=float)
78
+ for i, row in enumerate(token_rows):
79
+ for j, val in row:
80
+ X_arr[i, j] = -1 if val == '__MISSING__' else (
81
+ float(val) if isinstance(val, int) else hash(val) % 100)
82
+ y_arr = np.array(y_encoded)
83
+ mi = mutual_info_classif(X_arr, y_arr, discrete_features=True, random_state=0)
84
+ return list(np.argsort(mi))
85
+ except (ImportError, Exception):
86
+ return list(range(n_features))
87
+
88
+
89
+ def _apply_order(token_row: list, order: list) -> list:
90
+ idx_map = {tok[0]: tok for tok in token_row}
91
+ return [idx_map[i] for i in order if i in idx_map]
92
+
93
+
94
+ def _set_history(p: UniversalPredictor, tokens: list) -> None:
95
+ p.history = list(tokens)
96
+
97
+
98
+ def _train_one(p: UniversalPredictor, ordered_tokens: list, next_token: Any) -> None:
99
+ _set_history(p, ordered_tokens)
100
+ p.predict()
101
+ p.observe(next_token)
102
+ p.feedback(next_token)
103
+ p.history.clear()
104
+
105
+
106
+ def _infer_dist(p: UniversalPredictor, ordered_tokens: list) -> dict:
107
+ saved = p.history[:]
108
+ _set_history(p, ordered_tokens)
109
+ p.predict()
110
+ dist = dict(p._last_distribution)
111
+ p.history = saved
112
+ return dist
113
+
114
+
115
def _build_orders(rows, y_enc, n_feat, n_orderings, rng) -> list:
    """Build the feature orderings used by the ensemble.

    Candidates are produced in this priority: MI-ascending, MI-descending,
    natural order, then as many shuffled permutations as needed to reach
    ``n_orderings``.

    Parameters
    ----------
    rows : list
        Discretised token rows, forwarded to ``_mi_order``.
    y_enc : list
        Encoded targets, forwarded to ``_mi_order``.
    n_feat : int
        Number of features.
    n_orderings : int
        Number of orderings to return; values below 1 yield an empty list.
    rng : random.Random | None
        Generator used for the shuffled permutations so they follow the
        estimator's seed. ``None`` falls back to a fixed per-slot seed.

    Returns
    -------
    list
        At most ``n_orderings`` lists of feature indices.
    """
    natural = list(range(n_feat))
    mi_asc = _mi_order(rows, y_enc, n_feat)
    mi_desc = list(reversed(mi_asc))
    candidates = [mi_asc, mi_desc, natural]

    for slot in range(n_orderings - len(candidates)):
        perm = natural[:]
        # Previously `rng` was ignored and Random(slot) was always used, so
        # the extra orderings never depended on the caller's random_seed.
        shuffler = rng if rng is not None else random.Random(slot)
        shuffler.shuffle(perm)
        candidates.append(perm)

    # Clamp so a negative n_orderings cannot slice from the end of the list.
    return candidates[:max(n_orderings, 0)]
125
+
126
+
127
+ # ══════════════════════════════════════════════════════════════════════════════
128
+ # TabularPredictor
129
+ # ══════════════════════════════════════════════════════════════════════════════
130
+
131
class TabularPredictor(BaseEstimator, ClassifierMixin):
    """
    Tabular classification via feature-as-sequence encoding.

    Each row is discretised into (feature_index, bin) tokens, re-ordered once
    per ensemble member, and followed by a ``('__label__', code)`` token that
    the member learns to predict. Class probabilities are the normalised sum
    of the members' label-token probabilities.

    sklearn-compatible: works in Pipeline, GridSearchCV, cross_val_score.
    Supports partial_fit for online / incremental learning.

    Parameters
    ----------
    n_bins : int
        Quantile bins for continuous features (default 10).
    context_length : int | None
        Trie depth k. None = number of features (recommended).
    n_orderings : int
        Number of feature orderings to ensemble (default 3).
    n_epochs : int
        Training passes over the data (default 1).
    learning_rate, cred_max, lambda_power : float
        Forwarded to every underlying predictor.
    random_seed : int
        Seed of the ``random.Random`` used to shuffle training order and
        handed to the ordering builder.
    """

    def __init__(
        self,
        n_bins: int = 10,
        context_length: int | None = None,
        n_orderings: int = 3,
        n_epochs: int = 1,
        learning_rate: float = 0.08,
        cred_max: float = 6.05,
        lambda_power: float = 0.65,
        random_seed: int = 42,
    ):
        self.n_bins = n_bins
        self.context_length = context_length
        self.n_orderings = n_orderings
        self.n_epochs = n_epochs
        self.learning_rate = learning_rate
        self.cred_max = cred_max
        self.lambda_power = lambda_power
        self.random_seed = random_seed
        # Rows waiting to be trained by partial_fit, and the buffer size that
        # triggers a training pass.
        self._replay_buffer = []
        self._replay_batch_size = 100

    # ── public API ────────────────────────────────────────────────────────────

    def fit(self, X, y) -> 'TabularPredictor':
        """Fit the discretizer, label encoder and ensemble from scratch.

        Any state from a previous fit, including rows still waiting in the
        partial_fit replay buffer, is discarded. Returns ``self``.
        """
        self._disc = FeatureDiscretizer(n_bins=self.n_bins)
        self._lenc = LabelEncoder()
        self._rng = random.Random(self.random_seed)
        self._preds = []
        self._orders = []
        # Buffered rows were encoded by the previous discretizer; replaying
        # them into the new model would train it on stale tokens.
        self._replay_buffer = []

        rows = self._disc.fit_transform(X)
        labels = list(y)
        self._lenc.fit(labels)
        y_enc = [self._lenc.encode(label) for label in labels]

        n_feat = self._disc.n_features
        k = n_feat if self.context_length is None else self.context_length
        self._orders = _build_orders(rows, y_enc, n_feat, self.n_orderings, self._rng)
        self._preds = [
            _make_predictor(k, self.learning_rate, self.cred_max, self.lambda_power)
            for _ in self._orders
        ]

        for _ in range(self.n_epochs):
            pairs = list(zip(rows, labels))
            self._rng.shuffle(pairs)
            for tok_row, label in pairs:
                self._train_row(tok_row, label)

        self.is_fitted_ = True
        return self

    def partial_fit(self, X, y, classes=None) -> 'TabularPredictor':
        """Incrementally update the model with a new batch.

        The first call on an unfitted estimator delegates to ``fit``. Later
        calls update the discretizer and label encoder, then append the rows
        to a replay buffer. The ensemble is only trained once the buffer
        holds at least ``_replay_batch_size`` rows; after that pass the most
        recent 20% of a batch is kept to overlap with future data.

        ``classes`` is accepted for scikit-learn API compatibility and is not
        used by this implementation. Returns ``self``.
        """
        if not hasattr(self, '_disc'):
            return self.fit(X, y)
        self._disc.partial_fit(X)
        rows = self._disc.transform(X)
        labels = list(y)
        self._lenc.partial_fit(labels)

        # Experience replay: accumulate rows, train in shuffled batches.
        self._replay_buffer.extend(zip(rows, labels))

        if len(self._replay_buffer) >= self._replay_batch_size:
            # Train on the buffer multiple times to stabilize.
            for _ in range(self.n_epochs):
                batch = self._replay_buffer[:]
                self._rng.shuffle(batch)
                for tok_row, label in batch:
                    self._train_row(tok_row, label)

            # Keep the last 20% to mix with incoming data (sliding window
            # overlap). `[-0:]` would keep everything, so handle 0 explicitly.
            keep = int(self._replay_batch_size * 0.2)
            self._replay_buffer = self._replay_buffer[-keep:] if keep > 0 else []

        return self

    def predict(self, X) -> list:
        """Return the most probable class label for each row of ``X``."""
        return [max(dist, key=dist.get) for dist in self.predict_proba(X)]

    def predict_proba(self, X) -> list:
        """Return one ``{class_label: probability}`` dict per row of ``X``."""
        rows = self._disc.transform(X)
        return [self._infer_row(row) for row in rows]

    def score(self, X, y) -> float:
        """Return accuracy of ``predict(X)`` against ``y`` (0.0 for empty y)."""
        # Materialise once: y may be a one-shot iterator, which the earlier
        # zip-then-len approach exhausted before measuring its length.
        truth = list(y)
        preds = self.predict(X)
        hits = sum(p == t for p, t in zip(preds, truth))
        return hits / max(len(truth), 1)

    @property
    def classes_(self) -> list:
        """Class labels known to the label encoder; empty before fitting."""
        return self._lenc.classes_ if hasattr(self, '_lenc') else []

    # ── internal ──────────────────────────────────────────────────────────────

    def _label_token(self, label) -> tuple:
        """Wrap an encoded label in the label namespace token."""
        return (_LABEL_NS, self._lenc.encode(label))

    def _train_row(self, tok_row: list, label) -> None:
        """Train every ensemble member on one row under its own ordering."""
        lt = self._label_token(label)
        for p, order in zip(self._preds, self._orders):
            _train_one(p, _apply_order(tok_row, order), lt)

    def _infer_row(self, tok_row: list) -> dict:
        """Average the members' label distributions for one row.

        Returns an empty dict when no classes are known, and a uniform
        distribution when no member assigns mass to any known label.
        """
        classes = self._lenc.classes_
        if not classes:
            return {}
        # Label tokens do not depend on the ensemble member, so encode once.
        tokens = [(c, self._label_token(c)) for c in classes]
        totals = {c: 0.0 for c in classes}
        for p, order in zip(self._preds, self._orders):
            dist = _infer_dist(p, _apply_order(tok_row, order))
            for c, tok in tokens:
                totals[c] += dist.get(tok, 0.0)
        total = sum(totals.values())
        if total < 1e-12:
            u = 1.0 / len(classes)
            return {c: u for c in classes}
        return {c: v / total for c, v in totals.items()}
269
+
270
+
271
+ # ══════════════════════════════════════════════════════════════════════════════
272
+ # TabularRegressor
273
+ # ══════════════════════════════════════════════════════════════════════════════
274
+
275
class TabularRegressor(BaseEstimator, RegressorMixin):
    """
    Tabular regression via binned-target sequence encoding.

    The continuous target is discretised into bins at fit time and each
    ensemble member learns to predict the ``('__target__', bin)`` token that
    follows a row's feature tokens. Prediction returns the
    probability-weighted mean of bin centres. predict_interval() also returns
    the standard deviation of that bin distribution as an uncertainty
    estimate.

    sklearn-compatible: works in Pipeline, GridSearchCV, cross_val_score.

    Parameters
    ----------
    n_bins : int
        Bins for continuous features; also the number of target bins when
        ``n_target_bins`` is None.
    n_target_bins : int | None
        Bins specifically for the target (defaults to n_bins).
    All other parameters: same as TabularPredictor.
    """

    def __init__(
        self,
        n_bins: int = 10,
        n_target_bins: int | None = None,
        context_length: int | None = None,
        n_orderings: int = 3,
        n_epochs: int = 1,
        learning_rate: float = 0.08,
        cred_max: float = 6.05,
        lambda_power: float = 0.65,
        random_seed: int = 42,
    ):
        self.n_bins = n_bins
        self.n_target_bins = n_target_bins
        self.context_length = context_length
        self.n_orderings = n_orderings
        self.n_epochs = n_epochs
        self.learning_rate = learning_rate
        self.cred_max = cred_max
        self.lambda_power = lambda_power
        self.random_seed = random_seed

    # ── public API ────────────────────────────────────────────────────────────

    def fit(self, X, y) -> 'TabularRegressor':
        """Fit feature/target discretizers and the ensemble from scratch.

        Returns ``self``.
        """
        n_tgt = self.n_target_bins if self.n_target_bins is not None else self.n_bins
        self._n_tgt_bins = n_tgt
        self._disc = FeatureDiscretizer(n_bins=self.n_bins)
        self._tgt_disc = FeatureDiscretizer(n_bins=n_tgt)
        self._rng = random.Random(self.random_seed)
        self._preds = []
        self._orders = []

        rows = self._disc.fit_transform(X)
        y_list = list(y)
        # The target is discretised as a single-column table; column 0's bin
        # is the second element of each row's only token.
        y_rows = self._tgt_disc.fit_transform([[v] for v in y_list])
        y_bins = [r[0][1] for r in y_rows]

        n_feat = self._disc.n_features
        # MI ordering needs integer classes; non-integer bins collapse to 0.
        y_enc = [b if isinstance(b, int) else 0 for b in y_bins]
        self._orders = _build_orders(rows, y_enc, n_feat, self.n_orderings, self._rng)

        k = n_feat if self.context_length is None else self.context_length
        self._preds = [
            _make_predictor(k, self.learning_rate, self.cred_max, self.lambda_power)
            for _ in self._orders
        ]

        for _ in range(self.n_epochs):
            pairs = list(zip(rows, y_bins))
            self._rng.shuffle(pairs)
            for tok_row, y_bin in pairs:
                self._train_row(tok_row, y_bin)

        self.is_fitted_ = True
        return self

    def partial_fit(self, X, y) -> 'TabularRegressor':
        """Train on a new batch using the already-fitted discretizers.

        The first call on an unfitted estimator delegates to ``fit``. Later
        calls transform ``X`` and ``y`` with the existing bins and train on
        each row once, in the given order. Returns ``self``.
        """
        if not hasattr(self, '_disc'):
            return self.fit(X, y)
        rows = self._disc.transform(X)
        y_rows = self._tgt_disc.transform([[v] for v in y])
        y_bins = [r[0][1] for r in y_rows]
        for tok_row, y_bin in zip(rows, y_bins):
            self._train_row(tok_row, y_bin)
        return self

    def predict(self, X) -> list:
        """Return the predicted mean for each row of ``X``."""
        return [mu for mu, _ in self.predict_interval(X)]

    def predict_interval(self, X) -> list:
        """Return list of (mean, std) tuples."""
        rows = self._disc.transform(X)
        return [self._infer_row(r) for r in rows]

    def score(self, X, y) -> float:
        """Return R² of ``predict(X)`` against ``y``.

        Returns 0.0 when ``y`` is empty or has zero variance.
        """
        y_list = list(y)
        if not y_list:
            # Previously raised ZeroDivisionError when computing the mean.
            return 0.0
        preds = self.predict(X)
        y_mean = sum(y_list) / len(y_list)
        ss_res = sum((p - t) ** 2 for p, t in zip(preds, y_list))
        ss_tot = sum((t - y_mean) ** 2 for t in y_list)
        return 1.0 - ss_res / ss_tot if ss_tot > 0 else 0.0

    # ── internal ──────────────────────────────────────────────────────────────

    def _target_token(self, y_bin) -> tuple:
        """Wrap a target bin in the target namespace token."""
        return (_TARGET_NS, y_bin)

    def _train_row(self, tok_row: list, y_bin) -> None:
        """Train every ensemble member on one row under its own ordering."""
        tt = self._target_token(y_bin)
        for p, order in zip(self._preds, self._orders):
            _train_one(p, _apply_order(tok_row, order), tt)

    def _infer_row(self, tok_row: list) -> tuple:
        """Return ``(mean, std)`` of the ensemble's target-bin distribution.

        Bin probabilities are summed over the members and normalised; when no
        member assigns mass to any target bin the distribution is uniform.
        """
        n_bins = self._n_tgt_bins
        tokens = [self._target_token(b) for b in range(n_bins)]
        bin_probs = [0.0] * n_bins
        for p, order in zip(self._preds, self._orders):
            dist = _infer_dist(p, _apply_order(tok_row, order))
            for b, tok in enumerate(tokens):
                bin_probs[b] += dist.get(tok, 0.0)
        total = sum(bin_probs)
        if total < 1e-12:
            probs = [1.0 / n_bins] * n_bins
        else:
            probs = [v / total for v in bin_probs]
        centers = [self._tgt_disc.bin_center(0, b) for b in range(n_bins)]
        mean = sum(p * c for p, c in zip(probs, centers))
        var = sum(p * (c - mean) ** 2 for p, c in zip(probs, centers))
        # max() guards against a tiny negative variance from rounding.
        return mean, math.sqrt(max(var, 0.0))