uchi-python 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
uchi/__init__.py ADDED
@@ -0,0 +1,57 @@
1
+ """
2
+ uchi
3
+ ===============
4
+ Online credibility-weighted sequence predictor for tabular, time series,
5
+ and generative machine learning tasks.
6
+
7
+ Quick start
8
+ -----------
9
+ from uchi import TabularPredictor, TabularRegressor
10
+ from uchi import MultivariateTSPredictor, TimeSeriesClassifier
11
+ from uchi import AnomalyDetector
12
+ from uchi import UniversalPredictor, PredictorForest
13
+
14
+ All classes are sklearn-compatible (Pipeline, GridSearchCV, cross_val_score).
15
+ TabularPredictor / TabularRegressor / TimeSeriesClassifier all support
16
+ partial_fit() for online / incremental learning.
17
+ """
18
+
19
+ from .predictor import UniversalPredictor
20
+ from .forest import PredictorForest
21
+ from .discretize import FeatureDiscretizer, LabelEncoder
22
+ from .tabular import TabularPredictor, TabularRegressor
23
+ from .timeseries import MultivariateTSPredictor, TimeSeriesClassifier, AnomalyDetector
24
+ from .generative import SequenceGenerator, TabularGenerator, TimeSeriesGenerator
25
+
26
+ # Generative services fixes
27
+ from .long_term_store import LongTermStore
28
+ from .dual_predictor import DualPredictor
29
+ from .online_tokenizer import OnlineTokenizer
30
+ from .node_compressor import NodeCompressor
31
+
32
+ __version__ = "0.1.0"
33
+
34
+ __all__ = [
35
+ # Core engine
36
+ "UniversalPredictor",
37
+ "PredictorForest",
38
+ # Feature engineering
39
+ "FeatureDiscretizer",
40
+ "LabelEncoder",
41
+ # Tabular ML
42
+ "TabularPredictor",
43
+ "TabularRegressor",
44
+ # Time series
45
+ "MultivariateTSPredictor",
46
+ "TimeSeriesClassifier",
47
+ "AnomalyDetector",
48
+ # Generative
49
+ "SequenceGenerator",
50
+ "TabularGenerator",
51
+ "TimeSeriesGenerator",
52
+ # Generative services fixes
53
+ "LongTermStore",
54
+ "DualPredictor",
55
+ "OnlineTokenizer",
56
+ "NodeCompressor",
57
+ ]
uchi/discretize.py ADDED
@@ -0,0 +1,307 @@
1
+ """
2
+ discretize.py
3
+ =============
4
+ Feature discretization for the Universal Sequence Predictor.
5
+
6
+ Converts continuous and categorical features into discrete symbol tokens
7
+ compatible with UniversalPredictor / PredictorForest.
8
+
9
+ FeatureDiscretizer — fits bins per-column, transforms rows to token lists
10
+ LabelEncoder — encodes classification targets to/from integers
11
+ """
12
+
13
+ import math
14
+ from typing import Any
15
+
16
+
17
+ # ── sentinel for missing / unseen values ─────────────────────────────────────
18
+
19
+ MISSING = object() # unique identity; cannot equal any real feature value
20
+ MISSING_STR = '__MISSING__'
21
+
22
+
23
+ # ══════════════════════════════════════════════════════════════════════════════
24
+ # FeatureDiscretizer
25
+ # ══════════════════════════════════════════════════════════════════════════════
26
+
27
+ class FeatureDiscretizer:
28
+ """
29
+ Transforms a feature matrix into sequences of (feature_index, bin) tokens.
30
+
31
+ Column types detected automatically:
32
+ • numeric → equal-frequency (quantile) bins, labelled 0..n_bins-1
33
+ • other → ordinal integer encoding of unique values seen at fit time
34
+
35
+ Missing values (None, NaN, empty string) map to MISSING_STR so they form
36
+ their own trie branch rather than crashing or biasing bin counts.
37
+
38
+ Parameters
39
+ ----------
40
+ n_bins : int
41
+ Number of quantile bins for numeric columns.
42
+ feature_names : list[str] | None
43
+ Optional column names, used only for repr/debugging.
44
+
45
+ Usage
46
+ -----
47
+ disc = FeatureDiscretizer(n_bins=10)
48
+ token_rows = disc.fit_transform(X_train) # list of [(col, bin), ...]
49
+ test_rows = disc.transform(X_test)
50
+ """
51
+
52
+ def __init__(self, n_bins: int = 10, feature_names: list | None = None):
53
+ self.n_bins = n_bins
54
+ self.feature_names = feature_names
55
+ self._n_features: int = 0
56
+ self._types: list = [] # 'numeric' | 'categorical' per column
57
+ self._edges: dict = {} # col → sorted list of quantile cut-points
58
+ self._cat_maps: dict = {} # col → {value: int}
59
+ self._bin_centers: dict = {} # col → list of float centers (numeric only)
60
+
61
+ # Reservoir sampling state for dynamic online splitting
62
+ self._reservoirs: dict = {} # col -> list of sampled float values
63
+ self._reservoir_max: int = 2000
64
+
65
+ # ── public API ────────────────────────────────────────────────────────────
66
+
67
+ def fit(self, X) -> 'FeatureDiscretizer':
68
+ X = _to_rows(X)
69
+ if not X:
70
+ return self
71
+ self._n_features = len(X[0])
72
+ self._types = []
73
+ self._edges = {}
74
+ self._cat_maps = {}
75
+ self._bin_centers = {}
76
+
77
+ for j in range(self._n_features):
78
+ col = [row[j] for row in X]
79
+ if _is_numeric_col(col):
80
+ self._types.append('numeric')
81
+ self._reservoirs[j] = [float(v) for v in col if not _is_missing(v) and not (isinstance(v, float) and math.isnan(v))]
82
+ edges, centers = _quantile_edges(col, self.n_bins)
83
+ self._edges[j] = edges
84
+ self._bin_centers[j] = centers
85
+ else:
86
+ self._types.append('categorical')
87
+ unique = sorted(
88
+ {_safe_str(v) for v in col if not _is_missing(v)})
89
+ self._cat_maps[j] = {v: i for i, v in enumerate(unique)}
90
+ self._total_seen = len(X)
91
+ return self
92
+
93
+ def partial_fit(self, X) -> 'FeatureDiscretizer':
94
+ """
95
+ Online dynamic splitting: Update the reservoir sample of numeric columns
96
+ and re-calculate quantile bins if enough new data has arrived.
97
+ """
98
+ if self._n_features == 0:
99
+ return self.fit(X)
100
+
101
+ rows = _to_rows(X)
102
+ if not rows:
103
+ return self
104
+
105
+ import random
106
+ for row in rows:
107
+ self._total_seen += 1
108
+ for j in range(self._n_features):
109
+ v = row[j]
110
+ if self._types[j] == 'numeric':
111
+ if _is_missing(v):
112
+ continue
113
+ v = float(v)
114
+ # Reservoir sampling
115
+ if len(self._reservoirs[j]) < self._reservoir_max:
116
+ self._reservoirs[j].append(v)
117
+ else:
118
+ idx = random.randint(0, self._total_seen - 1)
119
+ if idx < self._reservoir_max:
120
+ self._reservoirs[j][idx] = v
121
+ else:
122
+ # Categorical: dynamically add unseen categories
123
+ if not _is_missing(v):
124
+ s = _safe_str(v)
125
+ if s not in self._cat_maps[j]:
126
+ self._cat_maps[j][s] = len(self._cat_maps[j])
127
+
128
+ # Re-compute bins periodically (e.g., every time we process a batch)
129
+ for j in range(self._n_features):
130
+ if self._types[j] == 'numeric' and self._reservoirs[j]:
131
+ edges, centers = _quantile_edges(self._reservoirs[j], self.n_bins)
132
+ self._edges[j] = edges
133
+ self._bin_centers[j] = centers
134
+
135
+ return self
136
+
137
+ def transform(self, X) -> list:
138
+ """Return list-of-lists of (col_idx, bin_or_code) tokens."""
139
+ rows = _to_rows(X)
140
+ return [self._encode_row(row) for row in rows]
141
+
142
+ def fit_transform(self, X) -> list:
143
+ return self.fit(X).transform(X)
144
+
145
+ def bin_center(self, col: int, bin_idx: int) -> float:
146
+ """Inverse of numeric binning: approximate value at bin centre."""
147
+ centers = self._bin_centers.get(col, [])
148
+ if not centers:
149
+ return 0.0
150
+ return centers[min(bin_idx, len(centers) - 1)]
151
+
152
+ @property
153
+ def n_features(self) -> int:
154
+ return self._n_features
155
+
156
+ # ── internal ──────────────────────────────────────────────────────────────
157
+
158
+ def _encode_row(self, row: list) -> list:
159
+ tokens = []
160
+ for j, v in enumerate(row):
161
+ tokens.append((j, self._encode_val(j, v)))
162
+ return tokens
163
+
164
+ def _encode_val(self, j: int, v) -> Any:
165
+ if _is_missing(v):
166
+ return MISSING_STR
167
+ if self._types[j] == 'numeric':
168
+ return _bin_search(float(v), self._edges[j])
169
+ else:
170
+ return self._cat_maps[j].get(_safe_str(v), MISSING_STR)
171
+
172
+ def __repr__(self) -> str:
173
+ return (f'FeatureDiscretizer(n_bins={self.n_bins}, '
174
+ f'n_features={self._n_features})')
175
+
176
+
177
+ # ══════════════════════════════════════════════════════════════════════════════
178
+ # LabelEncoder
179
+ # ══════════════════════════════════════════════════════════════════════════════
180
+
181
class LabelEncoder:
    """
    Two-way mapping between raw class labels and integer codes.

    Codes are handed out in first-seen order starting at 0. Nothing is ever
    forgotten: ``fit``, ``partial_fit`` and ``encode`` all extend the mapping
    when they meet a label that has no code yet.
    """

    def __init__(self):
        self._enc: dict = {}       # label → int
        self._dec: dict = {}       # int → label
        self.classes_: list = []   # labels in code order

    def fit(self, y) -> 'LabelEncoder':
        """Register every label in ``y`` (known labels are skipped)."""
        for item in y:
            self._add(item)
        return self

    def partial_fit(self, y) -> 'LabelEncoder':
        """Same as ``fit``: the mapping only ever grows."""
        return self.fit(y)

    def encode(self, label) -> int:
        """Return the code for ``label``, assigning a new one if needed."""
        self._add(label)  # no-op when the label is already known
        return self._enc[label]

    def decode(self, code: int):
        """Return the label for ``code``; raises KeyError for unknown codes."""
        return self._dec[code]

    def __len__(self) -> int:
        return len(self._enc)

    def _add(self, label):
        """Give ``label`` the next free code unless it already has one."""
        if label in self._enc:
            return
        next_code = len(self._enc)
        self._enc[label] = next_code
        self._dec[next_code] = label
        self.classes_.append(label)
217
+
218
+
219
+ # ══════════════════════════════════════════════════════════════════════════════
220
+ # Helpers
221
+ # ══════════════════════════════════════════════════════════════════════════════
222
+
223
+ def _to_rows(X) -> list:
224
+ """Accept numpy array, pandas DataFrame, or list-of-lists."""
225
+ try:
226
+ import numpy as np
227
+ if isinstance(X, np.ndarray):
228
+ return X.tolist()
229
+ except ImportError:
230
+ pass
231
+ try:
232
+ import pandas as pd
233
+ if isinstance(X, (pd.DataFrame, pd.Series)):
234
+ return X.values.tolist()
235
+ except ImportError:
236
+ pass
237
+ # Already a list; wrap scalars (1-D input → single-column matrix)
238
+ out = list(X)
239
+ if out and not isinstance(out[0], (list, tuple)):
240
+ out = [[v] for v in out]
241
+ return out
242
+
243
+
244
+ def _is_missing(v) -> bool:
245
+ if v is None or v is MISSING:
246
+ return True
247
+ if isinstance(v, float) and math.isnan(v):
248
+ return True
249
+ if isinstance(v, str) and v.strip() == '':
250
+ return True
251
+ return False
252
+
253
+
254
+ def _is_numeric_col(col: list) -> bool:
255
+ """True if any non-missing value is a float/int (non-bool)."""
256
+ for v in col:
257
+ if _is_missing(v):
258
+ continue
259
+ if isinstance(v, bool):
260
+ return False
261
+ if isinstance(v, (int, float)):
262
+ return True
263
+ return False
264
+ return False
265
+
266
+
267
+ def _safe_str(v) -> str:
268
+ return str(v)
269
+
270
+
271
+ def _quantile_edges(col: list, n_bins: int) -> tuple:
272
+ """
273
+ Compute (n_bins-1) quantile cut-points and n_bins bin centres.
274
+ Values beyond edges map to the first or last bin (clamped).
275
+ """
276
+ valid = sorted(float(v) for v in col if not _is_missing(v)
277
+ and not (isinstance(v, float) and math.isnan(v)))
278
+ if not valid:
279
+ return [], [0.0]
280
+
281
+ n = len(valid)
282
+ edges = []
283
+ for i in range(1, n_bins):
284
+ idx = int(i * n / n_bins)
285
+ edges.append(valid[min(idx, n - 1)])
286
+ # Deduplicate while keeping order
287
+ edges = sorted(set(edges))
288
+
289
+ # Bin centres: midpoint between consecutive edges
290
+ boundaries = [valid[0] - 1e-9] + edges + [valid[-1] + 1e-9]
291
+ centers = [
292
+ (boundaries[i] + boundaries[i + 1]) / 2.0
293
+ for i in range(len(boundaries) - 1)
294
+ ]
295
+ return edges, centers
296
+
297
+
298
+ def _bin_search(v: float, edges: list) -> int:
299
+ """Binary-search bin index for value v given sorted cut-points."""
300
+ lo, hi = 0, len(edges)
301
+ while lo < hi:
302
+ mid = (lo + hi) // 2
303
+ if v <= edges[mid]:
304
+ hi = mid
305
+ else:
306
+ lo = mid + 1
307
+ return lo
uchi/distributional.py ADDED
@@ -0,0 +1,105 @@
1
+ import math
2
+ from typing import Any
3
+
4
def js_divergence(p: dict, q: dict) -> float:
    """
    Jensen-Shannon divergence (base-2 logarithm) between two probability dicts.

    ``p`` and ``q`` map outcomes to probabilities; a key absent from one
    side counts as probability 0.0. Returns 0.0 when both dicts are empty.
    """
    support = set(p.keys()) | set(q.keys())
    if not support:
        return 0.0

    # Mixture distribution M = (P + Q) / 2 over the joint support.
    mixture = {}
    for key in support:
        mixture[key] = 0.5 * (p.get(key, 0.0) + q.get(key, 0.0))

    def _kl_to_mixture(dist):
        """KL(dist || mixture) in bits; inf if the mixture lacks needed mass."""
        total = 0.0
        for key in support:
            mass = dist.get(key, 0.0)
            if not mass > 0:
                continue
            ref = mixture.get(key, 0.0)
            if ref == 0:
                return float('inf')
            total += mass * math.log2(mass / ref)
        return total

    return 0.5 * _kl_to_mixture(p) + 0.5 * _kl_to_mixture(q)
26
+
27
+
28
class DistributionalTokenizer:
    """
    Online token clustering via Jensen-Shannon divergence.

    For every token the class counts which tokens follow it. When two tokens
    have sufficiently similar successor distributions (JSD below the merge
    threshold) they are assigned the same cluster id, and ``tokenize``
    returns that id in place of the raw token.

    Parameters
    ----------
    merge_threshold_jsd : float
        Two tokens are merged only when their JSD is strictly below this.
    min_obs : int
        Minimum number of observed successors a token needs before it can
        trigger clustering or be considered as a merge candidate.
    cluster_interval : int
        A clustering attempt for a token is made each time its successor
        count reaches a multiple of this value (default 100). Must be >= 1.
    """

    def __init__(self, merge_threshold_jsd: float = 0.1, min_obs: int = 50,
                 cluster_interval: int = 100):
        if cluster_interval < 1:
            # A zero interval would otherwise fail later as a modulo by zero.
            raise ValueError('cluster_interval must be >= 1')

        self.merge_threshold = merge_threshold_jsd
        self.min_obs = min_obs
        self.cluster_interval = cluster_interval

        self.successor_counts = {}  # token -> dict of next_token -> count
        self.token_totals = {}      # token -> int (successors observed)

        self.clusters = {}          # token -> cluster_id
        self.next_cluster_id = 0

        # Previously observed token. None means "no predecessor", so
        # observing None breaks the chain for the token that follows it.
        self._last_token = None

    def _normalize(self, token: Any) -> dict:
        """Return the successor distribution of ``token`` ({} if unseen)."""
        counts = self.successor_counts.get(token, {})
        total = self.token_totals.get(token, 0)
        if total == 0:
            return {}
        return {k: v / total for k, v in counts.items()}

    def observe(self, token: Any) -> None:
        """
        Record ``token`` as the successor of the previously observed token
        and, at the configured interval, try to cluster that predecessor.
        """
        prev = self._last_token
        if prev is not None:
            if prev not in self.successor_counts:
                self.successor_counts[prev] = {}
                self.token_totals[prev] = 0

            followers = self.successor_counts[prev]
            followers[token] = followers.get(token, 0) + 1
            self.token_totals[prev] += 1

            # Periodically attempt clustering
            seen = self.token_totals[prev]
            if seen % self.cluster_interval == 0 and seen >= self.min_obs:
                self._attempt_cluster(prev)

        self._last_token = token

    def _attempt_cluster(self, target_token: Any):
        """
        Merge ``target_token`` with its closest candidate when their JSD is
        below the merge threshold. Tokens already in a cluster are skipped.
        """
        if target_token in self.clusters:
            return

        target_dist = self._normalize(target_token)

        best_candidate = None
        lowest_jsd = float('inf')

        for candidate, total in self.token_totals.items():
            if candidate == target_token or total < self.min_obs:
                continue

            jsd = js_divergence(target_dist, self._normalize(candidate))
            if jsd < lowest_jsd:
                lowest_jsd = jsd
                best_candidate = candidate

        if best_candidate is None or not lowest_jsd < self.merge_threshold:
            return

        # Merge: join the candidate's cluster, or open a new one for the pair.
        if best_candidate in self.clusters:
            c_id = self.clusters[best_candidate]
        else:
            c_id = f"__CLUSTER_{self.next_cluster_id}__"
            self.clusters[best_candidate] = c_id
            self.next_cluster_id += 1

        self.clusters[target_token] = c_id

    def tokenize(self, token: Any) -> Any:
        """
        Observe ``token`` (updating the statistics) and return its cluster
        id when it has one, otherwise the token itself.
        """
        self.observe(token)
        return self.clusters.get(token, token)
uchi/dual_predictor.py ADDED
@@ -0,0 +1,172 @@
1
+ """
2
+ DualPredictor
3
+ =============
4
+ Solves Problem 7 — Stationary/drift tradeoff.
5
+
6
+ Two UniversalPredictors run in parallel:
7
+ stable — high cred_max, high lambda_power: best on stationary data
8
+ drift — low cred_max, low lambda_power: fastest drift recovery
9
+
10
+ Their output distributions are blended by a weight that tracks the recent
11
+ error rate on a rolling window. When the error rate is high (something
12
+ changed), weight shifts toward the drift predictor. When low and stable,
13
+ weight shifts toward the stability predictor.
14
+
15
+ The routing is fully automatic and requires no configuration beyond the
16
+ window size.
17
+
18
+ API
19
+ ---
20
+ dp = DualPredictor(context_length=4)
21
+ dp.observe(token)
22
+ pred, conf = dp.predict()
23
+ dp.feedback(actual)
24
+ dp.error_rate # current rolling error rate
25
+ dp.active_predictor # 'stable' | 'drift' | 'blend'
26
+ """
27
+
28
+ from collections import deque
29
+ from .predictor import UniversalPredictor
30
+
31
+
32
class DualPredictor:
    """
    Pair of UniversalPredictors whose output distributions are blended
    according to a rolling error rate.

    Parameters
    ----------
    context_length : int
        Passed as the first positional argument to both predictors.
    window : int
        Rolling window size for error rate tracking (default 50).
    drift_threshold : float
        Error rate at or above which only the drift predictor is used
        (default 0.4).
    stable_threshold : float
        Error rate at or below which only the stability predictor is used
        (default 0.15).
    stable_cred_max, stable_lambda_power : float
        Hyperparameters for the stability-tuned predictor.
    drift_cred_max, drift_lambda_power : float
        Hyperparameters for the drift-tuned predictor.
    learning_rate : float
        Shared learning rate for both predictors.
    **kwargs
        Forwarded unchanged to both UniversalPredictor constructors.
    """

    def __init__(
        self,
        context_length: int,
        window: int = 50,
        drift_threshold: float = 0.4,
        stable_threshold: float = 0.15,
        stable_cred_max: float = 8.0,
        stable_lambda_power: float = 0.8,
        drift_cred_max: float = 3.0,
        drift_lambda_power: float = 0.4,
        learning_rate: float = 0.08,
        **kwargs,
    ):
        self._stable = UniversalPredictor(
            context_length,
            learning_rate=learning_rate,
            cred_max=stable_cred_max,
            lambda_power=stable_lambda_power,
            **kwargs,
        )
        self._drift = UniversalPredictor(
            context_length,
            learning_rate=learning_rate,
            cred_max=drift_cred_max,
            lambda_power=drift_lambda_power,
            **kwargs,
        )
        # 1 = miss, 0 = hit for the most recent `window` feedback calls.
        self._window = deque(maxlen=window)
        self._drift_threshold = drift_threshold
        self._stable_threshold = stable_threshold
        # Starting value used until the first feedback arrives.
        self._error_rate: float = 0.5
        self._last_pred = None
        self._last_dist: dict = {}
        # Initialised here but never appended to inside this class.
        self._history: list[dict] = []

    # ── public API ────────────────────────────────────────────────────────────

    def observe(self, token) -> 'DualPredictor':
        """Feed ``token`` to both predictors; returns self."""
        self._stable.observe(token)
        self._drift.observe(token)
        return self

    def predict(self):
        """
        Blend the stable and drift distributions with the current routing
        weights and return ``(predicted_token, confidence)``.

        Returns ``(None, 0.0)`` when neither predictor produced any outcome.
        """
        self._stable.predict()
        self._drift.predict()
        stable_dist = self._stable._last_distribution
        drift_dist = self._drift._last_distribution

        w_drift, w_stable = self._routing_weights()

        outcomes = set(stable_dist) | set(drift_dist)
        if not outcomes:
            self._last_pred = None
            self._last_dist = {}
            return None, 0.0

        mixed = {}
        for key in outcomes:
            mixed[key] = (w_stable * stable_dist.get(key, 0.0)
                          + w_drift * drift_dist.get(key, 0.0))

        # Renormalise; fall back to 1.0 if every blended weight is zero.
        norm = sum(mixed.values()) or 1.0
        self._last_dist = {key: value / norm for key, value in mixed.items()}
        best = max(self._last_dist, key=self._last_dist.get)
        self._last_pred = best
        return best, self._last_dist[best]

    def feedback(self, actual) -> None:
        """
        Score the last blended prediction against ``actual``, refresh the
        rolling error rate, then pass ``actual`` on to both predictors.
        """
        self._window.append(1 if self._last_pred != actual else 0)
        if self._window:
            self._error_rate = sum(self._window) / len(self._window)
        else:
            # A zero-length window never holds anything.
            self._error_rate = 0.5
        self._stable.feedback(actual)
        self._drift.feedback(actual)

    # ── diagnostics ───────────────────────────────────────────────────────────

    @property
    def error_rate(self) -> float:
        """Current rolling error rate."""
        return self._error_rate

    @property
    def active_predictor(self) -> str:
        """'stable' or 'drift' when that weight exceeds 0.8, else 'blend'."""
        drift_weight, stable_weight = self._routing_weights()
        if stable_weight > 0.8:
            return 'stable'
        if drift_weight > 0.8:
            return 'drift'
        return 'blend'

    @property
    def _last_distribution(self) -> dict:
        """Normalised blended distribution from the last ``predict`` call."""
        return self._last_dist

    @property
    def _vocab(self) -> set:
        """Vocabulary of the stability predictor."""
        return self._stable._vocab

    @property
    def history(self) -> list:
        """History of the stability predictor."""
        return self._stable.history

    # ── internal ──────────────────────────────────────────────────────────────

    def _routing_weights(self) -> tuple[float, float]:
        """
        Return ``(w_drift, w_stable)`` for the current error rate.

            error_rate >= drift_threshold   → (1.0, 0.0)
            error_rate <= stable_threshold  → (0.0, 1.0)
            otherwise                       → linear interpolation between
                                              the two thresholds
        """
        rate = self._error_rate
        upper = self._drift_threshold
        lower = self._stable_threshold
        if rate >= upper:
            return 1.0, 0.0
        if rate <= lower:
            return 0.0, 1.0
        span = upper - lower
        drift_share = (rate - lower) / span
        return drift_share, 1.0 - drift_share