uchi-python 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uchi_python-0.2.0/PKG-INFO +273 -0
- uchi_python-0.2.0/README.md +209 -0
- uchi_python-0.2.0/pyproject.toml +86 -0
- uchi_python-0.2.0/tests/test_analytical_skills.py +337 -0
- uchi_python-0.2.0/tests/test_api_harness.py +34 -0
- uchi_python-0.2.0/tests/test_chat.py +149 -0
- uchi_python-0.2.0/tests/test_code_engine.py +251 -0
- uchi_python-0.2.0/tests/test_convergent_engine.py +383 -0
- uchi_python-0.2.0/tests/test_distributional.py +8 -0
- uchi_python-0.2.0/tests/test_dual_predictor.py +14 -0
- uchi_python-0.2.0/tests/test_eval_suite.py +203 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/tests/test_forest.py +3 -3
- {uchi_python-0.1.0 → uchi_python-0.2.0}/tests/test_generative_fixes.py +1 -1
- uchi_python-0.2.0/tests/test_math_simulator.py +54 -0
- uchi_python-0.2.0/tests/test_node_compressor.py +18 -0
- uchi_python-0.2.0/tests/test_omni_router.py +46 -0
- uchi_python-0.2.0/tests/test_phase5_benchmark.py +39 -0
- uchi_python-0.2.0/tests/test_process.py +38 -0
- uchi_python-0.2.0/tests/test_replay.py +27 -0
- uchi_python-0.2.0/tests/test_routing_layer.py +104 -0
- uchi_python-0.2.0/tests/test_simple.py +25 -0
- uchi_python-0.2.0/tests/test_simulation_engine.py +14 -0
- uchi_python-0.2.0/tests/test_skill_registry.py +269 -0
- uchi_python-0.2.0/tests/test_timeseries.py +16 -0
- uchi_python-0.2.0/tests/test_tokenizers.py +17 -0
- uchi_python-0.2.0/tests/test_tree_search_engine.py +706 -0
- uchi_python-0.2.0/tests/test_vector_oracle.py +188 -0
- uchi_python-0.2.0/tests/test_web_plugin.py +14 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/tests/tests.py +2 -2
- {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/__init__.py +17 -1
- uchi_python-0.2.0/uchi/api.py +86 -0
- uchi_python-0.2.0/uchi/api_server.py +195 -0
- uchi_python-0.2.0/uchi/builder.py +214 -0
- uchi_python-0.2.0/uchi/cli.py +203 -0
- uchi_python-0.2.0/uchi/code_engine.py +150 -0
- uchi_python-0.2.0/uchi/code_tokenizer.py +244 -0
- uchi_python-0.2.0/uchi/convergent_engine.py +572 -0
- uchi_python-0.2.0/uchi/cpu_memory.py +74 -0
- uchi_python-0.2.0/uchi/data_loader.py +170 -0
- uchi_python-0.2.0/uchi/experience_replay.py +197 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/forest.py +15 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/generative.py +220 -17
- uchi_python-0.2.0/uchi/grammar_mask.py +184 -0
- uchi_python-0.2.0/uchi/grpo.py +94 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/hoeffding.py +0 -1
- uchi_python-0.2.0/uchi/intent_encoder.py +157 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/long_term_store.py +0 -1
- uchi_python-0.2.0/uchi/memory.py +104 -0
- uchi_python-0.2.0/uchi/neuro_symbolic.py +657 -0
- uchi_python-0.2.0/uchi/omni_evaluator.py +1021 -0
- uchi_python-0.2.0/uchi/omni_router.py +711 -0
- uchi_python-0.2.0/uchi/omni_tokenizer.py +258 -0
- uchi_python-0.2.0/uchi/ontology.py +198 -0
- uchi_python-0.2.0/uchi/ontology_manager.py +48 -0
- uchi_python-0.2.0/uchi/persona.txt +58 -0
- uchi_python-0.2.0/uchi/plugins/__init__.py +1 -0
- uchi_python-0.2.0/uchi/plugins/web.py +37 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/predictor.py +87 -3
- uchi_python-0.2.0/uchi/procedural_memory.py +96 -0
- uchi_python-0.2.0/uchi/process.py +58 -0
- uchi_python-0.2.0/uchi/semantic_index.py +192 -0
- uchi_python-0.2.0/uchi/simulation_engine.py +45 -0
- uchi_python-0.2.0/uchi/skill_registry.py +536 -0
- uchi_python-0.2.0/uchi/specialist_pool.py +78 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/tabular.py +13 -8
- uchi_python-0.2.0/uchi/telemetry.py +139 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/timeseries.py +7 -4
- uchi_python-0.2.0/uchi/tree_search_engine.py +558 -0
- uchi_python-0.2.0/uchi/tui/__init__.py +1 -0
- uchi_python-0.2.0/uchi/tui/app.py +826 -0
- uchi_python-0.2.0/uchi/vector_oracle.py +239 -0
- uchi_python-0.2.0/uchi/web_search.py +79 -0
- uchi_python-0.2.0/uchi_python.egg-info/PKG-INFO +273 -0
- uchi_python-0.2.0/uchi_python.egg-info/SOURCES.txt +83 -0
- uchi_python-0.2.0/uchi_python.egg-info/entry_points.txt +2 -0
- uchi_python-0.2.0/uchi_python.egg-info/requires.txt +51 -0
- uchi_python-0.1.0/PKG-INFO +0 -468
- uchi_python-0.1.0/README.md +0 -432
- uchi_python-0.1.0/pyproject.toml +0 -45
- uchi_python-0.1.0/tests/test_replay.py +0 -40
- uchi_python-0.1.0/uchi/semantic_tokenizer.py +0 -48
- uchi_python-0.1.0/uchi_python.egg-info/PKG-INFO +0 -468
- uchi_python-0.1.0/uchi_python.egg-info/SOURCES.txt +0 -26
- uchi_python-0.1.0/uchi_python.egg-info/requires.txt +0 -22
- {uchi_python-0.1.0 → uchi_python-0.2.0}/LICENSE +0 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/setup.cfg +0 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/discretize.py +0 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/distributional.py +0 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/dual_predictor.py +0 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/node_compressor.py +0 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/online_tokenizer.py +0 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi_python.egg-info/dependency_links.txt +0 -0
- {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi_python.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: uchi-python
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Omni-modal Deterministic Universal Sequence Predictor (ODUSP)
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Documentation, https://github.com/JosephWoodall/uchi/tree/main/docs
|
|
7
|
+
Project-URL: Homepage, https://github.com/JosephWoodall/uchi
|
|
8
|
+
Keywords: machine learning,online learning,sequence prediction,time series,concept drift,trie,CTW
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: fastapi>=0.100.0
|
|
22
|
+
Requires-Dist: uvicorn>=0.23.0
|
|
23
|
+
Requires-Dist: requests>=2.31.0
|
|
24
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
25
|
+
Requires-Dist: tqdm>=4.66.0
|
|
26
|
+
Requires-Dist: textual>=0.50.0
|
|
27
|
+
Requires-Dist: torch>=2.0.0
|
|
28
|
+
Requires-Dist: spacy>=3.0.0
|
|
29
|
+
Provides-Extra: test
|
|
30
|
+
Requires-Dist: pytest; extra == "test"
|
|
31
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
32
|
+
Requires-Dist: psutil; extra == "test"
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: pytest; extra == "dev"
|
|
35
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
36
|
+
Requires-Dist: ruff; extra == "dev"
|
|
37
|
+
Requires-Dist: mypy; extra == "dev"
|
|
38
|
+
Requires-Dist: pre-commit; extra == "dev"
|
|
39
|
+
Requires-Dist: mkdocs-material; extra == "dev"
|
|
40
|
+
Requires-Dist: nltk; extra == "dev"
|
|
41
|
+
Requires-Dist: psutil; extra == "dev"
|
|
42
|
+
Requires-Dist: datasets>=2.14.0; extra == "dev"
|
|
43
|
+
Provides-Extra: sklearn
|
|
44
|
+
Requires-Dist: scikit-learn>=1.0; extra == "sklearn"
|
|
45
|
+
Provides-Extra: numpy
|
|
46
|
+
Requires-Dist: numpy>=1.20; extra == "numpy"
|
|
47
|
+
Provides-Extra: pandas
|
|
48
|
+
Requires-Dist: pandas>=1.3; extra == "pandas"
|
|
49
|
+
Provides-Extra: pyspark
|
|
50
|
+
Requires-Dist: pyspark>=3.0.0; extra == "pyspark"
|
|
51
|
+
Provides-Extra: optuna
|
|
52
|
+
Requires-Dist: optuna>=3.0.0; extra == "optuna"
|
|
53
|
+
Provides-Extra: faiss
|
|
54
|
+
Requires-Dist: faiss-cpu>=1.7; extra == "faiss"
|
|
55
|
+
Provides-Extra: all
|
|
56
|
+
Requires-Dist: scikit-learn>=1.0; extra == "all"
|
|
57
|
+
Requires-Dist: numpy>=1.20; extra == "all"
|
|
58
|
+
Requires-Dist: pandas>=1.3; extra == "all"
|
|
59
|
+
Requires-Dist: pyspark>=3.0.0; extra == "all"
|
|
60
|
+
Requires-Dist: optuna>=3.0.0; extra == "all"
|
|
61
|
+
Requires-Dist: pytest; extra == "all"
|
|
62
|
+
Requires-Dist: faiss-cpu>=1.7; extra == "all"
|
|
63
|
+
Dynamic: license-file
|
|
64
|
+
|
|
65
|
+

|
|
66
|
+
|
|
67
|
+
# Universal Sequence Predictor
|
|
68
|
+
|
|
69
|
+
[](https://pypi.org/project/uchi_python/)
|
|
70
|
+
[](https://opensource.org/licenses/MIT)
|
|
71
|
+
[](https://pypi.org/project/uchi_python/)
|
|
72
|
+
[](https://github.com/JosephWoodall/uchi/actions/workflows/ci.yml)
|
|
73
|
+
|
|
74
|
+
## Core Mission: Omni-modal Deterministic Universal Sequence Predictor (ODUSP)
|
|
75
|
+
Uchi v0.2.0 transforms the architecture from a simple sequence predictor into a completely multi-modal Deterministic Universal Sequence Predictor. It ingests text, audio, images, math telemetry, and code simultaneously—without any neural weights or pre-training. It adds a structured routing layer with intent-based query dispatch via `ProceduralMemory`, a trainable SSM confidence signal via GRPO, persistent vector memory, TUI, REST API, and SDK — all without introducing an LLM dependency.
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
> [!NOTE]
|
|
79
|
+
> Please see `docs/` for the complete Algorithmic Walkthrough, ODUSP vs LLM Benchmarks, and full API references.
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
> [!NOTE]
|
|
84
|
+
> **Comprehensive Documentation & API Reference**
|
|
85
|
+
>
|
|
86
|
+
> For interactive examples, API documentation, and to see the newest capabilities (including online Math Learning, Vector Retrievals, and the Simulation Engine), please see our full documentation website.
|
|
87
|
+
>
|
|
88
|
+
> **[Read the Full Documentation →](https://github.com/JosephWoodall/uchi/tree/main/docs)**
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## Installation
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
pip install -e ".[all]"
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
On first launch, Uchi runs a one-time bootstrap (Python stdlib patterns + Wikipedia facts) and saves the result to `brain.uchi`. Subsequent launches are instant.
|
|
99
|
+
|
|
100
|
+
## Quickstart
|
|
101
|
+
|
|
102
|
+
Uchi has several entry points — the terminal UI, the REST API, and the Python API — that share the same `brain.uchi`, so every interaction through one of them improves the others.
|
|
103
|
+
|
|
104
|
+
### 1. Terminal UI (TUI)
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
uchi # launch interactive chat
|
|
108
|
+
uchi --preload data.txt # pre-train with a file before chatting
|
|
109
|
+
uchi --brain /path/to/brain.uchi # use a specific brain file
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Inside the TUI:
|
|
113
|
+
|
|
114
|
+
| Command | Description |
|
|
115
|
+
|---|---|
|
|
116
|
+
| Just type | Chat with Uchi — it learns from every turn |
|
|
117
|
+
| `/load <file>` | Stream any file into the knowledge base |
|
|
118
|
+
| `/save` | Force-save the current brain state to disk |
|
|
119
|
+
| `Ctrl+S` | Save brain |
|
|
120
|
+
| `Ctrl+C` | Save and quit |
|
|
121
|
+
|
|
122
|
+
Uchi uses your positive/negative feedback to improve itself — type "good", "correct", or "yes" to reinforce a response, or "wrong", "bad", or "no" to prune it.
|
|
123
|
+
|
|
124
|
+
### 2. REST API
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
uvicorn uchi.api_server:app --host 0.0.0.0 --port 8000
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
**POST /chat**
|
|
131
|
+
```bash
|
|
132
|
+
curl -X POST http://localhost:8000/chat \
|
|
133
|
+
-H "Content-Type: application/json" \
|
|
134
|
+
-d '{"message": "what is the capital of France?"}'
|
|
135
|
+
# {"reply": "paris", "entropy": 3.2}
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
**GET /metrics**
|
|
139
|
+
```bash
|
|
140
|
+
curl http://localhost:8000/metrics
|
|
141
|
+
# {"status": "online", "memory_records": 1024, "mode": "deterministic"}
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
**GET /debug/walk**
|
|
145
|
+
```bash
|
|
146
|
+
curl http://localhost:8000/debug/walk
|
|
147
|
+
# Returns trie walk data from the last prediction (depth, contributions, similarity)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### 3. Python API
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
from uchi.omni_router import OmniRouter
|
|
154
|
+
from uchi.cli import load_brain, save_brain
|
|
155
|
+
|
|
156
|
+
# Load existing brain or create a new one
|
|
157
|
+
router = load_brain("brain.uchi") or OmniRouter()
|
|
158
|
+
|
|
159
|
+
# Chat
|
|
160
|
+
reply = router.chat("what is the capital of France?")
|
|
161
|
+
print(reply) # → "paris"
|
|
162
|
+
|
|
163
|
+
# Teach it something new
|
|
164
|
+
router.stream(["<|user|>", "what", "is", "the", "capital", "of", "germany",
|
|
165
|
+
"<|assistant|>", "berlin"])
|
|
166
|
+
|
|
167
|
+
# Save
|
|
168
|
+
save_brain(router, "brain.uchi")
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### 4. Offline Knowledge Bootstrapping
|
|
172
|
+
|
|
173
|
+
To scale Uchi's knowledge base beyond the cold-start defaults, run these scripts once before distributing your `brain.uchi`:
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
# Ingest Wikipedia + code_search_net via HuggingFace (requires: pip install datasets)
|
|
177
|
+
python scripts/bootstrap_knowledge.py --limit 10000
|
|
178
|
+
|
|
179
|
+
# Ingest Python stdlib function patterns via AST (no internet required)
|
|
180
|
+
python scripts/bootstrap_code.py
|
|
181
|
+
|
|
182
|
+
# Ingest Wikipedia fact triples via spaCy SVO extraction (requires: pip install wikipedia spacy)
|
|
183
|
+
python scripts/bootstrap_wikidata.py
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
The resulting `brain.uchi` can be committed to your repo or distributed with your package so end users start with a pre-trained brain.
|
|
187
|
+
|
|
188
|
+
## Benchmarks
|
|
189
|
+
|
|
190
|
+
Uchi is a **deterministic sequence predictor**, not a language model. Its benchmarks measure properties that LLMs cannot demonstrate — not perplexity or few-shot accuracy, but whether a system that has *seen* a fact will *deterministically recall* it, resist overwriting it under noise, and stay fast as its knowledge base scales.
|
|
191
|
+
|
|
192
|
+
Run yourself with:
|
|
193
|
+
```bash
|
|
194
|
+
python benchmarks/run_benchmarks.py
|
|
195
|
+
python benchmarks/run_benchmarks.py --mini # fast CI pass (10 facts)
|
|
196
|
+
python benchmarks/run_benchmarks.py --wipe # clean rebuild before benchmarking
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
Results are written to `eval_metrics.json` and this table is updated automatically.
|
|
200
|
+
|
|
201
|
+
---
|
|
202
|
+
|
|
203
|
+
### Pre-load Recall — **80.0%** (40 / 50)
|
|
204
|
+
|
|
205
|
+
50 factual Q&A pairs (geography, science, history, Python/CS) are streamed directly into the trie as `<|user|> question <|assistant|> answer` sequences. Web search is then disabled and the system is asked each question cold. A pass requires the expected answer to appear in the reply.
|
|
206
|
+
|
|
207
|
+
This is Uchi's core capability claim: *if you teach it something, it recalls it exactly*. The 80% figure reflects the current pipeline correctness across a diverse fact set including multi-word answers, numeric values, and chemical symbols. Failures are vocabulary edge cases (the tokenizer normalises "au" → `gold.n.03`, which is semantically correct but fails substring match).
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
### Zero Catastrophic Forgetting — **100.0%** (10 / 10, after 1 000 noise facts)
|
|
212
|
+
|
|
213
|
+
10 anchor facts are streamed first. Then 1 000 unrelated noise facts are streamed on top. The 10 anchors are then re-tested. 100% means not a single anchor fact was displaced.
|
|
214
|
+
|
|
215
|
+
LLMs trained on a new document lose previously learned facts proportional to the dataset shift (catastrophic interference). Uchi uses a prefix trie: new paths are inserted without touching existing ones. Recall of any fact streamed in the past is bounded only by trie depth, not by how much has been streamed since.
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
### Latency vs. Brain Size — flat O(depth)
|
|
220
|
+
|
|
221
|
+
| Brain size | Latency |
|
|
222
|
+
|---|---|
|
|
223
|
+
| 10 facts | 10 666 ms |
|
|
224
|
+
| 100 facts | 2 568 ms |
|
|
225
|
+
| 500 facts | 2 282 ms |
|
|
226
|
+
| 1 000 facts | 2 597 ms |
|
|
227
|
+
|
|
228
|
+
Latency is measured as wall-clock time for a single chat() turn on a pre-loaded fact, with web search disabled.
|
|
229
|
+
|
|
230
|
+
The pattern is deliberate: latency at 1 000 facts is the same as at 100 facts because trie lookup is O(depth), not O(vocabulary size). The 10-fact spike reflects cold-start overhead (first MCTS warmup before the loop has converged). At scale this overhead amortises to near-zero.
|
|
231
|
+
|
|
232
|
+
---
|
|
233
|
+
|
|
234
|
+
### Code Completion — **5.0%** (1 / 20 HumanEval)
|
|
235
|
+
|
|
236
|
+
20 HumanEval function stubs (`def factorial(n):` etc.) are streamed as training pairs, then recalled. Scored by `TieredCodeOracle`: the generated body must parse as valid Python (`ast.parse`) and contain expected keywords.
|
|
237
|
+
|
|
238
|
+
5% on HumanEval after single-pass training is the *floor*, not the ceiling. Uchi is not pre-trained on code corpora. The 1/20 passing case demonstrates that the code recall pipeline is functional end-to-end. Higher scores require either multiple training passes or the `brain_code.uchi` specialist loaded alongside the base brain.
|
|
239
|
+
|
|
240
|
+
---
|
|
241
|
+
|
|
242
|
+
### Inference Latency — **2 333 ms** per turn
|
|
243
|
+
|
|
244
|
+
Single chat turn on a pre-loaded fact, web search off. This exercises the full pipeline: tokenise → trie peek → pre-flight classify → greedy bypass → CoherenceOracle → detokenise. Down from 17 762 ms in the pre-optimization baseline (7.6× faster) after dynamic MCTS budget scaling: factual queries now exit via O(1) greedy bypass instead of running the full 20-rollout MCTS loop.
|
|
245
|
+
|
|
246
|
+
---
|
|
247
|
+
|
|
248
|
+
### RAM Footprint — **1 374 MB** resident
|
|
249
|
+
|
|
250
|
+
Measured after loading `brain.uchi` and running the recall stream. Dominated by the trie node store (~1.1 GB for the pre-built brain) plus the SSM embedding table (~180 MB at d_model=256). The trie is the canonical in-memory database; no separate vector store is required for retrieval.
|
|
251
|
+
|
|
252
|
+
---
|
|
253
|
+
|
|
254
|
+
### Hallucination Rate — **0%**
|
|
255
|
+
|
|
256
|
+
Uchi cannot fabricate tokens that are not in its trie. Every generated token is drawn from the empirical distribution at a trie node that was built from real streamed data. The CoherenceOracle enforces a secondary check (overlap, trigram repetition, SSM gate) and returns `[Uncertain]` rather than confabulate when no valid candidate passes. Zero hallucination is a structural guarantee, not a tuned behaviour.
|
|
257
|
+
|
|
258
|
+
---
|
|
259
|
+
|
|
260
|
+
<!-- BENCHMARK_TABLE_START -->
|
|
261
|
+
| Metric | Score | Notes |
|
|
262
|
+
|---|---|---|
|
|
263
|
+
| **Pre-load Recall** | **80.0%** (n=50) | Stream N facts → immediately test recall; measures deterministic memory |
|
|
264
|
+
| **Zero Catastrophic Forgetting** | **100.0%** after 1000 noise facts | Anchor facts recalled correctly after 1000 distractors streamed on top |
|
|
265
|
+
| **Latency vs. Brain Size** | 10facts→10666ms 100facts→2568ms 500facts→2282ms 1000facts→2597ms | Proves O(depth) trie lookup: latency stays flat as brain grows |
|
|
266
|
+
| **Code Completion** | **5.0%** (n=20 HumanEval) | Python function stub → body; scored by syntax + keyword validity |
|
|
267
|
+
| **Inference Latency** | **2333.1 ms** | Single turn on a pre-loaded fact, web search disabled |
|
|
268
|
+
| **RAM Footprint** | **1374.2 MB** | Resident set after brain load + recall stream |
|
|
269
|
+
| **Hallucination Rate** | **0%** | Strict trie boundary enforcement |
|
|
270
|
+
<!-- BENCHMARK_TABLE_END -->
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+

|
|
2
|
+
|
|
3
|
+
# Universal Sequence Predictor
|
|
4
|
+
|
|
5
|
+
[PyPI version](https://pypi.org/project/uchi_python/)
|
|
6
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
7
|
+
[Python versions](https://pypi.org/project/uchi_python/)
|
|
8
|
+
[CI status](https://github.com/JosephWoodall/uchi/actions/workflows/ci.yml)
|
|
9
|
+
|
|
10
|
+
## Core Mission: Omni-modal Deterministic Universal Sequence Predictor (ODUSP)
|
|
11
|
+
Uchi v0.2.0 transforms the architecture from a simple sequence predictor into a fully multi-modal Deterministic Universal Sequence Predictor. It ingests text, audio, images, math telemetry, and code simultaneously — without any neural weights or pre-training. It adds a structured routing layer with intent-based query dispatch via `ProceduralMemory`, a trainable SSM confidence signal via GRPO, persistent vector memory, a TUI, a REST API, and an SDK — all without introducing an LLM dependency.
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
> [!NOTE]
|
|
15
|
+
> Please see `docs/` for the complete Algorithmic Walkthrough, ODUSP vs LLM Benchmarks, and full API references.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
> [!NOTE]
|
|
20
|
+
> **Comprehensive Documentation & API Reference**
|
|
21
|
+
>
|
|
22
|
+
> For interactive examples, API documentation, and to see the newest capabilities (including online Math Learning, Vector Retrievals, and the Simulation Engine), please see our full documentation website.
|
|
23
|
+
>
|
|
24
|
+
> **[Read the Full Documentation →](https://github.com/JosephWoodall/uchi/tree/main/docs)**
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install "uchi-python[all]"   # from PyPI
pip install -e ".[all]"          # from a source checkout (development install)
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
On first launch, Uchi runs a one-time bootstrap (Python stdlib patterns + Wikipedia facts) and saves the result to `brain.uchi`. Subsequent launches are instant.
|
|
35
|
+
|
|
36
|
+
## Quickstart
|
|
37
|
+
|
|
38
|
+
Uchi has several entry points — the terminal UI, the REST API, and the Python API — that share the same `brain.uchi`, so every interaction through one of them improves the others.
|
|
39
|
+
|
|
40
|
+
### 1. Terminal UI (TUI)
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
uchi # launch interactive chat
|
|
44
|
+
uchi --preload data.txt # pre-train with a file before chatting
|
|
45
|
+
uchi --brain /path/to/brain.uchi # use a specific brain file
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Inside the TUI:
|
|
49
|
+
|
|
50
|
+
| Command | Description |
|
|
51
|
+
|---|---|
|
|
52
|
+
| Just type | Chat with Uchi — it learns from every turn |
|
|
53
|
+
| `/load <file>` | Stream any file into the knowledge base |
|
|
54
|
+
| `/save` | Force-save the current brain state to disk |
|
|
55
|
+
| `Ctrl+S` | Save brain |
|
|
56
|
+
| `Ctrl+C` | Save and quit |
|
|
57
|
+
|
|
58
|
+
Uchi uses your positive/negative feedback to improve itself — type "good", "correct", or "yes" to reinforce a response, or "wrong", "bad", or "no" to prune it.
|
|
59
|
+
|
|
60
|
+
### 2. REST API
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
uvicorn uchi.api_server:app --host 0.0.0.0 --port 8000
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
**POST /chat**
|
|
67
|
+
```bash
|
|
68
|
+
curl -X POST http://localhost:8000/chat \
|
|
69
|
+
-H "Content-Type: application/json" \
|
|
70
|
+
-d '{"message": "what is the capital of France?"}'
|
|
71
|
+
# {"reply": "paris", "entropy": 3.2}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
**GET /metrics**
|
|
75
|
+
```bash
|
|
76
|
+
curl http://localhost:8000/metrics
|
|
77
|
+
# {"status": "online", "memory_records": 1024, "mode": "deterministic"}
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
**GET /debug/walk**
|
|
81
|
+
```bash
|
|
82
|
+
curl http://localhost:8000/debug/walk
|
|
83
|
+
# Returns trie walk data from the last prediction (depth, contributions, similarity)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### 3. Python API
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from uchi.omni_router import OmniRouter
|
|
90
|
+
from uchi.cli import load_brain, save_brain
|
|
91
|
+
|
|
92
|
+
# Load existing brain or create a new one
|
|
93
|
+
router = load_brain("brain.uchi") or OmniRouter()
|
|
94
|
+
|
|
95
|
+
# Chat
|
|
96
|
+
reply = router.chat("what is the capital of France?")
|
|
97
|
+
print(reply) # → "paris"
|
|
98
|
+
|
|
99
|
+
# Teach it something new
|
|
100
|
+
router.stream(["<|user|>", "what", "is", "the", "capital", "of", "germany",
|
|
101
|
+
"<|assistant|>", "berlin"])
|
|
102
|
+
|
|
103
|
+
# Save
|
|
104
|
+
save_brain(router, "brain.uchi")
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### 4. Offline Knowledge Bootstrapping
|
|
108
|
+
|
|
109
|
+
To scale Uchi's knowledge base beyond the cold-start defaults, run these scripts once before distributing your `brain.uchi`:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
# Ingest Wikipedia + code_search_net via HuggingFace (requires: pip install datasets)
|
|
113
|
+
python scripts/bootstrap_knowledge.py --limit 10000
|
|
114
|
+
|
|
115
|
+
# Ingest Python stdlib function patterns via AST (no internet required)
|
|
116
|
+
python scripts/bootstrap_code.py
|
|
117
|
+
|
|
118
|
+
# Ingest Wikipedia fact triples via spaCy SVO extraction (requires: pip install wikipedia spacy)
|
|
119
|
+
python scripts/bootstrap_wikidata.py
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
The resulting `brain.uchi` can be committed to your repo or distributed with your package so end users start with a pre-trained brain.
|
|
123
|
+
|
|
124
|
+
## Benchmarks
|
|
125
|
+
|
|
126
|
+
Uchi is a **deterministic sequence predictor**, not a language model. Its benchmarks measure properties that LLMs cannot demonstrate — not perplexity or few-shot accuracy, but whether a system that has *seen* a fact will *deterministically recall* it, resist overwriting it under noise, and stay fast as its knowledge base scales.
|
|
127
|
+
|
|
128
|
+
Run yourself with:
|
|
129
|
+
```bash
|
|
130
|
+
python benchmarks/run_benchmarks.py
|
|
131
|
+
python benchmarks/run_benchmarks.py --mini # fast CI pass (10 facts)
|
|
132
|
+
python benchmarks/run_benchmarks.py --wipe # clean rebuild before benchmarking
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
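Results are written to `eval_metrics.json`, and the summary table at the end of this section (between the `BENCHMARK_TABLE_START` and `BENCHMARK_TABLE_END` markers) is updated automatically.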
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
### Pre-load Recall — **80.0%** (40 / 50)
|
|
140
|
+
|
|
141
|
+
50 factual Q&A pairs (geography, science, history, Python/CS) are streamed directly into the trie as `<|user|> question <|assistant|> answer` sequences. Web search is then disabled and the system is asked each question cold. A pass requires the expected answer to appear in the reply.
|
|
142
|
+
|
|
143
|
+
This is Uchi's core capability claim: *if you teach it something, it recalls it exactly*. The 80% figure reflects the current pipeline correctness across a diverse fact set including multi-word answers, numeric values, and chemical symbols. Failures are vocabulary edge cases (for example, the tokenizer normalises "au" → `gold.n.03`, which is semantically correct but fails the substring match).
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
### Zero Catastrophic Forgetting — **100.0%** (10 / 10, after 1 000 noise facts)
|
|
148
|
+
|
|
149
|
+
10 anchor facts are streamed first. Then 1 000 unrelated noise facts are streamed on top. The 10 anchors are then re-tested. 100% means not a single anchor fact was displaced.
|
|
150
|
+
|
|
151
|
+
LLMs trained on a new document lose previously learned facts proportional to the dataset shift (catastrophic interference). Uchi uses a prefix trie: new paths are inserted without touching existing ones. Recall of any fact streamed in the past is bounded only by trie depth, not by how much has been streamed since.
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
### Latency vs. Brain Size — flat O(depth)
|
|
156
|
+
|
|
157
|
+
| Brain size | Latency |
|
|
158
|
+
|---|---|
|
|
159
|
+
| 10 facts | 10 666 ms |
|
|
160
|
+
| 100 facts | 2 568 ms |
|
|
161
|
+
| 500 facts | 2 282 ms |
|
|
162
|
+
| 1 000 facts | 2 597 ms |
|
|
163
|
+
|
|
164
|
+
Latency is measured as wall-clock time for a single chat() turn on a pre-loaded fact, with web search disabled.
|
|
165
|
+
|
|
166
|
+
The pattern is by design: latency at 1 000 facts (2 597 ms) is essentially the same as at 100 facts (2 568 ms) because trie lookup is O(depth), not O(vocabulary size). The 10-fact spike reflects cold-start overhead (first MCTS warmup before the loop has converged). At scale this overhead amortises to near-zero.
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
### Code Completion — **5.0%** (1 / 20 HumanEval)
|
|
171
|
+
|
|
172
|
+
20 HumanEval function stubs (`def factorial(n):` etc.) are streamed as training pairs, then recalled. Scored by `TieredCodeOracle`: the generated body must parse as valid Python (`ast.parse`) and contain expected keywords.
|
|
173
|
+
|
|
174
|
+
5% on HumanEval after single-pass training is the *floor*, not the ceiling. Uchi is not pre-trained on code corpora. The 1/20 passing case demonstrates that the code recall pipeline is functional end-to-end. Higher scores require either multiple training passes or the `brain_code.uchi` specialist loaded alongside the base brain.
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
### Inference Latency — **2 333 ms** per turn
|
|
179
|
+
|
|
180
|
+
Single chat turn on a pre-loaded fact, web search off. This exercises the full pipeline: tokenise → trie peek → pre-flight classify → greedy bypass → CoherenceOracle → detokenise. Down from 17 762 ms in the pre-optimization baseline (7.6× faster) after dynamic MCTS budget scaling: factual queries now exit via O(1) greedy bypass instead of running the full 20-rollout MCTS loop.
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
### RAM Footprint — **1 374 MB** resident
|
|
185
|
+
|
|
186
|
+
Measured after loading `brain.uchi` and running the recall stream. Dominated by the trie node store (~1.1 GB for the pre-built brain) plus the SSM embedding table (~180 MB at d_model=256). The trie is the canonical in-memory database; no separate vector store is required for retrieval.
|
|
187
|
+
|
|
188
|
+
---
|
|
189
|
+
|
|
190
|
+
### Hallucination Rate — **0%**
|
|
191
|
+
|
|
192
|
+
Uchi cannot fabricate tokens that are not in its trie. Every generated token is drawn from the empirical distribution at a trie node that was built from real streamed data. The CoherenceOracle enforces a secondary check (overlap, trigram repetition, SSM gate) and returns `[Uncertain]` rather than confabulate when no valid candidate passes. Zero hallucination is a structural guarantee, not a tuned behaviour.
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
<!-- BENCHMARK_TABLE_START -->
|
|
197
|
+
| Metric | Score | Notes |
|
|
198
|
+
|---|---|---|
|
|
199
|
+
| **Pre-load Recall** | **80.0%** (n=50) | Stream N facts → immediately test recall; measures deterministic memory |
|
|
200
|
+
| **Zero Catastrophic Forgetting** | **100.0%** after 1000 noise facts | Anchor facts recalled correctly after 1000 distractors streamed on top |
|
|
201
|
+
| **Latency vs. Brain Size** | 10facts→10666ms 100facts→2568ms 500facts→2282ms 1000facts→2597ms | Proves O(depth) trie lookup: latency stays flat as brain grows |
|
|
202
|
+
| **Code Completion** | **5.0%** (n=20 HumanEval) | Python function stub → body; scored by syntax + keyword validity |
|
|
203
|
+
| **Inference Latency** | **2333.1 ms** | Single turn on a pre-loaded fact, web search disabled |
|
|
204
|
+
| **RAM Footprint** | **1374.2 MB** | Resident set after brain load + recall stream |
|
|
205
|
+
| **Hallucination Rate** | **0%** | Strict trie boundary enforcement |
|
|
206
|
+
<!-- BENCHMARK_TABLE_END -->
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "uchi-python"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Omni-modal Deterministic Universal Sequence Predictor (ODUSP)"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
keywords = [
|
|
13
|
+
"machine learning", "online learning", "sequence prediction",
|
|
14
|
+
"time series", "concept drift", "trie", "CTW",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Development Status :: 3 - Alpha",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
26
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
# NOTE: as of 0.2.0 the package is no longer dependency-free; the runtime requirements are listed below.
|
|
30
|
+
# Core runtime dependencies (the uchi code itself is pure Python, but torch and spaCy ship compiled extensions)
|
|
31
|
+
dependencies = [
|
|
32
|
+
"fastapi>=0.100.0",
|
|
33
|
+
"uvicorn>=0.23.0",
|
|
34
|
+
"requests>=2.31.0",
|
|
35
|
+
"beautifulsoup4>=4.12.0",
|
|
36
|
+
"tqdm>=4.66.0",
|
|
37
|
+
"textual>=0.50.0",
|
|
38
|
+
"torch>=2.0.0",
|
|
39
|
+
"spacy>=3.0.0",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.urls]
|
|
43
|
+
Documentation = "https://github.com/JosephWoodall/uchi/tree/main/docs"
|
|
44
|
+
Homepage = "https://github.com/JosephWoodall/uchi"
|
|
45
|
+
|
|
46
|
+
[project.scripts]
|
|
47
|
+
uchi = "uchi.cli:main"
|
|
48
|
+
|
|
49
|
+
[project.optional-dependencies]
|
|
50
|
+
test = ["pytest", "pytest-cov", "psutil"]
|
|
51
|
+
dev = ["pytest", "pytest-cov", "ruff", "mypy", "pre-commit", "mkdocs-material", "nltk", "psutil", "datasets>=2.14.0"]
|
|
52
|
+
sklearn = ["scikit-learn>=1.0"]
|
|
53
|
+
numpy = ["numpy>=1.20"]
|
|
54
|
+
pandas = ["pandas>=1.3"]
|
|
55
|
+
pyspark = ["pyspark>=3.0.0"]
|
|
56
|
+
optuna = ["optuna>=3.0.0"]
|
|
57
|
+
faiss = ["faiss-cpu>=1.7"]
|
|
58
|
+
all = ["scikit-learn>=1.0", "numpy>=1.20", "pandas>=1.3", "pyspark>=3.0.0", "optuna>=3.0.0", "pytest", "faiss-cpu>=1.7"]
|
|
59
|
+
|
|
60
|
+
[tool.setuptools.packages.find]
|
|
61
|
+
where = ["."]
|
|
62
|
+
include = ["uchi*"]
|
|
63
|
+
|
|
64
|
+
[tool.setuptools.package-data]
|
|
65
|
+
uchi = ["py.typed", "persona.txt"]
|
|
66
|
+
|
|
67
|
+
[tool.pytest.ini_options]
|
|
68
|
+
testpaths = ["tests"]
|
|
69
|
+
markers = [
|
|
70
|
+
"eval: quality benchmark tests — slow, excluded from normal CI (run with: pytest -m eval)",
|
|
71
|
+
]
|
|
72
|
+
|
|
73
|
+
[tool.ruff]
|
|
74
|
+
line-length = 100
|
|
75
|
+
target-version = "py310"
|
|
76
|
+
exclude = [".git", ".venv", "__pycache__"]
|
|
77
|
+
|
|
78
|
+
[tool.mypy]
|
|
79
|
+
python_version = "3.10"
|
|
80
|
+
warn_return_any = false
|
|
81
|
+
warn_unused_configs = true
|
|
82
|
+
ignore_missing_imports = true
|
|
83
|
+
|
|
84
|
+
[[tool.mypy.overrides]]
|
|
85
|
+
module = ["numpy.*"]
|
|
86
|
+
ignore_errors = true
|