uchi-python 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. uchi_python-0.2.0/PKG-INFO +273 -0
  2. uchi_python-0.2.0/README.md +209 -0
  3. uchi_python-0.2.0/pyproject.toml +86 -0
  4. uchi_python-0.2.0/tests/test_analytical_skills.py +337 -0
  5. uchi_python-0.2.0/tests/test_api_harness.py +34 -0
  6. uchi_python-0.2.0/tests/test_chat.py +149 -0
  7. uchi_python-0.2.0/tests/test_code_engine.py +251 -0
  8. uchi_python-0.2.0/tests/test_convergent_engine.py +383 -0
  9. uchi_python-0.2.0/tests/test_distributional.py +8 -0
  10. uchi_python-0.2.0/tests/test_dual_predictor.py +14 -0
  11. uchi_python-0.2.0/tests/test_eval_suite.py +203 -0
  12. {uchi_python-0.1.0 → uchi_python-0.2.0}/tests/test_forest.py +3 -3
  13. {uchi_python-0.1.0 → uchi_python-0.2.0}/tests/test_generative_fixes.py +1 -1
  14. uchi_python-0.2.0/tests/test_math_simulator.py +54 -0
  15. uchi_python-0.2.0/tests/test_node_compressor.py +18 -0
  16. uchi_python-0.2.0/tests/test_omni_router.py +46 -0
  17. uchi_python-0.2.0/tests/test_phase5_benchmark.py +39 -0
  18. uchi_python-0.2.0/tests/test_process.py +38 -0
  19. uchi_python-0.2.0/tests/test_replay.py +27 -0
  20. uchi_python-0.2.0/tests/test_routing_layer.py +104 -0
  21. uchi_python-0.2.0/tests/test_simple.py +25 -0
  22. uchi_python-0.2.0/tests/test_simulation_engine.py +14 -0
  23. uchi_python-0.2.0/tests/test_skill_registry.py +269 -0
  24. uchi_python-0.2.0/tests/test_timeseries.py +16 -0
  25. uchi_python-0.2.0/tests/test_tokenizers.py +17 -0
  26. uchi_python-0.2.0/tests/test_tree_search_engine.py +706 -0
  27. uchi_python-0.2.0/tests/test_vector_oracle.py +188 -0
  28. uchi_python-0.2.0/tests/test_web_plugin.py +14 -0
  29. {uchi_python-0.1.0 → uchi_python-0.2.0}/tests/tests.py +2 -2
  30. {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/__init__.py +17 -1
  31. uchi_python-0.2.0/uchi/api.py +86 -0
  32. uchi_python-0.2.0/uchi/api_server.py +195 -0
  33. uchi_python-0.2.0/uchi/builder.py +214 -0
  34. uchi_python-0.2.0/uchi/cli.py +203 -0
  35. uchi_python-0.2.0/uchi/code_engine.py +150 -0
  36. uchi_python-0.2.0/uchi/code_tokenizer.py +244 -0
  37. uchi_python-0.2.0/uchi/convergent_engine.py +572 -0
  38. uchi_python-0.2.0/uchi/cpu_memory.py +74 -0
  39. uchi_python-0.2.0/uchi/data_loader.py +170 -0
  40. uchi_python-0.2.0/uchi/experience_replay.py +197 -0
  41. {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/forest.py +15 -0
  42. {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/generative.py +220 -17
  43. uchi_python-0.2.0/uchi/grammar_mask.py +184 -0
  44. uchi_python-0.2.0/uchi/grpo.py +94 -0
  45. {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/hoeffding.py +0 -1
  46. uchi_python-0.2.0/uchi/intent_encoder.py +157 -0
  47. {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/long_term_store.py +0 -1
  48. uchi_python-0.2.0/uchi/memory.py +104 -0
  49. uchi_python-0.2.0/uchi/neuro_symbolic.py +657 -0
  50. uchi_python-0.2.0/uchi/omni_evaluator.py +1021 -0
  51. uchi_python-0.2.0/uchi/omni_router.py +711 -0
  52. uchi_python-0.2.0/uchi/omni_tokenizer.py +258 -0
  53. uchi_python-0.2.0/uchi/ontology.py +198 -0
  54. uchi_python-0.2.0/uchi/ontology_manager.py +48 -0
  55. uchi_python-0.2.0/uchi/persona.txt +58 -0
  56. uchi_python-0.2.0/uchi/plugins/__init__.py +1 -0
  57. uchi_python-0.2.0/uchi/plugins/web.py +37 -0
  58. {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/predictor.py +87 -3
  59. uchi_python-0.2.0/uchi/procedural_memory.py +96 -0
  60. uchi_python-0.2.0/uchi/process.py +58 -0
  61. uchi_python-0.2.0/uchi/semantic_index.py +192 -0
  62. uchi_python-0.2.0/uchi/simulation_engine.py +45 -0
  63. uchi_python-0.2.0/uchi/skill_registry.py +536 -0
  64. uchi_python-0.2.0/uchi/specialist_pool.py +78 -0
  65. {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/tabular.py +13 -8
  66. uchi_python-0.2.0/uchi/telemetry.py +139 -0
  67. {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/timeseries.py +7 -4
  68. uchi_python-0.2.0/uchi/tree_search_engine.py +558 -0
  69. uchi_python-0.2.0/uchi/tui/__init__.py +1 -0
  70. uchi_python-0.2.0/uchi/tui/app.py +826 -0
  71. uchi_python-0.2.0/uchi/vector_oracle.py +239 -0
  72. uchi_python-0.2.0/uchi/web_search.py +79 -0
  73. uchi_python-0.2.0/uchi_python.egg-info/PKG-INFO +273 -0
  74. uchi_python-0.2.0/uchi_python.egg-info/SOURCES.txt +83 -0
  75. uchi_python-0.2.0/uchi_python.egg-info/entry_points.txt +2 -0
  76. uchi_python-0.2.0/uchi_python.egg-info/requires.txt +51 -0
  77. uchi_python-0.1.0/PKG-INFO +0 -468
  78. uchi_python-0.1.0/README.md +0 -432
  79. uchi_python-0.1.0/pyproject.toml +0 -45
  80. uchi_python-0.1.0/tests/test_replay.py +0 -40
  81. uchi_python-0.1.0/uchi/semantic_tokenizer.py +0 -48
  82. uchi_python-0.1.0/uchi_python.egg-info/PKG-INFO +0 -468
  83. uchi_python-0.1.0/uchi_python.egg-info/SOURCES.txt +0 -26
  84. uchi_python-0.1.0/uchi_python.egg-info/requires.txt +0 -22
  85. {uchi_python-0.1.0 → uchi_python-0.2.0}/LICENSE +0 -0
  86. {uchi_python-0.1.0 → uchi_python-0.2.0}/setup.cfg +0 -0
  87. {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/discretize.py +0 -0
  88. {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/distributional.py +0 -0
  89. {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/dual_predictor.py +0 -0
  90. {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/node_compressor.py +0 -0
  91. {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi/online_tokenizer.py +0 -0
  92. {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi_python.egg-info/dependency_links.txt +0 -0
  93. {uchi_python-0.1.0 → uchi_python-0.2.0}/uchi_python.egg-info/top_level.txt +0 -0
@@ -0,0 +1,273 @@
1
+ Metadata-Version: 2.4
2
+ Name: uchi-python
3
+ Version: 0.2.0
4
+ Summary: Omni-modal Deterministic Universal Sequence Predictor (ODUSP)
5
+ License: MIT
6
+ Project-URL: Documentation, https://github.com/JosephWoodall/uchi/tree/main/docs
7
+ Project-URL: Homepage, https://github.com/JosephWoodall/uchi
8
+ Keywords: machine learning,online learning,sequence prediction,time series,concept drift,trie,CTW
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: fastapi>=0.100.0
22
+ Requires-Dist: uvicorn>=0.23.0
23
+ Requires-Dist: requests>=2.31.0
24
+ Requires-Dist: beautifulsoup4>=4.12.0
25
+ Requires-Dist: tqdm>=4.66.0
26
+ Requires-Dist: textual>=0.50.0
27
+ Requires-Dist: torch>=2.0.0
28
+ Requires-Dist: spacy>=3.0.0
29
+ Provides-Extra: test
30
+ Requires-Dist: pytest; extra == "test"
31
+ Requires-Dist: pytest-cov; extra == "test"
32
+ Requires-Dist: psutil; extra == "test"
33
+ Provides-Extra: dev
34
+ Requires-Dist: pytest; extra == "dev"
35
+ Requires-Dist: pytest-cov; extra == "dev"
36
+ Requires-Dist: ruff; extra == "dev"
37
+ Requires-Dist: mypy; extra == "dev"
38
+ Requires-Dist: pre-commit; extra == "dev"
39
+ Requires-Dist: mkdocs-material; extra == "dev"
40
+ Requires-Dist: nltk; extra == "dev"
41
+ Requires-Dist: psutil; extra == "dev"
42
+ Requires-Dist: datasets>=2.14.0; extra == "dev"
43
+ Provides-Extra: sklearn
44
+ Requires-Dist: scikit-learn>=1.0; extra == "sklearn"
45
+ Provides-Extra: numpy
46
+ Requires-Dist: numpy>=1.20; extra == "numpy"
47
+ Provides-Extra: pandas
48
+ Requires-Dist: pandas>=1.3; extra == "pandas"
49
+ Provides-Extra: pyspark
50
+ Requires-Dist: pyspark>=3.0.0; extra == "pyspark"
51
+ Provides-Extra: optuna
52
+ Requires-Dist: optuna>=3.0.0; extra == "optuna"
53
+ Provides-Extra: faiss
54
+ Requires-Dist: faiss-cpu>=1.7; extra == "faiss"
55
+ Provides-Extra: all
56
+ Requires-Dist: scikit-learn>=1.0; extra == "all"
57
+ Requires-Dist: numpy>=1.20; extra == "all"
58
+ Requires-Dist: pandas>=1.3; extra == "all"
59
+ Requires-Dist: pyspark>=3.0.0; extra == "all"
60
+ Requires-Dist: optuna>=3.0.0; extra == "all"
61
+ Requires-Dist: pytest; extra == "all"
62
+ Requires-Dist: faiss-cpu>=1.7; extra == "all"
63
+ Dynamic: license-file
64
+
65
+ ![Uchi Logo](docs/logo.png)
66
+
67
+ # Universal Sequence Predictor
68
+
69
+ [![PyPI version](https://img.shields.io/pypi/v/uchi_python.svg)](https://pypi.org/project/uchi_python/)
70
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
71
+ [![Python Versions](https://img.shields.io/pypi/pyversions/uchi_python.svg)](https://pypi.org/project/uchi_python/)
72
+ [![Tests](https://github.com/JosephWoodall/uchi/actions/workflows/ci.yml/badge.svg)](https://github.com/JosephWoodall/uchi/actions/workflows/ci.yml)
73
+
74
+ ## Core Mission: Omni-modal Deterministic Universal Sequence Predictor (ODUSP)
75
+ Uchi v0.2.0 transforms the architecture from a simple sequence predictor into a completely multi-modal Deterministic Universal Sequence Predictor. It ingests text, audio, images, math telemetry, and code simultaneously—without any neural weights or pre-training. It adds a structured routing layer with intent-based query dispatch via `ProceduralMemory`, a trainable SSM confidence signal via GRPO, persistent vector memory, TUI, REST API, and SDK — all without introducing an LLM dependency.
76
+
77
+
78
+ > [!NOTE]
79
+ > Please see `docs/` for the complete Algorithmic Walkthrough, ODUSP vs LLM Benchmarks, and full API references.
80
+
81
+ ---
82
+
83
+ > [!NOTE]
84
+ > **Comprehensive Documentation & API Reference**
85
+ >
86
+ > For interactive examples, API documentation, and to see the newest capabilities (including online Math Learning, Vector Retrievals, and the Simulation Engine), please see our full documentation website.
87
+ >
88
+ > **[Read the Full Documentation →](https://github.com/JosephWoodall/uchi/tree/main/docs)**
89
+
90
+ ---
91
+
92
+ ## Installation
93
+
94
+ ```bash
95
+ pip install -e ".[all]"
96
+ ```
97
+
98
+ On first launch, Uchi runs a one-time bootstrap (Python stdlib patterns + Wikipedia facts) and saves the result to `brain.uchi`. Subsequent launches are instant.
99
+
100
+ ## Quickstart
101
+
102
+ Uchi has three entry points — the terminal UI, the REST API, and the Python API — that share the same `brain.uchi`, so every interaction through one improves the others.
103
+
104
+ ### 1. Terminal UI (TUI)
105
+
106
+ ```bash
107
+ uchi # launch interactive chat
108
+ uchi --preload data.txt # pre-train with a file before chatting
109
+ uchi --brain /path/to/brain.uchi # use a specific brain file
110
+ ```
111
+
112
+ Inside the TUI:
113
+
114
+ | Command | Description |
115
+ |---|---|
116
+ | Just type | Chat with Uchi — it learns from every turn |
117
+ | `/load <file>` | Stream any file into the knowledge base |
118
+ | `/save` | Force-save the current brain state to disk |
119
+ | `Ctrl+S` | Save brain |
120
+ | `Ctrl+C` | Save and quit |
121
+
122
+ Uchi uses your positive/negative feedback to improve itself — type "good", "correct", or "yes" to reinforce a response, or "wrong", "bad", or "no" to prune it.
123
+
124
+ ### 2. REST API
125
+
126
+ ```bash
127
+ uvicorn uchi.api_server:app --host 0.0.0.0 --port 8000
128
+ ```
129
+
130
+ **POST /chat**
131
+ ```bash
132
+ curl -X POST http://localhost:8000/chat \
133
+ -H "Content-Type: application/json" \
134
+ -d '{"message": "what is the capital of France?"}'
135
+ # {"reply": "paris", "entropy": 3.2}
136
+ ```
137
+
138
+ **GET /metrics**
139
+ ```bash
140
+ curl http://localhost:8000/metrics
141
+ # {"status": "online", "memory_records": 1024, "mode": "deterministic"}
142
+ ```
143
+
144
+ **GET /debug/walk**
145
+ ```bash
146
+ curl http://localhost:8000/debug/walk
147
+ # Returns trie walk data from the last prediction (depth, contributions, similarity)
148
+ ```
149
+
150
+ ### 3. Python API
151
+
152
+ ```python
153
+ from uchi.omni_router import OmniRouter
154
+ from uchi.cli import load_brain, save_brain
155
+
156
+ # Load existing brain or create a new one
157
+ router = load_brain("brain.uchi") or OmniRouter()
158
+
159
+ # Chat
160
+ reply = router.chat("what is the capital of France?")
161
+ print(reply) # → "paris"
162
+
163
+ # Teach it something new
164
+ router.stream(["<|user|>", "what", "is", "the", "capital", "of", "germany",
165
+ "<|assistant|>", "berlin"])
166
+
167
+ # Save
168
+ save_brain(router, "brain.uchi")
169
+ ```
170
+
171
+ ### 4. Offline Knowledge Bootstrapping
172
+
173
+ To scale Uchi's knowledge base beyond the cold-start defaults, run these scripts once before distributing your `brain.uchi`:
174
+
175
+ ```bash
176
+ # Ingest Wikipedia + code_search_net via HuggingFace (requires: pip install datasets)
177
+ python scripts/bootstrap_knowledge.py --limit 10000
178
+
179
+ # Ingest Python stdlib function patterns via AST (no internet required)
180
+ python scripts/bootstrap_code.py
181
+
182
+ # Ingest Wikipedia fact triples via spaCy SVO extraction (requires: pip install wikipedia spacy)
183
+ python scripts/bootstrap_wikidata.py
184
+ ```
185
+
186
+ The resulting `brain.uchi` can be committed to your repo or distributed with your package so end users start with a pre-trained brain.
187
+
188
+ ## Benchmarks
189
+
190
+ Uchi is a **deterministic sequence predictor**, not a language model. Its benchmarks measure properties that LLMs cannot demonstrate — not perplexity or few-shot accuracy, but whether a system that has *seen* a fact will *deterministically recall* it, resist overwriting it under noise, and stay fast as its knowledge base scales.
191
+
192
+ Run the benchmarks yourself with:
193
+ ```bash
194
+ python benchmarks/run_benchmarks.py
195
+ python benchmarks/run_benchmarks.py --mini # fast CI pass (10 facts)
196
+ python benchmarks/run_benchmarks.py --wipe # clean rebuild before benchmarking
197
+ ```
198
+
199
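+ Results are written to `eval_metrics.json`, and the summary table at the end of this section (between the `BENCHMARK_TABLE_START` / `BENCHMARK_TABLE_END` markers) is updated automatically.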
200
+
201
+ ---
202
+
203
+ ### Pre-load Recall — **80.0%** (40 / 50)
204
+
205
+ 50 factual Q&A pairs (geography, science, history, Python/CS) are streamed directly into the trie as `<|user|> question <|assistant|> answer` sequences. Web search is then disabled and the system is asked each question cold. A pass requires the expected answer to appear in the reply.
206
+
207
+ This is Uchi's core capability claim: *if you teach it something, it recalls it exactly*. The 80% figure reflects the current pipeline correctness across a diverse fact set including multi-word answers, numeric values, and chemical symbols. Failures are vocabulary edge cases (the tokenizer normalises "au" → `gold.n.03`, which is semantically correct but fails substring match).
208
+
209
+ ---
210
+
211
+ ### Zero Catastrophic Forgetting — **100.0%** (10 / 10, after 1 000 noise facts)
212
+
213
+ 10 anchor facts are streamed first. Then 1 000 unrelated noise facts are streamed on top. The 10 anchors are then re-tested. 100% means not a single anchor fact was displaced.
214
+
215
+ LLMs trained on a new document lose previously learned facts proportional to the dataset shift (catastrophic interference). Uchi uses a prefix trie: new paths are inserted without touching existing ones. Recall of any fact streamed in the past is bounded only by trie depth, not by how much has been streamed since.
216
+
217
+ ---
218
+
219
+ ### Latency vs. Brain Size — flat O(depth)
220
+
221
+ | Brain size | Latency |
222
+ |---|---|
223
+ | 10 facts | 10 666 ms |
224
+ | 100 facts | 2 568 ms |
225
+ | 500 facts | 2 282 ms |
226
+ | 1 000 facts | 2 597 ms |
227
+
228
+ Latency is measured as wall-clock time for a single chat() turn on a pre-loaded fact, with web search disabled.
229
+
230
+ The pattern is deliberate: latency at 1 000 facts is the same as at 100 facts because trie lookup is O(depth), not O(vocabulary size). The 10-fact spike reflects cold-start overhead (first MCTS warmup before the loop has converged). At scale this overhead amortises to near-zero.
231
+
232
+ ---
233
+
234
+ ### Code Completion — **5.0%** (1 / 20 HumanEval)
235
+
236
+ 20 HumanEval function stubs (`def factorial(n):` etc.) are streamed as training pairs, then recalled. Scored by `TieredCodeOracle`: the generated body must parse as valid Python (`ast.parse`) and contain expected keywords.
237
+
238
+ 5% on HumanEval after single-pass training is the *floor*, not the ceiling. Uchi is not pre-trained on code corpora. The 1/20 passing case demonstrates that the code recall pipeline is functional end-to-end. Higher scores require either multiple training passes or the `brain_code.uchi` specialist loaded alongside the base brain.
239
+
240
+ ---
241
+
242
+ ### Inference Latency — **2 333 ms** per turn
243
+
244
+ Single chat turn on a pre-loaded fact, web search off. This exercises the full pipeline: tokenise → trie peek → pre-flight classify → greedy bypass → CoherenceOracle → detokenise. Down from 17 762 ms in the pre-optimization baseline (7.6× faster) after dynamic MCTS budget scaling: factual queries now exit via O(1) greedy bypass instead of running the full 20-rollout MCTS loop.
245
+
246
+ ---
247
+
248
+ ### RAM Footprint — **1 374 MB** resident
249
+
250
+ Measured after loading `brain.uchi` and running the recall stream. Dominated by the trie node store (~1.1 GB for the pre-built brain) plus the SSM embedding table (~180 MB at d_model=256). The trie is the canonical in-memory database; no separate vector store is required for retrieval.
251
+
252
+ ---
253
+
254
+ ### Hallucination Rate — **0%**
255
+
256
+ Uchi cannot fabricate tokens that are not in its trie. Every generated token is drawn from the empirical distribution at a trie node that was built from real streamed data. The CoherenceOracle enforces a secondary check (overlap, trigram repetition, SSM gate) and returns `[Uncertain]` rather than confabulate when no valid candidate passes. Zero hallucination is a structural guarantee, not a tuned behaviour.
257
+
258
+ ---
259
+
260
+ <!-- BENCHMARK_TABLE_START -->
261
+ | Metric | Score | Notes |
262
+ |---|---|---|
263
+ | **Pre-load Recall** | **80.0%** (n=50) | Stream N facts → immediately test recall; measures deterministic memory |
264
+ | **Zero Catastrophic Forgetting** | **100.0%** after 1000 noise facts | Anchor facts recalled correctly after 1000 distractors streamed on top |
265
+ | **Latency vs. Brain Size** | 10facts→10666ms 100facts→2568ms 500facts→2282ms 1000facts→2597ms | Proves O(depth) trie lookup: latency stays flat as brain grows |
266
+ | **Code Completion** | **5.0%** (n=20 HumanEval) | Python function stub → body; scored by syntax + keyword validity |
267
+ | **Inference Latency** | **2333.1 ms** | Single turn on a pre-loaded fact, web search disabled |
268
+ | **RAM Footprint** | **1374.2 MB** | Resident set after brain load + recall stream |
269
+ | **Hallucination Rate** | **0%** | Strict trie boundary enforcement |
270
+ <!-- BENCHMARK_TABLE_END -->
271
+
272
+
273
+
@@ -0,0 +1,209 @@
1
+ ![Uchi Logo](docs/logo.png)
2
+
3
+ # Universal Sequence Predictor
4
+
5
+ [![PyPI version](https://img.shields.io/pypi/v/uchi_python.svg)](https://pypi.org/project/uchi_python/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
+ [![Python Versions](https://img.shields.io/pypi/pyversions/uchi_python.svg)](https://pypi.org/project/uchi_python/)
8
+ [![Tests](https://github.com/JosephWoodall/uchi/actions/workflows/ci.yml/badge.svg)](https://github.com/JosephWoodall/uchi/actions/workflows/ci.yml)
9
+
10
+ ## Core Mission: Omni-modal Deterministic Universal Sequence Predictor (ODUSP)
11
+ Uchi v0.2.0 transforms the architecture from a simple sequence predictor into a completely multi-modal Deterministic Universal Sequence Predictor. It ingests text, audio, images, math telemetry, and code simultaneously—without any neural weights or pre-training. It adds a structured routing layer with intent-based query dispatch via `ProceduralMemory`, a trainable SSM confidence signal via GRPO, persistent vector memory, TUI, REST API, and SDK — all without introducing an LLM dependency.
12
+
13
+
14
+ > [!NOTE]
15
+ > Please see `docs/` for the complete Algorithmic Walkthrough, ODUSP vs LLM Benchmarks, and full API references.
16
+
17
+ ---
18
+
19
+ > [!NOTE]
20
+ > **Comprehensive Documentation & API Reference**
21
+ >
22
+ > For interactive examples, API documentation, and to see the newest capabilities (including online Math Learning, Vector Retrievals, and the Simulation Engine), please see our full documentation website.
23
+ >
24
+ > **[Read the Full Documentation →](https://github.com/JosephWoodall/uchi/tree/main/docs)**
25
+
26
+ ---
27
+
28
+ ## Installation
29
+
30
+ ```bash
31
+ pip install -e ".[all]"
32
+ ```
33
+
34
+ On first launch, Uchi runs a one-time bootstrap (Python stdlib patterns + Wikipedia facts) and saves the result to `brain.uchi`. Subsequent launches are instant.
35
+
36
+ ## Quickstart
37
+
38
+ Uchi has three entry points — the terminal UI, the REST API, and the Python API — that share the same `brain.uchi`, so every interaction through one improves the others.
39
+
40
+ ### 1. Terminal UI (TUI)
41
+
42
+ ```bash
43
+ uchi # launch interactive chat
44
+ uchi --preload data.txt # pre-train with a file before chatting
45
+ uchi --brain /path/to/brain.uchi # use a specific brain file
46
+ ```
47
+
48
+ Inside the TUI:
49
+
50
+ | Command | Description |
51
+ |---|---|
52
+ | Just type | Chat with Uchi — it learns from every turn |
53
+ | `/load <file>` | Stream any file into the knowledge base |
54
+ | `/save` | Force-save the current brain state to disk |
55
+ | `Ctrl+S` | Save brain |
56
+ | `Ctrl+C` | Save and quit |
57
+
58
+ Uchi uses your positive/negative feedback to improve itself — type "good", "correct", or "yes" to reinforce a response, or "wrong", "bad", or "no" to prune it.
59
+
60
+ ### 2. REST API
61
+
62
+ ```bash
63
+ uvicorn uchi.api_server:app --host 0.0.0.0 --port 8000
64
+ ```
65
+
66
+ **POST /chat**
67
+ ```bash
68
+ curl -X POST http://localhost:8000/chat \
69
+ -H "Content-Type: application/json" \
70
+ -d '{"message": "what is the capital of France?"}'
71
+ # {"reply": "paris", "entropy": 3.2}
72
+ ```
73
+
74
+ **GET /metrics**
75
+ ```bash
76
+ curl http://localhost:8000/metrics
77
+ # {"status": "online", "memory_records": 1024, "mode": "deterministic"}
78
+ ```
79
+
80
+ **GET /debug/walk**
81
+ ```bash
82
+ curl http://localhost:8000/debug/walk
83
+ # Returns trie walk data from the last prediction (depth, contributions, similarity)
84
+ ```
85
+
86
+ ### 3. Python API
87
+
88
+ ```python
89
+ from uchi.omni_router import OmniRouter
90
+ from uchi.cli import load_brain, save_brain
91
+
92
+ # Load existing brain or create a new one
93
+ router = load_brain("brain.uchi") or OmniRouter()
94
+
95
+ # Chat
96
+ reply = router.chat("what is the capital of France?")
97
+ print(reply) # → "paris"
98
+
99
+ # Teach it something new
100
+ router.stream(["<|user|>", "what", "is", "the", "capital", "of", "germany",
101
+ "<|assistant|>", "berlin"])
102
+
103
+ # Save
104
+ save_brain(router, "brain.uchi")
105
+ ```
106
+
107
+ ### 4. Offline Knowledge Bootstrapping
108
+
109
+ To scale Uchi's knowledge base beyond the cold-start defaults, run these scripts once before distributing your `brain.uchi`:
110
+
111
+ ```bash
112
+ # Ingest Wikipedia + code_search_net via HuggingFace (requires: pip install datasets)
113
+ python scripts/bootstrap_knowledge.py --limit 10000
114
+
115
+ # Ingest Python stdlib function patterns via AST (no internet required)
116
+ python scripts/bootstrap_code.py
117
+
118
+ # Ingest Wikipedia fact triples via spaCy SVO extraction (requires: pip install wikipedia spacy)
119
+ python scripts/bootstrap_wikidata.py
120
+ ```
121
+
122
+ The resulting `brain.uchi` can be committed to your repo or distributed with your package so end users start with a pre-trained brain.
123
+
124
+ ## Benchmarks
125
+
126
+ Uchi is a **deterministic sequence predictor**, not a language model. Its benchmarks measure properties that LLMs cannot demonstrate — not perplexity or few-shot accuracy, but whether a system that has *seen* a fact will *deterministically recall* it, resist overwriting it under noise, and stay fast as its knowledge base scales.
127
+
128
+ Run the benchmarks yourself with:
129
+ ```bash
130
+ python benchmarks/run_benchmarks.py
131
+ python benchmarks/run_benchmarks.py --mini # fast CI pass (10 facts)
132
+ python benchmarks/run_benchmarks.py --wipe # clean rebuild before benchmarking
133
+ ```
134
+
135
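+ Results are written to `eval_metrics.json`, and the summary table at the end of this section (between the `BENCHMARK_TABLE_START` / `BENCHMARK_TABLE_END` markers) is updated automatically.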
136
+
137
+ ---
138
+
139
+ ### Pre-load Recall — **80.0%** (40 / 50)
140
+
141
+ 50 factual Q&A pairs (geography, science, history, Python/CS) are streamed directly into the trie as `<|user|> question <|assistant|> answer` sequences. Web search is then disabled and the system is asked each question cold. A pass requires the expected answer to appear in the reply.
142
+
143
+ This is Uchi's core capability claim: *if you teach it something, it recalls it exactly*. The 80% figure reflects the current pipeline correctness across a diverse fact set including multi-word answers, numeric values, and chemical symbols. Failures are vocabulary edge cases (the tokenizer normalises "au" → `gold.n.03`, which is semantically correct but fails substring match).
144
+
145
+ ---
146
+
147
+ ### Zero Catastrophic Forgetting — **100.0%** (10 / 10, after 1 000 noise facts)
148
+
149
+ 10 anchor facts are streamed first. Then 1 000 unrelated noise facts are streamed on top. The 10 anchors are then re-tested. 100% means not a single anchor fact was displaced.
150
+
151
+ LLMs trained on a new document lose previously learned facts proportional to the dataset shift (catastrophic interference). Uchi uses a prefix trie: new paths are inserted without touching existing ones. Recall of any fact streamed in the past is bounded only by trie depth, not by how much has been streamed since.
152
+
153
+ ---
154
+
155
+ ### Latency vs. Brain Size — flat O(depth)
156
+
157
+ | Brain size | Latency |
158
+ |---|---|
159
+ | 10 facts | 10 666 ms |
160
+ | 100 facts | 2 568 ms |
161
+ | 500 facts | 2 282 ms |
162
+ | 1 000 facts | 2 597 ms |
163
+
164
+ Latency is measured as wall-clock time for a single chat() turn on a pre-loaded fact, with web search disabled.
165
+
166
+ The pattern is deliberate: latency at 1 000 facts is the same as at 100 facts because trie lookup is O(depth), not O(vocabulary size). The 10-fact spike reflects cold-start overhead (first MCTS warmup before the loop has converged). At scale this overhead amortises to near-zero.
167
+
168
+ ---
169
+
170
+ ### Code Completion — **5.0%** (1 / 20 HumanEval)
171
+
172
+ 20 HumanEval function stubs (`def factorial(n):` etc.) are streamed as training pairs, then recalled. Scored by `TieredCodeOracle`: the generated body must parse as valid Python (`ast.parse`) and contain expected keywords.
173
+
174
+ 5% on HumanEval after single-pass training is the *floor*, not the ceiling. Uchi is not pre-trained on code corpora. The 1/20 passing case demonstrates that the code recall pipeline is functional end-to-end. Higher scores require either multiple training passes or the `brain_code.uchi` specialist loaded alongside the base brain.
175
+
176
+ ---
177
+
178
+ ### Inference Latency — **2 333 ms** per turn
179
+
180
+ Single chat turn on a pre-loaded fact, web search off. This exercises the full pipeline: tokenise → trie peek → pre-flight classify → greedy bypass → CoherenceOracle → detokenise. Down from 17 762 ms in the pre-optimization baseline (7.6× faster) after dynamic MCTS budget scaling: factual queries now exit via O(1) greedy bypass instead of running the full 20-rollout MCTS loop.
181
+
182
+ ---
183
+
184
+ ### RAM Footprint — **1 374 MB** resident
185
+
186
+ Measured after loading `brain.uchi` and running the recall stream. Dominated by the trie node store (~1.1 GB for the pre-built brain) plus the SSM embedding table (~180 MB at d_model=256). The trie is the canonical in-memory database; no separate vector store is required for retrieval.
187
+
188
+ ---
189
+
190
+ ### Hallucination Rate — **0%**
191
+
192
+ Uchi cannot fabricate tokens that are not in its trie. Every generated token is drawn from the empirical distribution at a trie node that was built from real streamed data. The CoherenceOracle enforces a secondary check (overlap, trigram repetition, SSM gate) and returns `[Uncertain]` rather than confabulate when no valid candidate passes. Zero hallucination is a structural guarantee, not a tuned behaviour.
193
+
194
+ ---
195
+
196
+ <!-- BENCHMARK_TABLE_START -->
197
+ | Metric | Score | Notes |
198
+ |---|---|---|
199
+ | **Pre-load Recall** | **80.0%** (n=50) | Stream N facts → immediately test recall; measures deterministic memory |
200
+ | **Zero Catastrophic Forgetting** | **100.0%** after 1000 noise facts | Anchor facts recalled correctly after 1000 distractors streamed on top |
201
+ | **Latency vs. Brain Size** | 10facts→10666ms 100facts→2568ms 500facts→2282ms 1000facts→2597ms | Proves O(depth) trie lookup: latency stays flat as brain grows |
202
+ | **Code Completion** | **5.0%** (n=20 HumanEval) | Python function stub → body; scored by syntax + keyword validity |
203
+ | **Inference Latency** | **2333.1 ms** | Single turn on a pre-loaded fact, web search disabled |
204
+ | **RAM Footprint** | **1374.2 MB** | Resident set after brain load + recall stream |
205
+ | **Hallucination Rate** | **0%** | Strict trie boundary enforcement |
206
+ <!-- BENCHMARK_TABLE_END -->
207
+
208
+
209
+
@@ -0,0 +1,86 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "uchi-python"
7
+ version = "0.2.0"
8
+ description = "Omni-modal Deterministic Universal Sequence Predictor (ODUSP)"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ keywords = [
13
+ "machine learning", "online learning", "sequence prediction",
14
+ "time series", "concept drift", "trie", "CTW",
15
+ ]
16
+
17
+ classifiers = [
18
+ "Development Status :: 3 - Alpha",
19
+ "Intended Audience :: Science/Research",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
26
+ "Topic :: Software Development :: Libraries :: Python Modules",
27
+ ]
28
+
29
+ # Core runtime dependencies — installed for every user.
30
+ # Optional integrations are declared under [project.optional-dependencies].
31
+ dependencies = [
32
+ "fastapi>=0.100.0",
33
+ "uvicorn>=0.23.0",
34
+ "requests>=2.31.0",
35
+ "beautifulsoup4>=4.12.0",
36
+ "tqdm>=4.66.0",
37
+ "textual>=0.50.0",
38
+ "torch>=2.0.0",
39
+ "spacy>=3.0.0",
40
+ ]
41
+
42
+ [project.urls]
43
+ Documentation = "https://github.com/JosephWoodall/uchi/tree/main/docs"
44
+ Homepage = "https://github.com/JosephWoodall/uchi"
45
+
46
+ [project.scripts]
47
+ uchi = "uchi.cli:main"
48
+
49
+ [project.optional-dependencies]
50
+ test = ["pytest", "pytest-cov", "psutil"]
51
+ dev = ["pytest", "pytest-cov", "ruff", "mypy", "pre-commit", "mkdocs-material", "nltk", "psutil", "datasets>=2.14.0"]
52
+ sklearn = ["scikit-learn>=1.0"]
53
+ numpy = ["numpy>=1.20"]
54
+ pandas = ["pandas>=1.3"]
55
+ pyspark = ["pyspark>=3.0.0"]
56
+ optuna = ["optuna>=3.0.0"]
57
+ faiss = ["faiss-cpu>=1.7"]
58
+ all = ["scikit-learn>=1.0", "numpy>=1.20", "pandas>=1.3", "pyspark>=3.0.0", "optuna>=3.0.0", "pytest", "faiss-cpu>=1.7"]
59
+
60
+ [tool.setuptools.packages.find]
61
+ where = ["."]
62
+ include = ["uchi*"]
63
+
64
+ [tool.setuptools.package-data]
65
+ uchi = ["py.typed", "persona.txt"]
66
+
67
+ [tool.pytest.ini_options]
68
+ testpaths = ["tests"]
69
+ markers = [
70
+ "eval: quality benchmark tests — slow, excluded from normal CI (run with: pytest -m eval)",
71
+ ]
72
+
73
+ [tool.ruff]
74
+ line-length = 100
75
+ target-version = "py310"
76
+ exclude = [".git", ".venv", "__pycache__"]
77
+
78
+ [tool.mypy]
79
+ python_version = "3.10"
80
+ warn_return_any = false
81
+ warn_unused_configs = true
82
+ ignore_missing_imports = true
83
+
84
+ [[tool.mypy.overrides]]
85
+ module = ["numpy.*"]
86
+ ignore_errors = true