uchi-python 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uchi/__init__.py +17 -1
- uchi/api.py +86 -0
- uchi/api_server.py +195 -0
- uchi/builder.py +214 -0
- uchi/cli.py +203 -0
- uchi/code_engine.py +150 -0
- uchi/code_tokenizer.py +244 -0
- uchi/convergent_engine.py +572 -0
- uchi/cpu_memory.py +74 -0
- uchi/data_loader.py +170 -0
- uchi/experience_replay.py +197 -0
- uchi/forest.py +15 -0
- uchi/generative.py +220 -17
- uchi/grammar_mask.py +184 -0
- uchi/grpo.py +94 -0
- uchi/hoeffding.py +0 -1
- uchi/intent_encoder.py +157 -0
- uchi/long_term_store.py +0 -1
- uchi/memory.py +104 -0
- uchi/neuro_symbolic.py +657 -0
- uchi/omni_evaluator.py +1021 -0
- uchi/omni_router.py +711 -0
- uchi/omni_tokenizer.py +258 -0
- uchi/ontology.py +198 -0
- uchi/ontology_manager.py +48 -0
- uchi/persona.txt +58 -0
- uchi/plugins/__init__.py +1 -0
- uchi/plugins/web.py +37 -0
- uchi/predictor.py +87 -3
- uchi/procedural_memory.py +96 -0
- uchi/process.py +58 -0
- uchi/semantic_index.py +192 -0
- uchi/simulation_engine.py +45 -0
- uchi/skill_registry.py +536 -0
- uchi/specialist_pool.py +78 -0
- uchi/tabular.py +13 -8
- uchi/telemetry.py +139 -0
- uchi/timeseries.py +7 -4
- uchi/tree_search_engine.py +558 -0
- uchi/tui/__init__.py +1 -0
- uchi/tui/app.py +826 -0
- uchi/vector_oracle.py +239 -0
- uchi/web_search.py +79 -0
- uchi_python-0.2.0.dist-info/METADATA +273 -0
- uchi_python-0.2.0.dist-info/RECORD +54 -0
- uchi_python-0.2.0.dist-info/entry_points.txt +2 -0
- uchi/semantic_tokenizer.py +0 -48
- uchi_python-0.1.0.dist-info/METADATA +0 -468
- uchi_python-0.1.0.dist-info/RECORD +0 -19
- {uchi_python-0.1.0.dist-info → uchi_python-0.2.0.dist-info}/WHEEL +0 -0
- {uchi_python-0.1.0.dist-info → uchi_python-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {uchi_python-0.1.0.dist-info → uchi_python-0.2.0.dist-info}/top_level.txt +0 -0
uchi/__init__.py
CHANGED
|
@@ -16,6 +16,8 @@ TabularPredictor / TabularRegressor / TimeSeriesClassifier all support
|
|
|
16
16
|
partial_fit() for online / incremental learning.
|
|
17
17
|
"""
|
|
18
18
|
|
|
19
|
+
__version__ = "0.2.0"
|
|
20
|
+
|
|
19
21
|
from .predictor import UniversalPredictor
|
|
20
22
|
from .forest import PredictorForest
|
|
21
23
|
from .discretize import FeatureDiscretizer, LabelEncoder
|
|
@@ -28,8 +30,14 @@ from .long_term_store import LongTermStore
|
|
|
28
30
|
from .dual_predictor import DualPredictor
|
|
29
31
|
from .online_tokenizer import OnlineTokenizer
|
|
30
32
|
from .node_compressor import NodeCompressor
|
|
33
|
+
from .process import ProcessPredictor, OntologicalState, OntologicalAction
|
|
34
|
+
from .simulation_engine import LifelongSimulationEngine
|
|
35
|
+
|
|
36
|
+
from .memory import AssociativeMemory
|
|
37
|
+
from .omni_router import OmniRouter
|
|
38
|
+
from .omni_tokenizer import OmniTokenizer
|
|
31
39
|
|
|
32
|
-
__version__ = "0.
|
|
40
|
+
__version__ = "0.2.0"
|
|
33
41
|
|
|
34
42
|
__all__ = [
|
|
35
43
|
# Core engine
|
|
@@ -54,4 +62,12 @@ __all__ = [
|
|
|
54
62
|
"DualPredictor",
|
|
55
63
|
"OnlineTokenizer",
|
|
56
64
|
"NodeCompressor",
|
|
65
|
+
"ProcessPredictor",
|
|
66
|
+
"OntologicalState",
|
|
67
|
+
"OntologicalAction",
|
|
68
|
+
"LifelongSimulationEngine",
|
|
69
|
+
|
|
70
|
+
"AssociativeMemory",
|
|
71
|
+
"OmniRouter",
|
|
72
|
+
"OmniTokenizer"
|
|
57
73
|
]
|
uchi/api.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from fastapi import FastAPI, HTTPException
|
|
2
|
+
from fastapi.staticfiles import StaticFiles
|
|
3
|
+
from fastapi.responses import FileResponse
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
from .omni_router import OmniRouter
|
|
6
|
+
import os
|
|
7
|
+
import pickle
|
|
8
|
+
|
|
9
|
+
ASCII_LOGO = r"""
|
|
10
|
+
|\_/\_/\_/|
|
|
11
|
+
| |
|
|
12
|
+
| O O |
|
|
13
|
+
| ^ |
|
|
14
|
+
\ ___ /
|
|
15
|
+
\_____/
|
|
16
|
+
ODUSP Daemon v0.2.0
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
app = FastAPI(title="ODUSP Brain API")
|
|
20
|
+
|
|
21
|
+
# Mount UI Harness static files
|
|
22
|
+
STATIC_DIR = os.path.join(os.path.dirname(__file__), "static")
|
|
23
|
+
if os.path.exists(STATIC_DIR):
|
|
24
|
+
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
|
|
25
|
+
|
|
26
|
+
@app.get("/")
async def serve_ui():
    """Return the UI harness index page, or a JSON notice when it is absent."""
    index_path = os.path.join(STATIC_DIR, "index.html")
    if not os.path.exists(index_path):
        return {"message": "ODUSP API is running, but UI is missing."}
    return FileResponse(index_path)
|
|
31
|
+
@app.get("/style.css")
async def serve_css():
    """Serve the UI stylesheet from the static directory.

    Raises:
        HTTPException: 404 when ``style.css`` is not present. Without this
            check the missing file only surfaces while the response is being
            sent, which the client sees as a 500.
    """
    css_path = os.path.join(STATIC_DIR, "style.css")
    if not os.path.isfile(css_path):
        raise HTTPException(status_code=404, detail="style.css not found")
    return FileResponse(css_path)
|
|
34
|
+
@app.get("/app.js")
async def serve_js():
    """Serve the UI script from the static directory.

    Raises:
        HTTPException: 404 when ``app.js`` is not present. Without this
            check the missing file only surfaces while the response is being
            sent, which the client sees as a 500.
    """
    js_path = os.path.join(STATIC_DIR, "app.js")
    if not os.path.isfile(js_path):
        raise HTTPException(status_code=404, detail="app.js not found")
    return FileResponse(js_path)
|
|
37
|
+
|
|
38
|
+
router = None
|
|
39
|
+
|
|
40
|
+
# Request body for POST /stream.
class StreamRequest(BaseModel):
    # Tokens handed to the router's stream() call; the endpoint rejects an empty list.
    tokens: list[str]
|
|
42
|
+
|
|
43
|
+
# Request body for POST /query.
class QueryRequest(BaseModel):
    # Tokens handed to the router's query() call.
    tokens: list[str]
|
|
45
|
+
|
|
46
|
+
# Request body for POST /predict; every field is optional.
class PredictRequest(BaseModel):
    # Context tokens forwarded as the first argument of predict_future().
    context: list[str] = []
    # Forwarded as the ``steps`` keyword.
    steps: int = 5
    # Forwarded as the ``temperature`` keyword.
    temperature: float = 0.0
    # Forwarded as the ``creativity`` keyword.
    creativity: float = 0.0
|
|
51
|
+
|
|
52
|
+
def load_brain(path: str = "brain.uchi") -> "OmniRouter":
    """Load a pickled router from *path*, or create a fresh one.

    Args:
        path: Location of the pickled brain file.

    Returns:
        The unpickled object when *path* exists and loads cleanly; otherwise
        a new ``OmniRouter(use_bpe=True, memory_window=5)``.
    """
    if os.path.exists(path):
        try:
            with open(path, "rb") as f:
                # SECURITY: unpickling executes arbitrary code embedded in the
                # file. Only load brain files from a trusted location.
                return pickle.load(f)
        except Exception as e:
            # Best effort: report the unreadable/corrupted file and fall
            # through to a fresh router instead of aborting start-up.
            print(f"[-] Failed to load {path}: {e}")
    return OmniRouter(use_bpe=True, memory_window=5)
|
|
62
|
+
|
|
63
|
+
@app.on_event("startup")
async def startup_event():
    """Print the boot banner and install the module-level router at start-up."""
    global router
    for line in (ASCII_LOGO, "[*] Booting ODUSP daemon..."):
        print(line)
    router = load_brain()
    print("[+] Brain loaded and active.")
|
|
70
|
+
|
|
71
|
+
@app.post("/stream")
async def stream_data(req: StreamRequest):
    """Feed the posted tokens to the router; an empty list is rejected with 400."""
    tokens = req.tokens
    if not tokens:
        raise HTTPException(status_code=400, detail="Empty token list")
    router.stream(tokens)
    processed = len(tokens)
    return {"status": "success", "processed": processed}
|
|
77
|
+
|
|
78
|
+
@app.post("/query")
async def query_memory(req: QueryRequest):
    """Return the router's answer for the posted tokens under the ``answer`` key."""
    return {"answer": router.query(req.tokens)}
|
|
82
|
+
|
|
83
|
+
@app.post("/predict")
async def predict_future(req: PredictRequest):
    """Ask the router for a continuation of the posted context.

    The request's ``steps``, ``temperature`` and ``creativity`` values are
    forwarded unchanged as keyword arguments.
    """
    options = {
        "steps": req.steps,
        "temperature": req.temperature,
        "creativity": req.creativity,
    }
    prediction = router.predict_future(req.context, **options)
    return {"prediction": prediction}
|
uchi/api_server.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
from contextlib import asynccontextmanager
|
|
2
|
+
from fastapi import FastAPI, HTTPException
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
from uchi.omni_router import OmniRouter
|
|
5
|
+
from uchi.cli import load_brain, save_brain
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
_router = None
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: prepare the shared router, then persist it on exit.

    Start-up loads the saved brain, or creates and immediately saves a new
    ``OmniRouter`` when ``load_brain()`` returns ``None``, then starts the
    router's background jobs. Shutdown stops those jobs and saves the brain.
    """
    global _router
    brain = load_brain()
    if brain is None:
        brain = OmniRouter(use_bpe=False, memory_window=5)
        save_brain(brain)
    _router = brain
    # Start background RL daemon and any other background jobs
    _router.start_background_jobs()

    yield

    # Persist on shutdown
    if _router is None:
        return
    _router.stop_background_jobs()
    save_brain(_router)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
app = FastAPI(
|
|
28
|
+
title="Uchi ODUSP API",
|
|
29
|
+
description="Deterministic Universal Sequence Predictor — programmatic interface.",
|
|
30
|
+
lifespan=lifespan,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Request body shared by POST /chat and POST /skill/{name}.
class ChatRequest(BaseModel):
    # Chat text, a "/name args" skill command, or the argument string for a named skill.
    message: str
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Response body of POST /chat.
class ChatResponse(BaseModel):
    # Text produced by the router or by the dispatched skill.
    reply: str
    # The /chat handler does not set this, so it stays at the default.
    entropy: float = 0.0
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Response body of POST /skill/{name}.
class SkillResponse(BaseModel):
    # Text returned by the skill dispatch.
    reply: str
    # Name of the skill that was invoked.
    skill: str
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Request body of POST /bootstrap; at least one field must be non-empty.
class BootstrapRequest(BaseModel):
    # Raw text to ingest directly.
    text: str | None = None
    # Page to fetch and ingest; when set, the fetched page replaces ``text``.
    url: str | None = None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Response body of POST /bootstrap.
class BootstrapResponse(BaseModel):
    # Number of tokens streamed into the router.
    tokens_ingested: int
    # "text" for inline text, otherwise the URL that was fetched.
    source: str
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@app.post("/chat", response_model=ChatResponse)
async def chat_endpoint(request: ChatRequest):
    """
    Send a message to the OmniRouter.

    Messages starting with `/name args` are dispatched to the skill registry.
    All other messages go through the standard chat pipeline.

    Raises:
        HTTPException: 400 when the message is blank or is a bare "/" with no
            skill name; 500 when the router or the skill raises.
    """
    msg = request.message.strip()
    if not msg:
        raise HTTPException(status_code=400, detail="Message cannot be empty")

    # Parse a skill command before entering the try-block, so that malformed
    # input is reported as a client error (400) and not as a server error.
    skill_call = None
    if msg.startswith("/"):
        parts = msg[1:].split(None, 1)
        if not parts:
            # A bare "/" used to raise IndexError on parts[0] and surface as
            # a 500 "list index out of range".
            raise HTTPException(status_code=400, detail="Skill name cannot be empty")
        skill_call = (parts[0], parts[1] if len(parts) > 1 else "")

    try:
        if skill_call is not None:
            name, args = skill_call
            reply = _router.skills.dispatch(name, args)
        else:
            reply = _router.chat(msg)

        save_brain(_router)
        return ChatResponse(reply=reply)

    except Exception as e:
        import traceback
        traceback.print_exc()
        logging.error(f"API Error: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@app.post("/skill/{name}", response_model=SkillResponse)
async def skill_endpoint(name: str, request: ChatRequest):
    """Run the skill called *name* with the request message as its argument.

    Responds with 404 when the registry has no such skill; the brain is
    saved after a successful dispatch.
    """
    registry = _router.skills
    if not registry.has(name):
        raise HTTPException(status_code=404, detail=f"Skill '{name}' not found")

    outcome = registry.dispatch(name, request.message)
    save_brain(_router)
    return SkillResponse(skill=name, reply=outcome)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@app.get("/skills")
async def list_skills():
    """Describe every skill known to the registry (built-in + user-installed)."""
    described = []
    for skill in _router.skills.list_skills():
        described.append(
            {
                "name": skill.name,
                "description": skill.description,
                "args": skill.args_hint,
                "mode": skill.mode,
                "source": skill.source_path,
            }
        )
    return {"skills": described}
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@app.get("/metrics")
async def metrics_endpoint():
    """Report a small health/metrics summary of the running router.

    ``memory_records`` is 0 when the router's memory has no ``cpu_mem``
    attribute.
    """
    if hasattr(_router.memory, "cpu_mem"):
        record_count = len(_router.memory.cpu_mem.records)
    else:
        record_count = 0

    summary = {"status": "online", "memory_records": record_count}
    summary["ssm_baseline_mean"] = round(_router.baseline.mean, 4)
    summary["skills_loaded"] = len(_router.skills.list_skills())
    summary["mode"] = "deterministic"
    return summary
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
@app.get("/telemetry")
async def telemetry_endpoint():
    """
    Return whatever ``uchi.telemetry.dump_all()`` produces, for the TUI and
    Cognitive Debugger.

    Any failure (import or dump) is reported as an ``{"error": ...}`` payload
    instead of an HTTP error.
    """
    try:
        import uchi.telemetry as _tel
        snapshot = _tel.dump_all()
    except Exception as e:
        return {"error": f"Telemetry not available: {e}"}
    return snapshot
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _fetch_page_text(url: str) -> str:
    """Download *url* and return its visible text (blocking; run off the event loop)."""
    import requests as _req
    from bs4 import BeautifulSoup

    resp = _req.get(url, timeout=10, headers={"User-Agent": "Uchi/1.0"})
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    # Drop markup that carries no readable page content.
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()
    return soup.get_text(separator=" ", strip=True)


@app.post("/bootstrap", response_model=BootstrapResponse)
async def bootstrap_endpoint(request: BootstrapRequest):
    """
    Ingest raw text or a URL into Uchi's trie and AssociativeMemory.

    Accepts JSON body with one of:
    - `{"text": "raw text to learn"}` — streams the text directly
    - `{"url": "https://..."}` — fetches the page, strips HTML, then streams

    When both are given, the fetched page replaces the inline text.

    Once Uchi has tool-routing, it can call this endpoint autonomously to
    permanently memorise content it discovers via web search.

    Raises:
        HTTPException: 400 when neither field is provided or no usable text
            remains; 502 when the URL cannot be fetched or parsed.
    """
    if not request.text and not request.url:
        raise HTTPException(status_code=400, detail="Provide either 'text' or 'url'.")

    raw_text = request.text or ""
    source = "text"

    if request.url:
        source = request.url
        # SECURITY: this fetches a caller-supplied URL from the server side
        # (SSRF risk). Restrict who can reach this endpoint, or validate the
        # target host, before exposing it beyond a trusted network.
        try:
            import asyncio

            # The download and HTML parse are blocking; run them in a worker
            # thread so other requests are not stalled for up to the timeout.
            raw_text = await asyncio.to_thread(_fetch_page_text, request.url)
        except Exception as exc:
            raise HTTPException(status_code=502, detail=f"Failed to fetch URL: {exc}") from exc

    if not raw_text.strip():
        raise HTTPException(status_code=400, detail="No usable text found.")

    tokens = _router.tokenizer.tokenize(raw_text.split(), is_inference=False)
    _router.stream(tokens)
    save_brain(_router)
    return BootstrapResponse(tokens_ingested=len(tokens), source=source)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
@app.get("/debug/walk")
async def debug_walk_endpoint():
    """Return the router's ``last_walk_data``, or an error payload when it has none."""
    absent = object()
    walk = getattr(_router, "last_walk_data", absent)
    if walk is absent:
        return {"error": "No walk data available"}
    return walk
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
if __name__ == "__main__":
|
|
194
|
+
import uvicorn
|
|
195
|
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|
uchi/builder.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import logging
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
from uchi.cli import save_brain
|
|
6
|
+
from uchi.omni_router import OmniRouter
|
|
7
|
+
from uchi.neuro_symbolic import get_ssm
|
|
8
|
+
import torch
|
|
9
|
+
|
|
10
|
+
# A universal limit for dataset slicing so we don't spend 24 hours downloading
|
|
11
|
+
# 1 million rows, while still pulling a massive functional knowledge base.
|
|
12
|
+
KNOWLEDGE_LIMIT = 50
|
|
13
|
+
|
|
14
|
+
def _safe_load_dataset(*args, **kwargs):
    """Call ``datasets.load_dataset`` and return ``None`` instead of raising.

    Exactly two positional arguments with no keywords are read as
    ``(path, split)``. Every other call shape is forwarded unchanged, so a
    config name must be accompanied by an explicit ``split=`` keyword.
    Failures are logged as warnings.
    """
    from datasets import load_dataset

    path_and_split_only = len(args) == 2 and not kwargs
    try:
        if path_and_split_only:
            path, split = args
            return load_dataset(path, split=split)
        return load_dataset(*args, **kwargs)
    except Exception as e:
        label = args[0] if args else kwargs.get('path', 'dataset')
        logging.warning(f"[-] Failed to load {label}: {e}")
        return None
|
|
23
|
+
|
|
24
|
+
def _stream_text(router, text):
    """Split *text* on whitespace, tokenize it in training mode, and stream it into *router*."""
    router.stream(router.tokenizer.tokenize(text.split(), is_inference=False))


def _render_conversation(turns):
    """Flatten conversation turns into one role-tagged string ("" when there are no turns)."""
    text = ""
    for turn in turns:
        role = "<|user|>" if turn.get("from") == "human" else "<|assistant|>"
        text += f"{role} {turn.get('value', '')} "
    return text


def build_full_brain(brain_path="brain.uchi"):
    """
    The Universal Master Builder.
    Executes the 5-stage reconstruction pipeline to build the brain from scratch.

    Args:
        brain_path: File name (joined onto the project root) for the saved
            master router.

    Returns:
        The freshly trained ``OmniRouter``.

    Each dataset is optional: one that fails to load is logged by
    ``_safe_load_dataset`` and skipped. At most ``KNOWLEDGE_LIMIT`` rows are
    requested per dataset, sliced at load time so that whole splits are not
    materialised in memory.
    """
    print("\n[bold #bb9af7]=== UCHI UNIVERSAL BRAIN BUILDER ===[/bold #bb9af7]")
    print("[*] No existing brain detected. Initiating 5-Stage Reconstruction Pipeline.\n")

    # ──────────────────────────────────────────────────────────────────────────
    # Phase 1: The Wipe
    # ──────────────────────────────────────────────────────────────────────────
    print("[bold #7dcfff][*] Phase 1/5: Wiping corrupted or outdated states...[/bold #7dcfff]")
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # NOTE(review): destructive. This removes every matching file in the
    # directory that contains the package, which is site-packages for an
    # installed wheel. Confirm this is the intended scope.
    patterns = ["brain*.uchi", "*.pt", "*.db", "uchi_cpu_memory*.json", "uchi_cpu_memory*.npy"]
    for pat in patterns:
        for f in glob.glob(os.path.join(project_root, pat)):
            os.remove(f)
            print(f" [-] Deleted {os.path.basename(f)}")

    print(" [+] Clean slate achieved.")
    router = OmniRouter(use_bpe=False)

    # ──────────────────────────────────────────────────────────────────────────
    # Phase 2: SSM Neural Pre-training
    # ──────────────────────────────────────────────────────────────────────────
    print("\n[bold #7dcfff][*] Phase 2/5: SSM Neural Pre-training (databricks-dolly-15k)...[/bold #7dcfff]")
    ds_dolly = _safe_load_dataset("databricks/databricks-dolly-15k", f"train[:{KNOWLEDGE_LIMIT}]")
    if ds_dolly:
        ssm = get_ssm()
        optimizer = torch.optim.Adam(ssm.parameters(), lr=1e-3)
        for item in tqdm(ds_dolly, desc="Pre-training MoE Weights"):
            user_turn = item.get("instruction", "").strip()
            assistant_turn = item.get("response", "").strip()
            if not user_turn or not assistant_turn:
                continue

            tokens = (
                ["<|user|>"]
                + router.tokenizer.tokenize(user_turn.split(), is_inference=False)
                + ["<|assistant|>"]
                + router.tokenizer.tokenize(assistant_turn.split(), is_inference=False)
                + ["<|end|>"]
            )

            router.stream(tokens)
            optimizer.zero_grad()
            v_loss = ssm.update_value(tokens, reward=1.0)
            d_loss = ssm.train_dynamics(tokens)
            (v_loss + d_loss).backward()
            optimizer.step()
        torch.save(ssm.state_dict(), os.path.join(project_root, "ssm_dynamics.pt"))

    # ──────────────────────────────────────────────────────────────────────────
    # Phase 3: Massive World Knowledge Ingestion
    # ──────────────────────────────────────────────────────────────────────────
    print("\n[bold #7dcfff][*] Phase 3/5: Massive World Knowledge Ingestion...[/bold #7dcfff]")

    # 3A: OpenHermes (Conversational)
    ds_hermes = _safe_load_dataset("teknium/OpenHermes-2.5", f"train[:{KNOWLEDGE_LIMIT}]")
    if ds_hermes:
        for item in tqdm(ds_hermes, desc="Ingesting OpenHermes (Chat)"):
            text = _render_conversation(item.get("conversations", []))
            if text:
                _stream_text(router, text + "<|end|>")

    # 3B: Wikipedia (Encyclopedic)
    # The former fallback passed "20220301.en[train]" as a *split* name, which
    # can never resolve, so it only added a second failing call.
    ds_wiki = _safe_load_dataset("wikipedia", "20220301.en", split=f"train[:{KNOWLEDGE_LIMIT}]")
    if ds_wiki:
        for item in tqdm(ds_wiki, desc="Ingesting Wikipedia (Facts)"):
            text = f"<|user|> Tell me about {item.get('title', '')}. <|assistant|> {item.get('text', '')[:1000]} <|end|>"
            _stream_text(router, text)

    # 3C: MMLU (Graduate Reasoning)
    # "all" is the config name; the split must be passed by keyword, otherwise
    # _safe_load_dataset reads the second positional argument as the split.
    ds_mmlu = _safe_load_dataset("cais/mmlu", "all", split=f"test[:{KNOWLEDGE_LIMIT}]")
    if ds_mmlu:
        for item in tqdm(ds_mmlu, desc="Ingesting MMLU (Reasoning)"):
            q, choices, ans_idx = item.get('question', ''), item.get('choices', []), item.get('answer', -1)
            if 0 <= ans_idx < len(choices):
                ans = choices[ans_idx]
                text = f"<|user|> Question: {q} Choices: {', '.join(choices)} <|assistant|> {ans} <|end|>"
                _stream_text(router, text)

    # 3D: GSM8K (Math Reasoning)
    # "main" is the config name; see the MMLU note above.
    ds_gsm8k = _safe_load_dataset("gsm8k", "main", split=f"test[:{KNOWLEDGE_LIMIT}]")
    if ds_gsm8k:
        for item in tqdm(ds_gsm8k, desc="Ingesting GSM8K (Math)"):
            text = f"<|user|> {item.get('question','')} <|assistant|> {item.get('answer','')} <|end|>"
            _stream_text(router, text)

    # ──────────────────────────────────────────────────────────────────────────
    # Phase 4: Rigorous Code Logic
    # ──────────────────────────────────────────────────────────────────────────
    print("\n[bold #7dcfff][*] Phase 4/5: Rigorous Code Logic Ingestion...[/bold #7dcfff]")

    # 4A: SWE-Bench (GitHub Issues)
    ds_swe = _safe_load_dataset("princeton-nlp/SWE-bench", f"test[:{KNOWLEDGE_LIMIT}]")
    if ds_swe:
        for item in tqdm(ds_swe, desc="Ingesting SWE-Bench (Engineering)"):
            issue, patch = item.get('problem_statement', ''), item.get('patch', '')
            text = f"<|user|> Fix issue:\n{issue[:500]} <|assistant|> {patch[:500]} <|end|>"
            _stream_text(router, text)

    # 4B: HumanEval (Algorithms)
    ds_humaneval = _safe_load_dataset("openai/openai_humaneval", f"test[:{KNOWLEDGE_LIMIT}]")
    if ds_humaneval:
        for item in tqdm(ds_humaneval, desc="Ingesting HumanEval (Algorithms)"):
            text = f"<|user|> Complete Python code:\n{item.get('prompt','')} <|assistant|> {item.get('canonical_solution','')} <|end|>"
            _stream_text(router, text)

    # Save the master router
    save_brain(router, os.path.join(project_root, brain_path))

    # ──────────────────────────────────────────────────────────────────────────
    # Phase 5: MoE Specialists
    # ──────────────────────────────────────────────────────────────────────────
    print("\n[bold #7dcfff][*] Phase 5/5: Building MoE Specialist Sub-Brains...[/bold #7dcfff]")
    _build_specialists(project_root)

    print("\n[bold #9ece6a][+] RECONSTRUCTION COMPLETE. Uchi is Online.[/bold #9ece6a]")
    return router
|
|
170
|
+
|
|
171
|
+
def _build_specialists(project_root):
    """Train and save the three specialist routers (code, math, conversation).

    Each specialist is a fresh ``OmniRouter`` fed up to ``KNOWLEDGE_LIMIT``
    rows of one dataset and saved under *project_root* as ``brain_code.uchi``,
    ``brain_math.uchi`` or ``brain_convo.uchi``. When a dataset cannot be
    loaded, a single hard-coded token sequence is streamed instead, so the
    file is still produced.

    Args:
        project_root: Directory that receives the specialist brain files.
    """
    from uchi.omni_router import OmniRouter
    from uchi.cli import save_brain

    # Brain Code — trained on HumanEval function completions
    print(" [*] Building brain_code.uchi...")
    r_code = OmniRouter(use_bpe=False)
    # Slice at load time instead of materialising the whole split.
    ds_he = _safe_load_dataset("openai/openai_humaneval", f"test[:{KNOWLEDGE_LIMIT}]")
    if ds_he:
        for item in tqdm(ds_he, desc=" brain_code ← HumanEval"):
            text = (f"<|user|> Complete Python code:\n{item['prompt']} "
                    f"<|assistant|> {item['canonical_solution']} <|end|>")
            r_code.stream(r_code.tokenizer.tokenize(text.split(), is_inference=False))
    else:
        r_code.stream(["<|user|>", "complete", "python", "code", "<|assistant|>", "def", "f", "return", "<|end|>"])
    save_brain(r_code, os.path.join(project_root, "brain_code.uchi"))

    # Brain Math — trained on GSM8K
    print(" [*] Building brain_math.uchi...")
    r_math = OmniRouter(use_bpe=False)
    # "main" is GSM8K's config name. Passing it as the only second positional
    # argument made _safe_load_dataset treat it as the split, so the load
    # always failed and only the toy fallback below was ever used. The same
    # test-split rows as the master brain's GSM8K phase are requested here.
    ds_math = _safe_load_dataset("gsm8k", "main", split=f"test[:{KNOWLEDGE_LIMIT}]")
    if ds_math:
        for item in tqdm(ds_math, desc=" brain_math ← GSM8K"):
            text = f"<|user|> {item['question']} <|assistant|> {item['answer']} <|end|>"
            r_math.stream(r_math.tokenizer.tokenize(text.split(), is_inference=False))
    else:
        r_math.stream(["<|user|>", "math", "equation", "<|assistant|>", "1", "+", "1", "=", "2", "<|end|>"])
    save_brain(r_math, os.path.join(project_root, "brain_math.uchi"))

    # Brain Convo — trained on OpenHermes conversational turns
    print(" [*] Building brain_convo.uchi...")
    r_convo = OmniRouter(use_bpe=False)
    ds_convo = _safe_load_dataset("teknium/OpenHermes-2.5", f"train[:{KNOWLEDGE_LIMIT}]")
    if ds_convo:
        for item in tqdm(ds_convo, desc=" brain_convo ← OpenHermes"):
            text = ""
            for turn in item.get("conversations", []):
                role = "<|user|>" if turn.get("from") == "human" else "<|assistant|>"
                text += f"{role} {turn.get('value', '')} "
            if text:
                r_convo.stream(r_convo.tokenizer.tokenize((text + "<|end|>").split(), is_inference=False))
    else:
        r_convo.stream(["<|user|>", "hello", "<|assistant|>", "hi", "how", "are", "you", "<|end|>"])
    save_brain(r_convo, os.path.join(project_root, "brain_convo.uchi"))
|