caudate-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- api/__init__.py +5 -0
- api/anthropic_compat.py +1518 -0
- api/artifact_viewer.py +366 -0
- api/caudate_middleware.py +618 -0
- api/forge_bootstrapper_routes.py +377 -0
- api/forge_routes.py +630 -0
- api/forge_system_routes.py +294 -0
- api/openai_compat.py +1993 -0
- api/server.py +667 -0
- api/storyboard_page.py +677 -0
- caudate_cli-0.1.0.dist-info/METADATA +354 -0
- caudate_cli-0.1.0.dist-info/RECORD +153 -0
- caudate_cli-0.1.0.dist-info/WHEEL +5 -0
- caudate_cli-0.1.0.dist-info/entry_points.txt +2 -0
- caudate_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- caudate_cli-0.1.0.dist-info/top_level.txt +14 -0
- cognos_mcp/__init__.py +4 -0
- cognos_mcp/bridge.py +41 -0
- cognos_mcp/client.py +70 -0
- cognos_mcp/config.py +49 -0
- cognos_mcp/server.py +66 -0
- config.py +82 -0
- core/__init__.py +0 -0
- core/agent.py +468 -0
- core/agentic_loop.py +731 -0
- core/anthropic_auth.py +91 -0
- core/background.py +113 -0
- core/banner.py +134 -0
- core/bootstrap.py +292 -0
- core/citations.py +131 -0
- core/compaction.py +109 -0
- core/constitution.py +198 -0
- core/diff_viewer.py +87 -0
- core/export.py +85 -0
- core/file_refs.py +119 -0
- core/files.py +199 -0
- core/hooks.py +209 -0
- core/image.py +599 -0
- core/input.py +91 -0
- core/loop.py +238 -0
- core/memory_md.py +147 -0
- core/notifications.py +99 -0
- core/ownership.py +181 -0
- core/paste.py +81 -0
- core/permissions.py +210 -0
- core/plan_mode.py +215 -0
- core/sandbox_prompt.py +185 -0
- core/scheduler.py +195 -0
- core/schemas.py +202 -0
- core/session.py +90 -0
- core/settings.py +132 -0
- core/skills.py +398 -0
- core/slash_commands.py +977 -0
- core/statusline.py +61 -0
- core/subagent.py +300 -0
- core/thinking.py +50 -0
- core/updater.py +122 -0
- core/usage.py +109 -0
- core/worktree.py +93 -0
- execution/__init__.py +0 -0
- execution/executor.py +329 -0
- execution/plugins.py +108 -0
- execution/tools/__init__.py +0 -0
- execution/tools/agent_tool.py +107 -0
- execution/tools/agentic_tool.py +297 -0
- execution/tools/artifact_tool.py +191 -0
- execution/tools/ask_user_question_tool.py +137 -0
- execution/tools/base.py +81 -0
- execution/tools/calculator_tool.py +137 -0
- execution/tools/cognos_card_tool.py +124 -0
- execution/tools/cron_tool.py +215 -0
- execution/tools/datetime_tool.py +215 -0
- execution/tools/describe_image_tool.py +161 -0
- execution/tools/draw_tool.py +164 -0
- execution/tools/edit_image_tool.py +262 -0
- execution/tools/edit_tool.py +245 -0
- execution/tools/file_tool.py +90 -0
- execution/tools/find_anywhere_tool.py +255 -0
- execution/tools/forge_feature_tools.py +377 -0
- execution/tools/glob_tool.py +59 -0
- execution/tools/grep_tool.py +89 -0
- execution/tools/http_request_tool.py +224 -0
- execution/tools/load_skill_tool.py +104 -0
- execution/tools/longcat_avatar_tool.py +384 -0
- execution/tools/mcp_tool.py +100 -0
- execution/tools/notebook_tool.py +279 -0
- execution/tools/openapi_tool.py +440 -0
- execution/tools/plan_mode_tool.py +95 -0
- execution/tools/push_notification_tool.py +157 -0
- execution/tools/python_tool.py +61 -0
- execution/tools/respond_tool.py +40 -0
- execution/tools/sandbox_tool.py +378 -0
- execution/tools/search_tool.py +153 -0
- execution/tools/semantic_search_tool.py +106 -0
- execution/tools/shell_tool.py +283 -0
- execution/tools/speak_tool.py +134 -0
- execution/tools/storyboard_tool.py +727 -0
- execution/tools/system_info_tool.py +212 -0
- execution/tools/task_tool.py +323 -0
- execution/tools/think_tool.py +49 -0
- execution/tools/transcribe_audio_tool.py +86 -0
- execution/tools/update_memory_tool.py +92 -0
- execution/tools/web_fetch_tool.py +82 -0
- execution/tools/worktree_tool.py +174 -0
- llm/__init__.py +0 -0
- llm/fallback.py +116 -0
- llm/models.py +320 -0
- llm/provider.py +1356 -0
- llm/router.py +373 -0
- main.py +1889 -0
- memory/__init__.py +0 -0
- memory/episodic.py +99 -0
- memory/procedural.py +145 -0
- memory/semantic.py +71 -0
- memory/working.py +64 -0
- nn/__init__.py +43 -0
- nn/auto_evolve.py +245 -0
- nn/caudate.py +136 -0
- nn/config.py +141 -0
- nn/consolidator.py +81 -0
- nn/data.py +1635 -0
- nn/encoder.py +258 -0
- nn/forge_advisor.py +303 -0
- nn/format.py +235 -0
- nn/heads.py +432 -0
- nn/observer.py +994 -0
- nn/policy.py +214 -0
- nn/runtime.py +343 -0
- nn/scorer.py +175 -0
- nn/trainer.py +515 -0
- nn/vision.py +352 -0
- personality/__init__.py +23 -0
- personality/engine.py +129 -0
- personality/identity.py +144 -0
- personality/inner_voice.py +100 -0
- personality/mood.py +205 -0
- planning/__init__.py +0 -0
- planning/dev_server.py +221 -0
- planning/forge_models.py +718 -0
- planning/orchestrator.py +1363 -0
- planning/planner.py +451 -0
- planning/task_graph.py +61 -0
- reflection/__init__.py +0 -0
- reflection/meta_learner.py +156 -0
- reflection/reflector.py +127 -0
- ui/__init__.py +5 -0
- ui/display.py +88 -0
- voice/__init__.py +0 -0
- voice/conversation.py +125 -0
- voice/listener.py +111 -0
- voice/speaker.py +59 -0
- voice/stt.py +126 -0
- voice/tts.py +214 -0
|
@@ -0,0 +1,618 @@
|
|
|
1
|
+
"""CaudateMiddleware — gives Caudate her full body inside /v1/messages.
|
|
2
|
+
|
|
3
|
+
When Claude Code (or any Anthropic-format client) hits Cognos via
|
|
4
|
+
the compat endpoint, this middleware sits between the request and the
|
|
5
|
+
underlying LLM and engages every Caudate capability:
|
|
6
|
+
|
|
7
|
+
1. **State capture** — recent message text + extracted images + mood
|
|
8
|
+
2. **Prediction** — tool / tier / think / value head outputs
|
|
9
|
+
3. **Vocab growth** — registers any new Claude-Code tools
|
|
10
|
+
4. **Hint injection** — at WHISPER+, prepends Caudate's hint to system
|
|
11
|
+
5. **Tier override** — at ADVISOR+, her prediction wins (already wired
|
|
12
|
+
via Router.set_caudate; we just call into it)
|
|
13
|
+
6. **Thinking gate** — at CONTROLLER+, modulates request shape
|
|
14
|
+
7. **Reward derivation**— response shape → heuristic reward
|
|
15
|
+
8. **Episode storage** — every call lands in episodic memory
|
|
16
|
+
9. **Replay + autotrain** — observer accumulates samples, retrains in BG
|
|
17
|
+
10. **Cleanup** — temp images deleted after the call
|
|
18
|
+
|
|
19
|
+
Per-request hot path keeps the latency cost small; the heavy stuff
|
|
20
|
+
(vision encode, episodic write) happens off the critical path where
|
|
21
|
+
possible.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import asyncio
|
|
27
|
+
import base64
|
|
28
|
+
import logging
|
|
29
|
+
import os
|
|
30
|
+
import re
|
|
31
|
+
import tempfile
|
|
32
|
+
import time
|
|
33
|
+
import uuid
|
|
34
|
+
from contextlib import contextmanager
|
|
35
|
+
from typing import Any
|
|
36
|
+
|
|
37
|
+
logger = logging.getLogger(__name__)


# Upper bound on the number of images materialized for a single request.
# The CLIP/InternVL2 encoders are GPU-bound, so encoding dozens of images
# would stall the turn.
_MAX_IMAGES_PER_TURN = 4
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class CaudateMiddleware:
    """Wraps one LLM call with Caudate's prediction / learning loop.

    Call order, as implemented by the hooks below: ``begin_turn`` opens a
    ``_TurnContext``; ``maybe_inject_hint`` and ``maybe_disable_thinking``
    shape the outgoing request; the ``observe_*`` methods record what the
    model produced; ``end_turn`` derives a reward, hands it to Caudate,
    writes the episode and deletes the temp images.

    When the agent has no ``caudate`` attribute the Caudate-specific steps
    are skipped.
    """

    def __init__(self, agent: Any):
        self.agent = agent
        # None when the agent carries no Caudate instance.
        self.caudate = getattr(agent, "caudate", None)

    # ------------------------------------------------------------------
    # Public hooks called from anthropic_compat
    # ------------------------------------------------------------------

    def has_caudate(self) -> bool:
        """Return True when the wrapped agent has a Caudate instance."""
        return self.caudate is not None

    def begin_turn(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]] | None,
        model_source: str = "unknown",
    ) -> "_TurnContext":
        """Open a turn-context that will be passed back at end_turn().

        - Registers the request's tool names in Caudate's vocab.
        - Builds the textual state Caudate needs.
        - Materializes any base64 images to temp files for vision encoding.
        - Calls caudate.on_turn_start to get a Prediction.
        - Tags the context with `model_source` so future architectural
          phases can branch per teacher model (CAUDATE_EVOLUTION.md,
          Phase 1).

        Args:
            messages: Conversation messages of the incoming request.
            tools: Tool definitions of the request; either the flat shape
                (``{"name": ...}``) or the nested ``{"function": {...}}``
                shape is accepted. May be None.
            model_source: Label of the model that will answer this turn.

        Returns:
            The new ``_TurnContext``. ``prediction`` stays None when there
            is no Caudate or when ``on_turn_start`` raised.
        """
        ctx = _TurnContext(self.agent, messages)
        ctx.model_source = model_source
        if self.caudate is None:
            return ctx

        # Grow vocab with whatever tools the client passed.
        self._register_tool_names(tools)

        # Build the textual state from recent messages.
        recent_text = _flatten_messages_to_text(
            messages, limit=self.caudate.cfg.msg_window
        )

        # Extract images; the paths are remembered so end_turn can delete them.
        image_paths = _extract_images_to_temp(messages, _MAX_IMAGES_PER_TURN)
        ctx.temp_image_paths = image_paths

        mood = _read_mood(self.agent)

        # Predict (logs prediction internally, also caches as last_prediction).
        try:
            ctx.prediction = self.caudate.on_turn_start(
                recent_messages=recent_text,
                mood=mood,
                image_paths=image_paths,
                model_source=model_source,
            )
        except Exception as e:
            logger.debug("Caudate.on_turn_start failed: %s", e)

        return ctx

    def _register_tool_names(self, tools: list[dict[str, Any]] | None) -> None:
        """Add every named tool in *tools* to Caudate's advisor vocab."""
        for t in tools or []:
            if not isinstance(t, dict):
                # Malformed entry: skip it rather than fail the request.
                continue
            fn = t.get("function")
            if not isinstance(fn, dict) or not fn:
                fn = t  # accept either shape
            name = fn.get("name") or t.get("name")
            if not name:
                continue
            # Bare attribute touches kept from the original ("touch to keep
            # it alive"); presumably these are lazily-created attributes.
            # TODO confirm they are still needed.
            self.caudate.replay
            self.caudate.scorer
            # ToolVocab.add is idempotent.
            try:
                self.caudate.advisor.vocab.add(name)
            except Exception as e:
                logger.debug("Caudate vocab.add(%r) failed: %s", name, e)

    def maybe_inject_hint(
        self,
        messages: list[dict[str, Any]],
        ctx: "_TurnContext",
    ) -> list[dict[str, Any]]:
        """At WHISPER+, add Caudate's hint to the system message.

        The hint is appended after the content of a leading system message;
        if the conversation has none, a new system message holding only the
        hint is inserted at the front. List-shaped system content is
        flattened to the space-joined ``text`` fields of its dict blocks.

        Returns:
            ``messages`` itself (unchanged) when there is no Caudate, no
            prediction, whispering is not allowed, or the tool confidence is
            below ``cfg.advisor_min_confidence``; otherwise a new list.
        """
        if self.caudate is None or ctx.prediction is None:
            return messages
        if not self.caudate.can_whisper():
            return messages
        if ctx.prediction.tool_confidence < self.caudate.cfg.advisor_min_confidence:
            return messages

        hint = _build_hint_block(ctx.prediction, self.caudate.policy.level.label)
        # Find or create system message.
        out = list(messages)
        if out and out[0].get("role") == "system":
            head = out[0].get("content", "")
            if isinstance(head, list):
                head = " ".join(b.get("text", "") for b in head if isinstance(b, dict))
            out[0] = {"role": "system", "content": f"{head}\n\n{hint}"}
        else:
            out = [{"role": "system", "content": hint}, *out]
        return out

    def maybe_disable_thinking(self, ctx: "_TurnContext") -> bool:
        """CONTROLLER+: returns True if thinking should be DISABLED for this call.

        True only when Caudate may control and the predicted ``think``
        probability is below 0.3.
        """
        if self.caudate is None or ctx.prediction is None:
            return False
        if not self.caudate.can_control():
            return False
        return ctx.prediction.think < 0.3

    def observe_response_text(self, ctx: "_TurnContext", text: str) -> None:
        """Append a chunk of the visible assistant reply (for the episode)."""
        ctx.response_text = (ctx.response_text or "") + (text or "")

    def observe_thinking(self, ctx: "_TurnContext", text: str) -> None:
        """Append reasoning-channel text. Thinking is valuable signal for
        tool inference even if it never reaches visible content."""
        ctx.thinking_text = (ctx.thinking_text or "") + (text or "")

    def observe_tool_use(self, ctx: "_TurnContext", tool_name: str) -> None:
        """Record one tool call emitted by the LLM.

        Does nothing when there is no Caudate. Otherwise the tool is
        forwarded to ``caudate.on_tool_use`` (failures are logged, not
        raised) and appended to ``ctx.tools_used``.
        """
        if self.caudate is None:
            return
        try:
            self.caudate.on_tool_use(tool_name)
        except Exception as e:
            logger.debug("Caudate.on_tool_use failed: %s", e)
        ctx.tools_used.append(tool_name)

    def observe_arbitration(
        self,
        ctx: "_TurnContext",
        fast_text: str,
        slow_text: str,
        fast_score: float,
        slow_score: float,
        winner: str,
        fast_model: str = "",
        slow_model: str = "",
    ) -> None:
        """Record a dual-brain arbitration event on this turn.

        Both drafts + their heuristic scores are stored on the turn
        context; _write_episode appends them to a JSONL file so a
        preference head can later be trained on (state, draft_a, draft_b,
        picked) tuples. This is the data substrate for Phase 4 of
        CAUDATE_EVOLUTION.md (the conductor).

        Each draft is truncated to 2000 characters; a None draft is stored
        as an empty string.
        """
        ctx.arbitration = {
            "fast": {"text": (fast_text or "")[:2000], "score": fast_score, "model": fast_model},
            "slow": {"text": (slow_text or "")[:2000], "score": slow_score, "model": slow_model},
            "winner": winner,
        }
        try:
            logger.info(
                f"[arbitration] fast({fast_model})={fast_score:.2f} "
                f"slow({slow_model})={slow_score:.2f} → {winner}"
            )
        except Exception:
            # Deliberate best-effort: a score that cannot be formatted as a
            # float must not fail the turn just because of a log line.
            pass

    def end_turn(self, ctx: "_TurnContext", error: bool = False) -> None:
        """Close the turn — derive reward, push samples, write episode.

        Special case: if the LLM didn't actually call a tool but its
        text indicates an intended tool ("I'll use Bash to..."), record
        the *intended* tool as the target with a low reward. This way
        Caudate learns the corrective action ("for this prompt, the
        right tool was Bash even though gemma4 stalled") rather than
        mimicking the failure mode.

        Never raises for failures in the body (they are logged at debug
        level); temp image files are removed in all cases.

        Args:
            ctx: The context returned by ``begin_turn``.
            error: True when the upstream LLM call failed.
        """
        try:
            # Stalling rescue: infer the *intended* tool from three
            # signals, in priority order:
            #   1. visible response text (model said "I'll use Bash")
            #   2. thinking-channel text (model reasoned "use Bash")
            #   3. user's prompt itself (user asked "what's in folder?"
            #      which clearly maps to Bash regardless of model's reply)
            # Signals 1 and 2 are concatenated and scanned together. The
            # user-prompt fallback catches cases where the model refused
            # entirely ("I can't see your files") — Caudate should still
            # learn that the right tool was Bash, with a low reward
            # signaling the model failed.
            inferred_tool: str | None = None
            if (
                not ctx.tools_used
                and self.caudate is not None
                and self.caudate._pending is not None
            ):
                combined_text = (
                    (ctx.response_text or "") + "\n" + (ctx.thinking_text or "")
                )
                if combined_text.strip():
                    inferred_tool = _infer_intended_tool(combined_text)
                if inferred_tool is None:
                    user_text = _last_user_text(ctx.messages)
                    if user_text:
                        inferred_tool = _infer_intended_tool(user_text)
                if inferred_tool:
                    # Inject into observer's pending state so the sample
                    # is built with this target instead of '<none>'.
                    self.caudate._pending.chosen_tools.append(inferred_tool)
                    ctx.tools_used.append(inferred_tool)
                    ctx.inferred_from_stall = True

            reward = _derive_reward(ctx, error)
            if self.caudate is not None:
                # Pass turn-outcome signals so observer can label the
                # Tier 1 / Tier 2 heads (refusal, code, stall, latency,
                # difficulty, etc.) from what actually happened.
                elapsed_s = max(0.0, time.time() - ctx.started_at)
                self.caudate.on_turn_end(
                    reward=reward,
                    response_text=ctx.response_text or "",
                    inferred_from_stall=getattr(ctx, "inferred_from_stall", False),
                    elapsed_s=elapsed_s,
                    completion_tokens=getattr(ctx, "completion_tokens", None),
                )
            self._write_episode(ctx, reward)

            if inferred_tool:
                logger.info(
                    "Caudate stall-rescue: inferred tool=%r "
                    "from stalling text, reward=%.2f",
                    inferred_tool,
                    reward,
                )
        except Exception as e:
            logger.debug("Caudate.end_turn failed: %s", e)
        finally:
            for p in ctx.temp_image_paths:
                try:
                    os.unlink(p)
                except FileNotFoundError:
                    pass  # already removed; nothing to clean up
                except Exception as e:
                    logger.debug("temp image cleanup failed for %s: %s", p, e)

    # ------------------------------------------------------------------

    def _write_episode(self, ctx: "_TurnContext", reward: float) -> None:
        """Land each turn in episodic memory so the meta-learner can use it
        and the consolidator picks it up at training time.

        One ``Episode`` is stored per entry of ``ctx.tools_used`` (or a
        single reply episode when no tool was used). If an arbitration was
        recorded it is additionally appended to
        ``data/nn/arbitrations.jsonl``. Returns without writing anything
        when the agent has no ``loop.episodic`` store.
        """
        episodic = getattr(getattr(self.agent, "loop", None), "episodic", None)
        if episodic is None:
            return
        try:
            from core.schemas import Episode, ToolResult, ToolResultStatus

            user_text = _last_user_text(ctx.messages)[:400]
            for tool in ctx.tools_used or ["<reply>"]:
                is_reply = tool == "<reply>"
                ep = Episode(
                    goal_id="claude-code",
                    task_id=str(uuid.uuid4()),
                    action=user_text or "(no user text)",
                    tool_name=(None if is_reply else tool),
                    tool_args={},
                    result=ToolResult(
                        tool_name=("Respond" if is_reply else tool),
                        status=(
                            ToolResultStatus.SUCCESS
                            if reward > 0.5
                            else ToolResultStatus.ERROR
                        ),
                        output=(ctx.response_text or "")[:400],
                    ),
                )
                episodic.store(ep)
        except Exception as e:
            logger.debug("episode write failed: %s", e)

        # Dual-brain arbitration record — append to a dedicated JSONL
        # so a future preference-learning trainer can pick it up. We
        # keep it separate from the normal replay buffer because the
        # schema is different (carries TWO drafts + scores + winner,
        # not a single target).
        if ctx.arbitration is not None:
            try:
                import json as _json
                from pathlib import Path as _P

                p = _P("data/nn/arbitrations.jsonl")
                p.parent.mkdir(parents=True, exist_ok=True)
                record = {
                    "ts": time.time(),
                    "user_text": _last_user_text(ctx.messages)[:600],
                    "model_source": ctx.model_source,
                    "reward": reward,
                    "arbitration": ctx.arbitration,
                }
                # json.dumps escapes non-ASCII by default; the explicit
                # encoding just removes the platform-default dependency.
                with p.open("a", encoding="utf-8") as f:
                    f.write(_json.dumps(record) + "\n")
            except Exception as e:
                logger.debug("arbitration write failed: %s", e)
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
# ---- TurnContext ---------------------------------------------------
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
class _TurnContext:
    """Lives across a single /v1/messages call. Carries Caudate's
    prediction and the bookkeeping needed to score it at the end."""

    def __init__(self, agent: Any, messages: list[dict[str, Any]]):
        self.agent = agent
        self.messages = messages
        # Prediction returned by caudate.on_turn_start; None until set.
        self.prediction = None
        # Temp files created for this turn's images; deleted in end_turn.
        self.temp_image_paths: list[str] = []
        # Tool names seen this turn (real calls, plus an inferred tool when
        # inferred_from_stall is True).
        self.tools_used: list[str] = []
        # Visible assistant reply, accumulated chunk by chunk.
        self.response_text: str = ""
        # Thinking-channel text (gemma4 / kimi reasoning). Counted as
        # signal for tool inference but NOT as the visible reply.
        self.thinking_text: str = ""
        # Wall-clock start, used to compute the turn's elapsed seconds.
        self.started_at: float = time.time()
        # True when the LLM stalled (text-only, "I will use X..." but no
        # actual tool_use emitted) and we inferred the intended tool from
        # its text. This flag tunes the reward.
        self.inferred_from_stall: bool = False
        # Forwarded by end_turn to caudate.on_turn_end. Nothing in this
        # module assigns it; presumably the compat layer sets a token count
        # on the context — TODO confirm.
        self.completion_tokens: Any = None
        # Phase 1 of evolution roadmap: which model produced this turn's
        # response. Set by begin_turn(); flows into ConversationSample.
        self.model_source: str = "unknown"
        # Dual-brain arbitration record (Phase 4 / pattern 2). When the
        # arbiter ran, this captures both drafts + scores so end_turn
        # can log them for preference-learning training data.
        # Shape: {"fast": {text, score, model}, "slow": {text, score, model},
        #         "winner": <str>}
        self.arbitration: dict[str, Any] | None = None
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
# ---- Helpers --------------------------------------------------------
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def _render_content_block(block: dict[str, Any]) -> str:
    """Render one content block as a short text fragment ('' if unknown type)."""
    import json as _json

    btype = block.get("type")
    if btype == "text":
        return str(block.get("text") or "")
    if btype in ("image_url", "image"):
        # Both the OpenAI ("image_url") and Anthropic ("image") shapes are
        # images; render them the same way.
        return "[image]"
    if btype == "tool_use":
        name = block.get("name", "")
        inp = block.get("input") or {}
        try:
            inp_text = _json.dumps(inp, ensure_ascii=False)[:400]
        except (TypeError, ValueError):
            # Not JSON-serializable: fall back to the repr-ish form.
            inp_text = str(inp)[:400]
        return f"[tool_use {name}({inp_text})]"
    if btype == "tool_result":
        rc = block.get("content")
        if isinstance(rc, list):
            rc = " ".join(
                x.get("text", "")
                for x in rc
                if isinstance(x, dict) and x.get("type") == "text"
            )
        return f"[tool_result {str(rc)[:200]}]"
    if btype == "thinking":
        return f"[thinking {str(block.get('thinking') or '')[:200]}]"
    return ""


def _flatten_messages_to_text(messages: list[dict[str, Any]], limit: int) -> list[str]:
    """Render the recent conversation as plain text for Caudate's encoder.

    Important: do NOT silently drop tool_calls / tool_use blocks. They
    carry the structured intent of the previous turn (e.g. the question
    that AskUserQuestion was asking). Without them, Caudate sees only
    the user's picked answer with no idea what was being asked.

    Args:
        messages: Conversation messages; ``content`` may be a string or a
            list of content blocks, and assistant messages may carry an
            OpenAI-style ``tool_calls`` list.
        limit: How many trailing messages to render. Zero or a negative
            value yields an empty result.

    Returns:
        One ``"role: text"`` string per message that produced any text,
        each text truncated to 400 characters.
    """
    if limit <= 0:
        # ``messages[-0:]`` is the WHOLE list, so a zero window has to be
        # handled explicitly or it silently turns into "no limit".
        return []

    out: list[str] = []
    for m in messages[-limit:]:
        role = m.get("role", "?")
        c = m.get("content")
        if isinstance(c, list):
            rendered = (_render_content_block(b) for b in c if isinstance(b, dict))
            text = " ".join(piece for piece in rendered if piece)
        else:
            text = str(c) if c else ""
        # OpenAI-shape assistant messages put tool calls in a sibling
        # field (m["tool_calls"]) rather than inside content blocks.
        # Render them so Caudate sees the structured intent.
        for tc in m.get("tool_calls") or []:
            if not isinstance(tc, dict):
                continue
            fn = tc.get("function") or {}
            name = fn.get("name", "")
            args = fn.get("arguments") or ""
            text = f"{text} [tool_call {name}({str(args)[:400]})]".strip()
        if text:
            out.append(f"{role}: {text[:400]}")
    return out
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
def _decode_image_block(block: dict[str, Any]) -> tuple[bytes, str] | None:
    """Return ``(raw_bytes, media_type)`` for an inline base64 image block, else None."""
    btype = block.get("type")
    if btype == "image":
        # Anthropic shape: {"type": "image", "source": {"type": "base64", ...}}
        src = block.get("source") or {}
        if not isinstance(src, dict) or src.get("type") != "base64":
            return None
        media_type = str(src.get("media_type") or "image/png")
        data = src.get("data", "")
    elif btype == "image_url":
        # OpenAI shape: {"type": "image_url", "image_url": {"url": "data:..."}}
        holder = block.get("image_url") or {}
        url = holder.get("url") if isinstance(holder, dict) else None
        if not isinstance(url, str) or not url.startswith("data:"):
            return None  # remote URLs are not fetched
        head, _, data = url.partition(",")
        media_type = head[len("data:"):].split(";")[0]
    else:
        return None
    try:
        return base64.b64decode(data), media_type
    except (ValueError, TypeError):
        # Undecodable payload: skip this image.
        return None


def _write_temp_image(raw: bytes, media_type: str) -> str | None:
    """Write *raw* to a new ``caudate-img-*`` temp file; return its path or None."""
    # The media type comes from the request, so keep only characters that
    # are safe in a file suffix and fall back to "png" if nothing is left.
    subtype = media_type.rsplit("/", 1)[-1]
    ext = re.sub(r"[^A-Za-z0-9.+-]", "", subtype) or "png"
    try:
        fd, path = tempfile.mkstemp(suffix=f".{ext}", prefix="caudate-img-")
    except OSError as e:
        logger.debug("temp image creation failed: %s", e)
        return None
    try:
        # fdopen takes ownership of fd and closes it when the block exits.
        with os.fdopen(fd, "wb") as fh:
            fh.write(raw)
    except OSError as e:
        logger.debug("temp image write failed: %s", e)
        # Do not leave a partial file behind that no caller knows about.
        try:
            os.unlink(path)
        except OSError:
            pass
        return None
    return path


def _extract_images_to_temp(
    messages: list[dict[str, Any]], cap: int,
) -> list[str]:
    """Pull base64 images out of message content blocks, write to temp files.

    Caudate's vision encoder is handed file paths (see begin_turn), so
    inline images — Anthropic ``image`` blocks with a base64 source and
    ``image_url`` blocks holding a ``data:`` URL — are materialized to
    short-lived tempfiles. Caller deletes them after the turn.

    Args:
        messages: Conversation messages; only list-shaped ``content`` is
            scanned.
        cap: Maximum number of images to materialize.

    Returns:
        Paths of the files written, in message order, at most ``cap`` of
        them. Images that cannot be decoded or written are skipped.
    """
    paths: list[str] = []
    for m in messages:
        c = m.get("content")
        if not isinstance(c, list):
            continue
        for b in c:
            if not isinstance(b, dict):
                continue
            if len(paths) >= cap:
                return paths
            decoded = _decode_image_block(b)
            if decoded is None:
                continue
            raw, media_type = decoded
            path = _write_temp_image(raw, media_type)
            if path is not None:
                paths.append(path)
    return paths
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def _read_mood(agent: Any) -> list[float]:
    """Return the agent's mood as four floats, or neutral 0.5s if unavailable.

    The values are read from ``agent.personality.mood`` in the order
    confidence, curiosity, frustration, satisfaction; a missing field
    counts as 0.5. When the agent has no personality, or reading or
    converting any value raises, all four values are 0.5.
    """
    neutral = [0.5, 0.5, 0.5, 0.5]
    personality = getattr(agent, "personality", None)
    if personality is None:
        return neutral
    # MoodState fields, in the order the vector is laid out.
    field_names = ("confidence", "curiosity", "frustration", "satisfaction")
    try:
        mood = personality.mood
        return [float(getattr(mood, field, 0.5)) for field in field_names]
    except Exception:
        return neutral
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
def _last_user_text(messages: list[dict[str, Any]]) -> str:
    """Return the text of the most recent user message that has any.

    Walks the messages from newest to oldest. A string ``content`` is
    returned as-is; for list content the first ``text`` block's text is
    returned. User messages without a text block (e.g. tool results only)
    are skipped in favour of an earlier one. Returns "" when nothing
    matches.
    """
    for msg in reversed(messages):
        if msg.get("role") != "user":
            continue
        content = msg.get("content")
        if isinstance(content, str):
            return content
        if not isinstance(content, list):
            continue
        first_text = next(
            (
                blk
                for blk in content
                if isinstance(blk, dict) and blk.get("type") == "text"
            ),
            None,
        )
        if first_text is not None:
            return first_text.get("text", "")
    return ""
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
def _build_hint_block(pred: Any, level: str) -> str:
    """Format Caudate's prediction as a text block for the system prompt.

    Args:
        pred: Prediction exposing ``tool``, ``tool_confidence``, ``tier``,
            ``tier_confidence``, ``think`` and ``value``.
        level: Policy level label. "controller" and "advisor" get their own
            heading; any other label gets the "whispering" heading.

    Returns:
        A heading, one line each for tool, routing, thinking and expected
        reward, and a closing disclaimer, joined by newlines.
    """
    headings = {
        "controller": "## Caudate (your trained action-selection net) recommends:",
        "advisor": "## Caudate (advisor) suggests, based on prior sessions:",
    }
    preface = headings.get(
        level, "## Caudate (whispering — still learning) thinks you might want:"
    )
    # Thinking is reported as helpful from p=0.5 upwards.
    thinking_label = "helpful" if pred.think >= 0.5 else "not needed"
    lines = [
        preface,
        f" next tool: {pred.tool} (confidence {pred.tool_confidence:.2f})",
        f" routing: {pred.tier} (confidence {pred.tier_confidence:.2f})",
        f" thinking: {thinking_label} (p={pred.think:.2f})",
        f" expected reward: {pred.value:.2f}",
        f"You may disagree — Caudate ({level}) hasn't seen this exact context, "
        f"only patterns from past turns.",
    ]
    return "\n".join(lines)
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
# ---- Stalling detection + intended-tool inference ------------------
|
|
518
|
+
# These power Caudate's "rescue" learning: when an LLM (gemma4 in
|
|
519
|
+
# particular) generates intent without action, we extract the action it
|
|
520
|
+
# *should* have taken so Caudate's next prediction is corrective rather
|
|
521
|
+
# than mimetic.
|
|
522
|
+
|
|
523
|
+
# Regex that catches "I will / I'll / let me / I should / I need to / I am
# going to / first(,) I('ll)" preambles followed by whitespace.
# NOTE: the pattern is anchored only on a word boundary and is used with
# ``search``, so it matches anywhere in the text, not just at the start of
# a sentence; phrases such as "I'll be honest" match as well.
# "I'll start by" is not listed separately because the plain "I'll"
# alternative already matches it.
_STALLING_PHRASES = re.compile(
    r"\b(?:I\s+will|I'll|let\s+me|I\s+should|I\s+need\s+to|"
    r"I\s+am\s+going\s+to|first(?:,)?\s+I'?l?l?)\s+",
    re.IGNORECASE,
)
|
|
533
|
+
|
|
534
|
+
# Mapping: text fragments → Cognos tool name (case-insensitive).
# Cognos tools: Bash, Read, Write, Edit, Glob, Grep, WebSearch,
# WebFetch, PythonExec, Think, Respond, Agent, Draw.
# Entries are tried in order and the first matching pattern wins, so more
# specific tools must come before the generic fallback at the end.
_TOOL_HINTS: list[tuple[re.Pattern, str]] = [
    # `ls` / `cat` need a word boundary on BOTH sides; without the leading
    # one every word ending in "ls" or "cat" ("tools", "models", "concat")
    # was labelled as Bash.
    (re.compile(r"\b(?:use\s+)?bash\b|\bls\b|\bcat\b|shell\s+command", re.I), "Bash"),
    (re.compile(r"\b(?:use\s+)?glob\b|find\s+files?\s+match|file\s+pattern", re.I), "Glob"),
    (re.compile(r"\b(?:use\s+)?grep\b|search\s+(?:for|the|in)|search\s+code", re.I), "Grep"),
    (re.compile(r"\b(?:use\s+)?read\b|read\s+(?:the\s+)?file|inspect\s+file", re.I), "Read"),
    (re.compile(r"\b(?:use\s+)?write\b|write\s+(?:to|a)\s+file|create\s+file", re.I), "Write"),
    (re.compile(r"\b(?:use\s+)?edit\b|modify\s+(?:the\s+)?file|change\s+code", re.I), "Edit"),
    (re.compile(r"\b(?:use\s+)?websearch\b|search\s+(?:the\s+)?web|google", re.I), "WebSearch"),
    (re.compile(r"\b(?:use\s+)?webfetch\b|fetch\s+(?:the\s+)?(?:url|page)", re.I), "WebFetch"),
    (re.compile(r"\b(?:use\s+)?python(?:exec)?\b|run\s+python|execute\s+python", re.I), "PythonExec"),
    (re.compile(r"\b(?:use\s+)?(?:the\s+)?think\b|think\s+(?:about|step)", re.I), "Think"),
    (re.compile(r"\b(?:use\s+)?(?:agent|subagent)\b|spawn\s+(?:an?\s+)?agent", re.I), "Agent"),
    (re.compile(r"\b(?:use\s+)?draw(?:ing)?\b|"
                r"generate\s+(?:an?\s+)?(?:image|picture|diagram)|"
                r"create\s+(?:an?\s+)?(?:image|picture)", re.I), "Draw"),
    # Generic exploratory prompts → Bash. These catch user-side language
    # that maps clearly to a filesystem-inspection turn even when the
    # model didn't suggest a tool.
    (re.compile(r"list\s+(?:the\s+)?files?|"
                r"what'?s\s+in\s+(?:\w+\s+)?(?:dir|folder|directory)|"
                r"what\s+is\s+in\s+(?:\w+\s+)?(?:dir|folder|directory)|"
                r"show\s+(?:me\s+)?(?:the\s+)?files?|"
                r"(?:check|see)\s+(?:the\s+)?(?:dir|folder|directory|files?)|"
                r"(?:what'?s|what\s+is|tell\s+me\s+what'?s|tell\s+me\s+what\s+is)\s+"
                r"(?:working|happening|going\s+on|here)\s+"
                r"(?:in\s+(?:\w+\s+)?(?:dir|folder|directory|repo|project)|here)", re.I), "Bash"),
]
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
def _is_stalling(text: str) -> bool:
    """Return True when *text* contains an "I will X"-style preamble.

    Empty or None text is never stalling. The caller decides whether a tool
    call actually followed; this only checks for the phrase.
    """
    if not text:
        return False
    return _STALLING_PHRASES.search(text) is not None
|
|
567
|
+
|
|
568
|
+
|
|
569
|
+
def _infer_intended_tool(text: str) -> str | None:
    """Map free text to the Cognos tool it describes, if any.

    The text may be the model stalling ("I will use Bash"), refusing
    ("I can't run code; try `ls`") or advising ("you should grep for
    that"), or the user's own prompt. The tool of the first entry in
    ``_TOOL_HINTS`` whose pattern is found in the text is returned; None
    means the text is empty or nothing matched.

    Used to *rescue* Caudate's training data: instead of recording
    target_tool='<none>' whenever the model didn't call a tool, the tool
    that the text *describes* is recorded, so Caudate learns the
    corrective action rather than the failure pattern.
    """
    if not text:
        return None
    matches = (tool for pattern, tool in _TOOL_HINTS if pattern.search(text))
    return next(matches, None)
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
def _derive_reward(ctx: "_TurnContext", error: bool) -> float:
    """Turn the shape of a finished turn into a heuristic training reward.

    Checked in this order; the first match wins:

    - upstream LLM error                        → 0.20 (clearly bad)
    - tool inferred from stalling text          → 0.25 (model failed,
                                                        rescue training)
    - real tool call emitted                    → 0.70 (engaged)
    - stalling text, but no tool could be
      inferred ("I'll think about it")          → 0.30
    - text-only reply, no stalling              → 0.60 (legitimate answer)
    - empty reply                               → 0.40 (gave up)

    The stall check comes before the tool check because a rescued turn
    also has the inferred tool in ``ctx.tools_used``.

    The 0.25 on stalling-with-inference is the key design choice: it is
    meant to be high enough that the tool-prediction head learns the right
    answer, and low enough that the value-prediction head learns "this
    turn went poorly."
    """
    if error:
        return 0.20
    if ctx.inferred_from_stall:
        return 0.25
    if ctx.tools_used:
        return 0.70
    visible = ctx.response_text or ""
    if _is_stalling(visible):
        # Vague intent without a recognisable tool: penalize, no rescue.
        return 0.30
    return 0.60 if visible.strip() else 0.40
|