caudate-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. api/__init__.py +5 -0
  2. api/anthropic_compat.py +1518 -0
  3. api/artifact_viewer.py +366 -0
  4. api/caudate_middleware.py +618 -0
  5. api/forge_bootstrapper_routes.py +377 -0
  6. api/forge_routes.py +630 -0
  7. api/forge_system_routes.py +294 -0
  8. api/openai_compat.py +1993 -0
  9. api/server.py +667 -0
  10. api/storyboard_page.py +677 -0
  11. caudate_cli-0.1.0.dist-info/METADATA +354 -0
  12. caudate_cli-0.1.0.dist-info/RECORD +153 -0
  13. caudate_cli-0.1.0.dist-info/WHEEL +5 -0
  14. caudate_cli-0.1.0.dist-info/entry_points.txt +2 -0
  15. caudate_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  16. caudate_cli-0.1.0.dist-info/top_level.txt +14 -0
  17. cognos_mcp/__init__.py +4 -0
  18. cognos_mcp/bridge.py +41 -0
  19. cognos_mcp/client.py +70 -0
  20. cognos_mcp/config.py +49 -0
  21. cognos_mcp/server.py +66 -0
  22. config.py +82 -0
  23. core/__init__.py +0 -0
  24. core/agent.py +468 -0
  25. core/agentic_loop.py +731 -0
  26. core/anthropic_auth.py +91 -0
  27. core/background.py +113 -0
  28. core/banner.py +134 -0
  29. core/bootstrap.py +292 -0
  30. core/citations.py +131 -0
  31. core/compaction.py +109 -0
  32. core/constitution.py +198 -0
  33. core/diff_viewer.py +87 -0
  34. core/export.py +85 -0
  35. core/file_refs.py +119 -0
  36. core/files.py +199 -0
  37. core/hooks.py +209 -0
  38. core/image.py +599 -0
  39. core/input.py +91 -0
  40. core/loop.py +238 -0
  41. core/memory_md.py +147 -0
  42. core/notifications.py +99 -0
  43. core/ownership.py +181 -0
  44. core/paste.py +81 -0
  45. core/permissions.py +210 -0
  46. core/plan_mode.py +215 -0
  47. core/sandbox_prompt.py +185 -0
  48. core/scheduler.py +195 -0
  49. core/schemas.py +202 -0
  50. core/session.py +90 -0
  51. core/settings.py +132 -0
  52. core/skills.py +398 -0
  53. core/slash_commands.py +977 -0
  54. core/statusline.py +61 -0
  55. core/subagent.py +300 -0
  56. core/thinking.py +50 -0
  57. core/updater.py +122 -0
  58. core/usage.py +109 -0
  59. core/worktree.py +93 -0
  60. execution/__init__.py +0 -0
  61. execution/executor.py +329 -0
  62. execution/plugins.py +108 -0
  63. execution/tools/__init__.py +0 -0
  64. execution/tools/agent_tool.py +107 -0
  65. execution/tools/agentic_tool.py +297 -0
  66. execution/tools/artifact_tool.py +191 -0
  67. execution/tools/ask_user_question_tool.py +137 -0
  68. execution/tools/base.py +81 -0
  69. execution/tools/calculator_tool.py +137 -0
  70. execution/tools/cognos_card_tool.py +124 -0
  71. execution/tools/cron_tool.py +215 -0
  72. execution/tools/datetime_tool.py +215 -0
  73. execution/tools/describe_image_tool.py +161 -0
  74. execution/tools/draw_tool.py +164 -0
  75. execution/tools/edit_image_tool.py +262 -0
  76. execution/tools/edit_tool.py +245 -0
  77. execution/tools/file_tool.py +90 -0
  78. execution/tools/find_anywhere_tool.py +255 -0
  79. execution/tools/forge_feature_tools.py +377 -0
  80. execution/tools/glob_tool.py +59 -0
  81. execution/tools/grep_tool.py +89 -0
  82. execution/tools/http_request_tool.py +224 -0
  83. execution/tools/load_skill_tool.py +104 -0
  84. execution/tools/longcat_avatar_tool.py +384 -0
  85. execution/tools/mcp_tool.py +100 -0
  86. execution/tools/notebook_tool.py +279 -0
  87. execution/tools/openapi_tool.py +440 -0
  88. execution/tools/plan_mode_tool.py +95 -0
  89. execution/tools/push_notification_tool.py +157 -0
  90. execution/tools/python_tool.py +61 -0
  91. execution/tools/respond_tool.py +40 -0
  92. execution/tools/sandbox_tool.py +378 -0
  93. execution/tools/search_tool.py +153 -0
  94. execution/tools/semantic_search_tool.py +106 -0
  95. execution/tools/shell_tool.py +283 -0
  96. execution/tools/speak_tool.py +134 -0
  97. execution/tools/storyboard_tool.py +727 -0
  98. execution/tools/system_info_tool.py +212 -0
  99. execution/tools/task_tool.py +323 -0
  100. execution/tools/think_tool.py +49 -0
  101. execution/tools/transcribe_audio_tool.py +86 -0
  102. execution/tools/update_memory_tool.py +92 -0
  103. execution/tools/web_fetch_tool.py +82 -0
  104. execution/tools/worktree_tool.py +174 -0
  105. llm/__init__.py +0 -0
  106. llm/fallback.py +116 -0
  107. llm/models.py +320 -0
  108. llm/provider.py +1356 -0
  109. llm/router.py +373 -0
  110. main.py +1889 -0
  111. memory/__init__.py +0 -0
  112. memory/episodic.py +99 -0
  113. memory/procedural.py +145 -0
  114. memory/semantic.py +71 -0
  115. memory/working.py +64 -0
  116. nn/__init__.py +43 -0
  117. nn/auto_evolve.py +245 -0
  118. nn/caudate.py +136 -0
  119. nn/config.py +141 -0
  120. nn/consolidator.py +81 -0
  121. nn/data.py +1635 -0
  122. nn/encoder.py +258 -0
  123. nn/forge_advisor.py +303 -0
  124. nn/format.py +235 -0
  125. nn/heads.py +432 -0
  126. nn/observer.py +994 -0
  127. nn/policy.py +214 -0
  128. nn/runtime.py +343 -0
  129. nn/scorer.py +175 -0
  130. nn/trainer.py +515 -0
  131. nn/vision.py +352 -0
  132. personality/__init__.py +23 -0
  133. personality/engine.py +129 -0
  134. personality/identity.py +144 -0
  135. personality/inner_voice.py +100 -0
  136. personality/mood.py +205 -0
  137. planning/__init__.py +0 -0
  138. planning/dev_server.py +221 -0
  139. planning/forge_models.py +718 -0
  140. planning/orchestrator.py +1363 -0
  141. planning/planner.py +451 -0
  142. planning/task_graph.py +61 -0
  143. reflection/__init__.py +0 -0
  144. reflection/meta_learner.py +156 -0
  145. reflection/reflector.py +127 -0
  146. ui/__init__.py +5 -0
  147. ui/display.py +88 -0
  148. voice/__init__.py +0 -0
  149. voice/conversation.py +125 -0
  150. voice/listener.py +111 -0
  151. voice/speaker.py +59 -0
  152. voice/stt.py +126 -0
  153. voice/tts.py +214 -0
@@ -0,0 +1,618 @@
1
+ """CaudateMiddleware — gives Caudate her full body inside /v1/messages.
2
+
3
+ When Claude Code (or any Anthropic-format client) hits Cognos via
4
+ the compat endpoint, this middleware sits between the request and the
5
+ underlying LLM and engages every Caudate capability:
6
+
7
+ 1. **State capture** — recent message text + extracted images + mood
8
+ 2. **Prediction** — tool / tier / think / value head outputs
9
+ 3. **Vocab growth** — registers any new Claude-Code tools
10
+ 4. **Hint injection** — at WHISPER+, prepends Caudate's hint to system
11
+ 5. **Tier override** — at ADVISOR+, her prediction wins (already wired
12
+ via Router.set_caudate; we just call into it)
13
+ 6. **Thinking gate** — at CONTROLLER+, modulates request shape
14
+ 7. **Reward derivation** — response shape → heuristic reward
15
+ 8. **Episode storage** — every call lands in episodic memory
16
+ 9. **Replay + autotrain** — observer accumulates samples, retrains in BG
17
+ 10. **Cleanup** — temp images deleted after the call
18
+
19
+ Per-request hot path keeps the latency cost small; the heavy stuff
20
+ (vision encode, episodic write) happens off the critical path where
21
+ possible.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import asyncio
27
+ import base64
28
+ import logging
29
+ import os
30
+ import re
31
+ import tempfile
32
+ import time
33
+ import uuid
34
+ from contextlib import contextmanager
35
+ from typing import Any
36
+
37
# Module-scoped logger; every diagnostic in this file goes through it.
logger = logging.getLogger(__name__)


# Upper bound on the number of images materialized for a single request.
# The vision encoders (CLIP/InternVL2) are GPU-bound, so an unbounded
# batch (say 50 images) would stall the turn.
_MAX_IMAGES_PER_TURN = 4
43
+
44
+
45
class CaudateMiddleware:
    """Wraps an LLM call with Caudate's full prediction/learning loop.

    A turn is opened with ``begin_turn`` (which returns a turn context),
    fed through the ``maybe_*`` / ``observe_*`` hooks, and closed with
    ``end_turn``.  When the wrapped agent has no ``caudate`` attribute the
    prediction hooks are no-ops, but plain bookkeeping on the context
    (reply text, tools used) still happens.
    """

    def __init__(self, agent: Any):
        self.agent = agent
        # None when the agent carries no Caudate; every hook checks this.
        self.caudate = getattr(agent, "caudate", None)

    # ------------------------------------------------------------------
    # Public hooks called from anthropic_compat
    # ------------------------------------------------------------------

    def has_caudate(self) -> bool:
        """Return True when the wrapped agent exposes a Caudate instance."""
        return self.caudate is not None

    def begin_turn(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]] | None,
        model_source: str = "unknown",
    ) -> "_TurnContext":
        """Open a turn-context that will be passed back at end_turn().

        - Builds the textual state Caudate needs.
        - Materializes any base64 images to temp files for vision encoding.
        - Calls caudate.on_turn_start to get a Prediction.
        - Registers Claude Code's tool names in Caudate's vocab.
        - Tags the resulting sample with `model_source` so future
          architectural phases can branch per teacher model
          (CAUDATE_EVOLUTION.md, Phase 1).

        Returns the context even when Caudate is absent or the prediction
        call fails; in those cases ``ctx.prediction`` stays ``None``.
        """
        ctx = _TurnContext(self.agent, messages)
        ctx.model_source = model_source
        if self.caudate is None:
            return ctx

        # Grow vocab with whatever tools Claude Code passed
        if tools:
            for t in tools:
                fn = t.get("function") or t  # accept either shape
                name = fn.get("name") or t.get("name")
                if name:
                    # NOTE(review): bare attribute reads kept from the
                    # original ("touch to keep it alive"); presumably these
                    # are lazily-initialized attributes — confirm.
                    self.caudate.replay
                    self.caudate.scorer
                    # ToolVocab.add is idempotent
                    try:
                        self.caudate.advisor.vocab.add(name)
                    except Exception as e:
                        # Best-effort: a vocab failure must not break the
                        # request, but it should be visible when debugging.
                        logger.debug(f"Caudate vocab.add({name!r}) failed: {e}")

        # Build the textual state from recent messages
        recent_text = _flatten_messages_to_text(
            messages, limit=self.caudate.cfg.msg_window
        )

        # Extract images; the paths are stored on the context so that
        # end_turn() can delete them.
        image_paths = _extract_images_to_temp(messages, _MAX_IMAGES_PER_TURN)
        ctx.temp_image_paths = image_paths

        # Mood
        mood = _read_mood(self.agent)

        # Predict (logs prediction internally, also caches as last_prediction)
        try:
            pred = self.caudate.on_turn_start(
                recent_messages=recent_text,
                mood=mood,
                image_paths=image_paths,
                model_source=model_source,
            )
            ctx.prediction = pred
        except Exception as e:
            logger.debug(f"Caudate.on_turn_start failed: {e}")

        return ctx

    def maybe_inject_hint(
        self,
        messages: list[dict[str, Any]],
        ctx: "_TurnContext",
    ) -> list[dict[str, Any]]:
        """At WHISPER+, prepend Caudate's hint to the system message.

        Returns ``messages`` unchanged (same object) when there is no
        Caudate, no prediction, whispering is not allowed, or the tool
        confidence is below ``cfg.advisor_min_confidence``.  Otherwise
        returns a new list whose first entry is a system message ending
        with the hint block; the input list is not mutated.
        """
        if self.caudate is None or ctx.prediction is None:
            return messages
        if not self.caudate.can_whisper():
            return messages
        if ctx.prediction.tool_confidence < self.caudate.cfg.advisor_min_confidence:
            return messages

        hint = _build_hint_block(ctx.prediction, self.caudate.policy.level.label)
        # Find or create system message
        out = list(messages)
        if out and out[0].get("role") == "system":
            head = out[0].get("content", "")
            if isinstance(head, list):
                # Block-style content: keep only the text of dict blocks.
                head = " ".join(b.get("text", "") for b in head if isinstance(b, dict))
            out[0] = {"role": "system", "content": f"{head}\n\n{hint}"}
        else:
            out = [{"role": "system", "content": hint}, *out]
        return out

    def maybe_disable_thinking(self, ctx: "_TurnContext") -> bool:
        """CONTROLLER+: returns True if thinking should be DISABLED for this call."""
        if self.caudate is None or ctx.prediction is None:
            return False
        if not self.caudate.can_control():
            return False
        return ctx.prediction.think < 0.3

    def observe_response_text(self, ctx: "_TurnContext", text: str) -> None:
        """Record the visible text of the assistant reply (for episode)."""
        ctx.response_text = (ctx.response_text or "") + (text or "")

    def observe_thinking(self, ctx: "_TurnContext", text: str) -> None:
        """Record reasoning-channel text. Thinking is valuable signal for
        tool inference even if it never reaches visible content."""
        ctx.thinking_text = (ctx.thinking_text or "") + (text or "")

    def observe_tool_use(self, ctx: "_TurnContext", tool_name: str) -> None:
        """One tool call emitted by the LLM.

        The tool is always recorded on the context, because reward
        derivation and the episode write in end_turn() read
        ``ctx.tools_used`` whether or not Caudate is present.  Caudate is
        only notified when it exists.
        """
        if self.caudate is not None:
            try:
                self.caudate.on_tool_use(tool_name)
            except Exception as e:
                logger.debug(f"Caudate.on_tool_use failed: {e}")
        ctx.tools_used.append(tool_name)

    def observe_arbitration(
        self,
        ctx: "_TurnContext",
        fast_text: str,
        slow_text: str,
        fast_score: float,
        slow_score: float,
        winner: str,
        fast_model: str = "",
        slow_model: str = "",
    ) -> None:
        """Record a dual-brain arbitration event on this turn.

        Both drafts + their heuristic scores are stored on the turn
        context; end_turn writes them into the episodic record so
        later we can train a preference head on (state, draft_a,
        draft_b, picked) tuples. This is the data substrate for
        Phase 4 of CAUDATE_EVOLUTION.md (the conductor).

        Draft texts are truncated to 2000 characters; a ``None`` draft is
        stored as an empty string.
        """
        ctx.arbitration = {
            "fast": {"text": (fast_text or "")[:2000], "score": fast_score, "model": fast_model},
            "slow": {"text": (slow_text or "")[:2000], "score": slow_score, "model": slow_model},
            "winner": winner,
        }
        try:
            logger.info(
                f"[arbitration] fast({fast_model})={fast_score:.2f} "
                f"slow({slow_model})={slow_score:.2f} → {winner}"
            )
        except Exception:
            # Formatting a non-numeric score must not break the turn.
            pass

    def end_turn(self, ctx: "_TurnContext", error: bool = False) -> None:
        """Close the turn — derive reward, push samples, write episode.

        Special case: if the LLM didn't actually call a tool but its
        text indicates an intended tool ("I'll use Bash to..."), record
        the *intended* tool as the target with a low reward. This way
        Caudate learns the corrective action ("for this prompt, the
        right tool was Bash even though gemma4 stalled") rather than
        mimicking the failure mode.

        Never raises for failures in the learning path (they are logged at
        debug level); temp images recorded on the context are always
        deleted.
        """
        try:
            # Stalling rescue: infer the *intended* tool from three
            # signals, in priority order:
            #   1. visible response text  (model said "I'll use Bash")
            #   2. thinking-channel text  (model reasoned "use Bash")
            #   3. user's prompt itself   (user asked "what's in folder?"
            #      which clearly maps to Bash regardless of model's reply)
            # The user-prompt fallback catches cases where the model
            # refused entirely ("I can't see your files") — Caudate
            # should still learn that the right tool was Bash, with a
            # low reward signaling the model failed.
            inferred_tool: str | None = None
            if (not ctx.tools_used
                    and self.caudate is not None
                    and self.caudate._pending is not None):
                combined_text = (
                    (ctx.response_text or "") + "\n"
                    + (ctx.thinking_text or "")
                )
                if combined_text.strip():
                    inferred_tool = _infer_intended_tool(combined_text)
                if inferred_tool is None:
                    user_text = _last_user_text(ctx.messages)
                    if user_text:
                        inferred_tool = _infer_intended_tool(user_text)
                if inferred_tool:
                    # Inject into observer's pending state so the sample
                    # is built with this target instead of '<none>'.
                    self.caudate._pending.chosen_tools.append(inferred_tool)
                    ctx.tools_used.append(inferred_tool)
                    ctx.inferred_from_stall = True

            reward = _derive_reward(ctx, error)
            if self.caudate is not None:
                # Pass turn-outcome signals so observer can label the
                # Tier 1 / Tier 2 heads (refusal, code, stall, latency,
                # difficulty, etc.) from what actually happened.
                elapsed_s = max(0.0, time.time() - ctx.started_at)
                self.caudate.on_turn_end(
                    reward=reward,
                    response_text=ctx.response_text or "",
                    inferred_from_stall=getattr(ctx, "inferred_from_stall", False),
                    elapsed_s=elapsed_s,
                    completion_tokens=getattr(ctx, "completion_tokens", None),
                )
            self._write_episode(ctx, reward)

            if inferred_tool:
                logger.info(
                    f"Caudate stall-rescue: inferred tool={inferred_tool!r} "
                    f"from stalling text, reward={reward:.2f}"
                )
        except Exception as e:
            logger.debug(f"Caudate.end_turn failed: {e}")
        finally:
            # Best-effort cleanup of the temp images created in begin_turn.
            for p in ctx.temp_image_paths:
                try:
                    os.unlink(p)
                except OSError:
                    pass

    # ------------------------------------------------------------------

    def _write_episode(self, ctx: "_TurnContext", reward: float) -> None:
        """Land each turn in episodic memory so the meta-learner can use it
        and the consolidator picks it up at training time.

        One episode is stored per entry in ``ctx.tools_used`` (or a single
        "<reply>" episode when no tool was used).  If the turn carries an
        arbitration record it is additionally appended to
        ``data/nn/arbitrations.jsonl`` (path relative to the current
        working directory).  Both writes are best-effort.
        """
        episodic = getattr(getattr(self.agent, "loop", None), "episodic", None)
        if episodic is None:
            return
        try:
            from core.schemas import Episode, ToolResult, ToolResultStatus
            user_text = _last_user_text(ctx.messages)[:400]
            for tool in ctx.tools_used or ["<reply>"]:
                ep = Episode(
                    goal_id="claude-code",
                    task_id=str(uuid.uuid4()),
                    action=user_text or "(no user text)",
                    tool_name=(tool if tool != "<reply>" else None),
                    tool_args={},
                    result=ToolResult(
                        tool_name=(tool if tool != "<reply>" else "Respond"),
                        status=(ToolResultStatus.SUCCESS if reward > 0.5
                                else ToolResultStatus.ERROR),
                        output=(ctx.response_text or "")[:400],
                    ),
                )
                episodic.store(ep)
        except Exception as e:
            logger.debug(f"episode write failed: {e}")

        # Dual-brain arbitration record — append to a dedicated JSONL
        # so a future preference-learning trainer can pick it up. We
        # keep it separate from the normal replay buffer because the
        # schema is different (carries TWO drafts + scores + winner,
        # not a single target).
        if ctx.arbitration is not None:
            try:
                import json as _json, time as _time
                from pathlib import Path as _P
                p = _P("data/nn/arbitrations.jsonl")
                p.parent.mkdir(parents=True, exist_ok=True)
                with p.open("a", encoding="utf-8") as f:
                    f.write(_json.dumps({
                        "ts": _time.time(),
                        "user_text": _last_user_text(ctx.messages)[:600],
                        "model_source": ctx.model_source,
                        "reward": reward,
                        "arbitration": ctx.arbitration,
                    }) + "\n")
            except Exception as e:
                logger.debug(f"arbitration write failed: {e}")
319
+
320
+
321
+ # ---- TurnContext ---------------------------------------------------
322
+
323
+
324
class _TurnContext:
    """Lives across a single /v1/messages call. Carries Caudate's
    prediction and the bookkeeping needed to score it at the end."""

    def __init__(self, agent: Any, messages: list[dict[str, Any]]):
        self.agent = agent
        self.messages = messages
        # Filled in by begin_turn() when Caudate produced a prediction.
        self.prediction = None
        # Temp files created for vision encoding; deleted in end_turn().
        self.temp_image_paths: list[str] = []
        # Tool names observed (or inferred) during this turn, in order.
        self.tools_used: list[str] = []
        self.response_text: str = ""
        # Thinking-channel text (gemma4 / kimi reasoning). Counted as
        # signal for tool inference but NOT as the visible reply.
        self.thinking_text: str = ""
        # Wall-clock start; end_turn() subtracts it from time.time().
        self.started_at: float = time.time()
        # True when the LLM stalled (text-only, "I will use X..." but no
        # actual tool_use emitted) and we inferred the intended tool from
        # its text. This flag tunes the reward.
        self.inferred_from_stall: bool = False
        # Phase 1 of evolution roadmap: which model produced this turn's
        # response. Set by begin_turn(); flows into ConversationSample.
        self.model_source: str = "unknown"
        # Dual-brain arbitration record (Phase 4 / pattern 2). When the
        # arbiter ran, this captures both drafts + scores so end_turn
        # can log them for preference-learning training data.
        # Shape: {"fast": {text, score, model}, "slow": {text, score, model},
        #         "winner": <winner label passed by the caller>}
        self.arbitration: dict[str, Any] | None = None
        # Completion token count for the reply, if the caller sets it.
        # end_turn() forwards this to Caudate; None means "not reported".
        self.completion_tokens: int | None = None
352
+
353
+
354
+ # ---- Helpers --------------------------------------------------------
355
+
356
+
357
def _flatten_messages_to_text(messages: list[dict[str, Any]], limit: int) -> list[str]:
    """Render the recent conversation as plain text for Caudate's encoder.

    Important: do NOT silently drop tool_calls / tool_use blocks. They
    carry the structured intent of the previous turn (e.g. the question
    that AskUserQuestion was asking). Without them, Caudate sees only
    the user's picked answer with no idea what was being asked.

    Args:
        messages: Chat messages; ``content`` may be a string or a list of
            typed blocks, and OpenAI-shape messages may carry a sibling
            ``tool_calls`` list.
        limit: How many trailing messages to render. A non-positive limit
            yields an empty list.

    Returns:
        One ``"<role>: <text>"`` string per non-empty message, with the
        text truncated to 400 characters.
    """
    import json as _json

    # Guard explicitly: messages[-0:] is the WHOLE list, not an empty one.
    if limit <= 0:
        return []

    out: list[str] = []
    for m in messages[-limit:]:
        role = m.get("role", "?")
        c = m.get("content")
        if isinstance(c, list):
            chunks: list[str] = []
            for b in c:
                if not isinstance(b, dict):
                    continue
                btype = b.get("type")
                if btype == "text":
                    block_text = b.get("text")
                    if isinstance(block_text, str):
                        chunks.append(block_text)
                elif btype in ("image_url", "image"):
                    # Both the OpenAI ("image_url") and Anthropic ("image")
                    # block shapes are images; the extractor handles both,
                    # so the text rendering marks both.
                    chunks.append("[image]")
                elif btype == "tool_use":
                    name = b.get("name", "")
                    inp = b.get("input") or {}
                    try:
                        inp_text = _json.dumps(inp, ensure_ascii=False)[:400]
                    except Exception:
                        # Non-JSON-serializable input: fall back to repr-ish text.
                        inp_text = str(inp)[:400]
                    chunks.append(f"[tool_use {name}({inp_text})]")
                elif btype == "tool_result":
                    rc = b.get("content")
                    if isinstance(rc, list):
                        rc = " ".join(
                            x.get("text", "") for x in rc
                            if isinstance(x, dict) and x.get("type") == "text"
                        )
                    rc_text = "" if rc is None else str(rc)
                    chunks.append(f"[tool_result {rc_text[:200]}]")
                elif btype == "thinking":
                    thinking = str(b.get("thinking") or "")
                    chunks.append(f"[thinking {thinking[:200]}]")
            text = " ".join(p for p in chunks if p)
        else:
            text = str(c) if c else ""
        # OpenAI-shape assistant messages put tool calls in a sibling
        # field (m["tool_calls"]) rather than inside content blocks.
        # Render them so Caudate sees the structured intent.
        for tc in m.get("tool_calls") or []:
            if not isinstance(tc, dict):
                continue
            fn = tc.get("function") or {}
            name = fn.get("name", "")
            args = fn.get("arguments") or ""
            text = f"{text} [tool_call {name}({str(args)[:400]})]".strip()
        if text:
            out.append(f"{role}: {text[:400]}")
    return out
413
+
414
+
415
def _write_temp_image(raw: bytes, media: Any) -> str | None:
    """Write *raw* to a new ``caudate-img-*`` temp file; return its path or None."""
    if not raw:
        # An empty payload cannot be a decodable image; skip it.
        return None
    # The media subtype comes from the request, so keep only a short
    # alphanumeric extension ("svg+xml" -> "svg"); fall back to "png".
    subtype = media if isinstance(media, str) else ""
    ext = re.sub(r"[^A-Za-z0-9]", "", subtype.split("+")[0])[:8] or "png"
    try:
        fd, path = tempfile.mkstemp(suffix=f".{ext}", prefix="caudate-img-")
    except OSError:
        return None
    try:
        # fdopen takes ownership of fd and closes it on exit.
        with os.fdopen(fd, "wb") as fh:
            fh.write(raw)
    except OSError:
        # Do not leave a half-written file that no caller knows about.
        try:
            os.unlink(path)
        except OSError:
            pass
        return None
    return path


def _extract_images_to_temp(
    messages: list[dict[str, Any]], cap: int,
) -> list[str]:
    """Pull base64 images out of message content blocks, write to temp files.

    Caudate's vision encoder takes file paths, not raw bytes. So we
    materialize base64 ``image`` blocks and ``image_url`` blocks carrying
    ``data:`` URLs to short-lived tempfiles and return the paths. Caller
    deletes them after the turn.

    Args:
        messages: Chat messages; only list-valued ``content`` is scanned.
        cap: Maximum number of files to create.

    Returns:
        Paths of the files written, in message order. Blocks that cannot
        be decoded or written are skipped rather than raising.
    """
    paths: list[str] = []
    for m in messages:
        c = m.get("content")
        if not isinstance(c, list):
            continue
        for b in c:
            if not isinstance(b, dict):
                continue
            if len(paths) >= cap:
                return paths

            media: Any = None
            payload: Any = None
            btype = b.get("type")
            if btype == "image":
                # Anthropic shape: {"source": {"type": "base64", ...}}
                src = b.get("source") or {}
                if not isinstance(src, dict) or src.get("type") != "base64":
                    continue
                media_type = src.get("media_type") or "image/png"
                media = media_type.split("/")[-1] if isinstance(media_type, str) else None
                payload = src.get("data", "")
            elif btype == "image_url":
                # OpenAI shape: {"image_url": {"url": "data:..."}}; a bare
                # string in place of the dict is tolerated as well.
                holder = b.get("image_url")
                url = holder.get("url") if isinstance(holder, dict) else holder
                if not isinstance(url, str) or not url.startswith("data:"):
                    continue
                head, _, payload = url.partition(",")
                mime = head[len("data:"):].split(";", 1)[0]
                media = mime.rpartition("/")[2]
            else:
                continue

            try:
                raw = base64.b64decode(payload)
            except (ValueError, TypeError):
                # binascii.Error is a ValueError; TypeError covers None etc.
                continue
            path = _write_temp_image(raw, media)
            if path is not None:
                paths.append(path)
    return paths
465
+
466
+
467
def _read_mood(agent: Any) -> list[float]:
    """Return four mood floats from the agent's personality engine.

    The order is confidence, curiosity, frustration, satisfaction. Any
    field missing on the mood object defaults to 0.5; if the agent has no
    personality, or reading the mood fails for any reason, all four
    values are 0.5.
    """
    neutral = [0.5, 0.5, 0.5, 0.5]
    personality = getattr(agent, "personality", None)
    if personality is None:
        return neutral
    # MoodState fields, in the order the caller expects them.
    fields = ("confidence", "curiosity", "frustration", "satisfaction")
    try:
        mood = personality.mood
        return [float(getattr(mood, field, 0.5)) for field in fields]
    except Exception:
        return neutral
483
+
484
+
485
def _last_user_text(messages: list[dict[str, Any]]) -> str:
    """Return the text of the most recent user message that has any.

    Walks the messages newest-first. A string ``content`` is returned
    as-is; for block-list content the first ``text`` block's text is
    returned. User messages with no text block are skipped in favour of
    older ones. Returns "" when nothing qualifies.
    """
    user_turns = (msg for msg in reversed(messages) if msg.get("role") == "user")
    for turn in user_turns:
        body = turn.get("content")
        if isinstance(body, str):
            return body
        if not isinstance(body, list):
            continue
        first_text_block = next(
            (blk for blk in body
             if isinstance(blk, dict) and blk.get("type") == "text"),
            None,
        )
        if first_text_block is not None:
            return first_text_block.get("text", "")
    return ""
496
+
497
+
498
def _build_hint_block(pred: Any, level: str) -> str:
    """Format Caudate's prediction as a text block for the system prompt.

    The heading depends on the policy level ("controller", "advisor", or
    anything else, which is treated as whispering). The body lists the
    predicted tool, routing tier, thinking recommendation and expected
    reward read from ``pred``.
    """
    headings = {
        "controller": "## Caudate (your trained action-selection net) recommends:",
        "advisor": "## Caudate (advisor) suggests, based on prior sessions:",
    }
    preface = headings.get(
        level, "## Caudate (whispering — still learning) thinks you might want:"
    )
    # Thinking is labelled helpful from a probability of 0.5 upwards.
    think_label = "helpful" if pred.think >= 0.5 else "not needed"
    lines = [
        preface,
        f" next tool: {pred.tool} (confidence {pred.tool_confidence:.2f})",
        f" routing: {pred.tier} (confidence {pred.tier_confidence:.2f})",
        f" thinking: {think_label} (p={pred.think:.2f})",
        f" expected reward: {pred.value:.2f}",
        f"You may disagree — Caudate ({level}) hasn't seen this exact context, "
        f"only patterns from past turns.",
    ]
    return "\n".join(lines)
515
+
516
+
517
+ # ---- Stalling detection + intended-tool inference ------------------
518
+ # These power Caudate's "rescue" learning: when an LLM (gemma4 in
519
+ # particular) generates intent without action, we extract the action it
520
+ # *should* have taken so Caudate's next prediction is corrective rather
521
+ # than mimetic.
522
+
523
# Regex that catches "I will / I'll / Let me / I should / I need to / I
# am going to / first I" preambles followed by whitespace. The leading
# \b anchors at a word boundary, NOT at the start of a sentence, and
# nothing checks that an action verb follows — so harmless phrases such
# as "I'll be honest" match too. Treat a hit as a weak signal only.
_STALLING_PHRASES = re.compile(
    r"\b(?:I\s+will|I'll|let\s+me|I\s+should|I\s+need\s+to|"
    r"I\s+am\s+going\s+to|first(?:,)?\s+I'?l?l?)\s+",
    re.IGNORECASE,
)
533
+
534
# Mapping: text fragments → Cognos tool name (case-insensitive).
# Cognos tools: Bash, Read, Write, Edit, Glob, Grep, WebSearch,
# WebFetch, PythonExec, Think, Respond, Agent, Draw.
# Order matters: the first pattern that matches wins.
_TOOL_HINTS: list[tuple[re.Pattern, str]] = [
    # `ls` / `cat` must start a word ((?<!\w)); without that guard any word
    # ending in "ls" or "cat" ("tools", "utils", "concat") mapped to Bash.
    (re.compile(r"\b(?:use\s+)?bash\b|(?<!\w)`?(?:ls|cat)\b|shell\s+command", re.I), "Bash"),
    (re.compile(r"\b(?:use\s+)?glob\b|find\s+files?\s+match|file\s+pattern", re.I), "Glob"),
    # "search the web" is excluded here so it can reach the WebSearch
    # pattern further down instead of being claimed by Grep.
    (re.compile(r"\b(?:use\s+)?grep\b|search\s+(?:for|in|the(?!\s+web\b))|search\s+code", re.I), "Grep"),
    (re.compile(r"\b(?:use\s+)?read\b|read\s+(?:the\s+)?file|inspect\s+file", re.I), "Read"),
    (re.compile(r"\b(?:use\s+)?write\b|write\s+(?:to|a)\s+file|create\s+file", re.I), "Write"),
    (re.compile(r"\b(?:use\s+)?edit\b|modify\s+(?:the\s+)?file|change\s+code", re.I), "Edit"),
    (re.compile(r"\b(?:use\s+)?websearch\b|search\s+(?:the\s+)?web|google", re.I), "WebSearch"),
    (re.compile(r"\b(?:use\s+)?webfetch\b|fetch\s+(?:the\s+)?(?:url|page)", re.I), "WebFetch"),
    (re.compile(r"\b(?:use\s+)?python(?:exec)?\b|run\s+python|execute\s+python", re.I), "PythonExec"),
    (re.compile(r"\b(?:use\s+)?(?:the\s+)?think\b|think\s+(?:about|step)", re.I), "Think"),
    (re.compile(r"\b(?:use\s+)?(?:agent|subagent)\b|spawn\s+(?:an?\s+)?agent", re.I), "Agent"),
    (re.compile(r"\b(?:use\s+)?draw(?:ing)?\b|generate\s+(?:an?\s+)?(?:image|picture|diagram)|create\s+(?:an?\s+)?(?:image|picture)", re.I), "Draw"),
    # Generic exploratory prompts → Bash. These catch user-side language
    # that maps clearly to a filesystem-inspection turn even when the
    # model didn't suggest a tool.
    (re.compile(r"list\s+(?:the\s+)?files?|"
                r"what'?s\s+in\s+(?:\w+\s+)?(?:dir|folder|directory)|"
                r"what\s+is\s+in\s+(?:\w+\s+)?(?:dir|folder|directory)|"
                r"show\s+(?:me\s+)?(?:the\s+)?files?|"
                r"(?:check|see)\s+(?:the\s+)?(?:dir|folder|directory|files?)|"
                r"(?:what'?s|what\s+is|tell\s+me\s+what'?s|tell\s+me\s+what\s+is)\s+"
                r"(?:working|happening|going\s+on|here)\s+"
                r"(?:in\s+(?:\w+\s+)?(?:dir|folder|directory|repo|project)|here)", re.I), "Bash"),
]
562
+
563
+
564
def _is_stalling(text: str) -> bool:
    """Report whether *text* contains an "I will X"-style preamble.

    Returns False for empty text; otherwise True exactly when
    ``_STALLING_PHRASES`` finds a match anywhere in the text.
    """
    if not text:
        return False
    return _STALLING_PHRASES.search(text) is not None
567
+
568
+
569
def _infer_intended_tool(text: str) -> str | None:
    """Map free text to the Cognos tool it describes, if any.

    The text may be stalling ("I will use Bash"), refusing ("I can't run
    code; try `ls`"), or advising ("you should grep for that"). The
    entries of ``_TOOL_HINTS`` are tried in order and the tool of the
    first matching pattern is returned; None means no clear intent (or
    empty text).

    Used to *rescue* Caudate's training data: rather than recording
    target_tool='<none>' whenever the model didn't call a tool, the tool
    the response *describes* is recorded, so Caudate learns the
    corrective action rather than the failure pattern.
    """
    if not text:
        return None
    matches = (tool for pattern, tool in _TOOL_HINTS if pattern.search(text))
    return next(matches, None)
586
+
587
+
588
def _derive_reward(ctx: _TurnContext, error: bool) -> float:
    """Heuristic reward for the finished turn, used as a training signal.

    Checked in this order:

    - upstream LLM error                      → 0.20 (clearly bad)
    - tool inferred from stalling text        → 0.25 (model failed;
                                                      rescue training)
    - real tool call emitted                  → 0.70 (engaged)
    - stalling text with no tool inferred     → 0.30 (vague intent)
    - text-only reply, no stalling            → 0.60 (legitimate answer)
    - empty reply                             → 0.40 (gave up)

    The 0.25 for stalling-with-inference is deliberate: it is meant to
    let the tool-prediction head learn the right tool while the
    value-prediction head learns that the turn went poorly.
    """
    if error:
        return 0.20
    # The inferred case must be tested before tools_used, because the
    # inferred tool is also appended to tools_used by the caller.
    if ctx.inferred_from_stall:
        return 0.25
    if ctx.tools_used:
        return 0.70

    reply = ctx.response_text or ""
    if _is_stalling(reply):
        # Stalling text but no inference matched (vague intent like
        # "I'll think about it"). Penalize but no rescue available.
        return 0.30
    return 0.60 if reply.strip() else 0.40