caudate-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. api/__init__.py +5 -0
  2. api/anthropic_compat.py +1518 -0
  3. api/artifact_viewer.py +366 -0
  4. api/caudate_middleware.py +618 -0
  5. api/forge_bootstrapper_routes.py +377 -0
  6. api/forge_routes.py +630 -0
  7. api/forge_system_routes.py +294 -0
  8. api/openai_compat.py +1993 -0
  9. api/server.py +667 -0
  10. api/storyboard_page.py +677 -0
  11. caudate_cli-0.1.0.dist-info/METADATA +354 -0
  12. caudate_cli-0.1.0.dist-info/RECORD +153 -0
  13. caudate_cli-0.1.0.dist-info/WHEEL +5 -0
  14. caudate_cli-0.1.0.dist-info/entry_points.txt +2 -0
  15. caudate_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  16. caudate_cli-0.1.0.dist-info/top_level.txt +14 -0
  17. cognos_mcp/__init__.py +4 -0
  18. cognos_mcp/bridge.py +41 -0
  19. cognos_mcp/client.py +70 -0
  20. cognos_mcp/config.py +49 -0
  21. cognos_mcp/server.py +66 -0
  22. config.py +82 -0
  23. core/__init__.py +0 -0
  24. core/agent.py +468 -0
  25. core/agentic_loop.py +731 -0
  26. core/anthropic_auth.py +91 -0
  27. core/background.py +113 -0
  28. core/banner.py +134 -0
  29. core/bootstrap.py +292 -0
  30. core/citations.py +131 -0
  31. core/compaction.py +109 -0
  32. core/constitution.py +198 -0
  33. core/diff_viewer.py +87 -0
  34. core/export.py +85 -0
  35. core/file_refs.py +119 -0
  36. core/files.py +199 -0
  37. core/hooks.py +209 -0
  38. core/image.py +599 -0
  39. core/input.py +91 -0
  40. core/loop.py +238 -0
  41. core/memory_md.py +147 -0
  42. core/notifications.py +99 -0
  43. core/ownership.py +181 -0
  44. core/paste.py +81 -0
  45. core/permissions.py +210 -0
  46. core/plan_mode.py +215 -0
  47. core/sandbox_prompt.py +185 -0
  48. core/scheduler.py +195 -0
  49. core/schemas.py +202 -0
  50. core/session.py +90 -0
  51. core/settings.py +132 -0
  52. core/skills.py +398 -0
  53. core/slash_commands.py +977 -0
  54. core/statusline.py +61 -0
  55. core/subagent.py +300 -0
  56. core/thinking.py +50 -0
  57. core/updater.py +122 -0
  58. core/usage.py +109 -0
  59. core/worktree.py +93 -0
  60. execution/__init__.py +0 -0
  61. execution/executor.py +329 -0
  62. execution/plugins.py +108 -0
  63. execution/tools/__init__.py +0 -0
  64. execution/tools/agent_tool.py +107 -0
  65. execution/tools/agentic_tool.py +297 -0
  66. execution/tools/artifact_tool.py +191 -0
  67. execution/tools/ask_user_question_tool.py +137 -0
  68. execution/tools/base.py +81 -0
  69. execution/tools/calculator_tool.py +137 -0
  70. execution/tools/cognos_card_tool.py +124 -0
  71. execution/tools/cron_tool.py +215 -0
  72. execution/tools/datetime_tool.py +215 -0
  73. execution/tools/describe_image_tool.py +161 -0
  74. execution/tools/draw_tool.py +164 -0
  75. execution/tools/edit_image_tool.py +262 -0
  76. execution/tools/edit_tool.py +245 -0
  77. execution/tools/file_tool.py +90 -0
  78. execution/tools/find_anywhere_tool.py +255 -0
  79. execution/tools/forge_feature_tools.py +377 -0
  80. execution/tools/glob_tool.py +59 -0
  81. execution/tools/grep_tool.py +89 -0
  82. execution/tools/http_request_tool.py +224 -0
  83. execution/tools/load_skill_tool.py +104 -0
  84. execution/tools/longcat_avatar_tool.py +384 -0
  85. execution/tools/mcp_tool.py +100 -0
  86. execution/tools/notebook_tool.py +279 -0
  87. execution/tools/openapi_tool.py +440 -0
  88. execution/tools/plan_mode_tool.py +95 -0
  89. execution/tools/push_notification_tool.py +157 -0
  90. execution/tools/python_tool.py +61 -0
  91. execution/tools/respond_tool.py +40 -0
  92. execution/tools/sandbox_tool.py +378 -0
  93. execution/tools/search_tool.py +153 -0
  94. execution/tools/semantic_search_tool.py +106 -0
  95. execution/tools/shell_tool.py +283 -0
  96. execution/tools/speak_tool.py +134 -0
  97. execution/tools/storyboard_tool.py +727 -0
  98. execution/tools/system_info_tool.py +212 -0
  99. execution/tools/task_tool.py +323 -0
  100. execution/tools/think_tool.py +49 -0
  101. execution/tools/transcribe_audio_tool.py +86 -0
  102. execution/tools/update_memory_tool.py +92 -0
  103. execution/tools/web_fetch_tool.py +82 -0
  104. execution/tools/worktree_tool.py +174 -0
  105. llm/__init__.py +0 -0
  106. llm/fallback.py +116 -0
  107. llm/models.py +320 -0
  108. llm/provider.py +1356 -0
  109. llm/router.py +373 -0
  110. main.py +1889 -0
  111. memory/__init__.py +0 -0
  112. memory/episodic.py +99 -0
  113. memory/procedural.py +145 -0
  114. memory/semantic.py +71 -0
  115. memory/working.py +64 -0
  116. nn/__init__.py +43 -0
  117. nn/auto_evolve.py +245 -0
  118. nn/caudate.py +136 -0
  119. nn/config.py +141 -0
  120. nn/consolidator.py +81 -0
  121. nn/data.py +1635 -0
  122. nn/encoder.py +258 -0
  123. nn/forge_advisor.py +303 -0
  124. nn/format.py +235 -0
  125. nn/heads.py +432 -0
  126. nn/observer.py +994 -0
  127. nn/policy.py +214 -0
  128. nn/runtime.py +343 -0
  129. nn/scorer.py +175 -0
  130. nn/trainer.py +515 -0
  131. nn/vision.py +352 -0
  132. personality/__init__.py +23 -0
  133. personality/engine.py +129 -0
  134. personality/identity.py +144 -0
  135. personality/inner_voice.py +100 -0
  136. personality/mood.py +205 -0
  137. planning/__init__.py +0 -0
  138. planning/dev_server.py +221 -0
  139. planning/forge_models.py +718 -0
  140. planning/orchestrator.py +1363 -0
  141. planning/planner.py +451 -0
  142. planning/task_graph.py +61 -0
  143. reflection/__init__.py +0 -0
  144. reflection/meta_learner.py +156 -0
  145. reflection/reflector.py +127 -0
  146. ui/__init__.py +5 -0
  147. ui/display.py +88 -0
  148. voice/__init__.py +0 -0
  149. voice/conversation.py +125 -0
  150. voice/listener.py +111 -0
  151. voice/speaker.py +59 -0
  152. voice/stt.py +126 -0
  153. voice/tts.py +214 -0
api/openai_compat.py ADDED
@@ -0,0 +1,1993 @@
1
+ """OpenAI Chat Completions API compatibility layer.
2
+
3
+ Lets Open WebUI (or any OpenAI-format client) point at Cognos and get
4
+ answers back as if Cognos were OpenAI:
5
+
6
+ incoming /v1/chat/completions (OpenAI schema)
7
+
8
+
9
+ translate to Cognos's internal message format
10
+
11
+
12
+ LLMProvider.chat / .stream (with subscription_auth_scope so the
13
+ web-side OAuth path is available, just like /chat does)
14
+
15
+
16
+ translate response back to OpenAI schema (regular or SSE stream)
17
+
18
+ Caudate observes every turn through the same `CaudateMiddleware` already
19
+ used by `/v1/messages`, so traffic from Open WebUI feeds her training
20
+ corpus identically to traffic from Claude Code or our `/ui` chat.
21
+
22
+ This is the OpenAI-shaped sibling of `api/anthropic_compat.py`.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import asyncio
28
+ import json
29
+ import logging
30
+ import re
31
+ import time
32
+ import uuid
33
+ from typing import Any, AsyncIterator
34
+
35
+ from fastapi import APIRouter, HTTPException, Request
36
+ from fastapi.responses import JSONResponse, StreamingResponse
37
+
38
+ from api.caudate_middleware import CaudateMiddleware
39
+ from core.anthropic_auth import subscription_auth_scope
40
+ from core.schemas import StreamEvent, ToolUseBlock
41
+ from llm.provider import LLMProvider, LLMResponse
42
+
43
+
44
+ # ---- Dual-brain arbitration -----------------------------------------
45
+ # Pattern 2 from the multi-brain options: both system1 and system2 are
46
+ # called in parallel for every cognos-dual-brain turn. Their drafts are
47
+ # compared via a heuristic scorer (Caudate-aware where possible) and
48
+ # the winner is returned to the user. Both drafts are recorded for
49
+ # preference-learning training data — Caudate's substrate to grow into
50
+ # the conductor (Phase 4 of CAUDATE_EVOLUTION.md).
51
+
52
+ # Phrases that strongly suggest a refusal — reduce score for drafts
53
+ # that contain them so the non-refusing draft wins by default.
54
# NOTE: the apostrophe class accepts both the ASCII "'" and the
# typographic U+2019, because LLM output frequently contains the latter
# ("I can’t help ...").
_REFUSAL_RE = re.compile(
    r"\b("
    r"i\s+can['\u2019 ]?t\s+(?:reproduce|share|provide|help|do|assist)|"
    r"i\s+(?:cannot|am\s+not\s+able\s+to|am\s+unable\s+to)|"
    r"i\s+don['\u2019 ]?t\s+have\s+access|"
    r"copyright|"
    r"violates?\s+(?:my\s+)?(?:guidelines|policy|policies|terms)|"
    r"against\s+(?:my\s+)?(?:guidelines|policy)|"
    r"ethical\s+guidelines|"
    r"unable\s+to\s+(?:provide|share|reproduce)"
    r")\b",
    re.IGNORECASE,
)


# Lower-case substrings that mark vague, low-confidence prose. They are
# matched against text whose typographic apostrophes were normalised.
_HEDGE_PHRASES = (
    "i think", "i believe", "maybe", "perhaps", "i'm not sure",
    "i would suggest", "you might want", "it could be", "i guess",
)


def _score_draft(resp: LLMResponse | None) -> float:
    """Heuristic quality score for one draft, clamped to [0, 1].

    Scoring, starting from a 0.50 baseline for any non-empty text:

    * ``-0.40`` when ``_REFUSAL_RE`` matches (refusals should lose).
    * ``-0.10`` when the stripped text is shorter than 40 characters.
    * up to ``+0.15`` for length, growing linearly until 1500 characters.
    * ``+0.10`` when the response carries tool calls.
    * ``+0.05`` for a code fence, ``+0.03`` each for a digit, a slash or
      backslash, and a ``://`` URL marker.
    * ``-0.05`` when any phrase of ``_HEDGE_PHRASES`` occurs.

    Args:
        resp: The draft to score; only ``content``, ``thinking`` and
            ``tool_calls`` are read (the latter two via ``getattr``).

    Returns:
        ``0.0`` for ``None``; ``0.30`` for an empty draft that has
        thinking and ``0.10`` for one that does not; otherwise the
        clamped sum described above.
    """
    if resp is None:
        return 0.0
    text = (resp.content or "").strip()
    if not text:
        return 0.30 if getattr(resp, "thinking", "") else 0.10

    score = 0.50
    # Fold the typographic apostrophe to ASCII so "I’m not sure" is
    # recognised by the plain-ASCII hedge phrases.
    text_lower = text.lower().replace("\u2019", "'")

    # Refusal: strong penalty so a refusing draft cleanly loses.
    if _REFUSAL_RE.search(text):
        score -= 0.40

    # Length: penalise trivially short replies, softly reward substance
    # up to 1500 chars, and give no extra bonus beyond that.
    length = len(text)
    if length < 40:
        score -= 0.10
    score += min(0.15, length / 1500 * 0.15)

    # Tool calls indicate the model engaged with the task structurally.
    if getattr(resp, "tool_calls", None):
        score += 0.10

    # Concrete-content signals, independent of length.
    if "```" in text:  # code block / preformatted
        score += 0.05
    if any(ch.isdigit() for ch in text):  # numbers / data
        score += 0.03
    if "/" in text or "\\" in text:  # paths, filenames
        score += 0.03
    if "://" in text:  # URLs
        score += 0.03

    # Hedge-language penalty: vague prose loses to direct prose of
    # similar length.
    if any(phrase in text_lower for phrase in _HEDGE_PHRASES):
        score -= 0.05

    return max(0.0, min(1.0, score))
126
+
127
+
128
async def _dual_brain_arbitrate(
    *,
    llm: Any,  # DualLLMProvider
    messages: list[dict[str, Any]],
    tools: list[dict[str, Any]] | None,
    max_tokens: int,
    temperature: float | None,
    middleware: CaudateMiddleware | None,
    turn_ctx: Any,
) -> LLMResponse:
    """Query both brains concurrently and return the better-scoring draft.

    ``llm.router.fast`` and ``llm.router.slow`` are each asked for a
    draft with the same arguments. If exactly one call fails, the
    surviving draft is returned without scoring; if both fail, the fast
    provider's exception is re-raised. When both succeed, the drafts
    are scored with ``_score_draft`` and the comparison is reported to
    ``middleware.observe_arbitration`` (best effort) before the winner
    is returned.
    """
    fast_provider, slow_provider = llm.router.fast, llm.router.slow

    async def _call(prov: LLMProvider) -> LLMResponse | Exception:
        # Failures are returned rather than raised so that one provider
        # dying does not discard the other provider's draft.
        try:
            draft = await prov.chat(
                messages=messages,
                tools=tools,
                max_tokens=max_tokens,
                temperature=temperature,
            )
        except Exception as e:
            logger.warning(f"arbitrate call failed for {prov.model}: {e}")
            return e
        return draft

    # Both calls are awaited together, so latency is bounded by the
    # slower provider rather than the sum of the two.
    fast_resp, slow_resp = await asyncio.gather(
        _call(fast_provider),
        _call(slow_provider),
    )

    fast_failed = isinstance(fast_resp, Exception)
    slow_failed = isinstance(slow_resp, Exception)
    if fast_failed and slow_failed:
        # Nothing usable: surface the fast provider's error.
        raise fast_resp  # type: ignore[misc]
    if slow_failed:
        return fast_resp
    if fast_failed:
        return slow_resp

    fast_score = _score_draft(fast_resp)
    slow_score = _score_draft(slow_resp)

    # A gap under 0.02 is treated as a tie and resolved in favour of
    # the fast brain; an exact tie also goes to fast.
    near_tie = abs(fast_score - slow_score) < 0.02
    if near_tie or fast_score >= slow_score:
        winner_resp, winner_label = fast_resp, "fast"
    else:
        winner_resp, winner_label = slow_resp, "slow"

    if middleware is None or turn_ctx is None:
        return winner_resp

    # Record the pair even when the default side wins; a recording
    # failure must never cost the user their answer.
    try:
        middleware.observe_arbitration(
            turn_ctx,
            fast_text=(fast_resp.content or ""),
            slow_text=(slow_resp.content or ""),
            fast_score=fast_score,
            slow_score=slow_score,
            winner=winner_label,
            fast_model=fast_provider.model,
            slow_model=slow_provider.model,
        )
    except Exception as e:
        logger.debug(f"observe_arbitration failed: {e}")

    return winner_resp
206
+
207
+ logger = logging.getLogger(__name__)
208
+
209
+
210
+ # ---- Translation helpers --------------------------------------------
211
+
212
+
213
+ # ---- Server-side agentic loop ---------------------------------------
214
+ # When a `cognos-*` model is requested AND the client did not supply a
215
+ # tool schema (Open WebUI doesn't), we run the tool-calling loop
216
+ # server-side: LLM proposes a tool call → executor runs it → result
217
+ # loops back into the messages → LLM is called again → repeat until
218
+ # the LLM produces final text (no more tool_calls). The user only sees
219
+ # the final text. Caudate observes every iteration through the
220
+ # middleware exactly as if the loop ran in the CLI.
221
+
222
# Upper bound on LLM round-trips for one server-side agentic run.
_AGENTIC_MAX_ITERATIONS = 12

# Model names that switch on the server-side agentic loop. Every other
# name (bare passthrough names) keeps the legacy single-shot behavior.
_AGENTIC_MODELS: frozenset[str] = frozenset(
    (
        "cognos",
        "cognos-fast",
        "cognos-slow",
        "cognos-haiku",
        "cognos-kimi",
        "cognos-dual-brain",
        "cognos-collab",
        "cognos-vision",
        "cognos-strict",
    )
)


def _should_run_agentic(requested_model: str | None,
                        client_supplied_tools: list | None) -> bool:
    """Decide whether the server-side agentic loop handles this request.

    True only when the client sent no tool definitions of its own and
    the requested model, compared case-insensitively, is one of
    ``_AGENTIC_MODELS``.
    """
    if client_supplied_tools or not requested_model:
        return False
    return requested_model.lower() in _AGENTIC_MODELS
244
+
245
+
246
async def _run_agentic_loop(
    *,
    llm: Any,
    executor: Any,
    messages: list[dict[str, Any]],
    middleware: CaudateMiddleware,
    turn_ctx: Any,
    max_tokens: int,
    temperature: float | None,
    caller: str | None = None,
    max_iterations: int | None = None,
) -> LLMResponse:
    """Stateless server-side ReAct loop.

    Calls ``llm.chat`` with the executor's tool definitions (minus the
    console-only ``Respond`` and ``Think`` tools), runs every requested
    tool through ``executor.execute_tool``, appends the results to a
    private copy of ``messages`` and repeats until a response arrives
    with no tool calls.

    Args:
        llm: Object exposing an awaitable ``chat(**kwargs)``.
        executor: Object exposing ``tool_definitions()`` and an
            awaitable ``execute_tool(name, args)``.
        messages: Initial history; copied, never mutated.
        middleware: Receives ``observe_tool_use(turn_ctx, name)`` for
            every executed tool call (best effort).
        turn_ctx: Opaque context forwarded to the middleware.
        max_tokens: Forwarded to ``llm.chat``.
        temperature: Forwarded to ``llm.chat`` only when not ``None``.
        caller: Forwarded to ``llm.chat`` only when not ``None``.
        max_iterations: Cap on LLM round-trips; defaults to
            ``_AGENTIC_MAX_ITERATIONS``.

    Returns:
        The first response without tool calls, or, when the cap is
        reached, the last response received (which still has tool
        calls). If no call was made at all, an empty ``LLMResponse``
        with ``stop_reason="max_iterations"``.
    """
    if max_iterations is None:
        max_iterations = _AGENTIC_MAX_ITERATIONS

    history: list[dict[str, Any]] = list(messages)

    # Respond/Think print to the *server* console, so in the chat path
    # their output would go nowhere visible. The model answers via
    # plain text content instead.
    _CHAT_HIDDEN_TOOLS = {"Respond", "Think"}
    tool_defs = [
        td for td in executor.tool_definitions()
        if (td.get("function", {}).get("name") or td.get("name"))
        not in _CHAT_HIDDEN_TOOLS
    ]

    # Runaway-detector state: one (tool_name, args_fingerprint,
    # was_error) entry per executed tool call, across iterations.
    _RUNAWAY_THRESHOLD = 3
    tool_call_log: list[tuple[str, str, bool]] = []
    nudge_already_injected = False

    def _args_fingerprint(args: dict[str, Any] | None) -> str:
        """Coarse, order-independent summary of the arguments.

        Only the first 60 characters of each value are kept, so calls
        that differ only in the tail of a long argument compare equal.
        """
        if not args:
            return ""
        parts: list[str] = []
        for key in sorted(args.keys()):
            value = args[key]
            sval = value if isinstance(value, str) else json.dumps(value)
            parts.append(f"{key}={sval[:60]}")
        return "|".join(parts)[:200]

    def _looks_like_error(output: str) -> bool:
        """True for empty output or output whose head looks like an error."""
        if not output or not output.strip():
            return True
        head = output.strip()[:200].lower()
        return (head.startswith(("[error]", "error:"))
                or "permission denied" in head
                or "no such file" in head
                or "command not found" in head)

    def _runaway_nudge(recent: list[tuple[str, str, bool]]) -> str | None:
        """Return a nudge message if the recent calls look stuck.

        (A) every recent call used the same tool with the same argument
            fingerprint, even if the calls succeeded;
        (B) every recent call used the same tool and every one errored
            or returned nothing.
        """
        tool_names = {entry[0] for entry in recent}
        fingerprints = {(entry[0], entry[1]) for entry in recent}
        all_errored = all(entry[2] for entry in recent)

        if len(fingerprints) == 1:
            stuck_tool = recent[0][0]
            logger.warning(
                f"agentic loop runaway detected: {stuck_tool} called "
                f"{_RUNAWAY_THRESHOLD}+ times with identical args. "
                f"injecting nudge."
            )
            return (
                f"[system nudge] You've called `{stuck_tool}` with the "
                f"same arguments {_RUNAWAY_THRESHOLD}+ times in a row. "
                f"The result won't change on another call. **Take the "
                f"next action now**:\n"
                f"  (1) if you've been reading/inspecting and saying "
                f"you'll edit, **call the Write or Edit tool now** with "
                f"the actual content, or\n"
                f"  (2) ask me what to do next.\n"
                f"Do not call `{stuck_tool}` with these arguments again."
            )
        if len(tool_names) == 1 and all_errored:
            stuck_tool = next(iter(tool_names))
            logger.warning(
                f"agentic loop runaway detected: {stuck_tool} called "
                f"{_RUNAWAY_THRESHOLD}+ times, all errors. injecting nudge."
            )
            return (
                f"[system nudge] You've called `{stuck_tool}` "
                f"{_RUNAWAY_THRESHOLD}+ times now and every call "
                f"returned an error or empty result. **Stop "
                f"trying variations of the same command.** Either:\n"
                f"  (1) explain to me what you were trying to find "
                f"and ask me to clarify, or\n"
                f"  (2) try a fundamentally different approach "
                f"(a different tool, or step back and reason about "
                f"the original question).\n"
                f"Do not call `{stuck_tool}` again on the next turn."
            )
        return None

    last_resp: LLMResponse | None = None
    for _ in range(max_iterations):
        kwargs: dict[str, Any] = {
            "messages": history,
            "tools": tool_defs,
            "max_tokens": max_tokens,
        }
        if temperature is not None:
            kwargs["temperature"] = temperature
        if caller is not None:
            kwargs["caller"] = caller

        resp = await llm.chat(**kwargs)
        last_resp = resp

        tool_calls = list(resp.tool_calls or [])
        # Resolve each call id exactly once. The assistant message and
        # the matching tool-result message must carry the SAME id, also
        # when the provider returned none and one has to be synthesised.
        call_ids = [
            tc.id or f"call_{uuid.uuid4().hex[:12]}" for tc in tool_calls
        ]

        # Append the assistant turn to history (text + tool_calls).
        assistant_msg: dict[str, Any] = {
            "role": "assistant",
            "content": resp.content or None,
        }
        if tool_calls:
            assistant_msg["tool_calls"] = [
                {
                    "id": call_id,
                    "type": "function",
                    "function": {
                        "name": tc.name,
                        "arguments": json.dumps(tc.input or {}),
                    },
                }
                for call_id, tc in zip(call_ids, tool_calls)
            ]
        history.append(assistant_msg)

        # Done if the model didn't request any tool calls.
        if not tool_calls:
            return resp

        # Execute each tool call and append its result to history.
        for call_id, tc in zip(call_ids, tool_calls):
            try:
                result = await executor.execute_tool(tc.name, tc.input or {})
                output = result.output if hasattr(result, "output") else str(result)
                if hasattr(result, "error") and result.error:
                    output = f"[error] {result.error}"
            except Exception as e:
                # A crashing tool is reported to the model as an error
                # result instead of aborting the whole loop.
                logger.exception(f"tool {tc.name!r} crashed")
                output = f"[error] tool {tc.name} raised: {e}"

            output_text = str(output)
            tool_call_log.append(
                (tc.name, _args_fingerprint(tc.input), _looks_like_error(output_text))
            )

            # Observation is best effort: it must never break the loop,
            # but a failure is logged rather than silently dropped.
            try:
                middleware.observe_tool_use(turn_ctx, tc.name)
            except Exception as e:
                logger.debug(f"observe_tool_use failed: {e}")

            history.append({
                "role": "tool",
                "tool_call_id": call_id,
                "name": tc.name,
                "content": output_text[:8000],  # cap to keep context manageable
            })

        # Check the most recent calls for a stuck loop; the nudge is
        # injected at most once per run.
        if not nudge_already_injected and len(tool_call_log) >= _RUNAWAY_THRESHOLD:
            nudge_text = _runaway_nudge(tool_call_log[-_RUNAWAY_THRESHOLD:])
            if nudge_text:
                history.append({"role": "user", "content": nudge_text})
                nudge_already_injected = True

    # Cap reached without a final text-only response: return the last
    # response received, which still carries tool_calls.
    logger.warning(
        f"_run_agentic_loop hit max_iterations={max_iterations}"
    )
    return last_resp or LLMResponse(content="", stop_reason="max_iterations")
454
+
455
+
456
async def _run_agentic_loop_streaming(
    *,
    llm: Any,
    executor: Any,
    messages: list[dict[str, Any]],
    middleware: CaudateMiddleware,
    turn_ctx: Any,
    max_tokens: int,
    temperature: float | None,
    caller: str | None = None,
    max_iterations: int | None = None,
) -> AsyncIterator[StreamEvent]:
    """Streaming sibling of `_run_agentic_loop`.

    Same ReAct contract: call the LLM with tools, execute any tool
    calls, and loop until the LLM emits text only. Here, however, each
    iteration's ``text_delta`` / ``thinking_delta`` / ``tool_use_end``
    events are forwarded to the consumer as they arrive. Per-iteration
    ``message_stop`` events from ``llm.stream`` are swallowed; exactly
    one ``message_stop`` is emitted when the loop ends.

    Two extra event shapes the consumer should handle:
      - StreamEvent(type="tool_result", tool_name=..., delta=summary,
                    raw={"status": "success"|"error", "output_chars": N})
        Emitted right after each tool finishes; ``summary`` is the
        first 240 characters of the output with newlines flattened.
      - StreamEvent(type="iteration_break")
        Emitted after each iteration that executed tools.

    Args:
        llm: Object exposing an async-iterable ``stream(**kwargs)``.
        executor: Object exposing ``tool_definitions()`` and an
            awaitable ``execute_tool(name, args)``.
        messages: Initial history; copied, never mutated.
        middleware: Receives ``observe_tool_use(turn_ctx, name)`` for
            every executed tool call (best effort).
        turn_ctx: Opaque context forwarded to the middleware.
        max_tokens: Forwarded to ``llm.stream``.
        temperature: Forwarded to ``llm.stream`` only when not ``None``.
        caller: Forwarded to ``llm.stream`` only when not ``None``.
        max_iterations: Cap on LLM round-trips; defaults to
            ``_AGENTIC_MAX_ITERATIONS``. When reached, the final event
            is ``message_stop`` with ``stop_reason="max_iterations"``.
    """
    if max_iterations is None:
        max_iterations = _AGENTIC_MAX_ITERATIONS

    history: list[dict[str, Any]] = list(messages)

    # Console-only tools are hidden from the chat path.
    _CHAT_HIDDEN_TOOLS = {"Respond", "Think"}
    tool_defs = [
        td for td in executor.tool_definitions()
        if (td.get("function", {}).get("name") or td.get("name"))
        not in _CHAT_HIDDEN_TOOLS
    ]

    # Runaway-detector state: one (tool_name, args_fingerprint,
    # was_error) entry per executed tool call, across iterations.
    _RUNAWAY_THRESHOLD = 3
    tool_call_log: list[tuple[str, str, bool]] = []
    nudge_already_injected = False

    def _args_fingerprint(args: dict[str, Any] | None) -> str:
        """Coarse, order-independent summary of the arguments."""
        if not args:
            return ""
        parts: list[str] = []
        for key in sorted(args.keys()):
            value = args[key]
            sval = value if isinstance(value, str) else json.dumps(value)
            parts.append(f"{key}={sval[:60]}")
        return "|".join(parts)[:200]

    def _looks_like_error(output: str) -> bool:
        """True for empty output or output whose head looks like an error."""
        if not output or not output.strip():
            return True
        head = output.strip()[:200].lower()
        return (head.startswith(("[error]", "error:"))
                or "permission denied" in head
                or "no such file" in head
                or "command not found" in head)

    def _runaway_nudge(recent: list[tuple[str, str, bool]]) -> str | None:
        """Return a nudge message if the recent calls look stuck.

        (A) same tool and same argument fingerprint every time, even on
            success; (B) same tool every time and every call errored.
        """
        tool_names = {entry[0] for entry in recent}
        fingerprints = {(entry[0], entry[1]) for entry in recent}
        all_errored = all(entry[2] for entry in recent)

        if len(fingerprints) == 1:
            stuck_tool = recent[0][0]
            logger.warning(
                f"agentic loop runaway detected: {stuck_tool} called "
                f"{_RUNAWAY_THRESHOLD}+ times with identical args. "
                f"injecting nudge."
            )
            return (
                f"[system nudge] You've called `{stuck_tool}` with the "
                f"same arguments {_RUNAWAY_THRESHOLD}+ times in a row. "
                f"The result won't change on another call. **Take the "
                f"next action now**:\n"
                f"  (1) if you've been reading/inspecting and saying "
                f"you'll edit, **call the Write or Edit tool now** with "
                f"the actual content, or\n"
                f"  (2) ask me what to do next.\n"
                f"Do not call `{stuck_tool}` with these arguments again."
            )
        if len(tool_names) == 1 and all_errored:
            stuck_tool = next(iter(tool_names))
            logger.warning(
                f"agentic loop runaway detected: {stuck_tool} called "
                f"{_RUNAWAY_THRESHOLD}+ times, all errors. injecting nudge."
            )
            return (
                f"[system nudge] You've called `{stuck_tool}` "
                f"{_RUNAWAY_THRESHOLD}+ times now and every call "
                f"returned an error or empty result. **Stop "
                f"trying variations of the same command.** Either:\n"
                f"  (1) explain to me what you were trying to find "
                f"and ask me to clarify, or\n"
                f"  (2) try a fundamentally different approach.\n"
                f"Do not call `{stuck_tool}` again on the next turn."
            )
        return None

    for _ in range(max_iterations):
        kwargs: dict[str, Any] = {
            "messages": history,
            "tools": tool_defs,
            "max_tokens": max_tokens,
        }
        if temperature is not None:
            kwargs["temperature"] = temperature
        if caller is not None:
            kwargs["caller"] = caller

        # Accumulate this iteration's stream so the assistant message
        # for history can be built once the stream ends.
        text_parts: list[str] = []
        iter_tool_calls: list[ToolUseBlock] = []
        iter_stop_reason: str | None = None

        async for event in llm.stream(**kwargs):
            if event.type == "text_delta" and event.delta:
                text_parts.append(event.delta)
                yield event
            elif event.type == "thinking_delta" and event.delta:
                # Forwarded live; thinking is not stored in history.
                yield event
            elif event.type == "tool_use_end":
                iter_tool_calls.append(ToolUseBlock(
                    id=event.tool_use_id or f"call_{uuid.uuid4().hex[:12]}",
                    name=event.tool_name or "",
                    input=event.tool_input or {},
                ))
                yield event
            elif event.type == "message_stop":
                # Held back: only the loop's own final stop is emitted.
                iter_stop_reason = event.stop_reason

        iter_text = "".join(text_parts)

        # Resolve each call id once so the assistant message and the
        # matching tool-result message always agree.
        call_ids = [
            tc.id or f"call_{uuid.uuid4().hex[:12]}" for tc in iter_tool_calls
        ]

        # Append assistant turn to history.
        assistant_msg: dict[str, Any] = {
            "role": "assistant",
            "content": iter_text or None,
        }
        if iter_tool_calls:
            assistant_msg["tool_calls"] = [
                {
                    "id": call_id,
                    "type": "function",
                    "function": {
                        "name": tc.name,
                        "arguments": json.dumps(tc.input or {}),
                    },
                }
                for call_id, tc in zip(call_ids, iter_tool_calls)
            ]
        history.append(assistant_msg)

        # No tool calls: the final answer is the text already streamed.
        if not iter_tool_calls:
            yield StreamEvent(
                type="message_stop",
                stop_reason=iter_stop_reason or "stop",
            )
            return

        # Execute tool calls; yield a `tool_result` event per call so
        # the UI can render progress inline.
        for call_id, tc in zip(call_ids, iter_tool_calls):
            try:
                result = await executor.execute_tool(tc.name, tc.input or {})
                output = result.output if hasattr(result, "output") else str(result)
                if hasattr(result, "error") and result.error:
                    output = f"[error] {result.error}"
            except Exception as e:
                # A crashing tool becomes an error result for the model
                # instead of aborting the stream.
                logger.exception(f"tool {tc.name!r} crashed")
                output = f"[error] tool {tc.name} raised: {e}"

            output_text = str(output)
            errored = _looks_like_error(output_text)
            tool_call_log.append((tc.name, _args_fingerprint(tc.input), errored))

            # Observation is best effort: never break the stream, but
            # log a failure rather than silently dropping it.
            try:
                middleware.observe_tool_use(turn_ctx, tc.name)
            except Exception as e:
                logger.debug(f"observe_tool_use failed: {e}")

            history.append({
                "role": "tool",
                "tool_call_id": call_id,
                "name": tc.name,
                "content": output_text[:8000],
            })

            # Only a short summary goes to the consumer; the (capped)
            # full output stays in `history` for the next LLM call.
            summary = output_text[:240].replace("\n", " ")
            yield StreamEvent(
                type="tool_result",
                tool_name=tc.name,
                delta=summary,
                raw={"status": "error" if errored else "success",
                     "output_chars": len(output_text)},
            )

        # Runaway detector; the nudge is injected at most once per run.
        if not nudge_already_injected and len(tool_call_log) >= _RUNAWAY_THRESHOLD:
            nudge_text = _runaway_nudge(tool_call_log[-_RUNAWAY_THRESHOLD:])
            if nudge_text:
                history.append({"role": "user", "content": nudge_text})
                nudge_already_injected = True

        yield StreamEvent(type="iteration_break")

    # Hit max iterations.
    logger.warning(
        f"_run_agentic_loop_streaming hit max_iterations={max_iterations}"
    )
    yield StreamEvent(type="message_stop", stop_reason="max_iterations")
677
+
678
+
679
+ # ---- Slash-command interception ------------------------------------
680
+ # Open WebUI users type `/caudate`, `/sessions`, etc. as normal chat
681
+ # messages — without interception, those would just go to the LLM as
682
+ # free-form text. Instead, we dispatch through `core/slash_commands.py`
683
+ # (the same registry the Cognos /ui/ uses) and return the result as
684
+ # an assistant reply, skipping the LLM call entirely. Saves quota and
685
+ # gives Open WebUI parity with Cognos /ui/.
686
+
687
+ import re as _re
688
+
689
+ _RICH_TAG_RE = _re.compile(r"\[/?[a-zA-Z][^\]]*\]")
690
+
691
+
692
+ def _strip_rich_markup(s: str) -> str:
693
+ """Strip rich console markup like [red]X[/red] / [dim]Y[/dim] so
694
+ the slash output renders cleanly in the chat UI."""
695
+ if not s:
696
+ return s
697
+ return _RICH_TAG_RE.sub("", s)
698
+
699
+
700
+ def _last_user_text(messages: list[dict[str, Any]]) -> str:
701
+ for m in reversed(messages or []):
702
+ if m.get("role") == "user":
703
+ c = m.get("content", "")
704
+ if isinstance(c, list):
705
+ c = " ".join(b.get("text", "") for b in c
706
+ if isinstance(b, dict))
707
+ return (c or "").strip()
708
+ return ""
709
+
710
+
711
async def _try_slash_intercept(body: dict, messages: list[dict],
                               agent: Any) -> "JSONResponse | None":
    """Run the latest user message as a slash command, if it is one.

    Returns a finished /v1/chat/completions response when the message was
    handled by the slash-command registry, or ``None`` so the caller falls
    through to the normal chat flow. ``None`` is returned when the message
    does not start with ``/``, when ``core.slash_commands`` cannot be
    imported, when ``agent`` is ``None``, or when the dispatcher reports
    the command as unrecognised.

    `agent` is the resolved agent instance handed in by the caller; it is
    passed on to the slash context unchanged.
    """
    command_text = _last_user_text(messages)
    if not (command_text and command_text.lstrip().startswith("/")):
        return None
    try:
        from core.slash_commands import dispatch, SlashContext, SlashResult
    except Exception:
        return None
    if agent is None:
        return None

    # Handlers may print to the context's console and return an empty
    # string, so the console is backed by an in-memory buffer whose
    # contents are merged with the return value further down.
    import io
    from rich.console import Console
    captured = io.StringIO()
    model_name = None
    try:
        slash_ctx = SlashContext(
            agent=agent,
            console=Console(file=captured, force_terminal=False, width=120),
        )
        outcome = dispatch(command_text.strip(), slash_ctx)
    except Exception as e:
        logger.exception("slash intercept failed")
        model_name = body.get("model") or "cognos"
        return _slash_response(f"slash command failed: {e}", model_name)

    # ``None`` from the dispatcher means "not a recognised slash command":
    # let the LLM handle the message instead.
    if outcome is None:
        return None

    if isinstance(outcome, SlashResult):
        reply = f"_slash result: {outcome.value}_"
    else:
        # Printed console output goes first, then the returned status
        # text; either part may be empty.
        returned = _strip_rich_markup(str(outcome)).rstrip()
        printed = _strip_rich_markup(captured.getvalue()).rstrip()
        pieces = [part for part in (printed, returned) if part]
        reply = "\n\n".join(pieces) if pieces else "_(no output)_"

    logger.info(f"slash intercept handled {command_text.split()[0]!r}")
    model_name = body.get("model") or "cognos"
    return _slash_response(reply, model_name)
769
+
770
+
771
def _slash_response(text: str, model: str) -> "JSONResponse":
    """Wrap slash-command output in an OpenAI-shape chat completion.

    The response contains a single assistant choice holding ``text`` with
    ``finish_reason`` ``"stop"``, echoes ``model``, and reports zero token
    usage.
    """
    completion_id = f"chatcmpl-slash-{uuid.uuid4().hex[:16]}"
    created_at = int(time.time())
    only_choice = {
        "index": 0,
        "message": {"role": "assistant", "content": text},
        "finish_reason": "stop",
    }
    zero_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
    payload = {
        "id": completion_id,
        "object": "chat.completion",
        "created": created_at,
        "model": model,
        "choices": [only_choice],
        "usage": zero_usage,
    }
    return JSONResponse(payload)
786
+
787
+
788
+ def _persist_inline_images(message: dict[str, Any]) -> dict[str, Any]:
789
+ """Save any inline `image_url` blocks to FileStore and inject a
790
+ `files/<id>` reference into the user text.
791
+
792
+ OpenAI multimodal content blocks look like:
793
+ {"role": "user", "content": [
794
+ {"type": "text", "text": "..."},
795
+ {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
796
+ ]}
797
+
798
+ Behaviour:
799
+ - If `content` is a string → return unchanged.
800
+ - If `content` is a list → for every `image_url` block whose URL
801
+ is a `data:` URL, decode and persist to FileStore. Mutate the
802
+ first text block (or insert one) to append a marker so the
803
+ LLM has a string handle. Leave the image_url block intact so
804
+ the multimodal LLM still sees pixels.
805
+ - http(s) URLs are left alone (the LLM can fetch on its own).
806
+ """
807
+ content = message.get("content")
808
+ if not isinstance(content, list) or not content:
809
+ return message
810
+
811
+ import base64 as _b64
812
+ import re as _re
813
+ import tempfile as _tempfile
814
+ import uuid as _uuid
815
+ from pathlib import Path as _Path
816
+
817
+ saved_refs: list[str] = []
818
+ for block in content:
819
+ if not isinstance(block, dict):
820
+ continue
821
+ if block.get("type") != "image_url":
822
+ continue
823
+ url = (block.get("image_url") or {}).get("url") or ""
824
+ if not url.startswith("data:"):
825
+ continue
826
+ # data:[<media-type>][;base64],<data>
827
+ m = _re.match(r"data:([^;]+);base64,(.+)$", url)
828
+ if not m:
829
+ continue
830
+ mime, b64data = m.group(1), m.group(2)
831
+ try:
832
+ raw = _b64.b64decode(b64data)
833
+ except Exception as e:
834
+ logger.warning(f"failed to decode inline image: {e}")
835
+ continue
836
+ # Persist
837
+ try:
838
+ from config import FILES_DIR
839
+ from core.files import FileStore
840
+ ext = mime.split("/", 1)[-1] if "/" in mime else "bin"
841
+ ext = "jpg" if ext == "jpeg" else ext
842
+ with _tempfile.NamedTemporaryFile(
843
+ suffix=f".{ext}", delete=False,
844
+ ) as tmp:
845
+ tmp.write(raw)
846
+ tmp_path = _Path(tmp.name)
847
+ try:
848
+ fs = FileStore(root=FILES_DIR)
849
+ rec = fs.upload(
850
+ tmp_path,
851
+ filename=f"upload_{_uuid.uuid4().hex[:8]}.{ext}",
852
+ )
853
+ saved_refs.append(rec.id)
854
+ logger.info(f"persisted user-uploaded image as files/{rec.id}")
855
+ finally:
856
+ tmp_path.unlink(missing_ok=True)
857
+ except Exception as e:
858
+ logger.warning(f"failed to save uploaded image to FileStore: {e}")
859
+
860
+ if not saved_refs:
861
+ return message
862
+
863
+ # Append marker into the first text block (or insert one) so the
864
+ # LLM has a string handle to pass to EditImage/DescribeImage.
865
+ marker = " ".join(f"[uploaded image: files/{rid}]" for rid in saved_refs)
866
+ new_content = list(content)
867
+ text_idx = next(
868
+ (i for i, b in enumerate(new_content)
869
+ if isinstance(b, dict) and b.get("type") == "text"),
870
+ None,
871
+ )
872
+ if text_idx is None:
873
+ new_content.insert(0, {"type": "text", "text": marker})
874
+ else:
875
+ existing = new_content[text_idx].get("text", "")
876
+ new_content[text_idx] = {
877
+ **new_content[text_idx],
878
+ "text": (existing + ("\n\n" if existing else "") + marker),
879
+ }
880
+ message["content"] = new_content
881
+ return message
882
+
883
+
884
+ def _translate_openai_to_internal(
885
+ body: dict[str, Any],
886
+ ) -> tuple[list[dict[str, Any]], list[dict[str, Any]] | None]:
887
+ """OpenAI /v1/chat/completions body → (messages, tools).
888
+
889
+ OpenAI's message shape *is* Cognos's internal shape (LiteLLM uses
890
+ OpenAI as its lingua franca). So this is mostly pass-through with
891
+ a small amount of normalisation.
892
+ """
893
+ raw_messages = body.get("messages") or []
894
+ raw_tools = body.get("tools") or []
895
+
896
+ out: list[dict[str, Any]] = []
897
+ for m in raw_messages:
898
+ if not isinstance(m, dict):
899
+ continue
900
+ role = m.get("role")
901
+ if role not in ("system", "user", "assistant", "tool"):
902
+ continue
903
+ m = dict(m)
904
+ # WebUI / OpenAI multimodal: when the user uploads an image,
905
+ # it arrives as content blocks with `image_url` data URLs.
906
+ # The LLM can SEE the image (passes through to multimodal
907
+ # backends) but tools like EditImage need a string handle.
908
+ # Persist each upload to FileStore and append a `files/<id>`
909
+ # reference into the message text so the LLM can name it.
910
+ if role == "user":
911
+ m = _persist_inline_images(m)
912
+ # Pass through otherwise — content/tool_calls/tool_call_id all
913
+ # match the LiteLLM-internal shape already.
914
+ out.append(m)
915
+
916
+ tools_translated: list[dict[str, Any]] | None = None
917
+ if raw_tools:
918
+ tools_translated = []
919
+ for t in raw_tools:
920
+ if not isinstance(t, dict):
921
+ continue
922
+ # Both `{"type":"function","function":{...}}` and bare
923
+ # `{"name":...,"parameters":...}` show up — normalise to
924
+ # the wrapped form.
925
+ if t.get("type") == "function" and isinstance(t.get("function"), dict):
926
+ tools_translated.append(t)
927
+ else:
928
+ tools_translated.append({
929
+ "type": "function",
930
+ "function": {
931
+ "name": t.get("name", ""),
932
+ "description": t.get("description", ""),
933
+ "parameters": t.get("parameters")
934
+ or t.get("input_schema")
935
+ or {"type": "object", "properties": {}},
936
+ },
937
+ })
938
+
939
+ return out, tools_translated
940
+
941
+
942
+ def _build_openai_response(
943
+ *,
944
+ text: str,
945
+ tool_calls: list[ToolUseBlock],
946
+ model: str,
947
+ usage: dict[str, int],
948
+ stop_reason: str | None,
949
+ ) -> dict[str, Any]:
950
+ """Build the non-streaming /v1/chat/completions response."""
951
+ message: dict[str, Any] = {"role": "assistant", "content": text or None}
952
+ if tool_calls:
953
+ message["tool_calls"] = [
954
+ {
955
+ "id": tc.id or f"call_{uuid.uuid4().hex[:12]}",
956
+ "type": "function",
957
+ "function": {
958
+ "name": tc.name,
959
+ "arguments": json.dumps(tc.input or {}),
960
+ },
961
+ }
962
+ for tc in tool_calls
963
+ ]
964
+ # Translate stop reasons to OpenAI vocabulary.
965
+ stop_map = {
966
+ "stop": "stop", "end_turn": "stop",
967
+ "length": "length", "max_tokens": "length",
968
+ "tool_calls": "tool_calls", "tool_use": "tool_calls",
969
+ }
970
+ finish = stop_map.get(stop_reason or "stop", "stop")
971
+ if tool_calls and finish == "stop":
972
+ finish = "tool_calls"
973
+
974
+ return {
975
+ "id": f"chatcmpl-{uuid.uuid4().hex[:24]}",
976
+ "object": "chat.completion",
977
+ "created": int(time.time()),
978
+ "model": model,
979
+ "choices": [{
980
+ "index": 0,
981
+ "message": message,
982
+ "finish_reason": finish,
983
+ }],
984
+ "usage": {
985
+ "prompt_tokens": usage.get("prompt_tokens", 0),
986
+ "completion_tokens": usage.get("completion_tokens", 0),
987
+ "total_tokens": usage.get("total_tokens", 0),
988
+ },
989
+ }
990
+
991
+
992
+ # ---- Streaming SSE generator ----------------------------------------
993
+
994
+
995
+ def _caudate_prefix_block(prediction: Any) -> str:
996
+ """Format Caudate's per-turn prediction as a markdown block.
997
+
998
+ Surfaced at the top of the visible reasoning trace so the user
999
+ can SEE what Caudate predicted before the LLM reasons. Without
1000
+ this, Caudate runs invisibly in the background and her work is
1001
+ never credited; with it, every turn becomes a small training
1002
+ event the user can sanity-check.
1003
+ """
1004
+ if prediction is None:
1005
+ return ""
1006
+ try:
1007
+ # Use whichever level the policy currently reports — if
1008
+ # plan_mode imports fail we just leave it blank.
1009
+ from nn.policy import GraduationPolicy
1010
+ from pathlib import Path
1011
+ from config import DATA_DIR
1012
+ level = GraduationPolicy(
1013
+ state_path=Path(DATA_DIR) / "nn" / "policy.json",
1014
+ ).level.label
1015
+ except Exception:
1016
+ level = "?"
1017
+ tool = getattr(prediction, "tool", "?") or "?"
1018
+ tool_conf = getattr(prediction, "tool_confidence", 0.0) or 0.0
1019
+ tier = getattr(prediction, "tier", "?") or "?"
1020
+ tier_conf = getattr(prediction, "tier_confidence", 0.0) or 0.0
1021
+ think = getattr(prediction, "think", 0.0) or 0.0
1022
+ value = getattr(prediction, "value", 0.0) or 0.0
1023
+ # Note: NO horizontal rule (---) below — markdown parsers inside
1024
+ # a <details> block can interpret it as ending the block. Two
1025
+ # newlines instead, gives a visual gap without breaking parsing.
1026
+ return (
1027
+ f"**🧠 Caudate** ({level}) · "
1028
+ f"tier=`{tier}` ({tier_conf:.0%}) · "
1029
+ f"tool=`{tool}` ({tool_conf:.0%}) · "
1030
+ f"think={think:.2f} · "
1031
+ f"value={value:.2f}\n\n"
1032
+ )
1033
+
1034
+
1035
async def _stream_openai_events(
    llm: LLMProvider,
    messages: list[dict[str, Any]],
    tools: list[dict[str, Any]] | None,
    max_tokens: int,
    temperature: float | None,
    requested_model: str,
    middleware: CaudateMiddleware | None = None,
    turn_ctx: Any = None,
    caller: str | None = None,
    prediction: Any = None,
) -> AsyncIterator[bytes]:
    """Translate the provider's event stream into OpenAI SSE chunks.

    OpenAI emits a sequence of `data: {chatcmpl chunk}` events terminated
    by `data: [DONE]`. Each chunk has a `choices[0].delta` carrying the
    incremental content / tool_call / finish_reason.

    Args:
        llm: Provider whose ``stream(...)`` async iterator is consumed.
        messages, tools, max_tokens, temperature: Forwarded to
            ``llm.stream`` unchanged.
        requested_model: Model name echoed in every chunk.
        middleware, turn_ctx: When both are given, the middleware is
            notified of text, thinking and tool-use events and
            ``end_turn`` is called once the stream finishes or fails.
        caller: Optional routing tag; forwarded to ``llm.stream`` as
            ``caller`` only when truthy.
        prediction: Optional per-turn prediction rendered by
            ``_caudate_prefix_block`` at the top of the reasoning block.

    Yields:
        UTF-8 encoded SSE frames: an initial role chunk, content and
        tool-call chunks, a final chunk carrying ``finish_reason`` and
        ``data: [DONE]``. If the upstream stream raises, an error chunk
        (``finish_reason`` ``"error"``) is sent instead of the normal
        final chunk, followed by ``[DONE]``.

    Thinking output is streamed inside a single
    ``<details type="reasoning">`` element embedded in the regular
    content stream (not via a ``reasoning_content`` field). The element
    is opened once, on the first thinking delta or eagerly when a
    prediction prefix exists, and closed once, on the first answer-text
    delta or at the end of the stream, including when the stream fails.
    """
    chunk_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
    created = int(time.time())

    def _sse(payload: dict[str, Any]) -> bytes:
        return f"data: {json.dumps(payload)}\n\n".encode()

    def _chunk(delta: dict[str, Any], finish: str | None = None) -> dict[str, Any]:
        choice: dict[str, Any] = {"index": 0, "delta": delta}
        if finish:
            choice["finish_reason"] = finish
        return {
            "id": chunk_id,
            "object": "chat.completion.chunk",
            "created": created,
            "model": requested_model,
            "choices": [choice],
        }

    # Initial role chunk (OpenAI convention)
    yield _sse(_chunk({"role": "assistant", "content": ""}))

    tool_call_index = 0
    stop_reason: str | None = None

    # Thinking-block state and helpers live outside the ``try`` so the
    # error handler below can always flush and close the block, no matter
    # where the failure happened.
    thinking_buf = ""
    thinking_open = False
    thinking_closed = False
    # Minimum buffered length before a flush is considered; smaller means
    # a livelier stream.
    _THINKING_FLUSH_MIN = 24
    # Filled in inside the ``try``; empty when no prediction is available.
    _caudate_prefix = ""

    def _safe_flush_idx(buf: str) -> int:
        """Return the cut index for flushing ``buf``, or 0 for "not yet".

        The buffer is only cut directly after a whitespace character, so
        a word is never split across two chunks. Scanning runs backwards
        from the end so the largest possible prefix is flushed, and the
        whitespace must sit at index ``_THINKING_FLUSH_MIN`` or later.
        """
        if len(buf) < _THINKING_FLUSH_MIN:
            return 0
        for i in range(len(buf) - 1, _THINKING_FLUSH_MIN - 1, -1):
            if buf[i].isspace():
                return i + 1  # include the whitespace
        return 0

    def _open_thinking():
        """Return the chunk that opens the reasoning block, or None if it
        is already open or has been closed."""
        nonlocal thinking_open
        if thinking_open or thinking_closed:
            return None
        thinking_open = True
        return _chunk({"content": (
            '<details type="reasoning" done="false">\n'
            '<summary>Thinking…</summary>\n\n'
            f'{_caudate_prefix}'
        )})

    def _close_thinking():
        """Return the chunk that closes the reasoning block, or None if
        it is not currently open."""
        nonlocal thinking_open, thinking_closed
        if not thinking_open or thinking_closed:
            return None
        thinking_closed = True
        thinking_open = False
        return _chunk({"content": "\n\n</details>\n\n"})

    try:
        # Pass `caller` through so the provider's router can honor
        # forced_slow / forced_fast tags.
        stream_kwargs: dict[str, Any] = dict(
            messages=messages, tools=tools,
            max_tokens=max_tokens, temperature=temperature,
        )
        if caller:
            stream_kwargs["caller"] = caller

        # Prediction header shown at the top of the reasoning block.
        _caudate_prefix = _caudate_prefix_block(prediction)

        # Open the block eagerly when there is a prediction so it is
        # visible on every turn, not only on turns where the model emits
        # thinking.
        if _caudate_prefix:
            opener = _open_thinking()
            if opener:
                yield _sse(opener)

        async for event in llm.stream(**stream_kwargs):
            if event.type == "text_delta" and event.delta:
                # First answer-text delta: flush what is left of the
                # thinking buffer and close the <details> block before
                # the answer starts. Only `content` is emitted, never
                # `reasoning_content`.
                if thinking_buf:
                    yield _sse(_chunk({"content": thinking_buf}))
                    thinking_buf = ""
                close = _close_thinking()
                if close:
                    yield _sse(close)
                if middleware is not None and turn_ctx is not None:
                    middleware.observe_response_text(turn_ctx, event.delta)
                yield _sse(_chunk({"content": event.delta}))
            elif event.type == "thinking_delta" and event.delta:
                if middleware is not None and turn_ctx is not None:
                    middleware.observe_thinking(turn_ctx, event.delta)
                # First thinking delta: open the <details> block.
                opener = _open_thinking()
                if opener:
                    yield _sse(opener)
                thinking_buf += event.delta
                # Flush only up to a word boundary; the remainder stays
                # buffered for the next delta.
                cut = _safe_flush_idx(thinking_buf)
                if cut > 0:
                    out = thinking_buf[:cut]
                    thinking_buf = thinking_buf[cut:]
                    yield _sse(_chunk({"content": out}))
            elif event.type == "tool_use_end":
                tc = ToolUseBlock(
                    id=event.tool_use_id or f"call_{uuid.uuid4().hex[:12]}",
                    name=event.tool_name or "",
                    input=event.tool_input or {},
                )
                if middleware is not None and turn_ctx is not None:
                    middleware.observe_tool_use(turn_ctx, tc.name)
                # OpenAI streams a tool_call entry inside delta.tool_calls
                yield _sse(_chunk({
                    "tool_calls": [{
                        "index": tool_call_index,
                        "id": tc.id,
                        "type": "function",
                        "function": {
                            "name": tc.name,
                            "arguments": json.dumps(tc.input or {}),
                        },
                    }],
                }))
                tool_call_index += 1
            elif event.type == "message_stop":
                stop_reason = event.stop_reason
    except Exception as e:
        logger.exception("OpenAI-compat stream upstream failed")
        # Flush buffered thinking and terminate the <details> block first,
        # so the client is never left with an unterminated element in
        # front of the error.
        if thinking_buf:
            yield _sse(_chunk({"content": thinking_buf}))
            thinking_buf = ""
        close = _close_thinking()
        if close:
            yield _sse(close)
        # Send an error chunk and terminate
        yield _sse({
            "id": chunk_id, "object": "chat.completion.chunk",
            "created": created, "model": requested_model,
            "choices": [{
                "index": 0, "delta": {},
                "finish_reason": "error",
            }],
            "error": {"message": str(e), "type": "api_error"},
        })
        if middleware is not None and turn_ctx is not None:
            middleware.end_turn(turn_ctx, error=True)
        yield b"data: [DONE]\n\n"
        return

    # If the stream ended without an answer text_delta, the <details>
    # block may still be open with unflushed thinking inside. Flush what
    # is left and close it so the client does not render an unterminated
    # block.
    if thinking_buf:
        yield _sse(_chunk({"content": thinking_buf}))
        thinking_buf = ""
    close = _close_thinking()
    if close:
        yield _sse(close)

    # Upstream stop reasons -> OpenAI finish_reason values; anything
    # unknown is reported as "stop".
    stop_map = {
        "stop": "stop", "end_turn": "stop",
        "length": "length", "max_tokens": "length",
        "tool_calls": "tool_calls", "tool_use": "tool_calls",
    }
    finish = stop_map.get(stop_reason or "stop", "stop")
    if tool_call_index > 0 and finish == "stop":
        finish = "tool_calls"

    yield _sse(_chunk({}, finish=finish))
    yield b"data: [DONE]\n\n"

    # The failure path returns above, so reaching this point means the
    # turn completed without an upstream error.
    if middleware is not None and turn_ctx is not None:
        middleware.end_turn(turn_ctx, error=False)
1268
+
1269
+
1270
+ # ---- FastAPI router -------------------------------------------------
1271
+
1272
+
1273
+ def build_router() -> APIRouter:
1274
+ """Build the /v1/chat/completions router. Reuses the same singleton
1275
+ agent + middleware as `/v1/messages`, so Caudate sees one unified
1276
+ stream of training samples regardless of which API the client
1277
+ speaks."""
1278
+ router = APIRouter()
1279
+ _agent_box: dict[str, Any] = {"agent": None, "middleware": None}
1280
+
1281
+ def _get_agent():
1282
+ if _agent_box["agent"] is None:
1283
+ from core.agent import CognosAgent
1284
+ agent = CognosAgent(
1285
+ mode="agentic",
1286
+ permission_mode="bypass",
1287
+ personality=True,
1288
+ )
1289
+ _agent_box["agent"] = agent
1290
+ _agent_box["middleware"] = CaudateMiddleware(agent)
1291
+ cau = getattr(agent, "caudate", None)
1292
+ cau_status = (cau.policy.level.label
1293
+ if cau and cau.policy else "unavailable")
1294
+ logger.info(
1295
+ f"OpenAI-compat singleton agent ready, "
1296
+ f"llm={agent.llm.model}, caudate={cau_status}"
1297
+ )
1298
+ return _agent_box["agent"], _agent_box["middleware"]
1299
+
1300
+ @router.post("/v1/chat/completions")
1301
+ async def chat_completions(request: Request):
1302
+ try:
1303
+ body = await request.json()
1304
+ except Exception:
1305
+ raise HTTPException(400, "Invalid JSON body")
1306
+ if not isinstance(body, dict):
1307
+ raise HTTPException(400, "Body must be a JSON object")
1308
+
1309
+ try:
1310
+ internal_msgs, internal_tools = _translate_openai_to_internal(body)
1311
+ except Exception as e:
1312
+ logger.exception("OpenAI→internal translation failed")
1313
+ raise HTTPException(400, f"Bad message format: {e}")
1314
+
1315
+ # --- Slash-command interception ---------------------------------
1316
+ # When the latest user message starts with `/`, dispatch through
1317
+ # the existing core/slash_commands.py registry instead of
1318
+ # calling the LLM. Lets Open WebUI users hit `/caudate`,
1319
+ # `/sessions`, `/usage`, `/skills`, `/clear`, etc. and get the
1320
+ # same data the Cognos /ui/ shows — same one place.
1321
+ # Resolve the agent first (needed by the slash registry).
1322
+ _agent_for_slash, _ = _get_agent()
1323
+ slash_response = await _try_slash_intercept(
1324
+ body, internal_msgs, _agent_for_slash
1325
+ )
1326
+ if slash_response is not None:
1327
+ return slash_response
1328
+
1329
+ # Inject sandbox-awareness hint so the LLM scaffolds new files
1330
+ # into cognos/sandbox/ by default. Idempotent across multi-turn.
1331
+ from core.sandbox_prompt import inject_sandbox_hint
1332
+ internal_msgs = inject_sandbox_hint(internal_msgs)
1333
+
1334
+ max_tokens = int(body.get("max_tokens") or 4096)
1335
+ temperature = body.get("temperature")
1336
+ if temperature is not None:
1337
+ temperature = float(temperature)
1338
+ requested_model = body.get("model") or "cognos"
1339
+ stream = bool(body.get("stream", False))
1340
+
1341
+ agent, middleware = _get_agent()
1342
+ llm = agent.llm
1343
+
1344
+ # Alias resolution — the user-facing default is just `cognos`.
1345
+ # Caudate decides arbitrate/constitutional/tier per-turn from
1346
+ # her live prediction. The other cognos-* aliases stay
1347
+ # functional as explicit power-user overrides (debug routes)
1348
+ # but aren't advertised in /v1/models — the user shouldn't
1349
+ # have to choose between 9 modes. Default = smart.
1350
+ forced_caller: str | None = None
1351
+ arbitrate: bool = False
1352
+ constitutional: bool = False
1353
+ rm = (requested_model or "").lower()
1354
+
1355
+ # --- Open Caudate's turn FIRST so her prediction is available
1356
+ # to drive the unified-cognos routing decisions. (Was below the
1357
+ # alias block previously; moved up.)
1358
+ turn_ctx = middleware.begin_turn(
1359
+ internal_msgs, internal_tools,
1360
+ model_source=getattr(llm, "model", "unknown"),
1361
+ )
1362
+ prediction = getattr(turn_ctx, "prediction", None)
1363
+
1364
+ # --- Unified `cognos` (default): Caudate-driven tier routing.
1365
+ #
1366
+ # Constitutional critique is NOT auto-triggered here — it's an
1367
+ # explicit opt-in via the `cognos-strict` alias.
1368
+ #
1369
+ # Why no auto-trigger: Caudate's value head doesn't yet
1370
+ # differentiate high-stakes from low-stakes prompts at her
1371
+ # current scale (~0.46 for both trivial and substantial
1372
+ # prompts in measured tests). A length-based proxy was tried
1373
+ # earlier but is brittle — short high-stakes prompts ("delete
1374
+ # prod database?") slip past it, and it forces a critique
1375
+ # round-trip on every long prompt regardless of need.
1376
+ # The honest call: only critique when the user opts in.
1377
+ if rm in ("cognos", "", None) and prediction is not None:
1378
+ if prediction.tier_confidence < 0.6:
1379
+ arbitrate = True
1380
+ elif prediction.tier == "slow":
1381
+ forced_caller = "forced_slow"
1382
+ elif prediction.tier == "fast":
1383
+ forced_caller = "forced_fast"
1384
+ logger.info(
1385
+ f"unified cognos routing: tier={prediction.tier} "
1386
+ f"(conf={prediction.tier_confidence:.2f}) → "
1387
+ f"arbitrate={arbitrate} forced_caller={forced_caller}"
1388
+ )
1389
+
1390
+ # --- Explicit overrides (debug / power-user routes)
1391
+ elif rm in ("cognos-dual-brain", "cognos-collab"):
1392
+ arbitrate = True
1393
+ elif rm == "cognos-strict":
1394
+ constitutional = True
1395
+ elif rm == "cognos-slow":
1396
+ forced_caller = "forced_slow"
1397
+ elif rm == "cognos-fast":
1398
+ forced_caller = "forced_fast"
1399
+ elif rm in ("cognos-kimi", "cognos-haiku"):
1400
+ try:
1401
+ from llm.router import DualLLMProvider
1402
+ if isinstance(llm, DualLLMProvider):
1403
+ target = "kimi" if rm == "cognos-kimi" else "haiku"
1404
+ fast_m = (getattr(llm.router.fast, "model", "") or "").lower()
1405
+ if target in fast_m:
1406
+ forced_caller = "forced_fast"
1407
+ else:
1408
+ forced_caller = "forced_slow"
1409
+ except Exception:
1410
+ pass
1411
+
1412
+ # Register the tag with the router policy so it actually routes,
1413
+ # not just passes through.
1414
+ try:
1415
+ from llm.router import DualLLMProvider
1416
+ if isinstance(llm, DualLLMProvider) and forced_caller:
1417
+ if forced_caller == "forced_slow":
1418
+ llm.router.policy.slow_caller_tags.add("forced_slow")
1419
+ else:
1420
+ llm.router.policy.fast_caller_tags.add("forced_fast")
1421
+ except Exception:
1422
+ pass
1423
+
1424
+ # Bump max_tokens for thinking models. Kimi-k2.6 (and similar
1425
+ # reasoning models) spend most of a small budget thinking and
1426
+ # emit empty content. Ensure at least 1024 tokens whenever a
1427
+ # thinking-capable model could be on the path — that includes:
1428
+ # - explicit forced_slow / forced_fast pointing at thinking models
1429
+ # - the configured system1/system2 being a thinking model
1430
+ # Catches the "Kimi as system1" case where the router routes
1431
+ # routine traffic to Kimi by default.
1432
+ _THINKING_MODEL_HINTS = ("kimi", "deepseek", "qwen3", "o1", "o3")
1433
+ try:
1434
+ from llm.router import DualLLMProvider
1435
+ candidate_models: list[str] = []
1436
+ if isinstance(llm, DualLLMProvider):
1437
+ # DualLLMProvider holds Router; Router has .fast / .slow
1438
+ candidate_models.append(getattr(llm.router.fast, "model", "") or "")
1439
+ candidate_models.append(getattr(llm.router.slow, "model", "") or "")
1440
+ else:
1441
+ candidate_models.append(getattr(llm, "model", "") or "")
1442
+ hit = any(any(h in m.lower() for h in _THINKING_MODEL_HINTS)
1443
+ for m in candidate_models)
1444
+ if hit and max_tokens < 4096:
1445
+ max_tokens = 4096
1446
+ except Exception as e:
1447
+ logger.debug(f"thinking-model bump skipped: {e}")
1448
+
1449
+ # `turn_ctx` was already opened above (moved earlier so
1450
+ # Caudate's prediction can drive the unified `cognos` routing).
1451
+ # Just inject her hint here as before.
1452
+ internal_msgs = middleware.maybe_inject_hint(internal_msgs, turn_ctx)
1453
+
1454
+ # Open WebUI is a "web UI" caller — let it use the Claude Code
1455
+ # subscription OAuth like our own `/chat` endpoint does. Without
1456
+ # this, anthropic/* models 401 because LiteLLM has no api key.
1457
+ if stream:
1458
+ async def _gen():
1459
+ with subscription_auth_scope():
1460
+ if arbitrate:
1461
+ # Streaming arbitration: stream system1 LIVE so
1462
+ # the user sees text flow within seconds, run
1463
+ # system2 in parallel as a background draft for
1464
+ # the preference corpus. Avoids the 20+ second
1465
+ # blackout that comes from buffering both before
1466
+ # any data flows (which Open WebUI times out on).
1467
+ from llm.router import DualLLMProvider
1468
+ if not isinstance(llm, DualLLMProvider):
1469
+ # No dual brain — normal stream
1470
+ async for chunk in _stream_openai_events(
1471
+ llm=llm, messages=internal_msgs, tools=internal_tools,
1472
+ max_tokens=max_tokens, temperature=temperature,
1473
+ requested_model=requested_model,
1474
+ middleware=middleware, turn_ctx=turn_ctx,
1475
+ prediction=prediction,
1476
+ ):
1477
+ yield chunk
1478
+ return
1479
+
1480
+ # Pick which brain to STREAM live and which to
1481
+ # buffer in the background. Streaming a thinking
1482
+ # model (Kimi) blocks visible content for tens
1483
+ # of seconds while it reasons, which breaks chat
1484
+ # UX. So we always stream the non-thinking brain
1485
+ # if there is one — both are still engaged for
1486
+ # arbitration. Falls back to system1 if both
1487
+ # are thinking models or both are non-thinking.
1488
+ s1, s2 = llm.router.fast, llm.router.slow
1489
+ s1_thinks = any(h in (s1.model or "").lower()
1490
+ for h in ("kimi", "deepseek", "qwen3", "o1", "o3"))
1491
+ s2_thinks = any(h in (s2.model or "").lower()
1492
+ for h in ("kimi", "deepseek", "qwen3", "o1", "o3"))
1493
+ if s1_thinks and not s2_thinks:
1494
+ stream_provider, bg_provider = s2, s1
1495
+ stream_label, bg_label = "slow", "fast"
1496
+ else:
1497
+ stream_provider, bg_provider = s1, s2
1498
+ stream_label, bg_label = "fast", "slow"
1499
+
1500
+ # Kick off the background brain (buffered).
1501
+ bg_task = asyncio.create_task(
1502
+ bg_provider.chat(
1503
+ messages=internal_msgs, tools=internal_tools,
1504
+ max_tokens=max_tokens, temperature=temperature,
1505
+ )
1506
+ )
1507
+
1508
+ # Live-stream the chosen provider directly to client.
1509
+ stream_text_buf: list[str] = []
1510
+ stream_tool_calls: list[ToolUseBlock] = []
1511
+ chunk_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
1512
+ created = int(time.time())
1513
+
1514
def _chunk(delta, finish=None):
    """Wrap ``delta`` in an OpenAI-style ``chat.completion.chunk`` envelope.

    ``finish_reason`` is attached to the single choice only when ``finish``
    is truthy. The id, creation time and model name are taken from the
    enclosing scope, so every chunk built here carries the same values.
    """
    choice = {"index": 0, "delta": delta}
    if finish:
        choice["finish_reason"] = finish
    envelope = {
        "id": chunk_id,
        "object": "chat.completion.chunk",
        "created": created,
        "model": requested_model,
    }
    envelope["choices"] = [choice]
    return envelope
1525
+
1526
+ # Initial role chunk
1527
+ yield f"data: {json.dumps(_chunk({'role':'assistant','content':''}))}\n\n".encode()
1528
+
1529
+ # Same word-boundary buffering for thinking
1530
+ # as in `_stream_openai_events` — Open WebUI
1531
+ # otherwise renders each token as its own line.
1532
+ # Stream thinking inside ONE `<details>` block
1533
+ # in content (NOT via reasoning_content — that
1534
+ # field generates one "Thought" widget per
1535
+ # delta in WebUI 0.9.x).
1536
+ _arb_thinking_buf = ""
1537
+ _ARB_THINK_MIN = 24
1538
+ _arb_thinking_open = False
1539
+ _arb_thinking_closed = False
1540
+ _arb_caudate_prefix = _caudate_prefix_block(prediction)
1541
+
1542
def _arb_safe_flush(buf: str) -> int:
    """Return how many leading characters of ``buf`` may be flushed.

    The cut lands just after the last whitespace character found at index
    ``_ARB_THINK_MIN`` or later. Returns 0 when the buffer is shorter than
    ``_ARB_THINK_MIN`` or has no whitespace in that range.
    """
    # An empty range (buffer too short) falls straight through to 0.
    for pos in reversed(range(_ARB_THINK_MIN, len(buf))):
        if buf[pos].isspace():
            return pos + 1
    return 0
1549
+
1550
+ def _arb_open_block():
1551
+ nonlocal _arb_thinking_open
1552
+ if _arb_thinking_open or _arb_thinking_closed:
1553
+ return None
1554
+ _arb_thinking_open = True
1555
+ return _chunk({"content": (
1556
+ '<details type="reasoning" done="false">\n'
1557
+ '<summary>Thinking…</summary>\n\n'
1558
+ f'{_arb_caudate_prefix}'
1559
+ )})
1560
+
1561
+ def _arb_close_block():
1562
+ nonlocal _arb_thinking_open, _arb_thinking_closed
1563
+ if not _arb_thinking_open or _arb_thinking_closed:
1564
+ return None
1565
+ _arb_thinking_closed = True
1566
+ _arb_thinking_open = False
1567
+ return _chunk({"content": "\n\n</details>\n\n"})
1568
+
1569
+ # Eagerly open with Caudate prefix so she's
1570
+ # visible on every arbitration turn too.
1571
+ if _arb_caudate_prefix:
1572
+ opener = _arb_open_block()
1573
+ if opener:
1574
+ yield f"data: {json.dumps(opener)}\n\n".encode()
1575
+
1576
+ try:
1577
+ async for ev in stream_provider.stream(
1578
+ messages=internal_msgs, tools=internal_tools,
1579
+ max_tokens=max_tokens, temperature=temperature,
1580
+ ):
1581
+ if ev.type == "text_delta" and ev.delta:
1582
+ # Flush + close the thinking block
1583
+ # before answer text streams.
1584
+ if _arb_thinking_buf:
1585
+ out = _arb_thinking_buf
1586
+ _arb_thinking_buf = ""
1587
+ yield f"data: {json.dumps(_chunk({'content': out}))}\n\n".encode()
1588
+ close = _arb_close_block()
1589
+ if close:
1590
+ yield f"data: {json.dumps(close)}\n\n".encode()
1591
+ stream_text_buf.append(ev.delta)
1592
+ middleware.observe_response_text(turn_ctx, ev.delta)
1593
+ yield f"data: {json.dumps(_chunk({'content': ev.delta}))}\n\n".encode()
1594
+ elif ev.type == "thinking_delta" and ev.delta:
1595
+ middleware.observe_thinking(turn_ctx, ev.delta)
1596
+ opener = _arb_open_block()
1597
+ if opener:
1598
+ yield f"data: {json.dumps(opener)}\n\n".encode()
1599
+ _arb_thinking_buf += ev.delta
1600
+ cut = _arb_safe_flush(_arb_thinking_buf)
1601
+ if cut > 0:
1602
+ out = _arb_thinking_buf[:cut]
1603
+ _arb_thinking_buf = _arb_thinking_buf[cut:]
1604
+ yield f"data: {json.dumps(_chunk({'content': out}))}\n\n".encode()
1605
+ elif ev.type == "tool_use_end":
1606
+ tc = ToolUseBlock(
1607
+ id=ev.tool_use_id or f"call_{uuid.uuid4().hex[:12]}",
1608
+ name=ev.tool_name or "",
1609
+ input=ev.tool_input or {},
1610
+ )
1611
+ stream_tool_calls.append(tc)
1612
+ middleware.observe_tool_use(turn_ctx, tc.name)
1613
+ yield f"data: {json.dumps(_chunk({'tool_calls':[{'index':len(stream_tool_calls)-1,'id':tc.id,'type':'function','function':{'name':tc.name,'arguments':json.dumps(tc.input or {})}}]}))}\n\n".encode()
1614
+ except Exception as e:
1615
+ logger.exception("stream-leg failed in arbitrate")
1616
+ middleware.end_turn(turn_ctx, error=True)
1617
+ err_chunk = json.dumps({
1618
+ "id": chunk_id, "object": "chat.completion.chunk",
1619
+ "created": created, "model": requested_model,
1620
+ "choices": [{"index": 0, "delta": {},
1621
+ "finish_reason": "error"}],
1622
+ "error": {"message": str(e), "type": "api_error"},
1623
+ })
1624
+ yield f"data: {err_chunk}\n\n".encode()
1625
+ yield b"data: [DONE]\n\n"
1626
+ bg_task.cancel()
1627
+ return
1628
+
1629
+ # Flush any unflushed thinking + close the
1630
+ # block if the stream ended with thinking
1631
+ # only (no answer text).
1632
+ if _arb_thinking_buf:
1633
+ yield f"data: {json.dumps(_chunk({'content': _arb_thinking_buf}))}\n\n".encode()
1634
+ _arb_thinking_buf = ""
1635
+ close = _arb_close_block()
1636
+ if close:
1637
+ yield f"data: {json.dumps(close)}\n\n".encode()
1638
+
1639
+ # Close stream to client.
1640
+ finish = "tool_calls" if stream_tool_calls else "stop"
1641
+ yield f"data: {json.dumps(_chunk({}, finish=finish))}\n\n".encode()
1642
+ yield b"data: [DONE]\n\n"
1643
+
1644
+ # Background: wait for the bg brain to finish,
1645
+ # score both, log arbitration. User has already
1646
+ # seen the streamed brain; this is purely for
1647
+ # the preference corpus.
1648
+ try:
1649
+ bg_resp = await bg_task
1650
+ except Exception as e:
1651
+ logger.warning(f"bg-draft failed: {e}")
1652
+ bg_resp = None
1653
+
1654
+ try:
1655
+ streamed_text = "".join(stream_text_buf)
1656
class _FakeResp:
    # Response-shaped stand-in for the streamed leg, so it can be handed
    # to `_score_draft` the same way the buffered background response is.
    tool_calls = stream_tool_calls
    content = streamed_text
    # Thinking deltas of the streamed leg are not accumulated, hence empty.
    thinking = ""
1660
+ streamed_score = _score_draft(_FakeResp()) # type: ignore[arg-type]
1661
+ bg_score = _score_draft(bg_resp) if bg_resp else 0.0
1662
+ # Always label fast/slow consistently regardless
1663
+ # of which one we actually streamed: fast = system1,
1664
+ # slow = system2 in the persisted corpus.
1665
+ if stream_label == "fast":
1666
+ fast_text, fast_score = streamed_text, streamed_score
1667
+ slow_text = bg_resp.content if bg_resp else ""
1668
+ slow_score = bg_score
1669
+ else:
1670
+ slow_text, slow_score = streamed_text, streamed_score
1671
+ fast_text = bg_resp.content if bg_resp else ""
1672
+ fast_score = bg_score
1673
+ winner = ("fast" if fast_score >= slow_score
1674
+ or abs(fast_score - slow_score) < 0.02
1675
+ else "slow")
1676
+ middleware.observe_arbitration(
1677
+ turn_ctx,
1678
+ fast_text=fast_text,
1679
+ slow_text=slow_text,
1680
+ fast_score=fast_score,
1681
+ slow_score=slow_score,
1682
+ winner=winner,
1683
+ fast_model=s1.model,
1684
+ slow_model=s2.model,
1685
+ )
1686
+ except Exception as e:
1687
+ logger.debug(f"arbitration log failed: {e}")
1688
+ middleware.end_turn(turn_ctx, error=False)
1689
+ return
1690
+
1691
+ # Server-side agentic loop (cognos-* + no client
1692
+ # tools): LIVE streaming. We use the streaming
1693
+ # variant of the agentic loop so thinking_delta /
1694
+ # text_delta / tool_result events flow to the UI
1695
+ # as the model emits them across all iterations.
1696
+ if _should_run_agentic(requested_model, internal_tools):
1697
+ chunk_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
1698
+ created = int(time.time())
1699
+
1700
def _ag_chunk(delta, finish=None):
    """Wrap ``delta`` in an OpenAI-style ``chat.completion.chunk`` envelope.

    ``finish_reason`` is attached to the single choice only when ``finish``
    is truthy. The id, creation time and model name are taken from the
    enclosing scope, so every chunk built here carries the same values.
    """
    choice = {"index": 0, "delta": delta}
    if finish:
        choice["finish_reason"] = finish
    envelope = {
        "id": chunk_id,
        "object": "chat.completion.chunk",
        "created": created,
        "model": requested_model,
    }
    envelope["choices"] = [choice]
    return envelope
1711
+
1712
+ # Initial role chunk
1713
+ yield f"data: {json.dumps(_ag_chunk({'role':'assistant','content':''}))}\n\n".encode()
1714
+
1715
+ # Open the reasoning block at the start whenever
1716
+ # Caudate has a prefix — even if neither the LLM nor
1717
+ # any tools fire, Caudate's per-turn prediction is shown.
1718
+ ag_caudate_prefix = _caudate_prefix_block(prediction)
1719
+ ag_thinking_open = False
1720
+ ag_thinking_closed = False
1721
+ ag_text_started = False
1722
+ ag_thinking_buf = ""
1723
+ ag_tool_call_index = 0
1724
+ _AG_FLUSH_MIN = 24
1725
+
1726
def _ag_safe_flush_idx(buf: str) -> int:
    """Return how many leading characters of ``buf`` may be flushed.

    The cut lands just after the last whitespace character found at index
    ``_AG_FLUSH_MIN`` or later. Returns 0 when the buffer is shorter than
    ``_AG_FLUSH_MIN`` or has no whitespace in that range.
    """
    # An empty range (buffer too short) falls straight through to 0.
    for pos in reversed(range(_AG_FLUSH_MIN, len(buf))):
        if buf[pos].isspace():
            return pos + 1
    return 0
1733
+
1734
+ def _ag_open_block():
1735
+ nonlocal ag_thinking_open
1736
+ if ag_thinking_open or ag_thinking_closed:
1737
+ return None
1738
+ ag_thinking_open = True
1739
+ return _ag_chunk({"content": (
1740
+ '<details type="reasoning" done="false">\n'
1741
+ '<summary>Thinking…</summary>\n\n'
1742
+ f'{ag_caudate_prefix}'
1743
+ )})
1744
+
1745
+ def _ag_close_block():
1746
+ nonlocal ag_thinking_open, ag_thinking_closed
1747
+ if not ag_thinking_open or ag_thinking_closed:
1748
+ return None
1749
+ ag_thinking_closed = True
1750
+ ag_thinking_open = False
1751
+ return _ag_chunk({"content": "\n\n</details>\n\n"})
1752
+
1753
+ # Eagerly open the reasoning block so Caudate's
1754
+ # prefix is always visible.
1755
+ if ag_caudate_prefix:
1756
+ opener = _ag_open_block()
1757
+ if opener:
1758
+ yield f"data: {json.dumps(opener)}\n\n".encode()
1759
+
1760
+ try:
1761
+ async for ev in _run_agentic_loop_streaming(
1762
+ llm=llm,
1763
+ executor=agent.loop.executor,
1764
+ messages=internal_msgs,
1765
+ middleware=middleware, turn_ctx=turn_ctx,
1766
+ max_tokens=max_tokens, temperature=temperature,
1767
+ caller=forced_caller,
1768
+ ):
1769
+ if ev.type == "thinking_delta" and ev.delta:
1770
+ if middleware is not None and turn_ctx is not None:
1771
+ middleware.observe_thinking(turn_ctx, ev.delta)
1772
+ opener = _ag_open_block()
1773
+ if opener:
1774
+ yield f"data: {json.dumps(opener)}\n\n".encode()
1775
+ ag_thinking_buf += ev.delta
1776
+ cut = _ag_safe_flush_idx(ag_thinking_buf)
1777
+ if cut > 0:
1778
+ out = ag_thinking_buf[:cut]
1779
+ ag_thinking_buf = ag_thinking_buf[cut:]
1780
+ # NOTE: emit ONLY in-content
1781
+ # <details> stream. Do NOT also
1782
+ # emit reasoning_content — Open
1783
+ # WebUI 0.9.x renders a fresh
1784
+ # "Thought for ..." widget per
1785
+ # reasoning_content delta,
1786
+ # producing one widget per word
1787
+ # boundary instead of one block.
1788
+ yield f"data: {json.dumps(_ag_chunk({'content': out}))}\n\n".encode()
1789
+ elif ev.type == "tool_result":
1790
+ # Inline label inside the reasoning
1791
+ # block so the user can see what
1792
+ # happened. Status emoji + name +
1793
+ # short summary.
1794
+ opener = _ag_open_block()
1795
+ if opener:
1796
+ yield f"data: {json.dumps(opener)}\n\n".encode()
1797
+ if ag_thinking_buf:
1798
+ out = ag_thinking_buf
1799
+ ag_thinking_buf = ""
1800
+ yield f"data: {json.dumps(_ag_chunk({'content': out}))}\n\n".encode()
1801
+ status = (ev.raw or {}).get("status", "success")
1802
+ icon = "✓" if status == "success" else "✗"
1803
+ line = f"\n\n{icon} **{ev.tool_name}** — {(ev.delta or '')[:200]}\n\n"
1804
+ yield f"data: {json.dumps(_ag_chunk({'content': line}))}\n\n".encode()
1805
+ elif ev.type == "tool_use_end":
1806
+ # Emit the OpenAI-shape tool_calls
1807
+ # delta so the OpenAI client side
1808
+ # knows what was called. The
1809
+ # tool_result event above renders
1810
+ # the human-visible label.
1811
+ yield f"data: {json.dumps(_ag_chunk({'tool_calls': [{'index': ag_tool_call_index, 'id': ev.tool_use_id or '', 'type': 'function', 'function': {'name': ev.tool_name or '', 'arguments': json.dumps(ev.tool_input or {})}}]}))}\n\n".encode()
1812
+ ag_tool_call_index += 1
1813
+ elif ev.type == "text_delta" and ev.delta:
1814
+ # Final answer arriving — flush the
1815
+ # buffer + close the reasoning
1816
+ # block before the answer streams.
1817
+ if not ag_text_started:
1818
+ if ag_thinking_buf:
1819
+ out = ag_thinking_buf
1820
+ ag_thinking_buf = ""
1821
+ yield f"data: {json.dumps(_ag_chunk({'content': out}))}\n\n".encode()
1822
+ close = _ag_close_block()
1823
+ if close:
1824
+ yield f"data: {json.dumps(close)}\n\n".encode()
1825
+ ag_text_started = True
1826
+ if middleware is not None and turn_ctx is not None:
1827
+ middleware.observe_response_text(turn_ctx, ev.delta)
1828
+ yield f"data: {json.dumps(_ag_chunk({'content': ev.delta}))}\n\n".encode()
1829
+ elif ev.type == "iteration_break":
1830
+ # Currently a no-op: nothing is emitted for
1831
+ # iteration boundaries. A visual break here
1832
+ # could help when the LLM thinks across multiple
1833
+ # iterations punctuated by tool calls.
1834
+ pass
1835
+ elif ev.type == "message_stop":
1836
+ pass
1837
+ except Exception as e:
1838
+ logger.exception("agentic stream failed")
1839
+ yield f"data: {json.dumps(_ag_chunk({'content': f'[error] {e}'}, finish='stop'))}\n\n".encode()
1840
+ yield b"data: [DONE]\n\n"
1841
+ middleware.end_turn(turn_ctx, error=True)
1842
+ return
1843
+
1844
+ # Stream finished. If we never got a text_delta
1845
+ # (e.g. tools-only conversation that hit max
1846
+ # iterations), flush remaining thinking + close.
1847
+ if not ag_text_started:
1848
+ if ag_thinking_buf:
1849
+ out = ag_thinking_buf
1850
+ ag_thinking_buf = ""
1851
+ yield f"data: {json.dumps(_ag_chunk({'content': out}))}\n\n".encode()
1852
+ close = _ag_close_block()
1853
+ if close:
1854
+ yield f"data: {json.dumps(close)}\n\n".encode()
1855
+
1856
+ yield f"data: {json.dumps(_ag_chunk({}, finish='stop'))}\n\n".encode()
1857
+ yield b"data: [DONE]\n\n"
1858
+ middleware.end_turn(turn_ctx, error=False)
1859
+ return
1860
+
1861
+ # Normal streaming path (no arbitration, no agentic)
1862
+ async for chunk in _stream_openai_events(
1863
+ llm=llm, messages=internal_msgs, tools=internal_tools,
1864
+ max_tokens=max_tokens, temperature=temperature,
1865
+ requested_model=requested_model,
1866
+ middleware=middleware, turn_ctx=turn_ctx,
1867
+ caller=forced_caller,
1868
+ prediction=prediction,
1869
+ ):
1870
+ yield chunk
1871
+ return StreamingResponse(_gen(), media_type="text/event-stream")
1872
+
1873
+ # Non-streaming
1874
+ agentic = _should_run_agentic(requested_model, internal_tools)
1875
+ try:
1876
+ with subscription_auth_scope():
1877
+ if arbitrate:
1878
+ # Pattern 2: parallel arbitration through Caudate.
1879
+ # Arbitration takes precedence over the agentic
1880
+ # loop; the dual-brain comparison is more valuable
1881
+ # than tool-driven multi-step on cognos-dual-brain.
1882
+ from llm.router import DualLLMProvider
1883
+ if isinstance(llm, DualLLMProvider):
1884
+ resp = await _dual_brain_arbitrate(
1885
+ llm=llm,
1886
+ messages=internal_msgs, tools=internal_tools,
1887
+ max_tokens=max_tokens, temperature=temperature,
1888
+ middleware=middleware, turn_ctx=turn_ctx,
1889
+ )
1890
+ else:
1891
+ resp = await llm.chat(
1892
+ messages=internal_msgs, tools=internal_tools,
1893
+ max_tokens=max_tokens, temperature=temperature,
1894
+ )
1895
+ elif agentic:
1896
+ # Server-side ReAct: LLM proposes tool calls, we
1897
+ # execute them, loop until final text. Open WebUI
1898
+ # users get full Cognos tool capabilities — Bash,
1899
+ # Read, Write, Edit, Grep, Glob, PythonExec, etc.
1900
+ resp = await _run_agentic_loop(
1901
+ llm=llm,
1902
+ executor=agent.loop.executor,
1903
+ messages=internal_msgs,
1904
+ middleware=middleware, turn_ctx=turn_ctx,
1905
+ max_tokens=max_tokens, temperature=temperature,
1906
+ caller=forced_caller,
1907
+ )
1908
+ else:
1909
+ resp = await llm.chat(
1910
+ messages=internal_msgs, tools=internal_tools,
1911
+ max_tokens=max_tokens, temperature=temperature,
1912
+ caller=forced_caller,
1913
+ )
1914
+ except Exception as e:
1915
+ logger.exception("LLM call failed (openai-compat)")
1916
+ middleware.end_turn(turn_ctx, error=True)
1917
+ raise HTTPException(500, f"LLM error: {e}")
1918
+
1919
+ middleware.observe_response_text(turn_ctx, resp.content or "")
1920
+ if getattr(resp, "thinking", None):
1921
+ middleware.observe_thinking(turn_ctx, resp.thinking)
1922
+ for tc in resp.tool_calls:
1923
+ middleware.observe_tool_use(turn_ctx, tc.name)
1924
+ middleware.end_turn(turn_ctx, error=False)
1925
+
1926
+ # Fallback: if the model produced thinking but no visible
1927
+ # content (Kimi-k2.6 cuts off mid-thinking under tight budget),
1928
+ # surface the thinking as the reply so the user sees something
1929
+ # instead of a blank message.
1930
+ text_out = resp.content
1931
+ if (not text_out) and getattr(resp, "thinking", ""):
1932
+ text_out = (
1933
+ f"[thinking — model didn't finish before max_tokens]\n\n"
1934
+ f"{resp.thinking}"
1935
+ )
1936
+
1937
+ # Constitutional critique pass — only for `cognos-strict`.
1938
+ # Runs `core/constitution.py::run_critique` against the response;
1939
+ # if any rule in COGNOS_CONSTITUTION.md is violated, the LLM is
1940
+ # asked to revise. Adds 1–2 LLM calls per turn (worth it when
1941
+ # accuracy matters more than latency).
1942
+ if constitutional and text_out:
1943
+ try:
1944
+ from core.constitution import run_critique
1945
+ # Find the most recent user message for the critique
1946
+ user_msg = ""
1947
+ for m in reversed(internal_msgs):
1948
+ if m.get("role") == "user":
1949
+ c = m.get("content", "")
1950
+ if isinstance(c, list):
1951
+ c = " ".join(b.get("text", "") for b in c
1952
+ if isinstance(b, dict))
1953
+ user_msg = c or ""
1954
+ break
1955
+ logger.info(
1956
+ f"constitutional critique: starting "
1957
+ f"(response={len(text_out)} chars)"
1958
+ )
1959
+ revised, violations = await run_critique(
1960
+ llm=llm,
1961
+ user_message=user_msg,
1962
+ response=text_out,
1963
+ )
1964
+ if violations:
1965
+ logger.info(
1966
+ f"constitutional critique: revised after "
1967
+ f"{len(violations)} violation(s) "
1968
+ f"({[v.get('rule') for v in violations]})"
1969
+ )
1970
+ text_out = revised
1971
+ else:
1972
+ logger.info(
1973
+ "constitutional critique: clean "
1974
+ "(no violations found)"
1975
+ )
1976
+ except Exception as e:
1977
+ logger.warning(f"constitutional critique failed: {e}")
1978
+
1979
+ return JSONResponse(_build_openai_response(
1980
+ text=text_out,
1981
+ tool_calls=resp.tool_calls,
1982
+ model=requested_model,
1983
+ usage=resp.usage,
1984
+ stop_reason=resp.stop_reason,
1985
+ ))
1986
+
1987
+ # /v1/models is owned by api/anthropic_compat — that endpoint
1988
+ # already returns a hybrid shape that satisfies both Anthropic and
1989
+ # OpenAI clients (we updated it when adding this router). No need
1990
+ # to register a second /v1/models here; FastAPI would take only
1991
+ # whichever was registered first anyway.
1992
+
1993
+ return router