PyPI - caudate-cli - Versions diffs - 0.1.0__py3-none-any.whl - Mend

caudate-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (153)

api/__init__.py +5 -0
api/anthropic_compat.py +1518 -0
api/artifact_viewer.py +366 -0
api/caudate_middleware.py +618 -0
api/forge_bootstrapper_routes.py +377 -0
api/forge_routes.py +630 -0
api/forge_system_routes.py +294 -0
api/openai_compat.py +1993 -0
api/server.py +667 -0
api/storyboard_page.py +677 -0
caudate_cli-0.1.0.dist-info/METADATA +354 -0
caudate_cli-0.1.0.dist-info/RECORD +153 -0
caudate_cli-0.1.0.dist-info/WHEEL +5 -0
caudate_cli-0.1.0.dist-info/entry_points.txt +2 -0
caudate_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
caudate_cli-0.1.0.dist-info/top_level.txt +14 -0
cognos_mcp/__init__.py +4 -0
cognos_mcp/bridge.py +41 -0
cognos_mcp/client.py +70 -0
cognos_mcp/config.py +49 -0
cognos_mcp/server.py +66 -0
config.py +82 -0
core/__init__.py +0 -0
core/agent.py +468 -0
core/agentic_loop.py +731 -0
core/anthropic_auth.py +91 -0
core/background.py +113 -0
core/banner.py +134 -0
core/bootstrap.py +292 -0
core/citations.py +131 -0
core/compaction.py +109 -0
core/constitution.py +198 -0
core/diff_viewer.py +87 -0
core/export.py +85 -0
core/file_refs.py +119 -0
core/files.py +199 -0
core/hooks.py +209 -0
core/image.py +599 -0
core/input.py +91 -0
core/loop.py +238 -0
core/memory_md.py +147 -0
core/notifications.py +99 -0
core/ownership.py +181 -0
core/paste.py +81 -0
core/permissions.py +210 -0
core/plan_mode.py +215 -0
core/sandbox_prompt.py +185 -0
core/scheduler.py +195 -0
core/schemas.py +202 -0
core/session.py +90 -0
core/settings.py +132 -0
core/skills.py +398 -0
core/slash_commands.py +977 -0
core/statusline.py +61 -0
core/subagent.py +300 -0
core/thinking.py +50 -0
core/updater.py +122 -0
core/usage.py +109 -0
core/worktree.py +93 -0
execution/__init__.py +0 -0
execution/executor.py +329 -0
execution/plugins.py +108 -0
execution/tools/__init__.py +0 -0
execution/tools/agent_tool.py +107 -0
execution/tools/agentic_tool.py +297 -0
execution/tools/artifact_tool.py +191 -0
execution/tools/ask_user_question_tool.py +137 -0
execution/tools/base.py +81 -0
execution/tools/calculator_tool.py +137 -0
execution/tools/cognos_card_tool.py +124 -0
execution/tools/cron_tool.py +215 -0
execution/tools/datetime_tool.py +215 -0
execution/tools/describe_image_tool.py +161 -0
execution/tools/draw_tool.py +164 -0
execution/tools/edit_image_tool.py +262 -0
execution/tools/edit_tool.py +245 -0
execution/tools/file_tool.py +90 -0
execution/tools/find_anywhere_tool.py +255 -0
execution/tools/forge_feature_tools.py +377 -0
execution/tools/glob_tool.py +59 -0
execution/tools/grep_tool.py +89 -0
execution/tools/http_request_tool.py +224 -0
execution/tools/load_skill_tool.py +104 -0
execution/tools/longcat_avatar_tool.py +384 -0
execution/tools/mcp_tool.py +100 -0
execution/tools/notebook_tool.py +279 -0
execution/tools/openapi_tool.py +440 -0
execution/tools/plan_mode_tool.py +95 -0
execution/tools/push_notification_tool.py +157 -0
execution/tools/python_tool.py +61 -0
execution/tools/respond_tool.py +40 -0
execution/tools/sandbox_tool.py +378 -0
execution/tools/search_tool.py +153 -0
execution/tools/semantic_search_tool.py +106 -0
execution/tools/shell_tool.py +283 -0
execution/tools/speak_tool.py +134 -0
execution/tools/storyboard_tool.py +727 -0
execution/tools/system_info_tool.py +212 -0
execution/tools/task_tool.py +323 -0
execution/tools/think_tool.py +49 -0
execution/tools/transcribe_audio_tool.py +86 -0
execution/tools/update_memory_tool.py +92 -0
execution/tools/web_fetch_tool.py +82 -0
execution/tools/worktree_tool.py +174 -0
llm/__init__.py +0 -0
llm/fallback.py +116 -0
llm/models.py +320 -0
llm/provider.py +1356 -0
llm/router.py +373 -0
main.py +1889 -0
memory/__init__.py +0 -0
memory/episodic.py +99 -0
memory/procedural.py +145 -0
memory/semantic.py +71 -0
memory/working.py +64 -0
nn/__init__.py +43 -0
nn/auto_evolve.py +245 -0
nn/caudate.py +136 -0
nn/config.py +141 -0
nn/consolidator.py +81 -0
nn/data.py +1635 -0
nn/encoder.py +258 -0
nn/forge_advisor.py +303 -0
nn/format.py +235 -0
nn/heads.py +432 -0
nn/observer.py +994 -0
nn/policy.py +214 -0
nn/runtime.py +343 -0
nn/scorer.py +175 -0
nn/trainer.py +515 -0
nn/vision.py +352 -0
personality/__init__.py +23 -0
personality/engine.py +129 -0
personality/identity.py +144 -0
personality/inner_voice.py +100 -0
personality/mood.py +205 -0
planning/__init__.py +0 -0
planning/dev_server.py +221 -0
planning/forge_models.py +718 -0
planning/orchestrator.py +1363 -0
planning/planner.py +451 -0
planning/task_graph.py +61 -0
reflection/__init__.py +0 -0
reflection/meta_learner.py +156 -0
reflection/reflector.py +127 -0
ui/__init__.py +5 -0
ui/display.py +88 -0
voice/__init__.py +0 -0
voice/conversation.py +125 -0
voice/listener.py +111 -0
voice/speaker.py +59 -0
voice/stt.py +126 -0
voice/tts.py +214 -0

api/openai_compat.py (added)

@@ -0,0 +1,1993 @@
+"""OpenAI Chat Completions API compatibility layer.
+Lets Open WebUI (or any OpenAI-format client) point at Cognos and get
+answers back as if Cognos were OpenAI:
+    incoming  /v1/chat/completions  (OpenAI schema)
+        │
+        ▼
+    translate to Cognos's internal message format
+        │
+        ▼
+    LLMProvider.chat / .stream  (with subscription_auth_scope so the
+    web-side OAuth path is available, just like /chat does)
+        │
+        ▼
+    translate response back to OpenAI schema (regular or SSE stream)
+Caudate observes every turn through the same `CaudateMiddleware` already
+used by `/v1/messages`, so traffic from Open WebUI feeds her training
+corpus identically to traffic from Claude Code or our `/ui` chat.
+This is the OpenAI-shaped sibling of `api/anthropic_compat.py`.
+"""
+from __future__ import annotations
+import asyncio
+import json
+import logging
+import re
+import time
+import uuid
+from typing import Any, AsyncIterator
+from fastapi import APIRouter, HTTPException, Request
+from fastapi.responses import JSONResponse, StreamingResponse
+from api.caudate_middleware import CaudateMiddleware
+from core.anthropic_auth import subscription_auth_scope
+from core.schemas import StreamEvent, ToolUseBlock
+from llm.provider import LLMProvider, LLMResponse
+# ---- Dual-brain arbitration -----------------------------------------
+# Pattern 2 from the multi-brain options: both system1 and system2 are
+# called in parallel for every cognos-dual-brain turn. Their drafts are
+# compared via a heuristic scorer (Caudate-aware where possible) and
+# the winner is returned to the user. Both drafts are recorded for
+# preference-learning training data — Caudate's substrate to grow into
+# the conductor (Phase 4 of CAUDATE_EVOLUTION.md).
+# Phrases that strongly suggest a refusal — reduce score for drafts
+# that contain them so non-refusing draft wins by default.
+_REFUSAL_RE = re.compile(
+    r"\b("
+    r"i\s+can[' ]?t\s+(?:reproduce|share|provide|help|do|assist)|"
+    r"i\s+(?:cannot|am\s+not\s+able\s+to|am\s+unable\s+to)|"
+    r"i\s+don[' ]?t\s+have\s+access|"
+    r"copyright|"
+    r"violates?\s+(?:my\s+)?(?:guidelines|policy|policies|terms)|"
+    r"against\s+(?:my\s+)?(?:guidelines|policy)|"
+    r"ethical\s+guidelines|"
+    r"unable\s+to\s+(?:provide|share|reproduce)"
+    r")\b",
+    re.IGNORECASE,
+)
+_HEDGE_PHRASES = (
+    "i think", "i believe", "maybe", "perhaps", "i'm not sure",
+    "i would suggest", "you might want", "it could be", "i guess",
+)
+def _score_draft(resp: LLMResponse | None) -> float:
+    """Heuristic quality score for one draft, in [0, 1].
+    The previous version was nearly flat (most signal in a 0-0.20 length
+    bonus), which made >70% of arbitrations land inside the 0.05 tie
+    threshold and produce no real preference label. This version adds
+    several small independent factors so genuine drafts score apart from
+    each other reliably while keeping refusals and emptiness clearly
+    worst.
+    """
+    if resp is None:
+        return 0.0
+    text = (resp.content or "").strip()
+    if not text:
+        return 0.30 if getattr(resp, "thinking", "") else 0.10
+    score = 0.50
+    text_lower = text.lower()
+    # Refusal: stronger penalty so refusing draft cleanly loses (-0.40)
+    if _REFUSAL_RE.search(text):
+        score -= 0.40
+    # Length: brevity penalty for trivially short replies; soft reward
+    # for substance up to 1500 chars; no waffle bonus beyond.
+    L = len(text)
+    if L < 40:
+        score -= 0.10
+    score += min(0.15, L / 1500 * 0.15)
+    # Tool calls indicate the model engaged with the task structurally.
+    if getattr(resp, "tool_calls", None):
+        score += 0.10
+    # Concrete-content signals — each draft varies in these
+    # independently of length, which is what makes the gap widen.
+    if "```" in text:                       # code block / preformatted
+        score += 0.05
+    if any(c.isdigit() for c in text):      # numbers / data
+        score += 0.03
+    if "/" in text or "\\" in text:          # paths, filenames
+        score += 0.03
+    if "://" in text:                       # URLs
+        score += 0.03
+    # Hedge-language penalty — vague, low-confidence prose loses to
+    # direct prose of similar length.
+    if any(h in text_lower for h in _HEDGE_PHRASES):
+        score -= 0.05
+    return max(0.0, min(1.0, score))
+async def _dual_brain_arbitrate(
+    *,
+    llm: Any,                       # DualLLMProvider
+    messages: list[dict[str, Any]],
+    tools: list[dict[str, Any]] | None,
+    max_tokens: int,
+    temperature: float | None,
+    middleware: CaudateMiddleware | None,
+    turn_ctx: Any,
+) -> LLMResponse:
+    """Run system1 and system2 in parallel, score both drafts, return
+    the winner. Records both drafts via observe_arbitration so the
+    training corpus accumulates preference pairs."""
+    fast_provider = llm.router.fast
+    slow_provider = llm.router.slow
+    async def _call(prov: LLMProvider) -> LLMResponse | Exception:
+        try:
+            return await prov.chat(
+                messages=messages, tools=tools,
+                max_tokens=max_tokens, temperature=temperature,
+            )
+        except Exception as e:
+            logger.warning(f"arbitrate call failed for {prov.model}: {e}")
+            return e
+    # Run in parallel — total latency ≈ max(t_fast, t_slow), not the sum.
+    fast_resp, slow_resp = await asyncio.gather(
+        _call(fast_provider), _call(slow_provider),
+    )
+    # Failure handling: if one died, return the other; if both died, raise.
+    fast_ok = not isinstance(fast_resp, Exception)
+    slow_ok = not isinstance(slow_resp, Exception)
+    if fast_ok and not slow_ok:
+        return fast_resp
+    if slow_ok and not fast_ok:
+        return slow_resp
+    if not fast_ok and not slow_ok:
+        # both failed — propagate fast's error
+        raise fast_resp  # type: ignore[misc]
+    # Both succeeded — score them.
+    fast_score = _score_draft(fast_resp)
+    slow_score = _score_draft(slow_resp)
+    # Confidence safeguard: if the gap is small (< 0.02), prefer fast
+    # (system1) as the default — Caudate doesn't yet have strong
+    # arbitration; truly close calls shouldn't be made on weak signal.
+    # Threshold lowered from 0.05 → 0.02 alongside the more-decisive
+    # heuristic so genuine preferences are recorded instead of
+    # collapsing to "default to fast".
+    gap = abs(fast_score - slow_score)
+    if gap < 0.02:
+        winner_resp, winner_label = fast_resp, "fast"
+    elif fast_score >= slow_score:
+        winner_resp, winner_label = fast_resp, "fast"
+    else:
+        winner_resp, winner_label = slow_resp, "slow"
+    # Record the arbitration so the training corpus captures the
+    # comparison even when the winner is the one we'd default to.
+    if middleware is not None and turn_ctx is not None:
+        try:
+            middleware.observe_arbitration(
+                turn_ctx,
+                fast_text=(fast_resp.content or ""),
+                slow_text=(slow_resp.content or ""),
+                fast_score=fast_score,
+                slow_score=slow_score,
+                winner=winner_label,
+                fast_model=fast_provider.model,
+                slow_model=slow_provider.model,
+            )
+        except Exception as e:
+            logger.debug(f"observe_arbitration failed: {e}")
+    return winner_resp
+logger = logging.getLogger(__name__)
+# ---- Translation helpers --------------------------------------------
+# ---- Server-side agentic loop ---------------------------------------
+# When a `cognos-*` model is requested AND the client did not supply a
+# tool schema (Open WebUI doesn't), we run the tool-calling loop
+# server-side: LLM proposes a tool call → executor runs it → result
+# loops back into the messages → LLM is called again → repeat until
+# the LLM produces final text (no more tool_calls). The user only sees
+# the final text. Caudate observes every iteration through the
+# middleware exactly as if the loop ran in the CLI.
+_AGENTIC_MAX_ITERATIONS = 12
+# Models for which we activate the server-side agentic loop. Anything
+# else (bare passthrough names) gets the legacy single-shot behavior.
+_AGENTIC_MODELS: frozenset[str] = frozenset({
+    "cognos", "cognos-fast", "cognos-slow",
+    "cognos-haiku", "cognos-kimi",
+    "cognos-dual-brain", "cognos-collab",
+    "cognos-vision",
+    "cognos-strict",
+})
+def _should_run_agentic(requested_model: str | None,
+                        client_supplied_tools: list | None) -> bool:
+    """Yes when the model is a cognos-* and the client didn't bring its
+    own tool definitions (i.e. Open WebUI / generic OpenAI client)."""
+    if client_supplied_tools:
+        return False
+    if not requested_model:
+        return False
+    return requested_model.lower() in _AGENTIC_MODELS
async def _run_agentic_loop(
    *,
    llm: Any,
    executor: Any,
    messages: list[dict[str, Any]],
    middleware: CaudateMiddleware,
    turn_ctx: Any,
    max_tokens: int,
    temperature: float | None,
    caller: str | None = None,
) -> LLMResponse:
    """Run a stateless server-side ReAct loop and return the last response.

    Each iteration calls ``llm.chat`` with the running history and the
    executor's tool definitions, runs every requested tool call through
    ``executor.execute_tool``, appends the results to the history and
    loops. The loop stops at the first response without tool calls, or
    after ``_AGENTIC_MAX_ITERATIONS`` iterations.

    Args:
        llm: Object exposing an awaitable ``chat(**kwargs)``.
        executor: Object exposing ``tool_definitions()`` and an awaitable
            ``execute_tool(name, args)``.
        messages: Starting conversation. It is copied, so the caller's
            list is not mutated.
        middleware: Receives ``observe_tool_use(turn_ctx, tool_name)``
            after every tool call. Failures there are logged, not raised.
        turn_ctx: Opaque context passed through to the middleware.
        max_tokens: Forwarded to every ``llm.chat`` call.
        temperature: Forwarded to ``llm.chat`` only when not ``None``.
        caller: Forwarded to ``llm.chat`` only when not ``None``.

    Returns:
        The first response with no tool calls. When the iteration cap is
        hit, the most recent response instead, which still carries tool
        calls.

    Raises:
        Whatever ``llm.chat`` raises. Tool exceptions are caught and fed
        back to the model as ``[error] ...`` tool output.
    """
    history: list[dict[str, Any]] = list(messages)

    # Strip CLI-only tools that don't make sense in the chat path:
    #   Respond - prints to the *server* console, so an answer sent
    #     through it never reaches the chat client.
    #   Think   - also a server-side console aid.
    # The model replies via plain text content instead.
    _CHAT_HIDDEN_TOOLS = {"Respond", "Think"}
    tool_defs = [
        td for td in executor.tool_definitions()
        if (td.get("function", {}).get("name") or td.get("name"))
           not in _CHAT_HIDDEN_TOOLS
    ]

    # Runaway-detector state: one (tool_name, args_fingerprint, was_error)
    # entry per executed tool call, across all iterations.
    _RUNAWAY_THRESHOLD = 3
    tool_call_log: list[tuple[str, str, bool]] = []
    nudge_already_injected = False

    def _args_fingerprint(args: dict[str, Any] | None) -> str:
        """Build a coarse, order-independent signature of the arguments.

        Only the first 60 characters of each value are kept, so calls
        whose values differ only in the tail share a fingerprint.
        """
        if not args:
            return ""
        parts: list[str] = []
        for key in sorted(args.keys()):
            value = args[key]
            as_text = value if isinstance(value, str) else json.dumps(value)
            parts.append(f"{key}={as_text[:60]}")
        return "|".join(parts)[:200]

    def _looks_like_error(output: str) -> bool:
        """Treat empty output and common failure markers as an error."""
        if not output or not output.strip():
            return True
        head = output.strip()[:200].lower()
        if head.startswith(("[error]", "error:")):
            return True
        return any(
            marker in head
            for marker in ("permission denied", "no such file", "command not found")
        )

    def _runaway_nudge() -> str | None:
        """Return a nudge message if the latest calls look stuck, else None.

        Looks at the last `_RUNAWAY_THRESHOLD` logged calls:
          (A) identical (tool, fingerprint) every time, even on success;
          (B) same tool with varying arguments, every call errored.
        """
        recent = tool_call_log[-_RUNAWAY_THRESHOLD:]
        tool_names = {entry[0] for entry in recent}
        fingerprints = {(entry[0], entry[1]) for entry in recent}

        if len(fingerprints) == 1:
            stuck_tool = recent[0][0]
            logger.warning(
                "agentic loop runaway detected: %s called %s+ times with "
                "identical args. injecting nudge.",
                stuck_tool, _RUNAWAY_THRESHOLD,
            )
            return (
                f"[system nudge] You've called `{stuck_tool}` with the "
                f"same arguments {_RUNAWAY_THRESHOLD}+ times in a row. "
                f"The result won't change on another call. **Take the "
                f"next action now**:\n"
                f"  (1) if you've been reading/inspecting and saying "
                f"you'll edit, **call the Write or Edit tool now** with "
                f"the actual content, or\n"
                f"  (2) ask me what to do next.\n"
                f"Do not call `{stuck_tool}` with these arguments again."
            )

        if len(tool_names) == 1 and all(entry[2] for entry in recent):
            stuck_tool = next(iter(tool_names))
            logger.warning(
                "agentic loop runaway detected: %s called %s+ times, "
                "all errors. injecting nudge.",
                stuck_tool, _RUNAWAY_THRESHOLD,
            )
            return (
                f"[system nudge] You've called `{stuck_tool}` "
                f"{_RUNAWAY_THRESHOLD}+ times now and every call "
                f"returned an error or empty result. **Stop "
                f"trying variations of the same command.** Either:\n"
                f"  (1) explain to me what you were trying to find "
                f"and ask me to clarify, or\n"
                f"  (2) try a fundamentally different approach "
                f"(a different tool, or step back and reason about "
                f"the original question).\n"
                f"Do not call `{stuck_tool}` again on the next turn."
            )

        return None

    last_resp: LLMResponse | None = None
    for _ in range(_AGENTIC_MAX_ITERATIONS):
        kwargs: dict[str, Any] = {
            "messages": history,
            "tools": tool_defs,
            "max_tokens": max_tokens,
        }
        if temperature is not None:
            kwargs["temperature"] = temperature
        if caller is not None:
            kwargs["caller"] = caller
        resp = await llm.chat(**kwargs)
        last_resp = resp

        tool_calls = resp.tool_calls or []
        # Resolve each call's id exactly once. The assistant message and
        # the matching tool-result message must carry the same id, even
        # when the provider did not supply one.
        call_ids = [tc.id or f"call_{uuid.uuid4().hex[:12]}" for tc in tool_calls]

        # Append the assistant turn to history (text + tool_calls).
        assistant_msg: dict[str, Any] = {
            "role": "assistant",
            "content": resp.content or None,
        }
        if tool_calls:
            assistant_msg["tool_calls"] = [
                {
                    "id": call_id,
                    "type": "function",
                    "function": {
                        "name": tc.name,
                        "arguments": json.dumps(tc.input or {}),
                    },
                }
                for tc, call_id in zip(tool_calls, call_ids)
            ]
        history.append(assistant_msg)

        # Done if the model didn't request any tool calls.
        if not tool_calls:
            return resp

        # Execute each tool call and append its result to history.
        for tc, call_id in zip(tool_calls, call_ids):
            try:
                result = await executor.execute_tool(tc.name, tc.input or {})
                output = result.output if hasattr(result, "output") else str(result)
                if hasattr(result, "error") and result.error:
                    output = f"[error] {result.error}"
            except Exception as e:
                logger.exception("tool %r crashed", tc.name)
                output = f"[error] tool {tc.name} raised: {e}"
            output_text = str(output)

            tool_call_log.append(
                (tc.name, _args_fingerprint(tc.input), _looks_like_error(output_text))
            )

            # Observation is best-effort: a failing observer must not
            # break the chat turn, but the failure should stay visible.
            try:
                middleware.observe_tool_use(turn_ctx, tc.name)
            except Exception as e:
                logger.debug("observe_tool_use failed: %s", e)

            history.append({
                "role": "tool",
                "tool_call_id": call_id,
                "name": tc.name,
                "content": output_text[:8000],   # cap to keep context manageable
            })

        # The nudge is one-shot per loop run.
        if not nudge_already_injected and len(tool_call_log) >= _RUNAWAY_THRESHOLD:
            nudge_text = _runaway_nudge()
            if nudge_text:
                history.append({"role": "user", "content": nudge_text})
                nudge_already_injected = True

    # Hit the cap without a tool-free response: hand back the last
    # response, which still has tool_calls.
    logger.warning(
        "_run_agentic_loop hit max_iterations=%s", _AGENTIC_MAX_ITERATIONS
    )
    if last_resp is not None:
        return last_resp
    return LLMResponse(content="", stop_reason="max_iterations")
async def _run_agentic_loop_streaming(
    *,
    llm: Any,
    executor: Any,
    messages: list[dict[str, Any]],
    middleware: CaudateMiddleware,
    turn_ctx: Any,
    max_tokens: int,
    temperature: float | None,
    caller: str | None = None,
) -> AsyncIterator[StreamEvent]:
    """Streaming sibling of `_run_agentic_loop`.

    Same ReAct contract: call the LLM with tools, execute any tool calls,
    loop until the LLM emits text only. Events are yielded as they
    arrive instead of after the whole loop completes.

    Events forwarded from ``llm.stream``: ``text_delta`` and
    ``thinking_delta`` (only when they carry a non-empty ``delta``) and
    ``tool_use_end``. The LLM's own ``message_stop`` is not forwarded;
    only its ``stop_reason`` is remembered.

    Events produced here:
      - ``StreamEvent(type="tool_result", tool_name=..., delta=summary,
        raw={"status": "success"|"error", "output_chars": int})`` after
        each tool finishes. ``summary`` is the first 240 characters of
        the output with newlines replaced by spaces.
      - ``StreamEvent(type="iteration_break")`` after each iteration that
        executed tools.
      - ``StreamEvent(type="message_stop", stop_reason=...)`` once at the
        end. It carries the LLM's stop reason (or ``"stop"``) on normal
        completion and ``"max_iterations"`` when the cap is hit.

    Args:
        llm: Object exposing ``stream(**kwargs)`` as an async iterator of
            events.
        executor: Object exposing ``tool_definitions()`` and an awaitable
            ``execute_tool(name, args)``.
        messages: Starting conversation. It is copied, so the caller's
            list is not mutated.
        middleware: Receives ``observe_tool_use(turn_ctx, tool_name)``
            after every tool call. Failures there are logged, not raised.
        turn_ctx: Opaque context passed through to the middleware.
        max_tokens: Forwarded to every ``llm.stream`` call.
        temperature: Forwarded only when not ``None``.
        caller: Forwarded only when not ``None``.
    """
    history: list[dict[str, Any]] = list(messages)

    # Respond / Think are hidden from the chat path.
    _CHAT_HIDDEN_TOOLS = {"Respond", "Think"}
    tool_defs = [
        td for td in executor.tool_definitions()
        if (td.get("function", {}).get("name") or td.get("name"))
           not in _CHAT_HIDDEN_TOOLS
    ]

    # Runaway-detector state: one (tool_name, args_fingerprint, was_error)
    # entry per executed tool call, across all iterations.
    _RUNAWAY_THRESHOLD = 3
    tool_call_log: list[tuple[str, str, bool]] = []
    nudge_already_injected = False

    def _args_fingerprint(args: dict[str, Any] | None) -> str:
        """Build a coarse, order-independent signature of the arguments."""
        if not args:
            return ""
        parts: list[str] = []
        for key in sorted(args.keys()):
            value = args[key]
            as_text = value if isinstance(value, str) else json.dumps(value)
            parts.append(f"{key}={as_text[:60]}")
        return "|".join(parts)[:200]

    def _looks_like_error(output: str) -> bool:
        """Treat empty output and common failure markers as an error."""
        if not output or not output.strip():
            return True
        head = output.strip()[:200].lower()
        if head.startswith(("[error]", "error:")):
            return True
        return any(
            marker in head
            for marker in ("permission denied", "no such file", "command not found")
        )

    def _runaway_nudge() -> str | None:
        """Return a nudge message if the latest calls look stuck, else None.

        Looks at the last `_RUNAWAY_THRESHOLD` logged calls:
          (A) identical (tool, fingerprint) every time, even on success;
          (B) same tool with varying arguments, every call errored.
        """
        recent = tool_call_log[-_RUNAWAY_THRESHOLD:]
        tool_names = {entry[0] for entry in recent}
        fingerprints = {(entry[0], entry[1]) for entry in recent}

        if len(fingerprints) == 1:
            stuck_tool = recent[0][0]
            logger.warning(
                "agentic loop runaway detected: %s called %s+ times with "
                "identical args. injecting nudge.",
                stuck_tool, _RUNAWAY_THRESHOLD,
            )
            return (
                f"[system nudge] You've called `{stuck_tool}` with the "
                f"same arguments {_RUNAWAY_THRESHOLD}+ times in a row. "
                f"The result won't change on another call. **Take the "
                f"next action now**:\n"
                f"  (1) if you've been reading/inspecting and saying "
                f"you'll edit, **call the Write or Edit tool now** with "
                f"the actual content, or\n"
                f"  (2) ask me what to do next.\n"
                f"Do not call `{stuck_tool}` with these arguments again."
            )

        if len(tool_names) == 1 and all(entry[2] for entry in recent):
            stuck_tool = next(iter(tool_names))
            logger.warning(
                "agentic loop runaway detected: %s called %s+ times, "
                "all errors. injecting nudge.",
                stuck_tool, _RUNAWAY_THRESHOLD,
            )
            return (
                f"[system nudge] You've called `{stuck_tool}` "
                f"{_RUNAWAY_THRESHOLD}+ times now and every call "
                f"returned an error or empty result. **Stop "
                f"trying variations of the same command.** Either:\n"
                f"  (1) explain to me what you were trying to find "
                f"and ask me to clarify, or\n"
                f"  (2) try a fundamentally different approach.\n"
                f"Do not call `{stuck_tool}` again on the next turn."
            )

        return None

    for _ in range(_AGENTIC_MAX_ITERATIONS):
        kwargs: dict[str, Any] = {
            "messages": history,
            "tools": tool_defs,
            "max_tokens": max_tokens,
        }
        if temperature is not None:
            kwargs["temperature"] = temperature
        if caller is not None:
            kwargs["caller"] = caller

        # Accumulate this iteration's stream so the assistant turn can be
        # written to history once the stream ends.
        text_parts: list[str] = []
        iter_tool_calls: list[ToolUseBlock] = []
        iter_stop_reason: str | None = None
        async for event in llm.stream(**kwargs):
            if event.type == "text_delta" and event.delta:
                text_parts.append(event.delta)
                yield event
            elif event.type == "thinking_delta" and event.delta:
                # Forward live so the consumer can render thinking as it
                # crosses iteration boundaries.
                yield event
            elif event.type == "tool_use_end":
                # The id is resolved here, once; the assistant message
                # and the tool-result message both reuse it below.
                iter_tool_calls.append(ToolUseBlock(
                    id=event.tool_use_id or f"call_{uuid.uuid4().hex[:12]}",
                    name=event.tool_name or "",
                    input=event.tool_input or {},
                ))
                yield event
            elif event.type == "message_stop":
                iter_stop_reason = event.stop_reason
        iter_text = "".join(text_parts)

        # Append assistant turn to history.
        assistant_msg: dict[str, Any] = {
            "role": "assistant",
            "content": iter_text or None,
        }
        if iter_tool_calls:
            assistant_msg["tool_calls"] = [
                {
                    "id": tc.id,
                    "type": "function",
                    "function": {
                        "name": tc.name,
                        "arguments": json.dumps(tc.input or {}),
                    },
                }
                for tc in iter_tool_calls
            ]
        history.append(assistant_msg)

        # No tool calls: the final answer is the text already streamed.
        if not iter_tool_calls:
            yield StreamEvent(
                type="message_stop",
                stop_reason=iter_stop_reason or "stop",
            )
            return

        # Execute tool calls; yield a `tool_result` event per call so the
        # UI can render progress inline.
        for tc in iter_tool_calls:
            try:
                result = await executor.execute_tool(tc.name, tc.input or {})
                output = result.output if hasattr(result, "output") else str(result)
                if hasattr(result, "error") and result.error:
                    output = f"[error] {result.error}"
            except Exception as e:
                logger.exception("tool %r crashed", tc.name)
                output = f"[error] tool {tc.name} raised: {e}"
            output_text = str(output)

            errored = _looks_like_error(output_text)
            tool_call_log.append((tc.name, _args_fingerprint(tc.input), errored))

            # Observation is best-effort: a failing observer must not
            # break the stream, but the failure should stay visible.
            try:
                middleware.observe_tool_use(turn_ctx, tc.name)
            except Exception as e:
                logger.debug("observe_tool_use failed: %s", e)

            history.append({
                "role": "tool",
                "tool_call_id": tc.id,
                "name": tc.name,
                "content": output_text[:8000],
            })

            # Short summary for the UI; the (capped) full output stays in
            # `history` for the LLM's next iteration.
            summary = output_text[:240].replace("\n", " ")
            yield StreamEvent(
                type="tool_result",
                tool_name=tc.name,
                delta=summary,
                raw={"status": "error" if errored else "success",
                     "output_chars": len(output_text)},
            )

        # The nudge is one-shot per loop run.
        if not nudge_already_injected and len(tool_call_log) >= _RUNAWAY_THRESHOLD:
            nudge_text = _runaway_nudge()
            if nudge_text:
                history.append({"role": "user", "content": nudge_text})
                nudge_already_injected = True

        yield StreamEvent(type="iteration_break")

    # Hit max iterations.
    logger.warning(
        "_run_agentic_loop_streaming hit max_iterations=%s",
        _AGENTIC_MAX_ITERATIONS,
    )
    yield StreamEvent(type="message_stop", stop_reason="max_iterations")
+# ---- Slash-command interception ------------------------------------
+# Open WebUI users type `/caudate`, `/sessions`, etc. as normal chat
+# messages — without interception, those would just go to the LLM as
+# free-form text. Instead, we dispatch through `core/slash_commands.py`
+# (the same registry the Cognos /ui/ uses) and return the result as
+# an assistant reply, skipping the LLM call entirely. Saves quota and
+# gives Open WebUI parity with Cognos /ui/.
+import re as _re
# Matches Rich console markup tags so they can be removed from chat output:
#   - opening tags that start with a letter, `#` or `@`
#     ([red], [bold italic], [#ff0000], [@click=...])
#   - closing tags, including Rich's bare "close the last style" tag
#     ([/red], [/#ff0000], [/])
# Bracketed text that starts with anything else (e.g. "[0]") is left alone.
_RICH_TAG_RE = _re.compile(r"\[(?:/[^\]]*|[a-zA-Z#@][^\]]*)\]")


def _strip_rich_markup(s: str) -> str:
    """Remove Rich console markup tags from *s*.

    Slash-command handlers format their output for a Rich console
    (``[red]X[/red]``, ``[dim]Y[/]``); the chat UI would show those tags
    literally, so they are stripped before the text is returned.

    Args:
        s: Text that may contain Rich markup. May be empty or ``None``.

    Returns:
        *s* with every markup tag removed. Falsy input is returned as-is.
    """
    if not s:
        return s
    return _RICH_TAG_RE.sub("", s)
def _last_user_text(messages: list[dict[str, Any]]) -> str:
    """Return the stripped text of the most recent ``user`` message.

    Content may be a plain string or an OpenAI-style list of content
    blocks; for a list, the ``text`` fields of the dict blocks are joined
    with single spaces.

    Malformed input never raises: non-dict messages are skipped, blocks
    whose ``text`` is missing or not a string contribute an empty string,
    and content that is neither a string nor a list yields ``""``.

    Args:
        messages: Chat messages, oldest first. May be empty or ``None``.

    Returns:
        The stripped user text, or ``""`` when there is no user message
        or it carries no usable text.
    """
    for m in reversed(messages or []):
        if not isinstance(m, dict) or m.get("role") != "user":
            continue
        c = m.get("content", "")
        if isinstance(c, list):
            # Non-text blocks (e.g. image_url) have no string `text`;
            # they contribute "" exactly as a missing key would.
            c = " ".join(
                b["text"] if isinstance(b.get("text"), str) else ""
                for b in c
                if isinstance(b, dict)
            )
        # Only the latest user message is considered, even if it is empty.
        return c.strip() if isinstance(c, str) else ""
    return ""
async def _try_slash_intercept(body: dict, messages: list[dict],
                               agent: Any) -> "JSONResponse | None":
    """Run the latest user message as a slash command, if it is one.

    Args:
        body: The raw /v1/chat/completions request body; only ``model``
            is read, to echo it back in the response.
        messages: The translated chat messages.
        agent: The resolved agent handed to the slash registry via
            ``SlashContext``. ``None`` disables interception.

    Returns:
        A finished chat-completion ``JSONResponse`` when the message was
        handled as a slash command (including the case where the handler
        raised), or ``None`` so the caller falls through to the normal
        LLM flow.
    """
    user_text = _last_user_text(messages).strip()
    if not user_text.startswith("/"):
        return None
    if agent is None:
        return None
    try:
        from core.slash_commands import dispatch, SlashContext, SlashResult
    except Exception:
        # Best-effort: without the registry the message simply goes to
        # the LLM, but leave a trace so a broken install is diagnosable
        # instead of silently turning every slash command into chat text.
        logger.warning(
            "slash registry (core.slash_commands) could not be imported; "
            "passing the message through to the LLM",
            exc_info=True,
        )
        return None

    # Some handlers (`/sessions`, `/help`, ...) emit Rich tables via
    # `ctx.console.print(...)` and return an empty string. A
    # StringIO-backed Console captures that printed output so it can be
    # merged with the handler's return value.
    import io
    from rich.console import Console

    model = body.get("model") or "cognos"
    buffer = io.StringIO()
    try:
        ctx = SlashContext(
            agent=agent,
            console=Console(file=buffer, force_terminal=False, width=120),
        )
        result = dispatch(user_text, ctx)
    except Exception as e:
        logger.exception("slash intercept failed")
        return _slash_response(f"slash command failed: {e}", model)

    if result is None:
        # Not a recognised slash command -> let the LLM handle it.
        return None

    if isinstance(result, SlashResult):
        text = f"_slash result: {result.value}_"
    else:
        # Console-printed output (tables) comes first, then the returned
        # one-line status; either or both may be empty.
        result_str = _strip_rich_markup(str(result)).rstrip()
        printed = _strip_rich_markup(buffer.getvalue()).rstrip()
        text = (
            "\n\n".join(part for part in (printed, result_str) if part)
            or "_(no output)_"
        )
    logger.info(f"slash intercept handled {user_text.split()[0]!r}")
    return _slash_response(text, model)
def _slash_response(text: str, model: str) -> "JSONResponse":
    """Wrap slash-command output in an OpenAI-shaped chat completion.

    Args:
        text: The assistant-visible output of the slash command.
        model: Model name to echo back in the response.

    Returns:
        A ``JSONResponse`` holding a single ``stop``-finished assistant
        choice and an all-zero ``usage`` block (no LLM call was made).
    """
    reply = {"role": "assistant", "content": text}
    only_choice = {"index": 0, "message": reply, "finish_reason": "stop"}
    payload = {
        "id": f"chatcmpl-slash-{uuid.uuid4().hex[:16]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [only_choice],
        "usage": dict.fromkeys(
            ("prompt_tokens", "completion_tokens", "total_tokens"), 0
        ),
    }
    return JSONResponse(payload)
def _safe_image_ext(mime: str) -> str:
    """Derive a filesystem-safe file extension from an untrusted MIME type."""
    import re as _re

    ext = mime.split("/", 1)[-1] if "/" in mime else "bin"
    if ext == "jpeg":
        return "jpg"
    # The subtype comes straight from a client-supplied data: URL. Anything
    # that could act as a path component (separators, "..", whitespace,
    # over-long values) is replaced by a neutral extension.
    if ".." in ext or not _re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9.+-]{0,31}", ext):
        return "bin"
    return ext


def _persist_inline_images(message: dict[str, Any]) -> dict[str, Any]:
    """Save inline ``image_url`` data URLs to FileStore and reference them
    from the message text.

    OpenAI multimodal content blocks look like:
        {"role": "user", "content": [
            {"type": "text", "text": "..."},
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
        ]}

    Behaviour:
      - String (or empty / non-list) ``content`` -> returned unchanged.
      - For every ``image_url`` block whose URL is a base64 ``data:`` URL,
        the payload is decoded and uploaded through ``FileStore``. A
        ``[uploaded image: files/<id>]`` marker is appended to the first
        text block (or a new text block is inserted at the front) so the
        LLM has a string handle for the image. The ``image_url`` block
        itself is left in place.
      - http(s) URLs, malformed blocks and undecodable payloads are
        skipped; a failed upload is logged and skipped.

    Args:
        message: A chat message dict. Its ``content`` key is replaced in
            place when at least one image was persisted.

    Returns:
        The same ``message`` object.
    """
    content = message.get("content")
    if not isinstance(content, list) or not content:
        return message

    import base64 as _b64
    import re as _re
    import tempfile as _tempfile
    import uuid as _uuid
    from pathlib import Path as _Path

    saved_refs: list[str] = []
    for block in content:
        if not isinstance(block, dict) or block.get("type") != "image_url":
            continue
        # `image_url` is normally {"url": ...}; tolerate a bare string or
        # any other shape instead of raising on `.get` / `.startswith`.
        image_url = block.get("image_url")
        url = image_url.get("url") if isinstance(image_url, dict) else image_url
        if not isinstance(url, str) or not url.startswith("data:"):
            continue
        # data:[<media-type>][;base64],<data>
        m = _re.match(r"data:([^;]+);base64,(.+)$", url)
        if not m:
            continue
        mime, b64data = m.group(1), m.group(2)
        try:
            raw = _b64.b64decode(b64data)
        except Exception as e:
            logger.warning(f"failed to decode inline image: {e}")
            continue
        # Persist
        try:
            from config import FILES_DIR
            from core.files import FileStore

            ext = _safe_image_ext(mime)
            tmp = _tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False)
            # Bind the path before writing so the `finally` below removes
            # the file even when the write itself fails.
            tmp_path = _Path(tmp.name)
            try:
                with tmp:
                    tmp.write(raw)
                fs = FileStore(root=FILES_DIR)
                rec = fs.upload(
                    tmp_path,
                    filename=f"upload_{_uuid.uuid4().hex[:8]}.{ext}",
                )
                saved_refs.append(rec.id)
                logger.info(f"persisted user-uploaded image as files/{rec.id}")
            finally:
                tmp_path.unlink(missing_ok=True)
        except Exception as e:
            logger.warning(f"failed to save uploaded image to FileStore: {e}")

    if not saved_refs:
        return message

    # Append the marker to the first text block (or insert one) so the
    # LLM has a string handle to pass to EditImage/DescribeImage.
    marker = " ".join(f"[uploaded image: files/{rid}]" for rid in saved_refs)
    new_content = list(content)
    text_idx = next(
        (i for i, b in enumerate(new_content)
         if isinstance(b, dict) and b.get("type") == "text"),
        None,
    )
    if text_idx is None:
        new_content.insert(0, {"type": "text", "text": marker})
    else:
        existing = new_content[text_idx].get("text", "")
        new_content[text_idx] = {
            **new_content[text_idx],
            "text": (existing + ("\n\n" if existing else "") + marker),
        }
    message["content"] = new_content
    return message
def _translate_openai_to_internal(
    body: dict[str, Any],
) -> tuple[list[dict[str, Any]], list[dict[str, Any]] | None]:
    """OpenAI /v1/chat/completions body -> (messages, tools).

    OpenAI's message shape *is* the internal shape, so this is mostly
    pass-through with a small amount of normalisation:

      - messages that are not dicts, or whose role is not one of
        system/user/assistant/tool, are dropped;
      - each kept message is shallow-copied, and user messages are run
        through ``_persist_inline_images``;
      - tools are normalised to the wrapped
        ``{"type": "function", "function": {...}}`` form.

    Args:
        body: The parsed request body.

    Returns:
        ``(messages, tools)``. ``tools`` is ``None`` when the request has
        no usable tool definitions, including the case where ``tools``
        was present but contained no dict entries, so an empty list is
        never handed on.
    """
    raw_messages = body.get("messages") or []
    raw_tools = body.get("tools") or []

    out: list[dict[str, Any]] = []
    for m in raw_messages:
        if not isinstance(m, dict):
            continue
        role = m.get("role")
        if role not in ("system", "user", "assistant", "tool"):
            continue
        m = dict(m)
        # WebUI / OpenAI multimodal: uploaded images arrive as content
        # blocks with `image_url` data URLs. Tools such as EditImage need
        # a string handle, so each upload is persisted and a `files/<id>`
        # reference is appended to the message text.
        if role == "user":
            m = _persist_inline_images(m)
        # Everything else (content / tool_calls / tool_call_id) already
        # matches the internal shape.
        out.append(m)

    tools_translated: list[dict[str, Any]] = []
    for t in raw_tools:
        if not isinstance(t, dict):
            continue
        # Both `{"type":"function","function":{...}}` and bare
        # `{"name":...,"parameters":...}` show up; normalise to the
        # wrapped form.
        if t.get("type") == "function" and isinstance(t.get("function"), dict):
            tools_translated.append(t)
        else:
            tools_translated.append({
                "type": "function",
                "function": {
                    "name": t.get("name", ""),
                    "description": t.get("description", ""),
                    "parameters": t.get("parameters")
                                 or t.get("input_schema")
                                 or {"type": "object", "properties": {}},
                },
            })
    return out, (tools_translated or None)
def _build_openai_response(
    *,
    text: str,
    tool_calls: "list[ToolUseBlock]",
    model: str,
    usage: dict[str, int],
    stop_reason: str | None,
) -> dict[str, Any]:
    """Build the non-streaming /v1/chat/completions response body.

    Args:
        text: Assistant answer text; an empty string becomes ``None``.
        tool_calls: Tool invocations to report. Each needs ``id``,
            ``name`` and ``input`` attributes; a missing id is replaced
            by a generated ``call_...`` id.
        model: Model name to echo back.
        usage: Token counters. ``None``, missing keys and ``None`` values
            are all treated as 0; when ``total_tokens`` is absent or 0 it
            is derived from prompt + completion tokens.
        stop_reason: Upstream stop reason, translated to OpenAI
            vocabulary. Unknown or missing reasons map to ``"stop"``.

    Returns:
        A ``chat.completion`` dict with a single choice.
    """
    message: dict[str, Any] = {"role": "assistant", "content": text or None}
    if tool_calls:
        message["tool_calls"] = [
            {
                "id": tc.id or f"call_{uuid.uuid4().hex[:12]}",
                "type": "function",
                "function": {
                    "name": tc.name,
                    "arguments": json.dumps(tc.input or {}),
                },
            }
            for tc in tool_calls
        ]

    # Translate stop reasons to OpenAI vocabulary.
    stop_map = {
        "stop": "stop", "end_turn": "stop",
        "length": "length", "max_tokens": "length",
        "tool_calls": "tool_calls", "tool_use": "tool_calls",
    }
    finish = stop_map.get(stop_reason or "stop", "stop")
    # A reply that carries tool calls must not be reported as a plain stop.
    if tool_calls and finish == "stop":
        finish = "tool_calls"

    counters = usage or {}
    prompt_tokens = counters.get("prompt_tokens") or 0
    completion_tokens = counters.get("completion_tokens") or 0
    # Some upstreams report only the two parts; keep the total consistent.
    total_tokens = counters.get("total_tokens") or (prompt_tokens + completion_tokens)

    return {
        "id": f"chatcmpl-{uuid.uuid4().hex[:24]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [{
            "index": 0,
            "message": message,
            "finish_reason": finish,
        }],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": total_tokens,
        },
    }
+# ---- Streaming SSE generator ----------------------------------------
def _caudate_prefix_block(prediction: Any) -> str:
    """Render Caudate's per-turn prediction as a one-line markdown header.

    The line is placed at the top of the visible reasoning trace so the
    user can see what Caudate predicted before the LLM's own reasoning.

    Args:
        prediction: Object exposing ``tool``, ``tool_confidence``,
            ``tier``, ``tier_confidence``, ``think`` and ``value``.
            Missing or falsy attributes fall back to ``"?"`` / ``0.0``.
            ``None`` means there is nothing to show.

    Returns:
        The markdown line followed by a blank line, or ``""`` when
        *prediction* is ``None``.
    """
    if prediction is None:
        return ""

    # Best-effort lookup of the current graduation level; any failure
    # (missing module, unreadable state, ...) just shows "?".
    try:
        from nn.policy import GraduationPolicy
        from pathlib import Path
        from config import DATA_DIR

        policy_file = Path(DATA_DIR) / "nn" / "policy.json"
        level = GraduationPolicy(state_path=policy_file).level.label
    except Exception:
        level = "?"

    def _field(name: str, fallback: Any) -> Any:
        # Missing attribute and falsy value both collapse to the fallback.
        return getattr(prediction, name, fallback) or fallback

    tool = _field("tool", "?")
    tool_conf = _field("tool_confidence", 0.0)
    tier = _field("tier", "?")
    tier_conf = _field("tier_confidence", 0.0)
    think = _field("think", 0.0)
    value = _field("value", 0.0)

    segments = [
        f"**🧠 Caudate** ({level})",
        f"tier=`{tier}` ({tier_conf:.0%})",
        f"tool=`{tool}` ({tool_conf:.0%})",
        f"think={think:.2f}",
        f"value={value:.2f}",
    ]
    # Deliberately no horizontal rule (---) after the line: markdown
    # parsers inside a <details> block can read it as the end of the
    # block. A blank line gives the visual gap instead.
    return " · ".join(segments) + "\n\n"
async def _stream_openai_events(
    llm: LLMProvider,
    messages: list[dict[str, Any]],
    tools: list[dict[str, Any]] | None,
    max_tokens: int,
    temperature: float | None,
    requested_model: str,
    middleware: CaudateMiddleware | None = None,
    turn_ctx: Any = None,
    caller: str | None = None,
    prediction: Any = None,
) -> AsyncIterator[bytes]:
    """Cognos stream -> OpenAI SSE format.

    OpenAI emits a sequence of `data: {chatcmpl chunk}` events terminated
    by `data: [DONE]`. Each chunk has a `choices[0].delta` carrying the
    incremental content / tool_call / finish_reason.

    Thinking deltas are streamed inside a single
    `<details type="reasoning">` block within the regular `content`
    stream (no `reasoning_content` field is emitted):

      1. `<details>` is sent ONCE, either eagerly when Caudate has a
         prediction to show or on the first thinking delta. It is never
         closed and reopened.
      2. Thinking text is buffered and flushed only at whitespace, once
         at least `_THINKING_FLUSH_MIN` characters are pending, so a
         word is never split across chunks.
      3. `</details>` is sent ONCE: on the first answer text delta, at
         the end of the stream, or before the error chunk when the
         upstream fails, so the client never receives an unterminated
         block.

    Args:
        llm: Provider whose `stream(...)` yields the events.
        messages: Chat messages passed to `llm.stream`.
        tools: Tool definitions passed to `llm.stream`, or None.
        max_tokens: Token budget passed to `llm.stream`.
        temperature: Sampling temperature passed to `llm.stream`.
        requested_model: Model name echoed in every chunk.
        middleware: Optional observer. Together with `turn_ctx` it is
            notified of text, thinking, tool use and the end of the turn.
        turn_ctx: Turn handle passed back to `middleware`.
        caller: Optional routing tag forwarded to `llm.stream` as
            `caller` when set.
        prediction: Optional Caudate prediction rendered at the top of
            the reasoning block.

    Yields:
        Encoded SSE frames, always ending with `data: [DONE]`.
    """
    chunk_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
    created = int(time.time())

    def _sse(payload: dict[str, Any]) -> bytes:
        return f"data: {json.dumps(payload)}\n\n".encode()

    def _chunk(delta: dict[str, Any], finish: str | None = None) -> dict[str, Any]:
        choice: dict[str, Any] = {"index": 0, "delta": delta}
        if finish:
            choice["finish_reason"] = finish
        return {
            "id": chunk_id,
            "object": "chat.completion.chunk",
            "created": created,
            "model": requested_model,
            "choices": [choice],
        }

    # Reasoning-block state and helpers live OUTSIDE the try block so the
    # error handler can always use them, even if the failure happens
    # before the first event arrives.
    thinking_buf = ""
    thinking_open = False
    thinking_closed = False
    _THINKING_FLUSH_MIN = 24  # smaller = livelier
    _caudate_prefix = ""

    def _safe_flush_idx(buf: str) -> int:
        """Return an index <= len(buf) at which it's safe to cut.

        Safe = just after a whitespace character at or beyond
        `_THINKING_FLUSH_MIN`. Scans from the end backwards so the
        LARGEST safe prefix is flushed. Returns 0 if no safe point.
        """
        if len(buf) < _THINKING_FLUSH_MIN:
            return 0
        for i in range(len(buf) - 1, _THINKING_FLUSH_MIN - 1, -1):
            if buf[i].isspace():
                return i + 1  # include the whitespace
        return 0

    def _open_thinking():
        nonlocal thinking_open
        if thinking_open or thinking_closed:
            return None
        thinking_open = True
        return _chunk({"content": (
            '<details type="reasoning" done="false">\n'
            '<summary>Thinking…</summary>\n\n'
            f'{_caudate_prefix}'
        )})

    def _close_thinking():
        nonlocal thinking_open, thinking_closed
        if not thinking_open or thinking_closed:
            return None
        thinking_closed = True
        thinking_open = False
        return _chunk({"content": "\n\n</details>\n\n"})

    def _drain_thinking() -> list[bytes]:
        """Frames that flush pending thinking text and close the block."""
        nonlocal thinking_buf
        frames: list[bytes] = []
        if thinking_buf:
            frames.append(_sse(_chunk({"content": thinking_buf})))
            thinking_buf = ""
        closer = _close_thinking()
        if closer:
            frames.append(_sse(closer))
        return frames

    observing = middleware is not None and turn_ctx is not None

    # Initial role chunk (OpenAI convention)
    yield _sse(_chunk({"role": "assistant", "content": ""}))
    tool_call_index = 0
    stop_reason: str | None = None
    try:
        # Pass `caller` through so DualLLMProvider's Router honors
        # forced_slow / forced_fast tags from the cognos-* alias.
        stream_kwargs: dict[str, Any] = dict(
            messages=messages, tools=tools,
            max_tokens=max_tokens, temperature=temperature,
        )
        if caller:
            stream_kwargs["caller"] = caller

        # Caudate prefix: shown at the top of the thinking block so the
        # user sees her per-turn prediction BEFORE the LLM's reasoning.
        # Empty string when no prediction is available.
        _caudate_prefix = _caudate_prefix_block(prediction)
        # Eagerly open the block when Caudate has a prediction so she is
        # visible on EVERY turn, not just turns where the LLM emits
        # thinking.
        if _caudate_prefix:
            opener = _open_thinking()
            if opener:
                yield _sse(opener)

        async for event in llm.stream(**stream_kwargs):
            if event.type == "text_delta" and event.delta:
                # First answer-text delta: flush the remaining thinking
                # buffer and close the <details> block before the answer.
                for frame in _drain_thinking():
                    yield frame
                if observing:
                    middleware.observe_response_text(turn_ctx, event.delta)
                yield _sse(_chunk({"content": event.delta}))
            elif event.type == "thinking_delta" and event.delta:
                if observing:
                    middleware.observe_thinking(turn_ctx, event.delta)
                # First thinking delta: open the <details> block.
                opener = _open_thinking()
                if opener:
                    yield _sse(opener)
                thinking_buf += event.delta
                # Flush only at a safe word boundary so we never cut
                # mid-word.
                cut = _safe_flush_idx(thinking_buf)
                if cut > 0:
                    out = thinking_buf[:cut]
                    thinking_buf = thinking_buf[cut:]
                    yield _sse(_chunk({"content": out}))
            elif event.type == "tool_use_end":
                tc = ToolUseBlock(
                    id=event.tool_use_id or f"call_{uuid.uuid4().hex[:12]}",
                    name=event.tool_name or "",
                    input=event.tool_input or {},
                )
                if observing:
                    middleware.observe_tool_use(turn_ctx, tc.name)
                # OpenAI streams a tool_call entry inside delta.tool_calls
                yield _sse(_chunk({
                    "tool_calls": [{
                        "index": tool_call_index,
                        "id": tc.id,
                        "type": "function",
                        "function": {
                            "name": tc.name,
                            "arguments": json.dumps(tc.input or {}),
                        },
                    }],
                }))
                tool_call_index += 1
            elif event.type == "message_stop":
                stop_reason = event.stop_reason
    except Exception as e:
        logger.exception("OpenAI-compat stream upstream failed")
        # Terminate the reasoning block first so the client is not left
        # rendering an unclosed <details> element after the failure.
        for frame in _drain_thinking():
            yield frame
        # Send an error chunk and terminate
        error_chunk = _chunk({}, finish="error")
        error_chunk["error"] = {"message": str(e), "type": "api_error"}
        yield _sse(error_chunk)
        if observing:
            middleware.end_turn(turn_ctx, error=True)
        yield b"data: [DONE]\n\n"
        return

    # If the stream ended without an answer text_delta, the <details>
    # block is still open with possibly unflushed thinking inside.
    for frame in _drain_thinking():
        yield frame

    stop_map = {
        "stop": "stop", "end_turn": "stop",
        "length": "length", "max_tokens": "length",
        "tool_calls": "tool_calls", "tool_use": "tool_calls",
    }
    finish = stop_map.get(stop_reason or "stop", "stop")
    if tool_call_index > 0 and finish == "stop":
        finish = "tool_calls"
    yield _sse(_chunk({}, finish=finish))
    yield b"data: [DONE]\n\n"
    if observing:
        middleware.end_turn(turn_ctx, error=False)
+# ---- FastAPI router -------------------------------------------------
+def build_router() -> APIRouter:
+    """Build the /v1/chat/completions router. Reuses the same singleton
+    agent + middleware as `/v1/messages`, so Caudate sees one unified
+    stream of training samples regardless of which API the client
+    speaks."""
+    router = APIRouter()
+    _agent_box: dict[str, Any] = {"agent": None, "middleware": None}
+    def _get_agent():
+        if _agent_box["agent"] is None:
+            from core.agent import CognosAgent
+            agent = CognosAgent(
+                mode="agentic",
+                permission_mode="bypass",
+                personality=True,
+            )
+            _agent_box["agent"] = agent
+            _agent_box["middleware"] = CaudateMiddleware(agent)
+            cau = getattr(agent, "caudate", None)
+            cau_status = (cau.policy.level.label
+                          if cau and cau.policy else "unavailable")
+            logger.info(
+                f"OpenAI-compat singleton agent ready, "
+                f"llm={agent.llm.model}, caudate={cau_status}"
+            )
+        return _agent_box["agent"], _agent_box["middleware"]
+    @router.post("/v1/chat/completions")
+    async def chat_completions(request: Request):
+        try:
+            body = await request.json()
+        except Exception:
+            raise HTTPException(400, "Invalid JSON body")
+        if not isinstance(body, dict):
+            raise HTTPException(400, "Body must be a JSON object")
+        try:
+            internal_msgs, internal_tools = _translate_openai_to_internal(body)
+        except Exception as e:
+            logger.exception("OpenAI→internal translation failed")
+            raise HTTPException(400, f"Bad message format: {e}")
+        # --- Slash-command interception ---------------------------------
+        # When the latest user message starts with `/`, dispatch through
+        # the existing core/slash_commands.py registry instead of
+        # calling the LLM. Lets Open WebUI users hit `/caudate`,
+        # `/sessions`, `/usage`, `/skills`, `/clear`, etc. and get the
+        # same data the Cognos /ui/ shows — same one place.
+        # Resolve the agent first (needed by the slash registry).
+        _agent_for_slash, _ = _get_agent()
+        slash_response = await _try_slash_intercept(
+            body, internal_msgs, _agent_for_slash
+        )
+        if slash_response is not None:
+            return slash_response
+        # Inject sandbox-awareness hint so the LLM scaffolds new files
+        # into cognos/sandbox/ by default. Idempotent across multi-turn.
+        from core.sandbox_prompt import inject_sandbox_hint
+        internal_msgs = inject_sandbox_hint(internal_msgs)
+        max_tokens = int(body.get("max_tokens") or 4096)
+        temperature = body.get("temperature")
+        if temperature is not None:
+            temperature = float(temperature)
+        requested_model = body.get("model") or "cognos"
+        stream = bool(body.get("stream", False))
+        agent, middleware = _get_agent()
+        llm = agent.llm
+        # Alias resolution — the user-facing default is just `cognos`.
+        # Caudate decides arbitrate/constitutional/tier per-turn from
+        # her live prediction. The other cognos-* aliases stay
+        # functional as explicit power-user overrides (debug routes)
+        # but aren't advertised in /v1/models — the user shouldn't
+        # have to choose between 9 modes. Default = smart.
+        forced_caller: str | None = None
+        arbitrate: bool = False
+        constitutional: bool = False
+        rm = (requested_model or "").lower()
+        # --- Open Caudate's turn FIRST so her prediction is available
+        # to drive the unified-cognos routing decisions. (Was below the
+        # alias block previously; moved up.)
+        turn_ctx = middleware.begin_turn(
+            internal_msgs, internal_tools,
+            model_source=getattr(llm, "model", "unknown"),
+        )
+        prediction = getattr(turn_ctx, "prediction", None)
+        # --- Unified `cognos` (default): Caudate-driven tier routing.
+        #
+        # Constitutional critique is NOT auto-triggered here — it's an
+        # explicit opt-in via the `cognos-strict` alias.
+        #
+        # Why no auto-trigger: Caudate's value head doesn't yet
+        # differentiate high-stakes from low-stakes prompts at her
+        # current scale (~0.46 for both trivial and substantial
+        # prompts in measured tests). A length-based proxy was tried
+        # earlier but is brittle — short high-stakes prompts ("delete
+        # prod database?") slip past it, and it forces a critique
+        # round-trip on every long prompt regardless of need.
+        # The honest call: only critique when the user opts in.
+        if rm in ("cognos", "", None) and prediction is not None:
+            if prediction.tier_confidence < 0.6:
+                arbitrate = True
+            elif prediction.tier == "slow":
+                forced_caller = "forced_slow"
+            elif prediction.tier == "fast":
+                forced_caller = "forced_fast"
+            logger.info(
+                f"unified cognos routing: tier={prediction.tier} "
+                f"(conf={prediction.tier_confidence:.2f}) → "
+                f"arbitrate={arbitrate} forced_caller={forced_caller}"
+            )
+        # --- Explicit overrides (debug / power-user routes)
+        elif rm in ("cognos-dual-brain", "cognos-collab"):
+            arbitrate = True
+        elif rm == "cognos-strict":
+            constitutional = True
+        elif rm == "cognos-slow":
+            forced_caller = "forced_slow"
+        elif rm == "cognos-fast":
+            forced_caller = "forced_fast"
+        elif rm in ("cognos-kimi", "cognos-haiku"):
+            try:
+                from llm.router import DualLLMProvider
+                if isinstance(llm, DualLLMProvider):
+                    target = "kimi" if rm == "cognos-kimi" else "haiku"
+                    fast_m = (getattr(llm.router.fast, "model", "") or "").lower()
+                    if target in fast_m:
+                        forced_caller = "forced_fast"
+                    else:
+                        forced_caller = "forced_slow"
+            except Exception:
+                pass
+        # Register the tag with the router policy so it actually routes,
+        # not just passes through.
+        try:
+            from llm.router import DualLLMProvider
+            if isinstance(llm, DualLLMProvider) and forced_caller:
+                if forced_caller == "forced_slow":
+                    llm.router.policy.slow_caller_tags.add("forced_slow")
+                else:
+                    llm.router.policy.fast_caller_tags.add("forced_fast")
+        except Exception:
+            pass
+        # Bump max_tokens for thinking models. Kimi-k2.6 (and similar
+        # reasoning models) spend most of a small budget thinking and
+        # emit empty content. Ensure at least 1024 tokens whenever a
+        # thinking-capable model could be on the path — that includes:
+        #   - explicit forced_slow / forced_fast pointing at thinking models
+        #   - the configured system1/system2 being a thinking model
+        # Catches the "Kimi as system1" case where the router routes
+        # routine traffic to Kimi by default.
+        _THINKING_MODEL_HINTS = ("kimi", "deepseek", "qwen3", "o1", "o3")
+        try:
+            from llm.router import DualLLMProvider
+            candidate_models: list[str] = []
+            if isinstance(llm, DualLLMProvider):
+                # DualLLMProvider holds Router; Router has .fast / .slow
+                candidate_models.append(getattr(llm.router.fast, "model", "") or "")
+                candidate_models.append(getattr(llm.router.slow, "model", "") or "")
+            else:
+                candidate_models.append(getattr(llm, "model", "") or "")
+            hit = any(any(h in m.lower() for h in _THINKING_MODEL_HINTS)
+                     for m in candidate_models)
+            if hit and max_tokens < 4096:
+                max_tokens = 4096
+        except Exception as e:
+            logger.debug(f"thinking-model bump skipped: {e}")
+        # `turn_ctx` was already opened above (moved earlier so
+        # Caudate's prediction can drive the unified `cognos` routing).
+        # Just inject her hint here as before.
+        internal_msgs = middleware.maybe_inject_hint(internal_msgs, turn_ctx)
+        # Open WebUI is a "web UI" caller — let it use the Claude Code
+        # subscription OAuth like our own `/chat` endpoint does. Without
+        # this, anthropic/* models 401 because LiteLLM has no api key.
+        if stream:
+            async def _gen():
+                with subscription_auth_scope():
+                    if arbitrate:
+                        # Streaming arbitration: stream system1 LIVE so
+                        # the user sees text flow within seconds, run
+                        # system2 in parallel as a background draft for
+                        # the preference corpus. Avoids the 20+ second
+                        # blackout that comes from buffering both before
+                        # any data flows (which Open WebUI times out on).
+                        from llm.router import DualLLMProvider
+                        if not isinstance(llm, DualLLMProvider):
+                            # No dual brain — normal stream
+                            async for chunk in _stream_openai_events(
+                                llm=llm, messages=internal_msgs, tools=internal_tools,
+                                max_tokens=max_tokens, temperature=temperature,
+                                requested_model=requested_model,
+                                middleware=middleware, turn_ctx=turn_ctx,
+                                prediction=prediction,
+                            ):
+                                yield chunk
+                            return
+                        # Pick which brain to STREAM live and which to
+                        # buffer in the background. Streaming a thinking
+                        # model (Kimi) blocks visible content for tens
+                        # of seconds while it reasons, which breaks chat
+                        # UX. So we always stream the non-thinking brain
+                        # if there is one — both are still engaged for
+                        # arbitration. Falls back to system1 if both
+                        # are thinking models or both are non-thinking.
+                        s1, s2 = llm.router.fast, llm.router.slow
+                        s1_thinks = any(h in (s1.model or "").lower()
+                                        for h in ("kimi", "deepseek", "qwen3", "o1", "o3"))
+                        s2_thinks = any(h in (s2.model or "").lower()
+                                        for h in ("kimi", "deepseek", "qwen3", "o1", "o3"))
+                        if s1_thinks and not s2_thinks:
+                            stream_provider, bg_provider = s2, s1
+                            stream_label, bg_label = "slow", "fast"
+                        else:
+                            stream_provider, bg_provider = s1, s2
+                            stream_label, bg_label = "fast", "slow"
+                        # Kick off the background brain (buffered).
+                        bg_task = asyncio.create_task(
+                            bg_provider.chat(
+                                messages=internal_msgs, tools=internal_tools,
+                                max_tokens=max_tokens, temperature=temperature,
+                            )
+                        )
+                        # Live-stream the chosen provider directly to client.
+                        stream_text_buf: list[str] = []
+                        stream_tool_calls: list[ToolUseBlock] = []
+                        chunk_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
+                        created = int(time.time())
+                        def _chunk(delta, finish=None):
+                            ch = {"index": 0, "delta": delta}
+                            if finish:
+                                ch["finish_reason"] = finish
+                            return {
+                                "id": chunk_id,
+                                "object": "chat.completion.chunk",
+                                "created": created,
+                                "model": requested_model,
+                                "choices": [ch],
+                            }
+                        # Initial role chunk
+                        yield f"data: {json.dumps(_chunk({'role':'assistant','content':''}))}\n\n".encode()
+                        # Same word-boundary buffering for thinking
+                        # as in `_stream_openai_events` — Open WebUI
+                        # otherwise renders each token as its own line.
+                        # Stream thinking inside ONE `<details>` block
+                        # in content (NOT via reasoning_content — that
+                        # field generates one "Thought" widget per
+                        # delta in WebUI 0.9.x).
+                        _arb_thinking_buf = ""
+                        _ARB_THINK_MIN = 24
+                        _arb_thinking_open = False
+                        _arb_thinking_closed = False
+                        _arb_caudate_prefix = _caudate_prefix_block(prediction)
+                        def _arb_safe_flush(buf: str) -> int:
+                            if len(buf) < _ARB_THINK_MIN:
+                                return 0
+                            for i in range(len(buf) - 1, _ARB_THINK_MIN - 1, -1):
+                                if buf[i].isspace():
+                                    return i + 1
+                            return 0
+                        def _arb_open_block():
+                            nonlocal _arb_thinking_open
+                            if _arb_thinking_open or _arb_thinking_closed:
+                                return None
+                            _arb_thinking_open = True
+                            return _chunk({"content": (
+                                '<details type="reasoning" done="false">\n'
+                                '<summary>Thinking…</summary>\n\n'
+                                f'{_arb_caudate_prefix}'
+                            )})
+                        def _arb_close_block():
+                            nonlocal _arb_thinking_open, _arb_thinking_closed
+                            if not _arb_thinking_open or _arb_thinking_closed:
+                                return None
+                            _arb_thinking_closed = True
+                            _arb_thinking_open = False
+                            return _chunk({"content": "\n\n</details>\n\n"})
+                        # Eagerly open with Caudate prefix so she's
+                        # visible on every arbitration turn too.
+                        if _arb_caudate_prefix:
+                            opener = _arb_open_block()
+                            if opener:
+                                yield f"data: {json.dumps(opener)}\n\n".encode()
+                        try:
+                            async for ev in stream_provider.stream(
+                                messages=internal_msgs, tools=internal_tools,
+                                max_tokens=max_tokens, temperature=temperature,
+                            ):
+                                if ev.type == "text_delta" and ev.delta:
+                                    # Flush + close the thinking block
+                                    # before answer text streams.
+                                    if _arb_thinking_buf:
+                                        out = _arb_thinking_buf
+                                        _arb_thinking_buf = ""
+                                        yield f"data: {json.dumps(_chunk({'content': out}))}\n\n".encode()
+                                    close = _arb_close_block()
+                                    if close:
+                                        yield f"data: {json.dumps(close)}\n\n".encode()
+                                    stream_text_buf.append(ev.delta)
+                                    middleware.observe_response_text(turn_ctx, ev.delta)
+                                    yield f"data: {json.dumps(_chunk({'content': ev.delta}))}\n\n".encode()
+                                elif ev.type == "thinking_delta" and ev.delta:
+                                    middleware.observe_thinking(turn_ctx, ev.delta)
+                                    opener = _arb_open_block()
+                                    if opener:
+                                        yield f"data: {json.dumps(opener)}\n\n".encode()
+                                    _arb_thinking_buf += ev.delta
+                                    cut = _arb_safe_flush(_arb_thinking_buf)
+                                    if cut > 0:
+                                        out = _arb_thinking_buf[:cut]
+                                        _arb_thinking_buf = _arb_thinking_buf[cut:]
+                                        yield f"data: {json.dumps(_chunk({'content': out}))}\n\n".encode()
+                                elif ev.type == "tool_use_end":
+                                    tc = ToolUseBlock(
+                                        id=ev.tool_use_id or f"call_{uuid.uuid4().hex[:12]}",
+                                        name=ev.tool_name or "",
+                                        input=ev.tool_input or {},
+                                    )
+                                    stream_tool_calls.append(tc)
+                                    middleware.observe_tool_use(turn_ctx, tc.name)
+                                    yield f"data: {json.dumps(_chunk({'tool_calls':[{'index':len(stream_tool_calls)-1,'id':tc.id,'type':'function','function':{'name':tc.name,'arguments':json.dumps(tc.input or {})}}]}))}\n\n".encode()
+                        except Exception as e:
+                            logger.exception("stream-leg failed in arbitrate")
+                            middleware.end_turn(turn_ctx, error=True)
+                            err_chunk = json.dumps({
+                                "id": chunk_id, "object": "chat.completion.chunk",
+                                "created": created, "model": requested_model,
+                                "choices": [{"index": 0, "delta": {},
+                                             "finish_reason": "error"}],
+                                "error": {"message": str(e), "type": "api_error"},
+                            })
+                            yield f"data: {err_chunk}\n\n".encode()
+                            yield b"data: [DONE]\n\n"
+                            bg_task.cancel()
+                            return
+                        # Flush any unflushed thinking + close the
+                        # block if the stream ended with thinking
+                        # only (no answer text).
+                        if _arb_thinking_buf:
+                            yield f"data: {json.dumps(_chunk({'content': _arb_thinking_buf}))}\n\n".encode()
+                            _arb_thinking_buf = ""
+                        close = _arb_close_block()
+                        if close:
+                            yield f"data: {json.dumps(close)}\n\n".encode()
+                        # Close stream to client.
+                        finish = "tool_calls" if stream_tool_calls else "stop"
+                        yield f"data: {json.dumps(_chunk({}, finish=finish))}\n\n".encode()
+                        yield b"data: [DONE]\n\n"
+                        # Background: wait for the bg brain to finish,
+                        # score both, log arbitration. User has already
+                        # seen the streamed brain; this is purely for
+                        # the preference corpus.
+                        try:
+                            bg_resp = await bg_task
+                        except Exception as e:
+                            logger.warning(f"bg-draft failed: {e}")
+                            bg_resp = None
+                        try:
+                            streamed_text = "".join(stream_text_buf)
+                            class _FakeResp:
+                                # Minimal stand-in for a provider response so the
+                                # streamed leg can be handed to `_score_draft`
+                                # alongside the buffered background response.
+                                content = streamed_text
+                                # Thinking deltas from the streamed leg are sent
+                                # to the client but not accumulated, so this
+                                # stays empty.
+                                thinking = ""
+                                tool_calls = stream_tool_calls
+                            streamed_score = _score_draft(_FakeResp())  # type: ignore[arg-type]
+                            bg_score = _score_draft(bg_resp) if bg_resp else 0.0
+                            # Always label fast/slow consistently regardless
+                            # of which one we actually streamed: fast = system1,
+                            # slow = system2 in the persisted corpus.
+                            if stream_label == "fast":
+                                fast_text, fast_score = streamed_text, streamed_score
+                                slow_text = bg_resp.content if bg_resp else ""
+                                slow_score = bg_score
+                            else:
+                                slow_text, slow_score = streamed_text, streamed_score
+                                fast_text = bg_resp.content if bg_resp else ""
+                                fast_score = bg_score
+                            winner = ("fast" if fast_score >= slow_score
+                                      or abs(fast_score - slow_score) < 0.02
+                                      else "slow")
+                            middleware.observe_arbitration(
+                                turn_ctx,
+                                fast_text=fast_text,
+                                slow_text=slow_text,
+                                fast_score=fast_score,
+                                slow_score=slow_score,
+                                winner=winner,
+                                fast_model=s1.model,
+                                slow_model=s2.model,
+                            )
+                        except Exception as e:
+                            logger.debug(f"arbitration log failed: {e}")
+                        middleware.end_turn(turn_ctx, error=False)
+                        return
+                    # Server-side agentic loop (cognos-* + no client
+                    # tools): LIVE streaming. We use the streaming
+                    # variant of the agentic loop so thinking_delta /
+                    # text_delta / tool_result events flow to the UI
+                    # as the model emits them across all iterations.
+                    if _should_run_agentic(requested_model, internal_tools):
+                        chunk_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
+                        created = int(time.time())
+                        def _ag_chunk(delta, finish=None):
+                            ch = {"index": 0, "delta": delta}
+                            if finish:
+                                ch["finish_reason"] = finish
+                            return {
+                                "id": chunk_id,
+                                "object": "chat.completion.chunk",
+                                "created": created,
+                                "model": requested_model,
+                                "choices": [ch],
+                            }
+                        # Initial role chunk
+                        yield f"data: {json.dumps(_ag_chunk({'role':'assistant','content':''}))}\n\n".encode()
+                        # Always open the reasoning block at the start —
+                        # even if neither the LLM nor any tools fire,
+                        # Caudate's per-turn prediction is shown.
+                        ag_caudate_prefix = _caudate_prefix_block(prediction)
+                        ag_thinking_open = False
+                        ag_thinking_closed = False
+                        ag_text_started = False
+                        ag_thinking_buf = ""
+                        ag_tool_call_index = 0
+                        _AG_FLUSH_MIN = 24
+                        def _ag_safe_flush_idx(buf: str) -> int:
+                            if len(buf) < _AG_FLUSH_MIN:
+                                return 0
+                            for i in range(len(buf) - 1, _AG_FLUSH_MIN - 1, -1):
+                                if buf[i].isspace():
+                                    return i + 1
+                            return 0
+                        def _ag_open_block():
+                            nonlocal ag_thinking_open
+                            if ag_thinking_open or ag_thinking_closed:
+                                return None
+                            ag_thinking_open = True
+                            return _ag_chunk({"content": (
+                                '<details type="reasoning" done="false">\n'
+                                '<summary>Thinking…</summary>\n\n'
+                                f'{ag_caudate_prefix}'
+                            )})
+                        def _ag_close_block():
+                            nonlocal ag_thinking_open, ag_thinking_closed
+                            if not ag_thinking_open or ag_thinking_closed:
+                                return None
+                            ag_thinking_closed = True
+                            ag_thinking_open = False
+                            return _ag_chunk({"content": "\n\n</details>\n\n"})
+                        # Eagerly open the reasoning block so Caudate's
+                        # prefix is always visible.
+                        if ag_caudate_prefix:
+                            opener = _ag_open_block()
+                            if opener:
+                                yield f"data: {json.dumps(opener)}\n\n".encode()
+                        try:
+                            async for ev in _run_agentic_loop_streaming(
+                                llm=llm,
+                                executor=agent.loop.executor,
+                                messages=internal_msgs,
+                                middleware=middleware, turn_ctx=turn_ctx,
+                                max_tokens=max_tokens, temperature=temperature,
+                                caller=forced_caller,
+                            ):
+                                if ev.type == "thinking_delta" and ev.delta:
+                                    if middleware is not None and turn_ctx is not None:
+                                        middleware.observe_thinking(turn_ctx, ev.delta)
+                                    opener = _ag_open_block()
+                                    if opener:
+                                        yield f"data: {json.dumps(opener)}\n\n".encode()
+                                    ag_thinking_buf += ev.delta
+                                    cut = _ag_safe_flush_idx(ag_thinking_buf)
+                                    if cut > 0:
+                                        out = ag_thinking_buf[:cut]
+                                        ag_thinking_buf = ag_thinking_buf[cut:]
+                                        # NOTE: emit ONLY in-content
+                                        # <details> stream. Do NOT also
+                                        # emit reasoning_content — Open
+                                        # WebUI 0.9.x renders a fresh
+                                        # "Thought for ..." widget per
+                                        # reasoning_content delta,
+                                        # producing one widget per word
+                                        # boundary instead of one block.
+                                        yield f"data: {json.dumps(_ag_chunk({'content': out}))}\n\n".encode()
+                                elif ev.type == "tool_result":
+                                    # Inline label inside the reasoning
+                                    # block so the user can see what
+                                    # happened. Status emoji + name +
+                                    # short summary.
+                                    opener = _ag_open_block()
+                                    if opener:
+                                        yield f"data: {json.dumps(opener)}\n\n".encode()
+                                    if ag_thinking_buf:
+                                        out = ag_thinking_buf
+                                        ag_thinking_buf = ""
+                                        yield f"data: {json.dumps(_ag_chunk({'content': out}))}\n\n".encode()
+                                    status = (ev.raw or {}).get("status", "success")
+                                    icon = "✓" if status == "success" else "✗"
+                                    line = f"\n\n{icon} **{ev.tool_name}** — {(ev.delta or '')[:200]}\n\n"
+                                    yield f"data: {json.dumps(_ag_chunk({'content': line}))}\n\n".encode()
+                                elif ev.type == "tool_use_end":
+                                    # Emit the OpenAI-shape tool_calls
+                                    # delta so the OpenAI client side
+                                    # knows what was called. The
+                                    # tool_result event above renders
+                                    # the human-visible label.
+                                    yield f"data: {json.dumps(_ag_chunk({'tool_calls': [{'index': ag_tool_call_index, 'id': ev.tool_use_id or '', 'type': 'function', 'function': {'name': ev.tool_name or '', 'arguments': json.dumps(ev.tool_input or {})}}]}))}\n\n".encode()
+                                    ag_tool_call_index += 1
+                                elif ev.type == "text_delta" and ev.delta:
+                                    # Final answer arriving — flush the
+                                    # buffer + close the reasoning
+                                    # block before the answer streams.
+                                    if not ag_text_started:
+                                        if ag_thinking_buf:
+                                            out = ag_thinking_buf
+                                            ag_thinking_buf = ""
+                                            yield f"data: {json.dumps(_ag_chunk({'content': out}))}\n\n".encode()
+                                        close = _ag_close_block()
+                                        if close:
+                                            yield f"data: {json.dumps(close)}\n\n".encode()
+                                        ag_text_started = True
+                                    if middleware is not None and turn_ctx is not None:
+                                        middleware.observe_response_text(turn_ctx, ev.delta)
+                                    yield f"data: {json.dumps(_ag_chunk({'content': ev.delta}))}\n\n".encode()
+                                elif ev.type == "iteration_break":
+                                    # Currently a no-op: nothing is emitted
+                                    # between iterations. A visual breath
+                                    # inside the block could go here — it
+                                    # would help when the LLM thinks across
+                                    # multiple iterations punctuated by
+                                    # tool calls.
+                                    pass
+                                elif ev.type == "message_stop":
+                                    pass
+                        except Exception as e:
+                            logger.exception("agentic stream failed")
+                            yield f"data: {json.dumps(_ag_chunk({'content': f'[error] {e}'}, finish='stop'))}\n\n".encode()
+                            yield b"data: [DONE]\n\n"
+                            middleware.end_turn(turn_ctx, error=True)
+                            return
+                        # Stream finished. If we never got a text_delta
+                        # (e.g. tools-only conversation that hit max
+                        # iterations), flush remaining thinking + close.
+                        if not ag_text_started:
+                            if ag_thinking_buf:
+                                out = ag_thinking_buf
+                                ag_thinking_buf = ""
+                                yield f"data: {json.dumps(_ag_chunk({'content': out}))}\n\n".encode()
+                            close = _ag_close_block()
+                            if close:
+                                yield f"data: {json.dumps(close)}\n\n".encode()
+                        yield f"data: {json.dumps(_ag_chunk({}, finish='stop'))}\n\n".encode()
+                        yield b"data: [DONE]\n\n"
+                        middleware.end_turn(turn_ctx, error=False)
+                        return
+                    # Normal streaming path (no arbitration, no agentic)
+                    async for chunk in _stream_openai_events(
+                        llm=llm, messages=internal_msgs, tools=internal_tools,
+                        max_tokens=max_tokens, temperature=temperature,
+                        requested_model=requested_model,
+                        middleware=middleware, turn_ctx=turn_ctx,
+                        caller=forced_caller,
+                        prediction=prediction,
+                    ):
+                        yield chunk
+            return StreamingResponse(_gen(), media_type="text/event-stream")
+        # Non-streaming
+        agentic = _should_run_agentic(requested_model, internal_tools)
+        try:
+            with subscription_auth_scope():
+                if arbitrate:
+                    # Pattern 2: parallel arbitration through Caudate.
+                    # Arbitration takes precedence over the agentic
+                    # loop; the dual-brain comparison is more valuable
+                    # than tool-driven multi-step on cognos-dual-brain.
+                    from llm.router import DualLLMProvider
+                    if isinstance(llm, DualLLMProvider):
+                        resp = await _dual_brain_arbitrate(
+                            llm=llm,
+                            messages=internal_msgs, tools=internal_tools,
+                            max_tokens=max_tokens, temperature=temperature,
+                            middleware=middleware, turn_ctx=turn_ctx,
+                        )
+                    else:
+                        resp = await llm.chat(
+                            messages=internal_msgs, tools=internal_tools,
+                            max_tokens=max_tokens, temperature=temperature,
+                        )
+                elif agentic:
+                    # Server-side ReAct: LLM proposes tool calls, we
+                    # execute them, loop until final text. Open WebUI
+                    # users get full Cognos tool capabilities — Bash,
+                    # Read, Write, Edit, Grep, Glob, PythonExec, etc.
+                    resp = await _run_agentic_loop(
+                        llm=llm,
+                        executor=agent.loop.executor,
+                        messages=internal_msgs,
+                        middleware=middleware, turn_ctx=turn_ctx,
+                        max_tokens=max_tokens, temperature=temperature,
+                        caller=forced_caller,
+                    )
+                else:
+                    resp = await llm.chat(
+                        messages=internal_msgs, tools=internal_tools,
+                        max_tokens=max_tokens, temperature=temperature,
+                        caller=forced_caller,
+                    )
+        except Exception as e:
+            logger.exception("LLM call failed (openai-compat)")
+            middleware.end_turn(turn_ctx, error=True)
+            raise HTTPException(500, f"LLM error: {e}")
+        middleware.observe_response_text(turn_ctx, resp.content or "")
+        if getattr(resp, "thinking", None):
+            middleware.observe_thinking(turn_ctx, resp.thinking)
+        for tc in resp.tool_calls:
+            middleware.observe_tool_use(turn_ctx, tc.name)
+        middleware.end_turn(turn_ctx, error=False)
+        # Fallback: if the model produced thinking but no visible
+        # content (Kimi-k2.6 cuts off mid-thinking under tight budget),
+        # surface the thinking as the reply so the user sees something
+        # instead of a blank message.
+        text_out = resp.content
+        if (not text_out) and getattr(resp, "thinking", ""):
+            text_out = (
+                f"[thinking — model didn't finish before max_tokens]\n\n"
+                f"{resp.thinking}"
+            )
+        # Constitutional critique pass — only for `cognos-strict`.
+        # Runs `core/constitution.py::run_critique` against the response;
+        # if any rule in COGNOS_CONSTITUTION.md is violated, the LLM is
+        # asked to revise. Adds 1–2 LLM calls per turn (worth it when
+        # accuracy matters more than latency).
+        if constitutional and text_out:
+            try:
+                from core.constitution import run_critique
+                # Find the most recent user message for the critique
+                user_msg = ""
+                for m in reversed(internal_msgs):
+                    if m.get("role") == "user":
+                        c = m.get("content", "")
+                        if isinstance(c, list):
+                            c = " ".join(b.get("text", "") for b in c
+                                         if isinstance(b, dict))
+                        user_msg = c or ""
+                        break
+                logger.info(
+                    f"constitutional critique: starting "
+                    f"(response={len(text_out)} chars)"
+                )
+                revised, violations = await run_critique(
+                    llm=llm,
+                    user_message=user_msg,
+                    response=text_out,
+                )
+                if violations:
+                    logger.info(
+                        f"constitutional critique: revised after "
+                        f"{len(violations)} violation(s) "
+                        f"({[v.get('rule') for v in violations]})"
+                    )
+                    text_out = revised
+                else:
+                    logger.info(
+                        "constitutional critique: clean "
+                        "(no violations found)"
+                    )
+            except Exception as e:
+                logger.warning(f"constitutional critique failed: {e}")
+        return JSONResponse(_build_openai_response(
+            text=text_out,
+            tool_calls=resp.tool_calls,
+            model=requested_model,
+            usage=resp.usage,
+            stop_reason=resp.stop_reason,
+        ))
+    # /v1/models is owned by api/anthropic_compat — that endpoint
+    # already returns a hybrid shape that satisfies both Anthropic and
+    # OpenAI clients (we updated it when adding this router). No need
+    # to register a second /v1/models here; FastAPI would take only
+    # whichever was registered first anyway.
+    return router