PyPI - caudate-cli - Versions diffs - 0.1.0__py3-none-any.whl - Mend

caudate-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (153) hide show

api/__init__.py +5 -0
api/anthropic_compat.py +1518 -0
api/artifact_viewer.py +366 -0
api/caudate_middleware.py +618 -0
api/forge_bootstrapper_routes.py +377 -0
api/forge_routes.py +630 -0
api/forge_system_routes.py +294 -0
api/openai_compat.py +1993 -0
api/server.py +667 -0
api/storyboard_page.py +677 -0
caudate_cli-0.1.0.dist-info/METADATA +354 -0
caudate_cli-0.1.0.dist-info/RECORD +153 -0
caudate_cli-0.1.0.dist-info/WHEEL +5 -0
caudate_cli-0.1.0.dist-info/entry_points.txt +2 -0
caudate_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
caudate_cli-0.1.0.dist-info/top_level.txt +14 -0
cognos_mcp/__init__.py +4 -0
cognos_mcp/bridge.py +41 -0
cognos_mcp/client.py +70 -0
cognos_mcp/config.py +49 -0
cognos_mcp/server.py +66 -0
config.py +82 -0
core/__init__.py +0 -0
core/agent.py +468 -0
core/agentic_loop.py +731 -0
core/anthropic_auth.py +91 -0
core/background.py +113 -0
core/banner.py +134 -0
core/bootstrap.py +292 -0
core/citations.py +131 -0
core/compaction.py +109 -0
core/constitution.py +198 -0
core/diff_viewer.py +87 -0
core/export.py +85 -0
core/file_refs.py +119 -0
core/files.py +199 -0
core/hooks.py +209 -0
core/image.py +599 -0
core/input.py +91 -0
core/loop.py +238 -0
core/memory_md.py +147 -0
core/notifications.py +99 -0
core/ownership.py +181 -0
core/paste.py +81 -0
core/permissions.py +210 -0
core/plan_mode.py +215 -0
core/sandbox_prompt.py +185 -0
core/scheduler.py +195 -0
core/schemas.py +202 -0
core/session.py +90 -0
core/settings.py +132 -0
core/skills.py +398 -0
core/slash_commands.py +977 -0
core/statusline.py +61 -0
core/subagent.py +300 -0
core/thinking.py +50 -0
core/updater.py +122 -0
core/usage.py +109 -0
core/worktree.py +93 -0
execution/__init__.py +0 -0
execution/executor.py +329 -0
execution/plugins.py +108 -0
execution/tools/__init__.py +0 -0
execution/tools/agent_tool.py +107 -0
execution/tools/agentic_tool.py +297 -0
execution/tools/artifact_tool.py +191 -0
execution/tools/ask_user_question_tool.py +137 -0
execution/tools/base.py +81 -0
execution/tools/calculator_tool.py +137 -0
execution/tools/cognos_card_tool.py +124 -0
execution/tools/cron_tool.py +215 -0
execution/tools/datetime_tool.py +215 -0
execution/tools/describe_image_tool.py +161 -0
execution/tools/draw_tool.py +164 -0
execution/tools/edit_image_tool.py +262 -0
execution/tools/edit_tool.py +245 -0
execution/tools/file_tool.py +90 -0
execution/tools/find_anywhere_tool.py +255 -0
execution/tools/forge_feature_tools.py +377 -0
execution/tools/glob_tool.py +59 -0
execution/tools/grep_tool.py +89 -0
execution/tools/http_request_tool.py +224 -0
execution/tools/load_skill_tool.py +104 -0
execution/tools/longcat_avatar_tool.py +384 -0
execution/tools/mcp_tool.py +100 -0
execution/tools/notebook_tool.py +279 -0
execution/tools/openapi_tool.py +440 -0
execution/tools/plan_mode_tool.py +95 -0
execution/tools/push_notification_tool.py +157 -0
execution/tools/python_tool.py +61 -0
execution/tools/respond_tool.py +40 -0
execution/tools/sandbox_tool.py +378 -0
execution/tools/search_tool.py +153 -0
execution/tools/semantic_search_tool.py +106 -0
execution/tools/shell_tool.py +283 -0
execution/tools/speak_tool.py +134 -0
execution/tools/storyboard_tool.py +727 -0
execution/tools/system_info_tool.py +212 -0
execution/tools/task_tool.py +323 -0
execution/tools/think_tool.py +49 -0
execution/tools/transcribe_audio_tool.py +86 -0
execution/tools/update_memory_tool.py +92 -0
execution/tools/web_fetch_tool.py +82 -0
execution/tools/worktree_tool.py +174 -0
llm/__init__.py +0 -0
llm/fallback.py +116 -0
llm/models.py +320 -0
llm/provider.py +1356 -0
llm/router.py +373 -0
main.py +1889 -0
memory/__init__.py +0 -0
memory/episodic.py +99 -0
memory/procedural.py +145 -0
memory/semantic.py +71 -0
memory/working.py +64 -0
nn/__init__.py +43 -0
nn/auto_evolve.py +245 -0
nn/caudate.py +136 -0
nn/config.py +141 -0
nn/consolidator.py +81 -0
nn/data.py +1635 -0
nn/encoder.py +258 -0
nn/forge_advisor.py +303 -0
nn/format.py +235 -0
nn/heads.py +432 -0
nn/observer.py +994 -0
nn/policy.py +214 -0
nn/runtime.py +343 -0
nn/scorer.py +175 -0
nn/trainer.py +515 -0
nn/vision.py +352 -0
personality/__init__.py +23 -0
personality/engine.py +129 -0
personality/identity.py +144 -0
personality/inner_voice.py +100 -0
personality/mood.py +205 -0
planning/__init__.py +0 -0
planning/dev_server.py +221 -0
planning/forge_models.py +718 -0
planning/orchestrator.py +1363 -0
planning/planner.py +451 -0
planning/task_graph.py +61 -0
reflection/__init__.py +0 -0
reflection/meta_learner.py +156 -0
reflection/reflector.py +127 -0
ui/__init__.py +5 -0
ui/display.py +88 -0
voice/__init__.py +0 -0
voice/conversation.py +125 -0
voice/listener.py +111 -0
voice/speaker.py +59 -0
voice/stt.py +126 -0
voice/tts.py +214 -0

llm/provider.py ADDED Viewed

@@ -0,0 +1,1356 @@
+"""Model-agnostic LLM provider using LiteLLM.
+Supports:
+- Plain chat/completion
+- Tool calling (native + prompt-based fallback)
+- Streaming (async generator of StreamEvents)
+- Structured JSON output
+"""
+from __future__ import annotations
+import asyncio
+import json
+import logging
+import random
+import re
+import uuid
+from typing import Any, AsyncIterator, Awaitable, Callable, TypeVar
+import litellm
+from pydantic import BaseModel
+from config import LLM_MODEL, LLM_TEMPERATURE, LLM_MAX_TOKENS, PROMPT_CACHING
+from core.schemas import StreamEvent, ToolUseBlock
+from core.usage import get_global_tracker
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# Result type of the awaitable handed to `_with_retry`.
+T = TypeVar("T")
+# Retry policy for LLM calls — transient failures (network blips, model
+# warmup stalls) should auto-retry with backoff. Pathological failures
+# (bad model id, auth) are NOT retried — we raise immediately.
+# Retries allowed after the first attempt (so max_retries + 1 tries total).
+DEFAULT_MAX_RETRIES = 3
+# Seconds slept before the first retry.
+DEFAULT_INITIAL_BACKOFF_S = 1.0
+# The sleep delay is multiplied by this factor after every retry.
+DEFAULT_BACKOFF_MULTIPLIER = 2.5
+# Upper bound (seconds) of the uniform random jitter added to each sleep.
+DEFAULT_JITTER_S = 0.3
+# Errors that are worth retrying. Keep this narrow: anything model-side
+# (wrong model name, invalid request, context-length overflow) won't fix
+# itself by retrying.
+# Entries are matched as substrings of the lower-cased str(err) by
+# `_is_retryable`, so they must be written in lower case.
+_RETRYABLE_SUBSTRINGS = (
+    "timeout", "timed out", "connection", "econnreset", "read error",
+    "temporarily", "service unavailable", "503", "502", "500",
+    "rate limit", "429", "overloaded", "busy",
+)
+def _is_retryable(err: BaseException) -> bool:
+    """True if the error is transient and worth retrying."""
+    if isinstance(err, (asyncio.TimeoutError, ConnectionError, TimeoutError)):
+        return True
+    message = str(err).lower()
+    return any(tok in message for tok in _RETRYABLE_SUBSTRINGS)
+async def _with_retry(
+    op: Callable[[], Awaitable[T]],
+    label: str,
+    max_retries: int = DEFAULT_MAX_RETRIES,
+) -> T:
+    """Run an async op with exponential-backoff retry on transient errors."""
+    delay = DEFAULT_INITIAL_BACKOFF_S
+    last: BaseException | None = None
+    for attempt in range(max_retries + 1):
+        try:
+            return await op()
+        except Exception as e:
+            last = e
+            if not _is_retryable(e) or attempt == max_retries:
+                break
+            jitter = random.uniform(0, DEFAULT_JITTER_S)
+            logger.warning(
+                f"{label} failed (attempt {attempt + 1}/{max_retries + 1}): "
+                f"{e} — retrying in {delay + jitter:.1f}s"
+            )
+            await asyncio.sleep(delay + jitter)
+            delay *= DEFAULT_BACKOFF_MULTIPLIER
+    assert last is not None
+    raise last
+# Suppress litellm noise. This is a module-level side effect: the flag is
+# set on the shared `litellm` module as soon as this file is imported.
+litellm.suppress_debug_info = True
+class LLMResponse(BaseModel):
+    """Normalized result of a single LLM call.
+    Built by `LLMProvider.chat` and by the direct Anthropic subscription
+    path, so callers see one shape regardless of the backend used.
+    """
+    # Assistant text; may be empty when the model only returned tool calls.
+    content: str
+    # Provider response payload as a plain dict.
+    raw: dict[str, Any] = {}
+    # Model id reported by the provider (callers fall back to the
+    # configured id when the provider reports none).
+    model: str = ""
+    # Token counts keyed by prompt_tokens / completion_tokens / total_tokens.
+    usage: dict[str, int] = {}
+    # Provider's finish / stop reason, when one was reported.
+    stop_reason: str | None = None
+    # Tool invocations requested by the model in this turn.
+    tool_calls: list[ToolUseBlock] = []
+    # Thinking-model output (gemma4, kimi, deepseek, etc). Most models
+    # leave this empty; thinking models surface their internal reasoning
+    # here. Cognos passes it through to Anthropic-compat clients as a
+    # `thinking` content block so Claude Code can render it.
+    thinking: str = ""
+# Models known to lack native tool-calling — fall back to prompt-based
+# protocol.
+_NO_NATIVE_TOOLS = {
+    "gemma", "gemma2", "gemma3", "gemma4",
+}
+def _has_native_tool_support(model: str) -> bool:
+    """Heuristic: does this model support native tool-calling via LiteLLM?"""
+    m = model.lower()
+    for prefix in _NO_NATIVE_TOOLS:
+        if prefix in m:
+            return False
+    return True
+_KNOWN_NO_JSON_MODE = {"gemma", "gemma2"}  # older gemma rejects response_format
+def _has_json_mode_support(model: str) -> bool:
+    m = model.lower()
+    return not any(prefix in m for prefix in _KNOWN_NO_JSON_MODE)
+def _supports_prompt_caching(model: str) -> bool:
+    """Only Anthropic models honor cache_control today. Other providers ignore it."""
+    return "claude" in model.lower() or model.lower().startswith("anthropic/")
+def _apply_cache_control(
+    messages: list[dict[str, Any]],
+    model: str,
+) -> list[dict[str, Any]]:
+    """Rewrite the first system message with a cache_control breakpoint.
+    LiteLLM forwards cache_control through to Anthropic's SDK. Other providers
+    see the nested content-block list and ignore the extra fields, so this is
+    safe to leave on globally. We only bother if the system prompt is long
+    enough to matter (>= 1024 tokens ≈ 4000 chars) — Anthropic's caching has a
+    minimum chunk size below which it's a no-op.
+    """
+    if not PROMPT_CACHING or not _supports_prompt_caching(model):
+        return messages
+    if not messages or messages[0].get("role") != "system":
+        return messages
+    raw = messages[0].get("content")
+    if not isinstance(raw, str) or len(raw) < 4000:
+        return messages
+    out = list(messages)
+    out[0] = {
+        "role": "system",
+        "content": [
+            {
+                "type": "text",
+                "text": raw,
+                "cache_control": {"type": "ephemeral"},
+            },
+        ],
+    }
+    return out
+class LLMProvider:
+    """Unified LLM interface supporting local (Ollama) and cloud (Claude, OpenAI) models."""
+    def __init__(
+        self,
+        model: str = LLM_MODEL,
+        temperature: float = LLM_TEMPERATURE,
+        max_tokens: int = LLM_MAX_TOKENS,
+    ):
+        self.model = model
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+    # ------------------------------------------------------------------
+    # Basic completion / chat
+    # ------------------------------------------------------------------
+    async def complete(
+        self,
+        prompt: str,
+        system: str | None = None,
+        temperature: float | None = None,
+        max_tokens: int | None = None,
+        response_format: dict | None = None,
+        caller: str | None = None,
+    ) -> LLMResponse:
+        """Send a single-turn completion request."""
+        messages: list[dict[str, Any]] = []
+        if system:
+            messages.append({"role": "system", "content": system})
+        messages.append({"role": "user", "content": prompt})
+        return await self.chat(
+            messages,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            response_format=response_format,
+        )
+    def _route_model_for_litellm(self) -> str:
+        """LiteLLM uses different prefixes for different Ollama endpoints:
+          - `ollama/<name>`       → /v1/chat/completions (OpenAI-compat;
+                                    drops Ollama-specific fields like thinking)
+          - `ollama_chat/<name>`  → /api/chat (native; preserves thinking
+                                    as `reasoning_content`)
+        We always want the native path so thinking-model output reaches
+        the agent and downstream Anthropic-compat clients.
+        """
+        if self.model.startswith("ollama/") and not self.model.startswith("ollama_chat/"):
+            return "ollama_chat/" + self.model[len("ollama/"):]
+        return self.model
+    def _should_use_anthropic_subscription(self) -> bool:
+        """True iff the request should bypass LiteLLM and call
+        api.anthropic.com directly with the Claude-Code subscription
+        OAuth token. We bypass LiteLLM because its anthropic provider
+        sends `x-api-key` (not Bearer); subscription tokens require
+        the Bearer header instead.
+        Activation requires *all* of:
+          - configured model is an anthropic/* id
+          - the calling code has entered `subscription_auth_scope()`
+            (only the web-UI /chat and /chat/stream endpoints do)
+          - the credentials file is readable
+        """
+        if not self.model.startswith("anthropic/"):
+            return False
+        try:
+            from core.anthropic_auth import is_active, read_subscription_token
+        except Exception:
+            return False
+        if not is_active():
+            return False
+        return read_subscription_token() is not None
+    def _anthropic_subscription_headers(self) -> dict[str, str]:
+        from core.anthropic_auth import read_subscription_token
+        token = read_subscription_token() or ""
+        return {
+            "Authorization": f"Bearer {token}",
+            "anthropic-version": "2023-06-01",
+            "anthropic-beta": "claude-code-20250219,oauth-2025-04-20",
+            "content-type": "application/json",
+        }
+    def _build_anthropic_body(
+        self,
+        messages: list[dict[str, Any]],
+        tools: list[dict] | None,
+        max_tokens: int | None,
+        temperature: float | None,
+        stream: bool,
+    ) -> dict[str, Any]:
+        """Translate Cognos's internal OpenAI-shape messages into
+        Anthropic's /v1/messages body.
+        Args:
+            messages: OpenAI-style chat messages (system / user /
+                assistant / tool roles).
+            tools: Optional tool definitions, either wrapped as
+                {"function": {...}} or given as the bare function dict.
+            max_tokens: Per-call limit; falls back to self.max_tokens.
+            temperature: Per-call temperature; left out of the body
+                when None.
+            stream: Value of the body's "stream" flag.
+        Returns:
+            A dict with "model", "max_tokens", "stream" and "messages",
+            plus "temperature", "system" and "tools" when applicable.
+        """
+        # Strip the "anthropic/" prefix to get the bare model id.
+        model = self.model.split("/", 1)[1]
+        body: dict[str, Any] = {
+            "model": model,
+            "max_tokens": max_tokens if max_tokens is not None else self.max_tokens,
+            "stream": stream,
+        }
+        # NOTE(review): unlike max_tokens, there is no fallback to
+        # self.temperature here — confirm that this is intentional.
+        if temperature is not None:
+            body["temperature"] = temperature
+        system_parts: list[str] = []
+        out_msgs: list[dict[str, Any]] = []
+        for m in messages:
+            role = m.get("role")
+            content = m.get("content")
+            if role == "system":
+                # System messages are collected and emitted once as the
+                # top-level "system" field, not as entries in "messages".
+                if isinstance(content, str):
+                    system_parts.append(content)
+                elif isinstance(content, list):
+                    for b in content:
+                        if isinstance(b, dict) and b.get("type") == "text":
+                            system_parts.append(b.get("text", ""))
+                continue
+            if role == "tool":
+                # OpenAI tool-result → Anthropic user msg w/ tool_result block.
+                # Non-string content is stringified with str().
+                out_msgs.append({
+                    "role": "user",
+                    "content": [{
+                        "type": "tool_result",
+                        "tool_use_id": m.get("tool_call_id", ""),
+                        "content": str(content or ""),
+                    }],
+                })
+                continue
+            if role == "assistant":
+                blocks: list[dict[str, Any]] = []
+                # Only text blocks are carried over from list content.
+                if isinstance(content, str) and content:
+                    blocks.append({"type": "text", "text": content})
+                elif isinstance(content, list):
+                    for b in content:
+                        if isinstance(b, dict) and b.get("type") == "text":
+                            blocks.append({"type": "text", "text": b.get("text", "")})
+                for tc in m.get("tool_calls") or []:
+                    fn = tc.get("function") or {}
+                    raw_args = fn.get("arguments")
+                    # Ollama/Kimi sometimes send arguments as a dict
+                    # already (their tool-call format diverges from
+                    # OpenAI's "arguments must be a JSON string" rule).
+                    if isinstance(raw_args, dict):
+                        args = raw_args
+                    else:
+                        # Unparseable argument strings degrade to {}.
+                        try:
+                            args = json.loads(raw_args or "{}")
+                        except Exception:
+                            args = {}
+                    blocks.append({
+                        "type": "tool_use",
+                        # A missing id is replaced by a generated one.
+                        "id": tc.get("id") or f"toolu_{uuid.uuid4().hex[:12]}",
+                        "name": fn.get("name", ""),
+                        "input": args,
+                    })
+                # An assistant turn with no text and no tool calls is
+                # dropped entirely.
+                if blocks:
+                    out_msgs.append({"role": "assistant", "content": blocks})
+                continue
+            # role == "user"
+            # Any role other than system/tool/assistant also lands here.
+            # Content that is neither str nor list (e.g. None) is skipped.
+            if isinstance(content, str):
+                out_msgs.append({"role": "user", "content": content})
+            elif isinstance(content, list):
+                blocks = []
+                for b in content:
+                    if not isinstance(b, dict):
+                        continue
+                    if b.get("type") == "text":
+                        blocks.append({"type": "text", "text": b.get("text", "")})
+                    elif b.get("type") == "image_url":
+                        url = (b.get("image_url") or {}).get("url", "")
+                        # Only inline data: URLs are converted; any other
+                        # image URL is silently left out of the request.
+                        if url.startswith("data:"):
+                            head, _, b64 = url.partition(",")
+                            # "data:image/png;base64" → "image/png".
+                            media = head.split(";")[0].split(":")[-1] or "image/png"
+                            blocks.append({
+                                "type": "image",
+                                "source": {"type": "base64",
+                                           "media_type": media, "data": b64},
+                            })
+                # Fall back to a single empty text block when nothing in
+                # the list could be converted.
+                out_msgs.append({
+                    "role": "user",
+                    "content": blocks or [{"type": "text", "text": ""}],
+                })
+        # Non-empty system parts are joined with a blank line between them.
+        if system_parts:
+            body["system"] = "\n\n".join(p for p in system_parts if p)
+        body["messages"] = out_msgs
+        if tools:
+            body["tools"] = []
+            for t in tools:
+                # Accept both the wrapped form and the bare function dict.
+                fn = t.get("function") or t
+                body["tools"].append({
+                    "name": fn.get("name", ""),
+                    "description": fn.get("description", ""),
+                    # Tools without parameters get an empty object schema.
+                    "input_schema": fn.get("parameters") or {
+                        "type": "object", "properties": {},
+                    },
+                })
+        return body
+    async def _call_anthropic_subscription_chat(
+        self,
+        messages: list[dict[str, Any]],
+        tools: list[dict] | None,
+        max_tokens: int | None,
+        temperature: float | None,
+    ) -> LLMResponse:
+        """Non-streaming direct call to api.anthropic.com using the OAuth
+        subscription Bearer."""
+        import httpx
+        body = self._build_anthropic_body(messages, tools, max_tokens, temperature, stream=False)
+        headers = self._anthropic_subscription_headers()
+        async with httpx.AsyncClient(timeout=httpx.Timeout(300.0, connect=15.0)) as client:
+            resp = await client.post(
+                "https://api.anthropic.com/v1/messages",
+                headers=headers, json=body,
+            )
+        if resp.status_code >= 400:
+            raise RuntimeError(
+                f"Anthropic subscription call failed: {resp.status_code} {resp.text}"
+            )
+        data = resp.json()
+        text = ""
+        thinking = ""
+        tool_calls: list[ToolUseBlock] = []
+        for block in data.get("content") or []:
+            btype = block.get("type")
+            if btype == "text":
+                text += block.get("text", "")
+            elif btype == "thinking":
+                thinking += block.get("thinking", "")
+            elif btype == "tool_use":
+                tool_calls.append(ToolUseBlock(
+                    id=block.get("id") or str(uuid.uuid4()),
+                    name=block.get("name", ""),
+                    input=block.get("input") or {},
+                ))
+        usage = data.get("usage") or {}
+        in_tok = int(usage.get("input_tokens", 0))
+        out_tok = int(usage.get("output_tokens", 0))
+        usage_dict = {
+            "prompt_tokens": in_tok,
+            "completion_tokens": out_tok,
+            "total_tokens": in_tok + out_tok,
+        }
+        get_global_tracker().record(self.model, usage_dict)
+        return LLMResponse(
+            content=text,
+            raw=data,
+            model=data.get("model") or self.model,
+            usage=usage_dict,
+            stop_reason=data.get("stop_reason"),
+            tool_calls=tool_calls,
+            thinking=thinking,
+        )
+    async def _call_anthropic_subscription_stream(
+        self,
+        messages: list[dict[str, Any]],
+        tools: list[dict] | None,
+        max_tokens: int | None,
+        temperature: float | None,
+    ) -> AsyncIterator[StreamEvent]:
+        """Streaming direct call to api.anthropic.com — yields
+        Cognos-internal StreamEvents derived from Anthropic SSE."""
+        import httpx
+        body = self._build_anthropic_body(messages, tools, max_tokens, temperature, stream=True)
+        headers = self._anthropic_subscription_headers()
+        yield StreamEvent(type="message_start")
+        block_types: dict[int, str] = {}
+        block_tool_names: dict[int, str] = {}
+        block_tool_inputs: dict[int, str] = {}
+        stop_reason: str | None = None
+        pending = ""
+        async with httpx.AsyncClient(timeout=httpx.Timeout(300.0, connect=15.0)) as client:
+            async with client.stream(
+                "POST", "https://api.anthropic.com/v1/messages",
+                headers=headers, json=body,
+            ) as resp:
+                if resp.status_code >= 400:
+                    body_bytes = await resp.aread()
+                    raise RuntimeError(
+                        f"Anthropic subscription stream failed: "
+                        f"{resp.status_code} {body_bytes.decode('utf-8', 'replace')}"
+                    )
+                async for chunk in resp.aiter_bytes():
+                    if not chunk:
+                        continue
+                    pending += chunk.decode("utf-8", errors="replace")
+                    while "\n\n" in pending:
+                        raw_event, pending = pending.split("\n\n", 1)
+                        if not raw_event.strip():
+                            continue
+                        evt_type = None
+                        data_lines: list[str] = []
+                        for line in raw_event.splitlines():
+                            if line.startswith("event:"):
+                                evt_type = line[6:].strip()
+                            elif line.startswith("data:"):
+                                data_lines.append(line[5:].strip())
+                        if not data_lines:
+                            continue
+                        try:
+                            data = json.loads("\n".join(data_lines))
+                        except Exception:
+                            continue
+                        if evt_type == "content_block_start":
+                            idx = int(data.get("index", -1))
+                            cb = data.get("content_block") or {}
+                            block_types[idx] = cb.get("type", "")
+                            if cb.get("type") == "tool_use":
+                                block_tool_names[idx] = cb.get("name", "")
+                                block_tool_inputs[idx] = ""
+                        elif evt_type == "content_block_delta":
+                            delta = data.get("delta") or {}
+                            dtype = delta.get("type")
+                            if dtype == "text_delta":
+                                yield StreamEvent(type="text_delta", delta=delta.get("text", ""))
+                            elif dtype == "thinking_delta":
+                                yield StreamEvent(type="thinking_delta", delta=delta.get("thinking", ""))
+                            elif dtype == "input_json_delta":
+                                idx = int(data.get("index", -1))
+                                block_tool_inputs[idx] = (
+                                    block_tool_inputs.get(idx, "")
+                                    + (delta.get("partial_json") or "")
+                                )
+                        elif evt_type == "content_block_stop":
+                            idx = int(data.get("index", -1))
+                            if block_types.get(idx) == "tool_use":
+                                raw = block_tool_inputs.get(idx, "")
+                                try:
+                                    args = json.loads(raw) if raw else {}
+                                except Exception:
+                                    args = {"_raw": raw}
+                                yield StreamEvent(
+                                    type="tool_use_end",
+                                    tool_use_id=f"toolu_{uuid.uuid4().hex[:12]}",
+                                    tool_name=block_tool_names.get(idx, ""),
+                                    tool_input=args,
+                                    block_index=idx,
+                                )
+                        elif evt_type == "message_delta":
+                            stop_reason = (data.get("delta") or {}).get("stop_reason") or stop_reason
+        yield StreamEvent(type="message_stop", stop_reason=stop_reason)
+    async def chat(
+        self,
+        messages: list[dict[str, Any]],
+        temperature: float | None = None,
+        max_tokens: int | None = None,
+        response_format: dict | None = None,
+        tools: list[dict] | None = None,
+        tool_choice: str | None = None,
+        caller: str | None = None,
+    ) -> LLMResponse:
+        """Send a chat completion request.
+        If `tools` is passed and the model supports native tool-calling,
+        pass them through. Otherwise, fall back to a prompt-based protocol.
+        Args:
+            messages: OpenAI-style chat messages.
+            temperature: Per-call override; None uses self.temperature on
+                the LiteLLM path.
+            max_tokens: Per-call override; None uses self.max_tokens.
+            response_format: Optional response-format dict (LiteLLM path
+                only; not forwarded on the subscription path).
+            tools: Optional tool definitions.
+            tool_choice: Optional tool-choice hint, sent only together
+                with native tools (not forwarded on the subscription path).
+            caller: Accepted for callers' convenience; not read in this
+                method body.
+        Returns:
+            An LLMResponse with text, tool calls, usage and thinking output.
+        Raises:
+            Exception: whatever the underlying call raises once retries
+                are exhausted or the error is not retryable.
+        """
+        # Subscription-OAuth path: web UI calls Anthropic via direct
+        # httpx (LiteLLM can't send Bearer-only auth cleanly).
+        if self._should_use_anthropic_subscription():
+            return await _with_retry(
+                lambda: self._call_anthropic_subscription_chat(
+                    messages, tools, max_tokens, temperature,
+                ),
+                label=f"LLM chat ({self.model}, subscription)",
+            )
+        # Exactly one of these is truthy when tools were supplied.
+        use_native_tools = tools and _has_native_tool_support(self.model)
+        use_prompt_tools = tools and not use_native_tools
+        if use_prompt_tools:
+            messages = self._inject_tool_prompt(messages, tools)
+        messages = _apply_cache_control(messages, self.model)
+        kwargs: dict[str, Any] = {
+            "model": self._route_model_for_litellm(),
+            "messages": messages,
+            "temperature": temperature if temperature is not None else self.temperature,
+            "max_tokens": max_tokens if max_tokens is not None else self.max_tokens,
+            "timeout": 300,
+        }
+        if response_format:
+            kwargs["response_format"] = response_format
+        if use_native_tools:
+            kwargs["tools"] = tools
+            if tool_choice:
+                kwargs["tool_choice"] = tool_choice
+        # Log the final failure here, then let it propagate unchanged.
+        try:
+            response = await _with_retry(
+                lambda: litellm.acompletion(**kwargs),
+                label=f"LLM chat ({self.model})",
+            )
+        except Exception as e:
+            logger.error(f"LLM call failed: {e}")
+            raise
+        # Only the first choice of the response is used.
+        message = response.choices[0].message
+        content = message.content or ""
+        stop_reason = response.choices[0].finish_reason
+        tool_calls: list[ToolUseBlock] = []
+        if use_native_tools and getattr(message, "tool_calls", None):
+            for tc in message.tool_calls:
+                raw_args = tc.function.arguments
+                # Ollama/Kimi may send `arguments` as a dict instead
+                # of a JSON string (OpenAI's spec says string, but
+                # many local backends diverge). Accept both.
+                if isinstance(raw_args, dict):
+                    args = raw_args
+                elif raw_args:
+                    # Invalid JSON is preserved under "_raw" rather
+                    # than being dropped.
+                    try:
+                        args = json.loads(raw_args)
+                    except json.JSONDecodeError:
+                        args = {"_raw": raw_args}
+                else:
+                    args = {}
+                tool_calls.append(ToolUseBlock(
+                    id=tc.id or str(uuid.uuid4()),
+                    name=tc.function.name,
+                    input=args,
+                ))
+        elif use_prompt_tools:
+            # Parse tool calls from the text content
+            parsed_calls, stripped = self._parse_prompt_tool_calls(content)
+            if parsed_calls:
+                tool_calls = parsed_calls
+                content = stripped
+        # Salvage path: if we asked for native tools but got an empty
+        # tool_calls field AND content looks like a JSON code block,
+        # try the prompt parser. Some models (GLM-5.1, some Llamas)
+        # emit tool calls as text even when given the function-calling
+        # API. This makes them work without a per-model allowlist.
+        if use_native_tools and not tool_calls and content:
+            parsed_calls, stripped = self._parse_prompt_tool_calls(content)
+            if parsed_calls:
+                tool_calls = parsed_calls
+                content = stripped
+        # Some models (gemma4 in particular) leak their tokenizer's
+        # special tokens into output text — `<tool_call|>`, `<thought`,
+        # `<channel|>`, `<|im_start|>` etc. Strip them.
+        if content:
+            content = _strip_template_leaks(content).strip()
+        usage_dict = {
+            "prompt_tokens": response.usage.prompt_tokens,
+            "completion_tokens": response.usage.completion_tokens,
+            "total_tokens": response.usage.total_tokens,
+        }
+        # Usage is recorded under the configured model id (self.model).
+        get_global_tracker().record(self.model, usage_dict)
+        # Thinking models (gemma4, kimi, etc.) put their reasoning in a
+        # separate `thinking` field on the message. Pull it out so we can
+        # forward it in the Anthropic-compat layer.
+        thinking_text = ""
+        # Best-effort: `thinking` wins over `reasoning_content`; any
+        # failure while reading them leaves the text empty.
+        try:
+            thinking_text = (
+                getattr(message, "thinking", None)
+                or getattr(message, "reasoning_content", None)
+                or ""
+            )
+        except Exception:
+            pass
+        return LLMResponse(
+            content=content,
+            raw=response.model_dump(),
+            model=response.model or self.model,
+            usage=usage_dict,
+            stop_reason=stop_reason,
+            tool_calls=tool_calls,
+            thinking=str(thinking_text or ""),
+        )
+    # ------------------------------------------------------------------
+    # Streaming
+    # ------------------------------------------------------------------
+    async def stream(
+        self,
+        messages: list[dict[str, Any]],
+        temperature: float | None = None,
+        max_tokens: int | None = None,
+        tools: list[dict] | None = None,
+        tool_choice: str | None = None,
+        caller: str | None = None,
+    ) -> AsyncIterator[StreamEvent]:
+        """Stream a chat completion, yielding StreamEvent deltas."""
+        use_native_tools = tools and _has_native_tool_support(self.model)
+        use_prompt_tools = tools and not use_native_tools
+        if use_prompt_tools:
+            messages = self._inject_tool_prompt(messages, tools)
+        messages = _apply_cache_control(messages, self.model)
+        # Subscription-OAuth streaming path: web UI calls Anthropic via
+        # direct httpx (LiteLLM auth limitation, see chat()).
+        if self._should_use_anthropic_subscription():
+            async for event in self._call_anthropic_subscription_stream(
+                messages, tools, max_tokens, temperature,
+            ):
+                yield event
+            return
+        kwargs: dict[str, Any] = {
+            "model": self._route_model_for_litellm(),
+            "messages": messages,
+            "temperature": temperature if temperature is not None else self.temperature,
+            "max_tokens": max_tokens if max_tokens is not None else self.max_tokens,
+            "timeout": 300,
+            "stream": True,
+        }
+        if use_native_tools:
+            kwargs["tools"] = tools
+            if tool_choice:
+                kwargs["tool_choice"] = tool_choice
+        yield StreamEvent(type="message_start")
+        # Track partial tool_calls being streamed (native path)
+        partial_tools: dict[int, dict[str, Any]] = {}
+        buffered_text = ""  # for prompt-based tool parsing
+        stop_reason: str | None = None
+        try:
+            stream = await _with_retry(
+                lambda: litellm.acompletion(**kwargs),
+                label=f"LLM stream ({self.model})",
+            )
+            async for chunk in stream:
+                choice = chunk.choices[0]
+                delta = choice.delta
+                # Text delta — strip tokenizer-leak tool-call delimiters
+                # (gemma4 in particular leaks `<tool_call|>` etc. into text)
+                text = getattr(delta, "content", None)
+                if text:
+                    text = _strip_template_leaks(text)
+                    if use_prompt_tools:
+                        buffered_text += text
+                    elif text:
+                        yield StreamEvent(type="text_delta", delta=text)
+                # Thinking delta — gemma4/kimi/deepseek-style models emit
+                # reasoning incrementally in a separate `thinking` (or
+                # `reasoning_content`) field on each chunk.
+                thinking_delta = (
+                    getattr(delta, "thinking", None)
+                    or getattr(delta, "reasoning_content", None)
+                )
+                if thinking_delta:
+                    yield StreamEvent(type="thinking_delta", delta=thinking_delta)
+                # Native tool_calls streaming
+                tool_deltas = getattr(delta, "tool_calls", None)
+                if tool_deltas:
+                    for td in tool_deltas:
+                        idx = getattr(td, "index", 0) or 0
+                        slot = partial_tools.setdefault(idx, {"id": None, "name": None, "args": ""})
+                        if getattr(td, "id", None):
+                            slot["id"] = td.id
+                        fn = getattr(td, "function", None)
+                        if fn:
+                            if getattr(fn, "name", None):
+                                slot["name"] = fn.name
+                            if getattr(fn, "arguments", None):
+                                fn_args = fn.arguments
+                                # Ollama/Kimi sometimes deliver the
+                                # arguments as a complete dict in one
+                                # shot rather than streaming string
+                                # deltas. Accept both shapes.
+                                if isinstance(fn_args, dict):
+                                    slot["args"] = fn_args
+                                else:
+                                    slot["args"] = (slot["args"] or "") + fn_args
+                if choice.finish_reason:
+                    stop_reason = choice.finish_reason
+        except Exception as e:
+            logger.error(f"Stream failed: {e}")
+            raise
+        # Prompt-based tool parsing from buffered text
+        if use_prompt_tools and buffered_text:
+            parsed_calls, stripped = self._parse_prompt_tool_calls(buffered_text)
+            if stripped:
+                yield StreamEvent(type="text_delta", delta=stripped)
+            for call in parsed_calls:
+                yield StreamEvent(
+                    type="tool_use_end",
+                    tool_use_id=call.id,
+                    tool_name=call.name,
+                    tool_input=call.input,
+                )
+        # Emit native tool calls
+        for idx, slot in partial_tools.items():
+            raw_args = slot["args"]
+            if isinstance(raw_args, dict):
+                args = raw_args
+            elif raw_args:
+                try:
+                    args = json.loads(raw_args)
+                except json.JSONDecodeError:
+                    args = {"_raw": raw_args}
+            else:
+                args = {}
+            yield StreamEvent(
+                type="tool_use_end",
+                tool_use_id=slot["id"] or str(uuid.uuid4()),
+                tool_name=slot["name"] or "",
+                tool_input=args,
+                block_index=idx,
+            )
+        yield StreamEvent(type="message_stop", stop_reason=stop_reason)
+    # ------------------------------------------------------------------
+    # Structured output (Pydantic)
+    # ------------------------------------------------------------------
+    async def structured_output(
+        self,
+        prompt: str,
+        system: str | None = None,
+        schema_hint: str = "",
+        response_model: type[BaseModel] | None = None,
+        caller: str | None = None,
+        max_tokens: int | None = None,
+    ) -> Any:
+        """Get JSON-structured output from the LLM.
+        If ``response_model`` is provided, returns a validated instance.
+        Otherwise returns a plain dict (legacy behavior).
+        ``max_tokens`` overrides the provider's default — long-form
+        structured generation (e.g. a 15-feature backlog) needs more
+        than the 4 k default or the response gets truncated mid-string
+        and the validator fails. Pass ``8192`` or higher for those.
+        """
+        if response_model is not None:
+            schema = response_model.model_json_schema()
+            json_prompt = (
+                f"{prompt}\n\n"
+                f"Respond with valid JSON matching this schema:\n"
+                f"{json.dumps(schema, indent=2)}\n\n"
+                f"Respond ONLY with the JSON object, no markdown or explanation."
+            )
+            response = await self.complete(
+                prompt=json_prompt, system=system, max_tokens=max_tokens,
+            )
+            text = _strip_code_fence(response.content.strip())
+            try:
+                return response_model.model_validate_json(text)
+            except Exception as first_err:
+                # Salvage path. Tries three repairs in order:
+                #   1. Cheap salvage (prose framing, trailing commas,
+                #      smart quotes) — handles most well-formed-but-noisy
+                #      responses.
+                #   2. Truncation repair — if the response was cut off
+                #      mid-string (max_tokens), close the open string,
+                #      drop the partial element, close any open arrays
+                #      and objects so the front of the structure parses.
+                #   3. Both combined.
+                # Each retry that yields a parseable validator hit short-
+                # circuits the chain; only if every repair fails do we
+                # propagate the original error.
+                attempts: list[tuple[str, str]] = []
+                cheap = _salvage_json(text)
+                if cheap:
+                    attempts.append(("cheap", cheap))
+                truncated = _repair_truncated_json(text)
+                if truncated:
+                    attempts.append(("truncated", truncated))
+                    # also try cheap on top of truncation repair
+                    cheap_on_trunc = _salvage_json(truncated)
+                    if cheap_on_trunc and cheap_on_trunc != truncated:
+                        attempts.append(("trunc+cheap", cheap_on_trunc))
+                last_err = first_err
+                for label, candidate in attempts:
+                    try:
+                        result = response_model.model_validate_json(candidate)
+                        logger.info(
+                            f"structured_output: salvaged via {label} "
+                            f"({len(text)} → {len(candidate)} chars)"
+                        )
+                        return result
+                    except Exception as e:
+                        last_err = e
+                        logger.debug(
+                            f"structured_output: {label} salvage failed: {e}"
+                        )
+                logger.warning(
+                    f"structured_output: every salvage attempt failed. "
+                    f"first_err={first_err}; last_err={last_err}; "
+                    f"raw[:200]={text[:200]!r}; raw[-200:]={text[-200:]!r}"
+                )
+                raise last_err
+        # Legacy dict-based path
+        json_prompt = prompt
+        if schema_hint:
+            json_prompt += f"\n\nRespond with valid JSON matching this structure:\n{schema_hint}"
+        json_prompt += "\n\nRespond ONLY with valid JSON, no markdown or explanation."
+        response = await self.complete(prompt=json_prompt, system=system)
+        text = _strip_code_fence(response.content.strip())
+        return json.loads(text)
+    def switch_model(self, model: str) -> None:
+        """Switch to a different model at runtime."""
+        logger.info(f"Switching model: {self.model} -> {model}")
+        self.model = model
+    # ------------------------------------------------------------------
+    # Fill-in-the-middle (FIM) — code completion for editor-style gap fills.
+    # ------------------------------------------------------------------
+    # FIM token templates per model family. Match against the (case-folded)
+    # model id; first hit wins. Add new families inline as needed.
+    # Each entry is (family_key, template). `fim_template_for` lowercases
+    # the model id and returns the template of the first entry whose
+    # family_key is a substring of it, so order matters.
+    # Template fields, as consumed by `fim_complete`:
+    #   prefix / suffix / middle — sentinels assembled as
+    #       prefix + <text before gap> + suffix + <text after gap> + middle
+    #   stop — default stop sequences when the caller passes none.
+    _FIM_TEMPLATES: tuple[tuple[str, dict[str, Any]], ...] = (
+        ("qwen", {  # qwen2.5-coder, qwen3-coder, qwen3-coder-next
+            "prefix": "<|fim_prefix|>",
+            "suffix": "<|fim_suffix|>",
+            "middle": "<|fim_middle|>",
+            "stop": ["<|endoftext|>", "<|fim_pad|>", "<|im_end|>",
+                     "<|repo_name|>", "<|file_sep|>"],
+        }),
+        # NOTE: the deepseek sentinels use fullwidth bars and a special
+        # underscore-like character, not ASCII `|` and `_`.
+        ("deepseek", {
+            "prefix": "<｜fim▁begin｜>",
+            "suffix": "<｜fim▁hole｜>",
+            "middle": "<｜fim▁end｜>",
+            "stop": ["<｜end▁of▁sentence｜>"],
+        }),
+        # The spaces inside the codellama sentinels are part of the
+        # strings and end up in the assembled prompt.
+        ("codellama", {
+            "prefix": "<PRE> ",
+            "suffix": " <SUF>",
+            "middle": " <MID>",
+            "stop": ["<EOT>"],
+        }),
+        ("starcoder", {
+            "prefix": "<fim_prefix>",
+            "suffix": "<fim_suffix>",
+            "middle": "<fim_middle>",
+            "stop": ["<|endoftext|>"],
+        }),
+        ("codegemma", {
+            "prefix": "<|fim_prefix|>",
+            "suffix": "<|fim_suffix|>",
+            "middle": "<|fim_middle|>",
+            "stop": ["<|file_separator|>", "<|endoftext|>"],
+        }),
+    )
+    @classmethod
+    def fim_template_for(cls, model: str) -> dict[str, Any] | None:
+        """Return the FIM token template for a model, or None if unknown."""
+        m = model.lower()
+        for key, tpl in cls._FIM_TEMPLATES:
+            if key in m:
+                return tpl
+        return None
+    async def fim_complete(
+        self,
+        prefix: str,
+        suffix: str = "",
+        model: str | None = None,
+        temperature: float | None = None,
+        max_tokens: int | None = None,
+        stop: list[str] | None = None,
+        ollama_host: str = "http://localhost:11434",
+    ) -> str:
+        """Fill-in-the-middle completion.
+        Generates the text that should appear between `prefix` and
+        `suffix`. Requires a FIM-trained code model on Ollama
+        (qwen-coder, deepseek-coder, codellama, starcoder, codegemma).
+        Anthropic/OpenAI chat models cannot do FIM natively — raises
+        ValueError if asked.
+        """
+        use_model = model or self.model
+        bare = use_model.split("/", 1)[-1] if "/" in use_model else use_model
+        tpl = self.fim_template_for(bare)
+        if tpl is None:
+            raise ValueError(
+                f"Model '{use_model}' is not a known FIM-capable model. "
+                "Use a qwen-coder / deepseek-coder / codellama / "
+                "starcoder / codegemma variant on Ollama."
+            )
+        if not (use_model.startswith("ollama/")
+                or use_model.startswith("ollama_chat/")
+                or "/" not in use_model):
+            raise ValueError(
+                f"FIM only supported via Ollama backend (got '{use_model}')."
+            )
+        prompt = f"{tpl['prefix']}{prefix}{tpl['suffix']}{suffix}{tpl['middle']}"
+        body: dict[str, Any] = {
+            "model": bare,
+            "prompt": prompt,
+            "raw": True,        # do not wrap in chat template
+            "stream": False,
+            "options": {
+                "temperature": temperature if temperature is not None else 0.1,
+                "num_predict": max_tokens if max_tokens is not None else 128,
+                "stop": list(stop) if stop else list(tpl["stop"]),
+            },
+        }
+        import httpx
+        async with httpx.AsyncClient(timeout=httpx.Timeout(60.0, connect=10.0)) as client:
+            resp = await client.post(f"{ollama_host}/api/generate", json=body)
+            resp.raise_for_status()
+            data = resp.json()
+        completion = data.get("response", "")
+        # Trim any leaked stop tokens (Ollama usually strips them, but be safe).
+        for s in body["options"]["stop"]:
+            idx = completion.find(s)
+            if idx >= 0:
+                completion = completion[:idx]
+                break
+        return completion
+    # ------------------------------------------------------------------
+    # Prompt-based tool-calling fallback (for models w/o native support)
+    # ------------------------------------------------------------------
+    # Tool calls in raw text show up under several tag names depending
+    # on which model you talk to:
+    #   <tool_call>...</tool_call>           — Cognos's prompt-fallback
+    #   <function_call>...</function_call>    — GLM-5.1 with tools=
+    #   <function>...</function>              — older Mistral / Llama
+    #   <action>...</action>                  — some experimental models
+    # We accept any of these for robustness.
+    # Matches one tagged tool-call block; group 1 is the JSON object text.
+    # The body is matched non-greedily, and DOTALL lets it span lines.
+    # The opening and closing tag names are matched independently, so a
+    # block opened with one accepted tag and closed with another still
+    # matches.
+    _TOOL_CALL_RE = re.compile(
+        r"<(?:tool_call|function_call|function|action)>\s*(\{.*?\})\s*"
+        r"</(?:tool_call|function_call|function|action)>",
+        re.DOTALL,
+    )
+    # Matches a Markdown code fence (optionally tagged `json`) whose
+    # content is a single `{...}` object containing no backtick; group 1
+    # is the object text.
+    _JSON_FENCE_RE = re.compile(
+        r"```(?:json)?\s*(\{[^`]*?\})\s*```",
+        re.DOTALL,
+    )
+    def _inject_tool_prompt(
+        self,
+        messages: list[dict[str, Any]],
+        tools: list[dict],
+    ) -> list[dict[str, Any]]:
+        """Inject a system prompt describing available tools for models
+        without native tool-calling support."""
+        tool_descriptions = []
+        for t in tools:
+            fn = t.get("function", {})
+            tool_descriptions.append(
+                f"- {fn.get('name')}: {fn.get('description')}\n"
+                f"  input_schema: {json.dumps(fn.get('parameters', {}))}"
+            )
+        instruction = (
+            "You have access to these tools:\n"
+            + "\n".join(tool_descriptions)
+            + "\n\nTo call a tool, emit a block like this EXACTLY:\n"
+            '<tool_call>{"name": "ToolName", "input": {"arg": "value"}}</tool_call>\n'
+            "You may emit multiple tool_call blocks. Any text outside tool_call "
+            "blocks is shown to the user as your response. If no tools are needed, "
+            "just respond normally."
+        )
+        # Prepend to existing system message, or insert a new one
+        if messages and messages[0].get("role") == "system":
+            messages = list(messages)
+            messages[0] = {
+                **messages[0],
+                "content": f"{messages[0].get('content', '')}\n\n{instruction}",
+            }
+        else:
+            messages = [{"role": "system", "content": instruction}, *messages]
+        return messages
+    def _parse_prompt_tool_calls(
+        self,
+        text: str,
+    ) -> tuple[list[ToolUseBlock], str]:
+        """Parse tool calls from raw text. Handles two formats:
+          - Explicit `<tool_call>{...}</tool_call>` (the format we ask
+            the model to use).
+          - Bare `\`\`\`json {...}\`\`\`` code blocks (the format some
+            models — GLM especially — naturally emit).
+        Returns (tool_calls, text_with_blocks_removed).
+        """
+        calls: list[ToolUseBlock] = []
+        stripped = text
+        # 1. Explicit <tool_call> blocks (preferred format).
+        for match in self._TOOL_CALL_RE.finditer(text):
+            try:
+                data = json.loads(match.group(1))
+                calls.append(ToolUseBlock(
+                    name=data.get("name", ""),
+                    input=data.get("input", {}),
+                ))
+            except (json.JSONDecodeError, KeyError) as e:
+                logger.warning(f"Failed to parse tool_call block: {e}")
+        stripped = self._TOOL_CALL_RE.sub("", stripped)
+        # 2. ```json {...}``` blocks — only consume them if they
+        # actually look like a tool call (have a 'name' key plus
+        # 'arguments' or 'input').
+        for match in self._JSON_FENCE_RE.finditer(stripped):
+            try:
+                data = json.loads(match.group(1))
+            except json.JSONDecodeError:
+                continue
+            name = data.get("name")
+            if not name:
+                continue
+            args = data.get("input")
+            if args is None:
+                args = data.get("arguments", {})
+            calls.append(ToolUseBlock(name=name, input=args or {}))
+        # Only remove fenced blocks that produced calls — leave normal
+        # code-block content alone. Re-iterate and replace just those.
+        if calls:
+            def _rep(m: re.Match) -> str:
+                try:
+                    d = json.loads(m.group(1))
+                    if isinstance(d, dict) and d.get("name"):
+                        return ""
+                except Exception:
+                    pass
+                return m.group(0)
+            stripped = self._JSON_FENCE_RE.sub(_rep, stripped)
+        return calls, stripped.strip()
+# NOTE: this pattern also matches well-formed `<tool_call>` / `</tool_call>`
+# (and `<function_call>`, `<function>`, `<action>`) tags, so text that may
+# contain prompt-style tool calls must be parsed BEFORE it is run through
+# `_strip_template_leaks`.
+_TEMPLATE_LEAK_RE = re.compile(
+    # Two flavors of leak observed from gemma4 / qwen / llama chat templates:
+    #   1. Generic `<|name|>` special tokens — `<|tool_call|>`, `<|channel|>`,
+    #      `<|im_start|>`, `<|/tool_call|>`, etc. Names are limited to
+    #      letters, `_` and `/` (no digits).
+    #   2. Bare tag names that occasionally appear with or without closing —
+    #      `<tool_call>`, `</tool_call>`, `<tool_call|>`, `<thought`,
+    #      `<channel|>`, `<thinking>`, `<action>`. The `\b` after the tag
+    #      name avoids eating real words like "<thoughts" or "<channels".
+    #      The trailing `|` and `>` are both optional, which is what lets
+    #      an unterminated `<thought` match.
+    r"<\|[a-z_/]+\|>"
+    r"|"
+    r"</?"
+    r"(?:tool_call|function_call|function|action|thought|thinking|channel|"
+    r"im_start|im_end|user_token|assistant_token|system_token)"
+    r"\b\|?>?",
+    # Tag names match case-insensitively.
+    re.IGNORECASE,
+)
+def _strip_template_leaks(text: str) -> str:
+    """Remove tokenizer-template artifacts that leak into model output."""
+    if not text:
+        return text
+    return _TEMPLATE_LEAK_RE.sub("", text)
+def _strip_code_fence(text: str) -> str:
+    """Strip ```json ... ``` fences if present."""
+    if text.startswith("```"):
+        lines = text.split("\n")
+        text = "\n".join(lines[1:-1]) if lines[-1].strip() == "```" else "\n".join(lines[1:])
+    return text.strip()
+def _salvage_json(text: str) -> str | None:
+    """Best-effort extraction of a JSON object from a noisy LLM response.
+    Handles three common failure modes:
+      1. Prose framing — find the outermost {...} pair and slice it out.
+      2. Trailing commas before } or ] (some LLMs love these).
+      3. Smart quotes / curly apostrophes that break json.
+    Returns the cleaned string, or None if no balanced object was found.
+    Used by `structured_output` as a salvage path on first parse failure.
+    """
+    if not text:
+        return None
+    # 1. Slice to the outermost {...}
+    start = text.find("{")
+    end = text.rfind("}")
+    if start == -1 or end == -1 or end < start:
+        return None
+    candidate = text[start : end + 1]
+    # 2. Strip trailing commas: ",}" -> "}", ",]" -> "]"
+    import re
+    candidate = re.sub(r",(\s*[}\]])", r"\1", candidate)
+    # 3. Replace smart quotes
+    candidate = (candidate
+                 .replace("“", '"').replace("”", '"')
+                 .replace("‘", "'").replace("’", "'"))
+    return candidate
+def _repair_truncated_json(text: str) -> str | None:
+    """Repair a response that was cut off mid-generation.
+    Common when ``max_tokens`` runs out partway through a long array of
+    objects (e.g. a 15-feature backlog). The tail of ``text`` is:
+      - an unterminated string (parser reports "EOF while parsing a
+        string"), OR
+      - a key with no value (``"foo": ``), OR
+      - a half-written object (``{"title": "X", "descr``)
+    Strategy: scan from the first ``{`` with a small state machine that
+    tracks in-string/escape state and a stack of open ``{`` / ``[``.
+    While scanning, record "clean" cut positions — the index just after
+    every closing bracket and just after every comma — in a dict keyed
+    by nesting depth (later positions overwrite earlier ones under the
+    same key). If the scan ends with containers still open, cut at the
+    position stored under the LARGEST depth key, drop a trailing comma,
+    and append the ``]`` / ``}`` needed to balance what is left.
+    Everything after the cut is discarded. Any text before the first
+    ``{`` is kept in front of the result.
+    Returns the repaired string, or None when: ``text`` is empty or has
+    no ``{``; a closing bracket appears with nothing open; a comma
+    appears outside every container; the scan ends with nothing left
+    open (not truncated); or no clean position was recorded.
+    NOTE(review): the position under the largest depth key is not
+    necessarily the latest clean position in the text, so complete
+    content at a shallower depth that comes after it is dropped too.
+    NOTE(review): a half-written object that already has one complete
+    ``"key": value,`` pair may be kept with just those pairs rather than
+    dropped as a whole — confirm this is what validators downstream
+    expect.
+    """
+    if not text:
+        return None
+    # Locate the first `{`; scanning starts there.
+    obj_start = text.find("{")
+    if obj_start == -1:
+        return None
+    body = text[obj_start:]
+    # Scanner state. `stack` holds only the opening bracket characters of
+    # the containers that are currently open; the kind of closing bracket
+    # is not checked against it when popping.
+    in_string = False
+    escape = False
+    stack: list[str] = []   # holds '{' and '[' chars
+    # last_clean_at_depth[k] = index (into `body`) just past the most
+    # recent clean boundary recorded under depth key k. A closing bracket
+    # is keyed by the number of containers still open after the pop; a
+    # comma is keyed by the number of open containers minus one.
+    last_clean_at_depth: dict[int, int] = {}
+    for i, ch in enumerate(body):
+        if in_string:
+            # Inside a string only escapes and the closing quote matter.
+            if escape:
+                escape = False
+            elif ch == "\\":
+                escape = True
+            elif ch == '"':
+                in_string = False
+            continue
+        if ch == '"':
+            in_string = True
+            continue
+        if ch == "{" or ch == "[":
+            stack.append(ch)
+            continue
+        if ch == "}" or ch == "]":
+            if not stack:
+                # Closing bracket with nothing open — give up.
+                return None
+            stack.pop()
+            # A child container just completed; the position right after
+            # it is a clean boundary.
+            last_clean_at_depth[len(stack)] = i + 1
+            continue
+        if ch == "," and not stack:
+            # Comma outside any container — malformed; bail.
+            return None
+        if ch == "," and stack:
+            # Position right after a separator inside an open container.
+            last_clean_at_depth[len(stack) - 1] = i + 1
+            continue
+    if not stack:
+        # Walk succeeded without truncation — nothing to repair here.
+        return None
+    if not last_clean_at_depth:
+        # No clean child landed before the truncation — nothing to keep.
+        return None
+    # Cut at the boundary stored under the largest depth key.
+    deepest = max(last_clean_at_depth.keys())
+    cut = last_clean_at_depth[deepest]
+    head = body[:cut]
+    # Trim a trailing comma before closing so we don't emit ",]" or ",}"
+    head = head.rstrip()
+    if head.endswith(","):
+        head = head[:-1]
+    # Work out which containers are still open in `head` by scanning it
+    # again with the same string/escape rules; `survivor` ends up holding
+    # their opening brackets, outermost first.
+    survivor: list[str] = []
+    in_s = False
+    esc = False
+    for ch in head:
+        if in_s:
+            if esc: esc = False
+            elif ch == "\\": esc = True
+            elif ch == '"': in_s = False
+            continue
+        if ch == '"': in_s = True
+        elif ch in "{[": survivor.append(ch)
+        elif ch in "}]":
+            if survivor:
+                survivor.pop()
+    # Close the open containers innermost-first.
+    closers = "".join("}" if c == "{" else "]" for c in reversed(survivor))
+    repaired = text[:obj_start] + head + closers
+    # Cheap salvage on top (trailing commas inside repaired text).
+    import re
+    repaired = re.sub(r",(\s*[}\]])", r"\1", repaired)
+    return repaired
+# ---------------------------------------------------------------------------
+# Module-level FIM convenience function
+#
+# Lets any caller — Caudate's dispatch hook, an editor route, the Edit
+# tool, an external script — invoke FIM with one import and no provider
+# instance. Spawns a transient LLMProvider per call (cheap: just config,
+# the httpx client is per-call inside `fim_complete`).
+#
+# Default model is qwen2.5-coder:1.5b because it's the smallest FIM
+# model on the local Ollama and gives ~50-100ms latency suitable for
+# in-editor autocomplete. Override with `model=` for heavier gap-fills
+# (qwen3-coder-next, deepseek-coder, etc.).
+# ---------------------------------------------------------------------------
+DEFAULT_FIM_MODEL = "ollama/qwen2.5-coder:1.5b"
+async def fim_complete(
+    prefix: str,
+    suffix: str = "",
+    *,
+    model: str = DEFAULT_FIM_MODEL,
+    temperature: float | None = None,
+    max_tokens: int | None = None,
+    stop: list[str] | None = None,
+    ollama_host: str = "http://localhost:11434",
+) -> str:
+    """Fill-in-the-middle: generate the text between `prefix` and `suffix`.
+    Module-level entry point — does not require an existing LLMProvider.
+    Routes through LLMProvider.fim_complete(); see that method for full
+    semantics, supported model families, and error cases.
+    Caudate can call this directly via `from llm.provider import
+    fim_complete`. It's a side-channel: it does not go through the
+    System-1/System-2 chat router and does not update tracker state.
+    """
+    provider = LLMProvider(model=model)
+    return await provider.fim_complete(
+        prefix=prefix,
+        suffix=suffix,
+        model=model,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        stop=stop,
+        ollama_host=ollama_host,
+    )