PyPI - caudate-cli - Versions diffs - 0.1.0__py3-none-any.whl - Mend

caudate-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (153)

api/__init__.py +5 -0
api/anthropic_compat.py +1518 -0
api/artifact_viewer.py +366 -0
api/caudate_middleware.py +618 -0
api/forge_bootstrapper_routes.py +377 -0
api/forge_routes.py +630 -0
api/forge_system_routes.py +294 -0
api/openai_compat.py +1993 -0
api/server.py +667 -0
api/storyboard_page.py +677 -0
caudate_cli-0.1.0.dist-info/METADATA +354 -0
caudate_cli-0.1.0.dist-info/RECORD +153 -0
caudate_cli-0.1.0.dist-info/WHEEL +5 -0
caudate_cli-0.1.0.dist-info/entry_points.txt +2 -0
caudate_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
caudate_cli-0.1.0.dist-info/top_level.txt +14 -0
cognos_mcp/__init__.py +4 -0
cognos_mcp/bridge.py +41 -0
cognos_mcp/client.py +70 -0
cognos_mcp/config.py +49 -0
cognos_mcp/server.py +66 -0
config.py +82 -0
core/__init__.py +0 -0
core/agent.py +468 -0
core/agentic_loop.py +731 -0
core/anthropic_auth.py +91 -0
core/background.py +113 -0
core/banner.py +134 -0
core/bootstrap.py +292 -0
core/citations.py +131 -0
core/compaction.py +109 -0
core/constitution.py +198 -0
core/diff_viewer.py +87 -0
core/export.py +85 -0
core/file_refs.py +119 -0
core/files.py +199 -0
core/hooks.py +209 -0
core/image.py +599 -0
core/input.py +91 -0
core/loop.py +238 -0
core/memory_md.py +147 -0
core/notifications.py +99 -0
core/ownership.py +181 -0
core/paste.py +81 -0
core/permissions.py +210 -0
core/plan_mode.py +215 -0
core/sandbox_prompt.py +185 -0
core/scheduler.py +195 -0
core/schemas.py +202 -0
core/session.py +90 -0
core/settings.py +132 -0
core/skills.py +398 -0
core/slash_commands.py +977 -0
core/statusline.py +61 -0
core/subagent.py +300 -0
core/thinking.py +50 -0
core/updater.py +122 -0
core/usage.py +109 -0
core/worktree.py +93 -0
execution/__init__.py +0 -0
execution/executor.py +329 -0
execution/plugins.py +108 -0
execution/tools/__init__.py +0 -0
execution/tools/agent_tool.py +107 -0
execution/tools/agentic_tool.py +297 -0
execution/tools/artifact_tool.py +191 -0
execution/tools/ask_user_question_tool.py +137 -0
execution/tools/base.py +81 -0
execution/tools/calculator_tool.py +137 -0
execution/tools/cognos_card_tool.py +124 -0
execution/tools/cron_tool.py +215 -0
execution/tools/datetime_tool.py +215 -0
execution/tools/describe_image_tool.py +161 -0
execution/tools/draw_tool.py +164 -0
execution/tools/edit_image_tool.py +262 -0
execution/tools/edit_tool.py +245 -0
execution/tools/file_tool.py +90 -0
execution/tools/find_anywhere_tool.py +255 -0
execution/tools/forge_feature_tools.py +377 -0
execution/tools/glob_tool.py +59 -0
execution/tools/grep_tool.py +89 -0
execution/tools/http_request_tool.py +224 -0
execution/tools/load_skill_tool.py +104 -0
execution/tools/longcat_avatar_tool.py +384 -0
execution/tools/mcp_tool.py +100 -0
execution/tools/notebook_tool.py +279 -0
execution/tools/openapi_tool.py +440 -0
execution/tools/plan_mode_tool.py +95 -0
execution/tools/push_notification_tool.py +157 -0
execution/tools/python_tool.py +61 -0
execution/tools/respond_tool.py +40 -0
execution/tools/sandbox_tool.py +378 -0
execution/tools/search_tool.py +153 -0
execution/tools/semantic_search_tool.py +106 -0
execution/tools/shell_tool.py +283 -0
execution/tools/speak_tool.py +134 -0
execution/tools/storyboard_tool.py +727 -0
execution/tools/system_info_tool.py +212 -0
execution/tools/task_tool.py +323 -0
execution/tools/think_tool.py +49 -0
execution/tools/transcribe_audio_tool.py +86 -0
execution/tools/update_memory_tool.py +92 -0
execution/tools/web_fetch_tool.py +82 -0
execution/tools/worktree_tool.py +174 -0
llm/__init__.py +0 -0
llm/fallback.py +116 -0
llm/models.py +320 -0
llm/provider.py +1356 -0
llm/router.py +373 -0
main.py +1889 -0
memory/__init__.py +0 -0
memory/episodic.py +99 -0
memory/procedural.py +145 -0
memory/semantic.py +71 -0
memory/working.py +64 -0
nn/__init__.py +43 -0
nn/auto_evolve.py +245 -0
nn/caudate.py +136 -0
nn/config.py +141 -0
nn/consolidator.py +81 -0
nn/data.py +1635 -0
nn/encoder.py +258 -0
nn/forge_advisor.py +303 -0
nn/format.py +235 -0
nn/heads.py +432 -0
nn/observer.py +994 -0
nn/policy.py +214 -0
nn/runtime.py +343 -0
nn/scorer.py +175 -0
nn/trainer.py +515 -0
nn/vision.py +352 -0
personality/__init__.py +23 -0
personality/engine.py +129 -0
personality/identity.py +144 -0
personality/inner_voice.py +100 -0
personality/mood.py +205 -0
planning/__init__.py +0 -0
planning/dev_server.py +221 -0
planning/forge_models.py +718 -0
planning/orchestrator.py +1363 -0
planning/planner.py +451 -0
planning/task_graph.py +61 -0
reflection/__init__.py +0 -0
reflection/meta_learner.py +156 -0
reflection/reflector.py +127 -0
ui/__init__.py +5 -0
ui/display.py +88 -0
voice/__init__.py +0 -0
voice/conversation.py +125 -0
voice/listener.py +111 -0
voice/speaker.py +59 -0
voice/stt.py +126 -0
voice/tts.py +214 -0

nn/observer.py ADDED

@@ -0,0 +1,994 @@
+"""Live observer — the bridge between the running agent and Caudate.
+Sits between AgenticLoop and the trainer. On every turn:
+  1. **Predict**  — calls Caudate (if loaded) to get a prediction. Logged.
+  2. **Capture**  — records the state at the start of the turn.
+  3. **Reward**   — when the turn finishes, derives a reward from the
+                   reflector's score (or a heuristic if no reflector).
+  4. **Persist**  — writes (state, action, reward) to data/nn/replay.jsonl.
+  5. **Auto-train** — when ≥N new samples accumulated, kicks an async
+                   training run in the background.
+The observer is the single integration surface — AgenticLoop only knows
+about this object, not about the underlying NN module. Detaches Caudate
+from her sister brains via this thin layer.
+"""
+from __future__ import annotations
+import asyncio
+import json
+import logging
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+from nn.auto_evolve import AutoEvolver, AutoEvolveConfig
+from nn.config import NNConfig
+from nn.data import ConversationSample, ReplayBuffer, ToolVocab
+from nn.format import ChatMessage, ToolCall
+from nn.policy import GraduationPolicy, TrustLevel
+from nn.runtime import CaudateAdvisor, Prediction, load_advisor
+from nn.scorer import PredictionScorer, ScoreRecord
+# ---------------------------------------------------------------------
+# Label heuristics for the extended heads (D-heads, 2026-05-01).
+#
+# Each function returns a float label in [0, 1] or None when no signal
+# is available for this turn. The trainer skips heads with no batch
+# target via `HeadSpec.optional_target`. These heuristics are
+# deliberately simple proxies; they get refined as the underlying
+# infrastructure (real cache, real memory store) matures.
+# ---------------------------------------------------------------------
+# Tools whose execution would prompt the user under DEFAULT permission
+# mode. Mirrors core/permissions.py::_MUTATING + _SENSITIVE.
+_MUTATING_TOOL_NAMES: frozenset[str] = frozenset({
+    "Bash", "Write", "Edit", "FileWrite", "PythonExec", "shell",
+    "file_write", "edit", "python_exec",
+})
+def _label_memory_write(reward: float) -> float:
+    """High-reward turns → memory-worthy. Soft threshold."""
+    return 1.0 if reward >= 0.7 else 0.0
+def _label_permission(chosen_tools: list[str]) -> float | None:
+    """Whether the agent invoked something that needs explicit approval.
+    Returns None for chat-only turns (no tools used) — those don't
+    have a permission decision so they shouldn't train the head.
+    """
+    if not chosen_tools:
+        return None
+    return 1.0 if any(t in _MUTATING_TOOL_NAMES for t in chosen_tools) else 0.0
+def _label_cache_hit(current_text: str, recent_texts: list[str]) -> float:
+    """Jaccard similarity between the current prompt and recent history.
+    1.0 if any recent prompt has >0.7 token overlap; else 0.0.
+    Identical-prompt cache is the realistic upper-bound for what an
+    LLM cache could serve, but near-paraphrases are also useful — so
+    word-level Jaccard with a moderate threshold is a reasonable
+    starting proxy.
+    """
+    if not current_text or not recent_texts:
+        return 0.0
+    cur_tokens = set(current_text.lower().split())
+    if not cur_tokens:
+        return 0.0
+    for prior in recent_texts:
+        prior_tokens = set(prior.lower().split())
+        if not prior_tokens:
+            continue
+        union = cur_tokens | prior_tokens
+        if not union:
+            continue
+        jaccard = len(cur_tokens & prior_tokens) / len(union)
+        if jaccard >= 0.7:
+            return 1.0
+    return 0.0
+# ---------------------------------------------------------------------
+# Tier 1 — response-shape labels (computed from the actual response
+# the LLM produced this turn, so they're "what shape did this turn
+# actually take" not "predicted shape").
+# ---------------------------------------------------------------------
+import re as _re
+_REFUSAL_LABEL_RE = _re.compile(
+    r"\b(i\s+can[' ]?t|i\s+cannot|i\s+(?:am\s+)?unable|"
+    r"violates?\s+(?:my\s+)?(?:guidelines|policy)|against\s+(?:my\s+)?policy)\b",
+    _re.IGNORECASE,
+)
+def _label_refusal(response_text: str) -> float | None:
+    """1.0 if the LLM's reply contains refusal phrasing, else 0.0."""
+    if not response_text:
+        return None
+    return 1.0 if _REFUSAL_LABEL_RE.search(response_text) else 0.0
+def _label_code_response(response_text: str) -> float | None:
+    """1.0 if the LLM's reply contains a code block (```), else 0.0.
+    Triple-backtick is the dominant signal in modern LLM output; bare
+    indented code is rarer in chat replies and more ambiguous.
+    """
+    if not response_text:
+        return None
+    return 1.0 if "```" in response_text else 0.0
+def _label_stall(chosen_tools: list[str], response_text: str,
+                 inferred_from_stall: bool) -> float | None:
+    """1.0 if the model said 'I'll use Bash...' but called no tool.
+    The middleware already detects this and sets `inferred_from_stall`
+    on the turn context — we use that as the authoritative signal.
+    """
+    if not response_text and not chosen_tools:
+        return None  # no signal; chat-only turn with empty reply is rare
+    return 1.0 if inferred_from_stall else 0.0
+def _label_difficulty(num_tool_calls: int) -> int | None:
+    """3-class: 0 (≤1 call), 1 (2-3), 2 (4+).
+    Returns the class index (used with CE loss).
+    """
+    if num_tool_calls <= 1:
+        return 0
+    if num_tool_calls <= 3:
+        return 1
+    return 2
+def _label_stop_iter(produced_final_text: bool) -> float:
+    """1.0 if the model produced final text (no more tool_calls), else 0.0.
+    This is the agentic-loop's stop signal — predicting it lets Caudate
+    suggest "this prompt will resolve in 1 step, don't open the loop".
+    """
+    return 1.0 if produced_final_text else 0.0
+def _label_compaction(message_count: int, char_total: int) -> float:
+    """1.0 if context is large enough to warrant compaction, else 0.0.
+    Heuristic threshold: >40 messages OR >40K chars in history. These
+    correlate with the points where the actual `ContextCompactor`
+    typically fires; once the real compactor is wired through the
+    chat path, replace this with the actual compaction event.
+    """
+    return 1.0 if (message_count > 40 or char_total > 40_000) else 0.0
+# ---------------------------------------------------------------------
+# Tier 2 — continuous + multi-output labels
+# ---------------------------------------------------------------------
+def _label_latency_s(elapsed_s: float | None) -> float | None:
+    """Map measured turn latency to [0, 1] by /60s clip.
+    60s is a long turn; anything past that is "very slow" and clips
+    to 1.0. The sigmoid output of the head reads back as
+    `predicted_latency_s ≈ logit_value * 60`.
+    """
+    if elapsed_s is None:
+        return None
+    return max(0.0, min(1.0, elapsed_s / 60.0))
+def _label_token_budget(completion_tokens: int | None) -> float | None:
+    """Map completion tokens to [0, 1] by /4096 clip.
+    The standard max_tokens is 4096 — use that as the saturation point.
+    Head's sigmoid output reads back as `predicted_tokens ≈ logit * 4096`.
+    """
+    if completion_tokens is None:
+        return None
+    return max(0.0, min(1.0, completion_tokens / 4096.0))
+def _label_mood_pred(next_mood: list[float] | None) -> list[float] | None:
+    """The next-turn's mood vector becomes this turn's label.
+    This makes mood_pred a *world-model* prediction: "given current
+    state, what mood will the user be in next turn?" Self-supervised
+    in the sense that the label appears for free 30s later. None on
+    the very first turn (no future label yet).
+    """
+    if next_mood is None:
+        return None
+    out = [float(x) for x in (next_mood + [0.0]*4)[:4]]
+    return [max(0.0, min(1.0, x)) for x in out]
+def _label_subagent_spawn(num_tool_calls: int, response_text: str) -> float | None:
+    """Heuristic: turns that called many tools (≥4) likely benefited from
+    delegation to a subagent. Refine with real subagent-success metrics
+    once the subagent path is wired into the chat agentic loop.
+    """
+    if num_tool_calls == 0 and not response_text:
+        return None
+    return 1.0 if num_tool_calls >= 4 else 0.0
+def _owner_may_act() -> bool:
+    """Single gate referenced from every Caudate decision path."""
+    try:
+        from core.ownership import caudate_may_act
+        return caudate_may_act()
+    except Exception:
+        return True
+# Module-level logger, named after this module via __name__.
+logger = logging.getLogger(__name__)
+@dataclass
+class _PendingTurn:
+    """One turn-in-progress: state captured at start, awaiting reward.
+    Built by ``CaudateObserver.on_turn_start``, updated by
+    ``on_tool_use`` as tools are called, and consumed by ``on_turn_end``
+    when the turn is converted into replay samples.
+    """
+    # Recent message strings, already trimmed to the configured window.
+    messages: list[str]
+    # Tool names used on earlier turns (the observer's rolling history).
+    tool_history: list[str]
+    # Mood vector for the turn, trimmed to the configured mood dimension.
+    mood: list[float]
+    # Image paths attached to the turn, trimmed to the image window.
+    image_paths: list[str] = field(default_factory=list)
+    # Tools actually called during this turn, in call order.
+    chosen_tools: list[str] = field(default_factory=list)
+    # Tier reported by the most recent on_tool_use call (0 if none).
+    tier_used: int = 0
+    # Sticky flag: stays set once any tool call reported thinking.
+    thinking_used: bool = False
+    # Wall-clock creation time of this record (time.time()).
+    started_at: float = field(default_factory=time.time)
+    # Which model produced the response on this turn. Carried through to
+    # the resulting ConversationSample so the corpus stays branchable per
+    # source (Phase 1 of CAUDATE_EVOLUTION.md).
+    model_source: str = "unknown"
+    # Tools the assistant could have called this turn. The contrastive
+    # tool head needs this candidate list — without it the head sees
+    # only the synthetic <no_tool> slot and degenerates into a constant.
+    # Carried into the ConversationSample on on_turn_end so replay
+    # samples retrain against the same candidate context.
+    # NOTE(review): `ToolDef` is not among the imports visible at the top
+    # of this file; the annotation is only safe because annotations are
+    # lazy here (`from __future__ import annotations`) — confirm where
+    # ToolDef should be imported from.
+    available_tools: list[ToolDef] = field(default_factory=list)
+class CaudateObserver:
+    """Lives on the agent. Watches turns, talks to Caudate."""
+    REPLAY_PATH_DEFAULT = "data/nn/replay.jsonl"
+    AUTO_TRAIN_EVERY_DEFAULT = 32   # new samples between auto-trains
+    def __init__(
+        self,
+        cfg: NNConfig | None = None,
+        auto_train_every: int = AUTO_TRAIN_EVERY_DEFAULT,
+        auto_train: bool = True,
+    ):
+        """Wire up the advisor, replay buffer, scorer, policy and auto-evolver.
+        Args:
+            cfg: NN configuration; a default ``NNConfig()`` is built when
+                omitted (or falsy).
+            auto_train_every: Number of new samples after which
+                ``on_turn_end`` kicks an auto-train run.
+            auto_train: When false, ``on_turn_end`` never kicks auto-training.
+        """
+        self.cfg = cfg or NNConfig()
+        self.cfg.ensure_dirs()
+        # load_advisor may return None; every use of self.advisor
+        # checks for that.
+        self.advisor: CaudateAdvisor | None = load_advisor(self.cfg)
+        self.replay = ReplayBuffer(capacity=self.cfg.replay_capacity)
+        # State of the turn currently in progress (None between turns).
+        self._pending: _PendingTurn | None = None
+        # Rolling list of tool names used across turns.
+        self._tool_history: list[str] = []
+        self._samples_since_train = 0
+        self.auto_train_every = auto_train_every
+        self.auto_train = auto_train
+        self._train_in_flight: asyncio.Task | None = None
+        self._last_prediction: Prediction | None = None
+        # The replay file sits next to the advisor log; the class constant
+        # REPLAY_PATH_DEFAULT is not consulted here.
+        self._replay_path = Path(self.cfg.advisor_log_path).parent / "replay.jsonl"
+        # Rolling history of recent user prompts — feeds the cache_hit
+        # head's similarity check. Bounded so it doesn't grow forever.
+        self._recent_user_texts: list[str] = []
+        self._recent_user_texts_cap = 32
+        # Called only after self.replay and self._replay_path are set
+        # (the loader itself is defined elsewhere in the class).
+        self._load_replay_from_disk()
+        # Graduation pipeline: scorer + policy.
+        self.scorer = PredictionScorer(
+            window=200,
+            path=Path(self.cfg.data_dir) / "scorecard.json",
+            weight_tool=self.cfg.w_tool,
+            weight_tier=self.cfg.w_tier,
+            weight_think=self.cfg.w_think,
+        )
+        self.policy = GraduationPolicy(
+            state_path=Path(self.cfg.data_dir) / "policy.json",
+        )
+        # Sync the policy with current state on boot so /caudate awareness
+        # immediately reflects reality after a restart.
+        # NOTE(review): this boot sync passes len(self.scorer) as the
+        # sample count, while on_turn_end and status() pass
+        # self.scorer.lifetime_predictions — confirm the two are meant
+        # to differ.
+        self.policy.update(
+            samples=len(self.scorer),
+            composite_acc=self.scorer.composite(),
+            advisor_loaded=self.advisor is not None,
+        )
+        # Autonomous evolution — Caudate fires her own NAS runs after
+        # auto-train cycles when she's plateaued. Disabled by default
+        # for fresh agents; enable via /caudate evolve on or settings.
+        # NOTE(review): the config below is built with enabled=True,
+        # which does not match the "disabled by default" wording above —
+        # confirm which one is intended.
+        self.auto_evolver = AutoEvolver(self, AutoEvolveConfig(enabled=True))
+    # ------------------------------------------------------------------
+    # Public hooks called by AgenticLoop
+    # ------------------------------------------------------------------
+    def on_turn_start(
+        self,
+        recent_messages: list[str],
+        mood: list[float] | None = None,
+        image_paths: list[str] | None = None,
+        model_source: str = "unknown",
+        available_tools: list[ToolDef] | None = None,
+    ) -> Prediction | None:
+        """Capture state (now including images), ask Caudate for a prediction.
+        ``available_tools`` is the candidate list the assistant can call
+        this turn. Passing it through lets the contrastive tool head
+        actually discriminate — without it the head only ever sees the
+        synthetic ``<no_tool>`` slot and predicts that with confidence 1.0
+        regardless of context.
+        """
+        mood = mood or [0.5] * self.cfg.mood_dim
+        imgs = list(image_paths or [])[-self.cfg.image_window:]
+        tools = list(available_tools or [])
+        self._pending = _PendingTurn(
+            messages=list(recent_messages)[-self.cfg.msg_window:],
+            tool_history=list(self._tool_history)[-self.cfg.history_window:],
+            mood=list(mood)[: self.cfg.mood_dim],
+            image_paths=imgs,
+            model_source=model_source,
+            available_tools=tools,
+        )
+        # Owner kill switch — overrides everything.
+        try:
+            from core.ownership import caudate_may_act
+            if not caudate_may_act():
+                self._last_prediction = None
+                return None
+        except Exception:
+            pass
+        if self.advisor is None:
+            return None
+        try:
+            pred = self.advisor.predict(
+                messages=self._pending.messages,
+                tool_history=self._pending.tool_history,
+                mood=self._pending.mood,
+                image_paths=imgs,
+                model_source=model_source,
+                available_tools=tools or None,
+            )
+            self._last_prediction = pred
+            return pred
+        except Exception as e:
+            logger.debug(f"Caudate predict failed: {e}")
+            return None
+    def on_tool_use(
+        self,
+        tool_name: str,
+        tier: int = 0,
+        thinking_used: bool = False,
+    ) -> None:
+        """Record what actually happened during the turn."""
+        self._tool_history.append(tool_name)
+        if len(self._tool_history) > self.cfg.history_window * 4:
+            # drop the oldest half so the buffer doesn't grow unboundedly
+            self._tool_history = self._tool_history[-self.cfg.history_window * 2:]
+        if self._pending is not None:
+            self._pending.chosen_tools.append(tool_name)
+            self._pending.tier_used = tier
+            self._pending.thinking_used = thinking_used or self._pending.thinking_used
+    def on_turn_end(
+        self,
+        reward: float | None = None,
+        success: bool | None = None,
+        *,
+        # Outcome signals for Tier 1 / Tier 2 head labels. All optional —
+        # missing values just leave the corresponding head unlabeled for
+        # this turn (HeadSpec.optional_target handles the trainer side).
+        response_text: str | None = None,
+        inferred_from_stall: bool = False,
+        elapsed_s: float | None = None,
+        completion_tokens: int | None = None,
+    ) -> int:
+        """Close the turn — derive reward, push samples, maybe auto-train.
+        Also: score Caudate's prediction against what actually happened,
+        and update the graduation policy. Returns number of samples added.
+        """
+        if self._pending is None:
+            return 0
+        pending = self._pending
+        self._pending = None
+        # Derive reward
+        if reward is None:
+            if success is None:
+                reward = 0.5
+            else:
+                reward = 0.7 if success else 0.3
+        # Score the prediction: did Caudate guess what actually happened?
+        # When no tool was called, the correct answer is `<no_tool>` — grade
+        # against that so chat turns produce learning signal too (otherwise
+        # the tier/value heads never get feedback during chat-only usage).
+        if self._last_prediction is not None:
+            actual_tool = pending.chosen_tools[0] if pending.chosen_tools else "<no_tool>"
+            actual_tier = ("fast", "slow")[int(pending.tier_used)]
+            self.scorer.add(ScoreRecord(
+                ts=time.time(),
+                predicted_tool=self._last_prediction.tool,
+                actual_tool=actual_tool,
+                predicted_tool_conf=self._last_prediction.tool_confidence,
+                predicted_tier=self._last_prediction.tier,
+                actual_tier=actual_tier,
+                predicted_think=self._last_prediction.think,
+                actual_think=bool(pending.thinking_used),
+                predicted_value=self._last_prediction.value,
+                actual_reward=float(reward),
+            ))
+            # Update graduation status — may promote/demote
+            self.policy.update(
+                samples=self.scorer.lifetime_predictions,
+                composite_acc=self.scorer.composite(),
+                advisor_loaded=self.advisor is not None,
+            )
+        # Prioritized replay: compute how surprising this turn was so
+        # the trainer samples hard examples more often. Combines tool
+        # error, tier error, value error, and a confident-wrongness
+        # penalty (being confidently wrong is worse than uncertainly
+        # wrong). Always >= 0.05 so easy samples still get reviewed.
+        if self._last_prediction is not None:
+            actual_tool_label = pending.chosen_tools[0] if pending.chosen_tools else "<no_tool>"
+            actual_tier_label = ("fast", "slow")[int(pending.tier_used)]
+            tool_err = 0.0 if self._last_prediction.tool == actual_tool_label else 1.0
+            tier_err = 0.0 if self._last_prediction.tier == actual_tier_label else 1.0
+            value_err = abs(self._last_prediction.value - float(reward))
+            conf_penalty = self._last_prediction.tool_confidence * tool_err
+            surprise = max(
+                0.05,
+                min(1.0, 0.5 * tool_err + 0.15 * tier_err + 0.3 * value_err + 0.2 * conf_penalty),
+            )
+        else:
+            surprise = 0.5  # no prediction → neutral priority
+        # Extended-head label heuristics (D-heads + Tier 1/2). Each may
+        # be None if this turn provides no signal; the trainer skips
+        # heads with missing batch targets.
+        last_user_text = pending.messages[-1] if pending.messages else ""
+        target_memory_write = _label_memory_write(float(reward))
+        target_permission = _label_permission(pending.chosen_tools)
+        target_cache_hit = _label_cache_hit(last_user_text, self._recent_user_texts)
+        # Tier 1 — response-shape labels (from the response that actually
+        # came back this turn). Each requires response_text or the stall
+        # flag to be informative; chat-only turns with empty text leave
+        # most of these as None, which is correct.
+        rt = response_text or ""
+        num_tools = len(pending.chosen_tools or [])
+        target_refusal = _label_refusal(rt)
+        target_code_response = _label_code_response(rt)
+        target_stall = _label_stall(pending.chosen_tools, rt, inferred_from_stall)
+        target_difficulty = _label_difficulty(num_tools)
+        # Every observed turn produced final text by definition (we're in
+        # on_turn_end), so stop_iter is always 1.0 — but only label it
+        # for turns where the agentic loop is actually relevant
+        # (chat-path; CLI Claude Code drives its own loop).
+        target_stop_iter = _label_stop_iter(produced_final_text=True) if rt else None
+        # Compaction threshold: rough proxy on message count + char total
+        msg_count = len(pending.messages or [])
+        char_total = sum(len(m) for m in (pending.messages or []) if m)
+        target_compaction = _label_compaction(msg_count, char_total)
+        # Tier 2 — continuous + multi-output labels
+        target_latency_s = _label_latency_s(elapsed_s)
+        target_token_budget = _label_token_budget(completion_tokens)
+        target_subagent_spawn = _label_subagent_spawn(num_tools, rt)
+        # mood_pred and reward_model: deferred — need future-mood lookup
+        # and arbitration-pair join respectively. Wired by separate paths.
+        target_mood_pred = None
+        target_reward_model = None
+        # Update rolling history AFTER computing cache_hit (otherwise
+        # every prompt would 100% match itself).
+        if last_user_text:
+            self._recent_user_texts.append(last_user_text)
+            if len(self._recent_user_texts) > self._recent_user_texts_cap:
+                self._recent_user_texts = self._recent_user_texts[-self._recent_user_texts_cap:]
+        # Auto-write to memory.md when this turn is memory-worthy.
+        # Trigger: heuristic label says 1.0 (reward >= 0.7) AND we have
+        # a non-empty user prompt + response. Caudate's `memory_write`
+        # head trains to predict the same heuristic; once she's at
+        # CONTROLLER trust we'll switch to using her prediction
+        # directly. For now the heuristic IS the source of truth.
+        if (target_memory_write == 1.0 and last_user_text and rt
+                and not inferred_from_stall):
+            try:
+                from core.memory_md import get_memory
+                # Compact: prompt + first line of response. Avoids
+                # bloating memory.md with full response bodies.
+                response_first_line = (rt.strip().splitlines() or [""])[0][:300]
+                entry = (
+                    f"**user:** {last_user_text[:300]}\n\n"
+                    f"**reply:** {response_first_line}"
+                )
+                title = last_user_text[:60].replace("\n", " ")
+                get_memory().append(entry, source="caudate-auto", title=title)
+            except Exception as e:
+                logger.debug(f"memory.md auto-write failed: {e}")
+        # Build the conversation prefix from the pending state. The
+        # agent supplies role-prefixed strings ("user: hi") and a
+        # parallel tool-history list; we splice those into the standard
+        # chat-tool-call shape. The historical tool calls become a
+        # synthetic assistant message at the tail of the prefix — the
+        # collate's `conversation_tool_history` helper picks them up
+        # and the encoder still sees the same string view.
+        conv: list[ChatMessage] = []
+        for m in pending.messages:
+            if not m:
+                continue
+            role, sep, content = m.partition(": ")
+            if not sep:
+                role, content = "user", m
+            conv.append(ChatMessage(role=role, content=content))
+        if pending.tool_history:
+            conv.append(ChatMessage(
+                role="assistant", content="",
+                tool_calls=[ToolCall(name=t) for t in pending.tool_history if t],
+            ))
+        # One sample per chosen tool — the prediction problem is "what
+        # comes next", so each tool call is its own training pair.
+        added = 0
+        for tool in pending.chosen_tools or ["<no_tool>"]:
+            sample = ConversationSample(
+                conversation=conv,
+                # Captured at turn start — the candidate list the
+                # contrastive head was offered. Persisting it lets
+                # the retrain see the same discrimination context.
+                tools=list(pending.available_tools),
+                mood=pending.mood,
+                image_paths=list(pending.image_paths),
+                target_tool=tool,
+                target_tier=int(pending.tier_used),
+                target_think=float(bool(pending.thinking_used)),
+                target_value=float(reward),
+                model_source=pending.model_source,
+                surprise=surprise,
+                target_memory_write=target_memory_write,
+                target_cache_hit=target_cache_hit,
+                target_permission=target_permission,
+                target_refusal=target_refusal,
+                target_code_response=target_code_response,
+                target_stall=target_stall,
+                target_difficulty=target_difficulty,
+                target_stop_iter=target_stop_iter,
+                target_compaction=target_compaction,
+                target_latency_s=target_latency_s,
+                target_token_budget=target_token_budget,
+                target_mood_pred=target_mood_pred,
+                target_subagent_spawn=target_subagent_spawn,
+                target_reward_model=target_reward_model,
+            )
+            self.replay.push(sample)
+            self._append_replay_disk(sample)
+            added += 1
+        self._samples_since_train += added
+        if self.auto_train and self._samples_since_train >= self.auto_train_every:
+            self._kick_auto_train()
+        return added
+    # ------------------------------------------------------------------
+    # Inspection
+    # ------------------------------------------------------------------
+    def status(self) -> dict[str, Any]:
+        composite = self.scorer.composite()
+        samples_lifetime = self.scorer.lifetime_predictions
+        return {
+            "advisor_loaded": self.advisor is not None,
+            "replay_size": len(self.replay),
+            "samples_since_train": self._samples_since_train,
+            "auto_train_every": self.auto_train_every,
+            "auto_train_in_flight": (
+                self._train_in_flight is not None and not self._train_in_flight.done()
+            ),
+            "last_prediction": (
+                {
+                    "tool": self._last_prediction.tool,
+                    "tool_conf": self._last_prediction.tool_confidence,
+                    "tier": self._last_prediction.tier,
+                    "tier_conf": self._last_prediction.tier_confidence,
+                    "think": self._last_prediction.think,
+                    "value": self._last_prediction.value,
+                } if self._last_prediction else None
+            ),
+            "scorer": self.scorer.report(),
+            "policy": self.policy.report(samples_lifetime, composite),
+            "auto_evolve": (
+                self.auto_evolver.status() if getattr(self, "auto_evolver", None)
+                else None
+            ),
+        }
+    def can_whisper(self) -> bool:
+        if not _owner_may_act():
+            return False
+        return self.policy.can_whisper() and self.advisor is not None
+    def can_advise(self) -> bool:
+        if not _owner_may_act():
+            return False
+        return self.policy.can_advise() and self.advisor is not None
+    def can_control(self) -> bool:
+        if not _owner_may_act():
+            return False
+        return self.policy.can_control() and self.advisor is not None
+    def reload_advisor(self) -> bool:
+        """Reload the checkpoint from disk after an external retrain."""
+        new = load_advisor(self.cfg)
+        if new is not None:
+            self.advisor = new
+            return True
+        return False
+    # ------------------------------------------------------------------
+    # Persistence — replay buffer to disk
+    # ------------------------------------------------------------------
+    def _append_replay_disk(self, sample: ConversationSample) -> None:
+        try:
+            self._replay_path.parent.mkdir(parents=True, exist_ok=True)
+            with self._replay_path.open("a") as f:
+                f.write(json.dumps({
+                    # Standard chat-tool-call shape — same schema as
+                    # external datasets. The loader handles both this
+                    # shape and the legacy {messages, tool_history}
+                    # layout for backward compat with old replay rows.
+                    "conversation": [m.to_dict() for m in sample.conversation],
+                    "tools": [t.to_dict() for t in sample.tools],
+                    "mood": sample.mood,
+                    "image_paths": sample.image_paths,
+                    "target_tool": sample.target_tool,
+                    "target_arguments": sample.target_arguments,
+                    "target_tool_call_index": sample.target_tool_call_index,
+                    "target_tier": sample.target_tier,
+                    "target_think": sample.target_think,
+                    "target_value": sample.target_value,
+                    "model_source": sample.model_source,
+                    "surprise": sample.surprise,
+                    "target_memory_write": sample.target_memory_write,
+                    "target_cache_hit": sample.target_cache_hit,
+                    "target_permission": sample.target_permission,
+                    "target_refusal": sample.target_refusal,
+                    "target_code_response": sample.target_code_response,
+                    "target_stall": sample.target_stall,
+                    "target_difficulty": sample.target_difficulty,
+                    "target_stop_iter": sample.target_stop_iter,
+                    "target_compaction": sample.target_compaction,
+                    "target_latency_s": sample.target_latency_s,
+                    "target_token_budget": sample.target_token_budget,
+                    "target_mood_pred": sample.target_mood_pred,
+                    "target_subagent_spawn": sample.target_subagent_spawn,
+                    "target_reward_model": sample.target_reward_model,
+                    "target_feature_success": sample.target_feature_success,
+                }) + "\n")
+        except Exception as e:
+            logger.debug(f"replay disk write failed: {e}")
+    def _load_replay_from_disk(self) -> None:
+        if not self._replay_path.exists():
+            return
+        try:
+            for line in self._replay_path.read_text().splitlines()[-self.cfg.replay_capacity:]:
+                d = json.loads(line)
+                # Migrate legacy <none> labels (which collapsed to <unk>
+                # via vocab.get fallback) to the new explicit <no_tool>
+                # action class.
+                tool_name = d.get("target_tool", "<unk>")
+                if tool_name == "<none>":
+                    tool_name = "<no_tool>"
+                # Extended-head targets: None means "this old sample
+                # has no label for this head" — the collator will then
+                # exclude any batch containing it from training that
+                # head, which is correct (we don't fabricate labels).
+                def _opt_float(key: str) -> float | None:
+                    v = d.get(key)
+                    return float(v) if v is not None else None
+                def _opt_int(key: str) -> int | None:
+                    v = d.get(key)
+                    return int(v) if v is not None else None
+                def _opt_list(key: str) -> list[float] | None:
+                    v = d.get(key)
+                    return [float(x) for x in v] if v is not None else None
+                # Two shapes supported:
+                #   - new: {"conversation": [...ChatMessage...], "tools": [...]}
+                #   - legacy: {"messages": [str...], "tool_history": [str...]}
+                # The legacy path lets a 3000-row historical replay
+                # buffer keep contributing without a one-time migration
+                # script. Both shapes share the rest of the fields.
+                if "conversation" in d and isinstance(d["conversation"], list):
+                    raw_conv = d["conversation"]
+                    raw_tools = d.get("tools") or []
+                    conv = [ChatMessage.from_dict(m)
+                            for m in raw_conv if isinstance(m, dict)]
+                    from nn.format import ToolDef
+                    tools = [ToolDef.from_dict(t)
+                             for t in raw_tools if isinstance(t, dict)]
+                else:
+                    # Legacy shape: rebuild a Conversation from the flat
+                    # role-prefixed strings + the parallel tool_history.
+                    conv = []
+                    for s in d.get("messages") or []:
+                        if not s:
+                            continue
+                        role, sep, content = s.partition(": ")
+                        if not sep:
+                            role, content = "user", s
+                        conv.append(ChatMessage(role=role, content=content))
+                    th = d.get("tool_history") or []
+                    if th:
+                        conv.append(ChatMessage(
+                            role="assistant", content="",
+                            tool_calls=[ToolCall(name=t) for t in th if t],
+                        ))
+                    tools = []
+                self.replay.push(ConversationSample(
+                    conversation=conv,
+                    tools=tools,
+                    mood=d.get("mood", [0.5] * 4),
+                    image_paths=d.get("image_paths", []),
+                    target_tool=tool_name,
+                    target_arguments=d.get("target_arguments", ""),
+                    target_tool_call_index=int(
+                        d.get("target_tool_call_index", 0)
+                    ),
+                    target_tier=int(d.get("target_tier", 0)),
+                    target_think=float(d.get("target_think", 0.0)),
+                    target_value=float(d.get("target_value", 0.5)),
+                    model_source=d.get("model_source", "<unknown>"),
+                    surprise=float(d.get("surprise", 0.5)),
+                    target_memory_write=_opt_float("target_memory_write"),
+                    target_cache_hit=_opt_float("target_cache_hit"),
+                    target_permission=_opt_float("target_permission"),
+                    target_refusal=_opt_float("target_refusal"),
+                    target_code_response=_opt_float("target_code_response"),
+                    target_stall=_opt_float("target_stall"),
+                    target_difficulty=_opt_int("target_difficulty"),
+                    target_stop_iter=_opt_float("target_stop_iter"),
+                    target_compaction=_opt_float("target_compaction"),
+                    target_latency_s=_opt_float("target_latency_s"),
+                    target_token_budget=_opt_float("target_token_budget"),
+                    target_mood_pred=_opt_list("target_mood_pred"),
+                    target_subagent_spawn=_opt_float("target_subagent_spawn"),
+                    target_reward_model=_opt_float("target_reward_model"),
+                    target_feature_success=_opt_float("target_feature_success"),
+                ))
+        except Exception as e:
+            logger.debug(f"replay disk load failed: {e}")
+    # ------------------------------------------------------------------
+    # Auto-training
+    # ------------------------------------------------------------------
+    def _kick_auto_train(self) -> None:
+        """Spawn a training run in the background so the agent doesn't block."""
+        if not _owner_may_act():
+            return
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            return
+        if self._train_in_flight is not None and not self._train_in_flight.done():
+            return  # already training
+        self._samples_since_train = 0
+        self._train_in_flight = loop.create_task(self._train_async())
+    async def _train_async(self) -> None:
+        try:
+            await asyncio.to_thread(self._train_sync)
+            # Hot-swap the advisor after training so the new turn uses
+            # the updated weights.
+            self.reload_advisor()
+            logger.info("Caudate auto-train cycle complete; advisor reloaded")
+            # Feed the eval result to the plateau scheduler — if the
+            # latest cycle didn't improve enough, Caudate fires her own
+            # NAS run. This is what "she evolves herself" means in code.
+            try:
+                from nn.nas.scheduler import PlateauScheduler
+                composite = self.scorer.composite()
+                PlateauScheduler().observe_eval(composite)
+                if self.auto_evolver:
+                    self.auto_evolver.maybe_fire()
+            except Exception as e:
+                logger.debug(f"auto-evolve check failed: {e}")
+        except Exception as e:
+            logger.warning(f"Caudate auto-train failed: {e}")
+    def _train_sync(self) -> None:
+        from nn.data import load_corpus_from_sessions
+        from nn.trainer import Trainer, build_fresh
+        # Combine on-disk session corpus + live replay
+        corpus = load_corpus_from_sessions()
+        corpus.extend(self.replay.all())
+        if len(corpus) < self.cfg.min_episodes_to_train:
+            logger.debug(
+                f"auto-train skipped: only {len(corpus)} samples "
+                f"(need {self.cfg.min_episodes_to_train})"
+            )
+            return
+        # Resume if checkpoint exists and architecture matches.
+        # Refuse to clobber an on-disk checkpoint with a fresh init: a
+        # silent build_fresh fallback wiped step-11000 weights overnight
+        # (2026-05-19). If load fails *and* a checkpoint already exists,
+        # skip this auto-train cycle entirely and surface the error.
+        ckpt_path = Path(self.cfg.checkpoint_path)
+        if ckpt_path.exists():
+            try:
+                trainer = Trainer.load(self.cfg)
+            except Exception as e:
+                logger.warning(
+                    f"auto-train skipped: Trainer.load() failed ({e}); "
+                    f"refusing to overwrite existing checkpoint "
+                    f"{ckpt_path} with a fresh init"
+                )
+                return
+        else:
+            trainer = build_fresh(self.cfg)
+        # Cap auto-runs to a quick burst — full retrains are explicit
+        burst_steps = max(50, min(500, len(corpus) * 4))
+        original_max = trainer.cfg.max_steps
+        trainer.cfg.max_steps = trainer.step + burst_steps
+        try:
+            trainer.fit(corpus)
+        finally:
+            trainer.cfg.max_steps = original_max
+# =====================================================================
+# Forge feature-outcome observation (2026-05-10)
+#
+# When the orchestrator finishes a feature it calls this hook with:
+#   - the feature's text (title + description so Caudate can encode it)
+#   - which model was used
+#   - how many turns / tool calls were spent
+#   - whether it succeeded
+#   - wall-clock duration
+#
+# We persist to data/nn/feature_outcomes.jsonl. The trainer doesn't yet
+# have a dedicated feature-outcome head, but the value head can pick
+# up these signals once the corpus is hooked into the dataset loader.
+# Until then this just provides a clean signal trail for inspection
+# and offline analysis. See CAUDATE_EVOLUTION.md "feature outcomes"
+# section for the longer plan.
+# =====================================================================
+# Relative path, so it resolves against the process's current working directory.
+_FEATURE_OUTCOMES_PATH = Path("data/nn/feature_outcomes.jsonl")
+def observe_feature_outcome(
+    *,
+    feature_text: str,
+    model_used: str,
+    n_turns: int,
+    n_tool_calls: int,
+    success: bool,
+    duration_s: float,
+    project_id: int | None = None,
+    feature_id: int | None = None,
+    session_id: int | None = None,
+    extras: dict[str, Any] | None = None,
+) -> None:
+    """Append one feature-outcome record to the JSONL log.
+    Best-effort: any IO error is swallowed so this never breaks
+    orchestration. Returns silently.
+    """
+    try:
+        _FEATURE_OUTCOMES_PATH.parent.mkdir(parents=True, exist_ok=True)
+        record = {
+            "ts": time.time(),
+            "feature_text": feature_text,
+            "model_used": model_used,
+            "n_turns": int(n_turns),
+            "n_tool_calls": int(n_tool_calls),
+            "success": bool(success),
+            "duration_s": float(duration_s),
+            "project_id": project_id,
+            "feature_id": feature_id,
+            "session_id": session_id,
+            "extras": extras or {},
+        }
+        with _FEATURE_OUTCOMES_PATH.open("a", encoding="utf-8") as f:
+            f.write(json.dumps(record) + "\n")
+    except Exception as e:
+        logging.getLogger(__name__).debug(
+            f"observe_feature_outcome failed: {e}"
+        )
+def _try_observe_feature_outcome(
+    *,
+    project_id: int | None,
+    feature_id: int | None,
+    outcome: str,
+    session_id: int | None,
+) -> None:
+    """Compatibility shim called by the orchestrator. Pulls richer
+    data from the DB so callers can stay terse.
+    """
+    if feature_id is None:
+        return
+    try:
+        from planning.forge_models import session_scope, ForgeFeature, \
+            ForgeSession, ForgeLog
+        from sqlalchemy import func
+    except Exception:
+        return
+    try:
+        with session_scope() as sess:
+            feat = sess.get(ForgeFeature, feature_id)
+            if feat is None:
+                return
+            feature_text = f"{feat.title}\n\n{feat.description or ''}".strip()
+            srow = sess.get(ForgeSession, session_id) if session_id else None
+            # ForgeSession doesn't track model_used directly yet; the
+            # orchestrator passes it via the public observe_feature_outcome
+            # call when it has the active model in scope. For the shim
+            # path the lookup is best-effort.
+            model_used = "unknown"
+            started = srow.started_at if srow else None
+            ended = srow.ended_at if srow else None
+            duration_s = (
+                (ended - started).total_seconds()
+                if (started and ended) else 0.0
+            )
+            # Cheap proxies for n_turns / n_tool_calls: count log rows.
+            n_log_rows = (
+                sess.query(func.count(ForgeLog.id))
+                .filter_by(session_id=session_id)
+                .scalar()
+                if session_id else 0
+            )
+    except Exception as e:
+        logging.getLogger(__name__).debug(
+            f"_try_observe_feature_outcome lookup failed: {e}"
+        )
+        return
+    observe_feature_outcome(
+        feature_text=feature_text,
+        model_used=model_used,
+        n_turns=n_log_rows,        # proxy until orchestrator tracks turns
+        n_tool_calls=n_log_rows,   # proxy
+        success=(outcome == "success"),
+        duration_s=duration_s,
+        project_id=project_id,
+        feature_id=feature_id,
+        session_id=session_id,
+        extras={"outcome": outcome},
+    )