caudate-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. api/__init__.py +5 -0
  2. api/anthropic_compat.py +1518 -0
  3. api/artifact_viewer.py +366 -0
  4. api/caudate_middleware.py +618 -0
  5. api/forge_bootstrapper_routes.py +377 -0
  6. api/forge_routes.py +630 -0
  7. api/forge_system_routes.py +294 -0
  8. api/openai_compat.py +1993 -0
  9. api/server.py +667 -0
  10. api/storyboard_page.py +677 -0
  11. caudate_cli-0.1.0.dist-info/METADATA +354 -0
  12. caudate_cli-0.1.0.dist-info/RECORD +153 -0
  13. caudate_cli-0.1.0.dist-info/WHEEL +5 -0
  14. caudate_cli-0.1.0.dist-info/entry_points.txt +2 -0
  15. caudate_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  16. caudate_cli-0.1.0.dist-info/top_level.txt +14 -0
  17. cognos_mcp/__init__.py +4 -0
  18. cognos_mcp/bridge.py +41 -0
  19. cognos_mcp/client.py +70 -0
  20. cognos_mcp/config.py +49 -0
  21. cognos_mcp/server.py +66 -0
  22. config.py +82 -0
  23. core/__init__.py +0 -0
  24. core/agent.py +468 -0
  25. core/agentic_loop.py +731 -0
  26. core/anthropic_auth.py +91 -0
  27. core/background.py +113 -0
  28. core/banner.py +134 -0
  29. core/bootstrap.py +292 -0
  30. core/citations.py +131 -0
  31. core/compaction.py +109 -0
  32. core/constitution.py +198 -0
  33. core/diff_viewer.py +87 -0
  34. core/export.py +85 -0
  35. core/file_refs.py +119 -0
  36. core/files.py +199 -0
  37. core/hooks.py +209 -0
  38. core/image.py +599 -0
  39. core/input.py +91 -0
  40. core/loop.py +238 -0
  41. core/memory_md.py +147 -0
  42. core/notifications.py +99 -0
  43. core/ownership.py +181 -0
  44. core/paste.py +81 -0
  45. core/permissions.py +210 -0
  46. core/plan_mode.py +215 -0
  47. core/sandbox_prompt.py +185 -0
  48. core/scheduler.py +195 -0
  49. core/schemas.py +202 -0
  50. core/session.py +90 -0
  51. core/settings.py +132 -0
  52. core/skills.py +398 -0
  53. core/slash_commands.py +977 -0
  54. core/statusline.py +61 -0
  55. core/subagent.py +300 -0
  56. core/thinking.py +50 -0
  57. core/updater.py +122 -0
  58. core/usage.py +109 -0
  59. core/worktree.py +93 -0
  60. execution/__init__.py +0 -0
  61. execution/executor.py +329 -0
  62. execution/plugins.py +108 -0
  63. execution/tools/__init__.py +0 -0
  64. execution/tools/agent_tool.py +107 -0
  65. execution/tools/agentic_tool.py +297 -0
  66. execution/tools/artifact_tool.py +191 -0
  67. execution/tools/ask_user_question_tool.py +137 -0
  68. execution/tools/base.py +81 -0
  69. execution/tools/calculator_tool.py +137 -0
  70. execution/tools/cognos_card_tool.py +124 -0
  71. execution/tools/cron_tool.py +215 -0
  72. execution/tools/datetime_tool.py +215 -0
  73. execution/tools/describe_image_tool.py +161 -0
  74. execution/tools/draw_tool.py +164 -0
  75. execution/tools/edit_image_tool.py +262 -0
  76. execution/tools/edit_tool.py +245 -0
  77. execution/tools/file_tool.py +90 -0
  78. execution/tools/find_anywhere_tool.py +255 -0
  79. execution/tools/forge_feature_tools.py +377 -0
  80. execution/tools/glob_tool.py +59 -0
  81. execution/tools/grep_tool.py +89 -0
  82. execution/tools/http_request_tool.py +224 -0
  83. execution/tools/load_skill_tool.py +104 -0
  84. execution/tools/longcat_avatar_tool.py +384 -0
  85. execution/tools/mcp_tool.py +100 -0
  86. execution/tools/notebook_tool.py +279 -0
  87. execution/tools/openapi_tool.py +440 -0
  88. execution/tools/plan_mode_tool.py +95 -0
  89. execution/tools/push_notification_tool.py +157 -0
  90. execution/tools/python_tool.py +61 -0
  91. execution/tools/respond_tool.py +40 -0
  92. execution/tools/sandbox_tool.py +378 -0
  93. execution/tools/search_tool.py +153 -0
  94. execution/tools/semantic_search_tool.py +106 -0
  95. execution/tools/shell_tool.py +283 -0
  96. execution/tools/speak_tool.py +134 -0
  97. execution/tools/storyboard_tool.py +727 -0
  98. execution/tools/system_info_tool.py +212 -0
  99. execution/tools/task_tool.py +323 -0
  100. execution/tools/think_tool.py +49 -0
  101. execution/tools/transcribe_audio_tool.py +86 -0
  102. execution/tools/update_memory_tool.py +92 -0
  103. execution/tools/web_fetch_tool.py +82 -0
  104. execution/tools/worktree_tool.py +174 -0
  105. llm/__init__.py +0 -0
  106. llm/fallback.py +116 -0
  107. llm/models.py +320 -0
  108. llm/provider.py +1356 -0
  109. llm/router.py +373 -0
  110. main.py +1889 -0
  111. memory/__init__.py +0 -0
  112. memory/episodic.py +99 -0
  113. memory/procedural.py +145 -0
  114. memory/semantic.py +71 -0
  115. memory/working.py +64 -0
  116. nn/__init__.py +43 -0
  117. nn/auto_evolve.py +245 -0
  118. nn/caudate.py +136 -0
  119. nn/config.py +141 -0
  120. nn/consolidator.py +81 -0
  121. nn/data.py +1635 -0
  122. nn/encoder.py +258 -0
  123. nn/forge_advisor.py +303 -0
  124. nn/format.py +235 -0
  125. nn/heads.py +432 -0
  126. nn/observer.py +994 -0
  127. nn/policy.py +214 -0
  128. nn/runtime.py +343 -0
  129. nn/scorer.py +175 -0
  130. nn/trainer.py +515 -0
  131. nn/vision.py +352 -0
  132. personality/__init__.py +23 -0
  133. personality/engine.py +129 -0
  134. personality/identity.py +144 -0
  135. personality/inner_voice.py +100 -0
  136. personality/mood.py +205 -0
  137. planning/__init__.py +0 -0
  138. planning/dev_server.py +221 -0
  139. planning/forge_models.py +718 -0
  140. planning/orchestrator.py +1363 -0
  141. planning/planner.py +451 -0
  142. planning/task_graph.py +61 -0
  143. reflection/__init__.py +0 -0
  144. reflection/meta_learner.py +156 -0
  145. reflection/reflector.py +127 -0
  146. ui/__init__.py +5 -0
  147. ui/display.py +88 -0
  148. voice/__init__.py +0 -0
  149. voice/conversation.py +125 -0
  150. voice/listener.py +111 -0
  151. voice/speaker.py +59 -0
  152. voice/stt.py +126 -0
  153. voice/tts.py +214 -0
nn/observer.py ADDED
@@ -0,0 +1,994 @@
1
+ """Live observer — the bridge between the running agent and Caudate.
2
+
3
+ Sits between AgenticLoop and the trainer. On every turn:
4
+
5
+ 1. **Predict** — calls Caudate (if loaded) to get a prediction. Logged.
6
+ 2. **Capture** — records the state at the start of the turn.
7
+ 3. **Reward** — when the turn finishes, derives a reward from the
8
+ reflector's score (or a heuristic if no reflector).
9
+ 4. **Persist** — writes (state, action, reward) to data/nn/replay.jsonl.
10
+ 5. **Auto-train** — when ≥N new samples accumulated, kicks an async
11
+ training run in the background.
12
+
13
+ The observer is the single integration surface — AgenticLoop only knows
14
+ about this object, not about the underlying NN module. Detaches Caudate
15
+ from her sister brains via this thin layer.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import asyncio
21
+ import json
22
+ import logging
23
+ import time
24
+ from dataclasses import dataclass, field
25
+ from pathlib import Path
26
+ from typing import Any
27
+
28
+ from nn.auto_evolve import AutoEvolver, AutoEvolveConfig
29
+ from nn.config import NNConfig
30
+ from nn.data import ConversationSample, ReplayBuffer, ToolVocab
31
+ from nn.format import ChatMessage, ToolCall
32
+ from nn.policy import GraduationPolicy, TrustLevel
33
+ from nn.runtime import CaudateAdvisor, Prediction, load_advisor
34
+ from nn.scorer import PredictionScorer, ScoreRecord
35
+
36
+
37
+ # ---------------------------------------------------------------------
38
+ # Label heuristics for the extended heads (D-heads, 2026-05-01).
39
+ #
40
+ # Each function returns a float label in [0, 1] or None when no signal
41
+ # is available for this turn. The trainer skips heads with no batch
42
+ # target via `HeadSpec.optional_target`. These heuristics are
43
+ # deliberately simple proxies; they get refined as the underlying
44
+ # infrastructure (real cache, real memory store) matures.
45
+ # ---------------------------------------------------------------------
46
+
47
+ # Tools whose execution would prompt the user under DEFAULT permission
48
+ # mode. Mirrors core/permissions.py::_MUTATING + _SENSITIVE.
49
# Tool names that count as "needs explicit approval" for the permission
# label: a turn that used any of these is labelled 1.0 by
# `_label_permission`. Both CamelCase and snake_case spellings are listed.
_MUTATING_TOOL_NAMES: frozenset[str] = frozenset((
    "Bash",
    "Write",
    "Edit",
    "FileWrite",
    "PythonExec",
    "shell",
    "file_write",
    "edit",
    "python_exec",
))
53
+
54
+
55
def _label_memory_write(reward: float) -> float:
    """Binary memory-worthiness label derived from the turn reward.

    Returns 1.0 when ``reward`` is at least 0.7, otherwise 0.0.
    """
    return float(reward >= 0.7)
58
+
59
+
60
+ def _label_permission(chosen_tools: list[str]) -> float | None:
61
+ """Whether the agent invoked something that needs explicit approval.
62
+
63
+ Returns None for chat-only turns (no tools used) — those don't
64
+ have a permission decision so they shouldn't train the head.
65
+ """
66
+ if not chosen_tools:
67
+ return None
68
+ return 1.0 if any(t in _MUTATING_TOOL_NAMES for t in chosen_tools) else 0.0
69
+
70
+
71
def _label_cache_hit(
    current_text: str,
    recent_texts: list[str],
    *,
    threshold: float = 0.7,
) -> float:
    """Label whether the current prompt closely matches a recent one.

    Similarity is word-level Jaccard overlap on lower-cased,
    whitespace-split tokens.

    Args:
        current_text: The prompt for this turn.
        recent_texts: Earlier prompts to compare against. Empty or None
            entries are skipped.
        threshold: Minimum Jaccard similarity (inclusive) that counts as a
            hit. Defaults to 0.7.

    Returns:
        1.0 if any prior prompt reaches ``threshold``; otherwise 0.0. Also
        0.0 when either input is empty or the current prompt has no tokens.
    """
    if not current_text or not recent_texts:
        return 0.0
    cur_tokens = set(current_text.lower().split())
    if not cur_tokens:
        return 0.0
    for prior in recent_texts:
        if not prior:
            continue
        prior_tokens = set(prior.lower().split())
        if not prior_tokens:
            continue
        # The union cannot be empty here: cur_tokens is known non-empty.
        shared = len(cur_tokens & prior_tokens)
        if shared / len(cur_tokens | prior_tokens) >= threshold:
            return 1.0
    return 0.0
96
+
97
+
98
+ # ---------------------------------------------------------------------
99
+ # Tier 1 — response-shape labels (computed from the actual response
100
+ # the LLM produced this turn, so they're "what shape did this turn
101
+ # actually take" not "predicted shape").
102
+ # ---------------------------------------------------------------------
103
+
104
+ import re as _re
105
+ _REFUSAL_LABEL_RE = _re.compile(
106
+ r"\b(i\s+can[' ]?t|i\s+cannot|i\s+(?:am\s+)?unable|"
107
+ r"violates?\s+(?:my\s+)?(?:guidelines|policy)|against\s+(?:my\s+)?policy)\b",
108
+ _re.IGNORECASE,
109
+ )
110
+
111
+
112
+ def _label_refusal(response_text: str) -> float | None:
113
+ """1.0 if the LLM's reply contains refusal phrasing, else 0.0."""
114
+ if not response_text:
115
+ return None
116
+ return 1.0 if _REFUSAL_LABEL_RE.search(response_text) else 0.0
117
+
118
+
119
+ def _label_code_response(response_text: str) -> float | None:
120
+ """1.0 if the LLM's reply contains a code block (```), else 0.0.
121
+
122
+ Triple-backtick is the dominant signal in modern LLM output; bare
123
+ indented code is rarer in chat replies and more ambiguous.
124
+ """
125
+ if not response_text:
126
+ return None
127
+ return 1.0 if "```" in response_text else 0.0
128
+
129
+
130
+ def _label_stall(chosen_tools: list[str], response_text: str,
131
+ inferred_from_stall: bool) -> float | None:
132
+ """1.0 if the model said 'I'll use Bash...' but called no tool.
133
+
134
+ The middleware already detects this and sets `inferred_from_stall`
135
+ on the turn context — we use that as the authoritative signal.
136
+ """
137
+ if not response_text and not chosen_tools:
138
+ return None # no signal; chat-only turn with empty reply is rare
139
+ return 1.0 if inferred_from_stall else 0.0
140
+
141
+
142
+ def _label_difficulty(num_tool_calls: int) -> int | None:
143
+ """3-class: 0 (≤1 call), 1 (2-3), 2 (4+).
144
+
145
+ Returns the class index (used with CE loss).
146
+ """
147
+ if num_tool_calls <= 1:
148
+ return 0
149
+ if num_tool_calls <= 3:
150
+ return 1
151
+ return 2
152
+
153
+
154
def _label_stop_iter(produced_final_text: bool) -> float:
    """Stop-signal label for the turn.

    Returns:
        1.0 when ``produced_final_text`` is truthy, else 0.0.
    """
    if produced_final_text:
        return 1.0
    return 0.0
161
+
162
+
163
def _label_compaction(message_count: int, char_total: int) -> float:
    """Label whether the context looks large enough to need compaction.

    This is a size heuristic only; it does not observe a real compaction
    event.

    Returns:
        1.0 when there are more than 40 messages or more than 40,000
        characters in total; otherwise 0.0.
    """
    too_many_messages = message_count > 40
    too_many_chars = char_total > 40_000
    if too_many_messages or too_many_chars:
        return 1.0
    return 0.0
172
+
173
+
174
+ # ---------------------------------------------------------------------
175
+ # Tier 2 — continuous + multi-output labels
176
+ # ---------------------------------------------------------------------
177
+
178
+
179
+ def _label_latency_s(elapsed_s: float | None) -> float | None:
180
+ """Map measured turn latency to [0, 1] by /60s clip.
181
+
182
+ 60s is a long turn; anything past that is "very slow" and clips
183
+ to 1.0. The sigmoid output of the head reads back as
184
+ `predicted_latency_s ≈ logit_value * 60`.
185
+ """
186
+ if elapsed_s is None:
187
+ return None
188
+ return max(0.0, min(1.0, elapsed_s / 60.0))
189
+
190
+
191
+ def _label_token_budget(completion_tokens: int | None) -> float | None:
192
+ """Map completion tokens to [0, 1] by /4096 clip.
193
+
194
+ The standard max_tokens is 4096 — use that as the saturation point.
195
+ Head's sigmoid output reads back as `predicted_tokens ≈ logit * 4096`.
196
+ """
197
+ if completion_tokens is None:
198
+ return None
199
+ return max(0.0, min(1.0, completion_tokens / 4096.0))
200
+
201
+
202
+ def _label_mood_pred(next_mood: list[float] | None) -> list[float] | None:
203
+ """The next-turn's mood vector becomes this turn's label.
204
+
205
+ This makes mood_pred a *world-model* prediction: "given current
206
+ state, what mood will the user be in next turn?" Self-supervised
207
+ in the sense that the label appears for free 30s later. None on
208
+ the very first turn (no future label yet).
209
+ """
210
+ if next_mood is None:
211
+ return None
212
+ out = [float(x) for x in (next_mood + [0.0]*4)[:4]]
213
+ return [max(0.0, min(1.0, x)) for x in out]
214
+
215
+
216
+ def _label_subagent_spawn(num_tool_calls: int, response_text: str) -> float | None:
217
+ """Heuristic: turns that called many tools (≥4) likely benefited from
218
+ delegation to a subagent. Refine with real subagent-success metrics
219
+ once the subagent path is wired into the chat agentic loop.
220
+ """
221
+ if num_tool_calls == 0 and not response_text:
222
+ return None
223
+ return 1.0 if num_tool_calls >= 4 else 0.0
224
+
225
+
226
def _owner_may_act() -> bool:
    """Ask the ownership module whether Caudate is allowed to act.

    Returns:
        The result of ``core.ownership.caudate_may_act()``. If that module
        cannot be imported or the call raises, the gate fails open and
        True is returned.
    """
    try:
        from core.ownership import caudate_may_act
        return caudate_may_act()
    except Exception:
        # Fail-open is kept for backward compatibility, but it is no longer
        # silent: a broken gate leaves a trace in the debug log.
        logging.getLogger(__name__).debug(
            "ownership gate unavailable; allowing Caudate to act",
            exc_info=True,
        )
        return True
233
+
234
+ logger = logging.getLogger(__name__)
235
+
236
+
237
@dataclass
class _PendingTurn:
    """State captured when a turn starts, held until the turn is closed."""

    # --- snapshot taken at turn start -----------------------------------
    messages: list[str]
    tool_history: list[str]
    mood: list[float]
    image_paths: list[str] = field(default_factory=list)

    # --- filled in while the turn runs (see CaudateObserver.on_tool_use) -
    chosen_tools: list[str] = field(default_factory=list)
    tier_used: int = 0
    thinking_used: bool = False

    # Wall-clock creation time of this record.
    started_at: float = field(default_factory=time.time)

    # Identifier of the model that answered this turn; copied onto every
    # ConversationSample built from the turn.
    model_source: str = "unknown"

    # Candidate tools the assistant could call this turn; copied onto the
    # ConversationSample so replayed samples carry the same candidates.
    # NOTE(review): `ToolDef` is not among the imports visible at the top
    # of this module; the annotation only works because annotations are
    # lazy (`from __future__ import annotations`) — confirm where it
    # should be imported from.
    available_tools: list[ToolDef] = field(default_factory=list)
259
+
260
+
261
+ class CaudateObserver:
262
+ """Lives on the agent. Watches turns, talks to Caudate."""
263
+
264
+ REPLAY_PATH_DEFAULT = "data/nn/replay.jsonl"
265
+ AUTO_TRAIN_EVERY_DEFAULT = 32 # new samples between auto-trains
266
+
267
def __init__(
    self,
    cfg: NNConfig | None = None,
    auto_train_every: int = AUTO_TRAIN_EVERY_DEFAULT,
    auto_train: bool = True,
):
    """Wire up the advisor, replay buffer, scorer, policy and auto-evolver.

    Args:
        cfg: Configuration object; a default ``NNConfig()`` is used when
            omitted.
        auto_train_every: Number of new samples that triggers an
            auto-train kick in ``on_turn_end``.
        auto_train: Whether ``on_turn_end`` may kick auto-training at all.
    """
    self.cfg = cfg or NNConfig()
    self.cfg.ensure_dirs()

    # Advisor and in-memory replay buffer.
    self.advisor: CaudateAdvisor | None = load_advisor(self.cfg)
    self.replay = ReplayBuffer(capacity=self.cfg.replay_capacity)

    # Per-turn bookkeeping.
    self._pending: _PendingTurn | None = None
    self._tool_history: list[str] = []
    self._last_prediction: Prediction | None = None

    # Auto-train bookkeeping.
    self._samples_since_train = 0
    self.auto_train_every = auto_train_every
    self.auto_train = auto_train
    self._train_in_flight: asyncio.Task | None = None

    # The on-disk replay log sits next to the advisor log.
    self._replay_path = Path(self.cfg.advisor_log_path).parent / "replay.jsonl"

    # Bounded rolling window of recent user prompts, consulted by the
    # cache-hit label in on_turn_end.
    self._recent_user_texts: list[str] = []
    self._recent_user_texts_cap = 32

    self._load_replay_from_disk()

    # Graduation pipeline: rolling scorer plus trust policy, both persisted
    # under the configured data directory.
    data_dir = Path(self.cfg.data_dir)
    self.scorer = PredictionScorer(
        window=200,
        path=data_dir / "scorecard.json",
        weight_tool=self.cfg.w_tool,
        weight_tier=self.cfg.w_tier,
        weight_think=self.cfg.w_think,
    )
    self.policy = GraduationPolicy(state_path=data_dir / "policy.json")

    # Bring the policy in line with the current scorer state at boot.
    # NOTE(review): this passes len(self.scorer) as `samples`, whereas
    # on_turn_end and status() use self.scorer.lifetime_predictions —
    # confirm which count the policy is meant to see here.
    self.policy.update(
        samples=len(self.scorer),
        composite_acc=self.scorer.composite(),
        advisor_loaded=self.advisor is not None,
    )

    # Autonomous evolution hook.
    # NOTE(review): an earlier comment described this as "disabled by
    # default", but the config is constructed with enabled=True — confirm
    # the intended default.
    self.auto_evolver = AutoEvolver(self, AutoEvolveConfig(enabled=True))
314
+
315
+ # ------------------------------------------------------------------
316
+ # Public hooks called by AgenticLoop
317
+ # ------------------------------------------------------------------
318
+
319
def on_turn_start(
    self,
    recent_messages: list[str],
    mood: list[float] | None = None,
    image_paths: list[str] | None = None,
    model_source: str = "unknown",
    available_tools: list[ToolDef] | None = None,
) -> Prediction | None:
    """Capture the turn's starting state and ask the advisor to predict.

    Args:
        recent_messages: Recent conversation strings; only the last
            ``cfg.msg_window`` are kept.
        mood: Mood vector; defaults to 0.5 per dimension and is truncated
            to ``cfg.mood_dim``.
        image_paths: Image paths; only the last ``cfg.image_window`` are
            kept.
        model_source: Identifier of the model serving this turn.
        available_tools: Candidate tools for this turn, stored on the
            pending turn and forwarded to the advisor.

    Returns:
        The advisor's prediction, or None when the owner gate denies
        action, no advisor is loaded, or prediction fails.
    """
    def _tail(items, n):
        # `seq[-0:]` is the whole sequence, so a window of 0 needs an
        # explicit empty result.
        return list(items)[-n:] if n > 0 else []

    mood = mood or [0.5] * self.cfg.mood_dim
    imgs = _tail(image_paths or [], self.cfg.image_window)
    tools = list(available_tools or [])
    self._pending = _PendingTurn(
        messages=_tail(recent_messages, self.cfg.msg_window),
        tool_history=_tail(self._tool_history, self.cfg.history_window),
        mood=list(mood)[: self.cfg.mood_dim],
        image_paths=imgs,
        model_source=model_source,
        available_tools=tools,
    )

    # Forget the previous turn's prediction up front. Otherwise a failed
    # predict() below would leave it in place and on_turn_end would score
    # a stale prediction against this turn's outcome.
    self._last_prediction = None

    # Owner kill switch — overrides everything.
    if not _owner_may_act():
        return None

    if self.advisor is None:
        return None

    try:
        pred = self.advisor.predict(
            messages=self._pending.messages,
            tool_history=self._pending.tool_history,
            mood=self._pending.mood,
            image_paths=imgs,
            model_source=model_source,
            available_tools=tools or None,
        )
    except Exception as e:
        # Best effort: a failing advisor must not break the turn.
        logger.debug("Caudate predict failed: %s", e)
        return None

    self._last_prediction = pred
    return pred
372
+
373
def on_tool_use(
    self,
    tool_name: str,
    tier: int = 0,
    thinking_used: bool = False,
) -> None:
    """Record one tool call made during the current turn.

    The call is appended to the rolling tool history (trimmed to twice
    ``cfg.history_window`` once it exceeds four times that window) and,
    if a turn is pending, to that turn's chosen tools. ``tier`` overwrites
    the pending turn's tier; ``thinking_used`` is sticky once True.
    """
    window = self.cfg.history_window
    self._tool_history.append(tool_name)
    if len(self._tool_history) > window * 4:
        # Keep only the newest entries so the history stays bounded.
        self._tool_history = self._tool_history[-window * 2:]

    turn = self._pending
    if turn is None:
        return
    turn.chosen_tools.append(tool_name)
    turn.tier_used = tier
    turn.thinking_used = thinking_used or turn.thinking_used
388
+
389
def on_turn_end(
    self,
    reward: float | None = None,
    success: bool | None = None,
    *,
    response_text: str | None = None,
    inferred_from_stall: bool = False,
    elapsed_s: float | None = None,
    completion_tokens: int | None = None,
) -> int:
    """Close the pending turn: score it, label it, persist it, maybe train.

    Args:
        reward: Explicit reward for the turn. When None it is derived from
            ``success`` (0.7 / 0.3), or 0.5 when that is None as well.
        success: Coarse outcome, consulted only when ``reward`` is None.
        response_text: Reply text for this turn; feeds the response-shape
            labels and the memory auto-write.
        inferred_from_stall: Caller-supplied stall flag; becomes the stall
            label and suppresses the memory auto-write.
        elapsed_s: Measured turn latency in seconds, if known.
        completion_tokens: Completion token count, if known.

    Returns:
        Number of samples pushed to the replay buffer; 0 when no turn is
        pending.
    """
    if self._pending is None:
        return 0
    pending = self._pending
    self._pending = None

    # Derive reward
    if reward is None:
        if success is None:
            reward = 0.5
        else:
            reward = 0.7 if success else 0.3
    reward = float(reward)

    # What actually happened. A turn without tool calls is graded against
    # `<no_tool>` so chat-only turns still produce a learning signal.
    actual_tool = pending.chosen_tools[0] if pending.chosen_tools else "<no_tool>"
    tier_index = int(pending.tier_used)
    # Only two tier names exist. Indexing ("fast", "slow") directly raised
    # IndexError for tier >= 2 — after the pending turn had already been
    # cleared, so the sample was lost — hence the explicit mapping.
    actual_tier = "slow" if tier_index > 0 else "fast"

    pred = self._last_prediction
    if pred is not None:
        # Score the prediction against the outcome and let the graduation
        # policy react.
        self.scorer.add(ScoreRecord(
            ts=time.time(),
            predicted_tool=pred.tool,
            actual_tool=actual_tool,
            predicted_tool_conf=pred.tool_confidence,
            predicted_tier=pred.tier,
            actual_tier=actual_tier,
            predicted_think=pred.think,
            actual_think=bool(pending.thinking_used),
            predicted_value=pred.value,
            actual_reward=reward,
        ))
        self.policy.update(
            samples=self.scorer.lifetime_predictions,
            composite_acc=self.scorer.composite(),
            advisor_loaded=self.advisor is not None,
        )

        # Prioritized replay: how surprising was this turn? Combines tool
        # error, tier error, value error and a penalty for being
        # confidently wrong. Floored at 0.05 and capped at 1.0.
        tool_err = 0.0 if pred.tool == actual_tool else 1.0
        tier_err = 0.0 if pred.tier == actual_tier else 1.0
        value_err = abs(pred.value - reward)
        conf_penalty = pred.tool_confidence * tool_err
        surprise = max(
            0.05,
            min(1.0, 0.5 * tool_err + 0.15 * tier_err + 0.3 * value_err + 0.2 * conf_penalty),
        )
    else:
        surprise = 0.5  # no prediction → neutral priority

    # Extended-head label heuristics. Each may be None when this turn
    # provides no signal for that head.
    last_user_text = pending.messages[-1] if pending.messages else ""
    target_memory_write = _label_memory_write(reward)
    target_permission = _label_permission(pending.chosen_tools)
    target_cache_hit = _label_cache_hit(last_user_text, self._recent_user_texts)

    # Tier 1 — response-shape labels, taken from the reply that actually
    # came back this turn.
    rt = response_text or ""
    num_tools = len(pending.chosen_tools)
    target_refusal = _label_refusal(rt)
    target_code_response = _label_code_response(rt)
    target_stall = _label_stall(pending.chosen_tools, rt, inferred_from_stall)
    target_difficulty = _label_difficulty(num_tools)
    # stop_iter is labelled 1.0 whenever there is reply text, and left
    # unlabelled otherwise.
    target_stop_iter = _label_stop_iter(produced_final_text=True) if rt else None
    # Compaction: rough proxy on message count + total characters.
    msg_count = len(pending.messages)
    char_total = sum(len(m) for m in pending.messages if m)
    target_compaction = _label_compaction(msg_count, char_total)

    # Tier 2 — continuous + multi-output labels
    target_latency_s = _label_latency_s(elapsed_s)
    target_token_budget = _label_token_budget(completion_tokens)
    target_subagent_spawn = _label_subagent_spawn(num_tools, rt)
    # mood_pred and reward_model are not labelled here.
    target_mood_pred = None
    target_reward_model = None

    # Update the rolling prompt history AFTER computing cache_hit
    # (otherwise every prompt would match itself).
    if last_user_text:
        self._recent_user_texts.append(last_user_text)
        if len(self._recent_user_texts) > self._recent_user_texts_cap:
            self._recent_user_texts = self._recent_user_texts[-self._recent_user_texts_cap:]

    # Auto-write to memory when the heuristic label says the turn is
    # memory-worthy and there is both a prompt and a non-stalled reply.
    if (target_memory_write == 1.0 and last_user_text and rt
            and not inferred_from_stall):
        try:
            from core.memory_md import get_memory
            # Compact entry: prompt + first line of the reply only.
            response_first_line = (rt.strip().splitlines() or [""])[0][:300]
            entry = (
                f"**user:** {last_user_text[:300]}\n\n"
                f"**reply:** {response_first_line}"
            )
            title = last_user_text[:60].replace("\n", " ")
            get_memory().append(entry, source="caudate-auto", title=title)
        except Exception as e:
            # Best effort: memory persistence must not break the turn.
            logger.debug("memory.md auto-write failed: %s", e)

    # Build the conversation prefix. Messages arrive as role-prefixed
    # strings ("user: hi"); a string without the ": " separator is treated
    # as a user message. Historical tool calls become one synthetic
    # assistant message at the tail of the prefix.
    conv: list[ChatMessage] = []
    for m in pending.messages:
        if not m:
            continue
        role, sep, content = m.partition(": ")
        if not sep:
            role, content = "user", m
        conv.append(ChatMessage(role=role, content=content))
    if pending.tool_history:
        conv.append(ChatMessage(
            role="assistant", content="",
            tool_calls=[ToolCall(name=t) for t in pending.tool_history if t],
        ))

    # One sample per chosen tool (or a single `<no_tool>` sample). Every
    # list field is copied per sample so samples from the same turn do not
    # alias each other's containers.
    added = 0
    for tool in pending.chosen_tools or ["<no_tool>"]:
        sample = ConversationSample(
            conversation=list(conv),
            tools=list(pending.available_tools),
            mood=list(pending.mood),
            image_paths=list(pending.image_paths),
            target_tool=tool,
            target_tier=tier_index,
            target_think=float(bool(pending.thinking_used)),
            target_value=reward,
            model_source=pending.model_source,
            surprise=surprise,
            target_memory_write=target_memory_write,
            target_cache_hit=target_cache_hit,
            target_permission=target_permission,
            target_refusal=target_refusal,
            target_code_response=target_code_response,
            target_stall=target_stall,
            target_difficulty=target_difficulty,
            target_stop_iter=target_stop_iter,
            target_compaction=target_compaction,
            target_latency_s=target_latency_s,
            target_token_budget=target_token_budget,
            target_mood_pred=target_mood_pred,
            target_subagent_spawn=target_subagent_spawn,
            target_reward_model=target_reward_model,
        )
        self.replay.push(sample)
        self._append_replay_disk(sample)
        added += 1

    self._samples_since_train += added

    if self.auto_train and self._samples_since_train >= self.auto_train_every:
        self._kick_auto_train()

    return added
594
+
595
+ # ------------------------------------------------------------------
596
+ # Inspection
597
+ # ------------------------------------------------------------------
598
+
599
def status(self) -> dict[str, Any]:
    """Return a snapshot of the observer's state as a plain dict.

    Covers advisor presence, replay size, auto-train counters, the last
    prediction (or None), the scorer and policy reports, and the
    auto-evolver status when one is attached.
    """
    composite = self.scorer.composite()
    samples_lifetime = self.scorer.lifetime_predictions

    task = self._train_in_flight
    training_now = task is not None and not task.done()

    pred = self._last_prediction
    prediction_view = None
    if pred:
        prediction_view = {
            "tool": pred.tool,
            "tool_conf": pred.tool_confidence,
            "tier": pred.tier,
            "tier_conf": pred.tier_confidence,
            "think": pred.think,
            "value": pred.value,
        }

    snapshot: dict[str, Any] = {
        "advisor_loaded": self.advisor is not None,
        "replay_size": len(self.replay),
        "samples_since_train": self._samples_since_train,
        "auto_train_every": self.auto_train_every,
        "auto_train_in_flight": training_now,
        "last_prediction": prediction_view,
    }
    snapshot["scorer"] = self.scorer.report()
    snapshot["policy"] = self.policy.report(samples_lifetime, composite)

    # The evolver attribute may be missing on partially built instances.
    evolver = getattr(self, "auto_evolver", None)
    snapshot["auto_evolve"] = evolver.status() if evolver else None
    return snapshot
627
+
628
def can_whisper(self) -> bool:
    """Whether whispering is allowed right now.

    Requires the owner gate to pass, the policy to allow whispering, and
    an advisor to be loaded.
    """
    if _owner_may_act():
        return self.policy.can_whisper() and self.advisor is not None
    return False
632
+
633
def can_advise(self) -> bool:
    """Tell whether the advise gate is open.

    Returns False when ``_owner_may_act()`` is falsy. Otherwise the
    answer is ``self.policy.can_advise()`` combined with an advisor
    being loaded.
    """
    if _owner_may_act():
        return self.policy.can_advise() and self.advisor is not None
    return False
637
+
638
def can_control(self) -> bool:
    """Tell whether the control gate is open.

    Returns False when ``_owner_may_act()`` is falsy. Otherwise the
    answer is ``self.policy.can_control()`` combined with an advisor
    being loaded.
    """
    if _owner_may_act():
        return self.policy.can_control() and self.advisor is not None
    return False
642
+
643
def reload_advisor(self) -> bool:
    """Reload the checkpoint from disk after an external retrain.

    Returns True and swaps ``self.advisor`` when ``load_advisor`` yields
    an advisor. Returns False, leaving the current advisor untouched,
    when it yields None.
    """
    fresh = load_advisor(self.cfg)
    if fresh is None:
        return False
    self.advisor = fresh
    return True
650
+
651
+ # ------------------------------------------------------------------
652
+ # Persistence — replay buffer to disk
653
+ # ------------------------------------------------------------------
654
+
655
def _append_replay_disk(self, sample: ConversationSample) -> None:
    """Append one replay sample to the on-disk JSONL replay file.

    Best-effort: any failure (serialization or IO) is logged at debug
    level and swallowed so a disk problem never interrupts the caller.

    The row is serialized *before* the file is opened, so a sample that
    cannot be serialized never touches the file. The file is opened with
    an explicit UTF-8 encoding, matching the feature-outcome log, rather
    than whatever the platform default happens to be.
    """
    # Attributes copied verbatim from the sample, in on-disk key order.
    copied_fields = (
        "mood",
        "image_paths",
        "target_tool",
        "target_arguments",
        "target_tool_call_index",
        "target_tier",
        "target_think",
        "target_value",
        "model_source",
        "surprise",
        "target_memory_write",
        "target_cache_hit",
        "target_permission",
        "target_refusal",
        "target_code_response",
        "target_stall",
        "target_difficulty",
        "target_stop_iter",
        "target_compaction",
        "target_latency_s",
        "target_token_budget",
        "target_mood_pred",
        "target_subagent_spawn",
        "target_reward_model",
        "target_feature_success",
    )
    try:
        # Standard chat-tool-call shape — same schema as external
        # datasets. The loader handles both this shape and the legacy
        # {messages, tool_history} layout for backward compat with old
        # replay rows.
        row = {
            "conversation": [m.to_dict() for m in sample.conversation],
            "tools": [t.to_dict() for t in sample.tools],
        }
        for name in copied_fields:
            row[name] = getattr(sample, name)
        line = json.dumps(row) + "\n"

        self._replay_path.parent.mkdir(parents=True, exist_ok=True)
        with self._replay_path.open("a", encoding="utf-8") as f:
            f.write(line)
    except Exception as e:
        logger.debug(f"replay disk write failed: {e}")
694
+
695
def _load_replay_from_disk(self) -> None:
    """Refill the in-memory replay buffer from the on-disk JSONL file.

    Only the last ``self.cfg.replay_capacity`` lines of the file are
    considered. Each row is decoded independently: a blank, truncated or
    otherwise unreadable row is logged at debug level and skipped, so one
    bad row no longer discards every row that follows it. A failure to
    read the file at all is logged and the buffer is left as it was.
    """
    if not self._replay_path.exists():
        return

    # Extended-head targets: None means "this old sample has no label
    # for this head" — the collator will then exclude any batch
    # containing it from training that head, which is correct (we don't
    # fabricate labels).
    def _opt_float(row: dict, key: str) -> float | None:
        v = row.get(key)
        return float(v) if v is not None else None

    def _opt_int(row: dict, key: str) -> int | None:
        v = row.get(key)
        return int(v) if v is not None else None

    def _opt_list(row: dict, key: str) -> list[float] | None:
        v = row.get(key)
        return [float(x) for x in v] if v is not None else None

    try:
        # errors="replace" confines stray undecodable bytes to the row
        # they occur in instead of failing the whole read.
        lines = self._replay_path.read_text(
            encoding="utf-8", errors="replace"
        ).splitlines()[-self.cfg.replay_capacity:]
    except Exception as e:
        logger.debug(f"replay disk load failed: {e}")
        return

    for line in lines:
        if not line.strip():
            continue
        try:
            d = json.loads(line)
            if not isinstance(d, dict):
                raise ValueError("replay row is not a JSON object")
            # Migrate legacy <none> labels (which collapsed to <unk>
            # via vocab.get fallback) to the new explicit <no_tool>
            # action class.
            tool_name = d.get("target_tool", "<unk>")
            if tool_name == "<none>":
                tool_name = "<no_tool>"
            # Two shapes supported:
            #   - new:    {"conversation": [...ChatMessage...], "tools": [...]}
            #   - legacy: {"messages": [str...], "tool_history": [str...]}
            # The legacy path lets a 3000-row historical replay buffer
            # keep contributing without a one-time migration script.
            # Both shapes share the rest of the fields.
            if "conversation" in d and isinstance(d["conversation"], list):
                raw_conv = d["conversation"]
                raw_tools = d.get("tools") or []
                conv = [ChatMessage.from_dict(m)
                        for m in raw_conv if isinstance(m, dict)]
                from nn.format import ToolDef
                tools = [ToolDef.from_dict(t)
                         for t in raw_tools if isinstance(t, dict)]
            else:
                # Legacy shape: rebuild a Conversation from the flat
                # role-prefixed strings + the parallel tool_history.
                conv = []
                for s in d.get("messages") or []:
                    if not s:
                        continue
                    role, sep, content = s.partition(": ")
                    if not sep:
                        # No "role: " prefix — treat the text as a
                        # user message.
                        role, content = "user", s
                    conv.append(ChatMessage(role=role, content=content))
                th = d.get("tool_history") or []
                if th:
                    conv.append(ChatMessage(
                        role="assistant", content="",
                        tool_calls=[ToolCall(name=t) for t in th if t],
                    ))
                tools = []
            self.replay.push(ConversationSample(
                conversation=conv,
                tools=tools,
                mood=d.get("mood", [0.5] * 4),
                image_paths=d.get("image_paths", []),
                target_tool=tool_name,
                target_arguments=d.get("target_arguments", ""),
                target_tool_call_index=int(
                    d.get("target_tool_call_index", 0)
                ),
                target_tier=int(d.get("target_tier", 0)),
                target_think=float(d.get("target_think", 0.0)),
                target_value=float(d.get("target_value", 0.5)),
                model_source=d.get("model_source", "<unknown>"),
                surprise=float(d.get("surprise", 0.5)),
                target_memory_write=_opt_float(d, "target_memory_write"),
                target_cache_hit=_opt_float(d, "target_cache_hit"),
                target_permission=_opt_float(d, "target_permission"),
                target_refusal=_opt_float(d, "target_refusal"),
                target_code_response=_opt_float(d, "target_code_response"),
                target_stall=_opt_float(d, "target_stall"),
                target_difficulty=_opt_int(d, "target_difficulty"),
                target_stop_iter=_opt_float(d, "target_stop_iter"),
                target_compaction=_opt_float(d, "target_compaction"),
                target_latency_s=_opt_float(d, "target_latency_s"),
                target_token_budget=_opt_float(d, "target_token_budget"),
                target_mood_pred=_opt_list(d, "target_mood_pred"),
                target_subagent_spawn=_opt_float(d, "target_subagent_spawn"),
                target_reward_model=_opt_float(d, "target_reward_model"),
                target_feature_success=_opt_float(d, "target_feature_success"),
            ))
        except Exception as e:
            # Skip just this row; later rows are still worth loading.
            logger.debug(f"replay disk load failed: {e}")
785
+
786
+ # ------------------------------------------------------------------
787
+ # Auto-training
788
+ # ------------------------------------------------------------------
789
+
790
def _kick_auto_train(self) -> None:
    """Spawn a training run in the background so the agent doesn't block.

    Does nothing when the owner gate is closed, when there is no running
    event loop, or when an earlier training task has not finished yet.
    Otherwise resets the samples-since-train counter and schedules
    ``_train_async`` on the running loop.
    """
    if not _owner_may_act():
        return

    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        # Called outside an event loop: nowhere to schedule the task.
        return

    pending = self._train_in_flight
    still_training = pending is not None and not pending.done()
    if still_training:
        return  # already training

    self._samples_since_train = 0
    self._train_in_flight = running_loop.create_task(self._train_async())
802
+
803
async def _train_async(self) -> None:
    """Run one auto-train cycle off the event loop, then refresh state.

    Steps:
      1. Run ``_train_sync`` in a worker thread via ``asyncio.to_thread``.
      2. Try to hot-swap the advisor and log whether the reload actually
         happened (``reload_advisor`` returns False when nothing could
         be loaded).
      3. Feed the scorer's composite to the plateau scheduler and give
         the optional auto-evolver a chance to fire.

    Never raises: a failure in step 3 is logged at debug level, any
    other failure at warning level.
    """
    try:
        await asyncio.to_thread(self._train_sync)
        # Hot-swap the advisor after training so the new turn uses
        # the updated weights.
        if self.reload_advisor():
            logger.info("Caudate auto-train cycle complete; advisor reloaded")
        else:
            logger.info(
                "Caudate auto-train cycle complete; advisor not reloaded "
                "(load_advisor returned None)"
            )

        # Feed the eval result to the plateau scheduler — if the
        # latest cycle didn't improve enough, Caudate fires her own
        # NAS run. This is what "she evolves herself" means in code.
        try:
            from nn.nas.scheduler import PlateauScheduler
            composite = self.scorer.composite()
            PlateauScheduler().observe_eval(composite)
            # auto_evolver is optional (status() reads it the same
            # way); a missing attribute is not an error.
            evolver = getattr(self, "auto_evolver", None)
            if evolver:
                evolver.maybe_fire()
        except Exception as e:
            logger.debug(f"auto-evolve check failed: {e}")
    except Exception as e:
        logger.warning(f"Caudate auto-train failed: {e}")
824
+
825
def _train_sync(self) -> None:
    """Run one short, blocking training burst over sessions + replay.

    Skips (with a log line) when the combined corpus is smaller than
    ``cfg.min_episodes_to_train`` or when an existing checkpoint cannot
    be loaded. ``trainer.cfg.max_steps`` is raised for the burst and
    always restored afterwards.
    """
    from nn.data import load_corpus_from_sessions
    from nn.trainer import Trainer, build_fresh

    # Combine on-disk session corpus + live replay
    samples = load_corpus_from_sessions()
    samples.extend(self.replay.all())

    if len(samples) < self.cfg.min_episodes_to_train:
        logger.debug(
            f"auto-train skipped: only {len(samples)} samples "
            f"(need {self.cfg.min_episodes_to_train})"
        )
        return

    # Resume if checkpoint exists and architecture matches.
    # Refuse to clobber an on-disk checkpoint with a fresh init: a
    # silent build_fresh fallback wiped step-11000 weights overnight
    # (2026-05-19). If load fails *and* a checkpoint already exists,
    # skip this auto-train cycle entirely and surface the error.
    ckpt_path = Path(self.cfg.checkpoint_path)
    if not ckpt_path.exists():
        trainer = build_fresh(self.cfg)
    else:
        try:
            trainer = Trainer.load(self.cfg)
        except Exception as e:
            logger.warning(
                f"auto-train skipped: Trainer.load() failed ({e}); "
                f"refusing to overwrite existing checkpoint "
                f"{ckpt_path} with a fresh init"
            )
            return

    # Cap auto-runs to a quick burst — full retrains are explicit.
    # The burst is 4 steps per sample, clamped to the range [50, 500].
    burst_steps = min(500, max(50, len(samples) * 4))
    saved_max_steps = trainer.cfg.max_steps
    trainer.cfg.max_steps = trainer.step + burst_steps
    try:
        trainer.fit(samples)
    finally:
        trainer.cfg.max_steps = saved_max_steps
867
+
868
+
869
+ # =====================================================================
870
+ # Forge feature-outcome observation (2026-05-10)
871
+ #
872
+ # When the orchestrator finishes a feature it calls this hook with:
873
+ # - the feature's text (title + description so Caudate can encode it)
874
+ # - which model was used
875
+ # - how many turns / tool calls were spent
876
+ # - whether it succeeded
877
+ # - wall-clock duration
878
+ #
879
+ # We persist to data/nn/feature_outcomes.jsonl. The trainer doesn't yet
880
+ # have a dedicated feature-outcome head, but the value head can pick
881
+ # up these signals once the corpus is hooked into the dataset loader.
882
+ # Until then this just provides a clean signal trail for inspection
883
+ # and offline analysis. See CAUDATE_EVOLUTION.md "feature outcomes"
884
+ # section for the longer plan.
885
+ # =====================================================================
886
+
887
+ _FEATURE_OUTCOMES_PATH = Path("data/nn/feature_outcomes.jsonl")
888
+
889
+
890
+ def observe_feature_outcome(
891
+ *,
892
+ feature_text: str,
893
+ model_used: str,
894
+ n_turns: int,
895
+ n_tool_calls: int,
896
+ success: bool,
897
+ duration_s: float,
898
+ project_id: int | None = None,
899
+ feature_id: int | None = None,
900
+ session_id: int | None = None,
901
+ extras: dict[str, Any] | None = None,
902
+ ) -> None:
903
+ """Append one feature-outcome record to the JSONL log.
904
+
905
+ Best-effort: any IO error is swallowed so this never breaks
906
+ orchestration. Returns silently.
907
+ """
908
+ try:
909
+ _FEATURE_OUTCOMES_PATH.parent.mkdir(parents=True, exist_ok=True)
910
+ record = {
911
+ "ts": time.time(),
912
+ "feature_text": feature_text,
913
+ "model_used": model_used,
914
+ "n_turns": int(n_turns),
915
+ "n_tool_calls": int(n_tool_calls),
916
+ "success": bool(success),
917
+ "duration_s": float(duration_s),
918
+ "project_id": project_id,
919
+ "feature_id": feature_id,
920
+ "session_id": session_id,
921
+ "extras": extras or {},
922
+ }
923
+ with _FEATURE_OUTCOMES_PATH.open("a", encoding="utf-8") as f:
924
+ f.write(json.dumps(record) + "\n")
925
+ except Exception as e:
926
+ logging.getLogger(__name__).debug(
927
+ f"observe_feature_outcome failed: {e}"
928
+ )
929
+
930
+
931
def _try_observe_feature_outcome(
    *,
    project_id: int | None,
    feature_id: int | None,
    outcome: str,
    session_id: int | None,
) -> None:
    """Compatibility shim called by the orchestrator.

    Looks up the feature (and, when given, the session) in the DB to
    fill in the fields ``observe_feature_outcome`` needs, so callers can
    stay terse. Returns silently when ``feature_id`` is None, when the
    DB models or SQLAlchemy cannot be imported, when the feature row is
    missing, or when the lookup raises.
    """
    if feature_id is None:
        return

    try:
        from planning.forge_models import (
            session_scope,
            ForgeFeature,
            ForgeSession,
            ForgeLog,
        )
        from sqlalchemy import func
    except Exception:
        return

    try:
        with session_scope() as db:
            feature = db.get(ForgeFeature, feature_id)
            if feature is None:
                return
            feature_text = (
                f"{feature.title}\n\n{feature.description or ''}".strip()
            )

            session_row = (
                db.get(ForgeSession, session_id) if session_id else None
            )
            # ForgeSession doesn't track model_used directly yet; the
            # orchestrator passes it via the public observe_feature_outcome
            # call when it has the active model in scope. For the shim
            # path the lookup is best-effort.
            model_used = "unknown"

            if session_row:
                started = session_row.started_at
                ended = session_row.ended_at
            else:
                started = None
                ended = None

            if started and ended:
                duration_s = (ended - started).total_seconds()
            else:
                duration_s = 0.0

            # Cheap proxies for n_turns / n_tool_calls: count log rows.
            if session_id:
                n_log_rows = (
                    db.query(func.count(ForgeLog.id))
                    .filter_by(session_id=session_id)
                    .scalar()
                )
            else:
                n_log_rows = 0
    except Exception as exc:
        logging.getLogger(__name__).debug(
            f"_try_observe_feature_outcome lookup failed: {exc}"
        )
        return

    succeeded = outcome == "success"
    observe_feature_outcome(
        feature_text=feature_text,
        model_used=model_used,
        n_turns=n_log_rows,  # proxy until orchestrator tracks turns
        n_tool_calls=n_log_rows,  # proxy
        success=succeeded,
        duration_s=duration_s,
        project_id=project_id,
        feature_id=feature_id,
        session_id=session_id,
        extras={"outcome": outcome},
    )