@blockrun/franklin 3.8.13 → 3.8.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,56 @@
1
+ /**
2
+ * Grounding evaluator — a cheap second-pass check that every factual claim
3
+ * in Franklin's answer traces back to a tool-call result, not model memory.
4
+ *
5
+ * Why this exists (2026-04 retrospective): the CRCL incident — user asked
6
+ * about a stock Franklin had tools to query, Franklin answered from 2022
7
+ * training data instead. Root cause wasn't a prompt defect; it was an
8
+ * absent evaluator. The existing `verification.ts` only fires when the
9
+ * agent writes code (Edit / Write / Bash threshold), so read-heavy hero
10
+ * use cases (trading, research, analysis) never triggered any quality gate.
11
+ *
12
+ * This module is the complement: fires on *answers with factual content*,
13
+ * regardless of tool type. Anthropic's harness-design article calls out
14
+ * "self-evaluation on complex tasks" as anti-pattern #14 — models skew
15
+ * positive when grading themselves. So the check runs as a separate agent
16
+ * (different system prompt, explicitly adversarial) with its own model.
17
+ *
18
+ * v1 scope: check only, never re-prompt. Emit a follow-up ⚠️ event when
19
+ * claims look ungrounded, let the user decide whether to re-ask. The
20
+ * re-prompt loop (generator iterates against evaluator findings until
21
+ * PASS) is a v2 concern once we know v1 catches real cases without
22
+ * false-positive noise.
23
+ */
24
+ import type { CapabilityHandler, Dialogue } from './types.js';
25
+ import { ModelClient } from './llm.js';
26
+ export type GroundingVerdict = 'GROUNDED' | 'PARTIAL' | 'UNGROUNDED' | 'SKIPPED'; // Evaluator outcome; 'SKIPPED' is what checkGrounding reports when the evaluator call fails or returns no text.
27
+ export interface GroundingResult { // Parsed outcome of one grounding check.
28
+ verdict: GroundingVerdict; // Verdict named in the evaluator reply; the parser falls back to 'PARTIAL' when no verdict line is found.
29
+ issues: string[]; // One entry per reply line that began with "- ", with that prefix removed.
30
+ raw: string; // Trimmed evaluator reply, or a parenthesised placeholder when the reply was empty or the call errored.
31
+ }
32
+ /**
33
+ * Decide whether this turn warrants a grounding check. Principles:
34
+ * - Non-trivial user input (not a greeting, not a slash command)
35
+ * - Non-trivial assistant text output (not just a tool-result echo)
36
+ *
37
+ * Intentionally NOT gating on tool-type (read vs write) — the whole point
38
+ * of this module is to cover read-heavy turns the code verifier misses.
39
+ */
40
+ export declare function shouldCheckGrounding(userInput: string, assistantText: string): boolean; // false when FRANKLIN_NO_EVAL=1, when the trimmed input is under 20 chars or starts with '/', or when the trimmed answer is under 50 chars.
41
+ export declare function parseGroundingResponse(raw: string): GroundingResult; // Reads the "VERDICT:" line (defaulting to 'PARTIAL' if absent) and collects every "- " line as an issue.
42
+ /** Cheap model for grading. Default matches existing verification.ts
43
+ * choice so both quality gates have the same cost profile. Override via
44
+ * `FRANKLIN_EVALUATOR_MODEL` to experiment with accuracy/cost trade-offs. */
45
+ export declare function evaluatorModel(): string; // Returns FRANKLIN_EVALUATOR_MODEL when set to a non-empty value, otherwise 'nvidia/nemotron-ultra-253b'.
46
+ export declare function checkGrounding(userInput: string, history: Dialogue[], assistantText: string, client: ModelClient, opts?: { // Runs one tool-less evaluator completion over a summary of the turn; resolves to a 'SKIPPED' result when the reply is empty or the call throws.
47
+ abortSignal?: AbortSignal; // Caller cancellation; combined with an internal 15-second timeout.
48
+ model?: string; // Overrides the model otherwise chosen by evaluatorModel().
49
+ }): Promise<GroundingResult>;
50
+ /**
51
+ * Convert a grounding result into a user-facing follow-up message. Returns
52
+ * empty string when verdict is GROUNDED / SKIPPED — no reason to spam the
53
+ * user when the check agreed the answer was sound.
54
+ */
55
+ export declare function renderGroundingFollowup(result: GroundingResult): string; // For PARTIAL / UNGROUNDED: a markdown warning header, the issues as "- " bullets (or a placeholder when there are none), and a hint mentioning FRANKLIN_NO_EVAL=1.
56
+ export type { CapabilityHandler };
@@ -0,0 +1,233 @@
1
+ /**
2
+ * Grounding evaluator — a cheap second-pass check that every factual claim
3
+ * in Franklin's answer traces back to a tool-call result, not model memory.
4
+ *
5
+ * Why this exists (2026-04 retrospective): the CRCL incident — user asked
6
+ * about a stock Franklin had tools to query, Franklin answered from 2022
7
+ * training data instead. Root cause wasn't a prompt defect; it was an
8
+ * absent evaluator. The existing `verification.ts` only fires when the
9
+ * agent writes code (Edit / Write / Bash threshold), so read-heavy hero
10
+ * use cases (trading, research, analysis) never triggered any quality gate.
11
+ *
12
+ * This module is the complement: fires on *answers with factual content*,
13
+ * regardless of tool type. Anthropic's harness-design article calls out
14
+ * "self-evaluation on complex tasks" as anti-pattern #14 — models skew
15
+ * positive when grading themselves. So the check runs as a separate agent
16
+ * (different system prompt, explicitly adversarial) with its own model.
17
+ *
18
+ * v1 scope: check only, never re-prompt. Emit a follow-up ⚠️ event when
19
+ * claims look ungrounded, let the user decide whether to re-ask. The
20
+ * re-prompt loop (generator iterates against evaluator findings until
21
+ * PASS) is a v2 concern once we know v1 catches real cases without
22
+ * false-positive noise.
23
+ */
24
+ // ─── Evaluator system prompt ─────────────────────────────────────────────
25
+ //
26
+ // Principle-based, not example-enumerating. Specific tickers or phrasings
27
+ // hard-coded here would rot the moment the market changes. The rule is
28
+ // general: claim → tool result or explicit uncertainty.
29
+ const EVALUATOR_PROMPT = `You are a GROUNDING CHECK agent. Your job is to verify that an AI assistant's answer is grounded in tool-call evidence, not model memory.
30
+
31
+ ## What you receive
32
+ - The user's question
33
+ - A list of tool calls made this turn (tool name, input summary, whether it succeeded)
34
+ - The assistant's final text answer
35
+
36
+ ## What you check
37
+ Every **factual claim** in the answer must trace to ONE of:
38
+ (a) A successful tool call result from this turn, OR
39
+ (b) Explicit acknowledgment of uncertainty ("I'm not sure", "based on older data", "I'd need to check")
40
+
41
+ Claims that are ungrounded:
42
+ - Specific current-world facts stated with confidence but not backed by any tool call this turn
43
+ - Recommendations or conclusions that depend on unstated data (e.g. "you should sell" without a price lookup)
44
+ - Invented specifics — names, numbers, dates the model produced without a tool call supporting them
45
+
46
+ Claims that are grounded:
47
+ - Anything directly derived from a tool result shown in the turn
48
+ - General knowledge / definitions / reasoning that doesn't depend on current-world specifics
49
+ - Claims explicitly hedged as uncertain
50
+
51
+ ## Output — exact format
52
+
53
+ VERDICT: GROUNDED | PARTIAL | UNGROUNDED
54
+
55
+ If not GROUNDED, list each ungrounded claim on its own line starting with "- " and the tool that should have been called, like:
56
+ - Claim: "<the ungrounded part, quoted briefly>" → missing tool: <TradingMarket | ExaAnswer | ExaSearch | WebSearch | ...>
57
+
58
+ Empty line between verdict and list. No other text. No preamble. No apology. Be terse.`;
59
+ // ─── Trigger policy ──────────────────────────────────────────────────────
60
const MIN_USER_CHARS = 20; // Inputs shorter than this are treated as greetings/acks, not questions
const MIN_ANSWER_CHARS = 50; // Answers shorter than this are treated as acks, not factual claims
/**
 * Trigger policy for the grounding check.
 *
 * A turn is checked only when all of the following hold:
 *   - the `FRANKLIN_NO_EVAL` environment variable is not set to '1';
 *   - the trimmed user input is at least MIN_USER_CHARS long and is not a
 *     slash command (does not start with '/');
 *   - the trimmed assistant text is at least MIN_ANSWER_CHARS long.
 *
 * The kind of tool used during the turn is deliberately not part of the
 * decision, so read-only turns are covered as well.
 */
export function shouldCheckGrounding(userInput, assistantText) {
    // Kill switch first: nothing else is inspected when checks are disabled.
    if (process.env.FRANKLIN_NO_EVAL === '1') {
        return false;
    }
    const question = userInput.trim();
    const isSubstantiveQuestion = question.length >= MIN_USER_CHARS && !question.startsWith('/');
    // Short-circuit keeps the answer untouched unless the question qualified.
    return isSubstantiveQuestion && assistantText.trim().length >= MIN_ANSWER_CHARS;
}
82
+ // ─── Turn summary extraction ─────────────────────────────────────────────
83
/**
 * Build the plain-text briefing handed to the evaluator: the user question,
 * one line per tool call / tool result of the current turn, and the
 * assistant's final answer. Every section is truncated so the evaluator
 * call stays cheap.
 *
 * The turn is located by walking `history` from the end back to the most
 * recent user message whose content is a plain string; that message is
 * treated as the start of the turn and is not itself summarized.
 *
 * @param userInput      The user's question; trimmed and capped at 800 chars.
 * @param history        Conversation messages; only `role` and `content`
 *                       are read. Array-valued `content` is scanned for
 *                       `tool_use` parts (assistant) and `tool_result`
 *                       parts (user).
 * @param assistantText  The final answer; trimmed and capped at 2400 chars.
 * @returns The summary as a single newline-joined string.
 *
 * NOTE(review): the evaluator prompt says it is told whether each tool call
 * succeeded, but nothing here emits a success/failure flag — confirm whether
 * tool_result parts carry one that should be surfaced.
 */
function summarizeTurn(userInput, history, assistantText) {
    const MAX_TOOL_LINES = 40;
    const toolLines = [];
    // Newest → oldest. The cap is checked between messages, so the last
    // message processed may push the total slightly past MAX_TOOL_LINES.
    for (let i = history.length - 1; i >= 0 && toolLines.length < MAX_TOOL_LINES; i--) {
        const msg = history[i];
        if (msg.role === 'user' && typeof msg.content === 'string')
            break;
        if (!Array.isArray(msg.content))
            continue;
        // Collect this message's lines in their natural order first, then
        // prepend them as a group. Prepending one line at a time would
        // reverse the order of multiple calls/results within one message.
        const msgLines = [];
        for (const part of msg.content) {
            // `typeof null === 'object'`, so null must be excluded explicitly.
            if (part === null || typeof part !== 'object')
                continue;
            if (msg.role === 'assistant' && part.type === 'tool_use') {
                // JSON.stringify(undefined) yields undefined, not a string.
                const inputStr = (JSON.stringify(part.input) ?? '').slice(0, 160);
                msgLines.push(` - ${part.name}(${inputStr})`);
            }
            else if (msg.role === 'user' && part.type === 'tool_result') {
                let output = '';
                if (typeof part.content === 'string') {
                    output = part.content;
                }
                else if (Array.isArray(part.content)) {
                    output = part.content.map(c => (c && c.text) || '').join('\n');
                }
                // Truncate first, then collapse whitespace so each result is one line.
                const snippet = output.slice(0, 240).replace(/\s+/g, ' ');
                msgLines.push(` → ${snippet}`);
            }
        }
        toolLines.unshift(...msgLines);
    }
    return [
        `## User question`,
        userInput.trim().slice(0, 800),
        '',
        `## Tool calls this turn`,
        ...(toolLines.length > 0 ? toolLines : [' (none)']),
        '',
        `## Assistant's answer`,
        assistantText.trim().slice(0, 2400),
    ].join('\n');
}
137
+ // ─── Verdict parser ──────────────────────────────────────────────────────
138
/**
 * Parse the evaluator's reply into a structured result.
 *
 * The verdict is taken from the first "VERDICT: <word>" occurrence,
 * case-insensitively. Markdown emphasis around the label or the value
 * (`**VERDICT:** UNGROUNDED`, `VERDICT: **GROUNDED**`) is tolerated, so a
 * model that decorates its output is not misread as having given no verdict.
 * When no verdict can be found the result is 'PARTIAL', i.e. the answer is
 * flagged for the user rather than silently passed.
 *
 * Issues are every line that, once trimmed, starts with "- " and is longer
 * than three characters; the "- " prefix is stripped.
 *
 * @param raw  The evaluator's text output.
 * @returns `{ verdict, issues, raw }` where `raw` is the trimmed input.
 */
export function parseGroundingResponse(raw) {
    const text = raw.trim();
    // [\s*_`]* on either side of the colon absorbs whitespace and markdown
    // emphasis characters; it matches everything the plain \s* form did.
    const m = text.match(/VERDICT[\s*_`]*:[\s*_`]*(GROUNDED|PARTIAL|UNGROUNDED)/i);
    const verdict = m
        ? m[1].toUpperCase()
        : 'PARTIAL'; // No clean verdict: err on the side of "flag for the user".
    const issues = [];
    for (const line of text.split('\n')) {
        const l = line.trim();
        if (l.startsWith('- ') && l.length > 3) {
            issues.push(l.slice(2).trim());
        }
    }
    return { verdict, issues, raw: text };
}
154
+ // ─── Default evaluator model ─────────────────────────────────────────────
155
/**
 * Name of the model used for grading.
 *
 * The `FRANKLIN_EVALUATOR_MODEL` environment variable takes precedence when
 * it holds a non-empty value; otherwise the built-in default is returned.
 */
export function evaluatorModel() {
    const override = process.env.FRANKLIN_EVALUATOR_MODEL;
    if (override) {
        return override;
    }
    return 'nvidia/nemotron-ultra-253b';
}
161
+ // ─── Run grounding check ─────────────────────────────────────────────────
162
const MAX_EVAL_TOKENS = 512;
const EVAL_TIMEOUT_MS = 15_000;
/**
 * Run one grounding check for the current turn.
 *
 * A summary of the turn is sent to `client.complete` with the evaluator
 * system prompt, an empty tool list and a small token budget, and the text
 * parts of the response are parsed into a verdict.
 *
 * This function is best-effort and always resolves: an empty reply, a
 * failure while building the summary, a client error, an abort by the
 * caller, or the internal EVAL_TIMEOUT_MS timeout all produce a result with
 * verdict 'SKIPPED' and a parenthesised explanation in `raw`.
 *
 * @param userInput      The user's question for this turn.
 * @param history        Conversation messages used to list this turn's tool activity.
 * @param assistantText  The assistant's final text answer.
 * @param client         Object exposing `complete(request, signal)`.
 * @param opts           Optional `abortSignal` (caller cancellation) and
 *                       `model` (overrides `evaluatorModel()`).
 * @returns A promise of `{ verdict, issues, raw }`.
 */
export async function checkGrounding(userInput, history, assistantText, client, opts = {}) {
    const model = opts.model || evaluatorModel();
    const timeoutCtrl = new AbortController();
    const timer = setTimeout(() => timeoutCtrl.abort(), EVAL_TIMEOUT_MS);
    const linked = anySignal(opts.abortSignal
        ? [opts.abortSignal, timeoutCtrl.signal]
        : [timeoutCtrl.signal]);
    try {
        // Built inside the try so a malformed history degrades to SKIPPED
        // instead of rejecting.
        const summary = summarizeTurn(userInput, history, assistantText);
        // The evaluator gets NO tools (it only reads and grades), and the
        // token cap stops a chatty evaluator from inflating the cost.
        const response = await client.complete({
            model,
            system: EVALUATOR_PROMPT,
            messages: [{ role: 'user', content: summary }],
            tools: [],
            max_tokens: MAX_EVAL_TOKENS,
        }, linked.signal);
        let raw = '';
        for (const part of response.content) {
            if (typeof part === 'object' && part !== null && part.type === 'text' && part.text) {
                raw += part.text;
            }
        }
        if (!raw.trim()) {
            return { verdict: 'SKIPPED', issues: [], raw: '(empty response)' };
        }
        return parseGroundingResponse(raw);
    }
    catch (err) {
        // Thrown values are not guaranteed to be Error instances.
        const reason = err instanceof Error ? err.message : String(err);
        return {
            verdict: 'SKIPPED',
            issues: [],
            raw: `(evaluator error: ${reason})`,
        };
    }
    finally {
        clearTimeout(timer);
        // Detach from the caller's signal; it may outlive this call, and
        // leftover listeners would pile up across turns.
        linked.dispose();
    }
}
/**
 * Compose multiple AbortSignals into one — aborts when any source aborts.
 *
 * @param signals  Source signals to follow.
 * @returns `signal`, the composed signal, and `dispose`, which removes the
 *          listeners this helper attached to the sources. Call `dispose`
 *          once the composed signal is no longer needed.
 */
function anySignal(signals) {
    const ctrl = new AbortController();
    const onAbort = () => ctrl.abort();
    for (const s of signals) {
        if (s.aborted) {
            ctrl.abort();
            break;
        }
        s.addEventListener('abort', onAbort, { once: true });
    }
    const dispose = () => {
        for (const s of signals) {
            s.removeEventListener('abort', onAbort);
        }
    };
    return { signal: ctrl.signal, dispose };
}
217
+ // ─── Render result for the UI ────────────────────────────────────────────
218
/**
 * Turn a grounding result into the follow-up note shown to the user.
 *
 * GROUNDED and SKIPPED results yield an empty string, so nothing is shown.
 * Any other verdict yields a markdown block: a warning header (worded more
 * strongly for UNGROUNDED), the issues as "- " bullets or a placeholder
 * when there are none, and a closing hint on how to re-ask or disable the
 * checks.
 */
export function renderGroundingFollowup(result) {
    const { verdict } = result;
    switch (verdict) {
        case 'GROUNDED':
        case 'SKIPPED':
            return '';
    }
    let header;
    if (verdict === 'UNGROUNDED') {
        header = '⚠️ **Grounding check failed** — the previous answer relied on memory where a tool call was available:';
    }
    else {
        header = '⚠️ **Grounding check flagged some claims** — re-run with the suggested tools for a verified answer:';
    }
    let body;
    if (result.issues.length > 0) {
        body = result.issues.map(item => `- ${item}`).join('\n');
    }
    else {
        body = '(evaluator returned no specific items — check the transcript manually)';
    }
    const footer = '_Ask again with an explicit instruction to call the tools, or disable these checks with `FRANKLIN_NO_EVAL=1`._';
    // Two leading blank lines, header, body, one blank line, footer.
    return ['', '', header, body, '', footer].join('\n');
}
@@ -25,6 +25,7 @@ import { routeRequest, parseRoutingProfile } from '../router/index.js';
25
25
  import { recordOutcome } from '../router/local-elo.js';
26
26
  import { shouldPlan, getPlanningPrompt, getExecutorModel, isExecutorStuck, toolCallSignature } from './planner.js';
27
27
  import { shouldVerify, runVerification } from './verification.js';
28
+ import { shouldCheckGrounding, checkGrounding, renderGroundingFollowup } from './evaluator.js';
28
29
  import { createSessionId, appendToSession, updateSessionMeta, pruneOldSessions, loadSessionHistory, loadSessionMeta, } from '../session/storage.js';
29
30
  /**
30
31
  * Atomically replace all elements in a history array.
@@ -1073,7 +1074,10 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
1073
1074
  });
1074
1075
  }
1075
1076
  }
1076
- // ── Verification gate: run adversarial checks on substantial work ──
1077
+ // ── Verification gate: run adversarial checks on substantial CODE work ──
1078
+ // Fires when the agent Edit/Write/Bash-ed enough to warrant running
1079
+ // the build + tests. Complements the grounding check below, which
1080
+ // covers read-heavy answers this verifier misses.
1077
1081
  if (shouldVerify(turnToolCalls, turnToolCounts, lastUserInput || '')) {
1078
1082
  try {
1079
1083
  const vResult = await runVerification(history, capabilityMap, client, {
@@ -1102,6 +1106,31 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
1102
1106
  // Verification errors never block the main flow
1103
1107
  }
1104
1108
  }
1109
+ // ── Grounding gate: check that factual claims trace to tool calls ──
1110
+ // Fires on any substantive answer to a non-trivial question. Designed
1111
+ // to catch the failure mode the code-verifier misses: model answers
1112
+ // a "what's X / should I buy Y" question from memory instead of
1113
+ // calling the live tools. Evaluator runs as a separate agent on a
1114
+ // cheap model; never blocks the turn, only appends a ⚠️ note when
1115
+ // the answer looks ungrounded so the user can re-ask.
1116
+ try {
1117
+ const assistantText = responseParts
1118
+ .filter(p => p.type === 'text' && typeof p.text === 'string')
1119
+ .map(p => p.text)
1120
+ .join('');
1121
+ if (shouldCheckGrounding(lastUserInput || '', assistantText)) {
1122
+ const gResult = await checkGrounding(lastUserInput, history, assistantText, client, {
1123
+ abortSignal: abort.signal,
1124
+ });
1125
+ const followup = renderGroundingFollowup(gResult);
1126
+ if (followup) {
1127
+ onEvent({ kind: 'text_delta', text: followup });
1128
+ }
1129
+ }
1130
+ }
1131
+ catch {
1132
+ // Grounding check is best-effort — never block the main flow.
1133
+ }
1105
1134
  // Record success for local Elo learning (include tool call count for efficiency)
1106
1135
  if (lastRoutedCategory && lastRoutedModel) {
1107
1136
  recordOutcome(lastRoutedCategory, lastRoutedModel, 'continued', turnToolCalls);
@@ -17,6 +17,10 @@ const MULTI_STEP_PATTERN = /first.*then|step\s+\d|\d+\.\s|and\s+then|after\s+tha
17
17
  * the overhead of an extra planning call.
18
18
  */
19
19
  export function shouldPlan(tier, profile, userText, ultrathink, planDisabled) {
20
+ // Per-process opt-out for ablation / scripting ("is plan-then-execute
21
+ // still load-bearing?"). Takes precedence over every other heuristic.
22
+ if (process.env.FRANKLIN_NOPLAN === '1')
23
+ return false;
20
24
  // User disabled planning for this session
21
25
  if (planDisabled)
22
26
  return false;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@blockrun/franklin",
3
- "version": "3.8.13",
3
+ "version": "3.8.15",
4
4
  "description": "Franklin — The AI agent with a wallet. Spends USDC autonomously to get real work done. Pay per action, no subscriptions.",
5
5
  "type": "module",
6
6
  "exports": {