@blockrun/franklin 3.8.13 → 3.8.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/evaluator.d.ts +56 -0
- package/dist/agent/evaluator.js +233 -0
- package/dist/agent/loop.js +30 -1
- package/package.json +1 -1
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Grounding evaluator — a cheap second-pass check that every factual claim
|
|
3
|
+
* in Franklin's answer traces back to a tool-call result, not model memory.
|
|
4
|
+
*
|
|
5
|
+
* Why this exists (2026-04 retrospective): the CRCL incident — user asked
|
|
6
|
+
* about a stock Franklin had tools to query, Franklin answered from 2022
|
|
7
|
+
* training data instead. Root cause wasn't a prompt defect; it was an
|
|
8
|
+
* absent evaluator. The existing `verification.ts` only fires when the
|
|
9
|
+
* agent writes code (Edit / Write / Bash threshold), so read-heavy hero
|
|
10
|
+
* use cases (trading, research, analysis) never triggered any quality gate.
|
|
11
|
+
*
|
|
12
|
+
* This module is the complement: fires on *answers with factual content*,
|
|
13
|
+
* regardless of tool type. Anthropic's harness-design article calls out
|
|
14
|
+
* "self-evaluation on complex tasks" as anti-pattern #14 — models skew
|
|
15
|
+
* positive when grading themselves. So the check runs as a separate agent
|
|
16
|
+
* (different system prompt, explicitly adversarial) with its own model.
|
|
17
|
+
*
|
|
18
|
+
* v1 scope: check only, never re-prompt. Emit a follow-up ⚠️ event when
|
|
19
|
+
* claims look ungrounded, let the user decide whether to re-ask. The
|
|
20
|
+
* re-prompt loop (generator iterates against evaluator findings until
|
|
21
|
+
* PASS) is a v2 concern once we know v1 catches real cases without
|
|
22
|
+
* false-positive noise.
|
|
23
|
+
*/
|
|
24
|
+
import type { CapabilityHandler, Dialogue } from './types.js';
|
|
25
|
+
import { ModelClient } from './llm.js';
|
|
26
|
+
/**
 * Outcome labels for a grounding check. `GROUNDED`, `PARTIAL` and
 * `UNGROUNDED` are the verdicts the evaluator is asked to emit; `SKIPPED`
 * is produced locally when the evaluator returned nothing or the call failed.
 */
export type GroundingVerdict =
    | 'GROUNDED'
    | 'PARTIAL'
    | 'UNGROUNDED'
    | 'SKIPPED';
/** Parsed result of one grounding check. */
export interface GroundingResult {
    /** Overall verdict for the answer that was checked. */
    verdict: GroundingVerdict;
    /** One entry per bullet line found in the evaluator reply, bullet marker removed. */
    issues: Array<string>;
    /** Trimmed evaluator reply, or a parenthesised diagnostic when the check was skipped. */
    raw: string;
}
|
|
32
|
+
/**
|
|
33
|
+
* Decide whether this turn warrants a grounding check. Principles:
|
|
34
|
+
* - Non-trivial user input (not a greeting, not a slash command)
|
|
35
|
+
* - Non-trivial assistant text output (not just a tool-result echo)
|
|
36
|
+
*
|
|
37
|
+
* Intentionally NOT gating on tool-type (read vs write) — the whole point
|
|
38
|
+
* of this module is to cover read-heavy turns the code verifier misses.
|
|
39
|
+
*/
|
|
40
|
+
export declare function shouldCheckGrounding(userInput: string, assistantText: string): boolean;
|
|
41
|
+
export declare function parseGroundingResponse(raw: string): GroundingResult;
|
|
42
|
+
/** Cheap model for grading. Default matches existing verification.ts
|
|
43
|
+
* choice so both quality gates have the same cost profile. Override via
|
|
44
|
+
* `FRANKLIN_EVALUATOR_MODEL` to experiment with accuracy/cost trade-offs. */
|
|
45
|
+
export declare function evaluatorModel(): string;
|
|
46
|
+
export declare function checkGrounding(userInput: string, history: Dialogue[], assistantText: string, client: ModelClient, opts?: {
|
|
47
|
+
abortSignal?: AbortSignal;
|
|
48
|
+
model?: string;
|
|
49
|
+
}): Promise<GroundingResult>;
|
|
50
|
+
/**
|
|
51
|
+
* Convert a grounding result into a user-facing follow-up message. Returns
|
|
52
|
+
* empty string when verdict is GROUNDED / SKIPPED — no reason to spam the
|
|
53
|
+
* user when the check agreed the answer was sound.
|
|
54
|
+
*/
|
|
55
|
+
export declare function renderGroundingFollowup(result: GroundingResult): string;
|
|
56
|
+
export type { CapabilityHandler };
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Grounding evaluator — a cheap second-pass check that every factual claim
|
|
3
|
+
* in Franklin's answer traces back to a tool-call result, not model memory.
|
|
4
|
+
*
|
|
5
|
+
* Why this exists (2026-04 retrospective): the CRCL incident — user asked
|
|
6
|
+
* about a stock Franklin had tools to query, Franklin answered from 2022
|
|
7
|
+
* training data instead. Root cause wasn't a prompt defect; it was an
|
|
8
|
+
* absent evaluator. The existing `verification.ts` only fires when the
|
|
9
|
+
* agent writes code (Edit / Write / Bash threshold), so read-heavy hero
|
|
10
|
+
* use cases (trading, research, analysis) never triggered any quality gate.
|
|
11
|
+
*
|
|
12
|
+
* This module is the complement: fires on *answers with factual content*,
|
|
13
|
+
* regardless of tool type. Anthropic's harness-design article calls out
|
|
14
|
+
* "self-evaluation on complex tasks" as anti-pattern #14 — models skew
|
|
15
|
+
* positive when grading themselves. So the check runs as a separate agent
|
|
16
|
+
* (different system prompt, explicitly adversarial) with its own model.
|
|
17
|
+
*
|
|
18
|
+
* v1 scope: check only, never re-prompt. Emit a follow-up ⚠️ event when
|
|
19
|
+
* claims look ungrounded, let the user decide whether to re-ask. The
|
|
20
|
+
* re-prompt loop (generator iterates against evaluator findings until
|
|
21
|
+
* PASS) is a v2 concern once we know v1 catches real cases without
|
|
22
|
+
* false-positive noise.
|
|
23
|
+
*/
|
|
24
|
+
// ─── Evaluator system prompt ─────────────────────────────────────────────
|
|
25
|
+
//
|
|
26
|
+
// Principle-based, not example-enumerating. Specific tickers or phrasings
|
|
27
|
+
// hard-coded here would rot the moment the market changes. The rule is
|
|
28
|
+
// general: claim → tool result or explicit uncertainty.
|
|
29
|
+
const EVALUATOR_PROMPT = `You are a GROUNDING CHECK agent. Your job is to verify that an AI assistant's answer is grounded in tool-call evidence, not model memory.
|
|
30
|
+
|
|
31
|
+
## What you receive
|
|
32
|
+
- The user's question
|
|
33
|
+
- A list of tool calls made this turn (tool name, input summary, whether it succeeded)
|
|
34
|
+
- The assistant's final text answer
|
|
35
|
+
|
|
36
|
+
## What you check
|
|
37
|
+
Every **factual claim** in the answer must trace to ONE of:
|
|
38
|
+
(a) A successful tool call result from this turn, OR
|
|
39
|
+
(b) Explicit acknowledgment of uncertainty ("I'm not sure", "based on older data", "I'd need to check")
|
|
40
|
+
|
|
41
|
+
Claims that are ungrounded:
|
|
42
|
+
- Specific current-world facts stated with confidence but not backed by any tool call this turn
|
|
43
|
+
- Recommendations or conclusions that depend on unstated data (e.g. "you should sell" without a price lookup)
|
|
44
|
+
- Invented specifics — names, numbers, dates the model produced without a tool call supporting them
|
|
45
|
+
|
|
46
|
+
Claims that are grounded:
|
|
47
|
+
- Anything directly derived from a tool result shown in the turn
|
|
48
|
+
- General knowledge / definitions / reasoning that doesn't depend on current-world specifics
|
|
49
|
+
- Claims explicitly hedged as uncertain
|
|
50
|
+
|
|
51
|
+
## Output — exact format
|
|
52
|
+
|
|
53
|
+
VERDICT: GROUNDED | PARTIAL | UNGROUNDED
|
|
54
|
+
|
|
55
|
+
If not GROUNDED, list each ungrounded claim on its own line starting with "- " and the tool that should have been called, like:
|
|
56
|
+
- Claim: "<the ungrounded part, quoted briefly>" → missing tool: <TradingMarket | ExaAnswer | ExaSearch | WebSearch | ...>
|
|
57
|
+
|
|
58
|
+
Empty line between verdict and list. No other text. No preamble. No apology. Be terse.`;
|
|
59
|
+
// ─── Trigger policy ──────────────────────────────────────────────────────
|
|
60
|
+
// Inputs shorter than this are treated as greetings/acks rather than questions.
const MIN_USER_CHARS = 20;
// Answers shorter than this are treated as acks rather than factual claims.
const MIN_ANSWER_CHARS = 50;
/**
 * Gate for the grounding check: returns true only when the turn is worth
 * sending to the evaluator.
 *
 * A turn qualifies when all of the following hold:
 *   - the `FRANKLIN_NO_EVAL` environment variable is not exactly '1'
 *   - the trimmed user input has at least MIN_USER_CHARS characters
 *   - the trimmed user input is not a slash command (does not start with '/')
 *   - the trimmed assistant text has at least MIN_ANSWER_CHARS characters
 *
 * Tool type (read vs write) is deliberately not part of the decision, so
 * read-heavy turns that the code verifier ignores are still covered.
 *
 * @param userInput     Raw text the user typed for this turn.
 * @param assistantText Concatenated text of the assistant's answer.
 * @returns true when a grounding check should run.
 */
export function shouldCheckGrounding(userInput, assistantText) {
    const optedOut = process.env.FRANKLIN_NO_EVAL === '1';
    if (optedOut) {
        return false;
    }
    const question = userInput.trim();
    const substantiveQuestion = question.length >= MIN_USER_CHARS && !question.startsWith('/');
    // Short-circuit keeps the original evaluation order: the answer is only
    // inspected once the question has qualified.
    return substantiveQuestion && assistantText.trim().length >= MIN_ANSWER_CHARS;
}
|
|
82
|
+
// ─── Turn summary extraction ─────────────────────────────────────────────
|
|
83
|
+
/**
|
|
84
|
+
* Summarize the current turn for the evaluator: user question + tool calls
|
|
85
|
+
* + tool result snippets + assistant's final answer. Bounded to keep the
|
|
86
|
+
* evaluator call cheap; it doesn't need every byte of every tool output.
|
|
87
|
+
*/
|
|
88
|
+
function summarizeTurn(userInput, history, assistantText) {
|
|
89
|
+
const lines = [];
|
|
90
|
+
lines.push(`## User question`);
|
|
91
|
+
lines.push(userInput.trim().slice(0, 800));
|
|
92
|
+
lines.push('');
|
|
93
|
+
lines.push(`## Tool calls this turn`);
|
|
94
|
+
// Walk from the end of history back to (but not including) the user message.
|
|
95
|
+
// Each assistant tool_use and each user tool_result get condensed to one line.
|
|
96
|
+
let found = 0;
|
|
97
|
+
const toolLines = [];
|
|
98
|
+
for (let i = history.length - 1; i >= 0 && found < 40; i--) {
|
|
99
|
+
const msg = history[i];
|
|
100
|
+
if (msg.role === 'user' && typeof msg.content === 'string')
|
|
101
|
+
break;
|
|
102
|
+
if (msg.role === 'assistant' && Array.isArray(msg.content)) {
|
|
103
|
+
for (const part of msg.content) {
|
|
104
|
+
if (typeof part === 'object' && part.type === 'tool_use') {
|
|
105
|
+
const inputStr = JSON.stringify(part.input).slice(0, 160);
|
|
106
|
+
toolLines.unshift(` - ${part.name}(${inputStr})`);
|
|
107
|
+
found++;
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
else if (msg.role === 'user' && Array.isArray(msg.content)) {
|
|
112
|
+
for (const part of msg.content) {
|
|
113
|
+
if (typeof part === 'object' && part.type === 'tool_result') {
|
|
114
|
+
const output = typeof part.content === 'string'
|
|
115
|
+
? part.content
|
|
116
|
+
: Array.isArray(part.content)
|
|
117
|
+
? part.content.map(c => c.text || '').join('\n')
|
|
118
|
+
: '';
|
|
119
|
+
const snippet = output.slice(0, 240).replace(/\s+/g, ' ');
|
|
120
|
+
toolLines.unshift(` → ${snippet}`);
|
|
121
|
+
found++;
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
if (toolLines.length === 0) {
|
|
127
|
+
lines.push(' (none)');
|
|
128
|
+
}
|
|
129
|
+
else {
|
|
130
|
+
lines.push(...toolLines);
|
|
131
|
+
}
|
|
132
|
+
lines.push('');
|
|
133
|
+
lines.push(`## Assistant's answer`);
|
|
134
|
+
lines.push(assistantText.trim().slice(0, 2400));
|
|
135
|
+
return lines.join('\n');
|
|
136
|
+
}
|
|
137
|
+
// ─── Verdict parser ──────────────────────────────────────────────────────
|
|
138
|
+
/**
 * Parse the evaluator's free-text reply into a structured result.
 *
 * Verdict: looks for the `VERDICT` label followed by one of GROUNDED,
 * PARTIAL or UNGROUNDED (case-insensitive). Markdown decoration around the
 * label or the value (`**VERDICT:** UNGROUNDED`, `VERDICT: **GROUNDED**`)
 * is tolerated. If no verdict is found, or the verdict line also names a
 * different upper-case verdict (the evaluator echoed the format template
 * `GROUNDED | PARTIAL | UNGROUNDED`), the result falls back to 'PARTIAL' so
 * the answer is flagged for the user rather than silently passed.
 *
 * Issues: every line that starts with a `-`, `*` or `•` bullet followed by
 * whitespace and is longer than three characters, with the bullet removed.
 *
 * @param raw Evaluator reply text. Non-string input is treated as empty.
 * @returns `{ verdict, issues, raw }` where `raw` is the trimmed reply.
 */
export function parseGroundingResponse(raw) {
    const text = typeof raw === 'string' ? raw.trim() : '';
    // Group 1: verdict word; group 2: the rest of that line.
    // UNGROUNDED is listed first only for readability; the letter lookahead
    // is what stops a verdict from matching inside a longer word.
    const m = text.match(/VERDICT[\s:*_`]*(UNGROUNDED|GROUNDED|PARTIAL)(?![A-Za-z])([^\n]*)/i);
    let verdict = 'PARTIAL';
    if (m) {
        const candidate = m[1].toUpperCase();
        // Case-sensitive on purpose: only an echoed template lists the other
        // verdicts in capitals, while explanatory prose ("nothing is
        // grounded") should not invalidate a clear verdict.
        const others = m[2].match(/(?<![A-Za-z])(?:UNGROUNDED|GROUNDED|PARTIAL)(?![A-Za-z])/g) || [];
        const ambiguous = others.some(token => token !== candidate);
        if (!ambiguous) {
            verdict = candidate;
        }
    }
    const issues = [];
    for (const line of text.split('\n')) {
        const l = line.trim();
        if (l.length > 3 && /^[-*•]\s/.test(l)) {
            issues.push(l.slice(1).trim());
        }
    }
    return { verdict, issues, raw: text };
}
|
|
154
|
+
// ─── Default evaluator model ─────────────────────────────────────────────
|
|
155
|
+
/**
 * Resolve the model id used for grading.
 *
 * A non-empty `FRANKLIN_EVALUATOR_MODEL` environment variable wins, which
 * allows experimenting with accuracy/cost trade-offs; otherwise the
 * built-in cheap default is used.
 *
 * @returns The model identifier to pass to the client.
 */
export function evaluatorModel() {
    const override = process.env.FRANKLIN_EVALUATOR_MODEL;
    if (override) {
        return override;
    }
    return 'nvidia/nemotron-ultra-253b';
}
|
|
161
|
+
// ─── Run grounding check ─────────────────────────────────────────────────
|
|
162
|
+
// Output cap for the evaluator so a chatty model can't balloon the cost.
const MAX_EVAL_TOKENS = 512;
// Hard deadline for the evaluator request.
const EVAL_TIMEOUT_MS = 15_000;
/**
 * Run the grounding check for one turn.
 *
 * Summarizes the turn, sends it to the evaluator model with the evaluator
 * system prompt and no tools, and parses the reply. The request is
 * cancelled after EVAL_TIMEOUT_MS or as soon as `opts.abortSignal` fires,
 * whichever comes first.
 *
 * This function is best-effort and does not reject: any failure
 * (summarizing, the request, an empty reply) is reported as a result with
 * verdict 'SKIPPED' and a parenthesised diagnostic in `raw`.
 *
 * @param userInput     The user's question for this turn.
 * @param history       Conversation messages used to list the tool calls.
 * @param assistantText The assistant's final text answer.
 * @param client        Object exposing `complete(request, signal)`.
 * @param opts          Optional `abortSignal` and `model` override.
 * @returns The parsed grounding result.
 */
export async function checkGrounding(userInput, history, assistantText, client, opts = {}) {
    const outer = opts.abortSignal;
    // Nothing to do if the caller has already cancelled — don't spend a request.
    if (outer && outer.aborted) {
        return { verdict: 'SKIPPED', issues: [], raw: '(aborted)' };
    }
    const model = opts.model || evaluatorModel();
    const ctrl = new AbortController();
    const timer = setTimeout(() => ctrl.abort(), EVAL_TIMEOUT_MS);
    // Forward the caller's abort into our controller. The listener is
    // removed in `finally`, so a signal reused across many turns does not
    // accumulate one listener per grounding check.
    const forwardAbort = () => ctrl.abort();
    if (outer) {
        outer.addEventListener('abort', forwardAbort, { once: true });
    }
    try {
        // Inside the try so a malformed history degrades to SKIPPED.
        const summary = summarizeTurn(userInput, history, assistantText);
        // The evaluator gets NO tools: it only reads and grades.
        const response = await client.complete({
            model,
            system: EVALUATOR_PROMPT,
            messages: [{ role: 'user', content: summary }],
            tools: [],
            max_tokens: MAX_EVAL_TOKENS,
        }, ctrl.signal);
        let raw = '';
        for (const part of response.content) {
            if (part !== null && typeof part === 'object' && part.type === 'text' && part.text) {
                raw += part.text;
            }
        }
        if (!raw.trim()) {
            return { verdict: 'SKIPPED', issues: [], raw: '(empty response)' };
        }
        return parseGroundingResponse(raw);
    }
    catch (err) {
        // Thrown values are not guaranteed to be Error instances.
        const reason = err instanceof Error ? err.message : String(err);
        return {
            verdict: 'SKIPPED',
            issues: [],
            raw: `(evaluator error: ${reason})`,
        };
    }
    finally {
        clearTimeout(timer);
        if (outer) {
            outer.removeEventListener('abort', forwardAbort);
        }
    }
}
|
|
205
|
+
/**
 * Compose multiple AbortSignals into one that aborts as soon as any source
 * aborts.
 *
 * The composed signal carries the `reason` of the source that fired. Once
 * one source has fired, the listeners registered on the other sources are
 * removed so they do not linger on long-lived signals.
 *
 * @param signals Source signals to watch; may be empty.
 * @returns A signal that is aborted when (or if already) any source is.
 */
function anySignal(signals) {
    const ctrl = new AbortController();
    // [signal, listener] pairs still registered, for cleanup.
    const subscriptions = [];
    const trip = (source) => {
        for (const [sig, listener] of subscriptions) {
            sig.removeEventListener('abort', listener);
        }
        subscriptions.length = 0;
        ctrl.abort(source.reason);
    };
    for (const s of signals) {
        if (s.aborted) {
            // Already aborted: abort immediately and stop subscribing.
            trip(s);
            break;
        }
        const onAbort = () => trip(s);
        s.addEventListener('abort', onAbort, { once: true });
        subscriptions.push([s, onAbort]);
    }
    return ctrl.signal;
}
|
|
217
|
+
// ─── Render result for the UI ────────────────────────────────────────────
|
|
218
|
+
/**
 * Turn a grounding result into the follow-up text shown to the user.
 *
 * GROUNDED and SKIPPED produce an empty string, so nothing is shown when
 * the check agreed with the answer or could not run. Any other verdict
 * produces a ⚠️ header (worded more strongly for UNGROUNDED), the list of
 * flagged items (or a placeholder when there are none), and a footer
 * explaining how to re-ask or disable the check.
 *
 * @param result Parsed grounding result.
 * @returns Markdown text starting with a blank line, or '' when silent.
 */
export function renderGroundingFollowup(result) {
    switch (result.verdict) {
        case 'GROUNDED':
        case 'SKIPPED':
            return '';
    }
    let header;
    if (result.verdict === 'UNGROUNDED') {
        header = '⚠️ **Grounding check failed** — the previous answer relied on memory where a tool call was available:';
    }
    else {
        header = '⚠️ **Grounding check flagged some claims** — re-run with the suggested tools for a verified answer:';
    }
    const items = result.issues;
    let body;
    if (items.length > 0) {
        body = items.map(item => `- ${item}`).join('\n');
    }
    else {
        body = '(evaluator returned no specific items — check the transcript manually)';
    }
    const footer = '_Ask again with an explicit instruction to call the tools, or disable these checks with `FRANKLIN_NO_EVAL=1`._';
    // Two leading empty entries give the blank line that separates the
    // note from the answer it follows.
    return ['', '', header, body, '', footer].join('\n');
}
|
package/dist/agent/loop.js
CHANGED
|
@@ -25,6 +25,7 @@ import { routeRequest, parseRoutingProfile } from '../router/index.js';
|
|
|
25
25
|
import { recordOutcome } from '../router/local-elo.js';
|
|
26
26
|
import { shouldPlan, getPlanningPrompt, getExecutorModel, isExecutorStuck, toolCallSignature } from './planner.js';
|
|
27
27
|
import { shouldVerify, runVerification } from './verification.js';
|
|
28
|
+
import { shouldCheckGrounding, checkGrounding, renderGroundingFollowup } from './evaluator.js';
|
|
28
29
|
import { createSessionId, appendToSession, updateSessionMeta, pruneOldSessions, loadSessionHistory, loadSessionMeta, } from '../session/storage.js';
|
|
29
30
|
/**
|
|
30
31
|
* Atomically replace all elements in a history array.
|
|
@@ -1073,7 +1074,10 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
|
|
|
1073
1074
|
});
|
|
1074
1075
|
}
|
|
1075
1076
|
}
|
|
1076
|
-
// ── Verification gate: run adversarial checks on substantial work ──
|
|
1077
|
+
// ── Verification gate: run adversarial checks on substantial CODE work ──
|
|
1078
|
+
// Fires when the agent Edit/Write/Bash-ed enough to warrant running
|
|
1079
|
+
// the build + tests. Complements the grounding check below, which
|
|
1080
|
+
// covers read-heavy answers this verifier misses.
|
|
1077
1081
|
if (shouldVerify(turnToolCalls, turnToolCounts, lastUserInput || '')) {
|
|
1078
1082
|
try {
|
|
1079
1083
|
const vResult = await runVerification(history, capabilityMap, client, {
|
|
@@ -1102,6 +1106,31 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
|
|
|
1102
1106
|
// Verification errors never block the main flow
|
|
1103
1107
|
}
|
|
1104
1108
|
}
|
|
1109
|
+
// ── Grounding gate: check that factual claims trace to tool calls ──
|
|
1110
|
+
// Fires on any substantive answer to a non-trivial question. Designed
|
|
1111
|
+
// to catch the failure mode the code-verifier misses: model answers
|
|
1112
|
+
// a "what's X / should I buy Y" question from memory instead of
|
|
1113
|
+
// calling the live tools. Evaluator runs as a separate agent on a
|
|
1114
|
+
// cheap model; never blocks the turn, only appends a ⚠️ note when
|
|
1115
|
+
// the answer looks ungrounded so the user can re-ask.
|
|
1116
|
+
try {
|
|
1117
|
+
const assistantText = responseParts
|
|
1118
|
+
.filter(p => p.type === 'text' && typeof p.text === 'string')
|
|
1119
|
+
.map(p => p.text)
|
|
1120
|
+
.join('');
|
|
1121
|
+
if (shouldCheckGrounding(lastUserInput || '', assistantText)) {
|
|
1122
|
+
const gResult = await checkGrounding(lastUserInput, history, assistantText, client, {
|
|
1123
|
+
abortSignal: abort.signal,
|
|
1124
|
+
});
|
|
1125
|
+
const followup = renderGroundingFollowup(gResult);
|
|
1126
|
+
if (followup) {
|
|
1127
|
+
onEvent({ kind: 'text_delta', text: followup });
|
|
1128
|
+
}
|
|
1129
|
+
}
|
|
1130
|
+
}
|
|
1131
|
+
catch {
|
|
1132
|
+
// Grounding check is best-effort — never block the main flow.
|
|
1133
|
+
}
|
|
1105
1134
|
// Record success for local Elo learning (include tool call count for efficiency)
|
|
1106
1135
|
if (lastRoutedCategory && lastRoutedModel) {
|
|
1107
1136
|
recordOutcome(lastRoutedCategory, lastRoutedModel, 'continued', turnToolCalls);
|
package/package.json
CHANGED