@blockrun/franklin 3.8.12 → 3.8.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/context.js +8 -4
- package/dist/agent/evaluator.d.ts +56 -0
- package/dist/agent/evaluator.js +233 -0
- package/dist/agent/loop.js +30 -1
- package/package.json +1 -1
package/dist/agent/context.js
CHANGED
|
@@ -158,12 +158,16 @@ function getToolPatternsSection() {
|
|
|
158
158
|
- **Understanding code**: Glob for structure → Read key files → Grep for specific symbols/patterns. Don't read every file in a directory.
|
|
159
159
|
- **Making changes**: Read the file → Edit with targeted replacement → verify the edit worked (Read again or run tests). Never Edit without Reading first.
|
|
160
160
|
- **Running commands**: Use Bash for shell operations that have no dedicated tool. Chain commands with && when sequential. Use separate Bash calls when you need to inspect intermediate output.
|
|
161
|
-
- **
|
|
162
|
-
- **Current events / "what happened to X" / "why did X drop"**: call **ExaAnswer** for a cited synthesized answer. For breadth use **ExaSearch** then **ExaReadUrls** on the best results. Never guess at recent events from training data — the model cutoff is older than the question.
|
|
163
|
-
- **General web research**: WebSearch for discovery → WebFetch for specific URLs from search results. Don't WebFetch URLs you invented.
|
|
161
|
+
- **Research**: WebSearch for discovery → WebFetch for specific URLs from search results. Don't WebFetch URLs you invented.
|
|
164
162
|
- **Complex tasks**: Use Agent to spawn sub-agents for 2+ independent research or implementation tasks. Don't do sequentially what can be done in parallel.
|
|
165
163
|
- **Multiple independent lookups**: Call all tools in a single response. NEVER make sequential calls when parallel calls would work.
|
|
166
|
-
|
|
164
|
+
|
|
165
|
+
# Grounding Before Answering
|
|
166
|
+
Your training data is frozen in the past. Live-world questions MUST be answered from tool results, not memory.
|
|
167
|
+
- Any question about a current price, quote, market state, or "should I buy/sell/hold X" → use **TradingMarket** (crypto/FX/commodity are free; stocks cost \$0.001 via the wallet).
|
|
168
|
+
- Any "what happened / why did it change / latest news on X" → use **ExaAnswer** for a cited synthesized answer, or **ExaSearch** + **ExaReadUrls** when you need more depth.
|
|
169
|
+
- If the user names a thing you don't recognize (a company, ticker, project), don't demand clarification — call the research tools and figure it out. You have a wallet to spend on exactly this.
|
|
170
|
+
- If a tool returns an error (rate-limit, 404, insufficient funds), say so plainly and suggest the next action. Don't silently fall back to memory.`;
|
|
167
171
|
}
|
|
168
172
|
function getTokenEfficiencySection() {
|
|
169
173
|
return `# Token Efficiency
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Grounding evaluator — a cheap second-pass check that every factual claim
|
|
3
|
+
* in Franklin's answer traces back to a tool-call result, not model memory.
|
|
4
|
+
*
|
|
5
|
+
* Why this exists (2026-04 retrospective): the CRCL incident — user asked
|
|
6
|
+
* about a stock Franklin had tools to query, Franklin answered from 2022
|
|
7
|
+
* training data instead. Root cause wasn't a prompt defect; it was an
|
|
8
|
+
* absent evaluator. The existing `verification.ts` only fires when the
|
|
9
|
+
* agent writes code (Edit / Write / Bash threshold), so read-heavy hero
|
|
10
|
+
* use cases (trading, research, analysis) never triggered any quality gate.
|
|
11
|
+
*
|
|
12
|
+
* This module is the complement: fires on *answers with factual content*,
|
|
13
|
+
* regardless of tool type. Anthropic's harness-design article calls out
|
|
14
|
+
* "self-evaluation on complex tasks" as anti-pattern #14 — models skew
|
|
15
|
+
* positive when grading themselves. So the check runs as a separate agent
|
|
16
|
+
* (different system prompt, explicitly adversarial) with its own model.
|
|
17
|
+
*
|
|
18
|
+
* v1 scope: check only, never re-prompt. Emit a follow-up ⚠️ event when
|
|
19
|
+
* claims look ungrounded, let the user decide whether to re-ask. The
|
|
20
|
+
* re-prompt loop (generator iterates against evaluator findings until
|
|
21
|
+
* PASS) is a v2 concern once we know v1 catches real cases without
|
|
22
|
+
* false-positive noise.
|
|
23
|
+
*/
|
|
24
|
+
import type { CapabilityHandler, Dialogue } from './types.js';
|
|
25
|
+
import { ModelClient } from './llm.js';
|
|
26
|
+
/**
 * Outcome label of a grounding check.
 *
 * `GROUNDED`, `PARTIAL` and `UNGROUNDED` are the verdicts the evaluator model
 * is asked to emit. `PARTIAL` is also the fallback when the evaluator's reply
 * contains no parseable verdict. `SKIPPED` is produced locally when the
 * evaluator call fails or returns no text.
 */
export type GroundingVerdict = 'GROUNDED' | 'PARTIAL' | 'UNGROUNDED' | 'SKIPPED';
|
|
27
|
+
/** Parsed outcome of a single evaluator call. */
export interface GroundingResult {
    /** Verdict extracted from the evaluator reply (or `SKIPPED` when no usable reply was obtained). */
    verdict: GroundingVerdict;
    /** Bullet lines from the evaluator reply with the leading bullet marker stripped; may be empty. */
    issues: string[];
    /** The trimmed evaluator reply, or a parenthesised placeholder describing why the check was skipped. */
    raw: string;
}
|
|
32
|
+
/**
 * Decide whether this turn warrants a grounding check. Principles:
 *   - Non-trivial user input (not a greeting, not a slash command)
 *   - Non-trivial assistant text output (not just a tool-result echo)
 *
 * Concretely, the implementation returns false when `FRANKLIN_NO_EVAL` is
 * `'1'`, when the trimmed user input is shorter than 20 characters or starts
 * with `/`, or when the trimmed assistant text is shorter than 50 characters.
 *
 * Intentionally NOT gating on tool-type (read vs write) — the whole point
 * of this module is to cover read-heavy turns the code verifier misses.
 *
 * @param userInput The raw user message for the turn.
 * @param assistantText The assistant's final text for the turn.
 * @returns `true` when the turn should be sent to the evaluator.
 */
export declare function shouldCheckGrounding(userInput: string, assistantText: string): boolean;
|
|
41
|
+
/**
 * Parse the evaluator's raw text reply into a structured result.
 *
 * Looks for a `VERDICT: <GROUNDED|PARTIAL|UNGROUNDED>` marker (case-insensitive)
 * and collects bullet lines as issues. When no verdict marker is found the
 * result defaults to `PARTIAL` so the answer is still flagged for the user.
 *
 * @param raw The evaluator model's text output.
 * @returns The verdict, the extracted issue lines, and the trimmed reply.
 */
export declare function parseGroundingResponse(raw: string): GroundingResult;
|
|
42
|
+
/**
 * Cheap model for grading. Default matches existing verification.ts
 * choice so both quality gates have the same cost profile. Override via
 * `FRANKLIN_EVALUATOR_MODEL` to experiment with accuracy/cost trade-offs.
 *
 * @returns The value of `FRANKLIN_EVALUATOR_MODEL` when it is set and
 *   non-empty, otherwise `'nvidia/nemotron-ultra-253b'`.
 */
export declare function evaluatorModel(): string;
|
|
46
|
+
/**
 * Run the grounding check for one turn.
 *
 * Builds a bounded summary of the turn (user question, tool calls and result
 * snippets found in `history`, and the assistant's answer), sends it to the
 * evaluator model with no tools and a small token budget, and parses the
 * reply. The request is aborted after 15 seconds or when `opts.abortSignal`
 * fires, whichever comes first. If the evaluator call throws or returns no
 * text, the promise resolves to a `SKIPPED` result rather than a verdict.
 *
 * @param userInput The user's question for this turn.
 * @param history Conversation history; only the messages after the most recent plain-text user message are summarized.
 * @param assistantText The assistant's final text answer.
 * @param client Model client used to call the evaluator.
 * @param opts Optional overrides.
 * @returns The parsed grounding result.
 */
export declare function checkGrounding(userInput: string, history: Dialogue[], assistantText: string, client: ModelClient, opts?: {
    /** Caller-owned signal; aborting it cancels the evaluator request. */
    abortSignal?: AbortSignal;
    /** Evaluator model id; falls back to `evaluatorModel()` when omitted or empty. */
    model?: string;
}): Promise<GroundingResult>;
|
|
50
|
+
/**
 * Convert a grounding result into a user-facing follow-up message. Returns
 * empty string when verdict is GROUNDED / SKIPPED — no reason to spam the
 * user when the check agreed the answer was sound.
 *
 * For `UNGROUNDED` and `PARTIAL` the message is a Markdown block: a warning
 * header, one bullet per entry in `result.issues` (or a placeholder line when
 * there are none), and a closing hint that mentions `FRANKLIN_NO_EVAL=1`.
 *
 * @param result The result returned by the grounding check.
 * @returns The follow-up text to append to the answer, or `''`.
 */
export declare function renderGroundingFollowup(result: GroundingResult): string;
|
|
56
|
+
export type { CapabilityHandler };
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Grounding evaluator — a cheap second-pass check that every factual claim
|
|
3
|
+
* in Franklin's answer traces back to a tool-call result, not model memory.
|
|
4
|
+
*
|
|
5
|
+
* Why this exists (2026-04 retrospective): the CRCL incident — user asked
|
|
6
|
+
* about a stock Franklin had tools to query, Franklin answered from 2022
|
|
7
|
+
* training data instead. Root cause wasn't a prompt defect; it was an
|
|
8
|
+
* absent evaluator. The existing `verification.ts` only fires when the
|
|
9
|
+
* agent writes code (Edit / Write / Bash threshold), so read-heavy hero
|
|
10
|
+
* use cases (trading, research, analysis) never triggered any quality gate.
|
|
11
|
+
*
|
|
12
|
+
* This module is the complement: fires on *answers with factual content*,
|
|
13
|
+
* regardless of tool type. Anthropic's harness-design article calls out
|
|
14
|
+
* "self-evaluation on complex tasks" as anti-pattern #14 — models skew
|
|
15
|
+
* positive when grading themselves. So the check runs as a separate agent
|
|
16
|
+
* (different system prompt, explicitly adversarial) with its own model.
|
|
17
|
+
*
|
|
18
|
+
* v1 scope: check only, never re-prompt. Emit a follow-up ⚠️ event when
|
|
19
|
+
* claims look ungrounded, let the user decide whether to re-ask. The
|
|
20
|
+
* re-prompt loop (generator iterates against evaluator findings until
|
|
21
|
+
* PASS) is a v2 concern once we know v1 catches real cases without
|
|
22
|
+
* false-positive noise.
|
|
23
|
+
*/
|
|
24
|
+
// ─── Evaluator system prompt ─────────────────────────────────────────────
//
// System prompt for the grading model. It is passed as the `system` field of
// the evaluator request; the per-turn summary is sent as the user message.
//
// Principle-based, not example-enumerating. Specific tickers or phrasings
// hard-coded here would rot the moment the market changes. The rule is
// general: claim → tool result or explicit uncertainty.
//
// The "Output" section is a parsing contract: parseGroundingResponse() looks
// for the `VERDICT:` marker and for lines that start with "- ". Change the
// requested format and the parser together.
const EVALUATOR_PROMPT = `You are a GROUNDING CHECK agent. Your job is to verify that an AI assistant's answer is grounded in tool-call evidence, not model memory.

## What you receive
- The user's question
- A list of tool calls made this turn (tool name, input summary, whether it succeeded)
- The assistant's final text answer

## What you check
Every **factual claim** in the answer must trace to ONE of:
(a) A successful tool call result from this turn, OR
(b) Explicit acknowledgment of uncertainty ("I'm not sure", "based on older data", "I'd need to check")

Claims that are ungrounded:
- Specific current-world facts stated with confidence but not backed by any tool call this turn
- Recommendations or conclusions that depend on unstated data (e.g. "you should sell" without a price lookup)
- Invented specifics — names, numbers, dates the model produced without a tool call supporting them

Claims that are grounded:
- Anything directly derived from a tool result shown in the turn
- General knowledge / definitions / reasoning that doesn't depend on current-world specifics
- Claims explicitly hedged as uncertain

## Output — exact format

VERDICT: GROUNDED | PARTIAL | UNGROUNDED

If not GROUNDED, list each ungrounded claim on its own line starting with "- " and the tool that should have been called, like:
- Claim: "<the ungrounded part, quoted briefly>" → missing tool: <TradingMarket | ExaAnswer | ExaSearch | WebSearch | ...>

Empty line between verdict and list. No other text. No preamble. No apology. Be terse.`;
|
|
59
|
+
// ─── Trigger policy ──────────────────────────────────────────────────────
|
|
60
|
+
const MIN_USER_CHARS = 20; // Anything shorter is treated as a greeting/ack, not a question
const MIN_ANSWER_CHARS = 50; // Anything shorter is treated as an ack, not a factual answer
/**
 * Gate for the grounding check: is this turn worth sending to the evaluator?
 *
 * A turn qualifies only when all of the following hold:
 *   - the check is not disabled via `FRANKLIN_NO_EVAL=1`
 *   - the trimmed user input has at least MIN_USER_CHARS characters
 *   - the trimmed user input is not a slash command
 *   - the trimmed assistant text has at least MIN_ANSWER_CHARS characters
 *
 * Tool type (read vs write) is deliberately not part of the decision — this
 * gate exists to cover read-heavy turns that the code verifier never sees.
 *
 * @param userInput The raw user message for the turn.
 * @param assistantText The assistant's final text for the turn.
 * @returns true when the turn should be graded.
 */
export function shouldCheckGrounding(userInput, assistantText) {
    // Kill switch first, so nothing else is evaluated when checks are disabled.
    if (process.env.FRANKLIN_NO_EVAL === '1') {
        return false;
    }
    const question = userInput.trim();
    const isRealQuestion = question.length >= MIN_USER_CHARS && !question.startsWith('/');
    // Short-circuit keeps the answer untouched unless the question already qualified.
    return isRealQuestion && assistantText.trim().length >= MIN_ANSWER_CHARS;
}
|
|
82
|
+
// ─── Turn summary extraction ─────────────────────────────────────────────
|
|
83
|
+
/**
 * Summarize the current turn for the evaluator: user question + tool calls
 * + tool result snippets + assistant's final answer. Bounded to keep the
 * evaluator call cheap; it doesn't need every byte of every tool output.
 *
 * The turn is found by walking `history` backwards from the newest message
 * until the plain-string user message that opened the turn. Lines are emitted
 * in chronological order, including the order of parallel tool calls (and
 * their results) that share a single message.
 *
 * @param userInput The user's question (trimmed, capped at 800 chars).
 * @param history Conversation history; entries that are not message objects
 *   with array content are ignored.
 * @param assistantText The assistant's final answer (trimmed, capped at 2400 chars).
 * @returns A Markdown-ish text block with three sections.
 */
function summarizeTurn(userInput, history, assistantText) {
    const MAX_TOOL_LINES = 40;
    const messages = Array.isArray(history) ? history : [];
    const toolLines = [];
    for (let i = messages.length - 1; i >= 0 && toolLines.length < MAX_TOOL_LINES; i--) {
        const msg = messages[i];
        if (msg === null || typeof msg !== 'object')
            continue;
        // A user message with string content is the question that started this turn.
        if (msg.role === 'user' && typeof msg.content === 'string')
            break;
        if (!Array.isArray(msg.content))
            continue;
        // Collect this message's lines in forward order, then prepend them as a
        // group. Prepending part-by-part would reverse parallel tool calls.
        const messageLines = [];
        for (const part of msg.content) {
            // typeof null === 'object', so null parts must be rejected explicitly.
            if (part === null || typeof part !== 'object')
                continue;
            if (msg.role === 'assistant' && part.type === 'tool_use') {
                // JSON.stringify(undefined) is undefined — fall back to an empty summary.
                const inputStr = (JSON.stringify(part.input) ?? '').slice(0, 160);
                messageLines.push(` - ${part.name}(${inputStr})`);
            }
            else if (msg.role === 'user' && part.type === 'tool_result') {
                let output = '';
                if (typeof part.content === 'string') {
                    output = part.content;
                }
                else if (Array.isArray(part.content)) {
                    output = part.content.map(c => c?.text || '').join('\n');
                }
                const snippet = output.slice(0, 240).replace(/\s+/g, ' ');
                // The evaluator prompt promises "whether it succeeded".
                // NOTE(review): assumes failed results carry an `is_error` flag —
                // confirm against the Dialogue type; a no-op when the flag is absent.
                const status = part.is_error ? '[error] ' : '';
                messageLines.push(` → ${status}${snippet}`);
            }
        }
        toolLines.unshift(...messageLines);
    }
    return [
        `## User question`,
        userInput.trim().slice(0, 800),
        '',
        `## Tool calls this turn`,
        ...(toolLines.length === 0 ? [' (none)'] : toolLines),
        '',
        `## Assistant's answer`,
        assistantText.trim().slice(0, 2400),
    ].join('\n');
}
|
|
137
|
+
// ─── Verdict parser ──────────────────────────────────────────────────────
|
|
138
|
+
/**
 * Parse the evaluator's raw reply into `{ verdict, issues, raw }`.
 *
 * Verdict: the first `VERDICT <GROUNDED|PARTIAL|UNGROUNDED>` marker wins,
 * matched case-insensitively. Markdown decoration around the marker or the
 * value (`**VERDICT:** UNGROUNDED`, `VERDICT: **PARTIAL**`) is tolerated,
 * because chat models often add emphasis even when told not to. A verdict word
 * followed by `|` is rejected: that is the evaluator echoing the format
 * template (`GROUNDED | PARTIAL | UNGROUNDED`), not a decision. When no verdict
 * can be extracted the result is `PARTIAL`, so the answer is still flagged.
 *
 * Issues: every line that is a `-`, `*` or `•` bullet with at least two
 * characters of content, with the bullet marker stripped.
 *
 * @param raw The evaluator model's text output.
 * @returns The parsed verdict, the issue lines, and the trimmed reply.
 */
export function parseGroundingResponse(raw) {
    const text = String(raw ?? '').trim();
    const verdictMatch = text.match(/VERDICT\s*:?[\s*_`]*(UNGROUNDED|GROUNDED|PARTIAL)\b(?!\s*\|)/i);
    const verdict = verdictMatch
        ? verdictMatch[1].toUpperCase()
        : 'PARTIAL'; // If the evaluator couldn't produce a clean verdict, err on the side of "flag for the user".
    const issues = [];
    for (const line of text.split('\n')) {
        const l = line.trim();
        // Requires whitespace after the bullet, so "**VERDICT:**" is never taken for a bullet.
        const bullet = /^[-*•]\s+(\S.*)$/.exec(l);
        if (bullet && l.length > 3) {
            issues.push(bullet[1].trim());
        }
    }
    return { verdict, issues, raw: text };
}
|
|
154
|
+
// ─── Default evaluator model ─────────────────────────────────────────────
|
|
155
|
+
/**
 * Model id used for grading. The default is a cheap model, chosen to match
 * verification.ts so both quality gates share one cost profile. Set
 * `FRANKLIN_EVALUATOR_MODEL` to try a different accuracy/cost trade-off.
 *
 * @returns The env override when it is set and non-empty, else the default id.
 */
export function evaluatorModel() {
    const override = process.env.FRANKLIN_EVALUATOR_MODEL;
    if (override) {
        return override;
    }
    return 'nvidia/nemotron-ultra-253b';
}
|
|
161
|
+
// ─── Run grounding check ─────────────────────────────────────────────────
|
|
162
|
+
const MAX_EVAL_TOKENS = 512; // Cap on the evaluator's reply so a chatty grader can't inflate the cost
const EVAL_TIMEOUT_MS = 15_000; // Hard deadline for the evaluator request
/**
 * Run the grounding check for one turn and return the parsed result.
 *
 * The evaluator runs independently of the main agent: it gets its own system
 * prompt, NO tools (it only reads and grades), and a small token budget. The
 * request is cancelled after EVAL_TIMEOUT_MS or when `opts.abortSignal` fires.
 *
 * This function is best-effort by contract: every failure — building the
 * summary, the model call, an empty reply, cancellation — resolves to a
 * `SKIPPED` result instead of rejecting, so callers never need a try/catch to
 * protect the main flow.
 *
 * @param userInput The user's question for this turn.
 * @param history Conversation history used to list this turn's tool calls.
 * @param assistantText The assistant's final text answer.
 * @param client Model client exposing `complete(request, signal)`.
 * @param opts Optional `{ abortSignal, model }` overrides.
 * @returns Promise of `{ verdict, issues, raw }`.
 */
export async function checkGrounding(userInput, history, assistantText, client, opts = {}) {
    const options = opts ?? {};
    // Don't spend a model call on a turn the caller has already cancelled.
    if (options.abortSignal?.aborted) {
        return { verdict: 'SKIPPED', issues: [], raw: '(evaluator error: aborted before start)' };
    }
    const timeoutCtrl = new AbortController();
    const timer = setTimeout(() => timeoutCtrl.abort(), EVAL_TIMEOUT_MS);
    try {
        // Everything that can throw lives inside the try so the SKIPPED
        // fallback covers summary construction as well as the model call.
        const model = options.model || evaluatorModel();
        const summary = summarizeTurn(userInput, history, assistantText);
        const signal = options.abortSignal
            ? anySignal([options.abortSignal, timeoutCtrl.signal])
            : timeoutCtrl.signal;
        const response = await client.complete({
            model,
            system: EVALUATOR_PROMPT,
            messages: [{ role: 'user', content: summary }],
            tools: [],
            max_tokens: MAX_EVAL_TOKENS,
        }, signal);
        let raw = '';
        for (const part of response.content) {
            if (part !== null && typeof part === 'object' && part.type === 'text' && part.text) {
                raw += part.text;
            }
        }
        if (!raw.trim()) {
            return { verdict: 'SKIPPED', issues: [], raw: '(empty response)' };
        }
        return parseGroundingResponse(raw);
    }
    catch (err) {
        // Thrown values are not guaranteed to be Error instances.
        const reason = err instanceof Error ? err.message : String(err);
        return {
            verdict: 'SKIPPED',
            issues: [],
            raw: `(evaluator error: ${reason})`,
        };
    }
    finally {
        clearTimeout(timer);
    }
}
|
|
205
|
+
/**
 * Compose multiple AbortSignals into one — aborts when any source aborts,
 * carrying over that source's abort reason.
 *
 * Uses the platform's `AbortSignal.any` when it exists, so no listeners are
 * attached to the sources by this code. Otherwise falls back to manual wiring
 * that detaches every listener as soon as the composed signal fires, so a
 * long-lived source does not keep dead listeners from earlier calls.
 *
 * @param signals The source signals to combine.
 * @returns A signal that is aborted once any source is (or already was) aborted.
 */
function anySignal(signals) {
    // Bracket access: older type libraries don't declare the static `any` method.
    const nativeAny = AbortSignal['any'];
    if (typeof nativeAny === 'function') {
        return nativeAny.call(AbortSignal, signals);
    }
    const ctrl = new AbortController();
    const detachers = [];
    const trip = (source) => {
        // Remove the listeners from every source, not just the one that fired.
        for (const detach of detachers.splice(0)) {
            detach();
        }
        if (!ctrl.signal.aborted) {
            ctrl.abort(source.reason);
        }
    };
    for (const s of signals) {
        if (s.aborted) {
            // Also unhooks listeners attached to earlier sources in this loop.
            trip(s);
            break;
        }
        const onAbort = () => trip(s);
        s.addEventListener('abort', onAbort, { once: true });
        detachers.push(() => s.removeEventListener('abort', onAbort));
    }
    return ctrl.signal;
}
|
|
217
|
+
// ─── Render result for the UI ────────────────────────────────────────────
|
|
218
|
+
/**
 * Turn a grounding result into the follow-up note shown to the user.
 *
 * GROUNDED and SKIPPED produce an empty string — when the check agreed with
 * the answer (or did not run) there is nothing worth interrupting the user
 * for. Any other verdict yields a Markdown block: warning header, one bullet
 * per issue (or a placeholder when the evaluator listed none), and a hint on
 * how to re-ask or disable the check.
 *
 * @param result The grounding result to render.
 * @returns The follow-up text, or '' when no follow-up is needed.
 */
export function renderGroundingFollowup(result) {
    const { verdict } = result;
    if (verdict === 'GROUNDED' || verdict === 'SKIPPED') {
        return '';
    }
    let header;
    if (verdict === 'UNGROUNDED') {
        header = '⚠️ **Grounding check failed** — the previous answer relied on memory where a tool call was available:';
    }
    else {
        header = '⚠️ **Grounding check flagged some claims** — re-run with the suggested tools for a verified answer:';
    }
    const bullets = result.issues.map(issue => `- ${issue}`);
    const body = bullets.length === 0
        ? '(evaluator returned no specific items — check the transcript manually)'
        : bullets.join('\n');
    const hint = '_Ask again with an explicit instruction to call the tools, or disable these checks with `FRANKLIN_NO_EVAL=1`._';
    // Two leading blank lines separate the note from the answer it follows.
    return ['', '', header, body, '', hint].join('\n');
}
|
package/dist/agent/loop.js
CHANGED
|
@@ -25,6 +25,7 @@ import { routeRequest, parseRoutingProfile } from '../router/index.js';
|
|
|
25
25
|
import { recordOutcome } from '../router/local-elo.js';
|
|
26
26
|
import { shouldPlan, getPlanningPrompt, getExecutorModel, isExecutorStuck, toolCallSignature } from './planner.js';
|
|
27
27
|
import { shouldVerify, runVerification } from './verification.js';
|
|
28
|
+
import { shouldCheckGrounding, checkGrounding, renderGroundingFollowup } from './evaluator.js';
|
|
28
29
|
import { createSessionId, appendToSession, updateSessionMeta, pruneOldSessions, loadSessionHistory, loadSessionMeta, } from '../session/storage.js';
|
|
29
30
|
/**
|
|
30
31
|
* Atomically replace all elements in a history array.
|
|
@@ -1073,7 +1074,10 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
|
|
|
1073
1074
|
});
|
|
1074
1075
|
}
|
|
1075
1076
|
}
|
|
1076
|
-
// ── Verification gate: run adversarial checks on substantial work ──
|
|
1077
|
+
// ── Verification gate: run adversarial checks on substantial CODE work ──
|
|
1078
|
+
// Fires when the agent Edit/Write/Bash-ed enough to warrant running
|
|
1079
|
+
// the build + tests. Complements the grounding check below, which
|
|
1080
|
+
// covers read-heavy answers this verifier misses.
|
|
1077
1081
|
if (shouldVerify(turnToolCalls, turnToolCounts, lastUserInput || '')) {
|
|
1078
1082
|
try {
|
|
1079
1083
|
const vResult = await runVerification(history, capabilityMap, client, {
|
|
@@ -1102,6 +1106,31 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
|
|
|
1102
1106
|
// Verification errors never block the main flow
|
|
1103
1107
|
}
|
|
1104
1108
|
}
|
|
1109
|
+
// ── Grounding gate: check that factual claims trace to tool calls ──
|
|
1110
|
+
// Fires on any substantive answer to a non-trivial question. Designed
|
|
1111
|
+
// to catch the failure mode the code-verifier misses: model answers
|
|
1112
|
+
// a "what's X / should I buy Y" question from memory instead of
|
|
1113
|
+
// calling the live tools. Evaluator runs as a separate agent on a
|
|
1114
|
+
// cheap model; never blocks the turn, only appends a ⚠️ note when
|
|
1115
|
+
// the answer looks ungrounded so the user can re-ask.
|
|
1116
|
+
try {
|
|
1117
|
+
const assistantText = responseParts
|
|
1118
|
+
.filter(p => p.type === 'text' && typeof p.text === 'string')
|
|
1119
|
+
.map(p => p.text)
|
|
1120
|
+
.join('');
|
|
1121
|
+
if (shouldCheckGrounding(lastUserInput || '', assistantText)) {
|
|
1122
|
+
const gResult = await checkGrounding(lastUserInput, history, assistantText, client, {
|
|
1123
|
+
abortSignal: abort.signal,
|
|
1124
|
+
});
|
|
1125
|
+
const followup = renderGroundingFollowup(gResult);
|
|
1126
|
+
if (followup) {
|
|
1127
|
+
onEvent({ kind: 'text_delta', text: followup });
|
|
1128
|
+
}
|
|
1129
|
+
}
|
|
1130
|
+
}
|
|
1131
|
+
catch {
|
|
1132
|
+
// Grounding check is best-effort — never block the main flow.
|
|
1133
|
+
}
|
|
1105
1134
|
// Record success for local Elo learning (include tool call count for efficiency)
|
|
1106
1135
|
if (lastRoutedCategory && lastRoutedModel) {
|
|
1107
1136
|
recordOutcome(lastRoutedCategory, lastRoutedModel, 'continued', turnToolCalls);
|
package/package.json
CHANGED