@blockrun/franklin 3.8.24 → 3.8.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/evaluator.d.ts +6 -0
- package/dist/agent/evaluator.js +44 -4
- package/dist/agent/loop.js +9 -14
- package/package.json +1 -1
|
@@ -38,6 +38,12 @@ export interface GroundingResult {
|
|
|
38
38
|
* of this module is to cover read-heavy turns the code verifier misses.
|
|
39
39
|
*/
|
|
40
40
|
export declare function shouldCheckGrounding(userInput: string, assistantText: string): boolean;
|
|
41
|
+
/**
|
|
42
|
+
* Find the `[FRANKLIN HARNESS PREFETCH]` block in the most recent user
|
|
43
|
+
* message with string content (that's where intent-prefetch injects it). Returns the
|
|
44
|
+
* block from the marker up to (not including) the "Original user message:" divider, or null if no prefetch happened this turn.
|
|
45
|
+
*/
|
|
46
|
+
export declare function extractPrefetchBlock(history: Dialogue[]): string | null;
|
|
41
47
|
export declare function parseGroundingResponse(raw: string): GroundingResult;
|
|
42
48
|
/** Cheap model for grading. Default matches existing verification.ts
|
|
43
49
|
* choice so both quality gates have the same cost profile. Override via
|
package/dist/agent/evaluator.js
CHANGED
|
@@ -37,11 +37,13 @@ const EVALUATOR_PROMPT = `You are a GROUNDING CHECK agent. Your job is to verify
|
|
|
37
37
|
|
|
38
38
|
### A. Ungrounded claims
|
|
39
39
|
Every **factual claim** in the answer must trace to ONE of:
|
|
40
|
-
(a) A
|
|
40
|
+
(a) A tool call result from this turn (model-initiated OR listed under "Pre-fetched by Franklin harness"), OR
|
|
41
41
|
(b) Explicit acknowledgment of uncertainty ("I'm not sure", "based on older data")
|
|
42
42
|
|
|
43
|
+
**Harness-prefetched data is evidence.** When the turn includes a "Pre-fetched by Franklin harness" section, the data listed there was fetched live from tools on the assistant's behalf (TradingMarket, ExaAnswer, etc). Treat it identically to a model-initiated tool call — claims that reference prefetched prices, numbers, or news snippets are GROUNDED.
|
|
44
|
+
|
|
43
45
|
Flag as ungrounded:
|
|
44
|
-
- Specific current-world facts stated with confidence but not backed by any tool call this turn
|
|
46
|
+
- Specific current-world facts stated with confidence but not backed by any tool call this turn (including prefetch)
|
|
45
47
|
- Recommendations or conclusions that depend on unstated data (e.g. "you should sell" without a price lookup)
|
|
46
48
|
- Invented specifics — names, numbers, dates the model produced without a tool call supporting them
|
|
47
49
|
|
|
@@ -102,7 +104,20 @@ function summarizeTurn(userInput, history, assistantText) {
|
|
|
102
104
|
lines.push(`## User question`);
|
|
103
105
|
lines.push(userInput.trim().slice(0, 800));
|
|
104
106
|
lines.push('');
|
|
105
|
-
|
|
107
|
+
// ── Harness prefetch (treated as synthetic tool calls) ──
|
|
108
|
+
// When intent-prefetch fires, it prepends a [FRANKLIN HARNESS PREFETCH]
|
|
109
|
+
// block to the user message. The LLM answers based on that data, but
|
|
110
|
+
// the evaluator previously only looked for tool_use/tool_result pairs
|
|
111
|
+
// and missed the injection — flagging answers that were actually
|
|
112
|
+
// grounded in live data as UNGROUNDED. Surface the block explicitly so
|
|
113
|
+
// the evaluator counts it as evidence.
|
|
114
|
+
const prefetchBlock = extractPrefetchBlock(history);
|
|
115
|
+
if (prefetchBlock) {
|
|
116
|
+
lines.push(`## Pre-fetched by Franklin harness (counts as tool evidence)`);
|
|
117
|
+
lines.push(prefetchBlock.slice(0, 1200));
|
|
118
|
+
lines.push('');
|
|
119
|
+
}
|
|
120
|
+
lines.push(`## Tool calls this turn (model-initiated)`);
|
|
106
121
|
// Walk from the end of history back to (but not including) the user message.
|
|
107
122
|
// Each assistant tool_use and each user tool_result get condensed to one line.
|
|
108
123
|
let found = 0;
|
|
@@ -136,7 +151,7 @@ function summarizeTurn(userInput, history, assistantText) {
|
|
|
136
151
|
}
|
|
137
152
|
}
|
|
138
153
|
if (toolLines.length === 0) {
|
|
139
|
-
lines.push(' (none)');
|
|
154
|
+
lines.push(prefetchBlock ? ' (none — but harness pre-fetched data above)' : ' (none)');
|
|
140
155
|
}
|
|
141
156
|
else {
|
|
142
157
|
lines.push(...toolLines);
|
|
@@ -146,6 +161,31 @@ function summarizeTurn(userInput, history, assistantText) {
|
|
|
146
161
|
lines.push(assistantText.trim().slice(0, 2400));
|
|
147
162
|
return lines.join('\n');
|
|
148
163
|
}
|
|
164
|
+
/**
|
|
165
|
+
* Find the `[FRANKLIN HARNESS PREFETCH]` block in the most recent user
|
|
166
|
+
* message with string content (that's where intent-prefetch injects it). Returns the
|
|
167
|
+
* block from the marker up to (not including) the "Original user message:" divider, or null if no prefetch happened this turn.
|
|
168
|
+
*/
|
|
169
|
+
export function extractPrefetchBlock(history) {
|
|
170
|
+
for (let i = history.length - 1; i >= 0; i--) {
|
|
171
|
+
const msg = history[i];
|
|
172
|
+
if (msg.role !== 'user')
|
|
173
|
+
continue;
|
|
174
|
+
const content = typeof msg.content === 'string' ? msg.content : null;
|
|
175
|
+
if (!content)
|
|
176
|
+
continue;
|
|
177
|
+
const startIdx = content.indexOf('[FRANKLIN HARNESS PREFETCH]');
|
|
178
|
+
if (startIdx < 0)
|
|
179
|
+
return null; // Most recent user message has no prefetch — we're done
|
|
180
|
+
// Capture from the marker up to (but not including) the "Original user message:" divider
|
|
181
|
+
const endMarker = '\nOriginal user message:';
|
|
182
|
+
const endIdx = content.indexOf(endMarker, startIdx);
|
|
183
|
+
if (endIdx < 0)
|
|
184
|
+
return content.slice(startIdx).trim();
|
|
185
|
+
return content.slice(startIdx, endIdx).trim();
|
|
186
|
+
}
|
|
187
|
+
return null;
|
|
188
|
+
}
|
|
149
189
|
// ─── Verdict parser ──────────────────────────────────────────────────────
|
|
150
190
|
export function parseGroundingResponse(raw) {
|
|
151
191
|
const text = raw.trim();
|
package/dist/agent/loop.js
CHANGED
|
@@ -711,22 +711,19 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
|
|
|
711
711
|
sessionId,
|
|
712
712
|
});
|
|
713
713
|
// ── Router: resolve routing profiles to concrete models ──
|
|
714
|
+
// Classifier always sees the user's ORIGINAL prompt for this turn —
|
|
715
|
+
// never the `[GROUNDING CHECK FAILED]` / `[VERIFICATION FAILED]` /
|
|
716
|
+
// pushback-annotated variants the loop injects mid-turn. Same input
|
|
717
|
+
// across iterations → same tier → stable resolved model. Stops the
|
|
718
|
+
// failure mode where a retry message classified as SIMPLE dropped
|
|
719
|
+
// a COMPLEX task down to gemini mid-way.
|
|
714
720
|
const routingProfile = parseRoutingProfile(config.model);
|
|
715
721
|
let resolvedModel = config.model;
|
|
716
722
|
let routingTier;
|
|
717
723
|
let routingConfidence;
|
|
718
724
|
let routingSavings;
|
|
719
725
|
if (routingProfile) {
|
|
720
|
-
|
|
721
|
-
const lastUser = [...history].reverse().find((m) => m.role === 'user');
|
|
722
|
-
const userText = typeof lastUser?.content === 'string'
|
|
723
|
-
? lastUser.content
|
|
724
|
-
: Array.isArray(lastUser?.content)
|
|
725
|
-
? lastUser.content
|
|
726
|
-
.filter(p => p.type === 'text')
|
|
727
|
-
.map(p => p.text ?? '')
|
|
728
|
-
.join(' ')
|
|
729
|
-
: '';
|
|
726
|
+
const userText = lastUserInput || '';
|
|
730
727
|
const routing = await routeRequestAsync(userText, routingProfile);
|
|
731
728
|
resolvedModel = routing.model;
|
|
732
729
|
routingTier = routing.tier;
|
|
@@ -734,10 +731,8 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
|
|
|
734
731
|
routingSavings = routing.savings;
|
|
735
732
|
lastRoutedModel = routing.model;
|
|
736
733
|
lastRoutedCategory = routing.signals[0] || '';
|
|
737
|
-
// Surface the routing decision
|
|
738
|
-
//
|
|
739
|
-
// users have no idea what's actually running — or worse, they
|
|
740
|
-
// believe they're stuck on the last-seen concrete name.
|
|
734
|
+
// Surface the routing decision on the first iteration so the user
|
|
735
|
+
// sees which concrete model got picked, not just "auto".
|
|
741
736
|
if (loopCount === 1) {
|
|
742
737
|
onEvent({
|
|
743
738
|
kind: 'text_delta',
|
package/package.json
CHANGED