@blockrun/franklin 3.8.15 → 3.8.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/evaluator.d.ts +11 -0
- package/dist/agent/evaluator.js +24 -0
- package/dist/agent/loop.js +35 -9
- package/dist/router/index.d.ts +14 -0
- package/dist/router/index.js +123 -0
- package/package.json +1 -1
|
@@ -53,4 +53,15 @@ export declare function checkGrounding(userInput: string, history: Dialogue[], a
|
|
|
53
53
|
* user when the check agreed the answer was sound.
|
|
54
54
|
*/
|
|
55
55
|
export declare function renderGroundingFollowup(result: GroundingResult): string;
|
|
56
|
+
/**
|
|
57
|
+
* Build a synthetic user message that instructs the agent to retry with the
|
|
58
|
+
* missing tools. Returned message goes into history so the model's next
|
|
59
|
+
* generation sees it as the most recent instruction. This is the GAN-like
|
|
60
|
+
* feedback loop pattern from Anthropic's harness-design writeup —
|
|
61
|
+
* evaluator findings feed back into the generator until PASS (or retry cap).
|
|
62
|
+
*
|
|
63
|
+
* Intentionally terse: the agent already has the original question in
|
|
64
|
+
* history; we only need to name the gap + the tools to use.
|
|
65
|
+
*/
|
|
66
|
+
export declare function buildGroundingRetryInstruction(result: GroundingResult, originalUserQuestion: string): string;
|
|
56
67
|
export type { CapabilityHandler };
|
package/dist/agent/evaluator.js
CHANGED
|
@@ -231,3 +231,27 @@ export function renderGroundingFollowup(result) {
|
|
|
231
231
|
: '(evaluator returned no specific items — check the transcript manually)';
|
|
232
232
|
return `\n\n${header}\n${body}\n\n_Ask again with an explicit instruction to call the tools, or disable these checks with \`FRANKLIN_NO_EVAL=1\`._`;
|
|
233
233
|
}
|
|
234
|
+
/**
 * Build a synthetic user message that instructs the agent to retry with the
 * missing tools. The returned message is meant to be pushed into history so
 * the model's next generation sees it as the most recent instruction
 * (evaluator findings feed back into the generator until PASS or the retry
 * cap is reached).
 *
 * Intentionally terse: the agent already has the original question in
 * history; we only need to name the gap + the tools to use.
 *
 * @param result Evaluator result. `result.issues` is rendered as a bullet
 *   list; a missing or empty list yields a single generic bullet so the
 *   "Specifically:" header is never left dangling.
 * @param originalUserQuestion The question being answered. Trimmed and capped
 *   at 500 UTF-16 code units; non-string values are coerced instead of throwing.
 * @returns The newline-joined instruction text.
 */
export function buildGroundingRetryInstruction(result, originalUserQuestion) {
    const issues = Array.isArray(result?.issues) ? result.issues : [];
    const bullets = issues.length > 0
        ? issues.map(issue => `- ${issue}`)
        : ['- (evaluator returned no specific items — re-check every factual claim against tool output)'];
    const rawQuestion = typeof originalUserQuestion === 'string'
        ? originalUserQuestion
        : String(originalUserQuestion ?? '');
    let question = rawQuestion.trim().slice(0, 500);
    // A cut at 500 code units can land between the halves of a surrogate
    // pair; drop the orphaned high surrogate so the prompt stays well-formed.
    const lastUnit = question.charCodeAt(question.length - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        question = question.slice(0, -1);
    }
    return [
        '[GROUNDING CHECK FAILED]',
        'Your previous answer stated facts without calling the relevant tools. Specifically:',
        ...bullets,
        '',
        'Retry: call the missing tools first, then give a concise final answer based on the tool results. Only claim what the tool outputs actually say. If a tool fails, say so rather than falling back to memory.',
        '',
        `Original user question: ${question}`,
    ].join('\n');
}
|
package/dist/agent/loop.js
CHANGED
|
@@ -21,11 +21,11 @@ import { appendAudit, extractLastUserPrompt } from '../stats/audit.js';
|
|
|
21
21
|
import { estimateCost, OPUS_PRICING } from '../pricing.js';
|
|
22
22
|
import { maybeMidSessionExtract } from '../learnings/extractor.js';
|
|
23
23
|
import { extractMentions, buildEntityContext, loadEntities } from '../brain/store.js';
|
|
24
|
-
import {
|
|
24
|
+
import { routeRequestAsync, parseRoutingProfile } from '../router/index.js';
|
|
25
25
|
import { recordOutcome } from '../router/local-elo.js';
|
|
26
26
|
import { shouldPlan, getPlanningPrompt, getExecutorModel, isExecutorStuck, toolCallSignature } from './planner.js';
|
|
27
27
|
import { shouldVerify, runVerification } from './verification.js';
|
|
28
|
-
import { shouldCheckGrounding, checkGrounding, renderGroundingFollowup } from './evaluator.js';
|
|
28
|
+
import { shouldCheckGrounding, checkGrounding, renderGroundingFollowup, buildGroundingRetryInstruction, } from './evaluator.js';
|
|
29
29
|
import { createSessionId, appendToSession, updateSessionMeta, pruneOldSessions, loadSessionHistory, loadSessionMeta, } from '../session/storage.js';
|
|
30
30
|
/**
|
|
31
31
|
* Atomically replace all elements in a history array.
|
|
@@ -525,6 +525,14 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
|
|
|
525
525
|
let maxTokensOverride;
|
|
526
526
|
const turnIdleReference = lastSessionActivity;
|
|
527
527
|
lastSessionActivity = Date.now();
|
|
528
|
+
// ── Grounding retry state (per turn) ──
|
|
529
|
+
// When the post-response evaluator finds UNGROUNDED claims, we inject a
|
|
530
|
+
// corrective user message and re-enter the loop so the generator can
|
|
531
|
+
// answer again with the missing tool calls. 1-retry cap: if round 2
|
|
532
|
+
// still UNGROUNDED, ship the annotated response and let the user
|
|
533
|
+
// decide — avoids pathological loops, caps wall-clock cost.
|
|
534
|
+
let groundingRetryCount = 0;
|
|
535
|
+
const MAX_GROUNDING_RETRIES = 1;
|
|
528
536
|
// ── Plan-then-execute state (per turn) ──
|
|
529
537
|
let planActive = false;
|
|
530
538
|
let planPlannerModel = '';
|
|
@@ -688,7 +696,7 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
|
|
|
688
696
|
.map(p => p.text ?? '')
|
|
689
697
|
.join(' ')
|
|
690
698
|
: '';
|
|
691
|
-
const routing =
|
|
699
|
+
const routing = await routeRequestAsync(userText, routingProfile);
|
|
692
700
|
resolvedModel = routing.model;
|
|
693
701
|
routingTier = routing.tier;
|
|
694
702
|
routingConfidence = routing.confidence;
|
|
@@ -1107,12 +1115,15 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
|
|
|
1107
1115
|
}
|
|
1108
1116
|
}
|
|
1109
1117
|
// ── Grounding gate: check that factual claims trace to tool calls ──
|
|
1110
|
-
// Fires on any substantive answer to a non-trivial question.
|
|
1111
|
-
//
|
|
1112
|
-
//
|
|
1113
|
-
// calling the live tools.
|
|
1114
|
-
//
|
|
1115
|
-
//
|
|
1118
|
+
// Fires on any substantive answer to a non-trivial question. Catches
|
|
1119
|
+
// the failure mode the code-verifier misses: model answers a
|
|
1120
|
+
// "what's X / should I buy Y" question from memory instead of
|
|
1121
|
+
// calling the live tools.
|
|
1122
|
+
//
|
|
1123
|
+
// On UNGROUNDED: inject a corrective user message (GAN-style feedback)
|
|
1124
|
+
// and re-enter the loop so the generator can answer again with the
|
|
1125
|
+
// right tools. Up to MAX_GROUNDING_RETRIES attempts — after that,
|
|
1126
|
+
// annotate and ship so the user can decide.
|
|
1116
1127
|
try {
|
|
1117
1128
|
const assistantText = responseParts
|
|
1118
1129
|
.filter(p => p.type === 'text' && typeof p.text === 'string')
|
|
@@ -1122,6 +1133,21 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
|
|
|
1122
1133
|
const gResult = await checkGrounding(lastUserInput, history, assistantText, client, {
|
|
1123
1134
|
abortSignal: abort.signal,
|
|
1124
1135
|
});
|
|
1136
|
+
if (gResult.verdict === 'UNGROUNDED' && groundingRetryCount < MAX_GROUNDING_RETRIES) {
|
|
1137
|
+
groundingRetryCount++;
|
|
1138
|
+
const retryMsg = buildGroundingRetryInstruction(gResult, lastUserInput);
|
|
1139
|
+
const feedbackMsg = { role: 'user', content: retryMsg };
|
|
1140
|
+
history.push(feedbackMsg);
|
|
1141
|
+
persistSessionMessage(feedbackMsg);
|
|
1142
|
+
onEvent({
|
|
1143
|
+
kind: 'text_delta',
|
|
1144
|
+
text: '\n\n*Ungrounded claims detected — retrying with required tool calls...*\n\n',
|
|
1145
|
+
});
|
|
1146
|
+
continue; // Re-enter outer loop — generator will produce a new response.
|
|
1147
|
+
}
|
|
1148
|
+
// Either the verdict is acceptable (GROUNDED / PARTIAL / SKIPPED)
|
|
1149
|
+
// or we've hit the retry cap with UNGROUNDED still outstanding.
|
|
1150
|
+
// In both cases, surface the followup if one applies and exit.
|
|
1125
1151
|
const followup = renderGroundingFollowup(gResult);
|
|
1126
1152
|
if (followup) {
|
|
1127
1153
|
onEvent({ kind: 'text_delta', text: followup });
|
package/dist/router/index.d.ts
CHANGED
|
@@ -18,6 +18,20 @@ export interface RoutingResult {
|
|
|
18
18
|
signals: string[];
|
|
19
19
|
savings: number;
|
|
20
20
|
}
|
|
21
|
+
export type TierClassifier = (prompt: string) => Promise<Tier | null>;
|
|
22
|
+
/**
|
|
23
|
+
* Default LLM classifier — lazy-imports the ModelClient to avoid a hard
|
|
24
|
+
* cycle with agent/llm.ts (which itself imports routing helpers for virtual
|
|
25
|
+
* profile resolution). Callers can substitute their own classifier for
|
|
26
|
+
* tests by passing one to `routeRequestAsync`.
|
|
27
|
+
*/
|
|
28
|
+
export declare function llmClassifyRequest(prompt: string): Promise<Tier | null>;
|
|
29
|
+
/**
|
|
30
|
+
* Async router — LLM classifier first, keyword classifier as fallback.
|
|
31
|
+
* Profile-specific tier tables (AUTO / ECO / PREMIUM / FREE) still pick
|
|
32
|
+
* the concrete model; the classifier only picks the TIER.
|
|
33
|
+
*/
|
|
34
|
+
export declare function routeRequestAsync(prompt: string, profile?: RoutingProfile, classify?: TierClassifier): Promise<RoutingResult>;
|
|
21
35
|
export declare function routeRequest(prompt: string, profile?: RoutingProfile): RoutingResult;
|
|
22
36
|
/**
|
|
23
37
|
* Get fallback models for a tier
|
package/dist/router/index.js
CHANGED
|
@@ -267,6 +267,129 @@ function classicRouteRequest(prompt, profile) {
|
|
|
267
267
|
const savings = computeSavings(model);
|
|
268
268
|
return { model, tier, confidence, signals, savings };
|
|
269
269
|
}
|
|
270
|
+
// ─── LLM-based classifier ───
|
|
271
|
+
//
|
|
272
|
+
// Historical router was a 15-dimension keyword scorer — every new failure
|
|
273
|
+
// mode needed another KEYWORD list (CODE, REASONING, ANALYSIS, ...). Cheap
|
|
274
|
+
// to run but structurally wrong: keywords always lag reality, and users
|
|
275
|
+
// phrase the same intent fifty different ways. A free model can just
|
|
276
|
+
// *read* the prompt and tell us the tier.
|
|
277
|
+
//
|
|
278
|
+
// Design:
|
|
279
|
+
// - Classification prompt is one word answer: SIMPLE | MEDIUM | COMPLEX | REASONING
|
|
280
|
+
// - Runs on a free NVIDIA model — $0/call, so we can afford it on every turn
|
|
281
|
+
// - 2.5s hard timeout (CLASSIFIER_TIMEOUT_MS) + strict parse; any failure falls through to the
|
|
282
|
+
// keyword classifier so we always have a routing answer
|
|
283
|
+
// - Exposed via async `routeRequestAsync(prompt, profile, classify?)`. Callers
|
|
284
|
+
// that can't be async (proxy, LLM-client bootstrap) keep using the sync
|
|
285
|
+
// `routeRequest`, which silently does keyword-only routing.
|
|
286
|
+
const CLASSIFIER_MODEL = process.env.FRANKLIN_ROUTER_MODEL || 'nvidia/nemotron-ultra-253b';
|
|
287
|
+
const CLASSIFIER_TIMEOUT_MS = 2_500;
|
|
288
|
+
const CLASSIFIER_SYSTEM = `You classify a user's message into ONE routing tier for a CLI agent. Reply with EXACTLY ONE WORD from the allowed set. No explanation, no punctuation, no quotes.
|
|
289
|
+
|
|
290
|
+
Tiers:
|
|
291
|
+
- SIMPLE — greetings, trivia, arithmetic, short definitions, yes/no questions. A single memory-based reply is acceptable.
|
|
292
|
+
- MEDIUM — multi-turn code edits, targeted bug fixes, lookups, summaries. Some tool use expected.
|
|
293
|
+
- COMPLEX — substantive engineering, analysis, recommendations, research questions that depend on current-world data (stock prices, current events, live market state). Multiple tool calls + synthesis.
|
|
294
|
+
- REASONING — formal proofs, derivations, deep chains of logic, multi-variable optimization.
|
|
295
|
+
|
|
296
|
+
If the message names a ticker, asks for a recommendation, or asks "why did X happen", it is COMPLEX or REASONING — never SIMPLE.
|
|
297
|
+
|
|
298
|
+
Answer format: a single word. SIMPLE or MEDIUM or COMPLEX or REASONING.`;
|
|
299
|
+
/**
 * Parse a classifier reply into a Tier. Returns null on junk so the caller
 * can fall back to keyword classification.
 *
 * The reply is accepted only when it names exactly one distinct tier
 * (case-insensitive; surrounding whitespace/punctuation is tolerated). A
 * reply that mentions several tiers — e.g. the model echoing the
 * "SIMPLE or MEDIUM or COMPLEX or REASONING" format line — is ambiguous and
 * is treated as junk rather than resolved to whichever word came first.
 *
 * @param reply Raw text returned by the classifier model.
 * @returns 'SIMPLE' | 'MEDIUM' | 'COMPLEX' | 'REASONING', or null.
 */
function parseTierWord(reply) {
    if (typeof reply !== 'string')
        return null;
    const mentioned = new Set(reply.toUpperCase().match(/\b(?:SIMPLE|MEDIUM|COMPLEX|REASONING)\b/g) ?? []);
    if (mentioned.size !== 1)
        return null;
    const [tier] = mentioned;
    return tier;
}
|
|
307
|
+
/**
 * Default LLM classifier — lazy-imports the ModelClient to avoid a hard
 * cycle with agent/llm.ts (which itself imports routing helpers for virtual
 * profile resolution). Callers can substitute their own classifier for
 * tests by passing one to `routeRequestAsync`.
 *
 * Never rejects: a failed import, config load, client construction, request,
 * or timeout all resolve to null so the caller can fall back to keyword
 * routing.
 *
 * @param prompt The user's message; only the first 2000 characters are sent.
 * @returns The parsed tier, or null when the prompt is empty/too short, the
 *   call fails or times out, or the reply cannot be parsed.
 */
export async function llmClassifyRequest(prompt) {
    if (typeof prompt !== 'string')
        return null;
    // Empty and very short messages: skip the classifier call, let the keyword
    // path decide. Saves a round-trip on "hi" / "thanks" / short slash commands.
    if (prompt.trim().length < 10)
        return null;
    const ctrl = new AbortController();
    let timer;
    try {
        const llmMod = await import('../agent/llm.js');
        const cfgMod = await import('../config.js');
        const chain = cfgMod.loadChain();
        // Constructed inside the try so a constructor failure also yields null.
        const client = new llmMod.ModelClient({ apiUrl: cfgMod.API_URLS[chain], chain });
        // The timeout covers the model call only, not the lazy imports above.
        timer = setTimeout(() => ctrl.abort(), CLASSIFIER_TIMEOUT_MS);
        const result = await client.complete({
            model: CLASSIFIER_MODEL,
            system: CLASSIFIER_SYSTEM,
            messages: [{ role: 'user', content: prompt.slice(0, 2000) }],
            tools: [],
            max_tokens: 8,
        }, ctrl.signal);
        // Concatenate every text part of the reply before parsing.
        let text = '';
        for (const part of result.content) {
            if (typeof part === 'object' && part.type === 'text' && part.text)
                text += part.text;
        }
        return parseTierWord(text);
    }
    catch {
        return null;
    }
    finally {
        // clearTimeout(undefined) is a no-op, so this is safe if we failed early.
        clearTimeout(timer);
    }
}
|
|
358
|
+
/**
 * Async router — LLM classifier first, keyword classifier as fallback.
 * Profile-specific tier tables (AUTO / ECO / PREMIUM) still pick the concrete
 * model; the classifier only picks the TIER. The 'free' profile skips the
 * classifier entirely and is delegated to the sync `routeRequest`.
 *
 * @param prompt The user's message.
 * @param profile Routing profile; defaults to 'auto'.
 * @param classify Tier classifier; defaults to `llmClassifyRequest`. May
 *   return a tier or a promise of one; a throw, rejection, null, or a tier
 *   missing from the profile's table all fall back to keyword routing.
 * @returns A RoutingResult; LLM-classified results carry
 *   `signals: ['llm-classified']` and a fixed confidence of 0.85.
 */
export async function routeRequestAsync(prompt, profile = 'auto', classify = llmClassifyRequest) {
    // Free / short-circuit profiles — no classifier needed.
    if (profile === 'free')
        return routeRequest(prompt, profile);
    let tier = null;
    try {
        // `await` inside try handles rejected promises, synchronous throws,
        // and classifiers that return a plain (non-promise) value.
        tier = await classify(prompt);
    }
    catch {
        tier = null;
    }
    // Same tier tables the keyword path uses, so downstream code is unchanged.
    const tierConfigs = profile === 'eco'
        ? ECO_TIERS
        : profile === 'premium'
            ? PREMIUM_TIERS
            : AUTO_TIERS;
    const model = tier ? tierConfigs[tier]?.primary : undefined;
    if (!model) {
        // Classifier miss, disabled, or an unrecognised tier — fall through to
        // the sync keyword router so we always have a routing answer.
        return routeRequest(prompt, profile);
    }
    return {
        model,
        tier,
        confidence: 0.85, // LLM classification — medium-high confidence
        signals: ['llm-classified'],
        savings: computeSavings(model),
    };
}
|
|
270
393
|
// ─── Main Router ───
|
|
271
394
|
export function routeRequest(prompt, profile = 'auto') {
|
|
272
395
|
// Free profile — always use free model
|
package/package.json
CHANGED