@blockrun/franklin 3.8.15 → 3.8.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -53,4 +53,15 @@ export declare function checkGrounding(userInput: string, history: Dialogue[], a
53
53
  * user when the check agreed the answer was sound.
54
54
  */
55
55
  export declare function renderGroundingFollowup(result: GroundingResult): string;
56
+ /**
57
+ * Build a synthetic user message that instructs the agent to retry with the
58
+ * missing tools. Returned message goes into history so the model's next
59
+ * generation sees it as the most recent instruction. This is the GAN-like
60
+ * feedback loop pattern from Anthropic's harness-design writeup —
61
+ * evaluator findings feed back into the generator until PASS (or retry cap).
62
+ *
63
+ * Intentionally terse: the agent already has the original question in
64
+ * history; we name the gap + the tools to use and echo at most 500 chars of the question.
65
+ */
66
+ export declare function buildGroundingRetryInstruction(result: GroundingResult, originalUserQuestion: string): string;
56
67
  export type { CapabilityHandler };
@@ -231,3 +231,27 @@ export function renderGroundingFollowup(result) {
231
231
  : '(evaluator returned no specific items — check the transcript manually)';
232
232
  return `\n\n${header}\n${body}\n\n_Ask again with an explicit instruction to call the tools, or disable these checks with \`FRANKLIN_NO_EVAL=1\`._`;
233
233
  }
234
+ /**
235
+ * Build a synthetic user message that instructs the agent to retry with the
236
+ * missing tools. Returned message goes into history so the model's next
237
+ * generation sees it as the most recent instruction. This is the GAN-like
238
+ * feedback loop pattern from Anthropic's harness-design writeup —
239
+ * evaluator findings feed back into the generator until PASS (or retry cap).
240
+ *
241
+ * Intentionally terse: the agent already has the original question in
242
+ * history; we name the gap + the tools to use and echo at most 500 chars of the question.
243
+ */
244
+ export function buildGroundingRetryInstruction(result, originalUserQuestion) {
245
+ const lines = [
246
+ '[GROUNDING CHECK FAILED]',
247
+ 'Your previous answer stated facts without calling the relevant tools. Specifically:',
248
+ ];
249
+ for (const issue of result.issues) {
250
+ lines.push(`- ${issue}`);
251
+ }
252
+ lines.push('');
253
+ lines.push('Retry: call the missing tools first, then give a concise final answer based on the tool results. Only claim what the tool outputs actually say. If a tool fails, say so rather than falling back to memory.');
254
+ lines.push('');
255
+ lines.push(`Original user question: ${originalUserQuestion.trim().slice(0, 500)}`);
256
+ return lines.join('\n');
257
+ }
@@ -21,11 +21,11 @@ import { appendAudit, extractLastUserPrompt } from '../stats/audit.js';
21
21
  import { estimateCost, OPUS_PRICING } from '../pricing.js';
22
22
  import { maybeMidSessionExtract } from '../learnings/extractor.js';
23
23
  import { extractMentions, buildEntityContext, loadEntities } from '../brain/store.js';
24
- import { routeRequest, parseRoutingProfile } from '../router/index.js';
24
+ import { routeRequestAsync, parseRoutingProfile } from '../router/index.js';
25
25
  import { recordOutcome } from '../router/local-elo.js';
26
26
  import { shouldPlan, getPlanningPrompt, getExecutorModel, isExecutorStuck, toolCallSignature } from './planner.js';
27
27
  import { shouldVerify, runVerification } from './verification.js';
28
- import { shouldCheckGrounding, checkGrounding, renderGroundingFollowup } from './evaluator.js';
28
+ import { shouldCheckGrounding, checkGrounding, renderGroundingFollowup, buildGroundingRetryInstruction, } from './evaluator.js';
29
29
  import { createSessionId, appendToSession, updateSessionMeta, pruneOldSessions, loadSessionHistory, loadSessionMeta, } from '../session/storage.js';
30
30
  /**
31
31
  * Atomically replace all elements in a history array.
@@ -525,6 +525,14 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
525
525
  let maxTokensOverride;
526
526
  const turnIdleReference = lastSessionActivity;
527
527
  lastSessionActivity = Date.now();
528
+ // ── Grounding retry state (per turn) ──
529
+ // When the post-response evaluator finds UNGROUNDED claims, we inject a
530
+ // corrective user message and re-enter the loop so the generator can
531
+ // answer again with the missing tool calls. 1-retry cap: if round 2
532
+ // is still UNGROUNDED, ship the annotated response and let the user
533
+ // decide — avoids pathological loops, caps wall-clock cost.
534
+ let groundingRetryCount = 0;
535
+ const MAX_GROUNDING_RETRIES = 1;
528
536
  // ── Plan-then-execute state (per turn) ──
529
537
  let planActive = false;
530
538
  let planPlannerModel = '';
@@ -688,7 +696,7 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
688
696
  .map(p => p.text ?? '')
689
697
  .join(' ')
690
698
  : '';
691
- const routing = routeRequest(userText, routingProfile);
699
+ const routing = await routeRequestAsync(userText, routingProfile);
692
700
  resolvedModel = routing.model;
693
701
  routingTier = routing.tier;
694
702
  routingConfidence = routing.confidence;
@@ -1107,12 +1115,15 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
1107
1115
  }
1108
1116
  }
1109
1117
  // ── Grounding gate: check that factual claims trace to tool calls ──
1110
- // Fires on any substantive answer to a non-trivial question. Designed
1111
- // to catch the failure mode the code-verifier misses: model answers
1112
- // a "what's X / should I buy Y" question from memory instead of
1113
- // calling the live tools. Evaluator runs as a separate agent on a
1114
- // cheap model; never blocks the turn, only appends a ⚠️ note when
1115
- // the answer looks ungrounded so the user can re-ask.
1118
+ // Fires on any substantive answer to a non-trivial question. Catches
1119
+ // the failure mode the code-verifier misses: model answers a
1120
+ // "what's X / should I buy Y" question from memory instead of
1121
+ // calling the live tools.
1122
+ //
1123
+ // On UNGROUNDED: inject a corrective user message (GAN-style feedback)
1124
+ // and re-enter the loop so the generator can answer again with the
1125
+ // right tools. Up to MAX_GROUNDING_RETRIES attempts — after that,
1126
+ // annotate and ship so the user can decide.
1116
1127
  try {
1117
1128
  const assistantText = responseParts
1118
1129
  .filter(p => p.type === 'text' && typeof p.text === 'string')
@@ -1122,6 +1133,21 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
1122
1133
  const gResult = await checkGrounding(lastUserInput, history, assistantText, client, {
1123
1134
  abortSignal: abort.signal,
1124
1135
  });
1136
+ if (gResult.verdict === 'UNGROUNDED' && groundingRetryCount < MAX_GROUNDING_RETRIES) {
1137
+ groundingRetryCount++;
1138
+ const retryMsg = buildGroundingRetryInstruction(gResult, lastUserInput);
1139
+ const feedbackMsg = { role: 'user', content: retryMsg };
1140
+ history.push(feedbackMsg);
1141
+ persistSessionMessage(feedbackMsg);
1142
+ onEvent({
1143
+ kind: 'text_delta',
1144
+ text: '\n\n*Ungrounded claims detected — retrying with required tool calls...*\n\n',
1145
+ });
1146
+ continue; // Re-enter outer loop — generator will produce a new response.
1147
+ }
1148
+ // Either the verdict is acceptable (GROUNDED / PARTIAL / SKIPPED)
1149
+ // or we've hit the retry cap with UNGROUNDED still outstanding.
1150
+ // In both cases, surface the followup if one applies and exit.
1125
1151
  const followup = renderGroundingFollowup(gResult);
1126
1152
  if (followup) {
1127
1153
  onEvent({ kind: 'text_delta', text: followup });
package/dist/banner.js CHANGED
@@ -83,12 +83,16 @@ function padVisible(s, targetWidth) {
83
83
  return s + '\x1b[0m' + ' '.repeat(targetWidth - current);
84
84
  }
85
85
  export function printBanner(version) {
86
+ // Full portrait banner (Ben Franklin + FRANKLIN block art) is the default
87
+ // since v3.8.17 — it's the brand asset and the visual anchor for "Franklin
88
+ // is Ben Franklin's AI heir." Users who want a 2-line startup (scripting,
89
+ // narrow terminals, CI) can set FRANKLIN_BANNER=compact.
86
90
  const style = process.env.FRANKLIN_BANNER?.toLowerCase();
87
- if (style === 'full' || style === 'legacy') {
88
- printLegacyBanner(version);
91
+ if (style === 'compact' || style === 'minimal') {
92
+ printCompactBanner(version);
89
93
  }
90
94
  else {
91
- printCompactBanner(version);
95
+ printLegacyBanner(version);
92
96
  }
93
97
  // Kick off a background refresh for *next* startup, and print a hint now
94
98
  // if the cache already knows about a newer version. All wrapped in
@@ -71,8 +71,14 @@ export async function startCommand(options) {
71
71
  model = configModel;
72
72
  }
73
73
  else {
74
- // Default: free NVIDIA model zero wallet charges until user explicitly switches
75
- model = 'nvidia/nemotron-ultra-253b';
74
+ // Default: blockrun/auto — the LLM router (v3.8.16) picks a model per
75
+ // prompt. SIMPLE questions route to cheap/fast models (gemini-flash,
76
+ // kimi); COMPLEX / REASONING to Sonnet 4.6 / Opus 4.7. Cost fallback
77
+ // to free models on 402 is handled in the agent loop, so an unfunded
78
+ // wallet still works — it just degrades to the free tier mid-session
79
+ // instead of starting there. Much better first-turn quality than the
80
+ // old nvidia-nemotron default, which stubbed tool use.
81
+ model = 'blockrun/auto';
76
82
  }
77
83
  const workDir = process.cwd();
78
84
  // --prompt batch mode: skip all interactive startup UI/side effects so
@@ -18,6 +18,20 @@ export interface RoutingResult {
18
18
  signals: string[];
19
19
  savings: number;
20
20
  }
21
+ export type TierClassifier = (prompt: string) => Promise<Tier | null>;
22
+ /**
23
+ * Default LLM classifier — lazy-imports the ModelClient to avoid a hard
24
+ * cycle with agent/llm.ts (which itself imports routing helpers for virtual
25
+ * profile resolution). Callers can substitute their own classifier for
26
+ * tests by passing one to `routeRequestAsync`.
27
+ */
28
+ export declare function llmClassifyRequest(prompt: string): Promise<Tier | null>;
29
+ /**
30
+ * Async router — LLM classifier first, keyword classifier as fallback.
31
+ * Profile-specific tier tables (AUTO / ECO / PREMIUM / FREE) still pick
32
+ * the concrete model; the classifier only picks the TIER.
33
+ */
34
+ export declare function routeRequestAsync(prompt: string, profile?: RoutingProfile, classify?: TierClassifier): Promise<RoutingResult>;
21
35
  export declare function routeRequest(prompt: string, profile?: RoutingProfile): RoutingResult;
22
36
  /**
23
37
  * Get fallback models for a tier
@@ -267,6 +267,129 @@ function classicRouteRequest(prompt, profile) {
267
267
  const savings = computeSavings(model);
268
268
  return { model, tier, confidence, signals, savings };
269
269
  }
270
+ // ─── LLM-based classifier ───
271
+ //
272
+ // Historical router was a 15-dimension keyword scorer — every new failure
273
+ // mode needed another KEYWORD list (CODE, REASONING, ANALYSIS, ...). Cheap
274
+ // to run but structurally wrong: keywords always lag reality, and users
275
+ // phrase the same intent fifty different ways. A free model can just
276
+ // *read* the prompt and tell us the tier.
277
+ //
278
+ // Design:
279
+ // - Classification prompt asks for a one-word answer: SIMPLE | MEDIUM | COMPLEX | REASONING
280
+ // - Runs on a free NVIDIA model — $0/call, so we can afford it on every turn
281
+ // - 2.5s hard timeout + strict parse; any failure falls through to the
282
+ // keyword classifier so we always have a routing answer
283
+ // - Exposed via async `routeRequestAsync(prompt, profile, classify?)`. Callers
284
+ // that can't be async (proxy, LLM-client bootstrap) keep using the sync
285
+ // `routeRequest`, which silently does keyword-only routing.
286
+ const CLASSIFIER_MODEL = process.env.FRANKLIN_ROUTER_MODEL || 'nvidia/nemotron-ultra-253b';
287
+ const CLASSIFIER_TIMEOUT_MS = 2_500;
288
+ const CLASSIFIER_SYSTEM = `You classify a user's message into ONE routing tier for a CLI agent. Reply with EXACTLY ONE WORD from the allowed set. No explanation, no punctuation, no quotes.
289
+
290
+ Tiers:
291
+ - SIMPLE — greetings, trivia, arithmetic, short definitions, yes/no questions. A single memory-based reply is acceptable.
292
+ - MEDIUM — multi-turn code edits, targeted bug fixes, lookups, summaries. Some tool use expected.
293
+ - COMPLEX — substantive engineering, analysis, recommendations, research questions that depend on current-world data (stock prices, current events, live market state). Multiple tool calls + synthesis.
294
+ - REASONING — formal proofs, derivations, deep chains of logic, multi-variable optimization.
295
+
296
+ If the message names a ticker, asks for a recommendation, or asks "why did X happen", it is COMPLEX or REASONING — never SIMPLE.
297
+
298
+ Answer format: a single word. SIMPLE or MEDIUM or COMPLEX or REASONING.`;
299
+ /**
300
+ * Parse a one-word classifier reply into a Tier. Returns null on junk so
301
+ * the caller can fall back to keyword classification.
302
+ */
303
+ function parseTierWord(reply) {
304
+ const m = reply.trim().toUpperCase().match(/\b(SIMPLE|MEDIUM|COMPLEX|REASONING)\b/);
305
+ return m ? m[1] : null;
306
+ }
307
+ /**
308
+ * Default LLM classifier — lazy-imports the ModelClient to avoid a hard
309
+ * cycle with agent/llm.ts (which itself imports routing helpers for virtual
310
+ * profile resolution). Callers can substitute their own classifier for
311
+ * tests by passing one to `routeRequestAsync`.
312
+ */
313
+ export async function llmClassifyRequest(prompt) {
314
+ if (!prompt || prompt.trim().length === 0)
315
+ return null;
316
+ // Very short messages: skip the classifier call, let keyword path decide.
317
+ // Saves ~500ms on "hi" / "thanks" / slash commands.
318
+ if (prompt.trim().length < 10)
319
+ return null;
320
+ let ModelClientCtor;
321
+ let chain;
322
+ let apiUrl;
323
+ try {
324
+ const llmMod = await import('../agent/llm.js');
325
+ const cfgMod = await import('../config.js');
326
+ ModelClientCtor = llmMod.ModelClient;
327
+ chain = cfgMod.loadChain();
328
+ apiUrl = cfgMod.API_URLS[chain];
329
+ }
330
+ catch {
331
+ return null;
332
+ }
333
+ const client = new ModelClientCtor({ apiUrl, chain });
334
+ const ctrl = new AbortController();
335
+ const timer = setTimeout(() => ctrl.abort(), CLASSIFIER_TIMEOUT_MS);
336
+ try {
337
+ const result = await client.complete({
338
+ model: CLASSIFIER_MODEL,
339
+ system: CLASSIFIER_SYSTEM,
340
+ messages: [{ role: 'user', content: prompt.slice(0, 2000) }],
341
+ tools: [],
342
+ max_tokens: 8,
343
+ }, ctrl.signal);
344
+ let text = '';
345
+ for (const part of result.content) {
346
+ if (typeof part === 'object' && part.type === 'text' && part.text)
347
+ text += part.text;
348
+ }
349
+ return parseTierWord(text);
350
+ }
351
+ catch {
352
+ return null;
353
+ }
354
+ finally {
355
+ clearTimeout(timer);
356
+ }
357
+ }
358
+ /**
359
+ * Async router — LLM classifier first, keyword classifier as fallback.
360
+ * Profile-specific tier tables (AUTO / ECO / PREMIUM / FREE) still pick
361
+ * the concrete model; the classifier only picks the TIER.
362
+ */
363
+ export async function routeRequestAsync(prompt, profile = 'auto', classify = llmClassifyRequest) {
364
+ // Free / short-circuit profiles — no classifier needed.
365
+ if (profile === 'free')
366
+ return routeRequest(prompt, profile);
367
+ const tier = await classify(prompt).catch(() => null);
368
+ if (!tier) {
369
+ // Classifier miss or disabled — fall through to the sync keyword router.
370
+ return routeRequest(prompt, profile);
371
+ }
372
+ // Build a RoutingResult from the LLM-picked tier using the same tier
373
+ // tables the keyword path uses. Keeps downstream code path-identical.
374
+ let tierConfigs;
375
+ switch (profile) {
376
+ case 'eco':
377
+ tierConfigs = ECO_TIERS;
378
+ break;
379
+ case 'premium':
380
+ tierConfigs = PREMIUM_TIERS;
381
+ break;
382
+ default: tierConfigs = AUTO_TIERS;
383
+ }
384
+ const model = tierConfigs[tier].primary;
385
+ return {
386
+ model,
387
+ tier,
388
+ confidence: 0.85, // LLM classification — medium-high confidence
389
+ signals: ['llm-classified'],
390
+ savings: computeSavings(model),
391
+ };
392
+ }
270
393
  // ─── Main Router ───
271
394
  export function routeRequest(prompt, profile = 'auto') {
272
395
  // Free profile — always use free model
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@blockrun/franklin",
3
- "version": "3.8.15",
3
+ "version": "3.8.17",
4
4
  "description": "Franklin — The AI agent with a wallet. Spends USDC autonomously to get real work done. Pay per action, no subscriptions.",
5
5
  "type": "module",
6
6
  "exports": {