@blockrun/franklin 3.15.90 → 3.15.92

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,7 +27,7 @@ import { setSessionPersistenceDisabled } from '../session/storage.js';
27
27
  import { estimateCost, OPUS_PRICING } from '../pricing.js';
28
28
  import { maybeMidSessionExtract } from '../learnings/extractor.js';
29
29
  import { extractMentions, buildEntityContext, loadEntities } from '../brain/store.js';
30
- import { routeRequestAsync, resolveTierToModel, parseRoutingProfile, getFallbackChain, pickFreeFallback } from '../router/index.js';
30
+ import { routeRequestAsync, resolveTierToModel, parseRoutingProfile, getFallbackChain, pickFreeFallback, isVisionModel, messageNeedsVision, pickVisionSibling } from '../router/index.js';
31
31
  import { recordOutcome } from '../router/local-elo.js';
32
32
  import { shouldPlan, getPlanningPrompt, getExecutorModel, isExecutorStuck, toolCallSignature } from './planner.js';
33
33
  import { shouldVerify, runVerification } from './verification.js';
@@ -1118,6 +1118,16 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
1118
1118
  onProgress: (id, text) => onEvent({ kind: 'capability_progress', id, text }),
1119
1119
  sessionId,
1120
1120
  });
1121
+ // ── Vision-need detection (per turn) ──
1122
+ // Images enter a turn one of two ways: the user types an image path
1123
+ // and the Read tool will inline bytes mid-turn, or the user references
1124
+ // an image in their last message directly. We can only see (1) at this
1125
+ // point — but that's the case we care about: the router has to decide
1126
+ // BEFORE the model call which model to use. If the model can't see
1127
+ // images, Read's tool_result image blocks get tokenized as base64 text
1128
+ // by the gateway (verified 2026-05-09) and the model hallucinates from
1129
+ // the "Image file: <path>" stub. Detect upfront, route accordingly.
1130
+ const turnNeedsVision = loopCount === 1 && messageNeedsVision(lastUserInput);
1121
1131
  // ── Router: resolve routing profiles to concrete models ──
1122
1132
  // Uses the tier already decided by the turn-analyzer — one LLM call
1123
1133
  // up-front rather than a separate classifier here. Fallback to the
@@ -1129,8 +1139,8 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
1129
1139
  let routingSavings;
1130
1140
  if (routingProfile) {
1131
1141
  const routing = turnAnalysis
1132
- ? resolveTierToModel(turnAnalysis.tier, routingProfile)
1133
- : await routeRequestAsync(lastUserInput || '', routingProfile);
1142
+ ? resolveTierToModel(turnAnalysis.tier, routingProfile, turnNeedsVision)
1143
+ : await routeRequestAsync(lastUserInput || '', routingProfile, undefined, turnNeedsVision);
1134
1144
  resolvedModel = routing.model;
1135
1145
  routingTier = routing.tier;
1136
1146
  routingConfidence = routing.confidence;
@@ -1138,12 +1148,31 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
1138
1148
  lastRoutedModel = routing.model;
1139
1149
  lastRoutedCategory = routing.category || '';
1140
1150
  if (loopCount === 1) {
1151
+ const visionTag = turnNeedsVision ? ' 👁️' : '';
1141
1152
  onEvent({
1142
1153
  kind: 'text_delta',
1143
- text: `*Auto → ${routing.model}*\n\n`,
1154
+ text: `*Auto → ${routing.model}${visionTag}*\n\n`,
1144
1155
  });
1145
1156
  }
1146
1157
  }
1158
+ else if (turnNeedsVision && !isVisionModel(resolvedModel)) {
1159
+ // ── Manual-mode guard ──
1160
+ // User explicitly picked a model that can't see images. Don't silently
1161
+ // send the image — the model would only see the text stub from Read
1162
+ // and hallucinate. Swap to the closest vision sibling JUST for this
1163
+ // turn (next turn's model-recovery block at the top of the user-input
1164
+ // handler resets to baseModel, so the user's intent isn't permanently
1165
+ // overridden). Always emit a visible notice so the user knows their
1166
+ // pick was overridden and why.
1167
+ const original = resolvedModel;
1168
+ const visionSwap = pickVisionSibling(original);
1169
+ resolvedModel = visionSwap;
1170
+ config.model = visionSwap;
1171
+ onEvent({
1172
+ kind: 'text_delta',
1173
+ text: `*⚠️ ${original} can't see images — using ${visionSwap} for this turn.*\n\n`,
1174
+ });
1175
+ }
1147
1176
  // Update token estimation model for more accurate byte-per-token ratio
1148
1177
  setEstimationModel(resolvedModel);
1149
1178
  // ── Plan-then-execute: detect and activate ──
@@ -12,4 +12,5 @@
12
12
  */
13
13
  export declare function doctorCommand(opts?: {
14
14
  json?: boolean;
15
+ anomaly?: boolean;
15
16
  }): Promise<void>;
@@ -247,6 +247,10 @@ function printHuman(checks) {
247
247
  console.log();
248
248
  }
249
249
  export async function doctorCommand(opts = {}) {
250
+ if (opts.anomaly) {
251
+ await anomalyReportCommand(opts);
252
+ return;
253
+ }
250
254
  const checks = await runChecks();
251
255
  if (opts.json) {
252
256
  const fails = checks.filter(c => c.status === 'fail').length;
@@ -257,3 +261,35 @@ export async function doctorCommand(opts = {}) {
257
261
  const fails = checks.filter(c => c.status === 'fail').length;
258
262
  process.exit(fails > 0 ? 1 : 0);
259
263
  }
264
+ /**
265
+ * `franklin doctor --anomaly` — print failure spikes vs 30-day baseline.
266
+ * Exits non-zero when at least one anomaly is surfaced, so it can be
267
+ * wired into a cron / CI without parsing stdout.
268
+ */
269
+ async function anomalyReportCommand(opts) {
270
+ const { getToolAnomalies } = await import('../stats/failures.js');
271
+ const reports = getToolAnomalies();
272
+ if (opts.json) {
273
+ process.stdout.write(JSON.stringify({ anomalies: reports }, null, 2) + '\n');
274
+ process.exit(reports.length > 0 ? 1 : 0);
275
+ }
276
+ console.log(chalk.bold('\n franklin doctor --anomaly'));
277
+ console.log(chalk.dim(' Looking for (tool, category) failure spikes in the last 24h vs the 30-day baseline.\n'));
278
+ if (reports.length === 0) {
279
+ console.log(chalk.green(' No anomalies. Tool failure rates match the 30-day baseline.\n'));
280
+ process.exit(0);
281
+ }
282
+ for (const a of reports) {
283
+ const newType = !Number.isFinite(a.spikeRatio);
284
+ const header = ` ${chalk.red('•')} ${chalk.bold(a.toolName)} / ${chalk.yellow(a.category)}`;
285
+ const ratio = newType
286
+ ? chalk.red('NEW failure type (no baseline)')
287
+ : chalk.red(`${a.spikeRatio.toFixed(1)}× baseline`);
288
+ const counts = chalk.dim(`recent=${a.recentCount}, baseline=${a.baselineCount}`);
289
+ console.log(`${header} ${ratio} ${counts}`);
290
+ const trimmed = a.sampleMessage.length > 140 ? a.sampleMessage.slice(0, 140) + '…' : a.sampleMessage;
291
+ console.log(chalk.dim(` sample: ${trimmed}`));
292
+ }
293
+ console.log(chalk.dim(`\n ${reports.length} anomalies. Investigate before they snowball.\n`));
294
+ process.exit(1);
295
+ }
package/dist/index.js CHANGED
@@ -185,6 +185,7 @@ program
185
185
  .command('doctor')
186
186
  .description('One-command health check (node, wallet, chain, gateway, MCP, telemetry)')
187
187
  .option('--json', 'Machine-readable output')
188
+ .option('--anomaly', 'Surface (tool, category) failure spikes vs 30-day baseline')
188
189
  .action(async (opts) => {
189
190
  const { doctorCommand } = await import('./commands/doctor.js');
190
191
  await doctorCommand(opts);
@@ -4,7 +4,7 @@ import { recordUsage } from '../stats/tracker.js';
4
4
  import { appendSettlementRow } from '../stats/cost-log.js';
5
5
  import { appendAudit } from '../stats/audit.js';
6
6
  import { buildFallbackChain, DEFAULT_FALLBACK_CONFIG, ROUTING_PROFILES, } from './fallback.js';
7
- import { routeRequest, parseRoutingProfile, } from '../router/index.js';
7
+ import { routeRequest, parseRoutingProfile, isVisionModel, messagesNeedVision, pickVisionSibling, } from '../router/index.js';
8
8
  import { estimateCost } from '../pricing.js';
9
9
  import { VERSION } from '../config.js';
10
10
  // User-Agent for backend requests
@@ -342,6 +342,13 @@ export function createProxy(options) {
342
342
  parsed.model = currentModel || DEFAULT_MODEL;
343
343
  }
344
344
  requestModel = parsed.model || DEFAULT_MODEL;
345
+ // Vision-need detection: does this request carry an image? We
346
+ // check messages[] for explicit image / image_url parts AND for
347
+ // image paths embedded in text — Anthropic-format proxies stream
348
+ // both shapes. Used both by the Auto router (pick a vision-capable
349
+ // tier model) and by the manual-mode guard (swap when the user
350
+ // explicitly picked a text-only model).
351
+ const proxyNeedsVision = messagesNeedVision(parsed.messages || []);
345
352
  // Smart routing: if model is a routing profile, classify and route
346
353
  const routingProfile = parseRoutingProfile(requestModel);
347
354
  if (routingProfile) {
@@ -360,13 +367,27 @@ export function createProxy(options) {
360
367
  .join('\n');
361
368
  }
362
369
  }
363
- // Route the request
364
- const routing = routeRequest(promptText, routingProfile);
370
+ // Route the request — propagate vision-need so AUTO_TIERS' V4
371
+ // Pro default doesn't get picked for an image-bearing turn.
372
+ const routing = routeRequest(promptText, routingProfile, proxyNeedsVision);
365
373
  parsed.model = routing.model;
366
374
  requestModel = routing.model;
367
375
  logger.info(`[franklin] 🧠 Smart routing: ${routingProfile} → ${routing.tier} → ${routing.model} ` +
368
376
  `(${(routing.savings * 100).toFixed(0)}% savings) [${routing.signals.join(', ')}]`);
369
377
  }
378
+ else if (proxyNeedsVision && !isVisionModel(requestModel)) {
379
+ // Manual-mode guard: user (or an upstream client) passed a
380
+ // concrete text-only model alongside an image. Swap to the
381
+ // family-closest vision sibling and log loudly — silently
382
+ // sending the image would tokenize as base64 text and produce
383
+ // a hallucinated answer. Same swap policy as the agent loop's
384
+ // interactive path so behavior is consistent across surfaces.
385
+ const original = requestModel;
386
+ const visionSwap = pickVisionSibling(original);
387
+ parsed.model = visionSwap;
388
+ requestModel = visionSwap;
389
+ logger.warn(`[franklin] 👁️ Vision swap: ${original} can't see images → ${visionSwap}`);
390
+ }
370
391
  {
371
392
  const original = parsed.max_tokens;
372
393
  const model = (parsed.model || '').toLowerCase();
@@ -10,6 +10,7 @@
10
10
  * Local Elo adjustments personalize routing per user over time.
11
11
  */
12
12
  import { type Category } from './categories.js';
13
+ export { isVisionModel, messageNeedsVision, messagesNeedVision, pickVisionSibling } from './vision.js';
13
14
  export type Tier = 'SIMPLE' | 'MEDIUM' | 'COMPLEX' | 'REASONING';
14
15
  export type RoutingProfile = 'auto' | 'free';
15
16
  export interface RoutingResult {
@@ -33,7 +34,7 @@ export declare function llmClassifyRequest(prompt: string): Promise<Tier | null>
33
34
  * Profile-specific tier tables (AUTO / ECO / PREMIUM / FREE) still pick
34
35
  * the concrete model; the classifier only picks the TIER.
35
36
  */
36
- export declare function routeRequestAsync(prompt: string, profile?: RoutingProfile, classify?: TierClassifier): Promise<RoutingResult>;
37
+ export declare function routeRequestAsync(prompt: string, profile?: RoutingProfile, classify?: TierClassifier, needsVision?: boolean): Promise<RoutingResult>;
37
38
  /**
38
39
  * Map a pre-classified tier to a concrete model + savings using the profile's
39
40
  * tier table. No classifier call — assumes the caller already decided the
@@ -43,8 +44,8 @@ export declare function routeRequestAsync(prompt: string, profile?: RoutingProfi
43
44
  * Use this when you have a tier already. Use `routeRequestAsync` when you
44
45
  * need the classifier to produce the tier.
45
46
  */
46
- export declare function resolveTierToModel(tier: Tier, profile?: RoutingProfile): RoutingResult;
47
- export declare function routeRequest(prompt: string, profile?: RoutingProfile): RoutingResult;
47
+ export declare function resolveTierToModel(tier: Tier, profile?: RoutingProfile, needsVision?: boolean): RoutingResult;
48
+ export declare function routeRequest(prompt: string, profile?: RoutingProfile, needsVision?: boolean): RoutingResult;
48
49
  /**
49
50
  * Get fallback models for a tier
50
51
  */
@@ -16,6 +16,8 @@ import { BLOCKRUN_DIR } from '../config.js';
16
16
  import { detectCategory, mapCategoryToTier } from './categories.js';
17
17
  import { selectModel } from './selector.js';
18
18
  import { computeLocalElo, blendElo } from './local-elo.js';
19
+ import { isVisionModel } from './vision.js';
20
+ export { isVisionModel, messageNeedsVision, messagesNeedVision, pickVisionSibling } from './vision.js';
19
21
  // ─── Learned Weights Loading ───
20
22
  const WEIGHTS_FILE = path.join(BLOCKRUN_DIR, 'router-weights.json');
21
23
  let cachedWeights; // undefined = not loaded yet
@@ -69,6 +71,27 @@ const AUTO_TIERS = {
69
71
  ],
70
72
  },
71
73
  };
74
+ /**
75
+ * If this turn carries an image, the picked tier model must be able to see it.
76
+ * Walks the tier's primary+fallback chain for the first vision-capable model;
77
+ * if none of them have vision, escalates to COMPLEX (Opus is always vision).
78
+ *
79
+ * Note: only applied when the caller signals needsVision=true. Without that
80
+ * hint the classic per-tier defaults still rule — V4 Pro's $0.50/$1.00 promo
81
+ * is the right SIMPLE/MEDIUM pick for text-only turns and we don't want to
82
+ * blanket-upgrade everyone to a vision model.
83
+ */
84
+ function pickVisionTierModel(tier) {
85
+ const chain = [AUTO_TIERS[tier].primary, ...AUTO_TIERS[tier].fallback];
86
+ const visionInTier = chain.find(isVisionModel);
87
+ if (visionInTier)
88
+ return { model: visionInTier, tier, signal: 'vision-required' };
89
+ // Tier chain is fully text-only (unusual but possible if cheap tiers get
90
+ // re-tuned). Escalate to COMPLEX whose primary (Opus) is always vision.
91
+ const escalated = [AUTO_TIERS.COMPLEX.primary, ...AUTO_TIERS.COMPLEX.fallback]
92
+ .find(isVisionModel) ?? AUTO_TIERS.COMPLEX.primary;
93
+ return { model: escalated, tier: 'COMPLEX', signal: 'vision-escalated' };
94
+ }
72
95
  // ─── Keywords for Classification ───
73
96
  //
74
97
  // Keyword fast-path uses English only by policy (English-only-source rule).
@@ -250,7 +273,7 @@ function classifyRequest(prompt, tokenCount) {
250
273
  return { tier, confidence, signals };
251
274
  }
252
275
  // ─── Classic Router (keyword-based fallback) ───
253
- function classicRouteRequest(prompt, profile) {
276
+ function classicRouteRequest(prompt, profile, needsVision = false) {
254
277
  // Estimate token count (use byte length / 4 for better accuracy with non-ASCII)
255
278
  const byteLen = Buffer.byteLength(prompt, 'utf-8');
256
279
  const tokenCount = Math.ceil(byteLen / 4);
@@ -260,11 +283,21 @@ function classicRouteRequest(prompt, profile) {
260
283
  // 2026-05-03 — see comment on RoutingProfile above). 'free' is handled
261
284
  // earlier by the caller path; if it ever reaches here, fall through to
262
285
  // AUTO_TIERS rather than crashing.
263
- const tierConfigs = AUTO_TIERS;
264
- const model = tierConfigs[tier].primary;
286
+ let model;
287
+ let finalTier = tier;
288
+ const finalSignals = [...signals];
289
+ if (needsVision) {
290
+ const v = pickVisionTierModel(tier);
291
+ model = v.model;
292
+ finalTier = v.tier;
293
+ finalSignals.push(v.signal);
294
+ }
295
+ else {
296
+ model = AUTO_TIERS[tier].primary;
297
+ }
265
298
  const savings = computeSavings(model);
266
299
  const category = detectCategory(prompt, loadLearnedWeights()?.category_keywords).category;
267
- return { model, tier, confidence, signals, savings, category };
300
+ return { model, tier: finalTier, confidence, signals: finalSignals, savings, category };
268
301
  }
269
302
  // ─── LLM-based classifier ───
270
303
  //
@@ -362,25 +395,35 @@ export async function llmClassifyRequest(prompt) {
362
395
  * Profile-specific tier tables (AUTO / ECO / PREMIUM / FREE) still pick
363
396
  * the concrete model; the classifier only picks the TIER.
364
397
  */
365
- export async function routeRequestAsync(prompt, profile = 'auto', classify = llmClassifyRequest) {
398
+ export async function routeRequestAsync(prompt, profile = 'auto', classify = llmClassifyRequest, needsVision = false) {
366
399
  // Free / short-circuit profiles — no classifier needed.
367
400
  if (profile === 'free')
368
- return routeRequest(prompt, profile);
401
+ return routeRequest(prompt, profile, needsVision);
369
402
  const tier = await classify(prompt).catch(() => null);
370
403
  if (!tier) {
371
404
  // Classifier miss or disabled — fall through to the sync keyword router.
372
- return routeRequest(prompt, profile);
405
+ return routeRequest(prompt, profile, needsVision);
373
406
  }
374
407
  // Build a RoutingResult from the LLM-picked tier using the same tier
375
408
  // tables the keyword path uses. Keeps downstream code path-identical.
376
- const tierConfigs = AUTO_TIERS;
377
- const model = tierConfigs[tier].primary;
409
+ let model;
410
+ let finalTier = tier;
411
+ const signals = ['llm-classified'];
412
+ if (needsVision) {
413
+ const v = pickVisionTierModel(tier);
414
+ model = v.model;
415
+ finalTier = v.tier;
416
+ signals.push(v.signal);
417
+ }
418
+ else {
419
+ model = AUTO_TIERS[tier].primary;
420
+ }
378
421
  const category = detectCategory(prompt, loadLearnedWeights()?.category_keywords).category;
379
422
  return {
380
423
  model,
381
- tier,
424
+ tier: finalTier,
382
425
  confidence: 0.85, // LLM classification — medium-high confidence
383
- signals: ['llm-classified'],
426
+ signals,
384
427
  savings: computeSavings(model),
385
428
  category,
386
429
  };
@@ -394,36 +437,51 @@ export async function routeRequestAsync(prompt, profile = 'auto', classify = llm
394
437
  * Use this when you have a tier already. Use `routeRequestAsync` when you
395
438
  * need the classifier to produce the tier.
396
439
  */
397
- export function resolveTierToModel(tier, profile = 'auto') {
440
+ export function resolveTierToModel(tier, profile = 'auto', needsVision = false) {
398
441
  // Free profile short-circuits — everything routes to a single free model.
442
+ // qwen3-coder-480b is text-only; on a vision turn the free profile can't
443
+ // help us. Caller should detect this and warn the user that Free won't
444
+ // handle images — for now we just return the free pick and let the model
445
+ // fail gracefully. (Open question: should we hard-fall to nvidia/llama-4-
446
+ // maverick here? Skipped until we see a real user hit this path.)
399
447
  if (profile === 'free') {
400
448
  return {
401
449
  model: 'nvidia/qwen3-coder-480b',
402
450
  tier: 'SIMPLE',
403
451
  confidence: 1.0,
404
- signals: ['free-profile'],
452
+ signals: needsVision ? ['free-profile', 'vision-unsupported'] : ['free-profile'],
405
453
  savings: 1.0,
406
454
  };
407
455
  }
408
- const tierConfigs = AUTO_TIERS;
409
- const model = tierConfigs[tier].primary;
456
+ let model;
457
+ let finalTier = tier;
458
+ const signals = ['pre-classified'];
459
+ if (needsVision) {
460
+ const v = pickVisionTierModel(tier);
461
+ model = v.model;
462
+ finalTier = v.tier;
463
+ signals.push(v.signal);
464
+ }
465
+ else {
466
+ model = AUTO_TIERS[tier].primary;
467
+ }
410
468
  return {
411
469
  model,
412
- tier,
470
+ tier: finalTier,
413
471
  confidence: 0.85,
414
- signals: ['pre-classified'],
472
+ signals,
415
473
  savings: computeSavings(model),
416
474
  };
417
475
  }
418
476
  // ─── Main Router ───
419
- export function routeRequest(prompt, profile = 'auto') {
477
+ export function routeRequest(prompt, profile = 'auto', needsVision = false) {
420
478
  // Free profile — always use free model
421
479
  if (profile === 'free') {
422
480
  return {
423
481
  model: 'nvidia/qwen3-coder-480b',
424
482
  tier: 'SIMPLE',
425
483
  confidence: 1.0,
426
- signals: ['free-profile'],
484
+ signals: needsVision ? ['free-profile', 'vision-unsupported'] : ['free-profile'],
427
485
  savings: 1.0,
428
486
  };
429
487
  }
@@ -432,7 +490,7 @@ export function routeRequest(prompt, profile = 'auto') {
432
490
  // cheap/weak models on agentic work. Classic AUTO_TIERS defaults are
433
491
  // agent-tuned (Sonnet-tier backbone) and more predictable for users.
434
492
  if (profile === 'auto') {
435
- return classicRouteRequest(prompt, profile);
493
+ return classicRouteRequest(prompt, profile, needsVision);
436
494
  }
437
495
  // ── Learned routing (if weights available) ──
438
496
  const weights = loadLearnedWeights();
@@ -457,6 +515,21 @@ export function routeRequest(prompt, profile = 'auto') {
457
515
  const selected = selectModel(category, profile, adjustedWeights);
458
516
  if (selected) {
459
517
  const tier = mapCategoryToTier(category);
518
+ // Vision-aware substitution: if the Elo-picked model is text-only but
519
+ // the turn needs vision, swap to the tier's first vision-capable model.
520
+ // We deliberately don't blend Elo with vision capability — vision is a
521
+ // hard requirement, not a quality dimension.
522
+ if (needsVision && !isVisionModel(selected.model)) {
523
+ const v = pickVisionTierModel(tier);
524
+ return {
525
+ model: v.model,
526
+ tier: v.tier,
527
+ confidence,
528
+ signals: [category, v.signal],
529
+ savings: computeSavings(v.model),
530
+ category,
531
+ };
532
+ }
460
533
  const savings = computeSavings(selected.model);
461
534
  return {
462
535
  model: selected.model,
@@ -470,7 +543,7 @@ export function routeRequest(prompt, profile = 'auto') {
470
543
  // Fall through to classic if selectModel returns null (no candidates for category)
471
544
  }
472
545
  // ── Classic routing (keyword-based fallback) ──
473
- return classicRouteRequest(prompt, profile);
546
+ return classicRouteRequest(prompt, profile, needsVision);
474
547
  }
475
548
  function computeSavings(model) {
476
549
  const opusCostPer1K = (OPUS_PRICING.input + OPUS_PRICING.output) / 2 / 1000;
@@ -0,0 +1,51 @@
1
+ /**
2
+ * Vision capability + image-attachment detection.
3
+ *
4
+ * Two jobs:
5
+ * 1. isVisionModel(id) — does this gateway model accept image input?
6
+ * 2. messageNeedsVision(text) — does this user message reference an image?
7
+ *
8
+ * Source of truth: a hand-curated allowlist below. The gateway exposes a
9
+ * 'vision' category on /v1/models, but resolving it at routing time would
10
+ * make routeRequest async and gate sync proxy paths on a network call. The
11
+ * allowlist is small (~18 entries) and changes only when models do, which
12
+ * already touches the router + pricing tables — updating one more file is
13
+ * the right tradeoff vs. async fan-out across every routing callsite.
14
+ *
15
+ * Background: prior to this module, Auto routing could pick a text-only model
16
+ * (e.g. deepseek-v4-pro) on an image-bearing turn. The Read tool would still
17
+ * inline image bytes, the gateway would tokenize the base64 as text, and the
18
+ * model — having no vision pathway — would hallucinate based on the
19
+ * `Image file: <path>` description string. Expensive AND wrong.
20
+ */
21
+ /** Does this concrete gateway model accept image input? */
22
+ export declare function isVisionModel(modelId: string | undefined | null): boolean;
23
+ /**
24
+ * Pick a vision-capable replacement closest to the user's chosen model.
25
+ * Prefers same provider family (so the user's intent — "I want Claude" vs
26
+ * "I want Gemini" — survives the swap), then falls back to a sensible
27
+ * vision default (Sonnet 4.6 — agent-tuned, mid-tier price).
28
+ */
29
+ export declare function pickVisionSibling(modelId: string): string;
30
+ /**
31
+ * Does this user-typed message reference an image file? Used by the router
32
+ * to bump Auto mode to a vision-capable tier, and by the manual-mode guard
33
+ * to swap a text-only model for one turn.
34
+ *
35
+ * Detection is intentionally a regex over file extensions rather than a
36
+ * filesystem stat — the user may type a path that doesn't yet exist
37
+ * (about to wget it) or a glob; what we care about is "does the model need
38
+ * eyes for this turn?" The false-positive risk is benign (we route to a
39
+ * slightly stronger model than strictly needed).
40
+ */
41
+ export declare function messageNeedsVision(text: string | undefined | null): boolean;
42
+ /**
43
+ * Messages-array variant: scans OpenAI- and Anthropic-format content blocks
44
+ * for explicit image parts (image / image_url / input_image) and for image
45
+ * paths embedded in text parts. Used by the proxy router which receives a
46
+ * fully-formed messages[] payload, not a single string.
47
+ */
48
+ export declare function messagesNeedVision(messages: Array<{
49
+ role?: string;
50
+ content?: unknown;
51
+ }> | undefined | null): boolean;
@@ -0,0 +1,127 @@
1
+ /**
2
+ * Vision capability + image-attachment detection.
3
+ *
4
+ * Two jobs:
5
+ * 1. isVisionModel(id) — does this gateway model accept image input?
6
+ * 2. messageNeedsVision(text) — does this user message reference an image?
7
+ *
8
+ * Source of truth: a hand-curated allowlist below. The gateway exposes a
9
+ * 'vision' category on /v1/models, but resolving it at routing time would
10
+ * make routeRequest async and gate sync proxy paths on a network call. The
11
+ * allowlist is small (~18 entries) and changes only when models do, which
12
+ * already touches the router + pricing tables — updating one more file is
13
+ * the right tradeoff vs. async fan-out across every routing callsite.
14
+ *
15
+ * Background: prior to this module, Auto routing could pick a text-only model
16
+ * (e.g. deepseek-v4-pro) on an image-bearing turn. The Read tool would still
17
+ * inline image bytes, the gateway would tokenize the base64 as text, and the
18
+ * model — having no vision pathway — would hallucinate based on the
19
+ * `Image file: <path>` description string. Expensive AND wrong.
20
+ */
21
+ const VISION_MODELS = new Set([
22
+ // Anthropic — native vision across the line
23
+ 'anthropic/claude-opus-4.7',
24
+ 'anthropic/claude-opus-4.6',
25
+ 'anthropic/claude-sonnet-4.6',
26
+ 'anthropic/claude-haiku-4.5-20251001',
27
+ // OpenAI — multimodal flagships + o3 (Codex 5.3 is text-only, excluded)
28
+ 'openai/gpt-5.5',
29
+ 'openai/gpt-5.4',
30
+ 'openai/gpt-5.4-pro',
31
+ 'openai/gpt-5.2',
32
+ 'openai/gpt-5.2-pro',
33
+ 'openai/gpt-5-mini',
34
+ 'openai/gpt-4.1',
35
+ 'openai/o3',
36
+ // Google — vision baked into every Gemini SKU we surface
37
+ 'google/gemini-3.1-pro',
38
+ 'google/gemini-2.5-pro',
39
+ 'google/gemini-2.5-flash',
40
+ // xAI — only Grok 4 base supports vision; grok-4-1-fast-reasoning is text-only
41
+ 'xai/grok-4-0709',
42
+ 'xai/grok-3',
43
+ // Moonshot — K2.6 added vision + reasoning when it replaced K2.5
44
+ 'moonshot/kimi-k2.6',
45
+ // NVIDIA inference — Llama 4 Maverick is multimodal; deepseek/qwen-coder are not
46
+ 'nvidia/llama-4-maverick',
47
+ ]);
48
+ /** Does this concrete gateway model accept image input? */
49
+ export function isVisionModel(modelId) {
50
+ if (!modelId)
51
+ return false;
52
+ return VISION_MODELS.has(modelId);
53
+ }
54
+ /** Lower-cased copy used for prefix family matching in pickVisionSibling. */
55
+ const VISION_MODELS_LIST = Array.from(VISION_MODELS);
56
+ /**
57
+ * Pick a vision-capable replacement closest to the user's chosen model.
58
+ * Prefers same provider family (so the user's intent — "I want Claude" vs
59
+ * "I want Gemini" — survives the swap), then falls back to a sensible
60
+ * vision default (Sonnet 4.6 — agent-tuned, mid-tier price).
61
+ */
62
+ export function pickVisionSibling(modelId) {
63
+ const family = modelId.split('/')[0]?.toLowerCase();
64
+ if (family) {
65
+ const sibling = VISION_MODELS_LIST.find(m => m.startsWith(`${family}/`));
66
+ if (sibling)
67
+ return sibling;
68
+ }
69
+ return 'anthropic/claude-sonnet-4.6';
70
+ }
71
+ // Image file extensions Franklin's Read tool inlines as vision content. Keep
72
+ // this in sync with IMAGE_MEDIA_TYPES in src/tools/read.ts — if Read learns a
73
+ // new format (e.g. .avif), this regex needs to learn it too or routing will
74
+ // silently miss it.
75
+ //
76
+ // We match the basename only ("foo.png"), preceded by any path separator or
77
+ // punctuation. Trying to match full path prefixes ("./", "/", "~/", "C:\")
78
+ // in one regex produced false negatives on Windows-style paths because of
79
+ // the `:` and `\` separators. The basename anchor is enough — a bare
80
+ // `foo.png` reference is what the Read tool actually needs to inline bytes.
81
+ const IMAGE_PATH_RE = /(?:^|[\s"'`(<\[\\/])[\w@%+-]+\.(?:png|jpe?g|gif|webp)(?=$|[\s"'`)>\],.?!:;])/i;
82
+ /**
83
+ * Does this user-typed message reference an image file? Used by the router
84
+ * to bump Auto mode to a vision-capable tier, and by the manual-mode guard
85
+ * to swap a text-only model for one turn.
86
+ *
87
+ * Detection is intentionally a regex over file extensions rather than a
88
+ * filesystem stat — the user may type a path that doesn't yet exist
89
+ * (about to wget it) or a glob; what we care about is "does the model need
90
+ * eyes for this turn?" The false-positive risk is benign (we route to a
91
+ * slightly stronger model than strictly needed).
92
+ */
93
+ export function messageNeedsVision(text) {
94
+ if (!text)
95
+ return false;
96
+ return IMAGE_PATH_RE.test(text);
97
+ }
98
+ /**
99
+ * Messages-array variant: scans OpenAI- and Anthropic-format content blocks
100
+ * for explicit image parts (image / image_url / input_image) and for image
101
+ * paths embedded in text parts. Used by the proxy router which receives a
102
+ * fully-formed messages[] payload, not a single string.
103
+ */
104
+ export function messagesNeedVision(messages) {
105
+ if (!messages || messages.length === 0)
106
+ return false;
107
+ for (const msg of messages) {
108
+ if (msg.role && msg.role !== 'user')
109
+ continue;
110
+ const content = msg.content;
111
+ if (typeof content === 'string') {
112
+ if (messageNeedsVision(content))
113
+ return true;
114
+ continue;
115
+ }
116
+ if (!Array.isArray(content))
117
+ continue;
118
+ for (const part of content) {
119
+ const t = part?.type;
120
+ if (t === 'image' || t === 'image_url' || t === 'input_image')
121
+ return true;
122
+ if (t === 'text' && messageNeedsVision(part.text))
123
+ return true;
124
+ }
125
+ }
126
+ return false;
127
+ }
@@ -1,7 +1,27 @@
1
1
  /**
2
2
  * Structured failure logging for self-evolution analysis.
3
3
  * Append-only JSONL at ~/.blockrun/failures.jsonl (capped 500 records).
4
+ *
5
+ * 2026-05-11: Adopted a Cursor-style tool-failure taxonomy on the
6
+ * `category` field. Lets us:
7
+ * 1. Tell at a glance whether a spike of failures is the model's
8
+ * fault (InvalidArguments), the environment's fault
9
+ * (UnexpectedEnvironment), an upstream's fault (ProviderError),
10
+ * a user action (UserAborted), or a slow path (Timeout).
11
+ * 2. Build per-(tool, category) baselines for anomaly detection —
12
+ * see `getToolAnomalies()` below.
13
+ *
14
+ * The existing single-line errorMessage column is preserved so older
15
+ * records still parse. classifyToolFailure() auto-classifies records
16
+ * without a category field on read, so historical entries flow into
17
+ * the same dashboards without a migration.
4
18
  */
19
+ /**
20
+ * Coarse classification of a tool failure. Mirrors Cursor's published
21
+ * "Tool reliability" taxonomy so error dashboards translate cleanly
22
+ * across the industry, but tuned for Franklin's tool surface.
23
+ */
24
+ export type ToolFailureCategory = 'InvalidArguments' | 'UnexpectedEnvironment' | 'ProviderError' | 'UserAborted' | 'Timeout' | 'Unknown';
5
25
  export interface FailureRecord {
6
26
  timestamp: number;
7
27
  model: string;
@@ -9,12 +29,66 @@ export interface FailureRecord {
9
29
  toolName?: string;
10
30
  errorMessage: string;
11
31
  recoveryAction?: string;
32
+ /**
33
+ * Coarse classification of the failure. Set by recordFailure() when
34
+ * a record is written, or auto-filled by loadFailures() for older
35
+ * records that pre-date this field.
36
+ */
37
+ category?: ToolFailureCategory;
12
38
  }
39
/**
 * Classify a tool failure by matching the error message + tool name
 * against known patterns. Layered top-to-bottom — first match wins.
 * `Unknown` is the catch-all; if you see one in production, the
 * classifier needs a new branch (file a follow-up).
 */
export declare function classifyToolFailure(errorMessage: string, toolName?: string): ToolFailureCategory;
/**
 * Append one failure record to the JSONL log (best-effort). The
 * implementation auto-fills `category` via classifyToolFailure() when
 * the caller did not set one.
 */
export declare function recordFailure(record: FailureRecord): void;
/**
 * Load up to `limit` most-recent failure records (default 100).
 * Records written before the `category` field existed are classified
 * on read; the on-disk file is not rewritten.
 */
export declare function loadFailures(limit?: number): FailureRecord[];
/**
 * Aggregate the persisted failures: counts keyed by tool name, by
 * failure type, and by category, plus the ten most recent records.
 */
export declare function getFailureStats(): {
    byTool: Map<string, number>;
    byType: Map<string, number>;
    byCategory: Map<ToolFailureCategory, number>;
    total: number;
    recentFailures: FailureRecord[];
};
/** One anomalous (tool, category) bucket surfaced by getToolAnomalies(). */
export interface AnomalyReport {
    /** Tool the failures belong to ('<no-tool>' when unattributed). */
    toolName: string;
    /** Failure category of this bucket. */
    category: ToolFailureCategory;
    /** Failures in this bucket within the recent window. */
    recentCount: number;
    /** Failures in this bucket in the baseline window (excluding the recent window). */
    baselineCount: number;
    baselineWindowMs: number;
    recentWindowMs: number;
    /**
     * Multiplier of recent-rate vs baseline-rate. Infinity when the
     * baseline is zero (i.e. a new failure type appeared). 1.0 = same
     * rate as baseline.
     */
    spikeRatio: number;
    /** Most recent error message in this bucket — useful for triage. */
    sampleMessage: string;
}
/** Tuning knobs for getToolAnomalies(). */
export interface AnomalyOptions {
    /** Recent window in ms. Default 24h. */
    recentWindowMs?: number;
    /** Baseline window in ms (counted from now, includes the recent window). Default 30d. */
    baselineWindowMs?: number;
    /** Minimum recent count to consider — filters out single-flake noise. Default 3. */
    minRecent?: number;
    /** Minimum spike ratio to surface. Default 3.0. */
    minSpikeRatio?: number;
}
/**
 * Compute (tool, category) anomalies vs a rolling baseline.
 *
 * Returns the buckets where the recent failure rate is dramatically
 * higher than baseline — sorted by spike severity. Skips buckets where
 * `recentCount` is below `minRecent` to avoid surfacing every flaky
 * one-off.
 *
 * A bucket with `baselineCount=0` and `recentCount >= minRecent` is
 * always surfaced (spikeRatio = Infinity) — these are brand-new failure
 * modes that the harness has never seen before, and they're the most
 * important kind to investigate.
 */
export declare function getToolAnomalies(opts?: AnomalyOptions): AnomalyReport[];
@@ -1,16 +1,101 @@
1
1
  /**
2
2
  * Structured failure logging for self-evolution analysis.
3
3
  * Append-only JSONL at ~/.blockrun/failures.jsonl (capped 500 records).
4
+ *
5
+ * 2026-05-11: Adopted a Cursor-style tool-failure taxonomy on the
6
+ * `category` field. Lets us:
7
+ * 1. Tell at a glance whether a spike of failures is the model's
8
+ * fault (InvalidArguments), the environment's fault
9
+ * (UnexpectedEnvironment), an upstream's fault (ProviderError),
10
+ * a user action (UserAborted), or a slow path (Timeout).
11
+ * 2. Build per-(tool, category) baselines for anomaly detection —
12
+ * see `getToolAnomalies()` below.
13
+ *
14
+ * The existing single-line errorMessage column is preserved so older
15
+ * records still parse. classifyToolFailure() auto-classifies records
16
+ * without a category field on read, so historical entries flow into
17
+ * the same dashboards without a migration.
4
18
  */
5
19
  import fs from 'node:fs';
6
20
  import path from 'node:path';
7
21
  import { BLOCKRUN_DIR } from '../config.js';
8
- const FAILURES_FILE = path.join(BLOCKRUN_DIR, 'failures.jsonl');
22
/**
 * Resolve the failures-file path lazily, at each call, rather than once
 * at module load. This lets tests sandbox persistence via the
 * FRANKLIN_HOME environment variable (an established convention — see
 * src/tasks/paths.ts) while production keeps the default
 * ~/.blockrun/failures.jsonl path unchanged.
 *
 * @returns {string} absolute path to the failures JSONL file
 */
function failuresFile() {
    const sandboxHome = process.env.FRANKLIN_HOME;
    if (sandboxHome) {
        return path.join(sandboxHome, 'failures.jsonl');
    }
    return path.join(BLOCKRUN_DIR, 'failures.jsonl');
}
34
/**
 * Classify a tool failure by matching the error message + tool name
 * against known patterns. Layered top-to-bottom — first match wins.
 * `Unknown` is the catch-all; if you see one in production, the
 * classifier needs a new branch (file a follow-up).
 *
 * @param {string} errorMessage - Raw error text; null/undefined tolerated.
 * @param {string} [toolName] - Tool that failed, enables tool-specific tells.
 * @returns {string} one of the ToolFailureCategory values
 */
export function classifyToolFailure(errorMessage, toolName) {
    const m = (errorMessage || '').toLowerCase();
    // UserAborted — user-initiated cancel or harness abort signal.
    // Check first because abort messages often *contain* the word
    // "timeout" or "error" and would otherwise misclassify.
    if (/this operation was aborted|user aborted|user cancel|user_cancel|sigint|sigterm|operation cancell?ed|abortcontroller/.test(m)) {
        return 'UserAborted';
    }
    // Timeout — distinct from ProviderError because the *call* succeeded
    // (we sent the request) but exceeded our budget. Tool-level retries
    // shouldn't retry these without escalating the budget.
    if (/timed out after|timeout|deadline exceeded|etimedout|operation timed out|exceeded.*time/.test(m)) {
        return 'Timeout';
    }
    // UnexpectedEnvironment — the world isn't as the model assumed.
    // ENOENT / wallet missing / chain mismatch / cwd not a repo / etc.
    if (/enoent|no such file|cannot find|does not exist|not a (git|directory)|wallet not (configured|found)|insufficient.*(balance|funds|lamports)|not logged in|chain mismatch|invalid wallet|command not found/.test(m)) {
        return 'UnexpectedEnvironment';
    }
    // ProviderError — an upstream service we don't control returned bad.
    // Rate limits, 5xx, gateway 4xx, network failures, fetch failures.
    // NOTE: the former trailing `gateway timeout` alternative was removed
    // as dead code — any message containing "gateway timeout" matches the
    // Timeout branch above first (and `gateway` alone matches it here),
    // so it could never be the deciding pattern.
    if (/rate.?limit|429|5\d\d|gateway|upstream|provider|fetch failed|econn(refused|reset)|enotfound|socket hang up|network error|http \d{3}|api error/.test(m)) {
        return 'ProviderError';
    }
    // InvalidArguments — the model called the tool wrong. Covers schema
    // rejects, missing/extra fields, type mismatches, and the very common
    // "cannot read properties of undefined" pattern that means we got an
    // object shape we didn't expect from the model's input.
    if (/invalid (argument|input|parameter|value|schema)|missing (required|argument|field|parameter)|expected.*(but|got|received)|cannot read (properties|property) of (undefined|null)|typeerror|schema (rejected|mismatch|validation)|bad request|400|invalid.*format|unrecognized/.test(m)) {
        return 'InvalidArguments';
    }
    // Tool-specific tells — only consulted when no generic pattern hit.
    if (toolName) {
        const t = toolName.toLowerCase();
        if (t === 'searchx' || t === 'posttox') {
            if (/login wall|sign in|create account/.test(m))
                return 'UnexpectedEnvironment';
        }
        if (t === 'bash') {
            if (/permission denied|eacces/.test(m))
                return 'UnexpectedEnvironment';
        }
    }
    return 'Unknown';
}
9
85
  const MAX_RECORDS = 500;
10
86
  export function recordFailure(record) {
87
+ if (process.env.FRANKLIN_NO_AUDIT === '1' || process.env.FRANKLIN_NO_PERSIST === '1')
88
+ return;
11
89
  try {
12
- fs.mkdirSync(path.dirname(FAILURES_FILE), { recursive: true });
13
- fs.appendFileSync(FAILURES_FILE, JSON.stringify(record) + '\n');
90
+ // Auto-classify on write so callsites don't need to know the
91
+ // taxonomy. Callers can still override by passing `category`
92
+ // explicitly (e.g. when the abort came from a known SIGINT handler).
93
+ const enriched = {
94
+ ...record,
95
+ category: record.category ?? classifyToolFailure(record.errorMessage, record.toolName),
96
+ };
97
+ fs.mkdirSync(path.dirname(failuresFile()), { recursive: true });
98
+ fs.appendFileSync(failuresFile(), JSON.stringify(enriched) + '\n');
14
99
  // Trim to MAX_RECORDS (only check periodically to avoid constant reads)
15
100
  if (Math.random() < 0.1) {
16
101
  trimFailures();
@@ -22,12 +107,12 @@ export function recordFailure(record) {
22
107
  }
23
108
  function trimFailures() {
24
109
  try {
25
- if (!fs.existsSync(FAILURES_FILE))
110
+ if (!fs.existsSync(failuresFile()))
26
111
  return;
27
- const lines = fs.readFileSync(FAILURES_FILE, 'utf-8').trim().split('\n');
112
+ const lines = fs.readFileSync(failuresFile(), 'utf-8').trim().split('\n');
28
113
  if (lines.length > MAX_RECORDS) {
29
114
  const trimmed = lines.slice(-MAX_RECORDS).join('\n') + '\n';
30
- fs.writeFileSync(FAILURES_FILE, trimmed);
115
+ fs.writeFileSync(failuresFile(), trimmed);
31
116
  }
32
117
  }
33
118
  catch {
@@ -36,10 +121,19 @@ function trimFailures() {
36
121
  }
/**
 * Load up to `limit` most-recent failure records from the JSONL log.
 *
 * Records written before the `category` field existed are auto-classified
 * on read via classifyToolFailure(); the on-disk file is not rewritten —
 * read-side enrichment keeps the file append-only and idempotent.
 *
 * Fix: previously a single corrupt JSONL line (truncated write, manual
 * edit) made JSON.parse throw inside the map, tripping the outer catch
 * and returning [] — silently discarding ALL history. Now each line is
 * parsed individually and bad lines are skipped.
 *
 * @param {number} [limit=100] - Maximum number of records to return.
 * @returns {object[]} parsed FailureRecord objects, oldest first
 */
export function loadFailures(limit = 100) {
    try {
        if (!fs.existsSync(failuresFile()))
            return [];
        const lines = fs.readFileSync(failuresFile(), 'utf-8').trim().split('\n').filter(Boolean);
        const records = [];
        for (const line of lines.slice(-limit)) {
            let parsed;
            try {
                parsed = JSON.parse(line);
            }
            catch {
                // Skip just the corrupt line instead of nuking the whole read.
                continue;
            }
            // Auto-classify historical records that pre-date `category`.
            if (!parsed.category) {
                parsed.category = classifyToolFailure(parsed.errorMessage, parsed.toolName);
            }
            records.push(parsed);
        }
        return records;
    }
    catch {
        // Best-effort: unreadable file behaves like an empty log.
        return [];
    }
}
/**
 * Aggregate the persisted failure log into per-tool, per-failure-type,
 * and per-category counts, plus the ten most recent records.
 *
 * @returns {{byTool: Map, byType: Map, byCategory: Map, total: number, recentFailures: object[]}}
 */
export function getFailureStats() {
    const failures = loadFailures(500);
    const byTool = new Map();
    const byType = new Map();
    const byCategory = new Map();
    // Increment a counter map entry, treating missing keys as zero.
    const bump = (counts, key) => counts.set(key, (counts.get(key) ?? 0) + 1);
    for (const failure of failures) {
        if (failure.toolName) {
            bump(byTool, failure.toolName);
        }
        bump(byType, failure.failureType);
        if (failure.category) {
            bump(byCategory, failure.category);
        }
    }
    return {
        byTool,
        byType,
        byCategory,
        total: failures.length,
        recentFailures: failures.slice(-10),
    };
}
/**
 * Compute (tool, category) anomalies vs a rolling baseline.
 *
 * Returns the buckets where the recent failure rate is dramatically
 * higher than baseline — sorted by spike severity. Skips buckets where
 * `recentCount` is below `minRecent` to avoid surfacing every flaky
 * one-off.
 *
 * A bucket with `baselineCount=0` and `recentCount >= minRecent` is
 * always surfaced (spikeRatio = Infinity) — these are brand-new failure
 * modes that the harness has never seen before, and they're the most
 * important kind to investigate.
 *
 * @param {object} [opts] - AnomalyOptions tuning knobs.
 * @returns {object[]} AnomalyReport entries, most severe first
 */
export function getToolAnomalies(opts = {}) {
    const DAY_MS = 24 * 60 * 60 * 1000;
    const recentWindowMs = opts.recentWindowMs ?? DAY_MS;
    const baselineWindowMs = opts.baselineWindowMs ?? 30 * DAY_MS;
    const minRecent = opts.minRecent ?? 3;
    const minSpikeRatio = opts.minSpikeRatio ?? 3.0;
    const now = Date.now();
    const recentCutoff = now - recentWindowMs;
    const baselineCutoff = now - baselineWindowMs;
    // One accumulator per (tool, category) pair.
    const buckets = new Map();
    const bucketFor = (tool, category) => {
        const id = `${tool}::${category}`;
        let bucket = buckets.get(id);
        if (!bucket) {
            bucket = { tool, category, recent: 0, baseline: 0, sample: '' };
            buckets.set(id, bucket);
        }
        return bucket;
    };
    for (const record of loadFailures(500)) {
        // Older than the baseline horizon — ignore entirely.
        if (record.timestamp < baselineCutoff) {
            continue;
        }
        const bucket = bucketFor(record.toolName ?? '<no-tool>', record.category ?? 'Unknown');
        if (record.timestamp >= recentCutoff) {
            bucket.recent += 1;
            bucket.sample = record.errorMessage; // last seen wins; useful for triage
        }
        else {
            bucket.baseline += 1;
        }
    }
    const reports = [];
    for (const bucket of buckets.values()) {
        // Only buckets with recent activity qualify, and only above the
        // minRecent noise floor.
        if (bucket.recent === 0 || bucket.recent < minRecent) {
            continue;
        }
        // Normalize rates by window length so spikes are comparable across
        // different (recent, baseline) sizes. The baseline window excludes
        // the recent window by construction (we partitioned above).
        const recentRate = bucket.recent / recentWindowMs;
        let spikeRatio;
        if (bucket.baseline > 0) {
            const baselineRate = bucket.baseline / Math.max(1, baselineWindowMs - recentWindowMs);
            spikeRatio = recentRate / baselineRate;
        }
        else {
            // Brand-new failure mode — always surfaced.
            spikeRatio = Number.POSITIVE_INFINITY;
        }
        if (spikeRatio < minSpikeRatio) {
            continue;
        }
        reports.push({
            toolName: bucket.tool,
            category: bucket.category,
            recentCount: bucket.recent,
            baselineCount: bucket.baseline,
            baselineWindowMs,
            recentWindowMs,
            spikeRatio,
            sampleMessage: bucket.sample,
        });
    }
    // Brand-new failures (spikeRatio = Infinity) first, then by ratio
    // descending; ties broken by recent volume.
    reports.sort((a, b) => {
        if (a.spikeRatio !== b.spikeRatio) {
            return b.spikeRatio - a.spikeRatio;
        }
        return b.recentCount - a.recentCount;
    });
    return reports;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@blockrun/franklin",
3
- "version": "3.15.90",
3
+ "version": "3.15.92",
4
4
  "description": "Franklin — The AI agent with a wallet. Spends USDC autonomously to get real work done. Pay per action, no subscriptions.",
5
5
  "type": "module",
6
6
  "exports": {