@blockrun/franklin 3.15.90 → 3.15.92

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,7 +27,7 @@ import { setSessionPersistenceDisabled } from '../session/storage.js';
27
27
  import { estimateCost, OPUS_PRICING } from '../pricing.js';
28
28
  import { maybeMidSessionExtract } from '../learnings/extractor.js';
29
29
  import { extractMentions, buildEntityContext, loadEntities } from '../brain/store.js';
30
- import { routeRequestAsync, resolveTierToModel, parseRoutingProfile, getFallbackChain, pickFreeFallback } from '../router/index.js';
30
+ import { routeRequestAsync, resolveTierToModel, parseRoutingProfile, getFallbackChain, pickFreeFallback, isVisionModel, messageNeedsVision, pickVisionSibling } from '../router/index.js';
31
31
  import { recordOutcome } from '../router/local-elo.js';
32
32
  import { shouldPlan, getPlanningPrompt, getExecutorModel, isExecutorStuck, toolCallSignature } from './planner.js';
33
33
  import { shouldVerify, runVerification } from './verification.js';
@@ -1118,6 +1118,16 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
1118
1118
  onProgress: (id, text) => onEvent({ kind: 'capability_progress', id, text }),
1119
1119
  sessionId,
1120
1120
  });
1121
+ // ── Vision-need detection (per turn) ──
1122
+ // Images enter a turn one of two ways: the user types an image path
1123
+ // and the Read tool will inline bytes mid-turn, or the user references
1124
+ // an image in their last message directly. We can only see (1) at this
1125
+ // point — but that's the case we care about: the router has to decide
1126
+ // BEFORE the model call which model to use. If the model can't see
1127
+ // images, Read's tool_result image blocks get tokenized as base64 text
1128
+ // by the gateway (verified 2026-05-09) and the model hallucinates from
1129
+ // the "Image file: <path>" stub. Detect upfront, route accordingly.
1130
+ const turnNeedsVision = loopCount === 1 && messageNeedsVision(lastUserInput);
1121
1131
  // ── Router: resolve routing profiles to concrete models ──
1122
1132
  // Uses the tier already decided by the turn-analyzer — one LLM call
1123
1133
  // up-front rather than a separate classifier here. Fallback to the
@@ -1129,8 +1139,8 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
1129
1139
  let routingSavings;
1130
1140
  if (routingProfile) {
1131
1141
  const routing = turnAnalysis
1132
- ? resolveTierToModel(turnAnalysis.tier, routingProfile)
1133
- : await routeRequestAsync(lastUserInput || '', routingProfile);
1142
+ ? resolveTierToModel(turnAnalysis.tier, routingProfile, turnNeedsVision)
1143
+ : await routeRequestAsync(lastUserInput || '', routingProfile, undefined, turnNeedsVision);
1134
1144
  resolvedModel = routing.model;
1135
1145
  routingTier = routing.tier;
1136
1146
  routingConfidence = routing.confidence;
@@ -1138,12 +1148,31 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
1138
1148
  lastRoutedModel = routing.model;
1139
1149
  lastRoutedCategory = routing.category || '';
1140
1150
  if (loopCount === 1) {
1151
+ const visionTag = turnNeedsVision ? ' 👁️' : '';
1141
1152
  onEvent({
1142
1153
  kind: 'text_delta',
1143
- text: `*Auto → ${routing.model}*\n\n`,
1154
+ text: `*Auto → ${routing.model}${visionTag}*\n\n`,
1144
1155
  });
1145
1156
  }
1146
1157
  }
1158
+ else if (turnNeedsVision && !isVisionModel(resolvedModel)) {
1159
+ // ── Manual-mode guard ──
1160
+ // User explicitly picked a model that can't see images. Don't silently
1161
+ // send the image — the model would only see the text stub from Read
1162
+ // and hallucinate. Swap to the closest vision sibling JUST for this
1163
+ // turn (next turn's model-recovery block at the top of the user-input
1164
+ // handler resets to baseModel, so the user's intent isn't permanently
1165
+ // overridden). Always emit a visible notice so the user knows their
1166
+ // pick was overridden and why.
1167
+ const original = resolvedModel;
1168
+ const visionSwap = pickVisionSibling(original);
1169
+ resolvedModel = visionSwap;
1170
+ config.model = visionSwap;
1171
+ onEvent({
1172
+ kind: 'text_delta',
1173
+ text: `*⚠️ ${original} can't see images — using ${visionSwap} for this turn.*\n\n`,
1174
+ });
1175
+ }
1147
1176
  // Update token estimation model for more accurate byte-per-token ratio
1148
1177
  setEstimationModel(resolvedModel);
1149
1178
  // ── Plan-then-execute: detect and activate ──
@@ -12,4 +12,5 @@
12
12
  */
13
13
  export declare function doctorCommand(opts?: {
14
14
  json?: boolean;
15
+ anomaly?: boolean;
15
16
  }): Promise<void>;
@@ -247,6 +247,10 @@ function printHuman(checks) {
247
247
  console.log();
248
248
  }
249
249
  export async function doctorCommand(opts = {}) {
250
+ if (opts.anomaly) {
251
+ await anomalyReportCommand(opts);
252
+ return;
253
+ }
250
254
  const checks = await runChecks();
251
255
  if (opts.json) {
252
256
  const fails = checks.filter(c => c.status === 'fail').length;
@@ -257,3 +261,35 @@ export async function doctorCommand(opts = {}) {
257
261
  const fails = checks.filter(c => c.status === 'fail').length;
258
262
  process.exit(fails > 0 ? 1 : 0);
259
263
  }
264
+ /**
265
+ * `franklin doctor --anomaly` — print failure spikes vs 30-day baseline.
266
+ * Exits non-zero when at least one anomaly is surfaced, so it can be
267
+ * wired into a cron / CI without parsing stdout.
268
+ */
269
+ async function anomalyReportCommand(opts) {
270
+ const { getToolAnomalies } = await import('../stats/failures.js');
271
+ const reports = getToolAnomalies();
272
+ if (opts.json) {
273
+ process.stdout.write(JSON.stringify({ anomalies: reports }, null, 2) + '\n');
274
+ process.exit(reports.length > 0 ? 1 : 0);
275
+ }
276
+ console.log(chalk.bold('\n franklin doctor --anomaly'));
277
+ console.log(chalk.dim(' Looking for (tool, category) failure spikes in the last 24h vs the 30-day baseline.\n'));
278
+ if (reports.length === 0) {
279
+ console.log(chalk.green(' No anomalies. Tool failure rates match the 30-day baseline.\n'));
280
+ process.exit(0);
281
+ }
282
+ for (const a of reports) {
283
+ const newType = !Number.isFinite(a.spikeRatio);
284
+ const header = ` ${chalk.red('•')} ${chalk.bold(a.toolName)} / ${chalk.yellow(a.category)}`;
285
+ const ratio = newType
286
+ ? chalk.red('NEW failure type (no baseline)')
287
+ : chalk.red(`${a.spikeRatio.toFixed(1)}× baseline`);
288
+ const counts = chalk.dim(`recent=${a.recentCount}, baseline=${a.baselineCount}`);
289
+ console.log(`${header} ${ratio} ${counts}`);
290
+ const trimmed = a.sampleMessage.length > 140 ? a.sampleMessage.slice(0, 140) + '…' : a.sampleMessage;
291
+ console.log(chalk.dim(` sample: ${trimmed}`));
292
+ }
293
+ console.log(chalk.dim(`\n ${reports.length} anomalies. Investigate before they snowball.\n`));
294
+ process.exit(1);
295
+ }
package/dist/index.js CHANGED
@@ -185,6 +185,7 @@ program
185
185
  .command('doctor')
186
186
  .description('One-command health check (node, wallet, chain, gateway, MCP, telemetry)')
187
187
  .option('--json', 'Machine-readable output')
188
+ .option('--anomaly', 'Surface (tool, category) failure spikes vs 30-day baseline')
188
189
  .action(async (opts) => {
189
190
  const { doctorCommand } = await import('./commands/doctor.js');
190
191
  await doctorCommand(opts);
@@ -4,7 +4,7 @@ import { recordUsage } from '../stats/tracker.js';
4
4
  import { appendSettlementRow } from '../stats/cost-log.js';
5
5
  import { appendAudit } from '../stats/audit.js';
6
6
  import { buildFallbackChain, DEFAULT_FALLBACK_CONFIG, ROUTING_PROFILES, } from './fallback.js';
7
- import { routeRequest, parseRoutingProfile, } from '../router/index.js';
7
+ import { routeRequest, parseRoutingProfile, isVisionModel, messagesNeedVision, pickVisionSibling, } from '../router/index.js';
8
8
  import { estimateCost } from '../pricing.js';
9
9
  import { VERSION } from '../config.js';
10
10
  // User-Agent for backend requests
@@ -342,6 +342,13 @@ export function createProxy(options) {
342
342
  parsed.model = currentModel || DEFAULT_MODEL;
343
343
  }
344
344
  requestModel = parsed.model || DEFAULT_MODEL;
345
+ // Vision-need detection: does this request carry an image? We
346
+ // check messages[] for explicit image / image_url parts AND for
347
+ // image paths embedded in text — Anthropic-format proxies stream
348
+ // both shapes. Used both by the Auto router (pick a vision-capable
349
+ // tier model) and by the manual-mode guard (swap when the user
350
+ // explicitly picked a text-only model).
351
+ const proxyNeedsVision = messagesNeedVision(parsed.messages || []);
345
352
  // Smart routing: if model is a routing profile, classify and route
346
353
  const routingProfile = parseRoutingProfile(requestModel);
347
354
  if (routingProfile) {
@@ -360,13 +367,27 @@ export function createProxy(options) {
360
367
  .join('\n');
361
368
  }
362
369
  }
363
- // Route the request
364
- const routing = routeRequest(promptText, routingProfile);
370
+ // Route the request — propagate vision-need so AUTO_TIERS' V4
371
+ // Pro default doesn't get picked for an image-bearing turn.
372
+ const routing = routeRequest(promptText, routingProfile, proxyNeedsVision);
365
373
  parsed.model = routing.model;
366
374
  requestModel = routing.model;
367
375
  logger.info(`[franklin] 🧠 Smart routing: ${routingProfile} → ${routing.tier} → ${routing.model} ` +
368
376
  `(${(routing.savings * 100).toFixed(0)}% savings) [${routing.signals.join(', ')}]`);
369
377
  }
378
+ else if (proxyNeedsVision && !isVisionModel(requestModel)) {
379
+ // Manual-mode guard: user (or an upstream client) passed a
380
+ // concrete text-only model alongside an image. Swap to the
381
+ // family-closest vision sibling and log loudly — silently
382
+ // sending the image would tokenize as base64 text and produce
383
+ // a hallucinated answer. Same swap policy as the agent loop's
384
+ // interactive path so behavior is consistent across surfaces.
385
+ const original = requestModel;
386
+ const visionSwap = pickVisionSibling(original);
387
+ parsed.model = visionSwap;
388
+ requestModel = visionSwap;
389
+ logger.warn(`[franklin] 👁️ Vision swap: ${original} can't see images → ${visionSwap}`);
390
+ }
370
391
  {
371
392
  const original = parsed.max_tokens;
372
393
  const model = (parsed.model || '').toLowerCase();
@@ -10,6 +10,7 @@
10
10
  * Local Elo adjustments personalize routing per user over time.
11
11
  */
12
12
  import { type Category } from './categories.js';
13
+ export { isVisionModel, messageNeedsVision, messagesNeedVision, pickVisionSibling } from './vision.js';
13
14
  export type Tier = 'SIMPLE' | 'MEDIUM' | 'COMPLEX' | 'REASONING';
14
15
  export type RoutingProfile = 'auto' | 'free';
15
16
  export interface RoutingResult {
@@ -33,7 +34,7 @@ export declare function llmClassifyRequest(prompt: string): Promise<Tier | null>
33
34
  * Profile-specific tier tables (AUTO / ECO / PREMIUM / FREE) still pick
34
35
  * the concrete model; the classifier only picks the TIER.
35
36
  */
36
- export declare function routeRequestAsync(prompt: string, profile?: RoutingProfile, classify?: TierClassifier): Promise<RoutingResult>;
37
+ export declare function routeRequestAsync(prompt: string, profile?: RoutingProfile, classify?: TierClassifier, needsVision?: boolean): Promise<RoutingResult>;
37
38
  /**
38
39
  * Map a pre-classified tier to a concrete model + savings using the profile's
39
40
  * tier table. No classifier call — assumes the caller already decided the
@@ -43,8 +44,8 @@ export declare function routeRequestAsync(prompt: string, profile?: RoutingProfi
43
44
  * Use this when you have a tier already. Use `routeRequestAsync` when you
44
45
  * need the classifier to produce the tier.
45
46
  */
46
- export declare function resolveTierToModel(tier: Tier, profile?: RoutingProfile): RoutingResult;
47
- export declare function routeRequest(prompt: string, profile?: RoutingProfile): RoutingResult;
47
+ export declare function resolveTierToModel(tier: Tier, profile?: RoutingProfile, needsVision?: boolean): RoutingResult;
48
+ export declare function routeRequest(prompt: string, profile?: RoutingProfile, needsVision?: boolean): RoutingResult;
48
49
  /**
49
50
  * Get fallback models for a tier
50
51
  */
@@ -16,6 +16,8 @@ import { BLOCKRUN_DIR } from '../config.js';
16
16
  import { detectCategory, mapCategoryToTier } from './categories.js';
17
17
  import { selectModel } from './selector.js';
18
18
  import { computeLocalElo, blendElo } from './local-elo.js';
19
+ import { isVisionModel } from './vision.js';
20
+ export { isVisionModel, messageNeedsVision, messagesNeedVision, pickVisionSibling } from './vision.js';
19
21
  // ─── Learned Weights Loading ───
20
22
  const WEIGHTS_FILE = path.join(BLOCKRUN_DIR, 'router-weights.json');
21
23
  let cachedWeights; // undefined = not loaded yet
@@ -69,6 +71,27 @@ const AUTO_TIERS = {
69
71
  ],
70
72
  },
71
73
  };
74
+ /**
75
+ * If this turn carries an image, the picked tier model must be able to see it.
76
+ * Walks the tier's primary+fallback chain for the first vision-capable model;
77
+ * if none of them have vision, escalates to COMPLEX (Opus is always vision).
78
+ *
79
+ * Note: only applied when the caller signals needsVision=true. Without that
80
+ * hint the classic per-tier defaults still rule — V4 Pro's $0.50/$1.00 promo
81
+ * is the right SIMPLE/MEDIUM pick for text-only turns and we don't want to
82
+ * blanket-upgrade everyone to a vision model.
83
+ */
84
+ function pickVisionTierModel(tier) {
85
+ const chain = [AUTO_TIERS[tier].primary, ...AUTO_TIERS[tier].fallback];
86
+ const visionInTier = chain.find(isVisionModel);
87
+ if (visionInTier)
88
+ return { model: visionInTier, tier, signal: 'vision-required' };
89
+ // Tier chain is fully text-only (unusual but possible if cheap tiers get
90
+ // re-tuned). Escalate to COMPLEX whose primary (Opus) is always vision.
91
+ const escalated = [AUTO_TIERS.COMPLEX.primary, ...AUTO_TIERS.COMPLEX.fallback]
92
+ .find(isVisionModel) ?? AUTO_TIERS.COMPLEX.primary;
93
+ return { model: escalated, tier: 'COMPLEX', signal: 'vision-escalated' };
94
+ }
72
95
  // ─── Keywords for Classification ───
73
96
  //
74
97
  // Keyword fast-path uses English only by policy (English-only-source rule).
@@ -250,7 +273,7 @@ function classifyRequest(prompt, tokenCount) {
250
273
  return { tier, confidence, signals };
251
274
  }
252
275
  // ─── Classic Router (keyword-based fallback) ───
253
- function classicRouteRequest(prompt, profile) {
276
+ function classicRouteRequest(prompt, profile, needsVision = false) {
254
277
  // Estimate token count (use byte length / 4 for better accuracy with non-ASCII)
255
278
  const byteLen = Buffer.byteLength(prompt, 'utf-8');
256
279
  const tokenCount = Math.ceil(byteLen / 4);
@@ -260,11 +283,21 @@ function classicRouteRequest(prompt, profile) {
260
283
  // 2026-05-03 — see comment on RoutingProfile above). 'free' is handled
261
284
  // earlier by the caller path; if it ever reaches here, fall through to
262
285
  // AUTO_TIERS rather than crashing.
263
- const tierConfigs = AUTO_TIERS;
264
- const model = tierConfigs[tier].primary;
286
+ let model;
287
+ let finalTier = tier;
288
+ const finalSignals = [...signals];
289
+ if (needsVision) {
290
+ const v = pickVisionTierModel(tier);
291
+ model = v.model;
292
+ finalTier = v.tier;
293
+ finalSignals.push(v.signal);
294
+ }
295
+ else {
296
+ model = AUTO_TIERS[tier].primary;
297
+ }
265
298
  const savings = computeSavings(model);
266
299
  const category = detectCategory(prompt, loadLearnedWeights()?.category_keywords).category;
267
- return { model, tier, confidence, signals, savings, category };
300
+ return { model, tier: finalTier, confidence, signals: finalSignals, savings, category };
268
301
  }
269
302
  // ─── LLM-based classifier ───
270
303
  //
@@ -362,25 +395,35 @@ export async function llmClassifyRequest(prompt) {
362
395
  * Profile-specific tier tables (AUTO / ECO / PREMIUM / FREE) still pick
363
396
  * the concrete model; the classifier only picks the TIER.
364
397
  */
365
- export async function routeRequestAsync(prompt, profile = 'auto', classify = llmClassifyRequest) {
398
+ export async function routeRequestAsync(prompt, profile = 'auto', classify = llmClassifyRequest, needsVision = false) {
366
399
  // Free / short-circuit profiles — no classifier needed.
367
400
  if (profile === 'free')
368
- return routeRequest(prompt, profile);
401
+ return routeRequest(prompt, profile, needsVision);
369
402
  const tier = await classify(prompt).catch(() => null);
370
403
  if (!tier) {
371
404
  // Classifier miss or disabled — fall through to the sync keyword router.
372
- return routeRequest(prompt, profile);
405
+ return routeRequest(prompt, profile, needsVision);
373
406
  }
374
407
  // Build a RoutingResult from the LLM-picked tier using the same tier
375
408
  // tables the keyword path uses. Keeps downstream code path-identical.
376
- const tierConfigs = AUTO_TIERS;
377
- const model = tierConfigs[tier].primary;
409
+ let model;
410
+ let finalTier = tier;
411
+ const signals = ['llm-classified'];
412
+ if (needsVision) {
413
+ const v = pickVisionTierModel(tier);
414
+ model = v.model;
415
+ finalTier = v.tier;
416
+ signals.push(v.signal);
417
+ }
418
+ else {
419
+ model = AUTO_TIERS[tier].primary;
420
+ }
378
421
  const category = detectCategory(prompt, loadLearnedWeights()?.category_keywords).category;
379
422
  return {
380
423
  model,
381
- tier,
424
+ tier: finalTier,
382
425
  confidence: 0.85, // LLM classification — medium-high confidence
383
- signals: ['llm-classified'],
426
+ signals,
384
427
  savings: computeSavings(model),
385
428
  category,
386
429
  };
@@ -394,36 +437,51 @@ export async function routeRequestAsync(prompt, profile = 'auto', classify = llm
394
437
  * Use this when you have a tier already. Use `routeRequestAsync` when you
395
438
  * need the classifier to produce the tier.
396
439
  */
397
- export function resolveTierToModel(tier, profile = 'auto') {
440
+ export function resolveTierToModel(tier, profile = 'auto', needsVision = false) {
398
441
  // Free profile short-circuits — everything routes to a single free model.
442
+ // qwen3-coder-480b is text-only; on a vision turn the free profile can't
443
+ // help us. Caller should detect this and warn the user that Free won't
444
+ // handle images — for now we just return the free pick and let the model
445
+ // fail gracefully. (Open question: should we hard-fall to nvidia/llama-4-
446
+ // maverick here? Skipped until we see a real user hit this path.)
399
447
  if (profile === 'free') {
400
448
  return {
401
449
  model: 'nvidia/qwen3-coder-480b',
402
450
  tier: 'SIMPLE',
403
451
  confidence: 1.0,
404
- signals: ['free-profile'],
452
+ signals: needsVision ? ['free-profile', 'vision-unsupported'] : ['free-profile'],
405
453
  savings: 1.0,
406
454
  };
407
455
  }
408
- const tierConfigs = AUTO_TIERS;
409
- const model = tierConfigs[tier].primary;
456
+ let model;
457
+ let finalTier = tier;
458
+ const signals = ['pre-classified'];
459
+ if (needsVision) {
460
+ const v = pickVisionTierModel(tier);
461
+ model = v.model;
462
+ finalTier = v.tier;
463
+ signals.push(v.signal);
464
+ }
465
+ else {
466
+ model = AUTO_TIERS[tier].primary;
467
+ }
410
468
  return {
411
469
  model,
412
- tier,
470
+ tier: finalTier,
413
471
  confidence: 0.85,
414
- signals: ['pre-classified'],
472
+ signals,
415
473
  savings: computeSavings(model),
416
474
  };
417
475
  }
418
476
  // ─── Main Router ───
419
- export function routeRequest(prompt, profile = 'auto') {
477
+ export function routeRequest(prompt, profile = 'auto', needsVision = false) {
420
478
  // Free profile — always use free model
421
479
  if (profile === 'free') {
422
480
  return {
423
481
  model: 'nvidia/qwen3-coder-480b',
424
482
  tier: 'SIMPLE',
425
483
  confidence: 1.0,
426
- signals: ['free-profile'],
484
+ signals: needsVision ? ['free-profile', 'vision-unsupported'] : ['free-profile'],
427
485
  savings: 1.0,
428
486
  };
429
487
  }
@@ -432,7 +490,7 @@ export function routeRequest(prompt, profile = 'auto') {
432
490
  // cheap/weak models on agentic work. Classic AUTO_TIERS defaults are
433
491
  // agent-tuned (Sonnet-tier backbone) and more predictable for users.
434
492
  if (profile === 'auto') {
435
- return classicRouteRequest(prompt, profile);
493
+ return classicRouteRequest(prompt, profile, needsVision);
436
494
  }
437
495
  // ── Learned routing (if weights available) ──
438
496
  const weights = loadLearnedWeights();
@@ -457,6 +515,21 @@ export function routeRequest(prompt, profile = 'auto') {
457
515
  const selected = selectModel(category, profile, adjustedWeights);
458
516
  if (selected) {
459
517
  const tier = mapCategoryToTier(category);
518
+ // Vision-aware substitution: if the Elo-picked model is text-only but
519
+ // the turn needs vision, swap to the tier's first vision-capable model.
520
+ // We deliberately don't blend Elo with vision capability — vision is a
521
+ // hard requirement, not a quality dimension.
522
+ if (needsVision && !isVisionModel(selected.model)) {
523
+ const v = pickVisionTierModel(tier);
524
+ return {
525
+ model: v.model,
526
+ tier: v.tier,
527
+ confidence,
528
+ signals: [category, v.signal],
529
+ savings: computeSavings(v.model),
530
+ category,
531
+ };
532
+ }
460
533
  const savings = computeSavings(selected.model);
461
534
  return {
462
535
  model: selected.model,
@@ -470,7 +543,7 @@ export function routeRequest(prompt, profile = 'auto') {
470
543
  // Fall through to classic if selectModel returns null (no candidates for category)
471
544
  }
472
545
  // ── Classic routing (keyword-based fallback) ──
473
- return classicRouteRequest(prompt, profile);
546
+ return classicRouteRequest(prompt, profile, needsVision);
474
547
  }
475
548
  function computeSavings(model) {
476
549
  const opusCostPer1K = (OPUS_PRICING.input + OPUS_PRICING.output) / 2 / 1000;
@@ -0,0 +1,51 @@
1
+ /**
2
+ * Vision capability + image-attachment detection.
3
+ *
4
+ * Two jobs:
5
+ * 1. isVisionModel(id) — does this gateway model accept image input?
6
+ * 2. messageNeedsVision(text) — does this user message reference an image?
7
+ *
8
+ * Source of truth: a hand-curated allowlist below. The gateway exposes a
9
+ * 'vision' category on /v1/models, but resolving it at routing time would
10
+ * make routeRequest async and gate sync proxy paths on a network call. The
11
+ * allowlist is small (~18 entries) and changes only when models do, which
12
+ * already touches the router + pricing tables — updating one more file is
13
+ * the right tradeoff vs. async fan-out across every routing callsite.
14
+ *
15
+ * Background: prior to this module, Auto routing could pick a text-only model
16
+ * (e.g. deepseek-v4-pro) on an image-bearing turn. The Read tool would still
17
+ * inline image bytes, the gateway would tokenize the base64 as text, and the
18
+ * model — having no vision pathway — would hallucinate based on the
19
+ * `Image file: <path>` description string. Expensive AND wrong.
20
+ */
21
+ /** Does this concrete gateway model accept image input? */
22
+ export declare function isVisionModel(modelId: string | undefined | null): boolean;
23
+ /**
24
+ * Pick a vision-capable replacement closest to the user's chosen model.
25
+ * Prefers same provider family (so the user's intent — "I want Claude" vs
26
+ * "I want Gemini" — survives the swap), then falls back to a sensible
27
+ * vision default (Sonnet 4.6 — agent-tuned, mid-tier price).
28
+ */
29
+ export declare function pickVisionSibling(modelId: string): string;
30
+ /**
31
+ * Does this user-typed message reference an image file? Used by the router
32
+ * to bump Auto mode to a vision-capable tier, and by the manual-mode guard
33
+ * to swap a text-only model for one turn.
34
+ *
35
+ * Detection is intentionally a regex over file extensions rather than a
36
+ * filesystem stat — the user may type a path that doesn't yet exist
37
+ * (about to wget it) or a glob; what we care about is "does the model need
38
+ * eyes for this turn?" The false-positive risk is benign (we route to a
39
+ * slightly stronger model than strictly needed).
40
+ */
41
+ export declare function messageNeedsVision(text: string | undefined | null): boolean;
42
+ /**
43
+ * Messages-array variant: scans OpenAI- and Anthropic-format content blocks
44
+ * for explicit image parts (image / image_url / input_image) and for image
45
+ * paths embedded in text parts. Used by the proxy router which receives a
46
+ * fully-formed messages[] payload, not a single string.
47
+ */
48
+ export declare function messagesNeedVision(messages: Array<{
49
+ role?: string;
50
+ content?: unknown;
51
+ }> | undefined | null): boolean;
@@ -0,0 +1,127 @@
1
+ /**
2
+ * Vision capability + image-attachment detection.
3
+ *
4
+ * Two jobs:
5
+ * 1. isVisionModel(id) — does this gateway model accept image input?
6
+ * 2. messageNeedsVision(text) — does this user message reference an image?
7
+ *
8
+ * Source of truth: a hand-curated allowlist below. The gateway exposes a
9
+ * 'vision' category on /v1/models, but resolving it at routing time would
10
+ * make routeRequest async and gate sync proxy paths on a network call. The
11
+ * allowlist is small (~18 entries) and changes only when models do, which
12
+ * already touches the router + pricing tables — updating one more file is
13
+ * the right tradeoff vs. async fan-out across every routing callsite.
14
+ *
15
+ * Background: prior to this module, Auto routing could pick a text-only model
16
+ * (e.g. deepseek-v4-pro) on an image-bearing turn. The Read tool would still
17
+ * inline image bytes, the gateway would tokenize the base64 as text, and the
18
+ * model — having no vision pathway — would hallucinate based on the
19
+ * `Image file: <path>` description string. Expensive AND wrong.
20
+ */
21
+ const VISION_MODELS = new Set([
22
+ // Anthropic — native vision across the line
23
+ 'anthropic/claude-opus-4.7',
24
+ 'anthropic/claude-opus-4.6',
25
+ 'anthropic/claude-sonnet-4.6',
26
+ 'anthropic/claude-haiku-4.5-20251001',
27
+ // OpenAI — multimodal flagships + o3 (Codex 5.3 is text-only, excluded)
28
+ 'openai/gpt-5.5',
29
+ 'openai/gpt-5.4',
30
+ 'openai/gpt-5.4-pro',
31
+ 'openai/gpt-5.2',
32
+ 'openai/gpt-5.2-pro',
33
+ 'openai/gpt-5-mini',
34
+ 'openai/gpt-4.1',
35
+ 'openai/o3',
36
+ // Google — vision baked into every Gemini SKU we surface
37
+ 'google/gemini-3.1-pro',
38
+ 'google/gemini-2.5-pro',
39
+ 'google/gemini-2.5-flash',
40
+ // xAI — only Grok 4 base supports vision; grok-4-1-fast-reasoning is text-only
41
+ 'xai/grok-4-0709',
42
+ 'xai/grok-3',
43
+ // Moonshot — K2.6 added vision + reasoning when it replaced K2.5
44
+ 'moonshot/kimi-k2.6',
45
+ // NVIDIA inference — Llama 4 Maverick is multimodal; deepseek/qwen-coder are not
46
+ 'nvidia/llama-4-maverick',
47
+ ]);
48
+ /** Does this concrete gateway model accept image input? */
49
+ export function isVisionModel(modelId) {
50
+ if (!modelId)
51
+ return false;
52
+ return VISION_MODELS.has(modelId);
53
+ }
54
+ /** Lower-cased copy used for prefix family matching in pickVisionSibling. */
55
+ const VISION_MODELS_LIST = Array.from(VISION_MODELS);
56
+ /**
57
+ * Pick a vision-capable replacement closest to the user's chosen model.
58
+ * Prefers same provider family (so the user's intent — "I want Claude" vs
59
+ * "I want Gemini" — survives the swap), then falls back to a sensible
60
+ * vision default (Sonnet 4.6 — agent-tuned, mid-tier price).
61
+ */
62
+ export function pickVisionSibling(modelId) {
63
+ const family = modelId.split('/')[0]?.toLowerCase();
64
+ if (family) {
65
+ const sibling = VISION_MODELS_LIST.find(m => m.startsWith(`${family}/`));
66
+ if (sibling)
67
+ return sibling;
68
+ }
69
+ return 'anthropic/claude-sonnet-4.6';
70
+ }
71
+ // Image file extensions Franklin's Read tool inlines as vision content. Keep
72
+ // this in sync with IMAGE_MEDIA_TYPES in src/tools/read.ts — if Read learns a
73
+ // new format (e.g. .avif), this regex needs to learn it too or routing will
74
+ // silently miss it.
75
+ //
76
+ // We match the basename only ("foo.png"), preceded by any path separator or
77
+ // punctuation. Trying to match full path prefixes ("./", "/", "~/", "C:\")
78
+ // in one regex produced false negatives on Windows-style paths because of
79
+ // the `:` and `\` separators. The basename anchor is enough — a bare
80
+ // `foo.png` reference is what the Read tool actually needs to inline bytes.
81
+ const IMAGE_PATH_RE = /(?:^|[\s"'`(<\[\\/])[\w@%+-]+\.(?:png|jpe?g|gif|webp)(?=$|[\s"'`)>\],.?!:;])/i;
82
+ /**
83
+ * Does this user-typed message reference an image file? Used by the router
84
+ * to bump Auto mode to a vision-capable tier, and by the manual-mode guard
85
+ * to swap a text-only model for one turn.
86
+ *
87
+ * Detection is intentionally a regex over file extensions rather than a
88
+ * filesystem stat — the user may type a path that doesn't yet exist
89
+ * (about to wget it) or a glob; what we care about is "does the model need
90
+ * eyes for this turn?" The false-positive risk is benign (we route to a
91
+ * slightly stronger model than strictly needed).
92
+ */
93
+ export function messageNeedsVision(text) {
94
+ if (!text)
95
+ return false;
96
+ return IMAGE_PATH_RE.test(text);
97
+ }
98
+ /**
99
+ * Messages-array variant: scans OpenAI- and Anthropic-format content blocks
100
+ * for explicit image parts (image / image_url / input_image) and for image
101
+ * paths embedded in text parts. Used by the proxy router which receives a
102
+ * fully-formed messages[] payload, not a single string.
103
+ */
104
+ export function messagesNeedVision(messages) {
105
+ if (!messages || messages.length === 0)
106
+ return false;
107
+ for (const msg of messages) {
108
+ if (msg.role && msg.role !== 'user')
109
+ continue;
110
+ const content = msg.content;
111
+ if (typeof content === 'string') {
112
+ if (messageNeedsVision(content))
113
+ return true;
114
+ continue;
115
+ }
116
+ if (!Array.isArray(content))
117
+ continue;
118
+ for (const part of content) {
119
+ const t = part?.type;
120
+ if (t === 'image' || t === 'image_url' || t === 'input_image')
121
+ return true;
122
+ if (t === 'text' && messageNeedsVision(part.text))
123
+ return true;
124
+ }
125
+ }
126
+ return false;
127
+ }
@@ -1,7 +1,27 @@
1
1
  /**
2
2
  * Structured failure logging for self-evolution analysis.
3
3
  * Append-only JSONL at ~/.blockrun/failures.jsonl (capped 500 records).
4
+ *
5
+ * 2026-05-11: Adopted a Cursor-style tool-failure taxonomy on the
6
+ * `category` field. Lets us:
7
+ * 1. Tell at a glance whether a spike of failures is the model's
8
+ * fault (InvalidArguments), the environment's fault
9
+ * (UnexpectedEnvironment), an upstream's fault (ProviderError),
10
+ * a user action (UserAborted), or a slow path (Timeout).
11
+ * 2. Build per-(tool, category) baselines for anomaly detection —
12
+ * see `getToolAnomalies()` below.
13
+ *
14
+ * The existing single-line errorMessage column is preserved so older
15
+ * records still parse. classifyToolFailure() auto-classifies records
16
+ * without a category field on read, so historical entries flow into
17
+ * the same dashboards without a migration.
4
18
  */
19
+ /**
20
+ * Coarse classification of a tool failure. Mirrors Cursor's published
21
+ * "Tool reliability" taxonomy so error dashboards translate cleanly
22
+ * across the industry, but tuned for Franklin's tool surface.
23
+ */
24
+ export type ToolFailureCategory = 'InvalidArguments' | 'UnexpectedEnvironment' | 'ProviderError' | 'UserAborted' | 'Timeout' | 'Unknown';
5
25
  export interface FailureRecord {
6
26
  timestamp: number;
7
27
  model: string;
@@ -9,12 +29,66 @@ export interface FailureRecord {
9
29
  toolName?: string;
10
30
  errorMessage: string;
11
31
  recoveryAction?: string;
32
+ /**
33
+ * Coarse classification of the failure. Set by recordFailure() when
34
+ * a record is written, or auto-filled by loadFailures() for older
35
+ * records that pre-date this field.
36
+ */
37
+ category?: ToolFailureCategory;
12
38
  }
39
/**
 * Classify a tool failure by matching the error message + tool name
 * against known patterns. Layered top-to-bottom — first match wins.
 * `Unknown` is the catch-all; if you see one in production, the
 * classifier needs a new branch (file a follow-up).
 */
export declare function classifyToolFailure(errorMessage: string, toolName?: string): ToolFailureCategory;
/**
 * Append one failure record to the JSONL log (best-effort). The
 * implementation auto-fills `category` via classifyToolFailure() when
 * the caller did not set one.
 */
export declare function recordFailure(record: FailureRecord): void;
/**
 * Load up to `limit` most-recent failure records (default 100).
 * Records written before the `category` field existed are classified
 * on read; the on-disk file is not rewritten.
 */
export declare function loadFailures(limit?: number): FailureRecord[];
/**
 * Aggregate the persisted failures: counts keyed by tool name, by
 * failure type, and by category, plus the ten most recent records.
 */
export declare function getFailureStats(): {
    byTool: Map<string, number>;
    byType: Map<string, number>;
    byCategory: Map<ToolFailureCategory, number>;
    total: number;
    recentFailures: FailureRecord[];
};
/** One anomalous (tool, category) bucket surfaced by getToolAnomalies(). */
export interface AnomalyReport {
    /** Tool the failures belong to ('<no-tool>' when unattributed). */
    toolName: string;
    /** Failure category of this bucket. */
    category: ToolFailureCategory;
    /** Failures in this bucket within the recent window. */
    recentCount: number;
    /** Failures in this bucket in the baseline window (excluding the recent window). */
    baselineCount: number;
    baselineWindowMs: number;
    recentWindowMs: number;
    /**
     * Multiplier of recent-rate vs baseline-rate. Infinity when the
     * baseline is zero (i.e. a new failure type appeared). 1.0 = same
     * rate as baseline.
     */
    spikeRatio: number;
    /** Most recent error message in this bucket — useful for triage. */
    sampleMessage: string;
}
/** Tuning knobs for getToolAnomalies(). */
export interface AnomalyOptions {
    /** Recent window in ms. Default 24h. */
    recentWindowMs?: number;
    /** Baseline window in ms (counted from now, includes the recent window). Default 30d. */
    baselineWindowMs?: number;
    /** Minimum recent count to consider — filters out single-flake noise. Default 3. */
    minRecent?: number;
    /** Minimum spike ratio to surface. Default 3.0. */
    minSpikeRatio?: number;
}
/**
 * Compute (tool, category) anomalies vs a rolling baseline.
 *
 * Returns the buckets where the recent failure rate is dramatically
 * higher than baseline — sorted by spike severity. Skips buckets where
 * `recentCount` is below `minRecent` to avoid surfacing every flaky
 * one-off.
 *
 * A bucket with `baselineCount=0` and `recentCount >= minRecent` is
 * always surfaced (spikeRatio = Infinity) — these are brand-new failure
 * modes that the harness has never seen before, and they're the most
 * important kind to investigate.
 */
export declare function getToolAnomalies(opts?: AnomalyOptions): AnomalyReport[];
@@ -1,16 +1,101 @@
1
1
  /**
2
2
  * Structured failure logging for self-evolution analysis.
3
3
  * Append-only JSONL at ~/.blockrun/failures.jsonl (capped 500 records).
4
+ *
5
+ * 2026-05-11: Adopted a Cursor-style tool-failure taxonomy on the
6
+ * `category` field. Lets us:
7
+ * 1. Tell at a glance whether a spike of failures is the model's
8
+ * fault (InvalidArguments), the environment's fault
9
+ * (UnexpectedEnvironment), an upstream's fault (ProviderError),
10
+ * a user action (UserAborted), or a slow path (Timeout).
11
+ * 2. Build per-(tool, category) baselines for anomaly detection —
12
+ * see `getToolAnomalies()` below.
13
+ *
14
+ * The existing single-line errorMessage column is preserved so older
15
+ * records still parse. classifyToolFailure() auto-classifies records
16
+ * without a category field on read, so historical entries flow into
17
+ * the same dashboards without a migration.
4
18
  */
5
19
  import fs from 'node:fs';
6
20
  import path from 'node:path';
7
21
  import { BLOCKRUN_DIR } from '../config.js';
8
- const FAILURES_FILE = path.join(BLOCKRUN_DIR, 'failures.jsonl');
22
/**
 * Resolve the failures-file path lazily, at each call, rather than once
 * at module load. This lets tests sandbox persistence via the
 * FRANKLIN_HOME environment variable (an established convention — see
 * src/tasks/paths.ts) while production keeps the default
 * ~/.blockrun/failures.jsonl path unchanged.
 *
 * @returns {string} absolute path to the failures JSONL file
 */
function failuresFile() {
    const sandboxHome = process.env.FRANKLIN_HOME;
    if (sandboxHome) {
        return path.join(sandboxHome, 'failures.jsonl');
    }
    return path.join(BLOCKRUN_DIR, 'failures.jsonl');
}
34
/**
 * Classify a tool failure by matching the error message + tool name
 * against known patterns. Layered top-to-bottom — first match wins.
 * `Unknown` is the catch-all; if you see one in production, the
 * classifier needs a new branch (file a follow-up).
 *
 * @param {string} errorMessage - Raw error text; null/undefined tolerated.
 * @param {string} [toolName] - Tool that failed, enables tool-specific tells.
 * @returns {string} one of the ToolFailureCategory values
 */
export function classifyToolFailure(errorMessage, toolName) {
    const m = (errorMessage || '').toLowerCase();
    // UserAborted — user-initiated cancel or harness abort signal.
    // Check first because abort messages often *contain* the word
    // "timeout" or "error" and would otherwise misclassify.
    if (/this operation was aborted|user aborted|user cancel|user_cancel|sigint|sigterm|operation cancell?ed|abortcontroller/.test(m)) {
        return 'UserAborted';
    }
    // Timeout — distinct from ProviderError because the *call* succeeded
    // (we sent the request) but exceeded our budget. Tool-level retries
    // shouldn't retry these without escalating the budget.
    if (/timed out after|timeout|deadline exceeded|etimedout|operation timed out|exceeded.*time/.test(m)) {
        return 'Timeout';
    }
    // UnexpectedEnvironment — the world isn't as the model assumed.
    // ENOENT / wallet missing / chain mismatch / cwd not a repo / etc.
    if (/enoent|no such file|cannot find|does not exist|not a (git|directory)|wallet not (configured|found)|insufficient.*(balance|funds|lamports)|not logged in|chain mismatch|invalid wallet|command not found/.test(m)) {
        return 'UnexpectedEnvironment';
    }
    // ProviderError — an upstream service we don't control returned bad.
    // Rate limits, 5xx, gateway 4xx, network failures, fetch failures.
    // NOTE: the former trailing `gateway timeout` alternative was removed
    // as dead code — any message containing "gateway timeout" matches the
    // Timeout branch above first (and `gateway` alone matches it here),
    // so it could never be the deciding pattern.
    if (/rate.?limit|429|5\d\d|gateway|upstream|provider|fetch failed|econn(refused|reset)|enotfound|socket hang up|network error|http \d{3}|api error/.test(m)) {
        return 'ProviderError';
    }
    // InvalidArguments — the model called the tool wrong. Covers schema
    // rejects, missing/extra fields, type mismatches, and the very common
    // "cannot read properties of undefined" pattern that means we got an
    // object shape we didn't expect from the model's input.
    if (/invalid (argument|input|parameter|value|schema)|missing (required|argument|field|parameter)|expected.*(but|got|received)|cannot read (properties|property) of (undefined|null)|typeerror|schema (rejected|mismatch|validation)|bad request|400|invalid.*format|unrecognized/.test(m)) {
        return 'InvalidArguments';
    }
    // Tool-specific tells — only consulted when no generic pattern hit.
    if (toolName) {
        const t = toolName.toLowerCase();
        if (t === 'searchx' || t === 'posttox') {
            if (/login wall|sign in|create account/.test(m))
                return 'UnexpectedEnvironment';
        }
        if (t === 'bash') {
            if (/permission denied|eacces/.test(m))
                return 'UnexpectedEnvironment';
        }
    }
    return 'Unknown';
}
9
85
  const MAX_RECORDS = 500;
10
86
  export function recordFailure(record) {
87
+ if (process.env.FRANKLIN_NO_AUDIT === '1' || process.env.FRANKLIN_NO_PERSIST === '1')
88
+ return;
11
89
  try {
12
- fs.mkdirSync(path.dirname(FAILURES_FILE), { recursive: true });
13
- fs.appendFileSync(FAILURES_FILE, JSON.stringify(record) + '\n');
90
+ // Auto-classify on write so callsites don't need to know the
91
+ // taxonomy. Callers can still override by passing `category`
92
+ // explicitly (e.g. when the abort came from a known SIGINT handler).
93
+ const enriched = {
94
+ ...record,
95
+ category: record.category ?? classifyToolFailure(record.errorMessage, record.toolName),
96
+ };
97
+ fs.mkdirSync(path.dirname(failuresFile()), { recursive: true });
98
+ fs.appendFileSync(failuresFile(), JSON.stringify(enriched) + '\n');
14
99
  // Trim to MAX_RECORDS (only check periodically to avoid constant reads)
15
100
  if (Math.random() < 0.1) {
16
101
  trimFailures();
@@ -22,12 +107,12 @@ export function recordFailure(record) {
22
107
  }
23
108
  function trimFailures() {
24
109
  try {
25
- if (!fs.existsSync(FAILURES_FILE))
110
+ if (!fs.existsSync(failuresFile()))
26
111
  return;
27
- const lines = fs.readFileSync(FAILURES_FILE, 'utf-8').trim().split('\n');
112
+ const lines = fs.readFileSync(failuresFile(), 'utf-8').trim().split('\n');
28
113
  if (lines.length > MAX_RECORDS) {
29
114
  const trimmed = lines.slice(-MAX_RECORDS).join('\n') + '\n';
30
- fs.writeFileSync(FAILURES_FILE, trimmed);
115
+ fs.writeFileSync(failuresFile(), trimmed);
31
116
  }
32
117
  }
33
118
  catch {
@@ -36,10 +121,19 @@ function trimFailures() {
36
121
  }
/**
 * Load up to `limit` most-recent failure records from the JSONL log.
 *
 * Records written before the `category` field existed are auto-classified
 * on read via classifyToolFailure(); the on-disk file is not rewritten —
 * read-side enrichment keeps the file append-only and idempotent.
 *
 * Fix: previously a single corrupt JSONL line (truncated write, manual
 * edit) made JSON.parse throw inside the map, tripping the outer catch
 * and returning [] — silently discarding ALL history. Now each line is
 * parsed individually and bad lines are skipped.
 *
 * @param {number} [limit=100] - Maximum number of records to return.
 * @returns {object[]} parsed FailureRecord objects, oldest first
 */
export function loadFailures(limit = 100) {
    try {
        if (!fs.existsSync(failuresFile()))
            return [];
        const lines = fs.readFileSync(failuresFile(), 'utf-8').trim().split('\n').filter(Boolean);
        const records = [];
        for (const line of lines.slice(-limit)) {
            let parsed;
            try {
                parsed = JSON.parse(line);
            }
            catch {
                // Skip just the corrupt line instead of nuking the whole read.
                continue;
            }
            // Auto-classify historical records that pre-date `category`.
            if (!parsed.category) {
                parsed.category = classifyToolFailure(parsed.errorMessage, parsed.toolName);
            }
            records.push(parsed);
        }
        return records;
    }
    catch {
        // Best-effort: unreadable file behaves like an empty log.
        return [];
    }
}
/**
 * Aggregate the persisted failure log into per-tool, per-failure-type,
 * and per-category counts, plus the ten most recent records.
 *
 * @returns {{byTool: Map, byType: Map, byCategory: Map, total: number, recentFailures: object[]}}
 */
export function getFailureStats() {
    const failures = loadFailures(500);
    const byTool = new Map();
    const byType = new Map();
    const byCategory = new Map();
    // Increment a counter map entry, treating missing keys as zero.
    const bump = (counts, key) => counts.set(key, (counts.get(key) ?? 0) + 1);
    for (const failure of failures) {
        if (failure.toolName) {
            bump(byTool, failure.toolName);
        }
        bump(byType, failure.failureType);
        if (failure.category) {
            bump(byCategory, failure.category);
        }
    }
    return {
        byTool,
        byType,
        byCategory,
        total: failures.length,
        recentFailures: failures.slice(-10),
    };
}
/**
 * Compute (tool, category) anomalies vs a rolling baseline.
 *
 * Returns the buckets where the recent failure rate is dramatically
 * higher than baseline — sorted by spike severity. Skips buckets where
 * `recentCount` is below `minRecent` to avoid surfacing every flaky
 * one-off.
 *
 * A bucket with `baselineCount=0` and `recentCount >= minRecent` is
 * always surfaced (spikeRatio = Infinity) — these are brand-new failure
 * modes that the harness has never seen before, and they're the most
 * important kind to investigate.
 *
 * @param {object} [opts] - AnomalyOptions tuning knobs.
 * @returns {object[]} AnomalyReport entries, most severe first
 */
export function getToolAnomalies(opts = {}) {
    const DAY_MS = 24 * 60 * 60 * 1000;
    const recentWindowMs = opts.recentWindowMs ?? DAY_MS;
    const baselineWindowMs = opts.baselineWindowMs ?? 30 * DAY_MS;
    const minRecent = opts.minRecent ?? 3;
    const minSpikeRatio = opts.minSpikeRatio ?? 3.0;
    const now = Date.now();
    const recentCutoff = now - recentWindowMs;
    const baselineCutoff = now - baselineWindowMs;
    // One accumulator per (tool, category) pair.
    const buckets = new Map();
    const bucketFor = (tool, category) => {
        const id = `${tool}::${category}`;
        let bucket = buckets.get(id);
        if (!bucket) {
            bucket = { tool, category, recent: 0, baseline: 0, sample: '' };
            buckets.set(id, bucket);
        }
        return bucket;
    };
    for (const record of loadFailures(500)) {
        // Older than the baseline horizon — ignore entirely.
        if (record.timestamp < baselineCutoff) {
            continue;
        }
        const bucket = bucketFor(record.toolName ?? '<no-tool>', record.category ?? 'Unknown');
        if (record.timestamp >= recentCutoff) {
            bucket.recent += 1;
            bucket.sample = record.errorMessage; // last seen wins; useful for triage
        }
        else {
            bucket.baseline += 1;
        }
    }
    const reports = [];
    for (const bucket of buckets.values()) {
        // Only buckets with recent activity qualify, and only above the
        // minRecent noise floor.
        if (bucket.recent === 0 || bucket.recent < minRecent) {
            continue;
        }
        // Normalize rates by window length so spikes are comparable across
        // different (recent, baseline) sizes. The baseline window excludes
        // the recent window by construction (we partitioned above).
        const recentRate = bucket.recent / recentWindowMs;
        let spikeRatio;
        if (bucket.baseline > 0) {
            const baselineRate = bucket.baseline / Math.max(1, baselineWindowMs - recentWindowMs);
            spikeRatio = recentRate / baselineRate;
        }
        else {
            // Brand-new failure mode — always surfaced.
            spikeRatio = Number.POSITIVE_INFINITY;
        }
        if (spikeRatio < minSpikeRatio) {
            continue;
        }
        reports.push({
            toolName: bucket.tool,
            category: bucket.category,
            recentCount: bucket.recent,
            baselineCount: bucket.baseline,
            baselineWindowMs,
            recentWindowMs,
            spikeRatio,
            sampleMessage: bucket.sample,
        });
    }
    // Brand-new failures (spikeRatio = Infinity) first, then by ratio
    // descending; ties broken by recent volume.
    reports.sort((a, b) => {
        if (a.spikeRatio !== b.spikeRatio) {
            return b.spikeRatio - a.spikeRatio;
        }
        return b.recentCount - a.recentCount;
    });
    return reports;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@blockrun/franklin",
3
- "version": "3.15.90",
3
+ "version": "3.15.92",
4
4
  "description": "Franklin — The AI agent with a wallet. Spends USDC autonomously to get real work done. Pay per action, no subscriptions.",
5
5
  "type": "module",
6
6
  "exports": {