npm - @blockrun/franklin - Versions diffs - 3.15.91 → 3.15.93 - Mend

@blockrun/franklin 3.15.91 → 3.15.93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/dist/commands/doctor.d.ts CHANGED

@@ -12,4 +12,5 @@
  */
 export declare function doctorCommand(opts?: {
     json?: boolean;
+    anomaly?: boolean;
 }): Promise<void>;

package/dist/commands/doctor.js CHANGED

@@ -17,9 +17,17 @@ import os from 'node:os';
 import { setupAgentWallet, setupAgentSolanaWallet, } from '@blockrun/llm';
 import { loadChain, API_URLS, VERSION, BLOCKRUN_DIR } from '../config.js';
 import { isTelemetryEnabled, readAllRecords } from '../telemetry/store.js';
-import { getAvailableUpdate, kickoffVersionCheck } from '../version-check.js';
+import { getAvailableUpdateFresh, kickoffVersionCheck } from '../version-check.js';
 async function runChecks() {
     const out = [];
+    // Kick off the authoritative version fetch FIRST, in parallel with the
+    // other checks. Doctor is a diagnostic — the user just asked "am I
+    // healthy?" — so a 24h-stale cache is the wrong answer. The fetch is
+    // bounded by the same 2s timeout the background check uses, and falls
+    // back to the cached value on failure. By the time we render the
+    // Franklin-version check below, the fetch has typically settled in
+    // <300ms (npm is fast) and we have a current answer.
+    const freshUpdatePromise = getAvailableUpdateFresh();
     // ── 1. Runtime ────────────────────────────────────────────────────
     const nodeVer = process.versions.node;
     const nodeMajor = parseInt(nodeVer.split('.')[0], 10);
@@ -30,10 +38,10 @@ async function runChecks() {
         remedy: nodeMajor >= 20 ? undefined : 'Upgrade Node.js: https://nodejs.org',
     });
     // ── 2. Franklin version ───────────────────────────────────────────
-    // Kick the daily cache refresh so subsequent doctor runs carry fresh
-    // data. Current run uses whatever's already cached.
+    // Keep kickoffVersionCheck() so non-doctor entry points (banner etc.)
+    // still warm the cache through their normal daily refresh path.
     kickoffVersionCheck();
-    const update = getAvailableUpdate();
+    const update = await freshUpdatePromise;
     out.push({
         name: 'Franklin',
         status: update ? 'warn' : 'ok',
@@ -247,6 +255,10 @@ function printHuman(checks) {
     console.log();
 }
 export async function doctorCommand(opts = {}) {
+    if (opts.anomaly) {
+        await anomalyReportCommand(opts);
+        return;
+    }
     const checks = await runChecks();
     if (opts.json) {
         const fails = checks.filter(c => c.status === 'fail').length;
@@ -257,3 +269,35 @@ export async function doctorCommand(opts = {}) {
     const fails = checks.filter(c => c.status === 'fail').length;
     process.exit(fails > 0 ? 1 : 0);
 }
+/**
+ * `franklin doctor --anomaly` — print failure spikes vs 30-day baseline.
+ * Exits non-zero when at least one anomaly is surfaced, so it can be
+ * wired into a cron / CI without parsing stdout.
+ */
+async function anomalyReportCommand(opts) {
+    const { getToolAnomalies } = await import('../stats/failures.js');
+    const reports = getToolAnomalies();
+    if (opts.json) {
+        process.stdout.write(JSON.stringify({ anomalies: reports }, null, 2) + '\n');
+        process.exit(reports.length > 0 ? 1 : 0);
+    }
+    console.log(chalk.bold('\n  franklin doctor --anomaly'));
+    console.log(chalk.dim('  Looking for (tool, category) failure spikes in the last 24h vs the 30-day baseline.\n'));
+    if (reports.length === 0) {
+        console.log(chalk.green('  No anomalies. Tool failure rates match the 30-day baseline.\n'));
+        process.exit(0);
+    }
+    for (const a of reports) {
+        const newType = !Number.isFinite(a.spikeRatio);
+        const header = `  ${chalk.red('•')} ${chalk.bold(a.toolName)} / ${chalk.yellow(a.category)}`;
+        const ratio = newType
+            ? chalk.red('NEW failure type (no baseline)')
+            : chalk.red(`${a.spikeRatio.toFixed(1)}× baseline`);
+        const counts = chalk.dim(`recent=${a.recentCount}, baseline=${a.baselineCount}`);
+        console.log(`${header}  ${ratio}  ${counts}`);
+        const trimmed = a.sampleMessage.length > 140 ? a.sampleMessage.slice(0, 140) + '…' : a.sampleMessage;
+        console.log(chalk.dim(`    sample: ${trimmed}`));
+    }
+    console.log(chalk.dim(`\n  ${reports.length} anomalies. Investigate before they snowball.\n`));
+    process.exit(1);
+}

package/dist/index.js CHANGED

@@ -185,6 +185,7 @@ program
     .command('doctor')
     .description('One-command health check (node, wallet, chain, gateway, MCP, telemetry)')
     .option('--json', 'Machine-readable output')
+    .option('--anomaly', 'Surface (tool, category) failure spikes vs 30-day baseline')
     .action(async (opts) => {
     const { doctorCommand } = await import('./commands/doctor.js');
     await doctorCommand(opts);

package/dist/stats/failures.d.ts CHANGED

@@ -1,7 +1,27 @@
 /**
  * Structured failure logging for self-evolution analysis.
  * Append-only JSONL at ~/.blockrun/failures.jsonl (capped 500 records).
+ *
+ * 2026-05-11: Adopted a Cursor-style tool-failure taxonomy on the
+ * `category` field. Lets us:
+ *   1. Tell at a glance whether a spike of failures is the model's
+ *      fault (InvalidArguments), the environment's fault
+ *      (UnexpectedEnvironment), an upstream's fault (ProviderError),
+ *      a user action (UserAborted), or a slow path (Timeout).
+ *   2. Build per-(tool, category) baselines for anomaly detection —
+ *      see `getToolAnomalies()` below.
+ *
+ * The existing single-line errorMessage column is preserved so older
+ * records still parse. classifyToolFailure() auto-classifies records
+ * without a category field on read, so historical entries flow into
+ * the same dashboards without a migration.
  */
+/**
+ * Coarse classification of a tool failure. Mirrors Cursor's published
+ * "Tool reliability" taxonomy so error dashboards translate cleanly
+ * across the industry, but tuned for Franklin's tool surface.
+ */
+export type ToolFailureCategory = 'InvalidArguments' | 'UnexpectedEnvironment' | 'ProviderError' | 'UserAborted' | 'Timeout' | 'Unknown';
 export interface FailureRecord {
     timestamp: number;
     model: string;
@@ -9,12 +29,66 @@ export interface FailureRecord {
     toolName?: string;
     errorMessage: string;
     recoveryAction?: string;
+    /**
+     * Coarse classification of the failure. Set by recordFailure() when
+     * a record is written, or auto-filled by loadFailures() for older
+     * records that pre-date this field.
+     */
+    category?: ToolFailureCategory;
 }
+/**
+ * Classify a tool failure by matching the error message + tool name
+ * against known patterns. Layered top-to-bottom — first match wins.
+ * `Unknown` is the catch-all; if you see one in production, the
+ * classifier needs a new branch (file a follow-up).
+ */
+export declare function classifyToolFailure(errorMessage: string, toolName?: string): ToolFailureCategory;
 export declare function recordFailure(record: FailureRecord): void;
 export declare function loadFailures(limit?: number): FailureRecord[];
 export declare function getFailureStats(): {
     byTool: Map<string, number>;
     byType: Map<string, number>;
+    byCategory: Map<ToolFailureCategory, number>;
     total: number;
     recentFailures: FailureRecord[];
 };
+export interface AnomalyReport {
+    toolName: string;
+    category: ToolFailureCategory;
+    recentCount: number;
+    baselineCount: number;
+    baselineWindowMs: number;
+    recentWindowMs: number;
+    /**
+     * Multiplier of recent-rate vs baseline-rate. Infinity when the
+     * baseline is zero (i.e. a new failure type appeared). 1.0 = same
+     * rate as baseline.
+     */
+    spikeRatio: number;
+    /** Most recent error message in this bucket — useful for triage. */
+    sampleMessage: string;
+}
+export interface AnomalyOptions {
+    /** Recent window in ms. Default 24h. */
+    recentWindowMs?: number;
+    /** Baseline window in ms (counted from now, includes the recent window). Default 30d. */
+    baselineWindowMs?: number;
+    /** Minimum recent count to consider — filters out single-flake noise. Default 3. */
+    minRecent?: number;
+    /** Minimum spike ratio to surface. Default 3.0. */
+    minSpikeRatio?: number;
+}
+/**
+ * Compute (tool, category) anomalies vs a rolling baseline.
+ *
+ * Returns the buckets where the recent failure rate is dramatically
+ * higher than baseline — sorted by spike severity. Skips buckets where
+ * `recentCount` is below `minRecent` to avoid surfacing every flaky
+ * one-off.
+ *
+ * A bucket with `baselineCount=0` and `recentCount >= minRecent` is
+ * always surfaced (spikeRatio = Infinity) — these are brand-new failure
+ * modes that the harness has never seen before, and they're the most
+ * important kind to investigate.
+ */
+export declare function getToolAnomalies(opts?: AnomalyOptions): AnomalyReport[];

package/dist/stats/failures.js CHANGED

@@ -1,16 +1,101 @@
 /**
  * Structured failure logging for self-evolution analysis.
  * Append-only JSONL at ~/.blockrun/failures.jsonl (capped 500 records).
+ *
+ * 2026-05-11: Adopted a Cursor-style tool-failure taxonomy on the
+ * `category` field. Lets us:
+ *   1. Tell at a glance whether a spike of failures is the model's
+ *      fault (InvalidArguments), the environment's fault
+ *      (UnexpectedEnvironment), an upstream's fault (ProviderError),
+ *      a user action (UserAborted), or a slow path (Timeout).
+ *   2. Build per-(tool, category) baselines for anomaly detection —
+ *      see `getToolAnomalies()` below.
+ *
+ * The existing single-line errorMessage column is preserved so older
+ * records still parse. classifyToolFailure() auto-classifies records
+ * without a category field on read, so historical entries flow into
+ * the same dashboards without a migration.
  */
 import fs from 'node:fs';
 import path from 'node:path';
 import { BLOCKRUN_DIR } from '../config.js';
-const FAILURES_FILE = path.join(BLOCKRUN_DIR, 'failures.jsonl');
+/**
+ * Resolve the failures-file path at call time, not module-load time, so
+ * tests can sandbox via FRANKLIN_HOME (already an established convention
+ * — see src/tasks/paths.ts). Production keeps the default
+ * ~/.blockrun/failures.jsonl path unchanged.
+ */
+function failuresFile() {
+    const home = process.env.FRANKLIN_HOME;
+    return home
+        ? path.join(home, 'failures.jsonl')
+        : path.join(BLOCKRUN_DIR, 'failures.jsonl');
+}
+/**
+ * Classify a tool failure by matching the error message + tool name
+ * against known patterns. Layered top-to-bottom — first match wins.
+ * `Unknown` is the catch-all; if you see one in production, the
+ * classifier needs a new branch (file a follow-up).
+ */
+export function classifyToolFailure(errorMessage, toolName) {
+    const m = (errorMessage || '').toLowerCase();
+    // UserAborted — user-initiated cancel or harness abort signal.
+    // Check first because abort messages often *contain* the word
+    // "timeout" or "error" and would otherwise misclassify.
+    if (/this operation was aborted|user aborted|user cancel|user_cancel|sigint|sigterm|operation cancell?ed|abortcontroller/.test(m)) {
+        return 'UserAborted';
+    }
+    // Timeout — distinct from ProviderError because the *call* succeeded
+    // (we sent the request) but exceeded our budget. Tool-level retries
+    // shouldn't retry these without escalating the budget.
+    if (/timed out after|timeout|deadline exceeded|etimedout|operation timed out|exceeded.*time/.test(m)) {
+        return 'Timeout';
+    }
+    // UnexpectedEnvironment — the world isn't as the model assumed.
+    // ENOENT / wallet missing / chain mismatch / cwd not a repo / etc.
+    if (/enoent|no such file|cannot find|does not exist|not a (git|directory)|wallet not (configured|found)|insufficient.*(balance|funds|lamports)|not logged in|chain mismatch|invalid wallet|command not found/.test(m)) {
+        return 'UnexpectedEnvironment';
+    }
+    // ProviderError — an upstream service we don't control returned bad.
+    // Rate limits, 5xx, gateway 4xx, network failures, fetch failures.
+    if (/rate.?limit|429|5\d\d|gateway|upstream|provider|fetch failed|econn(refused|reset)|enotfound|socket hang up|network error|http \d{3}|api error|gateway timeout/.test(m)) {
+        return 'ProviderError';
+    }
+    // InvalidArguments — the model called the tool wrong. Covers schema
+    // rejects, missing/extra fields, type mismatches, and the very common
+    // "cannot read properties of undefined" pattern that means we got an
+    // object shape we didn't expect from the model's input.
+    if (/invalid (argument|input|parameter|value|schema)|missing (required|argument|field|parameter)|expected.*(but|got|received)|cannot read (properties|property) of (undefined|null)|typeerror|schema (rejected|mismatch|validation)|bad request|400|invalid.*format|unrecognized/.test(m)) {
+        return 'InvalidArguments';
+    }
+    // Tool-specific tells.
+    if (toolName) {
+        const t = toolName.toLowerCase();
+        if (t === 'searchx' || t === 'posttox') {
+            if (/login wall|sign in|create account/.test(m))
+                return 'UnexpectedEnvironment';
+        }
+        if (t === 'bash') {
+            if (/permission denied|eacces/.test(m))
+                return 'UnexpectedEnvironment';
+        }
+    }
+    return 'Unknown';
+}
 const MAX_RECORDS = 500;
 export function recordFailure(record) {
+    if (process.env.FRANKLIN_NO_AUDIT === '1' || process.env.FRANKLIN_NO_PERSIST === '1')
+        return;
     try {
-        fs.mkdirSync(path.dirname(FAILURES_FILE), { recursive: true });
-        fs.appendFileSync(FAILURES_FILE, JSON.stringify(record) + '\n');
+        // Auto-classify on write so callsites don't need to know the
+        // taxonomy. Callers can still override by passing `category`
+        // explicitly (e.g. when the abort came from a known SIGINT handler).
+        const enriched = {
+            ...record,
+            category: record.category ?? classifyToolFailure(record.errorMessage, record.toolName),
+        };
+        fs.mkdirSync(path.dirname(failuresFile()), { recursive: true });
+        fs.appendFileSync(failuresFile(), JSON.stringify(enriched) + '\n');
         // Trim to MAX_RECORDS (only check periodically to avoid constant reads)
         if (Math.random() < 0.1) {
             trimFailures();
@@ -22,12 +107,12 @@ export function recordFailure(record) {
 }
 function trimFailures() {
     try {
-        if (!fs.existsSync(FAILURES_FILE))
+        if (!fs.existsSync(failuresFile()))
             return;
-        const lines = fs.readFileSync(FAILURES_FILE, 'utf-8').trim().split('\n');
+        const lines = fs.readFileSync(failuresFile(), 'utf-8').trim().split('\n');
         if (lines.length > MAX_RECORDS) {
             const trimmed = lines.slice(-MAX_RECORDS).join('\n') + '\n';
-            fs.writeFileSync(FAILURES_FILE, trimmed);
+            fs.writeFileSync(failuresFile(), trimmed);
         }
     }
     catch {
@@ -36,10 +121,19 @@ function trimFailures() {
 }
 export function loadFailures(limit = 100) {
     try {
-        if (!fs.existsSync(FAILURES_FILE))
+        if (!fs.existsSync(failuresFile()))
             return [];
-        const lines = fs.readFileSync(FAILURES_FILE, 'utf-8').trim().split('\n').filter(Boolean);
-        return lines.slice(-limit).map(l => JSON.parse(l));
+        const lines = fs.readFileSync(failuresFile(), 'utf-8').trim().split('\n').filter(Boolean);
+        return lines.slice(-limit).map(l => {
+            const parsed = JSON.parse(l);
+            // Auto-classify historical records that pre-date the `category`
+            // field. We don't rewrite the file — read-side enrichment keeps
+            // the on-disk shape append-only and idempotent.
+            if (!parsed.category) {
+                parsed.category = classifyToolFailure(parsed.errorMessage, parsed.toolName);
+            }
+            return parsed;
+        });
     }
     catch {
         return [];
@@ -49,15 +143,97 @@ export function getFailureStats() {
     const records = loadFailures(500);
     const byTool = new Map();
     const byType = new Map();
+    const byCategory = new Map();
     for (const r of records) {
         if (r.toolName)
             byTool.set(r.toolName, (byTool.get(r.toolName) ?? 0) + 1);
         byType.set(r.failureType, (byType.get(r.failureType) ?? 0) + 1);
+        if (r.category)
+            byCategory.set(r.category, (byCategory.get(r.category) ?? 0) + 1);
     }
     return {
         byTool,
         byType,
+        byCategory,
         total: records.length,
         recentFailures: records.slice(-10),
     };
 }
+/**
+ * Compute (tool, category) anomalies vs a rolling baseline.
+ *
+ * Returns the buckets where the recent failure rate is dramatically
+ * higher than baseline — sorted by spike severity. Skips buckets where
+ * `recentCount` is below `minRecent` to avoid surfacing every flaky
+ * one-off.
+ *
+ * A bucket with `baselineCount=0` and `recentCount >= minRecent` is
+ * always surfaced (spikeRatio = Infinity) — these are brand-new failure
+ * modes that the harness has never seen before, and they're the most
+ * important kind to investigate.
+ */
+export function getToolAnomalies(opts = {}) {
+    const recentWindowMs = opts.recentWindowMs ?? 24 * 60 * 60 * 1000;
+    const baselineWindowMs = opts.baselineWindowMs ?? 30 * 24 * 60 * 60 * 1000;
+    const minRecent = opts.minRecent ?? 3;
+    const minSpikeRatio = opts.minSpikeRatio ?? 3.0;
+    const now = Date.now();
+    const recentCutoff = now - recentWindowMs;
+    const baselineCutoff = now - baselineWindowMs;
+    // Bucket key = `${toolName}::${category}`.
+    const recentByBucket = new Map();
+    const baselineByBucket = new Map();
+    for (const r of loadFailures(500)) {
+        if (r.timestamp < baselineCutoff)
+            continue;
+        const tool = r.toolName ?? '<no-tool>';
+        const cat = r.category ?? 'Unknown';
+        const key = `${tool}::${cat}`;
+        if (r.timestamp >= recentCutoff) {
+            const existing = recentByBucket.get(key) ?? { count: 0, sample: r.errorMessage };
+            existing.count += 1;
+            existing.sample = r.errorMessage; // last seen wins; useful for triage
+            recentByBucket.set(key, existing);
+        }
+        else {
+            baselineByBucket.set(key, (baselineByBucket.get(key) ?? 0) + 1);
+        }
+    }
+    const reports = [];
+    for (const [key, { count: recentCount, sample }] of recentByBucket) {
+        if (recentCount < minRecent)
+            continue;
+        const baselineCount = baselineByBucket.get(key) ?? 0;
+        // Normalize rates by window length so spikes are comparable across
+        // different (recent, baseline) sizes. baseline window excludes the
+        // recent window by construction (we partitioned above).
+        const baselineWindowExclRecent = baselineWindowMs - recentWindowMs;
+        const recentRate = recentCount / recentWindowMs;
+        const baselineRate = baselineCount > 0
+            ? baselineCount / Math.max(1, baselineWindowExclRecent)
+            : 0;
+        const spikeRatio = baselineRate > 0
+            ? recentRate / baselineRate
+            : Number.POSITIVE_INFINITY;
+        if (spikeRatio < minSpikeRatio)
+            continue;
+        const [toolName, category] = key.split('::');
+        reports.push({
+            toolName,
+            category,
+            recentCount,
+            baselineCount,
+            baselineWindowMs,
+            recentWindowMs,
+            spikeRatio,
+            sampleMessage: sample,
+        });
+    }
+    // Sort: brand-new failures (spikeRatio = Infinity) first, then by ratio desc.
+    reports.sort((a, b) => {
+        if (a.spikeRatio === b.spikeRatio)
+            return b.recentCount - a.recentCount;
+        return b.spikeRatio - a.spikeRatio;
+    });
+    return reports;
+}

package/dist/version-check.d.ts CHANGED

@@ -37,3 +37,17 @@ export interface UpdateInfo {
  * background check settles — returns null (we don't speculate).
  */
 export declare function getAvailableUpdate(): UpdateInfo | null;
+/**
+ * Authoritative check that forces a fresh fetch (up to FETCH_TIMEOUT_MS).
+ * Use for on-demand diagnostics like `franklin doctor` where the user
+ * explicitly asked "am I up to date?" and a 24h-stale cache is the wrong
+ * answer. Verified 2026-05-11: between two same-day releases (3.15.91 →
+ * 3.15.92), the daily cache made `franklin doctor` show green for a user
+ * who was actually 4 versions behind (3.15.88), because they ran doctor
+ * in the brief gap between npm publish and the next cache refresh.
+ *
+ * Falls back to the cached value if the fetch fails (offline, slow npm,
+ * etc.) — same behavior as the cached check, just refreshed when
+ * possible.
+ */
+export declare function getAvailableUpdateFresh(): Promise<UpdateInfo | null>;

package/dist/version-check.js CHANGED

@@ -132,3 +132,30 @@ export function getAvailableUpdate() {
     }
     return null;
 }
+/**
+ * Authoritative check that forces a fresh fetch (up to FETCH_TIMEOUT_MS).
+ * Use for on-demand diagnostics like `franklin doctor` where the user
+ * explicitly asked "am I up to date?" and a 24h-stale cache is the wrong
+ * answer. Verified 2026-05-11: between two same-day releases (3.15.91 →
+ * 3.15.92), the daily cache made `franklin doctor` show green for a user
+ * who was actually 4 versions behind (3.15.88), because they ran doctor
+ * in the brief gap between npm publish and the next cache refresh.
+ *
+ * Falls back to the cached value if the fetch fails (offline, slow npm,
+ * etc.) — same behavior as the cached check, just refreshed when
+ * possible.
+ */
+export async function getAvailableUpdateFresh() {
+    if (isDisabled())
+        return getAvailableUpdate();
+    const latest = await fetchLatestVersion();
+    if (latest) {
+        writeCache({ latestVersion: latest, checkedAt: Date.now() });
+        if (compareSemver(latest, VERSION) > 0) {
+            return { current: VERSION, latest };
+        }
+        return null;
+    }
+    // Fetch failed — fall back to whatever the cache says.
+    return getAvailableUpdate();
+}

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@blockrun/franklin",
-  "version": "3.15.91",
+  "version": "3.15.93",
   "description": "Franklin — The AI agent with a wallet. Spends USDC autonomously to get real work done. Pay per action, no subscriptions.",
   "type": "module",
   "exports": {