@blockrun/franklin 3.15.91 → 3.15.92

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,4 +12,5 @@
12
12
  */
13
13
  export declare function doctorCommand(opts?: {
14
14
  json?: boolean;
15
+ anomaly?: boolean;
15
16
  }): Promise<void>;
@@ -247,6 +247,10 @@ function printHuman(checks) {
247
247
  console.log();
248
248
  }
249
249
  export async function doctorCommand(opts = {}) {
250
+ if (opts.anomaly) {
251
+ await anomalyReportCommand(opts);
252
+ return;
253
+ }
250
254
  const checks = await runChecks();
251
255
  if (opts.json) {
252
256
  const fails = checks.filter(c => c.status === 'fail').length;
@@ -257,3 +261,35 @@ export async function doctorCommand(opts = {}) {
257
261
  const fails = checks.filter(c => c.status === 'fail').length;
258
262
  process.exit(fails > 0 ? 1 : 0);
259
263
  }
264
/**
 * `franklin doctor --anomaly` — print failure spikes vs 30-day baseline.
 * Exits non-zero when at least one anomaly is surfaced, so it can be
 * wired into a cron / CI without parsing stdout.
 */
async function anomalyReportCommand(opts) {
    const { getToolAnomalies } = await import('../stats/failures.js');
    const anomalies = getToolAnomalies();
    // JSON mode: dump the raw reports and exit — no human formatting.
    if (opts.json) {
        const payload = JSON.stringify({ anomalies }, null, 2);
        process.stdout.write(payload + '\n');
        process.exit(anomalies.length > 0 ? 1 : 0);
    }
    console.log(chalk.bold('\n franklin doctor --anomaly'));
    console.log(chalk.dim(' Looking for (tool, category) failure spikes in the last 24h vs the 30-day baseline.\n'));
    // Clean bill of health — exit 0 so cron wrappers stay quiet.
    if (anomalies.length === 0) {
        console.log(chalk.green(' No anomalies. Tool failure rates match the 30-day baseline.\n'));
        process.exit(0);
    }
    for (const report of anomalies) {
        // Infinity spikeRatio marks a failure mode with no baseline at all.
        const isNewFailureMode = !Number.isFinite(report.spikeRatio);
        const ratioLabel = isNewFailureMode
            ? chalk.red('NEW failure type (no baseline)')
            : chalk.red(`${report.spikeRatio.toFixed(1)}× baseline`);
        const countsLabel = chalk.dim(`recent=${report.recentCount}, baseline=${report.baselineCount}`);
        const headerLabel = ` ${chalk.red('•')} ${chalk.bold(report.toolName)} / ${chalk.yellow(report.category)}`;
        console.log(`${headerLabel} ${ratioLabel} ${countsLabel}`);
        // Cap the sample at 140 chars so one giant stack trace doesn't flood the report.
        const sample = report.sampleMessage.length > 140
            ? report.sampleMessage.slice(0, 140) + '…'
            : report.sampleMessage;
        console.log(chalk.dim(` sample: ${sample}`));
    }
    console.log(chalk.dim(`\n ${anomalies.length} anomalies. Investigate before they snowball.\n`));
    process.exit(1);
}
package/dist/index.js CHANGED
@@ -185,6 +185,7 @@ program
185
185
  .command('doctor')
186
186
  .description('One-command health check (node, wallet, chain, gateway, MCP, telemetry)')
187
187
  .option('--json', 'Machine-readable output')
188
+ .option('--anomaly', 'Surface (tool, category) failure spikes vs 30-day baseline')
188
189
  .action(async (opts) => {
189
190
  const { doctorCommand } = await import('./commands/doctor.js');
190
191
  await doctorCommand(opts);
@@ -1,7 +1,27 @@
1
1
  /**
2
2
  * Structured failure logging for self-evolution analysis.
3
3
  * Append-only JSONL at ~/.blockrun/failures.jsonl (capped 500 records).
4
+ *
5
+ * 2026-05-11: Adopted a Cursor-style tool-failure taxonomy on the
6
+ * `category` field. Lets us:
7
+ * 1. Tell at a glance whether a spike of failures is the model's
8
+ * fault (InvalidArguments), the environment's fault
9
+ * (UnexpectedEnvironment), an upstream's fault (ProviderError),
10
+ * a user action (UserAborted), or a slow path (Timeout).
11
+ * 2. Build per-(tool, category) baselines for anomaly detection —
12
+ * see `getToolAnomalies()` below.
13
+ *
14
+ * The existing single-line errorMessage column is preserved so older
15
+ * records still parse. classifyToolFailure() auto-classifies records
16
+ * without a category field on read, so historical entries flow into
17
+ * the same dashboards without a migration.
4
18
  */
19
+ /**
20
+ * Coarse classification of a tool failure. Mirrors Cursor's published
21
+ * "Tool reliability" taxonomy so error dashboards translate cleanly
22
+ * across the industry, but tuned for Franklin's tool surface.
23
+ */
24
+ export type ToolFailureCategory = 'InvalidArguments' | 'UnexpectedEnvironment' | 'ProviderError' | 'UserAborted' | 'Timeout' | 'Unknown';
5
25
  export interface FailureRecord {
6
26
  timestamp: number;
7
27
  model: string;
@@ -9,12 +29,66 @@ export interface FailureRecord {
9
29
  toolName?: string;
10
30
  errorMessage: string;
11
31
  recoveryAction?: string;
32
+ /**
33
+ * Coarse classification of the failure. Set by recordFailure() when
34
+ * a record is written, or auto-filled by loadFailures() for older
35
+ * records that pre-date this field.
36
+ */
37
+ category?: ToolFailureCategory;
12
38
  }
39
+ /**
40
+ * Classify a tool failure by matching the error message + tool name
41
+ * against known patterns. Layered top-to-bottom — first match wins.
42
+ * `Unknown` is the catch-all; if you see one in production, the
43
+ * classifier needs a new branch (file a follow-up).
44
+ */
45
+ export declare function classifyToolFailure(errorMessage: string, toolName?: string): ToolFailureCategory;
13
46
  export declare function recordFailure(record: FailureRecord): void;
14
47
  export declare function loadFailures(limit?: number): FailureRecord[];
15
48
  export declare function getFailureStats(): {
16
49
  byTool: Map<string, number>;
17
50
  byType: Map<string, number>;
51
+ byCategory: Map<ToolFailureCategory, number>;
18
52
  total: number;
19
53
  recentFailures: FailureRecord[];
20
54
  };
55
+ export interface AnomalyReport {
56
+ toolName: string;
57
+ category: ToolFailureCategory;
58
+ recentCount: number;
59
+ baselineCount: number;
60
+ baselineWindowMs: number;
61
+ recentWindowMs: number;
62
+ /**
63
+ * Multiplier of recent-rate vs baseline-rate. Infinity when the
64
+ * baseline is zero (i.e. a new failure type appeared). 1.0 = same
65
+ * rate as baseline.
66
+ */
67
+ spikeRatio: number;
68
+ /** Most recent error message in this bucket — useful for triage. */
69
+ sampleMessage: string;
70
+ }
71
+ export interface AnomalyOptions {
72
+ /** Recent window in ms. Default 24h. */
73
+ recentWindowMs?: number;
74
+ /** Baseline window in ms (counted from now, includes the recent window). Default 30d. */
75
+ baselineWindowMs?: number;
76
+ /** Minimum recent count to consider — filters out single-flake noise. Default 3. */
77
+ minRecent?: number;
78
+ /** Minimum spike ratio to surface. Default 3.0. */
79
+ minSpikeRatio?: number;
80
+ }
81
+ /**
82
+ * Compute (tool, category) anomalies vs a rolling baseline.
83
+ *
84
+ * Returns the buckets where the recent failure rate is dramatically
85
+ * higher than baseline — sorted by spike severity. Skips buckets where
86
+ * `recentCount` is below `minRecent` to avoid surfacing every flaky
87
+ * one-off.
88
+ *
89
+ * A bucket with `baselineCount=0` and `recentCount >= minRecent` is
90
+ * always surfaced (spikeRatio = Infinity) — these are brand-new failure
91
+ * modes that the harness has never seen before, and they're the most
92
+ * important kind to investigate.
93
+ */
94
+ export declare function getToolAnomalies(opts?: AnomalyOptions): AnomalyReport[];
@@ -1,16 +1,101 @@
1
1
  /**
2
2
  * Structured failure logging for self-evolution analysis.
3
3
  * Append-only JSONL at ~/.blockrun/failures.jsonl (capped 500 records).
4
+ *
5
+ * 2026-05-11: Adopted a Cursor-style tool-failure taxonomy on the
6
+ * `category` field. Lets us:
7
+ * 1. Tell at a glance whether a spike of failures is the model's
8
+ * fault (InvalidArguments), the environment's fault
9
+ * (UnexpectedEnvironment), an upstream's fault (ProviderError),
10
+ * a user action (UserAborted), or a slow path (Timeout).
11
+ * 2. Build per-(tool, category) baselines for anomaly detection —
12
+ * see `getToolAnomalies()` below.
13
+ *
14
+ * The existing single-line errorMessage column is preserved so older
15
+ * records still parse. classifyToolFailure() auto-classifies records
16
+ * without a category field on read, so historical entries flow into
17
+ * the same dashboards without a migration.
4
18
  */
5
19
  import fs from 'node:fs';
6
20
  import path from 'node:path';
7
21
  import { BLOCKRUN_DIR } from '../config.js';
8
- const FAILURES_FILE = path.join(BLOCKRUN_DIR, 'failures.jsonl');
22
/**
 * Resolve the failures-file path at call time, not module-load time, so
 * tests can sandbox via FRANKLIN_HOME (already an established convention
 * — see src/tasks/paths.ts). Production keeps the default
 * ~/.blockrun/failures.jsonl path unchanged.
 */
function failuresFile() {
    const sandboxHome = process.env.FRANKLIN_HOME;
    if (sandboxHome) {
        return path.join(sandboxHome, 'failures.jsonl');
    }
    return path.join(BLOCKRUN_DIR, 'failures.jsonl');
}
34
/**
 * Classify a tool failure by matching the error message + tool name
 * against known patterns. Layered top-to-bottom — first match wins.
 * `Unknown` is the catch-all; if you see one in production, the
 * classifier needs a new branch (file a follow-up).
 */
export function classifyToolFailure(errorMessage, toolName) {
    const msg = (errorMessage || '').toLowerCase();
    // Ordered rule table — first hit wins. UserAborted leads because
    // abort messages often *contain* the word "timeout" or "error" and
    // would otherwise land in a later bucket.
    const rules = [
        // User-initiated cancel or harness abort signal.
        { category: 'UserAborted', pattern: /this operation was aborted|user aborted|user cancel|user_cancel|sigint|sigterm|operation cancell?ed|abortcontroller/ },
        // Distinct from ProviderError: the *call* went out but exceeded
        // our budget. Retries shouldn't fire again without escalating it.
        { category: 'Timeout', pattern: /timed out after|timeout|deadline exceeded|etimedout|operation timed out|exceeded.*time/ },
        // The world isn't as the model assumed: ENOENT, missing wallet,
        // chain mismatch, cwd not a repo, etc.
        { category: 'UnexpectedEnvironment', pattern: /enoent|no such file|cannot find|does not exist|not a (git|directory)|wallet not (configured|found)|insufficient.*(balance|funds|lamports)|not logged in|chain mismatch|invalid wallet|command not found/ },
        // An upstream service we don't control returned bad: rate limits,
        // 5xx, gateway 4xx, network failures, fetch failures.
        { category: 'ProviderError', pattern: /rate.?limit|429|5\d\d|gateway|upstream|provider|fetch failed|econn(refused|reset)|enotfound|socket hang up|network error|http \d{3}|api error|gateway timeout/ },
        // The model called the tool wrong: schema rejects, missing/extra
        // fields, type mismatches, and the very common "cannot read
        // properties of undefined" input-shape smell.
        { category: 'InvalidArguments', pattern: /invalid (argument|input|parameter|value|schema)|missing (required|argument|field|parameter)|expected.*(but|got|received)|cannot read (properties|property) of (undefined|null)|typeerror|schema (rejected|mismatch|validation)|bad request|400|invalid.*format|unrecognized/ },
    ];
    for (const rule of rules) {
        if (rule.pattern.test(msg)) {
            return rule.category;
        }
    }
    // Tool-specific tells, consulted only when no generic rule matched.
    if (toolName) {
        const tool = toolName.toLowerCase();
        if ((tool === 'searchx' || tool === 'posttox') && /login wall|sign in|create account/.test(msg)) {
            return 'UnexpectedEnvironment';
        }
        if (tool === 'bash' && /permission denied|eacces/.test(msg)) {
            return 'UnexpectedEnvironment';
        }
    }
    return 'Unknown';
}
9
85
  const MAX_RECORDS = 500;
10
86
  export function recordFailure(record) {
87
+ if (process.env.FRANKLIN_NO_AUDIT === '1' || process.env.FRANKLIN_NO_PERSIST === '1')
88
+ return;
11
89
  try {
12
- fs.mkdirSync(path.dirname(FAILURES_FILE), { recursive: true });
13
- fs.appendFileSync(FAILURES_FILE, JSON.stringify(record) + '\n');
90
+ // Auto-classify on write so callsites don't need to know the
91
+ // taxonomy. Callers can still override by passing `category`
92
+ // explicitly (e.g. when the abort came from a known SIGINT handler).
93
+ const enriched = {
94
+ ...record,
95
+ category: record.category ?? classifyToolFailure(record.errorMessage, record.toolName),
96
+ };
97
+ fs.mkdirSync(path.dirname(failuresFile()), { recursive: true });
98
+ fs.appendFileSync(failuresFile(), JSON.stringify(enriched) + '\n');
14
99
  // Trim to MAX_RECORDS (only check periodically to avoid constant reads)
15
100
  if (Math.random() < 0.1) {
16
101
  trimFailures();
@@ -22,12 +107,12 @@ export function recordFailure(record) {
22
107
  }
23
108
  function trimFailures() {
24
109
  try {
25
- if (!fs.existsSync(FAILURES_FILE))
110
+ if (!fs.existsSync(failuresFile()))
26
111
  return;
27
- const lines = fs.readFileSync(FAILURES_FILE, 'utf-8').trim().split('\n');
112
+ const lines = fs.readFileSync(failuresFile(), 'utf-8').trim().split('\n');
28
113
  if (lines.length > MAX_RECORDS) {
29
114
  const trimmed = lines.slice(-MAX_RECORDS).join('\n') + '\n';
30
- fs.writeFileSync(FAILURES_FILE, trimmed);
115
+ fs.writeFileSync(failuresFile(), trimmed);
31
116
  }
32
117
  }
33
118
  catch {
@@ -36,10 +121,19 @@ function trimFailures() {
36
121
  }
37
122
  export function loadFailures(limit = 100) {
38
123
  try {
39
- if (!fs.existsSync(FAILURES_FILE))
124
+ if (!fs.existsSync(failuresFile()))
40
125
  return [];
41
- const lines = fs.readFileSync(FAILURES_FILE, 'utf-8').trim().split('\n').filter(Boolean);
42
- return lines.slice(-limit).map(l => JSON.parse(l));
126
+ const lines = fs.readFileSync(failuresFile(), 'utf-8').trim().split('\n').filter(Boolean);
127
+ return lines.slice(-limit).map(l => {
128
+ const parsed = JSON.parse(l);
129
+ // Auto-classify historical records that pre-date the `category`
130
+ // field. We don't rewrite the file — read-side enrichment keeps
131
+ // the on-disk shape append-only and idempotent.
132
+ if (!parsed.category) {
133
+ parsed.category = classifyToolFailure(parsed.errorMessage, parsed.toolName);
134
+ }
135
+ return parsed;
136
+ });
43
137
  }
44
138
  catch {
45
139
  return [];
@@ -49,15 +143,97 @@ export function getFailureStats() {
49
143
  const records = loadFailures(500);
50
144
  const byTool = new Map();
51
145
  const byType = new Map();
146
+ const byCategory = new Map();
52
147
  for (const r of records) {
53
148
  if (r.toolName)
54
149
  byTool.set(r.toolName, (byTool.get(r.toolName) ?? 0) + 1);
55
150
  byType.set(r.failureType, (byType.get(r.failureType) ?? 0) + 1);
151
+ if (r.category)
152
+ byCategory.set(r.category, (byCategory.get(r.category) ?? 0) + 1);
56
153
  }
57
154
  return {
58
155
  byTool,
59
156
  byType,
157
+ byCategory,
60
158
  total: records.length,
61
159
  recentFailures: records.slice(-10),
62
160
  };
63
161
  }
162
/**
 * Compute (tool, category) anomalies vs a rolling baseline.
 *
 * Returns the buckets where the recent failure rate is dramatically
 * higher than baseline — sorted by spike severity. Skips buckets where
 * `recentCount` is below `minRecent` to avoid surfacing every flaky
 * one-off.
 *
 * A bucket with `baselineCount=0` and `recentCount >= minRecent` is
 * always surfaced (spikeRatio = Infinity) — these are brand-new failure
 * modes that the harness has never seen before, and they're the most
 * important kind to investigate.
 */
export function getToolAnomalies(opts = {}) {
    const recentWindowMs = opts.recentWindowMs ?? 24 * 60 * 60 * 1000;
    const baselineWindowMs = opts.baselineWindowMs ?? 30 * 24 * 60 * 60 * 1000;
    const minRecent = opts.minRecent ?? 3;
    const minSpikeRatio = opts.minSpikeRatio ?? 3.0;
    const now = Date.now();
    const recentCutoff = now - recentWindowMs;
    const baselineCutoff = now - baselineWindowMs;
    // One bucket per (tool, category). The tool/category are kept on the
    // bucket record instead of being round-tripped through the string
    // key — the previous `key.split('::')` re-parse silently corrupted
    // reports for tool names that themselves contained '::'.
    const buckets = new Map();
    for (const r of loadFailures(500)) {
        if (r.timestamp < baselineCutoff)
            continue;
        const toolName = r.toolName ?? '<no-tool>';
        const category = r.category ?? 'Unknown';
        const key = `${toolName}::${category}`;
        let bucket = buckets.get(key);
        if (!bucket) {
            bucket = { toolName, category, recentCount: 0, baselineCount: 0, sample: '' };
            buckets.set(key, bucket);
        }
        if (r.timestamp >= recentCutoff) {
            bucket.recentCount += 1;
            bucket.sample = r.errorMessage; // last seen wins; useful for triage
        }
        else {
            bucket.baselineCount += 1;
        }
    }
    // Normalize rates by window length so spikes are comparable across
    // different (recent, baseline) sizes. The baseline window excludes
    // the recent window by construction (we partitioned above); clamp to
    // 1ms in case callers pass baselineWindowMs <= recentWindowMs.
    const baselineSpanMs = Math.max(1, baselineWindowMs - recentWindowMs);
    const reports = [];
    for (const bucket of buckets.values()) {
        // Require at least one recent failure (matches the old behavior
        // of only walking recent buckets) and the minRecent floor.
        if (bucket.recentCount === 0 || bucket.recentCount < minRecent)
            continue;
        const recentRate = bucket.recentCount / recentWindowMs;
        const baselineRate = bucket.baselineCount / baselineSpanMs;
        // Infinity flags a brand-new failure mode (no baseline at all).
        const spikeRatio = baselineRate > 0
            ? recentRate / baselineRate
            : Number.POSITIVE_INFINITY;
        if (spikeRatio < minSpikeRatio)
            continue;
        reports.push({
            toolName: bucket.toolName,
            category: bucket.category,
            recentCount: bucket.recentCount,
            baselineCount: bucket.baselineCount,
            baselineWindowMs,
            recentWindowMs,
            spikeRatio,
            sampleMessage: bucket.sample,
        });
    }
    // Sort: brand-new failures (spikeRatio = Infinity) first, then by
    // ratio desc; ties broken by recent volume. The equality check also
    // sidesteps Infinity - Infinity = NaN in the comparator.
    reports.sort((a, b) => {
        if (a.spikeRatio === b.spikeRatio)
            return b.recentCount - a.recentCount;
        return b.spikeRatio - a.spikeRatio;
    });
    return reports;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@blockrun/franklin",
3
- "version": "3.15.91",
3
+ "version": "3.15.92",
4
4
  "description": "Franklin — The AI agent with a wallet. Spends USDC autonomously to get real work done. Pay per action, no subscriptions.",
5
5
  "type": "module",
6
6
  "exports": {