@blockrun/franklin 3.3.2 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. package/README.md +58 -7
  2. package/dist/agent/commands.d.ts +1 -1
  3. package/dist/agent/commands.js +128 -17
  4. package/dist/agent/compact.d.ts +2 -2
  5. package/dist/agent/compact.js +148 -22
  6. package/dist/agent/context.d.ts +8 -3
  7. package/dist/agent/context.js +301 -108
  8. package/dist/agent/error-classifier.d.ts +11 -2
  9. package/dist/agent/error-classifier.js +64 -10
  10. package/dist/agent/llm.d.ts +8 -1
  11. package/dist/agent/llm.js +114 -19
  12. package/dist/agent/loop.d.ts +1 -2
  13. package/dist/agent/loop.js +509 -61
  14. package/dist/agent/optimize.d.ts +2 -2
  15. package/dist/agent/optimize.js +9 -7
  16. package/dist/agent/permissions.d.ts +1 -1
  17. package/dist/agent/permissions.js +1 -1
  18. package/dist/agent/planner.d.ts +42 -0
  19. package/dist/agent/planner.js +110 -0
  20. package/dist/agent/reduce.d.ts +7 -1
  21. package/dist/agent/reduce.js +85 -3
  22. package/dist/agent/streaming-executor.d.ts +6 -1
  23. package/dist/agent/streaming-executor.js +83 -5
  24. package/dist/agent/tokens.d.ts +11 -2
  25. package/dist/agent/tokens.js +38 -5
  26. package/dist/agent/tool-guard.d.ts +27 -0
  27. package/dist/agent/tool-guard.js +324 -0
  28. package/dist/agent/types.d.ts +7 -1
  29. package/dist/agent/types.js +1 -1
  30. package/dist/banner.js +27 -40
  31. package/dist/brain/extract.d.ts +11 -0
  32. package/dist/brain/extract.js +154 -0
  33. package/dist/brain/index.d.ts +3 -0
  34. package/dist/brain/index.js +2 -0
  35. package/dist/brain/store.d.ts +42 -0
  36. package/dist/brain/store.js +225 -0
  37. package/dist/brain/types.d.ts +45 -0
  38. package/dist/brain/types.js +5 -0
  39. package/dist/commands/daemon.js +2 -1
  40. package/dist/commands/start.js +16 -3
  41. package/dist/config.js +1 -1
  42. package/dist/index.js +27 -2
  43. package/dist/learnings/extractor.d.ts +13 -0
  44. package/dist/learnings/extractor.js +69 -8
  45. package/dist/learnings/index.d.ts +1 -1
  46. package/dist/learnings/index.js +1 -1
  47. package/dist/learnings/store.js +42 -13
  48. package/dist/learnings/types.d.ts +1 -1
  49. package/dist/mcp/client.d.ts +1 -1
  50. package/dist/mcp/client.js +5 -5
  51. package/dist/mcp/config.d.ts +1 -1
  52. package/dist/mcp/config.js +1 -1
  53. package/dist/panel/html.d.ts +2 -0
  54. package/dist/panel/html.js +409 -146
  55. package/dist/panel/server.js +19 -0
  56. package/dist/pricing.js +3 -2
  57. package/dist/proxy/fallback.d.ts +3 -1
  58. package/dist/proxy/fallback.js +4 -4
  59. package/dist/proxy/server.js +29 -11
  60. package/dist/proxy/sse-translator.js +1 -1
  61. package/dist/router/categories.d.ts +21 -0
  62. package/dist/router/categories.js +96 -0
  63. package/dist/router/index.d.ts +9 -2
  64. package/dist/router/index.js +106 -27
  65. package/dist/router/local-elo.d.ts +32 -0
  66. package/dist/router/local-elo.js +107 -0
  67. package/dist/router/selector.d.ts +46 -0
  68. package/dist/router/selector.js +106 -0
  69. package/dist/session/storage.d.ts +5 -1
  70. package/dist/session/storage.js +24 -2
  71. package/dist/social/a11y.d.ts +1 -1
  72. package/dist/social/a11y.js +5 -1
  73. package/dist/social/browser.d.ts +5 -0
  74. package/dist/social/browser.js +22 -0
  75. package/dist/social/preflight.d.ts +4 -0
  76. package/dist/social/preflight.js +42 -3
  77. package/dist/stats/failures.d.ts +20 -0
  78. package/dist/stats/failures.js +63 -0
  79. package/dist/stats/format.d.ts +6 -0
  80. package/dist/stats/format.js +23 -0
  81. package/dist/stats/insights.js +1 -21
  82. package/dist/stats/session-tracker.d.ts +21 -0
  83. package/dist/stats/session-tracker.js +28 -0
  84. package/dist/stats/tracker.d.ts +1 -1
  85. package/dist/stats/tracker.js +1 -1
  86. package/dist/tools/bash.d.ts +14 -1
  87. package/dist/tools/bash.js +132 -7
  88. package/dist/tools/edit.js +77 -14
  89. package/dist/tools/glob.js +13 -3
  90. package/dist/tools/grep.js +30 -12
  91. package/dist/tools/imagegen.js +3 -3
  92. package/dist/tools/index.d.ts +1 -1
  93. package/dist/tools/index.js +5 -1
  94. package/dist/tools/read.d.ts +16 -2
  95. package/dist/tools/read.js +36 -8
  96. package/dist/tools/searchx.d.ts +6 -2
  97. package/dist/tools/searchx.js +221 -44
  98. package/dist/tools/subagent.js +37 -3
  99. package/dist/tools/task.js +43 -7
  100. package/dist/tools/validate.d.ts +11 -0
  101. package/dist/tools/validate.js +42 -0
  102. package/dist/tools/webfetch.js +18 -7
  103. package/dist/tools/websearch.js +41 -7
  104. package/dist/tools/write.js +26 -6
  105. package/dist/ui/app.js +31 -6
  106. package/dist/ui/model-picker.d.ts +1 -1
  107. package/dist/ui/model-picker.js +1 -1
  108. package/dist/ui/terminal.d.ts +1 -1
  109. package/dist/ui/terminal.js +1 -1
  110. package/package.json +2 -2
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Token optimization strategies for runcode.
2
+ * Token optimization strategies for Franklin.
3
3
  *
4
4
  * Five layers of optimization to minimize token usage:
5
5
  * 1. Tool result size budgeting — cap large outputs, keep preview
@@ -23,7 +23,7 @@ export declare function getMaxOutputTokens(model: string): number;
23
23
  export declare function budgetToolResults(history: Dialogue[]): Dialogue[];
24
24
  export declare function stripOldThinking(history: Dialogue[]): Dialogue[];
25
25
  /**
26
- * After an idle gap (>60 min), clear old tool results.
26
+ * After an idle gap (>30 min), clear old tool results.
27
27
  * When the user comes back after being away, old results are stale anyway.
28
28
  */
29
29
  export declare function timeBasedCleanup(history: Dialogue[], lastActivityTimestamp?: number): {
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Token optimization strategies for runcode.
2
+ * Token optimization strategies for Franklin.
3
3
  *
4
4
  * Five layers of optimization to minimize token usage:
5
5
  * 1. Tool result size budgeting — cap large outputs, keep preview
@@ -34,8 +34,10 @@ const MODEL_MAX_OUTPUT = {
34
34
  export function getMaxOutputTokens(model) {
35
35
  return MODEL_MAX_OUTPUT[model] ?? 16_384;
36
36
  }
37
- /** Idle gap (minutes) after which old tool results are cleared */
38
- const IDLE_GAP_THRESHOLD_MINUTES = 5;
37
+ /** Idle gap (minutes) after which old tool results are cleared.
38
+ * Set to 30 min — a coffee break shouldn't lose tool context.
39
+ * Was 5 min which was too aggressive (comment said 60, code said 5). */
40
+ const IDLE_GAP_THRESHOLD_MINUTES = 30;
39
41
  /** Number of recent tool results to keep during time-based cleanup */
40
42
  const KEEP_RECENT_TOOL_RESULTS = 3;
41
43
  // ─── 1. Tool Result Size Budgeting ─────────────────────────────────────────
@@ -140,7 +142,7 @@ export function stripOldThinking(history) {
140
142
  }
141
143
  // ─── 3. Time-Based Cleanup ─────────────────────────────────────────────────
142
144
  /**
143
- * After an idle gap (>60 min), clear old tool results.
145
+ * After an idle gap (>30 min), clear old tool results.
144
146
  * When the user comes back after being away, old results are stale anyway.
145
147
  */
146
148
  export function timeBasedCleanup(history, lastActivityTimestamp) {
@@ -240,7 +242,7 @@ export function optimizeHistory(history, opts) {
240
242
  result = stripped;
241
243
  changed = true;
242
244
  if (opts?.debug)
243
- console.error('[runcode] Stripped old thinking blocks');
245
+ console.error('[franklin] Stripped old thinking blocks');
244
246
  }
245
247
  // 2. Budget tool results
246
248
  const budgeted = budgetToolResults(result);
@@ -248,7 +250,7 @@ export function optimizeHistory(history, opts) {
248
250
  result = budgeted;
249
251
  changed = true;
250
252
  if (opts?.debug)
251
- console.error('[runcode] Budgeted oversized tool results');
253
+ console.error('[franklin] Budgeted oversized tool results');
252
254
  }
253
255
  // 3. Time-based cleanup
254
256
  const { history: cleaned, cleaned: didClean } = timeBasedCleanup(result, opts?.lastActivityTimestamp);
@@ -256,7 +258,7 @@ export function optimizeHistory(history, opts) {
256
258
  result = cleaned;
257
259
  changed = true;
258
260
  if (opts?.debug)
259
- console.error('[runcode] Cleared stale tool results after idle gap');
261
+ console.error('[franklin] Cleared stale tool results after idle gap');
260
262
  }
261
263
  return result;
262
264
  }
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Permission system for runcode.
2
+ * Permission system for Franklin.
3
3
  * Controls which tools can execute automatically vs. require user approval.
4
4
  */
5
5
  export type PermissionBehavior = 'allow' | 'deny' | 'ask';
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Permission system for runcode.
2
+ * Permission system for Franklin.
3
3
  * Controls which tools can execute automatically vs. require user approval.
4
4
  */
5
5
  import fs from 'node:fs';
@@ -0,0 +1,42 @@
1
+ /**
2
+ * Planner-Executor for Franklin
3
+ *
4
+ * Uses expensive models (Opus/Sonnet) for planning, then cheap/free models
5
+ * for execution. Saves 40-70% on complex tasks while maintaining quality.
6
+ *
7
+ * Flow: detect complexity → plan with strong model → execute with cheap model
8
+ * → escalate back to strong model if executor gets stuck
9
+ */
10
+ import type { Tier, RoutingProfile } from '../router/index.js';
11
+ /**
12
+ * Should this task use plan-then-execute?
13
+ * Returns true only for complex, multi-step tasks where the savings justify
14
+ * the overhead of an extra planning call.
15
+ */
16
+ export declare function shouldPlan(tier: Tier | undefined, profile: RoutingProfile | undefined, userText: string, ultrathink: boolean, planDisabled: boolean): boolean;
17
+ /**
18
+ * Returns the planning system prompt section.
19
+ * Injected alongside the normal system prompt during the planning call.
20
+ */
21
+ export declare function getPlanningPrompt(): string;
22
+ /**
23
+ * Pick the cheap executor model for a given routing profile.
24
+ * These models are good at following structured instructions (the plan)
25
+ * but much cheaper than the planning model.
26
+ */
27
+ export declare function getExecutorModel(profile: RoutingProfile): string;
28
+ /**
29
+ * Extract numbered steps from plan text.
30
+ * Handles formats like "1. Do X", "1) Do X", "Step 1: Do X".
31
+ */
32
+ export declare function parsePlanSteps(text: string): string[];
33
+ /**
34
+ * Detect if the executor model is stuck.
35
+ * Triggers when the model hits repeated errors or repeats the same tool call.
36
+ */
37
+ export declare function isExecutorStuck(consecutiveErrors: number, sameToolRepeat: boolean): boolean;
38
+ /**
39
+ * Build a signature for a tool call (name + first 100 chars of input JSON).
40
+ * Used to detect when the executor repeats the exact same call.
41
+ */
42
+ export declare function toolCallSignature(name: string, input: unknown): string;
@@ -0,0 +1,110 @@
1
+ /**
2
+ * Planner-Executor for Franklin
3
+ *
4
+ * Uses expensive models (Opus/Sonnet) for planning, then cheap/free models
5
+ * for execution. Saves 40-70% on complex tasks while maintaining quality.
6
+ *
7
+ * Flow: detect complexity → plan with strong model → execute with cheap model
8
+ * → escalate back to strong model if executor gets stuck
9
+ */
10
+ // ─── Agentic keywords that suggest multi-step work ───────────────────────
11
+ const AGENTIC_KEYWORDS = /\b(implement|refactor|build|fix|debug|migrate|deploy|create|add|remove|update|restructure|extract|rewrite|optimize|convert|integrate|setup|configure)\b/i;
12
+ const MULTI_STEP_PATTERN = /first.*then|step\s+\d|\d+\.\s|and\s+then|after\s+that|next\s*,|finally\b/i;
13
+ // ─── Detection ───────────────────────────────────────────────────────────
14
+ /**
15
+ * Should this task use plan-then-execute?
16
+ * Returns true only for complex, multi-step tasks where the savings justify
17
+ * the overhead of an extra planning call.
18
+ */
19
+ export function shouldPlan(tier, profile, userText, ultrathink, planDisabled) {
20
+ // Gate 1: only COMPLEX or REASONING tiers benefit from planning
21
+ if (tier !== 'COMPLEX' && tier !== 'REASONING')
22
+ return false;
23
+ // Gate 2: only auto or premium profiles (eco/free already cost-optimized)
24
+ if (profile !== 'auto' && profile !== 'premium')
25
+ return false;
26
+ // Gate 3: skip short queries — planning overhead not worth it
27
+ if (userText.length < 80)
28
+ return false;
29
+ // Gate 4: ultrathink already provides deep reasoning
30
+ if (ultrathink)
31
+ return false;
32
+ // Gate 5: user disabled planning for this session
33
+ if (planDisabled)
34
+ return false;
35
+ // Gate 6: must have agentic or multi-step signals
36
+ const hasAgenticKeyword = AGENTIC_KEYWORDS.test(userText);
37
+ const hasMultiStep = MULTI_STEP_PATTERN.test(userText);
38
+ return hasAgenticKeyword || hasMultiStep;
39
+ }
40
+ // ─── Planning Prompt ─────────────────────────────────────────────────────
41
+ /**
42
+ * Returns the planning system prompt section.
43
+ * Injected alongside the normal system prompt during the planning call.
44
+ */
45
+ export function getPlanningPrompt() {
46
+ return `# Planning Mode — Active
47
+ You are in planning mode. Produce a structured execution plan for the user's request.
48
+
49
+ Rules:
50
+ - Output a numbered list of concrete steps. Each step = one action.
51
+ - Include specific file paths, function names, or shell commands when known.
52
+ - If you need to explore the codebase first, make it step 1.
53
+ - Mark steps that can run in parallel with [PARALLEL].
54
+ - Keep the plan to 15 steps max.
55
+ - End with a verification step (run tests, check output, etc.).
56
+ - Output ONLY the numbered plan. No code blocks, no explanations, no preamble.`;
57
+ }
58
+ // ─── Executor Model Selection ────────────────────────────────────────────
59
+ /**
60
+ * Pick the cheap executor model for a given routing profile.
61
+ * These models are good at following structured instructions (the plan)
62
+ * but much cheaper than the planning model.
63
+ */
64
+ export function getExecutorModel(profile) {
65
+ switch (profile) {
66
+ case 'premium':
67
+ return 'moonshot/kimi-k2.5'; // Medium-tier, reliable execution
68
+ case 'auto':
69
+ default:
70
+ return 'google/gemini-2.5-flash'; // Cheap, fast, good at instructions
71
+ }
72
+ }
73
+ // ─── Plan Parsing ────────────────────────────────────────────────────────
74
+ /**
75
+ * Extract numbered steps from plan text.
76
+ * Handles formats like "1. Do X", "1) Do X", "Step 1: Do X".
77
+ */
78
+ export function parsePlanSteps(text) {
79
+ const lines = text.split('\n');
80
+ const steps = [];
81
+ for (const line of lines) {
82
+ const trimmed = line.trim();
83
+ // Match lines that start with "1.", "1)", "1:", or "Step 1" (case-insensitive); a bulleted "- 1. ..." line is not matched.
84
+ if (/^(?:\d+[\.\):]|step\s+\d)/i.test(trimmed)) {
85
+ steps.push(trimmed);
86
+ }
87
+ }
88
+ return steps;
89
+ }
90
+ // ─── Stuck Detection ─────────────────────────────────────────────────────
91
+ /** Max consecutive tool errors before escalation */
92
+ const MAX_CONSECUTIVE_ERRORS = 3;
93
+ /**
94
+ * Detect if the executor model is stuck.
95
+ * Triggers when the model hits repeated errors or repeats the same tool call.
96
+ */
97
+ export function isExecutorStuck(consecutiveErrors, sameToolRepeat) {
98
+ if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS)
99
+ return true;
100
+ if (sameToolRepeat)
101
+ return true;
102
+ return false;
103
+ }
104
+ /**
105
+ * Build a signature for a tool call (name + first 100 chars of input JSON).
106
+ * Used to detect when the executor repeats a call; inputs that differ only beyond the first 100 characters of their JSON share the same signature.
107
+ */
108
+ export function toolCallSignature(name, input) {
109
+ return `${name}::${JSON.stringify(input).slice(0, 100)}`;
110
+ }
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Token Reduction for runcode.
2
+ * Token Reduction for Franklin.
3
3
  * Original implementation — reduces context size through intelligent pruning.
4
4
  *
5
5
  * Strategy: instead of compression/encoding, we PRUNE redundant content.
@@ -42,6 +42,12 @@ export declare function deduplicateMessages(history: Dialogue[]): Dialogue[];
42
42
  * RTK-inspired: dedup_lines + strip_ansi pipeline stages.
43
43
  */
44
44
  export declare function deduplicateToolResultLines(history: Dialogue[]): Dialogue[];
45
+ /**
46
+ * When the same tool (WebSearch, Grep, etc.) is called 6+ times,
47
+ * collapse all but the last 3 results to one-line summaries.
48
+ * Prevents context snowball from search spam (e.g. 96 WebSearches).
49
+ */
50
+ export declare function collapseRepetitiveTools(history: Dialogue[]): Dialogue[];
45
51
  /**
46
52
  * Run all token reduction passes on conversation history.
47
53
  * Returns same reference if nothing changed (cheap identity check).
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Token Reduction for runcode.
2
+ * Token Reduction for Franklin.
3
3
  * Original implementation — reduces context size through intelligent pruning.
4
4
  *
5
5
  * Strategy: instead of compression/encoding, we PRUNE redundant content.
@@ -240,7 +240,82 @@ export function deduplicateToolResultLines(history) {
240
240
  });
241
241
  return modified ? result : history;
242
242
  }
243
- // ─── Pipeline ───────���───────────────────���─────────────────────────────────
243
+ // ─── 6. Repetitive Tool Collapse ─────────────────────────────────────────
244
+ /**
245
+ * When the same tool (WebSearch, Grep, etc.) is called 6+ times,
246
+ * collapse all but the last 3 results to one-line summaries.
247
+ * Prevents context snowball from search spam (e.g. 96 WebSearches).
248
+ */
249
+ export function collapseRepetitiveTools(history) {
250
+ // Count tool_use by name
251
+ const toolCounts = new Map();
252
+ for (const msg of history) {
253
+ if (msg.role !== 'assistant' || !Array.isArray(msg.content))
254
+ continue;
255
+ for (const part of msg.content) {
256
+ if (part.type === 'tool_use') {
257
+ const name = part.name ?? '';
258
+ toolCounts.set(name, (toolCounts.get(name) || 0) + 1);
259
+ }
260
+ }
261
+ }
262
+ // Only for tools called 6+ times
263
+ const repetitive = new Set();
264
+ for (const [name, count] of toolCounts) {
265
+ if (count >= 6)
266
+ repetitive.add(name);
267
+ }
268
+ if (repetitive.size === 0)
269
+ return history;
270
+ // Map tool_use_id → name, track call order per tool
271
+ const idToName = new Map();
272
+ const callOrder = new Map(); // name → [tool_use_id, ...]
273
+ for (const msg of history) {
274
+ if (msg.role !== 'assistant' || !Array.isArray(msg.content))
275
+ continue;
276
+ for (const part of msg.content) {
277
+ if (part.type === 'tool_use' && repetitive.has(part.name ?? '')) {
278
+ const name = part.name ?? '';
279
+ idToName.set(part.id, name);
280
+ if (!callOrder.has(name))
281
+ callOrder.set(name, []);
282
+ callOrder.get(name).push(part.id);
283
+ }
284
+ }
285
+ }
286
+ // Mark old IDs (all but last 3 per tool)
287
+ const oldIds = new Set();
288
+ for (const [, ids] of callOrder) {
289
+ for (let i = 0; i < ids.length - 3; i++) {
290
+ oldIds.add(ids[i]);
291
+ }
292
+ }
293
+ if (oldIds.size === 0)
294
+ return history;
295
+ // Collapse old results
296
+ let modified = false;
297
+ const result = history.map(msg => {
298
+ if (msg.role !== 'user' || !Array.isArray(msg.content))
299
+ return msg;
300
+ let changed = false;
301
+ const parts = msg.content.map(part => {
302
+ if (part.type !== 'tool_result' || !oldIds.has(part.tool_use_id))
303
+ return part;
304
+ const content = typeof part.content === 'string' ? part.content : JSON.stringify(part.content);
305
+ if (content.length <= 80)
306
+ return part;
307
+ changed = true;
308
+ const first = content.split('\n')[0].slice(0, 60);
309
+ return { ...part, content: `[${first}...]` };
310
+ });
311
+ if (!changed)
312
+ return msg;
313
+ modified = true;
314
+ return { ...msg, content: parts };
315
+ });
316
+ return modified ? result : history;
317
+ }
318
+ // ─── Pipeline ────────────────────────────────────────────────────────────
244
319
  /**
245
320
  * Run all token reduction passes on conversation history.
246
321
  * Returns same reference if nothing changed (cheap identity check).
@@ -250,6 +325,13 @@ export function reduceTokens(history, debug) {
250
325
  return history; // Skip for short conversations
251
326
  let current = history;
252
327
  let totalSaved = 0;
328
+ // Pass 0: Collapse repetitive tool results (e.g. 96 WebSearches with similar queries)
329
+ const collapsed = collapseRepetitiveTools(current);
330
+ if (collapsed !== current) {
331
+ const before = estimateChars(current);
332
+ current = collapsed;
333
+ totalSaved += before - estimateChars(current);
334
+ }
253
335
  // Pass 1: Age old tool results
254
336
  const aged = ageToolResults(current);
255
337
  if (aged !== current) {
@@ -288,7 +370,7 @@ export function reduceTokens(history, debug) {
288
370
  }
289
371
  if (debug && totalSaved > 500) {
290
372
  const tokensSaved = Math.round(totalSaved / 4);
291
- console.error(`[runcode] Token reduction: ~${tokensSaved} tokens saved`);
373
+ console.error(`[franklin] Token reduction: ~${tokensSaved} tokens saved`);
292
374
  }
293
375
  return current;
294
376
  }
@@ -1,23 +1,28 @@
1
1
  /**
2
- * Streaming Tool Executor for runcode.
2
+ * Streaming Tool Executor for Franklin.
3
3
  * Starts executing concurrent-safe tools while the model is still streaming.
4
4
  * Non-concurrent tools wait until the full response is received.
5
5
  */
6
6
  import type { CapabilityHandler, CapabilityInvocation, CapabilityResult, ExecutionScope } from './types.js';
7
7
  import type { PermissionManager } from './permissions.js';
8
+ import type { SessionToolGuard } from './tool-guard.js';
8
9
  export declare class StreamingExecutor {
9
10
  private handlers;
10
11
  private scope;
11
12
  private permissions?;
13
+ private guard?;
12
14
  private onStart;
13
15
  private onProgress?;
14
16
  private pending;
17
+ private sessionId;
15
18
  constructor(opts: {
16
19
  handlers: Map<string, CapabilityHandler>;
17
20
  scope: ExecutionScope;
18
21
  permissions?: PermissionManager;
22
+ guard?: SessionToolGuard;
19
23
  onStart: (id: string, name: string, preview?: string) => void;
20
24
  onProgress?: (id: string, text: string) => void;
25
+ sessionId?: string;
21
26
  });
22
27
  /**
23
28
  * Called when a tool_use block is fully received from the stream.
@@ -1,21 +1,53 @@
1
1
  /**
2
- * Streaming Tool Executor for runcode.
2
+ * Streaming Tool Executor for Franklin.
3
3
  * Starts executing concurrent-safe tools while the model is still streaming.
4
4
  * Non-concurrent tools wait until the full response is received.
5
5
  */
6
+ import { mkdirSync, writeFileSync } from 'node:fs';
7
+ import { join } from 'node:path';
8
+ import { recordFailure } from '../stats/failures.js';
9
+ import { BLOCKRUN_DIR } from '../config.js';
10
+ /** Persist a large tool result to disk and return a preview string.
11
+ * Inspired by Claude Code's toolResultStorage.ts. */
12
+ const PERSIST_THRESHOLD = 50_000;
13
+ const PREVIEW_SIZE = 2_000;
14
+ function persistLargeResult(sessionId, toolUseId, output) {
15
+ const dir = join(BLOCKRUN_DIR, 'tool-results', sessionId);
16
+ try {
17
+ mkdirSync(dir, { recursive: true });
18
+ const filePath = join(dir, `${toolUseId}.txt`);
19
+ writeFileSync(filePath, output, { flag: 'wx' }); // write-once (skip if exists)
20
+ // Generate preview — truncate at line boundary for clean output
21
+ let preview = output.slice(0, PREVIEW_SIZE);
22
+ const lastNl = preview.lastIndexOf('\n');
23
+ if (lastNl > PREVIEW_SIZE * 0.5) {
24
+ preview = preview.slice(0, lastNl);
25
+ }
26
+ return `<persisted-output>\nOutput too large (${(output.length / 1024).toFixed(1)}KB). Full output saved to: ${filePath}\n\nPreview (first ${PREVIEW_SIZE / 1000}KB):\n${preview}\n...\n</persisted-output>`;
27
+ }
28
+ catch {
29
+ // Fallback: simple truncation if disk write fails
30
+ return output.slice(0, PERSIST_THRESHOLD) +
31
+ `\n\n[Truncated: original was ${output.length.toLocaleString()} chars]`;
32
+ }
33
+ }
6
34
  export class StreamingExecutor {
7
35
  handlers;
8
36
  scope;
9
37
  permissions;
38
+ guard;
10
39
  onStart;
11
40
  onProgress;
12
41
  pending = [];
42
+ sessionId;
13
43
  constructor(opts) {
14
44
  this.handlers = opts.handlers;
15
45
  this.scope = opts.scope;
16
46
  this.permissions = opts.permissions;
47
+ this.guard = opts.guard;
17
48
  this.onStart = opts.onStart;
18
49
  this.onProgress = opts.onProgress;
50
+ this.sessionId = opts.sessionId || 'default';
19
51
  }
20
52
  /**
21
53
  * Called when a tool_use block is fully received from the stream.
@@ -24,7 +56,10 @@ export class StreamingExecutor {
24
56
  */
25
57
  onToolReceived(invocation) {
26
58
  const handler = this.handlers.get(invocation.name);
27
- const isConcurrent = handler?.concurrent ?? false;
59
+ // Dynamic concurrency check (e.g., Bash is concurrent only for read-only commands)
60
+ const isConcurrent = handler?.isConcurrentSafe
61
+ ? handler.isConcurrentSafe(invocation.input)
62
+ : (handler?.concurrent ?? false);
28
63
  if (isConcurrent) {
29
64
  // Concurrent tools are auto-allowed — start immediately and time from here
30
65
  const preview = this.inputPreview(invocation);
@@ -78,10 +113,17 @@ export class StreamingExecutor {
78
113
  }
79
114
  async executeWithPermissions(invocation, pendingCount = 1, callStart = true // false for concurrent tools (already called in onToolReceived)
80
115
  ) {
116
+ const guardResult = this.guard
117
+ ? await this.guard.beforeExecute(invocation, this.scope)
118
+ : null;
119
+ if (guardResult) {
120
+ return guardResult;
121
+ }
81
122
  // Permission check
82
123
  if (this.permissions) {
83
124
  const decision = await this.permissions.check(invocation.name, invocation.input);
84
125
  if (decision.behavior === 'deny') {
126
+ this.guard?.cancelInvocation(invocation.id);
85
127
  return {
86
128
  output: `Permission denied for ${invocation.name}: ${decision.reason || 'denied by policy'}. Do not retry — explain to the user what you were trying to do and ask how they'd like to proceed.`,
87
129
  isError: true,
@@ -90,6 +132,7 @@ export class StreamingExecutor {
90
132
  if (decision.behavior === 'ask') {
91
133
  const allowed = await this.permissions.promptUser(invocation.name, invocation.input, pendingCount);
92
134
  if (!allowed) {
135
+ this.guard?.cancelInvocation(invocation.id);
93
136
  return {
94
137
  output: `User denied permission for ${invocation.name}. Do not retry — ask the user what they'd like to do instead.`,
95
138
  isError: true,
@@ -102,9 +145,26 @@ export class StreamingExecutor {
102
145
  const preview = this.inputPreview(invocation);
103
146
  this.onStart(invocation.id, invocation.name, preview);
104
147
  }
105
- const handler = this.handlers.get(invocation.name);
148
+ let handler = this.handlers.get(invocation.name);
106
149
  if (!handler) {
107
- return { output: `Unknown capability: ${invocation.name}`, isError: true };
150
+ // Attempt repair: match the tool name case-insensitively, ignoring hyphens, underscores, and spaces
151
+ const attempted = invocation.name;
152
+ const lower = attempted.toLowerCase();
153
+ for (const [name, h] of this.handlers) {
154
+ if (name.toLowerCase() === lower || name.toLowerCase().replace(/[-_ ]/g, '') === lower.replace(/[-_ ]/g, '')) {
155
+ handler = h;
156
+ invocation = { ...invocation, name };
157
+ break;
158
+ }
159
+ }
160
+ if (!handler) {
161
+ this.guard?.cancelInvocation(invocation.id);
162
+ const available = [...this.handlers.keys()].join(', ');
163
+ return {
164
+ output: `Unknown tool "${attempted}". Available tools: ${available}. Check spelling and try again.`,
165
+ isError: true,
166
+ };
167
+ }
108
168
  }
109
169
  // Wire per-invocation progress to onProgress callback
110
170
  const progressScope = this.onProgress
@@ -114,9 +174,27 @@ export class StreamingExecutor {
114
174
  }
115
175
  : this.scope;
116
176
  try {
117
- return await handler.execute(invocation.input, progressScope);
177
+ let result = await handler.execute(invocation.input, progressScope);
178
+ this.guard?.afterExecute(invocation, result);
179
+ // Persist large results to disk with preview (inspired by Claude Code toolResultStorage)
180
+ // Instead of just truncating, save the full result to disk so it can be re-read later.
181
+ if (result.output.length > PERSIST_THRESHOLD) {
182
+ result = {
183
+ output: persistLargeResult(this.sessionId, invocation.id, result.output),
184
+ isError: result.isError,
185
+ };
186
+ }
187
+ return result;
118
188
  }
119
189
  catch (err) {
190
+ this.guard?.cancelInvocation(invocation.id);
191
+ recordFailure({
192
+ timestamp: Date.now(),
193
+ model: '', // not available at tool level
194
+ failureType: 'tool_error',
195
+ toolName: invocation.name,
196
+ errorMessage: err.message,
197
+ });
120
198
  return {
121
199
  output: `Error executing ${invocation.name}: ${err.message}`,
122
200
  isError: true,
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Token estimation for runcode.
2
+ * Token estimation for Franklin.
3
3
  * Uses byte-based heuristic (no external tokenizer dependency).
4
4
  * Anchors to actual API counts when available, estimates on top for new messages.
5
5
  */
@@ -22,9 +22,18 @@ export declare function getAnchoredTokenCount(history: Dialogue[]): {
22
22
  * Reset anchor (e.g., after compaction).
23
23
  */
24
24
  export declare function resetTokenAnchor(): void;
25
+ /**
26
+ * Set the current model for token estimation context.
27
+ * Called when the model is resolved in the agent loop.
28
+ */
29
+ export declare function setEstimationModel(model: string): void;
25
30
  /**
26
31
  * Estimate token count for a string using byte-length heuristic.
27
- * JSON-heavy content uses 2 bytes/token; general text uses 4.
32
+ * JSON-heavy content uses 2 bytes/token; general text uses model-specific ratio.
33
+ *
34
+ * Padding reduced from 1.33x to 1.15x to prevent premature compaction.
35
+ * The old 1.33x + ceil() combo caused ~36% overestimation, triggering
36
+ * auto-compact when context was still 15-20% below the actual limit.
28
37
  */
29
38
  export declare function estimateTokens(text: string, bytesPerToken?: number): number;
30
39
  /**
@@ -1,9 +1,30 @@
1
1
  /**
2
- * Token estimation for runcode.
2
+ * Token estimation for Franklin.
3
3
  * Uses byte-based heuristic (no external tokenizer dependency).
4
4
  * Anchors to actual API counts when available, estimates on top for new messages.
5
5
  */
6
6
  const DEFAULT_BYTES_PER_TOKEN = 4;
7
+ /**
8
+ * Model-specific bytes-per-token ratios for more accurate estimation.
9
+ * Approximate ratios: Claude ~3.5 bytes/token, GPT ~4, Gemini ~3 (a lower ratio yields a higher token estimate for the same text).
10
+ */
11
+ const MODEL_BYTES_PER_TOKEN = {
12
+ 'anthropic': 3.5,
13
+ 'openai': 4,
14
+ 'google': 3,
15
+ 'deepseek': 3.5,
16
+ 'xai': 4,
17
+ 'zai': 4,
18
+ };
19
+ /** Get bytes-per-token ratio for a model. Falls back to DEFAULT_BYTES_PER_TOKEN. */
20
+ function getModelBytesPerToken(model) {
21
+ if (!model)
22
+ return DEFAULT_BYTES_PER_TOKEN;
23
+ const provider = model.split('/')[0];
24
+ return MODEL_BYTES_PER_TOKEN[provider] ?? DEFAULT_BYTES_PER_TOKEN;
25
+ }
26
+ // Store current model for token estimation context
27
+ let _currentModel;
7
28
  // ─── API-anchored token tracking ──────────────────────────────────────────
8
29
  /** Last known actual token count from API response */
9
30
  let lastApiInputTokens = 0;
@@ -59,13 +80,25 @@ export function resetTokenAnchor() {
59
80
  lastApiOutputTokens = 0;
60
81
  lastApiMessageCount = 0;
61
82
  }
83
+ /**
84
+ * Set the current model for token estimation context.
85
+ * Called when the model is resolved in the agent loop.
86
+ */
87
+ export function setEstimationModel(model) {
88
+ _currentModel = model;
89
+ }
62
90
  /**
63
91
  * Estimate token count for a string using byte-length heuristic.
64
- * JSON-heavy content uses 2 bytes/token; general text uses 4.
92
+ * JSON-heavy content uses 2 bytes/token; general text uses model-specific ratio.
93
+ *
94
+ * Padding reduced from 1.33x to 1.15x to prevent premature compaction.
95
+ * The old 1.33x + ceil() combo caused ~36% overestimation, triggering
96
+ * auto-compact when context was still 15-20% below the actual limit.
65
97
  */
66
- export function estimateTokens(text, bytesPerToken = DEFAULT_BYTES_PER_TOKEN) {
67
- // Pad by 4/3 (~33%) for conservative estimation — better to over-count than under-count
68
- return Math.ceil(Buffer.byteLength(text, 'utf-8') / bytesPerToken * 1.33);
98
+ export function estimateTokens(text, bytesPerToken) {
99
+ const effectiveBPT = bytesPerToken ?? getModelBytesPerToken(_currentModel);
100
+ // Pad by 15% for safety margin — still conservative but not premature
101
+ return Math.ceil(Buffer.byteLength(text, 'utf-8') / effectiveBPT * 1.15);
69
102
  }
70
103
  /**
71
104
  * Estimate tokens for a content part.
@@ -0,0 +1,27 @@
1
+ import type { CapabilityInvocation, CapabilityResult, ExecutionScope } from './types.js';
2
+ export declare function normalizeSearchQuery(query: string): {
3
+ normalized: string;
4
+ tokens: string[];
5
+ };
6
+ export declare class SessionToolGuard {
7
+ private turn;
8
+ private webSearchesThisTurn;
9
+ private searchFamilies;
10
+ private searchCache;
11
+ private pendingSearches;
12
+ private recentReads;
13
+ private pendingReads;
14
+ private recentFetches;
15
+ private pendingFetches;
16
+ private toolErrorCounts;
17
+ startTurn(): void;
18
+ beforeExecute(invocation: CapabilityInvocation, scope: ExecutionScope): Promise<CapabilityResult | null>;
19
+ afterExecute(invocation: CapabilityInvocation, result: CapabilityResult): void;
20
+ cancelInvocation(invocationId: string): void;
21
+ private beforeWebSearch;
22
+ private beforeRead;
23
+ private beforeWebFetch;
24
+ private afterWebSearch;
25
+ private afterRead;
26
+ private afterWebFetch;
27
+ }