create-walle 0.9.13 → 0.9.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. package/README.md +8 -3
  2. package/bin/create-walle.js +232 -32
  3. package/bin/mcp-inject.js +18 -53
  4. package/package.json +3 -1
  5. package/template/claude-task-manager/api-prompts.js +11 -2
  6. package/template/claude-task-manager/approval-agent.js +7 -0
  7. package/template/claude-task-manager/db.js +94 -75
  8. package/template/claude-task-manager/docs/session-standup-command-center-design.md +242 -0
  9. package/template/claude-task-manager/docs/session-tooltip-freshness-design.md +224 -0
  10. package/template/claude-task-manager/docs/session-ux-issue-review-2026-05-01.md +369 -0
  11. package/template/claude-task-manager/fuzzy-utils.js +10 -2
  12. package/template/claude-task-manager/git-utils.js +140 -10
  13. package/template/claude-task-manager/lib/agent-capabilities.js +1 -1
  14. package/template/claude-task-manager/lib/agent-presets.js +38 -5
  15. package/template/claude-task-manager/lib/codex-terminal-final.js +53 -0
  16. package/template/claude-task-manager/lib/ctm-session-context-api.js +222 -0
  17. package/template/claude-task-manager/lib/session-diagnostics.js +56 -0
  18. package/template/claude-task-manager/lib/session-history.js +309 -16
  19. package/template/claude-task-manager/lib/session-standup.js +409 -0
  20. package/template/claude-task-manager/lib/session-stream.js +253 -20
  21. package/template/claude-task-manager/lib/standup-attention.js +200 -0
  22. package/template/claude-task-manager/lib/status-hooks.js +8 -2
  23. package/template/claude-task-manager/lib/update-telemetry.js +114 -0
  24. package/template/claude-task-manager/lib/walle-ctm-history.js +49 -6
  25. package/template/claude-task-manager/lib/walle-default-model.js +55 -0
  26. package/template/claude-task-manager/lib/walle-mcp-auto-config.js +66 -0
  27. package/template/claude-task-manager/lib/walle-supervisor.js +86 -19
  28. package/template/claude-task-manager/lib/walle-transcript.js +1 -3
  29. package/template/claude-task-manager/lib/worktree-cwd.js +82 -0
  30. package/template/claude-task-manager/package.json +1 -0
  31. package/template/claude-task-manager/providers/codex-mcp.js +104 -0
  32. package/template/claude-task-manager/providers/index.js +2 -0
  33. package/template/claude-task-manager/public/css/setup.css +2 -1
  34. package/template/claude-task-manager/public/css/walle.css +71 -0
  35. package/template/claude-task-manager/public/index.html +2388 -429
  36. package/template/claude-task-manager/public/js/message-renderer.js +314 -35
  37. package/template/claude-task-manager/public/js/session-search-utils.js +185 -3
  38. package/template/claude-task-manager/public/js/session-status-precedence.js +125 -0
  39. package/template/claude-task-manager/public/js/setup.js +62 -19
  40. package/template/claude-task-manager/public/js/stream-view.js +396 -55
  41. package/template/claude-task-manager/public/js/terminal-restore-state.js +57 -0
  42. package/template/claude-task-manager/public/js/walle-session.js +234 -26
  43. package/template/claude-task-manager/public/js/walle.js +143 -2
  44. package/template/claude-task-manager/server.js +1402 -433
  45. package/template/claude-task-manager/session-integrity.js +77 -28
  46. package/template/claude-task-manager/workers/approval-widget-validator.js +15 -5
  47. package/template/claude-task-manager/workers/scrollback-worker.js +5 -6
  48. package/template/claude-task-manager/workers/state-detectors/codex.js +6 -0
  49. package/template/package.json +1 -1
  50. package/template/wall-e/agent-runners/claude-code.js +2 -0
  51. package/template/wall-e/agent.js +63 -8
  52. package/template/wall-e/api-walle.js +330 -52
  53. package/template/wall-e/brain.js +291 -42
  54. package/template/wall-e/chat.js +172 -15
  55. package/template/wall-e/coding/compaction-service.js +19 -5
  56. package/template/wall-e/coding/stream-processor.js +22 -2
  57. package/template/wall-e/coding/workspace-replay.js +1 -4
  58. package/template/wall-e/coding-orchestrator.js +250 -80
  59. package/template/wall-e/compat.js +0 -28
  60. package/template/wall-e/context/context-builder.js +3 -1
  61. package/template/wall-e/embeddings.js +2 -7
  62. package/template/wall-e/eval/agent-runner.js +30 -9
  63. package/template/wall-e/eval/benchmark-generator.js +21 -1
  64. package/template/wall-e/eval/benchmarks/chat-eval.json +66 -6
  65. package/template/wall-e/eval/benchmarks/coding-agent.json +0 -596
  66. package/template/wall-e/eval/cc-replay.js +1 -0
  67. package/template/wall-e/eval/codex-cli-baseline.js +633 -0
  68. package/template/wall-e/eval/debug-agent003.js +1 -0
  69. package/template/wall-e/eval/eval-orchestrator.js +3 -3
  70. package/template/wall-e/eval/run-agent-benchmarks.js +11 -3
  71. package/template/wall-e/eval/run-codex-cli-baseline.js +177 -0
  72. package/template/wall-e/eval/run-model-comparison.js +1 -0
  73. package/template/wall-e/eval/swebench-adapter.js +1 -0
  74. package/template/wall-e/evaluation/quorum-evaluator.js +0 -1
  75. package/template/wall-e/extraction/knowledge-extractor.js +1 -2
  76. package/template/wall-e/lib/mcp-integration.js +336 -0
  77. package/template/wall-e/llm/ollama.js +47 -8
  78. package/template/wall-e/llm/ollama.plugin.json +1 -1
  79. package/template/wall-e/llm/tool-adapter.js +1 -0
  80. package/template/wall-e/loops/ingest.js +42 -8
  81. package/template/wall-e/loops/initiative.js +87 -2
  82. package/template/wall-e/mcp-server.js +872 -19
  83. package/template/wall-e/memory/ctm-context-client.js +230 -0
  84. package/template/wall-e/memory/ctm-session-context.js +1376 -0
  85. package/template/wall-e/prompts/coding/memory-protocol.md +6 -0
  86. package/template/wall-e/server.js +30 -1
  87. package/template/wall-e/skills/_bundled/memory-search/SKILL.md +8 -0
  88. package/template/wall-e/skills/_bundled/scan-ctm-sessions/SKILL.md +20 -0
  89. package/template/wall-e/skills/_bundled/scan-ctm-sessions/run.js +43 -0
  90. package/template/wall-e/skills/_bundled/slack-mentions/run.js +471 -188
  91. package/template/wall-e/skills/skill-planner.js +86 -4
  92. package/template/wall-e/slack/socket-mode-listener.js +276 -0
  93. package/template/wall-e/telemetry.js +70 -2
  94. package/template/wall-e/tools/builtin-middleware.js +55 -2
  95. package/template/wall-e/tools/shell-policy.js +1 -1
  96. package/template/wall-e/tools/slack-owner.js +104 -0
  97. package/template/website/index.html +4 -4
  98. package/template/builder-journal.md +0 -17
@@ -54,32 +54,17 @@ const {
54
54
  shouldUseStreamProcessor,
55
55
  } = require('./coding/runtime-mode');
56
56
  const { createCodingTranscript } = require('./coding/transcript-writer');
57
+ const {
58
+ CompactionService,
59
+ DEFAULT_CONTEXT_WINDOW,
60
+ } = require('./coding/compaction-service');
61
+ const { estimateTokens, estimateMessagesTokens } = require('./context/token-counter');
57
62
 
58
63
  const MAX_CUMULATIVE_CONTEXT = 4000;
59
64
  const MAX_DIFF_SIZE = 50 * 1024; // 50KB
60
65
  const MAX_AGENT_TURNS = 50;
61
66
  const CHECKPOINT_INTERVAL = 5;
62
67
 
63
- // ANSI-safe truncation: avoid cutting inside CSI escape sequences.
64
- // Inspired by cmux SessionPersistence.swift scrollback truncation.
65
- function ansiSafeTruncate(text, maxLen) {
66
- if (text.length <= maxLen) return text;
67
- let end = maxLen;
68
- // If we're inside an ANSI escape sequence (ESC[...m), advance to its end
69
- // Look back up to 20 chars for an unclosed ESC[
70
- for (let i = end; i > Math.max(0, end - 20); i--) {
71
- if (text[i] === '\x1b' || (text[i] === '\x1B')) {
72
- // Found ESC — check if the sequence closes before our cut point
73
- const closeIdx = text.indexOf('m', i);
74
- if (closeIdx > end && closeIdx < end + 20) {
75
- end = closeIdx + 1; // include the closing 'm'
76
- }
77
- break;
78
- }
79
- }
80
- return text.slice(0, end);
81
- }
82
-
83
68
  // Coding-focused tool definitions (subset of local-tools)
84
69
  const CODING_TOOLS = [
85
70
  {
@@ -493,6 +478,156 @@ function providerSupportsToolCalls(provider) {
493
478
  return true;
494
479
  }
495
480
 
481
+ function positiveNumber(value) {
482
+ const n = Number(value);
483
+ return Number.isFinite(n) && n > 0 ? n : null;
484
+ }
485
+
486
+ function resolveCodingContextWindow(provider, opts = {}) {
487
+ const candidates = [
488
+ opts.compactionContextWindow,
489
+ opts.contextWindow,
490
+ opts.maxContextTokens,
491
+ opts.modelContextWindow,
492
+ provider?.maxContextTokens,
493
+ provider?.max_context_tokens,
494
+ provider?.contextWindow,
495
+ provider?.context_window,
496
+ provider?.metadata?.maxContextTokens,
497
+ provider?.metadata?.max_context_tokens,
498
+ provider?.metadata?.contextWindow,
499
+ provider?.modelInfo?.maxContextTokens,
500
+ provider?.modelInfo?.max_context_tokens,
501
+ ];
502
+ for (const candidate of candidates) {
503
+ const n = positiveNumber(candidate);
504
+ if (n) return n;
505
+ }
506
+ return DEFAULT_CONTEXT_WINDOW;
507
+ }
508
+
509
+ function createCodingCompactionService(provider, modelId, opts = {}) {
510
+ if (opts.autoCompact === false || opts.compaction === false || opts.disableCompaction === true) return null;
511
+ if (String(process.env.WALLE_CODING_AUTO_COMPACT || '').trim() === '0') return null;
512
+ if (opts.compactionService) return opts.compactionService;
513
+ return new CompactionService({
514
+ provider,
515
+ model: modelId,
516
+ contextWindow: resolveCodingContextWindow(provider, opts),
517
+ threshold: opts.compactionThreshold,
518
+ tailTokenBudget: opts.compactionTailTokenBudget,
519
+ keepRecentUserTurns: opts.compactionKeepRecentUserTurns,
520
+ });
521
+ }
522
+
523
+ async function maybeCompactCodingContext({
524
+ messages,
525
+ compactionService,
526
+ systemPrompt = '',
527
+ sessionId,
528
+ cwd,
529
+ transcript,
530
+ events,
531
+ emitProgress,
532
+ mode,
533
+ step = -1,
534
+ sessionMemory,
535
+ reason = 'context_threshold',
536
+ opts = {},
537
+ } = {}) {
538
+ if (!compactionService || !Array.isArray(messages) || messages.length < 2) return null;
539
+ const systemTokens = estimateTokens(systemPrompt || '');
540
+ const estimatedInputTokens = systemTokens + estimateMessagesTokens(messages);
541
+ if (!compactionService.shouldCompact({ messages, systemTokens })) return null;
542
+
543
+ emitProgress?.({
544
+ phase: mode || 'executing',
545
+ step,
546
+ message: 'Compacting coding context...',
547
+ });
548
+
549
+ const result = await compactionService.compact(messages, {
550
+ sessionId,
551
+ cwd,
552
+ reason,
553
+ transcript,
554
+ sessionMemory,
555
+ tailMode: opts.compactionTailMode || 'continue',
556
+ tailTokenBudget: opts.compactionTailTokenBudget,
557
+ keepRecentUserTurns: opts.compactionKeepRecentUserTurns,
558
+ continuePrompt: opts.compactionContinuePrompt,
559
+ });
560
+ if (!result?.compacted || !Array.isArray(result.messages)) return result;
561
+
562
+ messages.splice(0, messages.length, ...result.messages);
563
+ const detail = {
564
+ compactionId: result.metadata?.compactionId || '',
565
+ reason,
566
+ estimatedInputTokens,
567
+ tokensBefore: result.tokensBeforeCompaction,
568
+ tokensAfter: result.tokensAfterCompaction,
569
+ compactedMessages: result.metadata?.compacted_message_count || 0,
570
+ retainedMessages: result.metadata?.retained_message_count || 0,
571
+ tailMode: result.metadata?.tail_mode || '',
572
+ };
573
+ events?.emit?.('context.overflow', { tokens: result.tokensBeforeCompaction, sessionId });
574
+ events?.emit?.('context.compacted', { sessionId, ...detail });
575
+ emitProgress?.({
576
+ phase: mode || 'executing',
577
+ step,
578
+ message: `Context compacted (${detail.compactedMessages} messages summarized)`,
579
+ detail,
580
+ });
581
+ return result;
582
+ }
583
+
584
+ function shouldAutoFallbackToCli({ opts = {}, explicitProvider = false, requestedTools = [] } = {}) {
585
+ if (opts._cliFallbackAttempt) return false;
586
+ if (opts.allowCliFallback === false) return false;
587
+ if (process.env.WALLE_CODING_AUTO_CLI_FALLBACK === '0') return false;
588
+ if (explicitProvider && opts.allowCliFallback !== true) return false;
589
+ if (Array.isArray(requestedTools) && requestedTools.length === 0) return false;
590
+ return true;
591
+ }
592
+
593
+ function isProviderFailureRecoverableByCli(message) {
594
+ const text = String(message || '');
595
+ return /oauth_proxy_error|OAuth token not found|Invalid bearer token|authentication_error|API key not valid|exceeded your current quota|does not support tool calls|No LLM provider configured/i.test(text);
596
+ }
597
+
598
+ async function runCliFallback(prompt, opts = {}, { sid, cwd, reason, fromProvider, model, runtimeMode } = {}) {
599
+ const runnerId = opts.agentRunner || opts.agent_runner || 'claude-code';
600
+ if (opts.onProgress) {
601
+ opts.onProgress({
602
+ type: 'cli_fallback',
603
+ phase: opts.mode || 'executing',
604
+ step: -1,
605
+ message: `Falling back to ${runnerId}`,
606
+ detail: { reason, fromProvider },
607
+ });
608
+ }
609
+ const result = await runHeadless(prompt, {
610
+ cwd,
611
+ sessionId: sid,
612
+ timeoutMs: opts.timeoutMs,
613
+ budgetUsd: opts.budgetUsd,
614
+ runnerId,
615
+ model,
616
+ mode: opts.mode || 'build',
617
+ });
618
+ return {
619
+ ...result,
620
+ provider: result.provider || result.providerType || fromProvider,
621
+ model: result.model || model,
622
+ runtimeMode: runtimeMode?.id || runtimeMode,
623
+ fallback: {
624
+ runnerId,
625
+ fromProvider: fromProvider || null,
626
+ reason: String(reason || '').slice(0, 500),
627
+ },
628
+ };
629
+ }
630
+
496
631
  /**
497
632
  * Writes state object to JSON file.
498
633
  */
@@ -611,6 +746,7 @@ function saveCheckpointToBrain(sid, turn, messages, opts, totalInput, totalOutpu
611
746
  */
612
747
  async function runAgentLoop(prompt, opts = {}) {
613
748
  const { cwd, timeoutMs, maxTurns, provider, model, tools, onProgress } = opts;
749
+ const explicitProvider = !!provider;
614
750
  const sid = opts._resumeSessionId || crypto.randomUUID();
615
751
 
616
752
  // Persist activity start (Phase 2: Activity History)
@@ -677,12 +813,23 @@ async function runAgentLoop(prompt, opts = {}) {
677
813
  }
678
814
  if (requestedTools.length > 0 && !providerSupportsToolCalls(llm)) {
679
815
  const providerType = llm.type || 'unknown';
816
+ const message = `Provider ${providerType} does not support tool calls`;
817
+ if (shouldAutoFallbackToCli({ opts, explicitProvider, requestedTools }) && isProviderFailureRecoverableByCli(message)) {
818
+ return runCliFallback(prompt, opts, {
819
+ sid,
820
+ cwd: resolvedCwd,
821
+ reason: message,
822
+ fromProvider: providerType,
823
+ model,
824
+ runtimeMode,
825
+ });
826
+ }
680
827
  if (transcript?.appendPart) {
681
828
  transcript.appendPart({
682
829
  sessionId: sid,
683
830
  cwd: resolvedCwd,
684
831
  partType: 'error',
685
- data: { message: `Provider ${providerType} does not support tool calls` },
832
+ data: { message },
686
833
  });
687
834
  }
688
835
  return {
@@ -750,7 +897,7 @@ async function runAgentLoop(prompt, opts = {}) {
750
897
 
751
898
  const mw = opts.middleware || (() => {
752
899
  const m = new CodingMiddleware();
753
- registerBuiltinMiddleware(m, { cwd, provider: llm?.type, model: modelId, claudeMd: opts.claudeMd, mode: opts.mode, taskEnv: opts.env });
900
+ registerBuiltinMiddleware(m, { cwd, provider: llm?.type, model: modelId, claudeMd: opts.claudeMd, mode: opts.mode, taskEnv: opts.env, benchmark: opts.benchmark });
754
901
  return m;
755
902
  })();
756
903
  const events = opts.events || new CodingEvents();
@@ -810,8 +957,10 @@ async function runAgentLoop(prompt, opts = {}) {
810
957
  // ── Interactive Questions (B1) ──
811
958
  // Inspired by OpenCode Question service (packages/opencode/src/question/index.ts)
812
959
  const questionManager = opts.questionManager || new QuestionManager(events);
960
+ const compactionService = createCodingCompactionService(llm, modelId, opts);
813
961
 
814
962
  // projectInfo already detected above (before system prompt)
963
+ const llmCtxRef = { current: null }; // populated each turn (see llmCtx below)
815
964
 
816
965
  // Stream-native runtime: model deltas, tool states, snapshots, permissions,
817
966
  // and step boundaries are persisted as typed transcript parts while the loop
@@ -835,9 +984,15 @@ async function runAgentLoop(prompt, opts = {}) {
835
984
  if (call.name === 'list_directory' && input.directory && !path.isAbsolute(input.directory)) {
836
985
  input.directory = path.join(resolvedCwd, input.directory);
837
986
  }
987
+ if (call.name === 'run_shell' && !input.cwd) {
988
+ input.cwd = resolvedCwd;
989
+ }
838
990
  input.sessionId = sid;
839
991
  input.projectRoot = resolvedCwd;
840
- return toolRegistry.execute(call.name, input, { sessionId: sid, cwd: resolvedCwd, model: modelId, provider: llm.type });
992
+ const toolCtx = { sessionId: sid, cwd: resolvedCwd, model: modelId, provider: llm.type, runtimeMode: runtimeMode.id };
993
+ const finalInput = await mw.run('tool.before', toolCtx, call.name, input);
994
+ const result = await toolRegistry.execute(call.name, finalInput, toolCtx);
995
+ return mw.run('tool.after', toolCtx, call.name, finalInput, result);
841
996
  },
842
997
  });
843
998
  processor.on('event', (evt) => emitProgress({
@@ -851,6 +1006,7 @@ async function runAgentLoop(prompt, opts = {}) {
851
1006
  let streamStopReason = '';
852
1007
  let streamModel = modelId;
853
1008
  const streamErrors = [];
1009
+ let streamHadEdit = false;
854
1010
  for (let turnIndex = opts._resumeTurn || 0; turnIndex < turns; turnIndex++) {
855
1011
  const remaining = deadline - Date.now();
856
1012
  if (remaining <= 0) {
@@ -878,14 +1034,39 @@ async function runAgentLoop(prompt, opts = {}) {
878
1034
  runtimeMode: runtimeMode.id,
879
1035
  cwd: resolvedCwd,
880
1036
  });
1037
+ const llmCtx = { params: { maxTokens: taskFileHints.length >= 4 ? 8192 : 4096 }, system: systemPrompt, cwd: resolvedCwd,
1038
+ provider: llm.type, model: modelId, mode: opts.mode, runtimeMode: runtimeMode.id, claudeMd: opts.claudeMd, log: {},
1039
+ toolsAvailable: toolsForTurn.length > 0 };
1040
+ llmCtxRef.current = llmCtx;
1041
+ await mw.run('llm.before', llmCtx);
1042
+ await maybeCompactCodingContext({
1043
+ messages,
1044
+ compactionService,
1045
+ systemPrompt: llmCtx.system,
1046
+ sessionId: sid,
1047
+ cwd: resolvedCwd,
1048
+ transcript,
1049
+ events,
1050
+ emitProgress,
1051
+ mode: opts.mode || 'executing',
1052
+ step: turnIndex,
1053
+ sessionMemory: opts.sessionMemory,
1054
+ reason: 'stream_pre_turn',
1055
+ opts,
1056
+ });
881
1057
  turn = await processor.runTurn({
882
1058
  sessionId: sid,
883
1059
  cwd: resolvedCwd,
884
- system: systemPrompt,
1060
+ system: llmCtx.system,
885
1061
  messages,
886
1062
  tools: toolsForTurn,
887
1063
  maxTokens: taskFileHints.length >= 4 ? 8192 : 4096,
888
1064
  signal: ac.signal,
1065
+ maxTokens: llmCtx.params.maxTokens,
1066
+ temperature: llmCtx.params.temperature,
1067
+ thinking: llmCtx.params.thinking,
1068
+ reasoningEffort: llmCtx.params.reasoningEffort,
1069
+ options: llmCtx.params.options,
889
1070
  });
890
1071
  } finally {
891
1072
  clearTimeout(timer);
@@ -911,6 +1092,7 @@ async function runAgentLoop(prompt, opts = {}) {
911
1092
  content: turn.text,
912
1093
  stopReason: turn.stopReason,
913
1094
  });
1095
+ if (turn.hadEdit) streamHadEdit = true;
914
1096
 
915
1097
  if (turn.status === 'error') break;
916
1098
  if ((turn.toolCalls || []).length === 0) {
@@ -931,9 +1113,24 @@ async function runAgentLoop(prompt, opts = {}) {
931
1113
  }
932
1114
  if (turn.assistantMessage) messages.push(turn.assistantMessage);
933
1115
  if (turn.toolResultMessage) messages.push(turn.toolResultMessage);
1116
+ if (turn.verified && streamHadEdit) break;
934
1117
  if (turn.next !== 'continue') break;
935
1118
  }
936
1119
 
1120
+ if (streamStatus === 'error') {
1121
+ const errorText = streamErrors.join('\n');
1122
+ if (shouldAutoFallbackToCli({ opts, explicitProvider, requestedTools }) && isProviderFailureRecoverableByCli(errorText)) {
1123
+ return runCliFallback(prompt, opts, {
1124
+ sid,
1125
+ cwd: resolvedCwd,
1126
+ reason: errorText,
1127
+ fromProvider: llm.type || '',
1128
+ model,
1129
+ runtimeMode,
1130
+ });
1131
+ }
1132
+ }
1133
+
937
1134
  if (streamStatus === 'error' && transcript?.appendPart) {
938
1135
  transcript.appendPart({
939
1136
  sessionId: sid,
@@ -971,7 +1168,6 @@ async function runAgentLoop(prompt, opts = {}) {
971
1168
  // ── Bridge: event bus → middleware (A2) ──
972
1169
  // When the event bus fires, propagate to middleware's onEvent hook so
973
1170
  // registered middleware can react to file edits, reads, and context overflow.
974
- const llmCtxRef = { current: null }; // populated each turn (see llmCtx below)
975
1171
  const _bridgeHandlers = {};
976
1172
  for (const evtType of ['file.edited', 'file.read', 'context.overflow']) {
977
1173
  const handler = (data) => {
@@ -1073,70 +1269,33 @@ async function runAgentLoop(prompt, opts = {}) {
1073
1269
  const timer = setTimeout(() => ac.abort(), Math.min(remaining, perTurnCap));
1074
1270
 
1075
1271
  // Middleware: prepare LLM call
1272
+ const turnsRemaining = turns - turn;
1076
1273
  const llmCtx = { params: { maxTokens: taskFileHints.length >= 4 ? 8192 : 4096 }, system: systemPrompt, cwd: resolvedCwd,
1077
- provider: llm.type, model: modelId, mode: opts.mode, runtimeMode: runtimeMode.id, claudeMd: opts.claudeMd, log: {} };
1274
+ provider: llm.type, model: modelId, mode: opts.mode, runtimeMode: runtimeMode.id, claudeMd: opts.claudeMd, log: {},
1275
+ toolsAvailable: turnsRemaining > 1 };
1078
1276
  llmCtxRef.current = llmCtx; // expose to event bridge (A2)
1079
1277
  await mw.run('llm.before', llmCtx);
1080
1278
  let adaptedTools = await toolRegistry.getDefinitions(llmCtx);
1081
1279
 
1082
- // Context compaction (6b) -- prune old tool results when approaching context limit
1083
- const estimateTokens = (msgs) => {
1084
- let chars = 0;
1085
- for (const msg of msgs) {
1086
- if (typeof msg.content === 'string') chars += msg.content.length;
1087
- else if (Array.isArray(msg.content)) {
1088
- for (const part of msg.content) {
1089
- if (part.text) chars += part.text.length;
1090
- else if (part.content) chars += part.content.length;
1091
- }
1092
- }
1093
- }
1094
- return Math.ceil(chars * 0.25); // rough token estimate for English code
1095
- };
1096
-
1097
- const contextLimit = 200000; // conservative for most models
1098
- const reservedBuffer = 20000;
1099
- const totalTokens = estimateTokens(messages);
1100
- if (totalTokens >= contextLimit - reservedBuffer) {
1101
- events.emit('context.overflow', { tokens: totalTokens, sessionId: sid });
1102
- // Prune oldest tool results, keep last ~40K tokens worth
1103
- const protectChars = 160000; // ~40K tokens * 4 chars/token
1104
- let charsSeen = 0;
1105
- for (let m = messages.length - 1; m >= 1; m--) { // never prune first user msg
1106
- const msg = messages[m];
1107
- if (typeof msg.content === 'string') charsSeen += msg.content.length;
1108
- else if (Array.isArray(msg.content)) {
1109
- for (const part of msg.content) {
1110
- charsSeen += (part.text || part.content || '').length;
1111
- }
1112
- }
1113
- if (charsSeen >= protectChars) {
1114
- // Prune everything older than index m
1115
- for (let j = 1; j < m; j++) {
1116
- const old = messages[j];
1117
- if (Array.isArray(old.content)) {
1118
- old.content = old.content.map(part => {
1119
- if (part.type === 'tool_result' && part.content) {
1120
- const text = typeof part.content === 'string' ? part.content
1121
- : Array.isArray(part.content) ? part.content.map(c => c.text || '').join('')
1122
- : String(part.content);
1123
- if (text.length > 200) {
1124
- return { ...part, content: ansiSafeTruncate(text, 200) + '\n[compacted]' };
1125
- }
1126
- }
1127
- return part;
1128
- });
1129
- }
1130
- }
1131
- break;
1132
- }
1133
- }
1134
- }
1280
+ await maybeCompactCodingContext({
1281
+ messages,
1282
+ compactionService,
1283
+ systemPrompt: llmCtx.system,
1284
+ sessionId: sid,
1285
+ cwd: resolvedCwd,
1286
+ transcript,
1287
+ events,
1288
+ emitProgress,
1289
+ mode: opts.mode || 'executing',
1290
+ step: turn,
1291
+ sessionMemory: opts.sessionMemory,
1292
+ reason: 'legacy_pre_turn',
1293
+ opts,
1294
+ });
1135
1295
 
1136
1296
  // Graceful max-steps degradation (6n)
1137
1297
  // Note: warnings are appended to the LAST message's content (not as separate
1138
1298
  // user messages) to avoid consecutive user messages which the API rejects.
1139
- const turnsRemaining = turns - turn;
1140
1299
  if (turnsRemaining <= 1) {
1141
1300
  // Final turn: disable tools, force structured summary
1142
1301
  adaptedTools = [];
@@ -1519,6 +1678,17 @@ async function runAgentLoop(prompt, opts = {}) {
1519
1678
  if (questionManager) questionManager.clear();
1520
1679
  try { require('./tools/file-tracker').clearSession(sid); } catch {}
1521
1680
 
1681
+ if (shouldAutoFallbackToCli({ opts, explicitProvider, requestedTools }) && isProviderFailureRecoverableByCli(err.message)) {
1682
+ return runCliFallback(prompt, opts, {
1683
+ sid,
1684
+ cwd: resolvedCwd,
1685
+ reason: err.message,
1686
+ fromProvider: llm?.type || '',
1687
+ model,
1688
+ runtimeMode,
1689
+ });
1690
+ }
1691
+
1522
1692
  return {
1523
1693
  success: false,
1524
1694
  output: finalOutput,
@@ -5,27 +5,6 @@
5
5
  // Called at boot to log warnings; queried by /telemetry skill for removal candidates.
6
6
 
7
7
  const COMPAT_REGISTRY = {
8
- embedding_v1_table: {
9
- addedIn: '0.8.0',
10
- deprecatedIn: '0.12.0',
11
- removeAfter: '1.0.0',
12
- replacedBy: 'Per-model embedding_vec_<key> tables',
13
- telemetryKey: 'embedding_v1_migration',
14
- },
15
- legacy_knowledge_array: {
16
- addedIn: '0.5.0',
17
- deprecatedIn: '0.12.0',
18
- removeAfter: '1.0.0',
19
- replacedBy: '{ knowledge: [...], classifications: [...] } format',
20
- telemetryKey: 'legacy_knowledge_format',
21
- },
22
- legacy_quorum_consensus: {
23
- addedIn: '0.7.0',
24
- deprecatedIn: '0.12.0',
25
- removeAfter: '1.0.0',
26
- replacedBy: 'Evaluator-scored quorum (workerResponse flow)',
27
- telemetryKey: 'legacy_quorum_consensus',
28
- },
29
8
  chat_json_mode: {
30
9
  addedIn: '0.5.0',
31
10
  deprecatedIn: null,
@@ -33,13 +12,6 @@ const COMPAT_REGISTRY = {
33
12
  replacedBy: '?stream=1 SSE mode',
34
13
  telemetryKey: 'chat_json_mode',
35
14
  },
36
- old_env_gemini_key: {
37
- addedIn: '0.6.0',
38
- deprecatedIn: '0.12.0',
39
- removeAfter: '1.0.0',
40
- replacedBy: 'GOOGLE_API_KEY environment variable',
41
- telemetryKey: 'old_env_gemini_key',
42
- },
43
15
  devbox_gateway: {
44
16
  addedIn: '0.4.0',
45
17
  deprecatedIn: null,
@@ -283,6 +283,7 @@ Relevant memories and knowledge are provided above. If they answer the question,
283
283
  ### Step 2: SEARCH — only if the context above is insufficient
284
284
  Call search_memories to find additional evidence. Batch multiple searches in ONE turn.
285
285
  Use different query angles: English keywords, Chinese terms, source filters.
286
+ For private, remembered, or work-context questions, use Wall-E memory before public web_fetch. This includes prior conversations, decisions, preferences, people, teams, projects, tools, Slack/email/calendar context, and "last time" / "do you know" / "what did we discuss" prompts. Use public web only for public/current facts or after memory misses.
286
287
 
287
288
  ### Step 3: THINK — reason through the evidence
288
289
  Use the **think** tool before responding to:
@@ -308,7 +309,8 @@ function buildToolRefBlock(ownerName, intent) {
308
309
  const lines = ['### Tools'];
309
310
  if (intent === 'knowledge') {
310
311
  lines.push(`- **think**: Internal scratchpad (${ownerName} won't see). Use BEFORE every substantive response.`);
311
- lines.push('- **search_memories**: Hybrid search (BM25 + vector). source:"slack" for Slack only. Batch multiple searches in one turn.');
312
+ lines.push('- **search_memories**: Hybrid search (BM25 + vector). Use for private/user/work memory: prior conversations, decisions, preferences, projects, people, and Slack/email/calendar context. source:"slack" for Slack only. Batch multiple searches in one turn.');
313
+ lines.push('- **lookup_person**: Person profile lookup. Use alongside search_memories for colleague/role/team questions.');
312
314
  lines.push('- **remember_fact**: Store facts the user teaches you.');
313
315
  }
314
316
  lines.push('- **run_skill / mcp_call / list_mcp_tools**: Actions and external services.');
@@ -101,7 +101,7 @@ function getEmbeddingModel() {
101
101
  function _hasApiKey(provider) {
102
102
  // Check process.env first, then fall back to model_providers table in brain DB
103
103
  switch (provider) {
104
- case 'google': return !!(process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY || _hasProviderKey('google'));
104
+ case 'google': return !!(process.env.GOOGLE_API_KEY || _hasProviderKey('google'));
105
105
  case 'voyage': return !!process.env.VOYAGE_API_KEY;
106
106
  case 'openai': return !!(process.env.OPENAI_API_KEY || _hasProviderKey('openai'));
107
107
  case 'ollama': return _isOllamaAvailable();
@@ -278,11 +278,8 @@ async function _googleEmbed(texts, config) {
278
278
  */
279
279
  function _resolveGoogleCredential() {
280
280
  // 1. Static API key from env
281
- const envKey = process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY;
281
+ const envKey = process.env.GOOGLE_API_KEY;
282
282
  if (envKey && !envKey.startsWith('ya29.')) {
283
- if (!process.env.GOOGLE_API_KEY && process.env.GEMINI_API_KEY) {
284
- try { require('./compat').recordCompatUsage('old_env_gemini_key'); } catch {}
285
- }
286
283
  return { type: 'api_key', token: envKey, expired: false };
287
284
  }
288
285
 
@@ -475,8 +472,6 @@ function _migrateOldTable(db, config) {
475
472
  const oldExists = db.prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='embedding_vec'").get();
476
473
  if (!oldExists) return;
477
474
 
478
- try { require('./compat').recordCompatUsage('embedding_v1_migration'); } catch {}
479
-
480
475
  const oldCount = db.prepare('SELECT count(*) as c FROM embedding_map WHERE model = ?').get(config.name)?.c || 0;
481
476
  if (oldCount === 0) {
482
477
  // Old table has data but model column might be from a different model — just drop
@@ -100,10 +100,13 @@ async function runAgentBenchmark(benchmark, options = {}) {
100
100
  }
101
101
 
102
102
  // Run the agent loop with hard timeout safety net
103
- const effectiveTimeout = timeoutMs || (expectations.maxTurns || 20) * 30000;
103
+ const maxTurns = expectations.maxTurns || 20;
104
+ const turnBudgetTimeout = maxTurns * 30000;
105
+ const effectiveTimeout = Math.min(timeoutMs || turnBudgetTimeout, turnBudgetTimeout);
104
106
  const agentPromise = runAgentLoop(benchmark.prompt, {
105
107
  cwd: sandboxDir,
106
108
  timeoutMs: effectiveTimeout,
109
+ maxTurns,
107
110
  provider,
108
111
  model,
109
112
  mode: 'build',
@@ -111,6 +114,7 @@ async function runAgentBenchmark(benchmark, options = {}) {
111
114
  headless: true,
112
115
  headlessPolicy: 'allow',
113
116
  permissionTimeoutMs: 0,
117
+ persistTranscript: false,
114
118
  });
115
119
  let timeoutHandle;
116
120
  const timeoutPromise = new Promise((_, reject) => {
@@ -131,7 +135,9 @@ async function runAgentBenchmark(benchmark, options = {}) {
131
135
  const actualToolCalls = extractToolCalls(result);
132
136
  const toolCallDetails = extractToolCallDetails(result);
133
137
  const actualFileChanges = await getModifiedFiles(sandboxDir);
134
- const actualTurns = (result.log || []).length || actualToolCalls.length;
138
+ const externalRunnerId = result.runnerId || result.fallback?.runnerId || null;
139
+ const externalRunnerWork = Boolean(externalRunnerId && actualFileChanges.length > 0);
140
+ const actualTurns = (result.log || []).length || actualToolCalls.length || (externalRunnerId ? 1 : 0);
135
141
 
136
142
  // Run test command if specified (validate against allowlist)
137
143
  let testsPassed = null;
@@ -156,6 +162,10 @@ async function runAgentBenchmark(benchmark, options = {}) {
156
162
  const inputTokens = usage.inputTokens ?? usage.input ?? 0;
157
163
  const expectedFileChanges = expectations.expectedFileChanges || [];
158
164
  const missingExpectedWork = expectedFileChanges.length > 0 && actualFileChanges.length === 0;
165
+ const attemptedFileChange = actualToolCalls.some((call) => {
166
+ const name = typeof call === 'string' ? call : call?.name;
167
+ return /edit|write|patch|create|delete|modify/i.test(String(name || ''));
168
+ });
159
169
  const testRegression = (expectations.testCommand && testsPassed === false);
160
170
  const rawError = result.stderr || result.error || null;
161
171
  const validatedByTests = Boolean(
@@ -164,9 +174,11 @@ async function runAgentBenchmark(benchmark, options = {}) {
164
174
  actualFileChanges.length > 0
165
175
  );
166
176
  const fatalError = rawError && !validatedByTests ? rawError : null;
167
- const noEffort = (actualToolCalls.length === 0) || (inputTokens === 0) || missingExpectedWork;
177
+ const noEffort = (actualToolCalls.length === 0 && !externalRunnerWork) ||
178
+ (inputTokens === 0 && !externalRunnerWork) ||
179
+ missingExpectedWork;
168
180
  const hadError = !!fatalError;
169
- const validatedSuccess = Boolean(result.success || validatedByTests) && !hadError && !noEffort && !testRegression;
181
+ const validatedSuccess = Boolean(result.success || validatedByTests || externalRunnerWork) && !hadError && !noEffort && !testRegression;
170
182
 
171
183
  // Score the result
172
184
  let score = scoreAgentResult(benchmark, {
@@ -199,7 +211,7 @@ async function runAgentBenchmark(benchmark, options = {}) {
199
211
  : testRegression
200
212
  ? 'tests_failed'
201
213
  : missingExpectedWork
202
- ? 'no_file_changes'
214
+ ? attemptedFileChange ? 'missing_expected_changes' : 'no_file_changes'
203
215
  : 'no_effort' },
204
216
  };
205
217
  }
@@ -296,6 +308,10 @@ function scoreAgentResult(benchmark, actual) {
296
308
  });
297
309
  }
298
310
 
311
+ function isTrustedAgentResult(result = {}) {
312
+ return result.success === true && !result.error && result.testsPassed === true;
313
+ }
314
+
299
315
  /**
300
316
  * Run a multi-turn benchmark — sends each turn's prompt sequentially,
301
317
  * accumulating conversation context. Scores after the final turn.
@@ -333,6 +349,7 @@ async function runMultiTurnBenchmark(benchmark, options = {}) {
333
349
  headless: true,
334
350
  headlessPolicy: 'allow',
335
351
  permissionTimeoutMs: 0,
352
+ persistTranscript: false,
336
353
  messages, // pass accumulated conversation
337
354
  });
338
355
 
@@ -353,7 +370,9 @@ async function runMultiTurnBenchmark(benchmark, options = {}) {
353
370
  const costDollars = estimateCost(totalUsage, provider?.type || provider || 'anthropic', model);
354
371
 
355
372
  const actualFileChanges = await getModifiedFiles(sandboxDir);
356
- const actualTurns = totalTurns;
373
+ const externalRunnerId = lastResult?.runnerId || lastResult?.fallback?.runnerId || null;
374
+ const externalRunnerWork = Boolean(externalRunnerId && actualFileChanges.length > 0);
375
+ const actualTurns = totalTurns || (externalRunnerId ? 1 : 0);
357
376
 
358
377
  let testsPassed = null;
359
378
  let testsAfter = null;
@@ -390,7 +409,8 @@ async function runMultiTurnBenchmark(benchmark, options = {}) {
390
409
  // Same hard-zero floor as single-turn — see runAgentBenchmark for rationale.
391
410
  const inputTokens = totalUsage.inputTokens ?? 0;
392
411
  const hadError = !!(lastResult?.stderr || lastResult?.error);
393
- const noEffort = (allToolCalls.length === 0) || (inputTokens === 0);
412
+ const noEffort = (allToolCalls.length === 0 && !externalRunnerWork) ||
413
+ (inputTokens === 0 && !externalRunnerWork);
394
414
  const testRegression = (expectations.testCommand && testsPassed === false);
395
415
  if (hadError || noEffort || testRegression) {
396
416
  score = {
@@ -507,7 +527,7 @@ async function runAgentBenchmarkSuite(options = {}) {
507
527
  outputTokens: result.outputTokens ?? null,
508
528
  scorerVersion: DEFAULT_SCORER_VERSION,
509
529
  scoringMethod,
510
- trusted: !result.error && result.testsPassed === true,
530
+ trusted: isTrustedAgentResult(result),
511
531
  runConfig: { timeoutMs, scoringMethod },
512
532
  }, {
513
533
  suite: 'coding-agent',
@@ -517,7 +537,7 @@ async function runAgentBenchmarkSuite(options = {}) {
517
537
  model: resolveModelName(model),
518
538
  scoringMethod,
519
539
  scorerVersion: DEFAULT_SCORER_VERSION,
520
- trusted: !result.error && result.testsPassed === true,
540
+ trusted: isTrustedAgentResult(result),
521
541
  runConfig: { timeoutMs, scoringMethod },
522
542
  }));
523
543
  } catch { /* non-fatal */ }
@@ -666,6 +686,7 @@ module.exports = {
666
686
  runMultiTurnBenchmark,
667
687
  runAgentBenchmarkSuite,
668
688
  scoreAgentResult,
689
+ isTrustedAgentResult,
669
690
  extractToolCalls,
670
691
  extractToolCallDetails,
671
692
  countTests,