npm - create-walle - Versions diffs - 0.9.13 → 0.9.15 - Mend

create-walle 0.9.13 → 0.9.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (98) hide show

package/README.md +8 -3
package/bin/create-walle.js +232 -32
package/bin/mcp-inject.js +18 -53
package/package.json +3 -1
package/template/claude-task-manager/api-prompts.js +11 -2
package/template/claude-task-manager/approval-agent.js +7 -0
package/template/claude-task-manager/db.js +94 -75
package/template/claude-task-manager/docs/session-standup-command-center-design.md +242 -0
package/template/claude-task-manager/docs/session-tooltip-freshness-design.md +224 -0
package/template/claude-task-manager/docs/session-ux-issue-review-2026-05-01.md +369 -0
package/template/claude-task-manager/fuzzy-utils.js +10 -2
package/template/claude-task-manager/git-utils.js +140 -10
package/template/claude-task-manager/lib/agent-capabilities.js +1 -1
package/template/claude-task-manager/lib/agent-presets.js +38 -5
package/template/claude-task-manager/lib/codex-terminal-final.js +53 -0
package/template/claude-task-manager/lib/ctm-session-context-api.js +222 -0
package/template/claude-task-manager/lib/session-diagnostics.js +56 -0
package/template/claude-task-manager/lib/session-history.js +309 -16
package/template/claude-task-manager/lib/session-standup.js +409 -0
package/template/claude-task-manager/lib/session-stream.js +253 -20
package/template/claude-task-manager/lib/standup-attention.js +200 -0
package/template/claude-task-manager/lib/status-hooks.js +8 -2
package/template/claude-task-manager/lib/update-telemetry.js +114 -0
package/template/claude-task-manager/lib/walle-ctm-history.js +49 -6
package/template/claude-task-manager/lib/walle-default-model.js +55 -0
package/template/claude-task-manager/lib/walle-mcp-auto-config.js +66 -0
package/template/claude-task-manager/lib/walle-supervisor.js +86 -19
package/template/claude-task-manager/lib/walle-transcript.js +1 -3
package/template/claude-task-manager/lib/worktree-cwd.js +82 -0
package/template/claude-task-manager/package.json +1 -0
package/template/claude-task-manager/providers/codex-mcp.js +104 -0
package/template/claude-task-manager/providers/index.js +2 -0
package/template/claude-task-manager/public/css/setup.css +2 -1
package/template/claude-task-manager/public/css/walle.css +71 -0
package/template/claude-task-manager/public/index.html +2388 -429
package/template/claude-task-manager/public/js/message-renderer.js +314 -35
package/template/claude-task-manager/public/js/session-search-utils.js +185 -3
package/template/claude-task-manager/public/js/session-status-precedence.js +125 -0
package/template/claude-task-manager/public/js/setup.js +62 -19
package/template/claude-task-manager/public/js/stream-view.js +396 -55
package/template/claude-task-manager/public/js/terminal-restore-state.js +57 -0
package/template/claude-task-manager/public/js/walle-session.js +234 -26
package/template/claude-task-manager/public/js/walle.js +143 -2
package/template/claude-task-manager/server.js +1402 -433
package/template/claude-task-manager/session-integrity.js +77 -28
package/template/claude-task-manager/workers/approval-widget-validator.js +15 -5
package/template/claude-task-manager/workers/scrollback-worker.js +5 -6
package/template/claude-task-manager/workers/state-detectors/codex.js +6 -0
package/template/package.json +1 -1
package/template/wall-e/agent-runners/claude-code.js +2 -0
package/template/wall-e/agent.js +63 -8
package/template/wall-e/api-walle.js +330 -52
package/template/wall-e/brain.js +291 -42
package/template/wall-e/chat.js +172 -15
package/template/wall-e/coding/compaction-service.js +19 -5
package/template/wall-e/coding/stream-processor.js +22 -2
package/template/wall-e/coding/workspace-replay.js +1 -4
package/template/wall-e/coding-orchestrator.js +250 -80
package/template/wall-e/compat.js +0 -28
package/template/wall-e/context/context-builder.js +3 -1
package/template/wall-e/embeddings.js +2 -7
package/template/wall-e/eval/agent-runner.js +30 -9
package/template/wall-e/eval/benchmark-generator.js +21 -1
package/template/wall-e/eval/benchmarks/chat-eval.json +66 -6
package/template/wall-e/eval/benchmarks/coding-agent.json +0 -596
package/template/wall-e/eval/cc-replay.js +1 -0
package/template/wall-e/eval/codex-cli-baseline.js +633 -0
package/template/wall-e/eval/debug-agent003.js +1 -0
package/template/wall-e/eval/eval-orchestrator.js +3 -3
package/template/wall-e/eval/run-agent-benchmarks.js +11 -3
package/template/wall-e/eval/run-codex-cli-baseline.js +177 -0
package/template/wall-e/eval/run-model-comparison.js +1 -0
package/template/wall-e/eval/swebench-adapter.js +1 -0
package/template/wall-e/evaluation/quorum-evaluator.js +0 -1
package/template/wall-e/extraction/knowledge-extractor.js +1 -2
package/template/wall-e/lib/mcp-integration.js +336 -0
package/template/wall-e/llm/ollama.js +47 -8
package/template/wall-e/llm/ollama.plugin.json +1 -1
package/template/wall-e/llm/tool-adapter.js +1 -0
package/template/wall-e/loops/ingest.js +42 -8
package/template/wall-e/loops/initiative.js +87 -2
package/template/wall-e/mcp-server.js +872 -19
package/template/wall-e/memory/ctm-context-client.js +230 -0
package/template/wall-e/memory/ctm-session-context.js +1376 -0
package/template/wall-e/prompts/coding/memory-protocol.md +6 -0
package/template/wall-e/server.js +30 -1
package/template/wall-e/skills/_bundled/memory-search/SKILL.md +8 -0
package/template/wall-e/skills/_bundled/scan-ctm-sessions/SKILL.md +20 -0
package/template/wall-e/skills/_bundled/scan-ctm-sessions/run.js +43 -0
package/template/wall-e/skills/_bundled/slack-mentions/run.js +471 -188
package/template/wall-e/skills/skill-planner.js +86 -4
package/template/wall-e/slack/socket-mode-listener.js +276 -0
package/template/wall-e/telemetry.js +70 -2
package/template/wall-e/tools/builtin-middleware.js +55 -2
package/template/wall-e/tools/shell-policy.js +1 -1
package/template/wall-e/tools/slack-owner.js +104 -0
package/template/website/index.html +4 -4
package/template/builder-journal.md +0 -17

package/template/wall-e/coding-orchestrator.js CHANGED Viewed

@@ -54,32 +54,17 @@ const {
   shouldUseStreamProcessor,
 } = require('./coding/runtime-mode');
 const { createCodingTranscript } = require('./coding/transcript-writer');
+const {
+  CompactionService,
+  DEFAULT_CONTEXT_WINDOW,
+} = require('./coding/compaction-service');
+const { estimateTokens, estimateMessagesTokens } = require('./context/token-counter');
 const MAX_CUMULATIVE_CONTEXT = 4000;
 const MAX_DIFF_SIZE = 50 * 1024; // 50KB
 const MAX_AGENT_TURNS = 50;
 const CHECKPOINT_INTERVAL = 5;
-// ANSI-safe truncation: avoid cutting inside CSI escape sequences.
-// Inspired by cmux SessionPersistence.swift scrollback truncation.
-function ansiSafeTruncate(text, maxLen) {
-  if (text.length <= maxLen) return text;
-  let end = maxLen;
-  // If we're inside an ANSI escape sequence (ESC[...m), advance to its end
-  // Look back up to 20 chars for an unclosed ESC[
-  for (let i = end; i > Math.max(0, end - 20); i--) {
-    if (text[i] === '\x1b' || (text[i] === '\x1B')) {
-      // Found ESC — check if the sequence closes before our cut point
-      const closeIdx = text.indexOf('m', i);
-      if (closeIdx > end && closeIdx < end + 20) {
-        end = closeIdx + 1; // include the closing 'm'
-      }
-      break;
-    }
-  }
-  return text.slice(0, end);
-}
 // Coding-focused tool definitions (subset of local-tools)
 const CODING_TOOLS = [
   {
@@ -493,6 +478,156 @@ function providerSupportsToolCalls(provider) {
   return true;
 }
+function positiveNumber(value) { // Coerces `value` with Number(); returns it only when finite and > 0, otherwise null.
+  const n = Number(value);
+  return Number.isFinite(n) && n > 0 ? n : null; // NaN, +/-Infinity, 0 and negatives all map to null
+}
+function resolveCodingContextWindow(provider, opts = {}) { // Resolves the context-window size: the first positive candidate wins.
+  const candidates = [ // Precedence order: per-call `opts` overrides first, then fields read off `provider`.
+    opts.compactionContextWindow,
+    opts.contextWindow,
+    opts.maxContextTokens,
+    opts.modelContextWindow,
+    provider?.maxContextTokens, // optional chaining: `provider` may be null/undefined
+    provider?.max_context_tokens,
+    provider?.contextWindow,
+    provider?.context_window,
+    provider?.metadata?.maxContextTokens,
+    provider?.metadata?.max_context_tokens,
+    provider?.metadata?.contextWindow,
+    provider?.modelInfo?.maxContextTokens,
+    provider?.modelInfo?.max_context_tokens,
+  ];
+  for (const candidate of candidates) {
+    const n = positiveNumber(candidate); // null for missing, non-numeric, or non-positive values
+    if (n) return n;
+  }
+  return DEFAULT_CONTEXT_WINDOW; // fallback imported from ./coding/compaction-service when no candidate is usable
+}
+function createCodingCompactionService(provider, modelId, opts = {}) { // Returns a compaction service for the run, or null when auto-compaction is disabled.
+  if (opts.autoCompact === false || opts.compaction === false || opts.disableCompaction === true) return null; // any of three opt-out flags disables compaction
+  if (String(process.env.WALLE_CODING_AUTO_COMPACT || '').trim() === '0') return null; // env kill switch: value '0' (whitespace-trimmed)
+  if (opts.compactionService) return opts.compactionService; // caller-injected instance; only reached when not disabled above
+  return new CompactionService({
+    provider,
+    model: modelId,
+    contextWindow: resolveCodingContextWindow(provider, opts),
+    threshold: opts.compactionThreshold, // passed through as-is (may be undefined)
+    tailTokenBudget: opts.compactionTailTokenBudget,
+    keepRecentUserTurns: opts.compactionKeepRecentUserTurns,
+  });
+}
+async function maybeCompactCodingContext({ // Compacts `messages` in place when the service says so; resolves to the compaction result, or null when skipped.
+  messages,
+  compactionService,
+  systemPrompt = '',
+  sessionId,
+  cwd,
+  transcript,
+  events,
+  emitProgress,
+  mode,
+  step = -1,
+  sessionMemory,
+  reason = 'context_threshold',
+  opts = {},
+} = {}) {
+  if (!compactionService || !Array.isArray(messages) || messages.length < 2) return null; // skip: no service, or fewer than two messages
+  const systemTokens = estimateTokens(systemPrompt || '');
+  const estimatedInputTokens = systemTokens + estimateMessagesTokens(messages); // only reported in `detail`; the decision is made by shouldCompact()
+  if (!compactionService.shouldCompact({ messages, systemTokens })) return null;
+  emitProgress?.({ // optional callback: announce that compaction is starting
+    phase: mode || 'executing',
+    step,
+    message: 'Compacting coding context...',
+  });
+  const result = await compactionService.compact(messages, {
+    sessionId,
+    cwd,
+    reason,
+    transcript,
+    sessionMemory,
+    tailMode: opts.compactionTailMode || 'continue',
+    tailTokenBudget: opts.compactionTailTokenBudget,
+    keepRecentUserTurns: opts.compactionKeepRecentUserTurns,
+    continuePrompt: opts.compactionContinuePrompt,
+  });
+  if (!result?.compacted || !Array.isArray(result.messages)) return result; // nothing was compacted: leave `messages` untouched, emit no events
+  messages.splice(0, messages.length, ...result.messages); // replace contents in place so the caller's array reference stays valid
+  const detail = {
+    compactionId: result.metadata?.compactionId || '',
+    reason,
+    estimatedInputTokens,
+    tokensBefore: result.tokensBeforeCompaction,
+    tokensAfter: result.tokensAfterCompaction,
+    compactedMessages: result.metadata?.compacted_message_count || 0,
+    retainedMessages: result.metadata?.retained_message_count || 0,
+    tailMode: result.metadata?.tail_mode || '',
+  };
+  events?.emit?.('context.overflow', { tokens: result.tokensBeforeCompaction, sessionId }); // note: emitted after compaction, carrying the pre-compaction token count
+  events?.emit?.('context.compacted', { sessionId, ...detail });
+  emitProgress?.({
+    phase: mode || 'executing',
+    step,
+    message: `Context compacted (${detail.compactedMessages} messages summarized)`,
+    detail,
+  });
+  return result;
+}
+function shouldAutoFallbackToCli({ opts = {}, explicitProvider = false, requestedTools = [] } = {}) { // Decides whether a failed provider run may be retried through the CLI runner.
+  if (opts._cliFallbackAttempt) return false; // flag marks an existing fallback attempt; presumably guards against looping — confirm with the code that sets it
+  if (opts.allowCliFallback === false) return false; // explicit caller opt-out
+  if (process.env.WALLE_CODING_AUTO_CLI_FALLBACK === '0') return false; // env kill switch; exact match, value is not trimmed
+  if (explicitProvider && opts.allowCliFallback !== true) return false; // a caller-chosen provider needs an explicit opt-in
+  if (Array.isArray(requestedTools) && requestedTools.length === 0) return false; // empty tool list: no fallback
+  return true;
+}
+function isProviderFailureRecoverableByCli(message) { // True when the error text matches a known auth, quota, tool-support, or missing-provider failure.
+  const text = String(message || ''); // null/undefined/empty become '' and therefore never match
+  return /oauth_proxy_error|OAuth token not found|Invalid bearer token|authentication_error|API key not valid|exceeded your current quota|does not support tool calls|No LLM provider configured/i.test(text); // case-insensitive substring match
+}
+async function runCliFallback(prompt, opts = {}, { sid, cwd, reason, fromProvider, model, runtimeMode } = {}) { // Re-runs `prompt` via runHeadless and tags the result with fallback metadata.
+  const runnerId = opts.agentRunner || opts.agent_runner || 'claude-code'; // accepts both option spellings; defaults to 'claude-code'
+  if (opts.onProgress) {
+    opts.onProgress({ // tell the caller that a fallback is starting and why
+      type: 'cli_fallback',
+      phase: opts.mode || 'executing',
+      step: -1,
+      message: `Falling back to ${runnerId}`,
+      detail: { reason, fromProvider }, // `reason` is passed untruncated here
+    });
+  }
+  const result = await runHeadless(prompt, {
+    cwd,
+    sessionId: sid,
+    timeoutMs: opts.timeoutMs,
+    budgetUsd: opts.budgetUsd,
+    runnerId,
+    model,
+    mode: opts.mode || 'build', // note: this default differs from the 'executing' phase default used above
+  });
+  return {
+    ...result, // keys listed below override same-named keys from the runner result
+    provider: result.provider || result.providerType || fromProvider,
+    model: result.model || model,
+    runtimeMode: runtimeMode?.id || runtimeMode, // accepts either an object with `id` or a plain value
+    fallback: {
+      runnerId,
+      fromProvider: fromProvider || null,
+      reason: String(reason || '').slice(0, 500), // capped at 500 characters
+    },
+  };
+}
 /**
  * Writes state object to JSON file.
  */
@@ -611,6 +746,7 @@ function saveCheckpointToBrain(sid, turn, messages, opts, totalInput, totalOutpu
  */
 async function runAgentLoop(prompt, opts = {}) {
   const { cwd, timeoutMs, maxTurns, provider, model, tools, onProgress } = opts;
+  const explicitProvider = !!provider;
   const sid = opts._resumeSessionId || crypto.randomUUID();
   // Persist activity start (Phase 2: Activity History)
@@ -677,12 +813,23 @@ async function runAgentLoop(prompt, opts = {}) {
   }
   if (requestedTools.length > 0 && !providerSupportsToolCalls(llm)) {
     const providerType = llm.type || 'unknown';
+    const message = `Provider ${providerType} does not support tool calls`;
+    if (shouldAutoFallbackToCli({ opts, explicitProvider, requestedTools }) && isProviderFailureRecoverableByCli(message)) {
+      return runCliFallback(prompt, opts, {
+        sid,
+        cwd: resolvedCwd,
+        reason: message,
+        fromProvider: providerType,
+        model,
+        runtimeMode,
+      });
+    }
     if (transcript?.appendPart) {
       transcript.appendPart({
         sessionId: sid,
         cwd: resolvedCwd,
         partType: 'error',
-        data: { message: `Provider ${providerType} does not support tool calls` },
+        data: { message },
       });
     }
     return {
@@ -750,7 +897,7 @@ async function runAgentLoop(prompt, opts = {}) {
   const mw = opts.middleware || (() => {
     const m = new CodingMiddleware();
-    registerBuiltinMiddleware(m, { cwd, provider: llm?.type, model: modelId, claudeMd: opts.claudeMd, mode: opts.mode, taskEnv: opts.env });
+    registerBuiltinMiddleware(m, { cwd, provider: llm?.type, model: modelId, claudeMd: opts.claudeMd, mode: opts.mode, taskEnv: opts.env, benchmark: opts.benchmark });
     return m;
   })();
   const events = opts.events || new CodingEvents();
@@ -810,8 +957,10 @@ async function runAgentLoop(prompt, opts = {}) {
   // ── Interactive Questions (B1) ──
   // Inspired by OpenCode Question service (packages/opencode/src/question/index.ts)
   const questionManager = opts.questionManager || new QuestionManager(events);
+  const compactionService = createCodingCompactionService(llm, modelId, opts);
   // projectInfo already detected above (before system prompt)
+  const llmCtxRef = { current: null }; // populated each turn (see llmCtx below)
   // Stream-native runtime: model deltas, tool states, snapshots, permissions,
   // and step boundaries are persisted as typed transcript parts while the loop
@@ -835,9 +984,15 @@ async function runAgentLoop(prompt, opts = {}) {
         if (call.name === 'list_directory' && input.directory && !path.isAbsolute(input.directory)) {
           input.directory = path.join(resolvedCwd, input.directory);
         }
+        if (call.name === 'run_shell' && !input.cwd) {
+          input.cwd = resolvedCwd;
+        }
         input.sessionId = sid;
         input.projectRoot = resolvedCwd;
-        return toolRegistry.execute(call.name, input, { sessionId: sid, cwd: resolvedCwd, model: modelId, provider: llm.type });
+        const toolCtx = { sessionId: sid, cwd: resolvedCwd, model: modelId, provider: llm.type, runtimeMode: runtimeMode.id };
+        const finalInput = await mw.run('tool.before', toolCtx, call.name, input);
+        const result = await toolRegistry.execute(call.name, finalInput, toolCtx);
+        return mw.run('tool.after', toolCtx, call.name, finalInput, result);
       },
     });
     processor.on('event', (evt) => emitProgress({
@@ -851,6 +1006,7 @@ async function runAgentLoop(prompt, opts = {}) {
     let streamStopReason = '';
     let streamModel = modelId;
     const streamErrors = [];
+    let streamHadEdit = false;
     for (let turnIndex = opts._resumeTurn || 0; turnIndex < turns; turnIndex++) {
       const remaining = deadline - Date.now();
       if (remaining <= 0) {
@@ -878,14 +1034,39 @@ async function runAgentLoop(prompt, opts = {}) {
             runtimeMode: runtimeMode.id,
             cwd: resolvedCwd,
           });
+        const llmCtx = { params: { maxTokens: taskFileHints.length >= 4 ? 8192 : 4096 }, system: systemPrompt, cwd: resolvedCwd,
+          provider: llm.type, model: modelId, mode: opts.mode, runtimeMode: runtimeMode.id, claudeMd: opts.claudeMd, log: {},
+          toolsAvailable: toolsForTurn.length > 0 };
+        llmCtxRef.current = llmCtx;
+        await mw.run('llm.before', llmCtx);
+        await maybeCompactCodingContext({
+          messages,
+          compactionService,
+          systemPrompt: llmCtx.system,
+          sessionId: sid,
+          cwd: resolvedCwd,
+          transcript,
+          events,
+          emitProgress,
+          mode: opts.mode || 'executing',
+          step: turnIndex,
+          sessionMemory: opts.sessionMemory,
+          reason: 'stream_pre_turn',
+          opts,
+        });
         turn = await processor.runTurn({
           sessionId: sid,
           cwd: resolvedCwd,
-          system: systemPrompt,
+          system: llmCtx.system,
           messages,
           tools: toolsForTurn,
           maxTokens: taskFileHints.length >= 4 ? 8192 : 4096,
           signal: ac.signal,
+          maxTokens: llmCtx.params.maxTokens,
+          temperature: llmCtx.params.temperature,
+          thinking: llmCtx.params.thinking,
+          reasoningEffort: llmCtx.params.reasoningEffort,
+          options: llmCtx.params.options,
         });
       } finally {
         clearTimeout(timer);
@@ -911,6 +1092,7 @@ async function runAgentLoop(prompt, opts = {}) {
         content: turn.text,
         stopReason: turn.stopReason,
       });
+      if (turn.hadEdit) streamHadEdit = true;
       if (turn.status === 'error') break;
       if ((turn.toolCalls || []).length === 0) {
@@ -931,9 +1113,24 @@ async function runAgentLoop(prompt, opts = {}) {
       }
       if (turn.assistantMessage) messages.push(turn.assistantMessage);
       if (turn.toolResultMessage) messages.push(turn.toolResultMessage);
+      if (turn.verified && streamHadEdit) break;
       if (turn.next !== 'continue') break;
     }
+    if (streamStatus === 'error') {
+      const errorText = streamErrors.join('\n');
+      if (shouldAutoFallbackToCli({ opts, explicitProvider, requestedTools }) && isProviderFailureRecoverableByCli(errorText)) {
+        return runCliFallback(prompt, opts, {
+          sid,
+          cwd: resolvedCwd,
+          reason: errorText,
+          fromProvider: llm.type || '',
+          model,
+          runtimeMode,
+        });
+      }
+    }
     if (streamStatus === 'error' && transcript?.appendPart) {
       transcript.appendPart({
         sessionId: sid,
@@ -971,7 +1168,6 @@ async function runAgentLoop(prompt, opts = {}) {
   // ── Bridge: event bus → middleware (A2) ──
   // When the event bus fires, propagate to middleware's onEvent hook so
   // registered middleware can react to file edits, reads, and context overflow.
-  const llmCtxRef = { current: null }; // populated each turn (see llmCtx below)
   const _bridgeHandlers = {};
   for (const evtType of ['file.edited', 'file.read', 'context.overflow']) {
     const handler = (data) => {
@@ -1073,70 +1269,33 @@ async function runAgentLoop(prompt, opts = {}) {
       const timer = setTimeout(() => ac.abort(), Math.min(remaining, perTurnCap));
       // Middleware: prepare LLM call
+      const turnsRemaining = turns - turn;
       const llmCtx = { params: { maxTokens: taskFileHints.length >= 4 ? 8192 : 4096 }, system: systemPrompt, cwd: resolvedCwd,
-        provider: llm.type, model: modelId, mode: opts.mode, runtimeMode: runtimeMode.id, claudeMd: opts.claudeMd, log: {} };
+        provider: llm.type, model: modelId, mode: opts.mode, runtimeMode: runtimeMode.id, claudeMd: opts.claudeMd, log: {},
+        toolsAvailable: turnsRemaining > 1 };
       llmCtxRef.current = llmCtx; // expose to event bridge (A2)
       await mw.run('llm.before', llmCtx);
       let adaptedTools = await toolRegistry.getDefinitions(llmCtx);
-      // Context compaction (6b) -- prune old tool results when approaching context limit
-      const estimateTokens = (msgs) => {
-        let chars = 0;
-        for (const msg of msgs) {
-          if (typeof msg.content === 'string') chars += msg.content.length;
-          else if (Array.isArray(msg.content)) {
-            for (const part of msg.content) {
-              if (part.text) chars += part.text.length;
-              else if (part.content) chars += part.content.length;
-            }
-          }
-        }
-        return Math.ceil(chars * 0.25); // rough token estimate for English code
-      };
-      const contextLimit = 200000; // conservative for most models
-      const reservedBuffer = 20000;
-      const totalTokens = estimateTokens(messages);
-      if (totalTokens >= contextLimit - reservedBuffer) {
-        events.emit('context.overflow', { tokens: totalTokens, sessionId: sid });
-        // Prune oldest tool results, keep last ~40K tokens worth
-        const protectChars = 160000; // ~40K tokens * 4 chars/token
-        let charsSeen = 0;
-        for (let m = messages.length - 1; m >= 1; m--) { // never prune first user msg
-          const msg = messages[m];
-          if (typeof msg.content === 'string') charsSeen += msg.content.length;
-          else if (Array.isArray(msg.content)) {
-            for (const part of msg.content) {
-              charsSeen += (part.text || part.content || '').length;
-            }
-          }
-          if (charsSeen >= protectChars) {
-            // Prune everything older than index m
-            for (let j = 1; j < m; j++) {
-              const old = messages[j];
-              if (Array.isArray(old.content)) {
-                old.content = old.content.map(part => {
-                  if (part.type === 'tool_result' && part.content) {
-                    const text = typeof part.content === 'string' ? part.content
-                      : Array.isArray(part.content) ? part.content.map(c => c.text || '').join('')
-                      : String(part.content);
-                    if (text.length > 200) {
-                      return { ...part, content: ansiSafeTruncate(text, 200) + '\n[compacted]' };
-                    }
-                  }
-                  return part;
-                });
-              }
-            }
-            break;
-          }
-        }
-      }
+      await maybeCompactCodingContext({
+        messages,
+        compactionService,
+        systemPrompt: llmCtx.system,
+        sessionId: sid,
+        cwd: resolvedCwd,
+        transcript,
+        events,
+        emitProgress,
+        mode: opts.mode || 'executing',
+        step: turn,
+        sessionMemory: opts.sessionMemory,
+        reason: 'legacy_pre_turn',
+        opts,
+      });
       // Graceful max-steps degradation (6n)
       // Note: warnings are appended to the LAST message's content (not as separate
       // user messages) to avoid consecutive user messages which the API rejects.
-      const turnsRemaining = turns - turn;
       if (turnsRemaining <= 1) {
         // Final turn: disable tools, force structured summary
         adaptedTools = [];
@@ -1519,6 +1678,17 @@ async function runAgentLoop(prompt, opts = {}) {
     if (questionManager) questionManager.clear();
     try { require('./tools/file-tracker').clearSession(sid); } catch {}
+    if (shouldAutoFallbackToCli({ opts, explicitProvider, requestedTools }) && isProviderFailureRecoverableByCli(err.message)) {
+      return runCliFallback(prompt, opts, {
+        sid,
+        cwd: resolvedCwd,
+        reason: err.message,
+        fromProvider: llm?.type || '',
+        model,
+        runtimeMode,
+      });
+    }
     return {
       success: false,
       output: finalOutput,

package/template/wall-e/compat.js CHANGED Viewed

@@ -5,27 +5,6 @@
 // Called at boot to log warnings; queried by /telemetry skill for removal candidates.
 const COMPAT_REGISTRY = {
-  embedding_v1_table: {
-    addedIn: '0.8.0',
-    deprecatedIn: '0.12.0',
-    removeAfter: '1.0.0',
-    replacedBy: 'Per-model embedding_vec_<key> tables',
-    telemetryKey: 'embedding_v1_migration',
-  },
-  legacy_knowledge_array: {
-    addedIn: '0.5.0',
-    deprecatedIn: '0.12.0',
-    removeAfter: '1.0.0',
-    replacedBy: '{ knowledge: [...], classifications: [...] } format',
-    telemetryKey: 'legacy_knowledge_format',
-  },
-  legacy_quorum_consensus: {
-    addedIn: '0.7.0',
-    deprecatedIn: '0.12.0',
-    removeAfter: '1.0.0',
-    replacedBy: 'Evaluator-scored quorum (workerResponse flow)',
-    telemetryKey: 'legacy_quorum_consensus',
-  },
   chat_json_mode: {
     addedIn: '0.5.0',
     deprecatedIn: null,
@@ -33,13 +12,6 @@ const COMPAT_REGISTRY = {
     replacedBy: '?stream=1 SSE mode',
     telemetryKey: 'chat_json_mode',
   },
-  old_env_gemini_key: {
-    addedIn: '0.6.0',
-    deprecatedIn: '0.12.0',
-    removeAfter: '1.0.0',
-    replacedBy: 'GOOGLE_API_KEY environment variable',
-    telemetryKey: 'old_env_gemini_key',
-  },
   devbox_gateway: {
     addedIn: '0.4.0',
     deprecatedIn: null,

package/template/wall-e/context/context-builder.js CHANGED Viewed

@@ -283,6 +283,7 @@ Relevant memories and knowledge are provided above. If they answer the question,
 ### Step 2: SEARCH — only if the context above is insufficient
 Call search_memories to find additional evidence. Batch multiple searches in ONE turn.
 Use different query angles: English keywords, Chinese terms, source filters.
+For private, remembered, or work-context questions, use Wall-E memory before public web_fetch. This includes prior conversations, decisions, preferences, people, teams, projects, tools, Slack/email/calendar context, and "last time" / "do you know" / "what did we discuss" prompts. Use public web only for public/current facts or after memory misses.
 ### Step 3: THINK — reason through the evidence
 Use the **think** tool before responding to:
@@ -308,7 +309,8 @@ function buildToolRefBlock(ownerName, intent) {
   const lines = ['### Tools'];
   if (intent === 'knowledge') {
     lines.push(`- **think**: Internal scratchpad (${ownerName} won't see). Use BEFORE every substantive response.`);
-    lines.push('- **search_memories**: Hybrid search (BM25 + vector). source:"slack" for Slack only. Batch multiple searches in one turn.');
+    lines.push('- **search_memories**: Hybrid search (BM25 + vector). Use for private/user/work memory: prior conversations, decisions, preferences, projects, people, and Slack/email/calendar context. source:"slack" for Slack only. Batch multiple searches in one turn.');
+    lines.push('- **lookup_person**: Person profile lookup. Use alongside search_memories for colleague/role/team questions.');
     lines.push('- **remember_fact**: Store facts the user teaches you.');
   }
   lines.push('- **run_skill / mcp_call / list_mcp_tools**: Actions and external services.');

package/template/wall-e/embeddings.js CHANGED Viewed

@@ -101,7 +101,7 @@ function getEmbeddingModel() {
 function _hasApiKey(provider) {
   // Check process.env first, then fall back to model_providers table in brain DB
   switch (provider) {
-    case 'google': return !!(process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY || _hasProviderKey('google'));
+    case 'google': return !!(process.env.GOOGLE_API_KEY || _hasProviderKey('google'));
     case 'voyage': return !!process.env.VOYAGE_API_KEY;
     case 'openai': return !!(process.env.OPENAI_API_KEY || _hasProviderKey('openai'));
     case 'ollama': return _isOllamaAvailable();
@@ -278,11 +278,8 @@ async function _googleEmbed(texts, config) {
  */
 function _resolveGoogleCredential() {
   // 1. Static API key from env
-  const envKey = process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY;
+  const envKey = process.env.GOOGLE_API_KEY;
   if (envKey && !envKey.startsWith('ya29.')) {
-    if (!process.env.GOOGLE_API_KEY && process.env.GEMINI_API_KEY) {
-      try { require('./compat').recordCompatUsage('old_env_gemini_key'); } catch {}
-    }
     return { type: 'api_key', token: envKey, expired: false };
   }
@@ -475,8 +472,6 @@ function _migrateOldTable(db, config) {
   const oldExists = db.prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='embedding_vec'").get();
   if (!oldExists) return;
-  try { require('./compat').recordCompatUsage('embedding_v1_migration'); } catch {}
   const oldCount = db.prepare('SELECT count(*) as c FROM embedding_map WHERE model = ?').get(config.name)?.c || 0;
   if (oldCount === 0) {
     // Old table has data but model column might be from a different model — just drop

package/template/wall-e/eval/agent-runner.js CHANGED Viewed

@@ -100,10 +100,13 @@ async function runAgentBenchmark(benchmark, options = {}) {
     }
     // Run the agent loop with hard timeout safety net
-    const effectiveTimeout = timeoutMs || (expectations.maxTurns || 20) * 30000;
+    const maxTurns = expectations.maxTurns || 20;
+    const turnBudgetTimeout = maxTurns * 30000;
+    const effectiveTimeout = Math.min(timeoutMs || turnBudgetTimeout, turnBudgetTimeout);
     const agentPromise = runAgentLoop(benchmark.prompt, {
       cwd: sandboxDir,
       timeoutMs: effectiveTimeout,
+      maxTurns,
       provider,
       model,
       mode: 'build',
@@ -111,6 +114,7 @@ async function runAgentBenchmark(benchmark, options = {}) {
       headless: true,
       headlessPolicy: 'allow',
       permissionTimeoutMs: 0,
+      persistTranscript: false,
     });
     let timeoutHandle;
     const timeoutPromise = new Promise((_, reject) => {
@@ -131,7 +135,9 @@ async function runAgentBenchmark(benchmark, options = {}) {
     const actualToolCalls = extractToolCalls(result);
     const toolCallDetails = extractToolCallDetails(result);
     const actualFileChanges = await getModifiedFiles(sandboxDir);
-    const actualTurns = (result.log || []).length || actualToolCalls.length;
+    const externalRunnerId = result.runnerId || result.fallback?.runnerId || null;
+    const externalRunnerWork = Boolean(externalRunnerId && actualFileChanges.length > 0);
+    const actualTurns = (result.log || []).length || actualToolCalls.length || (externalRunnerId ? 1 : 0);
     // Run test command if specified (validate against allowlist)
     let testsPassed = null;
@@ -156,6 +162,10 @@ async function runAgentBenchmark(benchmark, options = {}) {
     const inputTokens = usage.inputTokens ?? usage.input ?? 0;
     const expectedFileChanges = expectations.expectedFileChanges || [];
     const missingExpectedWork = expectedFileChanges.length > 0 && actualFileChanges.length === 0;
+    const attemptedFileChange = actualToolCalls.some((call) => {
+      const name = typeof call === 'string' ? call : call?.name;
+      return /edit|write|patch|create|delete|modify/i.test(String(name || ''));
+    });
     const testRegression = (expectations.testCommand && testsPassed === false);
     const rawError = result.stderr || result.error || null;
     const validatedByTests = Boolean(
@@ -164,9 +174,11 @@ async function runAgentBenchmark(benchmark, options = {}) {
       actualFileChanges.length > 0
     );
     const fatalError = rawError && !validatedByTests ? rawError : null;
-    const noEffort = (actualToolCalls.length === 0) || (inputTokens === 0) || missingExpectedWork;
+    const noEffort = (actualToolCalls.length === 0 && !externalRunnerWork) ||
+      (inputTokens === 0 && !externalRunnerWork) ||
+      missingExpectedWork;
     const hadError = !!fatalError;
-    const validatedSuccess = Boolean(result.success || validatedByTests) && !hadError && !noEffort && !testRegression;
+    const validatedSuccess = Boolean(result.success || validatedByTests || externalRunnerWork) && !hadError && !noEffort && !testRegression;
     // Score the result
     let score = scoreAgentResult(benchmark, {
@@ -199,7 +211,7 @@ async function runAgentBenchmark(benchmark, options = {}) {
             : testRegression
               ? 'tests_failed'
               : missingExpectedWork
-                ? 'no_file_changes'
+                ? attemptedFileChange ? 'missing_expected_changes' : 'no_file_changes'
                 : 'no_effort' },
       };
     }
@@ -296,6 +308,10 @@ function scoreAgentResult(benchmark, actual) {
   });
 }
+/**
+ * Decide whether a benchmark result should be flagged as trusted.
+ *
+ * All three conditions must hold:
+ *   - `result.success` is strictly `true` (a merely truthy value is rejected),
+ *   - `result.error` is falsy,
+ *   - `result.testsPassed` is strictly `true` (`null`, i.e. no test command
+ *     was run, does not count as passing).
+ *
+ * @param {object} [result={}] - Benchmark result; only `success`, `error`
+ *   and `testsPassed` are read. The default applies when the argument is
+ *   omitted or `undefined`, in which case the function returns `false`.
+ * @returns {boolean} `true` only when every condition above is met.
+ */
+function isTrustedAgentResult(result = {}) {
+  return result.success === true && !result.error && result.testsPassed === true;
+}
 /**
  * Run a multi-turn benchmark — sends each turn's prompt sequentially,
  * accumulating conversation context. Scores after the final turn.
@@ -333,6 +349,7 @@ async function runMultiTurnBenchmark(benchmark, options = {}) {
         headless: true,
         headlessPolicy: 'allow',
         permissionTimeoutMs: 0,
+        persistTranscript: false,
         messages, // pass accumulated conversation
       });
@@ -353,7 +370,9 @@ async function runMultiTurnBenchmark(benchmark, options = {}) {
     const costDollars = estimateCost(totalUsage, provider?.type || provider || 'anthropic', model);
     const actualFileChanges = await getModifiedFiles(sandboxDir);
-    const actualTurns = totalTurns;
+    const externalRunnerId = lastResult?.runnerId || lastResult?.fallback?.runnerId || null;
+    const externalRunnerWork = Boolean(externalRunnerId && actualFileChanges.length > 0);
+    const actualTurns = totalTurns || (externalRunnerId ? 1 : 0);
     let testsPassed = null;
     let testsAfter = null;
@@ -390,7 +409,8 @@ async function runMultiTurnBenchmark(benchmark, options = {}) {
     // Same hard-zero floor as single-turn — see runAgentBenchmark for rationale.
     const inputTokens = totalUsage.inputTokens ?? 0;
     const hadError = !!(lastResult?.stderr || lastResult?.error);
-    const noEffort = (allToolCalls.length === 0) || (inputTokens === 0);
+    const noEffort = (allToolCalls.length === 0 && !externalRunnerWork) ||
+      (inputTokens === 0 && !externalRunnerWork);
     const testRegression = (expectations.testCommand && testsPassed === false);
     if (hadError || noEffort || testRegression) {
       score = {
@@ -507,7 +527,7 @@ async function runAgentBenchmarkSuite(options = {}) {
           outputTokens: result.outputTokens ?? null,
           scorerVersion: DEFAULT_SCORER_VERSION,
           scoringMethod,
-          trusted: !result.error && result.testsPassed === true,
+          trusted: isTrustedAgentResult(result),
           runConfig: { timeoutMs, scoringMethod },
         }, {
           suite: 'coding-agent',
@@ -517,7 +537,7 @@ async function runAgentBenchmarkSuite(options = {}) {
           model: resolveModelName(model),
           scoringMethod,
           scorerVersion: DEFAULT_SCORER_VERSION,
-          trusted: !result.error && result.testsPassed === true,
+          trusted: isTrustedAgentResult(result),
           runConfig: { timeoutMs, scoringMethod },
         }));
       } catch { /* non-fatal */ }
@@ -666,6 +686,7 @@ module.exports = {
   runMultiTurnBenchmark,
   runAgentBenchmarkSuite,
   scoreAgentResult,
+  isTrustedAgentResult,
   extractToolCalls,
   extractToolCallDetails,
   countTests,