npm - create-walle - Versions diffs - 0.9.13 → 0.9.14 - Mend

create-walle 0.9.13 → 0.9.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58)

package/README.md +6 -1
package/bin/create-walle.js +195 -30
package/bin/mcp-inject.js +18 -53
package/package.json +3 -1
package/template/claude-task-manager/approval-agent.js +7 -0
package/template/claude-task-manager/docs/session-standup-command-center-design.md +242 -0
package/template/claude-task-manager/git-utils.js +111 -3
package/template/claude-task-manager/lib/session-history.js +144 -16
package/template/claude-task-manager/lib/session-standup.js +409 -0
package/template/claude-task-manager/lib/standup-attention.js +200 -0
package/template/claude-task-manager/lib/status-hooks.js +8 -2
package/template/claude-task-manager/lib/update-telemetry.js +114 -0
package/template/claude-task-manager/lib/walle-default-model.js +55 -0
package/template/claude-task-manager/lib/walle-mcp-auto-config.js +62 -0
package/template/claude-task-manager/lib/walle-supervisor.js +83 -19
package/template/claude-task-manager/lib/worktree-cwd.js +82 -0
package/template/claude-task-manager/providers/codex-mcp.js +104 -0
package/template/claude-task-manager/providers/index.js +2 -0
package/template/claude-task-manager/public/css/setup.css +2 -1
package/template/claude-task-manager/public/css/walle.css +5 -0
package/template/claude-task-manager/public/index.html +1596 -283
package/template/claude-task-manager/public/js/session-search-utils.js +171 -1
package/template/claude-task-manager/public/js/setup.js +62 -19
package/template/claude-task-manager/public/js/stream-view.js +55 -6
package/template/claude-task-manager/public/js/walle-session.js +73 -16
package/template/claude-task-manager/public/js/walle.js +34 -2
package/template/claude-task-manager/server.js +780 -177
package/template/claude-task-manager/session-integrity.js +58 -15
package/template/claude-task-manager/workers/approval-widget-validator.js +15 -5
package/template/claude-task-manager/workers/state-detectors/codex.js +6 -0
package/template/package.json +1 -1
package/template/wall-e/agent.js +36 -7
package/template/wall-e/api-walle.js +72 -20
package/template/wall-e/coding/stream-processor.js +22 -2
package/template/wall-e/coding-orchestrator.js +26 -6
package/template/wall-e/eval/agent-runner.js +16 -4
package/template/wall-e/eval/benchmark-generator.js +21 -1
package/template/wall-e/eval/benchmarks/coding-agent.json +0 -596
package/template/wall-e/eval/codex-cli-baseline.js +633 -0
package/template/wall-e/eval/eval-orchestrator.js +3 -3
package/template/wall-e/eval/run-agent-benchmarks.js +11 -3
package/template/wall-e/eval/run-codex-cli-baseline.js +177 -0
package/template/wall-e/lib/mcp-integration.js +220 -0
package/template/wall-e/llm/ollama.js +47 -8
package/template/wall-e/llm/ollama.plugin.json +1 -1
package/template/wall-e/llm/tool-adapter.js +1 -0
package/template/wall-e/loops/ingest.js +42 -8
package/template/wall-e/mcp-server.js +272 -10
package/template/wall-e/memory/ctm-session-context.js +910 -0
package/template/wall-e/server.js +26 -1
package/template/wall-e/skills/_bundled/scan-ctm-sessions/SKILL.md +20 -0
package/template/wall-e/skills/_bundled/scan-ctm-sessions/run.js +43 -0
package/template/wall-e/skills/skill-planner.js +52 -3
package/template/wall-e/tools/builtin-middleware.js +55 -2
package/template/wall-e/tools/shell-policy.js +1 -1
package/template/wall-e/tools/slack-owner.js +104 -0
package/template/website/index.html +2 -2
package/template/builder-journal.md +0 -17

package/template/claude-task-manager/session-integrity.js CHANGED Viewed

@@ -15,11 +15,60 @@
 const fs = require('fs');
 const path = require('path');
 const claudeDesktopSessions = require('./lib/claude-desktop-sessions');
+const { codexRolloutIdFromPath, findCodexSessionFiles } = require('./lib/session-history');
 const CLAUDE_PROJECTS_DIR = path.join(process.env.HOME, '.claude', 'projects');
 // --- Detection ---
+function sessionFileIdFromPath(filePath) {
+  const virtual = claudeDesktopSessions.parseVirtualSessionPath(filePath);
+  if (virtual) return virtual.sessionId;
+  const codexId = codexRolloutIdFromPath(filePath);
+  if (codexId) return codexId;
+  const base = path.basename(filePath).replace(/\.jsonl(\.bak)?$/, '');
+  const uuid = base.match(/([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})$/i);
+  return uuid ? uuid[1].toLowerCase() : base;
+}
+function fileEntryFromPath(filePath, expectedFileId, projectEntry = '') {
+  if (!filePath) return null;
+  const actualFileId = sessionFileIdFromPath(filePath);
+  if (expectedFileId && actualFileId !== expectedFileId) return null;
+  try {
+    const sourcePath = claudeDesktopSessions.sourcePathForStat(filePath);
+    const stat = fs.statSync(sourcePath);
+    if (!stat.isFile()) return null;
+    return { filePath, stat, projectEntry };
+  } catch {
+    return null;
+  }
+}
+function addFileIndexEntry(fileIndex, filePath, projectEntry) {
+  const file = fileEntryFromPath(filePath, null, projectEntry);
+  if (file) fileIndex[sessionFileIdFromPath(filePath)] = file;
+}
+function resolveDbSessionFile(row, expectedFileId, fileIndex) {
+  if (!expectedFileId) return null;
+  if (fileIndex[expectedFileId]) return fileIndex[expectedFileId];
+  const stored = fileEntryFromPath(row?.jsonl_path, expectedFileId);
+  if (stored) return stored;
+  if ((row?.provider === 'codex') || String(row?.jsonl_path || '').includes(`${path.sep}.codex${path.sep}sessions${path.sep}`)) {
+    try {
+      for (const filePath of findCodexSessionFiles(expectedFileId)) {
+        const file = fileEntryFromPath(filePath, expectedFileId);
+        if (file) return file;
+      }
+    } catch {}
+  }
+  return null;
+}
 function dbTimestampFromIso(value) {
   if (!value) return '';
   const ms = new Date(value).getTime();
@@ -138,7 +187,7 @@ function detectMismatches(db, getAllSessionFiles) {
     } catch {}
     const slugCol = hasSlugColumn ? 'a.slug' : "'' AS slug";
     allSessions = db.prepare(`
-      SELECT c.id, c.title, c.user_renamed, c.starred, c.project_path, c.cwd,
+      SELECT c.id, c.provider, c.title, c.user_renamed, c.starred, c.project_path, c.cwd,
         c.created_at, c.updated_at,
         a.agent_session_id, a.jsonl_path, a.file_size, a.first_message,
         a.modified_at, a.hostname, a.model, a.git_branch, a.user_msg_count,
@@ -156,12 +205,7 @@ function detectMismatches(db, getAllSessionFiles) {
   const fileIndex = {}; // uuid -> { filePath, stat, projectEntry }
   try {
     for (const { filePath, projectEntry } of getAllSessionFiles()) {
-      const virtual = claudeDesktopSessions.parseVirtualSessionPath(filePath);
-      const uuid = virtual ? virtual.sessionId : path.basename(filePath).replace(/\.jsonl(\.bak)?$/, '');
-      try {
-        const stat = fs.statSync(claudeDesktopSessions.sourcePathForStat(filePath));
-        fileIndex[uuid] = { filePath, stat, projectEntry };
-      } catch {}
+      addFileIndexEntry(fileIndex, filePath, projectEntry);
     }
   } catch (e) {
     issues.push({ type: 'scan_error', severity: 'warning', sessionId: null,
@@ -183,7 +227,7 @@ function detectMismatches(db, getAllSessionFiles) {
     // Skip DB-only rows with no file expectation (legacy tabs with no agent_session_id)
     const expectedFileId = (agentId && agentId !== sid) ? agentId : sid;
-    const file = fileIndex[expectedFileId];
+    const file = resolveDbSessionFile(row, expectedFileId, fileIndex);
     // Check 1: Missing file
     if (!file && row.file_size > 0) {
@@ -193,6 +237,7 @@ function detectMismatches(db, getAllSessionFiles) {
           expected_file_id: expectedFileId,
           db_file_size: row.file_size,
           db_jsonl_path: row.jsonl_path || '',
+          db_provider: row.provider || '',
           db_title: row.title || '',
         },
         suggestion: 'File may have been deleted or moved. Check .jsonl.bak variant.',
@@ -215,6 +260,7 @@ function detectMismatches(db, getAllSessionFiles) {
             db_file_size: row.file_size,
             actual_file_size: file.stat.size,
             size_diff: sizeDiff,
+            db_jsonl_path: row.jsonl_path || '',
           },
           suggestion: 'DB metadata is stale — will be refreshed on next session list load.',
         });
@@ -439,12 +485,7 @@ function recoverMismatches(db, issues, getAllSessionFiles) {
   const fileIndex = {};
   try {
     for (const { filePath, projectEntry } of getAllSessionFiles()) {
-      const virtual = claudeDesktopSessions.parseVirtualSessionPath(filePath);
-      const uuid = virtual ? virtual.sessionId : path.basename(filePath).replace(/\.jsonl(\.bak)?$/, '');
-      try {
-        const stat = fs.statSync(claudeDesktopSessions.sourcePathForStat(filePath));
-        fileIndex[uuid] = { filePath, stat, projectEntry };
-      } catch {}
+      addFileIndexEntry(fileIndex, filePath, projectEntry);
     }
   } catch {}
@@ -505,7 +546,9 @@ function recoverMismatches(db, issues, getAllSessionFiles) {
       case 'stale_metadata': {
         // Refresh metadata from actual file
         const fileId = issue.details.file_id;
-        const file = fileIndex[fileId];
+        const file = fileIndex[fileId]
+          || fileEntryFromPath(issue.details?.db_jsonl_path, fileId)
+          || resolveDbSessionFile({ provider: 'codex', jsonl_path: issue.details?.db_jsonl_path || '' }, fileId, fileIndex);
         if (!file) { result.skipped++; break; }
         try {
           db.prepare('UPDATE agent_sessions SET file_size = ?, modified_at = ?, updated_at = datetime(\'now\') WHERE ctm_session_id = ?')

package/template/claude-task-manager/workers/approval-widget-validator.js CHANGED Viewed

@@ -28,12 +28,12 @@ const ABOVE_ANCHOR_DEPTH = 40;
 // Claude Code: "Esc to cancel". Codex: "Press enter to confirm or esc to cancel".
 const ANCHOR_RE = /Esc to cancel|esc to cancel|Press enter to confirm/;
-// Yes-option pattern. Accepts an optional selection-arrow prefix in any of
+// Approval-option pattern. Accepts an optional selection-arrow prefix in any of
 // the forms different CLIs use: ❯ (Claude Code), ›/▶/▸ (Cursor/others), or
 // plain ASCII > (Codex). Without this, Codex's "> 1. Yes, proceed (y)" would
 // be skipped over and the validator would lock onto option 2 ("2. Yes, ...")
 // — which is unstyled in Codex's renderer and trips no-widget-formatting.
-const YES_RE = /^\s*(?:[❯›▶▸>]\s*)?\d+\.\s*Yes\b/i;
+const YES_RE = /^\s*(?:[❯›▶▸>]\s*)?\d+\.\s*(?:Yes|Allow)\b/i;
 /**
  * Check if the terminal is currently displaying an active approval widget.
@@ -142,9 +142,10 @@ function _hasWidgetFormatting(buf, yesRow, totalRows) {
   const yesText = yesLine.translateToString(true);
   if (/[❯›▶▸]/.test(yesText)) return true;
-  // Check for "❯" marker anywhere in bottom 3 rows
-  const arrowScanStart = Math.max(yesRow - 1, totalRows - 5);
-  for (let row = arrowScanStart; row < totalRows; row++) {
+  // Check for a selection marker near the approval options. Codex MCP forms can
+  // select option 2 ("Allow for this session"), while option 1 is the first
+  // approval-shaped line used for anchoring.
+  for (let row = Math.max(0, yesRow - 1); row < Math.min(totalRows, yesRow + 8); row++) {
     const line = buf.getLine(buf.viewportY + row);
     if (!line) continue;
     const text = line.translateToString(true);
@@ -152,6 +153,15 @@ function _hasWidgetFormatting(buf, yesRow, totalRows) {
     if (/[❯›▶▸]/.test(text)) return true;
   }
+  // Check for "❯" marker anywhere in bottom 5 rows for prompts whose option
+  // block is pushed down by long wrapped content.
+  for (let row = Math.max(0, totalRows - 5); row < totalRows; row++) {
+    const line = buf.getLine(buf.viewportY + row);
+    if (!line) continue;
+    const text = line.translateToString(true);
+    if (/[❯›▶▸]/.test(text)) return true;
+  }
   // Check for ANSI foreground color on the Yes-option line.
   // xterm's BufferLine.getCell(x) returns an IBufferCell with .getFgColor()
   // (0 = default). Any non-default fg color = styled = widget.

package/template/claude-task-manager/workers/state-detectors/codex.js CHANGED Viewed

@@ -47,6 +47,12 @@ function isCodexStatusRedraw(data) {
 module.exports = {
   ...baseDetector,
   id: 'codex',
+  // Codex's ratatui status frames arrive in bursts. A short Claude-style
+  // debounce lets the sidebar bounce between Running and Waiting/Idle while the
+  // terminal still says "Working". Keep the busy state stable long enough for
+  // multiple server heartbeats to confirm or renew it, while explicit
+  // approval/choice prompts still bypass this elsewhere.
+  idleDebounceMs: 15000,
   isActiveChunk(data) {
     if (!baseDetector.isActiveChunk(data)) return false;

package/template/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "walle",
-  "version": "0.9.13",
+  "version": "0.9.14",
   "private": true,
   "description": "Wall-E — your personal digital twin",
   "scripts": {

package/template/wall-e/agent.js CHANGED Viewed

@@ -109,7 +109,8 @@ function bootstrapSkills() {
     description: 'Scan Claude Code session files for new conversations',
     trigger_type: 'interval',
     trigger_config: JSON.stringify({ interval_ms: 60000 }),
-    prompt_template: 'Scan the Claude Code session directory at ~/.claude/projects/ for any new or updated .jsonl session files. Read the most recently modified files and extract user messages and assistant responses as observations.',
+    prompt_template: 'INTERNAL_SKILL:scan-ctm-sessions',
+    execution: 'script',
   });
   brain.insertSkill({
@@ -140,16 +141,43 @@ function bootstrapSkills() {
 function syncBundledSkills() {
   const filesystemSkills = loadAllSkills();
   const dbSkills = brain.listSkills({});
-  const dbNames = new Set(dbSkills.map(s => s.name));
+  const dbByName = new Map(dbSkills.map(s => [s.name, s]));
+  const dbNames = new Set(dbByName.keys());
   let added = 0;
+  let updated = 0;
   for (const skill of filesystemSkills) {
-    if (dbNames.has(skill.name)) continue;
     const triggerType = (skill.trigger && skill.trigger.type) || skill.execution || 'manual';
     const triggerConfig = skill.trigger && skill.trigger.interval_ms
       ? JSON.stringify({ interval_ms: skill.trigger.interval_ms })
       : null;
+    const promptTemplate = skill.execution === 'script'
+      ? `INTERNAL_SKILL:${skill.name}`
+      : skill.instructions || '';
+    if (dbNames.has(skill.name)) {
+      const existing = dbByName.get(skill.name);
+      // Upgrade legacy prompt-based CTM scanning to the deterministic script
+      // path. Session continuity must not depend on live LLM/network access.
+      if (skill.name === 'scan-ctm-sessions' && skill.execution === 'script' && existing) {
+        const updates = {};
+        if (existing.execution !== 'script') updates.execution = 'script';
+        if (existing.prompt_template !== promptTemplate) updates.prompt_template = promptTemplate;
+        if (existing.trigger_type !== triggerType) updates.trigger_type = triggerType;
+        if (triggerConfig && existing.trigger_config !== triggerConfig) updates.trigger_config = triggerConfig;
+        if (existing.auto_disabled_at) {
+          updates.enabled = 1;
+          updates.auto_disabled_at = null;
+        }
+        if (existing.auto_disabled_reason) updates.auto_disabled_reason = null;
+        if (Object.keys(updates).length > 0) {
+          brain.updateSkill(existing.id, updates);
+          updated++;
+          console.log(`[wall-e] Updated bundled skill: ${skill.name}`);
+        }
+      }
+      continue;
+    }
     brain.insertSkill({
       name: skill.name,
@@ -159,9 +187,7 @@ function syncBundledSkills() {
       // Persist the legacy `INTERNAL_SKILL:` marker so downgrades can still
       // dispatch script skills via the prompt_template fallback. The schema
       // column `execution` is the authoritative source going forward.
-      prompt_template: skill.execution === 'script'
-        ? `INTERNAL_SKILL:${skill.name}`
-        : skill.instructions || '',
+      prompt_template: promptTemplate,
       execution: skill.execution === 'script' ? 'script' : 'agent',
     });
     added++;
@@ -171,6 +197,9 @@ function syncBundledSkills() {
   if (added > 0) {
     console.log(`[wall-e] Synced ${added} new bundled skill(s) to DB`);
   }
+  if (updated > 0) {
+    console.log(`[wall-e] Updated ${updated} bundled skill(s) in DB`);
+  }
 }
 function bootstrapTasks() {

package/template/wall-e/api-walle.js CHANGED Viewed

@@ -364,25 +364,78 @@ function handleWalleApi(req, res, url) {
   if (p === '/api/wall-e/slack/status' && m === 'GET') {
     try {
       const slackMcp = require('./tools/slack-mcp');
+      const { getSlackOwnerRepairState } = require('./tools/slack-owner');
       const token = slackMcp.loadToken();
-      jsonResponse(res, { data: { authenticated: !!token?.access_token, team: token?.team_name, user: token?.user_id, obtained_at: token?.obtained_at } });
+      const owner = getSlackOwnerRepairState();
+      jsonResponse(res, {
+        data: {
+          authenticated: !!token?.access_token,
+          team: token?.team_name,
+          user: token?.user_id,
+          obtained_at: token?.obtained_at,
+          owner_configured: owner.configured,
+          owner_can_repair: owner.canRepair,
+        },
+      });
     } catch (e) {
       jsonResponse(res, { data: { authenticated: false } });
     }
     return true;
   }
+  // POST /api/wall-e/slack/repair-owner — derive Slack owner id from OAuth token
+  if (p === '/api/wall-e/slack/repair-owner' && m === 'POST') {
+    try {
+      const { repairSlackOwnerIdentity } = require('./tools/slack-owner');
+      const { clearServiceAlerts } = require('./skills/skill-planner');
+      const result = repairSlackOwnerIdentity({ persist: true });
+      if (!result.ok) {
+        return jsonResponse(res, {
+          ok: false,
+          error: result.error || 'Could not repair Slack owner identity',
+          needsSlackAuth: !!result.needsSlackAuth,
+        }, result.needsSlackAuth ? 409 : 500), true;
+      }
+      clearServiceAlerts('slack');
+      return jsonResponse(res, {
+        ok: true,
+        user_id_configured: true,
+        source: result.source,
+        persisted: !!result.persisted,
+        already_configured: !!result.alreadyConfigured,
+      }), true;
+    } catch (e) {
+      return jsonResponse(res, { ok: false, error: e.message }, 500), true;
+    }
+  }
   // POST /api/wall-e/slack/auth — start OAuth flow (opens browser)
   if (p === '/api/wall-e/slack/auth' && m === 'POST') {
     try {
       const slackMcp = require('./tools/slack-mcp');
       // If already authenticated, return immediately
       if (slackMcp.isAuthenticatedSync()) {
+        try {
+          const { repairSlackOwnerIdentity } = require('./tools/slack-owner');
+          const { clearServiceAlerts } = require('./skills/skill-planner');
+          const repaired = repairSlackOwnerIdentity({ persist: true });
+          if (repaired.ok) clearServiceAlerts('slack');
+        } catch (repairErr) {
+          console.warn('[wall-e] Slack owner repair skipped:', repairErr.message);
+        }
         jsonResponse(res, { ok: true, already: true });
         return true;
       }
       // Start OAuth — opens browser, temp server on port 3118 handles callback
       slackMcp.authenticate().then(() => {
+        try {
+          const { repairSlackOwnerIdentity } = require('./tools/slack-owner');
+          const { clearServiceAlerts } = require('./skills/skill-planner');
+          const repaired = repairSlackOwnerIdentity({ persist: true });
+          if (repaired.ok) clearServiceAlerts('slack');
+        } catch (repairErr) {
+          console.error('[wall-e] Slack owner repair failed:', repairErr.message);
+        }
         console.log('[wall-e] Slack OAuth completed');
       }).catch(err => {
         console.error('[wall-e] Slack OAuth failed:', err.message);
@@ -714,24 +767,9 @@ function handleWalleApi(req, res, url) {
   // GET /api/wall-e/mcp/integrations — check which AI tools have Wall-E MCP configured
   if (p === '/api/wall-e/mcp/integrations' && m === 'GET') {
     try {
-      const fs = require('fs');
-      const { MCP_TARGETS } = require('../create-walle/bin/mcp-inject');
+      const { detectMcpIntegrations } = require('./lib/mcp-integration');
       const wallePort = parseInt(process.env.WALL_E_PORT) || 3457;
-      const home = process.env.HOME;
-      const results = MCP_TARGETS.map(target => {
-        const detectPath = path.join(home, target.detectDir);
-        const configPath = path.join(home, target.configPath);
-        if (!fs.existsSync(detectPath)) return { tool: target.tool, status: 'not_installed' };
-        try {
-          const config = JSON.parse(fs.readFileSync(configPath, 'utf8'));
-          const entry = config?.mcpServers?.['wall-e'];
-          if (entry && entry.url === `http://localhost:${wallePort}/mcp`) return { tool: target.tool, status: 'configured', configPath };
-          if (entry) return { tool: target.tool, status: 'wrong_port', configPath };
-          return { tool: target.tool, status: 'not_configured', configPath };
-        } catch {
-          return { tool: target.tool, status: 'not_configured', configPath };
-        }
-      });
+      const results = detectMcpIntegrations(wallePort);
       jsonResponse(res, { data: results, wallePort });
     } catch (e) {
       jsonResponse(res, { data: [], error: e.message });
@@ -742,9 +780,9 @@ function handleWalleApi(req, res, url) {
   // POST /api/wall-e/mcp/inject — run MCP config injection for all detected AI tools
   if (p === '/api/wall-e/mcp/inject' && m === 'POST') {
     try {
-      const { injectMcpConfigs } = require('../create-walle/bin/mcp-inject');
+      const { ensureMcpIntegrations } = require('./lib/mcp-integration');
       const wallePort = parseInt(process.env.WALL_E_PORT) || 3457;
-      const results = injectMcpConfigs(wallePort);
+      const results = ensureMcpIntegrations(wallePort);
       const added = results.filter(r => r.action === 'added' || r.action === 'updated').length;
       try { require('./telemetry').track('mcp_inject', { added, total: results.length }); } catch {}
       jsonResponse(res, { ok: true, results });
@@ -754,6 +792,20 @@ function handleWalleApi(req, res, url) {
     return true;
   }
+  // GET /api/wall-e/mcp/test - verify the live Wall-E MCP endpoint responds
+  if (p === '/api/wall-e/mcp/test' && m === 'GET') {
+    try {
+      const { testWallEMcpEndpoint } = require('./lib/mcp-integration');
+      const wallePort = parseInt(process.env.WALL_E_PORT) || 3457;
+      testWallEMcpEndpoint(wallePort, { timeoutMs: 1500 })
+        .then(result => jsonResponse(res, { data: result, wallePort }))
+        .catch(e => jsonResponse(res, { data: { ok: false, error: e.message }, wallePort }, 500));
+    } catch (e) {
+      jsonResponse(res, { data: { ok: false, error: e.message } }, 500);
+    }
+    return true;
+  }
   // GET /api/wall-e/status
   if (p === '/api/wall-e/status' && m === 'GET') {
     const result = getStatus();

package/template/wall-e/coding/stream-processor.js CHANGED Viewed

@@ -84,6 +84,9 @@ class StreamProcessor extends EventEmitter {
       stopReason: '',
       status: 'running',
       errors: [],
+      toolErrors: [],
+      hadEdit: false,
+      verified: false,
       events: [],
     };
@@ -111,7 +114,7 @@ class StreamProcessor extends EventEmitter {
         const snapshot = await this.snapshotService.captureStepFinish({ sessionId, cwd, messageId: assistantMessageId });
         if (snapshot) await this._record(sessionId, cwd, 'snapshot', snapshot);
       }
-      state.status = state.errors.length > 0 ? 'error' : 'finished';
+      state.status = 'finished';
     } catch (err) {
       state.status = 'error';
       state.errors.push(err.message);
@@ -135,6 +138,8 @@ class StreamProcessor extends EventEmitter {
         toolCalls: state.toolCalls,
       }),
       toolResultMessage: state.toolResults.length > 0 ? toolResultMessage(state.toolResults) : null,
+      hadEdit: state.hadEdit,
+      verified: state.verified,
       next: state.status === 'error' ? 'stop' : state.toolResults.length > 0 ? 'continue' : 'stop',
     };
   }
@@ -223,6 +228,8 @@ class StreamProcessor extends EventEmitter {
         input: call.input,
       });
       const result = await this.toolExecutor(call, { sessionId, cwd, model: state.model, provider: state.provider });
+      if (isEditTool(call.name) && !result?.error) state.hadEdit = true;
+      if (isSuccessfulTestCommand(call, result)) state.verified = true;
       state.toolResults.push({ toolCallId: call.id, name: call.name, result });
       await this._record(sessionId, cwd, 'tool', {
         state: 'completed',
@@ -231,7 +238,7 @@ class StreamProcessor extends EventEmitter {
         result,
       });
     } catch (err) {
-      state.errors.push(err.message);
+      state.toolErrors.push(err.message);
       state.toolResults.push({ toolCallId: call.id, name: call.name, error: err.message });
       await this._record(sessionId, cwd, 'tool', {
         state: 'error',
@@ -262,7 +269,20 @@ class StreamProcessor extends EventEmitter {
   }
 }
+function isEditTool(name) {
+  return ['edit_file', 'write_file', 'apply_patch', 'multi_edit'].includes(name);
+}
+function isSuccessfulTestCommand(call, result) {
+  if (call?.name !== 'run_shell') return false;
+  const command = String(call.input?.command || '');
+  if (!/\b(test|spec|jest|mocha|pytest|npm\s+test|node\s+test\.js)\b/i.test(command)) return false;
+  if (result?.error || result?.exitCode) return false;
+  return true;
+}
 module.exports = {
   StreamProcessor,
   streamFromChat,
+  isSuccessfulTestCommand,
 };

package/template/wall-e/coding-orchestrator.js CHANGED Viewed

@@ -750,7 +750,7 @@ async function runAgentLoop(prompt, opts = {}) {
   const mw = opts.middleware || (() => {
     const m = new CodingMiddleware();
-    registerBuiltinMiddleware(m, { cwd, provider: llm?.type, model: modelId, claudeMd: opts.claudeMd, mode: opts.mode, taskEnv: opts.env });
+    registerBuiltinMiddleware(m, { cwd, provider: llm?.type, model: modelId, claudeMd: opts.claudeMd, mode: opts.mode, taskEnv: opts.env, benchmark: opts.benchmark });
     return m;
   })();
   const events = opts.events || new CodingEvents();
@@ -812,6 +812,7 @@ async function runAgentLoop(prompt, opts = {}) {
   const questionManager = opts.questionManager || new QuestionManager(events);
   // projectInfo already detected above (before system prompt)
+  const llmCtxRef = { current: null }; // populated each turn (see llmCtx below)
   // Stream-native runtime: model deltas, tool states, snapshots, permissions,
   // and step boundaries are persisted as typed transcript parts while the loop
@@ -835,9 +836,15 @@ async function runAgentLoop(prompt, opts = {}) {
         if (call.name === 'list_directory' && input.directory && !path.isAbsolute(input.directory)) {
           input.directory = path.join(resolvedCwd, input.directory);
         }
+        if (call.name === 'run_shell' && !input.cwd) {
+          input.cwd = resolvedCwd;
+        }
         input.sessionId = sid;
         input.projectRoot = resolvedCwd;
-        return toolRegistry.execute(call.name, input, { sessionId: sid, cwd: resolvedCwd, model: modelId, provider: llm.type });
+        const toolCtx = { sessionId: sid, cwd: resolvedCwd, model: modelId, provider: llm.type, runtimeMode: runtimeMode.id };
+        const finalInput = await mw.run('tool.before', toolCtx, call.name, input);
+        const result = await toolRegistry.execute(call.name, finalInput, toolCtx);
+        return mw.run('tool.after', toolCtx, call.name, finalInput, result);
       },
     });
     processor.on('event', (evt) => emitProgress({
@@ -851,6 +858,7 @@ async function runAgentLoop(prompt, opts = {}) {
     let streamStopReason = '';
     let streamModel = modelId;
     const streamErrors = [];
+    let streamHadEdit = false;
     for (let turnIndex = opts._resumeTurn || 0; turnIndex < turns; turnIndex++) {
       const remaining = deadline - Date.now();
       if (remaining <= 0) {
@@ -878,14 +886,24 @@ async function runAgentLoop(prompt, opts = {}) {
             runtimeMode: runtimeMode.id,
             cwd: resolvedCwd,
           });
+        const llmCtx = { params: { maxTokens: taskFileHints.length >= 4 ? 8192 : 4096 }, system: systemPrompt, cwd: resolvedCwd,
+          provider: llm.type, model: modelId, mode: opts.mode, runtimeMode: runtimeMode.id, claudeMd: opts.claudeMd, log: {},
+          toolsAvailable: toolsForTurn.length > 0 };
+        llmCtxRef.current = llmCtx;
+        await mw.run('llm.before', llmCtx);
         turn = await processor.runTurn({
           sessionId: sid,
           cwd: resolvedCwd,
-          system: systemPrompt,
+          system: llmCtx.system,
           messages,
           tools: toolsForTurn,
           maxTokens: taskFileHints.length >= 4 ? 8192 : 4096,
           signal: ac.signal,
+          maxTokens: llmCtx.params.maxTokens,
+          temperature: llmCtx.params.temperature,
+          thinking: llmCtx.params.thinking,
+          reasoningEffort: llmCtx.params.reasoningEffort,
+          options: llmCtx.params.options,
         });
       } finally {
         clearTimeout(timer);
@@ -911,6 +929,7 @@ async function runAgentLoop(prompt, opts = {}) {
         content: turn.text,
         stopReason: turn.stopReason,
       });
+      if (turn.hadEdit) streamHadEdit = true;
       if (turn.status === 'error') break;
       if ((turn.toolCalls || []).length === 0) {
@@ -931,6 +950,7 @@ async function runAgentLoop(prompt, opts = {}) {
       }
       if (turn.assistantMessage) messages.push(turn.assistantMessage);
       if (turn.toolResultMessage) messages.push(turn.toolResultMessage);
+      if (turn.verified && streamHadEdit) break;
       if (turn.next !== 'continue') break;
     }
@@ -971,7 +991,6 @@ async function runAgentLoop(prompt, opts = {}) {
   // ── Bridge: event bus → middleware (A2) ──
   // When the event bus fires, propagate to middleware's onEvent hook so
   // registered middleware can react to file edits, reads, and context overflow.
-  const llmCtxRef = { current: null }; // populated each turn (see llmCtx below)
   const _bridgeHandlers = {};
   for (const evtType of ['file.edited', 'file.read', 'context.overflow']) {
     const handler = (data) => {
@@ -1073,8 +1092,10 @@ async function runAgentLoop(prompt, opts = {}) {
       const timer = setTimeout(() => ac.abort(), Math.min(remaining, perTurnCap));
       // Middleware: prepare LLM call
+      const turnsRemaining = turns - turn;
       const llmCtx = { params: { maxTokens: taskFileHints.length >= 4 ? 8192 : 4096 }, system: systemPrompt, cwd: resolvedCwd,
-        provider: llm.type, model: modelId, mode: opts.mode, runtimeMode: runtimeMode.id, claudeMd: opts.claudeMd, log: {} };
+        provider: llm.type, model: modelId, mode: opts.mode, runtimeMode: runtimeMode.id, claudeMd: opts.claudeMd, log: {},
+        toolsAvailable: turnsRemaining > 1 };
       llmCtxRef.current = llmCtx; // expose to event bridge (A2)
       await mw.run('llm.before', llmCtx);
       let adaptedTools = await toolRegistry.getDefinitions(llmCtx);
@@ -1136,7 +1157,6 @@ async function runAgentLoop(prompt, opts = {}) {
       // Graceful max-steps degradation (6n)
       // Note: warnings are appended to the LAST message's content (not as separate
       // user messages) to avoid consecutive user messages which the API rejects.
-      const turnsRemaining = turns - turn;
       if (turnsRemaining <= 1) {
         // Final turn: disable tools, force structured summary
         adaptedTools = [];

package/template/wall-e/eval/agent-runner.js CHANGED Viewed

@@ -100,10 +100,13 @@ async function runAgentBenchmark(benchmark, options = {}) {
     }
     // Run the agent loop with hard timeout safety net
-    const effectiveTimeout = timeoutMs || (expectations.maxTurns || 20) * 30000;
+    const maxTurns = expectations.maxTurns || 20;
+    const turnBudgetTimeout = maxTurns * 30000;
+    const effectiveTimeout = Math.min(timeoutMs || turnBudgetTimeout, turnBudgetTimeout);
     const agentPromise = runAgentLoop(benchmark.prompt, {
       cwd: sandboxDir,
       timeoutMs: effectiveTimeout,
+      maxTurns,
       provider,
       model,
       mode: 'build',
@@ -156,6 +159,10 @@ async function runAgentBenchmark(benchmark, options = {}) {
     const inputTokens = usage.inputTokens ?? usage.input ?? 0;
     const expectedFileChanges = expectations.expectedFileChanges || [];
     const missingExpectedWork = expectedFileChanges.length > 0 && actualFileChanges.length === 0;
+    const attemptedFileChange = actualToolCalls.some((call) => {
+      const name = typeof call === 'string' ? call : call?.name;
+      return /edit|write|patch|create|delete|modify/i.test(String(name || ''));
+    });
     const testRegression = (expectations.testCommand && testsPassed === false);
     const rawError = result.stderr || result.error || null;
     const validatedByTests = Boolean(
@@ -199,7 +206,7 @@ async function runAgentBenchmark(benchmark, options = {}) {
             : testRegression
               ? 'tests_failed'
               : missingExpectedWork
-                ? 'no_file_changes'
+                ? attemptedFileChange ? 'missing_expected_changes' : 'no_file_changes'
                 : 'no_effort' },
       };
     }
@@ -296,6 +303,10 @@ function scoreAgentResult(benchmark, actual) {
   });
 }
+function isTrustedAgentResult(result = {}) {
+  return result.success === true && !result.error && result.testsPassed === true;
+}
 /**
  * Run a multi-turn benchmark — sends each turn's prompt sequentially,
  * accumulating conversation context. Scores after the final turn.
@@ -507,7 +518,7 @@ async function runAgentBenchmarkSuite(options = {}) {
           outputTokens: result.outputTokens ?? null,
           scorerVersion: DEFAULT_SCORER_VERSION,
           scoringMethod,
-          trusted: !result.error && result.testsPassed === true,
+          trusted: isTrustedAgentResult(result),
           runConfig: { timeoutMs, scoringMethod },
         }, {
           suite: 'coding-agent',
@@ -517,7 +528,7 @@ async function runAgentBenchmarkSuite(options = {}) {
           model: resolveModelName(model),
           scoringMethod,
           scorerVersion: DEFAULT_SCORER_VERSION,
-          trusted: !result.error && result.testsPassed === true,
+          trusted: isTrustedAgentResult(result),
           runConfig: { timeoutMs, scoringMethod },
         }));
       } catch { /* non-fatal */ }
@@ -666,6 +677,7 @@ module.exports = {
   runMultiTurnBenchmark,
   runAgentBenchmarkSuite,
   scoreAgentResult,
+  isTrustedAgentResult,
   extractToolCalls,
   extractToolCallDetails,
   countTests,