create-walle 0.9.13 → 0.9.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/README.md +6 -1
  2. package/bin/create-walle.js +195 -30
  3. package/bin/mcp-inject.js +18 -53
  4. package/package.json +3 -1
  5. package/template/claude-task-manager/approval-agent.js +7 -0
  6. package/template/claude-task-manager/docs/session-standup-command-center-design.md +242 -0
  7. package/template/claude-task-manager/git-utils.js +111 -3
  8. package/template/claude-task-manager/lib/session-history.js +144 -16
  9. package/template/claude-task-manager/lib/session-standup.js +409 -0
  10. package/template/claude-task-manager/lib/standup-attention.js +200 -0
  11. package/template/claude-task-manager/lib/status-hooks.js +8 -2
  12. package/template/claude-task-manager/lib/update-telemetry.js +114 -0
  13. package/template/claude-task-manager/lib/walle-default-model.js +55 -0
  14. package/template/claude-task-manager/lib/walle-mcp-auto-config.js +62 -0
  15. package/template/claude-task-manager/lib/walle-supervisor.js +83 -19
  16. package/template/claude-task-manager/lib/worktree-cwd.js +82 -0
  17. package/template/claude-task-manager/providers/codex-mcp.js +104 -0
  18. package/template/claude-task-manager/providers/index.js +2 -0
  19. package/template/claude-task-manager/public/css/setup.css +2 -1
  20. package/template/claude-task-manager/public/css/walle.css +5 -0
  21. package/template/claude-task-manager/public/index.html +1596 -283
  22. package/template/claude-task-manager/public/js/session-search-utils.js +171 -1
  23. package/template/claude-task-manager/public/js/setup.js +62 -19
  24. package/template/claude-task-manager/public/js/stream-view.js +55 -6
  25. package/template/claude-task-manager/public/js/walle-session.js +73 -16
  26. package/template/claude-task-manager/public/js/walle.js +34 -2
  27. package/template/claude-task-manager/server.js +780 -177
  28. package/template/claude-task-manager/session-integrity.js +58 -15
  29. package/template/claude-task-manager/workers/approval-widget-validator.js +15 -5
  30. package/template/claude-task-manager/workers/state-detectors/codex.js +6 -0
  31. package/template/package.json +1 -1
  32. package/template/wall-e/agent.js +36 -7
  33. package/template/wall-e/api-walle.js +72 -20
  34. package/template/wall-e/coding/stream-processor.js +22 -2
  35. package/template/wall-e/coding-orchestrator.js +26 -6
  36. package/template/wall-e/eval/agent-runner.js +16 -4
  37. package/template/wall-e/eval/benchmark-generator.js +21 -1
  38. package/template/wall-e/eval/benchmarks/coding-agent.json +0 -596
  39. package/template/wall-e/eval/codex-cli-baseline.js +633 -0
  40. package/template/wall-e/eval/eval-orchestrator.js +3 -3
  41. package/template/wall-e/eval/run-agent-benchmarks.js +11 -3
  42. package/template/wall-e/eval/run-codex-cli-baseline.js +177 -0
  43. package/template/wall-e/lib/mcp-integration.js +220 -0
  44. package/template/wall-e/llm/ollama.js +47 -8
  45. package/template/wall-e/llm/ollama.plugin.json +1 -1
  46. package/template/wall-e/llm/tool-adapter.js +1 -0
  47. package/template/wall-e/loops/ingest.js +42 -8
  48. package/template/wall-e/mcp-server.js +272 -10
  49. package/template/wall-e/memory/ctm-session-context.js +910 -0
  50. package/template/wall-e/server.js +26 -1
  51. package/template/wall-e/skills/_bundled/scan-ctm-sessions/SKILL.md +20 -0
  52. package/template/wall-e/skills/_bundled/scan-ctm-sessions/run.js +43 -0
  53. package/template/wall-e/skills/skill-planner.js +52 -3
  54. package/template/wall-e/tools/builtin-middleware.js +55 -2
  55. package/template/wall-e/tools/shell-policy.js +1 -1
  56. package/template/wall-e/tools/slack-owner.js +104 -0
  57. package/template/website/index.html +2 -2
  58. package/template/builder-journal.md +0 -17
@@ -15,11 +15,60 @@
15
15
  const fs = require('fs');
16
16
  const path = require('path');
17
17
  const claudeDesktopSessions = require('./lib/claude-desktop-sessions');
18
+ const { codexRolloutIdFromPath, findCodexSessionFiles } = require('./lib/session-history');
18
19
 
19
20
  const CLAUDE_PROJECTS_DIR = path.join(process.env.HOME, '.claude', 'projects');
20
21
 
21
22
  // --- Detection ---
22
23
 
24
/**
 * Derive the id under which a session file is indexed.
 *
 * Resolution order:
 *   1. the sessionId reported by claudeDesktopSessions.parseVirtualSessionPath,
 *   2. the id reported by codexRolloutIdFromPath,
 *   3. a UUID at the very end of the basename (returned lower-cased),
 *   4. the basename with its `.jsonl` / `.jsonl.bak` suffix removed.
 *
 * @param {string} filePath - Session file path (real or virtual).
 * @returns {string} The derived file id.
 */
function sessionFileIdFromPath(filePath) {
  const virtualSession = claudeDesktopSessions.parseVirtualSessionPath(filePath);
  if (virtualSession) {
    return virtualSession.sessionId;
  }

  const rolloutId = codexRolloutIdFromPath(filePath);
  if (rolloutId) {
    return rolloutId;
  }

  // Strip the session-file suffix, then look for a UUID anchored at the end.
  const stem = path.basename(filePath).replace(/\.jsonl(\.bak)?$/, '');
  const trailingUuid = /([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})$/i.exec(stem);
  if (!trailingUuid) {
    return stem;
  }
  return trailingUuid[1].toLowerCase();
}
33
+
34
/**
 * Build a `{ filePath, stat, projectEntry }` entry for a session file.
 *
 * @param {string} filePath - Candidate session file path; falsy yields null.
 * @param {?string} expectedFileId - When truthy, the id derived from
 *   `filePath` must equal it or the result is null.
 * @param {string} [projectEntry=''] - Value copied onto the returned entry.
 * @returns {?{filePath: string, stat: object, projectEntry: string}} The
 *   entry, or null when the id does not match, the stat call throws, or the
 *   stat target is not a regular file.
 */
function fileEntryFromPath(filePath, expectedFileId, projectEntry = '') {
  if (!filePath) {
    return null;
  }

  const derivedId = sessionFileIdFromPath(filePath);
  const idMatches = !expectedFileId || derivedId === expectedFileId;
  if (!idMatches) {
    return null;
  }

  let stat = null;
  try {
    // The path that is stat-ed may differ from filePath (resolved by
    // claudeDesktopSessions.sourcePathForStat).
    const candidate = fs.statSync(claudeDesktopSessions.sourcePathForStat(filePath));
    if (candidate.isFile()) {
      stat = candidate;
    }
  } catch {
    // Any failure while resolving/stat-ing means there is no usable file.
    stat = null;
  }

  return stat ? { filePath, stat, projectEntry } : null;
}
47
+
48
/**
 * Add one session file to `fileIndex`, keyed by its derived file id.
 * Paths that do not resolve to a usable file entry are ignored. A later
 * entry with the same id replaces an earlier one.
 *
 * @param {Object} fileIndex - Mutable map of file id -> file entry.
 * @param {string} filePath - Session file path to index.
 * @param {string} projectEntry - Stored on the resulting entry.
 * @returns {void}
 */
function addFileIndexEntry(fileIndex, filePath, projectEntry) {
  const entry = fileEntryFromPath(filePath, null, projectEntry);
  if (!entry) {
    return;
  }
  const fileId = sessionFileIdFromPath(filePath);
  fileIndex[fileId] = entry;
}
52
+
53
/**
 * Locate the on-disk file entry that backs a DB session row.
 *
 * Lookup order:
 *   1. the pre-scanned `fileIndex` (own properties only),
 *   2. the path stored on the row (`row.jsonl_path`), if its derived id
 *      matches `expectedFileId`,
 *   3. for Codex rows only (provider is 'codex', or the stored path contains
 *      a `.codex/sessions` segment), the files returned by
 *      `findCodexSessionFiles(expectedFileId)`.
 *
 * @param {?Object} row - DB row; `provider` and `jsonl_path` are read.
 * @param {?string} expectedFileId - File id the row is expected to map to.
 * @param {Object} fileIndex - Map of file id -> file entry.
 * @returns {?Object} The matching file entry, or null when none is found.
 */
function resolveDbSessionFile(row, expectedFileId, fileIndex) {
  if (!expectedFileId) return null;

  // Only trust own properties: fileIndex is a plain object, so an id such as
  // "constructor" or "toString" would otherwise resolve to an inherited
  // Object.prototype member and be mistaken for a file entry.
  if (Object.prototype.hasOwnProperty.call(fileIndex, expectedFileId) && fileIndex[expectedFileId]) {
    return fileIndex[expectedFileId];
  }

  const stored = fileEntryFromPath(row?.jsonl_path, expectedFileId);
  if (stored) return stored;

  const codexPathMarker = `${path.sep}.codex${path.sep}sessions${path.sep}`;
  const isCodexRow = row?.provider === 'codex'
    || String(row?.jsonl_path || '').includes(codexPathMarker);
  if (!isCodexRow) return null;

  try {
    for (const filePath of findCodexSessionFiles(expectedFileId)) {
      const file = fileEntryFromPath(filePath, expectedFileId);
      if (file) return file;
    }
  } catch {
    // Best-effort fallback: a failing Codex scan is treated as "not found".
  }

  return null;
}
71
+
23
72
  function dbTimestampFromIso(value) {
24
73
  if (!value) return '';
25
74
  const ms = new Date(value).getTime();
@@ -138,7 +187,7 @@ function detectMismatches(db, getAllSessionFiles) {
138
187
  } catch {}
139
188
  const slugCol = hasSlugColumn ? 'a.slug' : "'' AS slug";
140
189
  allSessions = db.prepare(`
141
- SELECT c.id, c.title, c.user_renamed, c.starred, c.project_path, c.cwd,
190
+ SELECT c.id, c.provider, c.title, c.user_renamed, c.starred, c.project_path, c.cwd,
142
191
  c.created_at, c.updated_at,
143
192
  a.agent_session_id, a.jsonl_path, a.file_size, a.first_message,
144
193
  a.modified_at, a.hostname, a.model, a.git_branch, a.user_msg_count,
@@ -156,12 +205,7 @@ function detectMismatches(db, getAllSessionFiles) {
156
205
  const fileIndex = {}; // uuid -> { filePath, stat, projectEntry }
157
206
  try {
158
207
  for (const { filePath, projectEntry } of getAllSessionFiles()) {
159
- const virtual = claudeDesktopSessions.parseVirtualSessionPath(filePath);
160
- const uuid = virtual ? virtual.sessionId : path.basename(filePath).replace(/\.jsonl(\.bak)?$/, '');
161
- try {
162
- const stat = fs.statSync(claudeDesktopSessions.sourcePathForStat(filePath));
163
- fileIndex[uuid] = { filePath, stat, projectEntry };
164
- } catch {}
208
+ addFileIndexEntry(fileIndex, filePath, projectEntry);
165
209
  }
166
210
  } catch (e) {
167
211
  issues.push({ type: 'scan_error', severity: 'warning', sessionId: null,
@@ -183,7 +227,7 @@ function detectMismatches(db, getAllSessionFiles) {
183
227
 
184
228
  // Skip DB-only rows with no file expectation (legacy tabs with no agent_session_id)
185
229
  const expectedFileId = (agentId && agentId !== sid) ? agentId : sid;
186
- const file = fileIndex[expectedFileId];
230
+ const file = resolveDbSessionFile(row, expectedFileId, fileIndex);
187
231
 
188
232
  // Check 1: Missing file
189
233
  if (!file && row.file_size > 0) {
@@ -193,6 +237,7 @@ function detectMismatches(db, getAllSessionFiles) {
193
237
  expected_file_id: expectedFileId,
194
238
  db_file_size: row.file_size,
195
239
  db_jsonl_path: row.jsonl_path || '',
240
+ db_provider: row.provider || '',
196
241
  db_title: row.title || '',
197
242
  },
198
243
  suggestion: 'File may have been deleted or moved. Check .jsonl.bak variant.',
@@ -215,6 +260,7 @@ function detectMismatches(db, getAllSessionFiles) {
215
260
  db_file_size: row.file_size,
216
261
  actual_file_size: file.stat.size,
217
262
  size_diff: sizeDiff,
263
+ db_jsonl_path: row.jsonl_path || '',
218
264
  },
219
265
  suggestion: 'DB metadata is stale — will be refreshed on next session list load.',
220
266
  });
@@ -439,12 +485,7 @@ function recoverMismatches(db, issues, getAllSessionFiles) {
439
485
  const fileIndex = {};
440
486
  try {
441
487
  for (const { filePath, projectEntry } of getAllSessionFiles()) {
442
- const virtual = claudeDesktopSessions.parseVirtualSessionPath(filePath);
443
- const uuid = virtual ? virtual.sessionId : path.basename(filePath).replace(/\.jsonl(\.bak)?$/, '');
444
- try {
445
- const stat = fs.statSync(claudeDesktopSessions.sourcePathForStat(filePath));
446
- fileIndex[uuid] = { filePath, stat, projectEntry };
447
- } catch {}
488
+ addFileIndexEntry(fileIndex, filePath, projectEntry);
448
489
  }
449
490
  } catch {}
450
491
 
@@ -505,7 +546,9 @@ function recoverMismatches(db, issues, getAllSessionFiles) {
505
546
  case 'stale_metadata': {
506
547
  // Refresh metadata from actual file
507
548
  const fileId = issue.details.file_id;
508
- const file = fileIndex[fileId];
549
+ const file = fileIndex[fileId]
550
+ || fileEntryFromPath(issue.details?.db_jsonl_path, fileId)
551
+ || resolveDbSessionFile({ provider: 'codex', jsonl_path: issue.details?.db_jsonl_path || '' }, fileId, fileIndex);
509
552
  if (!file) { result.skipped++; break; }
510
553
  try {
511
554
  db.prepare('UPDATE agent_sessions SET file_size = ?, modified_at = ?, updated_at = datetime(\'now\') WHERE ctm_session_id = ?')
@@ -28,12 +28,12 @@ const ABOVE_ANCHOR_DEPTH = 40;
28
28
  // Claude Code: "Esc to cancel". Codex: "Press enter to confirm or esc to cancel".
29
29
  const ANCHOR_RE = /Esc to cancel|esc to cancel|Press enter to confirm/;
30
30
 
31
- // Yes-option pattern. Accepts an optional selection-arrow prefix in any of
31
+ // Approval-option pattern. Accepts an optional selection-arrow prefix in any of
32
32
  // the forms different CLIs use: ❯ (Claude Code), ›/▶/▸ (Cursor/others), or
33
33
  // plain ASCII > (Codex). Without this, Codex's "> 1. Yes, proceed (y)" would
34
34
  // be skipped over and the validator would lock onto option 2 ("2. Yes, ...")
35
35
  // — which is unstyled in Codex's renderer and trips no-widget-formatting.
36
- const YES_RE = /^\s*(?:[❯›▶▸>]\s*)?\d+\.\s*Yes\b/i;
36
+ const YES_RE = /^\s*(?:[❯›▶▸>]\s*)?\d+\.\s*(?:Yes|Allow)\b/i;
37
37
 
38
38
  /**
39
39
  * Check if the terminal is currently displaying an active approval widget.
@@ -142,9 +142,10 @@ function _hasWidgetFormatting(buf, yesRow, totalRows) {
142
142
  const yesText = yesLine.translateToString(true);
143
143
  if (/[❯›▶▸]/.test(yesText)) return true;
144
144
 
145
- // Check for "❯" marker anywhere in bottom 3 rows
146
- const arrowScanStart = Math.max(yesRow - 1, totalRows - 5);
147
- for (let row = arrowScanStart; row < totalRows; row++) {
145
+ // Check for a selection marker near the approval options. Codex MCP forms can
146
+ // select option 2 ("Allow for this session"), while option 1 is the first
147
+ // approval-shaped line used for anchoring.
148
+ for (let row = Math.max(0, yesRow - 1); row < Math.min(totalRows, yesRow + 8); row++) {
148
149
  const line = buf.getLine(buf.viewportY + row);
149
150
  if (!line) continue;
150
151
  const text = line.translateToString(true);
@@ -152,6 +153,15 @@ function _hasWidgetFormatting(buf, yesRow, totalRows) {
152
153
  if (/[❯›▶▸]/.test(text)) return true;
153
154
  }
154
155
 
156
+ // Check for "❯" marker anywhere in bottom 5 rows for prompts whose option
157
+ // block is pushed down by long wrapped content.
158
+ for (let row = Math.max(0, totalRows - 5); row < totalRows; row++) {
159
+ const line = buf.getLine(buf.viewportY + row);
160
+ if (!line) continue;
161
+ const text = line.translateToString(true);
162
+ if (/[❯›▶▸]/.test(text)) return true;
163
+ }
164
+
155
165
  // Check for ANSI foreground color on the Yes-option line.
156
166
  // xterm's BufferLine.getCell(x) returns an IBufferCell with .getFgColor()
157
167
  // (0 = default). Any non-default fg color = styled = widget.
@@ -47,6 +47,12 @@ function isCodexStatusRedraw(data) {
47
47
  module.exports = {
48
48
  ...baseDetector,
49
49
  id: 'codex',
50
+ // Codex's ratatui status frames arrive in bursts. A short Claude-style
51
+ // debounce lets the sidebar bounce between Running and Waiting/Idle while the
52
+ // terminal still says "Working". Keep the busy state stable long enough for
53
+ // multiple server heartbeats to confirm or renew it, while explicit
54
+ // approval/choice prompts still bypass this elsewhere.
55
+ idleDebounceMs: 15000,
50
56
 
51
57
  isActiveChunk(data) {
52
58
  if (!baseDetector.isActiveChunk(data)) return false;
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "walle",
3
- "version": "0.9.13",
3
+ "version": "0.9.14",
4
4
  "private": true,
5
5
  "description": "Wall-E — your personal digital twin",
6
6
  "scripts": {
@@ -109,7 +109,8 @@ function bootstrapSkills() {
109
109
  description: 'Scan Claude Code session files for new conversations',
110
110
  trigger_type: 'interval',
111
111
  trigger_config: JSON.stringify({ interval_ms: 60000 }),
112
- prompt_template: 'Scan the Claude Code session directory at ~/.claude/projects/ for any new or updated .jsonl session files. Read the most recently modified files and extract user messages and assistant responses as observations.',
112
+ prompt_template: 'INTERNAL_SKILL:scan-ctm-sessions',
113
+ execution: 'script',
113
114
  });
114
115
 
115
116
  brain.insertSkill({
@@ -140,16 +141,43 @@ function bootstrapSkills() {
140
141
  function syncBundledSkills() {
141
142
  const filesystemSkills = loadAllSkills();
142
143
  const dbSkills = brain.listSkills({});
143
- const dbNames = new Set(dbSkills.map(s => s.name));
144
+ const dbByName = new Map(dbSkills.map(s => [s.name, s]));
145
+ const dbNames = new Set(dbByName.keys());
144
146
 
145
147
  let added = 0;
148
+ let updated = 0;
146
149
  for (const skill of filesystemSkills) {
147
- if (dbNames.has(skill.name)) continue;
148
-
149
150
  const triggerType = (skill.trigger && skill.trigger.type) || skill.execution || 'manual';
150
151
  const triggerConfig = skill.trigger && skill.trigger.interval_ms
151
152
  ? JSON.stringify({ interval_ms: skill.trigger.interval_ms })
152
153
  : null;
154
+ const promptTemplate = skill.execution === 'script'
155
+ ? `INTERNAL_SKILL:${skill.name}`
156
+ : skill.instructions || '';
157
+
158
+ if (dbNames.has(skill.name)) {
159
+ const existing = dbByName.get(skill.name);
160
+ // Upgrade legacy prompt-based CTM scanning to the deterministic script
161
+ // path. Session continuity must not depend on live LLM/network access.
162
+ if (skill.name === 'scan-ctm-sessions' && skill.execution === 'script' && existing) {
163
+ const updates = {};
164
+ if (existing.execution !== 'script') updates.execution = 'script';
165
+ if (existing.prompt_template !== promptTemplate) updates.prompt_template = promptTemplate;
166
+ if (existing.trigger_type !== triggerType) updates.trigger_type = triggerType;
167
+ if (triggerConfig && existing.trigger_config !== triggerConfig) updates.trigger_config = triggerConfig;
168
+ if (existing.auto_disabled_at) {
169
+ updates.enabled = 1;
170
+ updates.auto_disabled_at = null;
171
+ }
172
+ if (existing.auto_disabled_reason) updates.auto_disabled_reason = null;
173
+ if (Object.keys(updates).length > 0) {
174
+ brain.updateSkill(existing.id, updates);
175
+ updated++;
176
+ console.log(`[wall-e] Updated bundled skill: ${skill.name}`);
177
+ }
178
+ }
179
+ continue;
180
+ }
153
181
 
154
182
  brain.insertSkill({
155
183
  name: skill.name,
@@ -159,9 +187,7 @@ function syncBundledSkills() {
159
187
  // Persist the legacy `INTERNAL_SKILL:` marker so downgrades can still
160
188
  // dispatch script skills via the prompt_template fallback. The schema
161
189
  // column `execution` is the authoritative source going forward.
162
- prompt_template: skill.execution === 'script'
163
- ? `INTERNAL_SKILL:${skill.name}`
164
- : skill.instructions || '',
190
+ prompt_template: promptTemplate,
165
191
  execution: skill.execution === 'script' ? 'script' : 'agent',
166
192
  });
167
193
  added++;
@@ -171,6 +197,9 @@ function syncBundledSkills() {
171
197
  if (added > 0) {
172
198
  console.log(`[wall-e] Synced ${added} new bundled skill(s) to DB`);
173
199
  }
200
+ if (updated > 0) {
201
+ console.log(`[wall-e] Updated ${updated} bundled skill(s) in DB`);
202
+ }
174
203
  }
175
204
 
176
205
  function bootstrapTasks() {
@@ -364,25 +364,78 @@ function handleWalleApi(req, res, url) {
364
364
  if (p === '/api/wall-e/slack/status' && m === 'GET') {
365
365
  try {
366
366
  const slackMcp = require('./tools/slack-mcp');
367
+ const { getSlackOwnerRepairState } = require('./tools/slack-owner');
367
368
  const token = slackMcp.loadToken();
368
- jsonResponse(res, { data: { authenticated: !!token?.access_token, team: token?.team_name, user: token?.user_id, obtained_at: token?.obtained_at } });
369
+ const owner = getSlackOwnerRepairState();
370
+ jsonResponse(res, {
371
+ data: {
372
+ authenticated: !!token?.access_token,
373
+ team: token?.team_name,
374
+ user: token?.user_id,
375
+ obtained_at: token?.obtained_at,
376
+ owner_configured: owner.configured,
377
+ owner_can_repair: owner.canRepair,
378
+ },
379
+ });
369
380
  } catch (e) {
370
381
  jsonResponse(res, { data: { authenticated: false } });
371
382
  }
372
383
  return true;
373
384
  }
374
385
 
386
+ // POST /api/wall-e/slack/repair-owner — derive Slack owner id from OAuth token
387
+ if (p === '/api/wall-e/slack/repair-owner' && m === 'POST') {
388
+ try {
389
+ const { repairSlackOwnerIdentity } = require('./tools/slack-owner');
390
+ const { clearServiceAlerts } = require('./skills/skill-planner');
391
+ const result = repairSlackOwnerIdentity({ persist: true });
392
+ if (!result.ok) {
393
+ return jsonResponse(res, {
394
+ ok: false,
395
+ error: result.error || 'Could not repair Slack owner identity',
396
+ needsSlackAuth: !!result.needsSlackAuth,
397
+ }, result.needsSlackAuth ? 409 : 500), true;
398
+ }
399
+ clearServiceAlerts('slack');
400
+ return jsonResponse(res, {
401
+ ok: true,
402
+ user_id_configured: true,
403
+ source: result.source,
404
+ persisted: !!result.persisted,
405
+ already_configured: !!result.alreadyConfigured,
406
+ }), true;
407
+ } catch (e) {
408
+ return jsonResponse(res, { ok: false, error: e.message }, 500), true;
409
+ }
410
+ }
411
+
375
412
  // POST /api/wall-e/slack/auth — start OAuth flow (opens browser)
376
413
  if (p === '/api/wall-e/slack/auth' && m === 'POST') {
377
414
  try {
378
415
  const slackMcp = require('./tools/slack-mcp');
379
416
  // If already authenticated, return immediately
380
417
  if (slackMcp.isAuthenticatedSync()) {
418
+ try {
419
+ const { repairSlackOwnerIdentity } = require('./tools/slack-owner');
420
+ const { clearServiceAlerts } = require('./skills/skill-planner');
421
+ const repaired = repairSlackOwnerIdentity({ persist: true });
422
+ if (repaired.ok) clearServiceAlerts('slack');
423
+ } catch (repairErr) {
424
+ console.warn('[wall-e] Slack owner repair skipped:', repairErr.message);
425
+ }
381
426
  jsonResponse(res, { ok: true, already: true });
382
427
  return true;
383
428
  }
384
429
  // Start OAuth — opens browser, temp server on port 3118 handles callback
385
430
  slackMcp.authenticate().then(() => {
431
+ try {
432
+ const { repairSlackOwnerIdentity } = require('./tools/slack-owner');
433
+ const { clearServiceAlerts } = require('./skills/skill-planner');
434
+ const repaired = repairSlackOwnerIdentity({ persist: true });
435
+ if (repaired.ok) clearServiceAlerts('slack');
436
+ } catch (repairErr) {
437
+ console.error('[wall-e] Slack owner repair failed:', repairErr.message);
438
+ }
386
439
  console.log('[wall-e] Slack OAuth completed');
387
440
  }).catch(err => {
388
441
  console.error('[wall-e] Slack OAuth failed:', err.message);
@@ -714,24 +767,9 @@ function handleWalleApi(req, res, url) {
714
767
  // GET /api/wall-e/mcp/integrations — check which AI tools have Wall-E MCP configured
715
768
  if (p === '/api/wall-e/mcp/integrations' && m === 'GET') {
716
769
  try {
717
- const fs = require('fs');
718
- const { MCP_TARGETS } = require('../create-walle/bin/mcp-inject');
770
+ const { detectMcpIntegrations } = require('./lib/mcp-integration');
719
771
  const wallePort = parseInt(process.env.WALL_E_PORT) || 3457;
720
- const home = process.env.HOME;
721
- const results = MCP_TARGETS.map(target => {
722
- const detectPath = path.join(home, target.detectDir);
723
- const configPath = path.join(home, target.configPath);
724
- if (!fs.existsSync(detectPath)) return { tool: target.tool, status: 'not_installed' };
725
- try {
726
- const config = JSON.parse(fs.readFileSync(configPath, 'utf8'));
727
- const entry = config?.mcpServers?.['wall-e'];
728
- if (entry && entry.url === `http://localhost:${wallePort}/mcp`) return { tool: target.tool, status: 'configured', configPath };
729
- if (entry) return { tool: target.tool, status: 'wrong_port', configPath };
730
- return { tool: target.tool, status: 'not_configured', configPath };
731
- } catch {
732
- return { tool: target.tool, status: 'not_configured', configPath };
733
- }
734
- });
772
+ const results = detectMcpIntegrations(wallePort);
735
773
  jsonResponse(res, { data: results, wallePort });
736
774
  } catch (e) {
737
775
  jsonResponse(res, { data: [], error: e.message });
@@ -742,9 +780,9 @@ function handleWalleApi(req, res, url) {
742
780
  // POST /api/wall-e/mcp/inject — run MCP config injection for all detected AI tools
743
781
  if (p === '/api/wall-e/mcp/inject' && m === 'POST') {
744
782
  try {
745
- const { injectMcpConfigs } = require('../create-walle/bin/mcp-inject');
783
+ const { ensureMcpIntegrations } = require('./lib/mcp-integration');
746
784
  const wallePort = parseInt(process.env.WALL_E_PORT) || 3457;
747
- const results = injectMcpConfigs(wallePort);
785
+ const results = ensureMcpIntegrations(wallePort);
748
786
  const added = results.filter(r => r.action === 'added' || r.action === 'updated').length;
749
787
  try { require('./telemetry').track('mcp_inject', { added, total: results.length }); } catch {}
750
788
  jsonResponse(res, { ok: true, results });
@@ -754,6 +792,20 @@ function handleWalleApi(req, res, url) {
754
792
  return true;
755
793
  }
756
794
 
795
+ // GET /api/wall-e/mcp/test - verify the live Wall-E MCP endpoint responds
796
+ if (p === '/api/wall-e/mcp/test' && m === 'GET') {
797
+ try {
798
+ const { testWallEMcpEndpoint } = require('./lib/mcp-integration');
799
+ const wallePort = parseInt(process.env.WALL_E_PORT) || 3457;
800
+ testWallEMcpEndpoint(wallePort, { timeoutMs: 1500 })
801
+ .then(result => jsonResponse(res, { data: result, wallePort }))
802
+ .catch(e => jsonResponse(res, { data: { ok: false, error: e.message }, wallePort }, 500));
803
+ } catch (e) {
804
+ jsonResponse(res, { data: { ok: false, error: e.message } }, 500);
805
+ }
806
+ return true;
807
+ }
808
+
757
809
  // GET /api/wall-e/status
758
810
  if (p === '/api/wall-e/status' && m === 'GET') {
759
811
  const result = getStatus();
@@ -84,6 +84,9 @@ class StreamProcessor extends EventEmitter {
84
84
  stopReason: '',
85
85
  status: 'running',
86
86
  errors: [],
87
+ toolErrors: [],
88
+ hadEdit: false,
89
+ verified: false,
87
90
  events: [],
88
91
  };
89
92
 
@@ -111,7 +114,7 @@ class StreamProcessor extends EventEmitter {
111
114
  const snapshot = await this.snapshotService.captureStepFinish({ sessionId, cwd, messageId: assistantMessageId });
112
115
  if (snapshot) await this._record(sessionId, cwd, 'snapshot', snapshot);
113
116
  }
114
- state.status = state.errors.length > 0 ? 'error' : 'finished';
117
+ state.status = 'finished';
115
118
  } catch (err) {
116
119
  state.status = 'error';
117
120
  state.errors.push(err.message);
@@ -135,6 +138,8 @@ class StreamProcessor extends EventEmitter {
135
138
  toolCalls: state.toolCalls,
136
139
  }),
137
140
  toolResultMessage: state.toolResults.length > 0 ? toolResultMessage(state.toolResults) : null,
141
+ hadEdit: state.hadEdit,
142
+ verified: state.verified,
138
143
  next: state.status === 'error' ? 'stop' : state.toolResults.length > 0 ? 'continue' : 'stop',
139
144
  };
140
145
  }
@@ -223,6 +228,8 @@ class StreamProcessor extends EventEmitter {
223
228
  input: call.input,
224
229
  });
225
230
  const result = await this.toolExecutor(call, { sessionId, cwd, model: state.model, provider: state.provider });
231
+ if (isEditTool(call.name) && !result?.error) state.hadEdit = true;
232
+ if (isSuccessfulTestCommand(call, result)) state.verified = true;
226
233
  state.toolResults.push({ toolCallId: call.id, name: call.name, result });
227
234
  await this._record(sessionId, cwd, 'tool', {
228
235
  state: 'completed',
@@ -231,7 +238,7 @@ class StreamProcessor extends EventEmitter {
231
238
  result,
232
239
  });
233
240
  } catch (err) {
234
- state.errors.push(err.message);
241
+ state.toolErrors.push(err.message);
235
242
  state.toolResults.push({ toolCallId: call.id, name: call.name, error: err.message });
236
243
  await this._record(sessionId, cwd, 'tool', {
237
244
  state: 'error',
@@ -262,7 +269,20 @@ class StreamProcessor extends EventEmitter {
262
269
  }
263
270
  }
264
271
 
272
/**
 * Report whether a tool name is one of the file-modifying tools
 * (`edit_file`, `write_file`, `apply_patch`, `multi_edit`).
 *
 * @param {string} name - Tool name from a tool call.
 * @returns {boolean} True for an exact, case-sensitive match.
 */
function isEditTool(name) {
  switch (name) {
    case 'edit_file':
    case 'write_file':
    case 'apply_patch':
    case 'multi_edit':
      return true;
    default:
      return false;
  }
}
275
+
276
/**
 * Decide whether a tool call was a shell command that looks like a test run
 * and did not fail.
 *
 * A call qualifies when its name is `run_shell`, its `input.command` contains
 * one of the test keywords as a whole word, and the result carries neither a
 * truthy `error` nor a truthy `exitCode`.
 *
 * NOTE(review): the keyword match is intentionally broad — any command that
 * contains the word "test" or "spec" qualifies, not only test-runner
 * invocations. Confirm this is acceptable for callers that stop work once a
 * run is considered verified.
 *
 * @param {?{name: string, input?: {command?: string}}} call - Tool call.
 * @param {?{error?: *, exitCode?: *}} result - Tool result, if any.
 * @returns {boolean}
 */
function isSuccessfulTestCommand(call, result) {
  const isShellCall = call?.name === 'run_shell';
  if (!isShellCall) {
    return false;
  }

  const shellCommand = String(call.input?.command || '');
  const looksLikeTestRun = /\b(test|spec|jest|mocha|pytest|npm\s+test|node\s+test\.js)\b/i.test(shellCommand);

  // A missing result counts as success; only an explicit error or a
  // non-zero exit code disqualifies the run.
  return looksLikeTestRun && !(result?.error || result?.exitCode);
}
283
+
265
284
  module.exports = {
266
285
  StreamProcessor,
267
286
  streamFromChat,
287
+ isSuccessfulTestCommand,
268
288
  };
@@ -750,7 +750,7 @@ async function runAgentLoop(prompt, opts = {}) {
750
750
 
751
751
  const mw = opts.middleware || (() => {
752
752
  const m = new CodingMiddleware();
753
- registerBuiltinMiddleware(m, { cwd, provider: llm?.type, model: modelId, claudeMd: opts.claudeMd, mode: opts.mode, taskEnv: opts.env });
753
+ registerBuiltinMiddleware(m, { cwd, provider: llm?.type, model: modelId, claudeMd: opts.claudeMd, mode: opts.mode, taskEnv: opts.env, benchmark: opts.benchmark });
754
754
  return m;
755
755
  })();
756
756
  const events = opts.events || new CodingEvents();
@@ -812,6 +812,7 @@ async function runAgentLoop(prompt, opts = {}) {
812
812
  const questionManager = opts.questionManager || new QuestionManager(events);
813
813
 
814
814
  // projectInfo already detected above (before system prompt)
815
+ const llmCtxRef = { current: null }; // populated each turn (see llmCtx below)
815
816
 
816
817
  // Stream-native runtime: model deltas, tool states, snapshots, permissions,
817
818
  // and step boundaries are persisted as typed transcript parts while the loop
@@ -835,9 +836,15 @@ async function runAgentLoop(prompt, opts = {}) {
835
836
  if (call.name === 'list_directory' && input.directory && !path.isAbsolute(input.directory)) {
836
837
  input.directory = path.join(resolvedCwd, input.directory);
837
838
  }
839
+ if (call.name === 'run_shell' && !input.cwd) {
840
+ input.cwd = resolvedCwd;
841
+ }
838
842
  input.sessionId = sid;
839
843
  input.projectRoot = resolvedCwd;
840
- return toolRegistry.execute(call.name, input, { sessionId: sid, cwd: resolvedCwd, model: modelId, provider: llm.type });
844
+ const toolCtx = { sessionId: sid, cwd: resolvedCwd, model: modelId, provider: llm.type, runtimeMode: runtimeMode.id };
845
+ const finalInput = await mw.run('tool.before', toolCtx, call.name, input);
846
+ const result = await toolRegistry.execute(call.name, finalInput, toolCtx);
847
+ return mw.run('tool.after', toolCtx, call.name, finalInput, result);
841
848
  },
842
849
  });
843
850
  processor.on('event', (evt) => emitProgress({
@@ -851,6 +858,7 @@ async function runAgentLoop(prompt, opts = {}) {
851
858
  let streamStopReason = '';
852
859
  let streamModel = modelId;
853
860
  const streamErrors = [];
861
+ let streamHadEdit = false;
854
862
  for (let turnIndex = opts._resumeTurn || 0; turnIndex < turns; turnIndex++) {
855
863
  const remaining = deadline - Date.now();
856
864
  if (remaining <= 0) {
@@ -878,14 +886,24 @@ async function runAgentLoop(prompt, opts = {}) {
878
886
  runtimeMode: runtimeMode.id,
879
887
  cwd: resolvedCwd,
880
888
  });
889
+ const llmCtx = { params: { maxTokens: taskFileHints.length >= 4 ? 8192 : 4096 }, system: systemPrompt, cwd: resolvedCwd,
890
+ provider: llm.type, model: modelId, mode: opts.mode, runtimeMode: runtimeMode.id, claudeMd: opts.claudeMd, log: {},
891
+ toolsAvailable: toolsForTurn.length > 0 };
892
+ llmCtxRef.current = llmCtx;
893
+ await mw.run('llm.before', llmCtx);
881
894
  turn = await processor.runTurn({
882
895
  sessionId: sid,
883
896
  cwd: resolvedCwd,
884
- system: systemPrompt,
897
+ system: llmCtx.system,
885
898
  messages,
886
899
  tools: toolsForTurn,
887
900
  maxTokens: taskFileHints.length >= 4 ? 8192 : 4096,
888
901
  signal: ac.signal,
902
+ maxTokens: llmCtx.params.maxTokens,
903
+ temperature: llmCtx.params.temperature,
904
+ thinking: llmCtx.params.thinking,
905
+ reasoningEffort: llmCtx.params.reasoningEffort,
906
+ options: llmCtx.params.options,
889
907
  });
890
908
  } finally {
891
909
  clearTimeout(timer);
@@ -911,6 +929,7 @@ async function runAgentLoop(prompt, opts = {}) {
911
929
  content: turn.text,
912
930
  stopReason: turn.stopReason,
913
931
  });
932
+ if (turn.hadEdit) streamHadEdit = true;
914
933
 
915
934
  if (turn.status === 'error') break;
916
935
  if ((turn.toolCalls || []).length === 0) {
@@ -931,6 +950,7 @@ async function runAgentLoop(prompt, opts = {}) {
931
950
  }
932
951
  if (turn.assistantMessage) messages.push(turn.assistantMessage);
933
952
  if (turn.toolResultMessage) messages.push(turn.toolResultMessage);
953
+ if (turn.verified && streamHadEdit) break;
934
954
  if (turn.next !== 'continue') break;
935
955
  }
936
956
 
@@ -971,7 +991,6 @@ async function runAgentLoop(prompt, opts = {}) {
971
991
  // ── Bridge: event bus → middleware (A2) ──
972
992
  // When the event bus fires, propagate to middleware's onEvent hook so
973
993
  // registered middleware can react to file edits, reads, and context overflow.
974
- const llmCtxRef = { current: null }; // populated each turn (see llmCtx below)
975
994
  const _bridgeHandlers = {};
976
995
  for (const evtType of ['file.edited', 'file.read', 'context.overflow']) {
977
996
  const handler = (data) => {
@@ -1073,8 +1092,10 @@ async function runAgentLoop(prompt, opts = {}) {
1073
1092
  const timer = setTimeout(() => ac.abort(), Math.min(remaining, perTurnCap));
1074
1093
 
1075
1094
  // Middleware: prepare LLM call
1095
+ const turnsRemaining = turns - turn;
1076
1096
  const llmCtx = { params: { maxTokens: taskFileHints.length >= 4 ? 8192 : 4096 }, system: systemPrompt, cwd: resolvedCwd,
1077
- provider: llm.type, model: modelId, mode: opts.mode, runtimeMode: runtimeMode.id, claudeMd: opts.claudeMd, log: {} };
1097
+ provider: llm.type, model: modelId, mode: opts.mode, runtimeMode: runtimeMode.id, claudeMd: opts.claudeMd, log: {},
1098
+ toolsAvailable: turnsRemaining > 1 };
1078
1099
  llmCtxRef.current = llmCtx; // expose to event bridge (A2)
1079
1100
  await mw.run('llm.before', llmCtx);
1080
1101
  let adaptedTools = await toolRegistry.getDefinitions(llmCtx);
@@ -1136,7 +1157,6 @@ async function runAgentLoop(prompt, opts = {}) {
1136
1157
  // Graceful max-steps degradation (6n)
1137
1158
  // Note: warnings are appended to the LAST message's content (not as separate
1138
1159
  // user messages) to avoid consecutive user messages which the API rejects.
1139
- const turnsRemaining = turns - turn;
1140
1160
  if (turnsRemaining <= 1) {
1141
1161
  // Final turn: disable tools, force structured summary
1142
1162
  adaptedTools = [];
@@ -100,10 +100,13 @@ async function runAgentBenchmark(benchmark, options = {}) {
100
100
  }
101
101
 
102
102
  // Run the agent loop with hard timeout safety net
103
- const effectiveTimeout = timeoutMs || (expectations.maxTurns || 20) * 30000;
103
+ const maxTurns = expectations.maxTurns || 20;
104
+ const turnBudgetTimeout = maxTurns * 30000;
105
+ const effectiveTimeout = Math.min(timeoutMs || turnBudgetTimeout, turnBudgetTimeout);
104
106
  const agentPromise = runAgentLoop(benchmark.prompt, {
105
107
  cwd: sandboxDir,
106
108
  timeoutMs: effectiveTimeout,
109
+ maxTurns,
107
110
  provider,
108
111
  model,
109
112
  mode: 'build',
@@ -156,6 +159,10 @@ async function runAgentBenchmark(benchmark, options = {}) {
156
159
  const inputTokens = usage.inputTokens ?? usage.input ?? 0;
157
160
  const expectedFileChanges = expectations.expectedFileChanges || [];
158
161
  const missingExpectedWork = expectedFileChanges.length > 0 && actualFileChanges.length === 0;
162
+ const attemptedFileChange = actualToolCalls.some((call) => {
163
+ const name = typeof call === 'string' ? call : call?.name;
164
+ return /edit|write|patch|create|delete|modify/i.test(String(name || ''));
165
+ });
159
166
  const testRegression = (expectations.testCommand && testsPassed === false);
160
167
  const rawError = result.stderr || result.error || null;
161
168
  const validatedByTests = Boolean(
@@ -199,7 +206,7 @@ async function runAgentBenchmark(benchmark, options = {}) {
199
206
  : testRegression
200
207
  ? 'tests_failed'
201
208
  : missingExpectedWork
202
- ? 'no_file_changes'
209
+ ? attemptedFileChange ? 'missing_expected_changes' : 'no_file_changes'
203
210
  : 'no_effort' },
204
211
  };
205
212
  }
@@ -296,6 +303,10 @@ function scoreAgentResult(benchmark, actual) {
296
303
  });
297
304
  }
298
305
 
306
+ function isTrustedAgentResult(result = {}) {
307
+ return result.success === true && !result.error && result.testsPassed === true;
308
+ }
309
+
299
310
  /**
300
311
  * Run a multi-turn benchmark — sends each turn's prompt sequentially,
301
312
  * accumulating conversation context. Scores after the final turn.
@@ -507,7 +518,7 @@ async function runAgentBenchmarkSuite(options = {}) {
507
518
  outputTokens: result.outputTokens ?? null,
508
519
  scorerVersion: DEFAULT_SCORER_VERSION,
509
520
  scoringMethod,
510
- trusted: !result.error && result.testsPassed === true,
521
+ trusted: isTrustedAgentResult(result),
511
522
  runConfig: { timeoutMs, scoringMethod },
512
523
  }, {
513
524
  suite: 'coding-agent',
@@ -517,7 +528,7 @@ async function runAgentBenchmarkSuite(options = {}) {
517
528
  model: resolveModelName(model),
518
529
  scoringMethod,
519
530
  scorerVersion: DEFAULT_SCORER_VERSION,
520
- trusted: !result.error && result.testsPassed === true,
531
+ trusted: isTrustedAgentResult(result),
521
532
  runConfig: { timeoutMs, scoringMethod },
522
533
  }));
523
534
  } catch { /* non-fatal */ }
@@ -666,6 +677,7 @@ module.exports = {
666
677
  runMultiTurnBenchmark,
667
678
  runAgentBenchmarkSuite,
668
679
  scoreAgentResult,
680
+ isTrustedAgentResult,
669
681
  extractToolCalls,
670
682
  extractToolCallDetails,
671
683
  countTests,