create-walle 0.9.13 → 0.9.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. package/README.md +8 -3
  2. package/bin/create-walle.js +232 -32
  3. package/bin/mcp-inject.js +18 -53
  4. package/package.json +3 -1
  5. package/template/claude-task-manager/api-prompts.js +11 -2
  6. package/template/claude-task-manager/approval-agent.js +7 -0
  7. package/template/claude-task-manager/db.js +94 -75
  8. package/template/claude-task-manager/docs/session-standup-command-center-design.md +242 -0
  9. package/template/claude-task-manager/docs/session-tooltip-freshness-design.md +224 -0
  10. package/template/claude-task-manager/docs/session-ux-issue-review-2026-05-01.md +369 -0
  11. package/template/claude-task-manager/fuzzy-utils.js +10 -2
  12. package/template/claude-task-manager/git-utils.js +140 -10
  13. package/template/claude-task-manager/lib/agent-capabilities.js +1 -1
  14. package/template/claude-task-manager/lib/agent-presets.js +38 -5
  15. package/template/claude-task-manager/lib/codex-terminal-final.js +53 -0
  16. package/template/claude-task-manager/lib/ctm-session-context-api.js +222 -0
  17. package/template/claude-task-manager/lib/session-diagnostics.js +56 -0
  18. package/template/claude-task-manager/lib/session-history.js +309 -16
  19. package/template/claude-task-manager/lib/session-standup.js +409 -0
  20. package/template/claude-task-manager/lib/session-stream.js +253 -20
  21. package/template/claude-task-manager/lib/standup-attention.js +200 -0
  22. package/template/claude-task-manager/lib/status-hooks.js +8 -2
  23. package/template/claude-task-manager/lib/update-telemetry.js +114 -0
  24. package/template/claude-task-manager/lib/walle-ctm-history.js +49 -6
  25. package/template/claude-task-manager/lib/walle-default-model.js +55 -0
  26. package/template/claude-task-manager/lib/walle-mcp-auto-config.js +66 -0
  27. package/template/claude-task-manager/lib/walle-supervisor.js +86 -19
  28. package/template/claude-task-manager/lib/walle-transcript.js +1 -3
  29. package/template/claude-task-manager/lib/worktree-cwd.js +82 -0
  30. package/template/claude-task-manager/package.json +1 -0
  31. package/template/claude-task-manager/providers/codex-mcp.js +104 -0
  32. package/template/claude-task-manager/providers/index.js +2 -0
  33. package/template/claude-task-manager/public/css/setup.css +2 -1
  34. package/template/claude-task-manager/public/css/walle.css +71 -0
  35. package/template/claude-task-manager/public/index.html +2388 -429
  36. package/template/claude-task-manager/public/js/message-renderer.js +314 -35
  37. package/template/claude-task-manager/public/js/session-search-utils.js +185 -3
  38. package/template/claude-task-manager/public/js/session-status-precedence.js +125 -0
  39. package/template/claude-task-manager/public/js/setup.js +62 -19
  40. package/template/claude-task-manager/public/js/stream-view.js +396 -55
  41. package/template/claude-task-manager/public/js/terminal-restore-state.js +57 -0
  42. package/template/claude-task-manager/public/js/walle-session.js +234 -26
  43. package/template/claude-task-manager/public/js/walle.js +143 -2
  44. package/template/claude-task-manager/server.js +1402 -433
  45. package/template/claude-task-manager/session-integrity.js +77 -28
  46. package/template/claude-task-manager/workers/approval-widget-validator.js +15 -5
  47. package/template/claude-task-manager/workers/scrollback-worker.js +5 -6
  48. package/template/claude-task-manager/workers/state-detectors/codex.js +6 -0
  49. package/template/package.json +1 -1
  50. package/template/wall-e/agent-runners/claude-code.js +2 -0
  51. package/template/wall-e/agent.js +63 -8
  52. package/template/wall-e/api-walle.js +330 -52
  53. package/template/wall-e/brain.js +291 -42
  54. package/template/wall-e/chat.js +172 -15
  55. package/template/wall-e/coding/compaction-service.js +19 -5
  56. package/template/wall-e/coding/stream-processor.js +22 -2
  57. package/template/wall-e/coding/workspace-replay.js +1 -4
  58. package/template/wall-e/coding-orchestrator.js +250 -80
  59. package/template/wall-e/compat.js +0 -28
  60. package/template/wall-e/context/context-builder.js +3 -1
  61. package/template/wall-e/embeddings.js +2 -7
  62. package/template/wall-e/eval/agent-runner.js +30 -9
  63. package/template/wall-e/eval/benchmark-generator.js +21 -1
  64. package/template/wall-e/eval/benchmarks/chat-eval.json +66 -6
  65. package/template/wall-e/eval/benchmarks/coding-agent.json +0 -596
  66. package/template/wall-e/eval/cc-replay.js +1 -0
  67. package/template/wall-e/eval/codex-cli-baseline.js +633 -0
  68. package/template/wall-e/eval/debug-agent003.js +1 -0
  69. package/template/wall-e/eval/eval-orchestrator.js +3 -3
  70. package/template/wall-e/eval/run-agent-benchmarks.js +11 -3
  71. package/template/wall-e/eval/run-codex-cli-baseline.js +177 -0
  72. package/template/wall-e/eval/run-model-comparison.js +1 -0
  73. package/template/wall-e/eval/swebench-adapter.js +1 -0
  74. package/template/wall-e/evaluation/quorum-evaluator.js +0 -1
  75. package/template/wall-e/extraction/knowledge-extractor.js +1 -2
  76. package/template/wall-e/lib/mcp-integration.js +336 -0
  77. package/template/wall-e/llm/ollama.js +47 -8
  78. package/template/wall-e/llm/ollama.plugin.json +1 -1
  79. package/template/wall-e/llm/tool-adapter.js +1 -0
  80. package/template/wall-e/loops/ingest.js +42 -8
  81. package/template/wall-e/loops/initiative.js +87 -2
  82. package/template/wall-e/mcp-server.js +872 -19
  83. package/template/wall-e/memory/ctm-context-client.js +230 -0
  84. package/template/wall-e/memory/ctm-session-context.js +1376 -0
  85. package/template/wall-e/prompts/coding/memory-protocol.md +6 -0
  86. package/template/wall-e/server.js +30 -1
  87. package/template/wall-e/skills/_bundled/memory-search/SKILL.md +8 -0
  88. package/template/wall-e/skills/_bundled/scan-ctm-sessions/SKILL.md +20 -0
  89. package/template/wall-e/skills/_bundled/scan-ctm-sessions/run.js +43 -0
  90. package/template/wall-e/skills/_bundled/slack-mentions/run.js +471 -188
  91. package/template/wall-e/skills/skill-planner.js +86 -4
  92. package/template/wall-e/slack/socket-mode-listener.js +276 -0
  93. package/template/wall-e/telemetry.js +70 -2
  94. package/template/wall-e/tools/builtin-middleware.js +55 -2
  95. package/template/wall-e/tools/shell-policy.js +1 -1
  96. package/template/wall-e/tools/slack-owner.js +104 -0
  97. package/template/website/index.html +4 -4
  98. package/template/builder-journal.md +0 -17
@@ -0,0 +1,633 @@
1
+ 'use strict';
2
+
3
+ const crypto = require('crypto');
4
+ const fs = require('fs');
5
+ const os = require('os');
6
+ const path = require('path');
7
+ const { execFile, execFileSync, spawn } = require('child_process');
8
+ const { promisify } = require('util');
9
+
10
+ const execFileAsync = promisify(execFile);
11
+
12
+ const {
13
+ cleanupSandbox,
14
+ countTests,
15
+ scoreAgentResult,
16
+ setupSandbox,
17
+ } = require('./agent-runner');
18
+ const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
19
+
20
+ const DEFAULT_RESULTS_DIR = path.join(os.homedir(), '.walle', 'eval-results', 'codex-cli-baseline');
21
+ const DEFAULT_TIMEOUT_MS = 600_000;
22
+ const ALLOWED_TEST_COMMANDS = /^(npm test|node test\.js|pytest|python -m pytest|make test)$/;
23
+
24
/**
 * Compose the prompt text for one benchmark run.
 *
 * The benchmark's own prompt (when present) comes first, followed by two fixed
 * baseline instructions and then optional hints derived from
 * `benchmark.agentExpectations`. Lines are joined with a single newline; no
 * blank separator line is emitted.
 *
 * @param {object} [benchmark] - Benchmark definition; `prompt` and
 *   `agentExpectations.{testCommand, expectedFileChanges}` are read.
 * @returns {string} Newline-joined prompt.
 */
function buildCodexPrompt(benchmark = {}) {
  const { testCommand, expectedFileChanges } = benchmark.agentExpectations || {};
  const lines = [];

  // An empty/missing benchmark prompt contributes no line at all.
  if (benchmark.prompt) lines.push(benchmark.prompt);

  lines.push(
    'You are running as an external Codex CLI baseline for a Wall-E coding-agent benchmark.',
    'Work only in this repository. Do not commit changes. Make the requested code changes directly.',
  );

  if (testCommand) {
    lines.push(`After editing, run this validation command if practical: ${testCommand}`);
  }

  const hasExpectedFiles = Array.isArray(expectedFileChanges) && expectedFileChanges.length > 0;
  if (hasExpectedFiles) {
    lines.push(`Expected changed file(s), when appropriate: ${expectedFileChanges.join(', ')}`);
  }

  return lines.join('\n');
}
40
+
41
/**
 * Build the argv array (excluding the binary name) for a `codex exec` call.
 *
 * Argument order: fixed flags, output file, working directory, optional model,
 * optional `--json`, one `-c` pair per non-empty config override, at most one
 * approval/sandbox mode flag, and finally the prompt (empty string if absent).
 *
 * Mode precedence: `dangerouslyBypassSandbox` wins over `fullAuto`, which wins
 * over an explicit `sandbox` value.
 *
 * @param {object} [options]
 * @param {string} [options.prompt]
 * @param {string} options.cwd - Required working directory.
 * @param {string} [options.model]
 * @param {string} options.outFile - Required path for the last-message file.
 * @param {boolean} [options.fullAuto=true]
 * @param {?string} [options.sandbox=null]
 * @param {boolean} [options.json=true]
 * @param {boolean} [options.dangerouslyBypassSandbox=false]
 * @param {?Iterable<string>} [options.configOverrides=[]]
 * @returns {string[]} Arguments to pass to the codex binary.
 * @throws {Error} When `cwd` or `outFile` is missing.
 */
function buildCodexExecArgs({
  prompt,
  cwd,
  model,
  outFile,
  fullAuto = true,
  sandbox = null,
  json = true,
  dangerouslyBypassSandbox = false,
  configOverrides = [],
} = {}) {
  if (!cwd) throw new Error('cwd is required');
  if (!outFile) throw new Error('outFile is required');

  const modelArgs = model ? ['-m', model] : [];
  const jsonArgs = json ? ['--json'] : [];

  const overrideArgs = [];
  for (const entry of [...(configOverrides || [])]) {
    if (entry) overrideArgs.push('-c', entry);
  }

  let modeArgs = [];
  if (dangerouslyBypassSandbox) {
    modeArgs = ['--dangerously-bypass-approvals-and-sandbox'];
  } else if (fullAuto) {
    modeArgs = ['--full-auto'];
  } else if (sandbox) {
    modeArgs = ['--sandbox', sandbox];
  }

  return [
    'exec',
    '--skip-git-repo-check',
    '--ephemeral',
    '--output-last-message', outFile,
    '-C', cwd,
    ...modelArgs,
    ...jsonArgs,
    ...overrideArgs,
    ...modeArgs,
    prompt || '',
  ];
}
76
+
77
/**
 * Derive the environment for the codex child process.
 *
 * Works on a shallow copy so the supplied environment is never mutated.
 * `OPENAI_API_KEY` is removed unless `useEnvOpenAIKey` is set, and
 * `OTEL_SDK_DISABLED` is set to `'true'` unless it already has a non-empty value.
 *
 * @param {object} [options]
 * @param {boolean} [options.useEnvOpenAIKey=false] - Keep `OPENAI_API_KEY` in the copy.
 * @param {object} [options.env=process.env] - Base environment.
 * @returns {object} New environment object.
 */
function codexEnv({ useEnvOpenAIKey = false, env = process.env } = {}) {
  const childEnv = Object.assign({}, env);
  if (!useEnvOpenAIKey) {
    delete childEnv.OPENAI_API_KEY;
  }
  childEnv.OTEL_SDK_DISABLED ||= 'true';
  return childEnv;
}
83
+
84
/**
 * Run `codex exec` once and collect its output plus parsed telemetry.
 *
 * A private temp directory holds the `--output-last-message` file. Everything
 * after the directory is created runs inside `try/finally`, so the directory is
 * removed even when argument construction throws (for example a missing `cwd`).
 *
 * @param {object} [options]
 * @param {string} [options.prompt]
 * @param {string} options.cwd - Working directory for codex (required by the arg builder).
 * @param {string} [options.model]
 * @param {number} [options.timeoutMs=DEFAULT_TIMEOUT_MS]
 * @param {boolean} [options.useEnvOpenAIKey=false]
 * @param {boolean} [options.fullAuto=true]
 * @param {?string} [options.sandbox=null]
 * @param {boolean} [options.dangerouslyBypassSandbox=false]
 * @param {boolean} [options.json=true] - Selects JSON-event parsing vs formatted-output parsing.
 * @param {boolean} [options.allowMcp=false]
 * @param {?string[]} [options.disableMcpServers=null]
 * @param {?string[]} [options.configOverrides=[]] - `null` is treated as empty.
 * @returns {Promise<object>} Spawn result merged with `args`, parsed telemetry,
 *   `output` (last message, falling back to stdout) and `latencyMs`.
 */
async function runCodexExec({
  prompt,
  cwd,
  model,
  timeoutMs = DEFAULT_TIMEOUT_MS,
  useEnvOpenAIKey = false,
  fullAuto = true,
  sandbox = null,
  dangerouslyBypassSandbox = false,
  json = true,
  allowMcp = false,
  disableMcpServers = null,
  configOverrides = [],
} = {}) {
  const tmpDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'walle-codex-baseline-'));

  try {
    const outFile = path.join(tmpDir, 'last-message.txt');
    const effectiveConfigOverrides = [
      ...(configOverrides || []),
      ...buildMcpDisableConfigOverrides({ allowMcp, disableMcpServers }),
    ];
    const args = buildCodexExecArgs({
      prompt,
      cwd,
      model,
      outFile,
      fullAuto,
      sandbox,
      json,
      dangerouslyBypassSandbox,
      configOverrides: effectiveConfigOverrides,
    });

    // Latency covers only the codex process itself, not argument preparation.
    const started = Date.now();
    const result = await spawnCodex(args, {
      cwd,
      env: codexEnv({ useEnvOpenAIKey }),
      timeoutMs,
    });

    let output = '';
    try {
      output = await fs.promises.readFile(outFile, 'utf8');
    } catch {
      // Best effort: the file may not exist; stdout is the fallback below.
    }

    const telemetry = json
      ? parseCodexJsonEvents(result.stdout)
      : parseCodexFormattedOutput(result.stderr || result.stdout);

    return {
      ...result,
      args,
      ...telemetry,
      output: output || result.stdout || '',
      latencyMs: Date.now() - started,
    };
  } finally {
    await fs.promises.rm(tmpDir, { recursive: true, force: true });
  }
}
137
+
138
/**
 * Spawn the `codex` binary and buffer its stdout/stderr as text.
 *
 * On timeout the child receives SIGTERM; if it has still not closed after a
 * short grace period it receives SIGKILL, so the returned promise cannot hang
 * on a child that ignores SIGTERM. Streams are decoded via `setEncoding` so a
 * multi-byte UTF-8 character split across two chunks is not corrupted.
 *
 * @param {string[]} args - Arguments for the codex binary.
 * @param {object} [options]
 * @param {string} [options.cwd]
 * @param {object} [options.env]
 * @param {number} [options.timeoutMs] - Falls back to DEFAULT_TIMEOUT_MS when falsy.
 * @returns {Promise<{code: ?number, signal: ?string, timedOut: boolean, stdout: string, stderr: string}>}
 *   Resolves when the child closes; rejects if spawning fails.
 */
function spawnCodex(args, { cwd, env, timeoutMs } = {}) {
  const KILL_GRACE_MS = 5_000;

  return new Promise((resolve, reject) => {
    let proc;
    try {
      proc = spawn('codex', args, {
        cwd,
        env,
        stdio: ['ignore', 'pipe', 'pipe'],
      });
    } catch (err) {
      reject(err);
      return;
    }

    let stdout = '';
    let stderr = '';
    let timedOut = false;
    let killTimer = null;

    const timer = setTimeout(() => {
      timedOut = true;
      try { proc.kill('SIGTERM'); } catch {}
      // Escalate if the child does not exit after the polite signal.
      killTimer = setTimeout(() => {
        try { proc.kill('SIGKILL'); } catch {}
      }, KILL_GRACE_MS);
      if (typeof killTimer.unref === 'function') killTimer.unref();
    }, timeoutMs || DEFAULT_TIMEOUT_MS);
    if (typeof timer.unref === 'function') timer.unref();

    const clearTimers = () => {
      clearTimeout(timer);
      if (killTimer) clearTimeout(killTimer);
    };

    proc.stdout.setEncoding('utf8');
    proc.stderr.setEncoding('utf8');
    proc.stdout.on('data', (chunk) => { stdout += chunk; });
    proc.stderr.on('data', (chunk) => { stderr += chunk; });

    proc.on('error', (err) => {
      clearTimers();
      reject(err);
    });
    proc.on('close', (code, signal) => {
      clearTimers();
      resolve({
        code,
        signal,
        timedOut,
        stdout,
        stderr,
      });
    });
  });
}
179
+
180
/**
 * List files that differ from HEAD plus untracked (non-ignored) files in `dir`.
 *
 * Runs `git diff --name-only HEAD` and then
 * `git ls-files --others --exclude-standard`. Any failure of either command
 * yields an empty list.
 *
 * @param {string} dir - Directory in which the git commands are run.
 * @returns {Promise<string[]>} Non-empty path strings, tracked changes first.
 */
async function getModifiedFiles(dir) {
  const listPaths = async (gitArgs) => {
    const { stdout } = await execFileAsync('git', gitArgs, { cwd: dir });
    return stdout.trim().split('\n');
  };

  try {
    const changed = await listPaths(['diff', '--name-only', 'HEAD']);
    const untracked = await listPaths(['ls-files', '--others', '--exclude-standard']);
    return changed.concat(untracked).filter(Boolean);
  } catch {
    return [];
  }
}
189
+
190
/**
 * Check a test command against the ALLOWED_TEST_COMMANDS pattern.
 *
 * @param {?string} command
 * @returns {boolean} `false` for empty/missing commands, otherwise the pattern result.
 */
function testCommandAllowed(command) {
  if (!command) return false;
  return ALLOWED_TEST_COMMANDS.test(command);
}
193
+
194
/**
 * Produce `mcp_servers.<name>.enabled=false` override strings.
 *
 * Returns nothing when MCP is allowed. Otherwise uses the supplied server list,
 * or the discovered one when none is supplied. Names are de-duplicated and
 * falsy entries are dropped.
 *
 * @param {object} [options]
 * @param {boolean} [options.allowMcp=false]
 * @param {?Iterable<string>} [options.disableMcpServers=null]
 * @returns {string[]} Config override strings, in first-seen order.
 */
function buildMcpDisableConfigOverrides({ allowMcp = false, disableMcpServers = null } = {}) {
  if (allowMcp) return [];

  const candidates = disableMcpServers || discoverEnabledCodexMcpServers();
  const overrides = [];
  for (const serverName of new Set(candidates)) {
    if (!serverName) continue;
    overrides.push(`mcp_servers.${tomlPathSegment(serverName)}.enabled=false`);
  }
  return overrides;
}
201
+
202
/**
 * Run `codex mcp list` and return the server names it reports as enabled.
 *
 * The command is run synchronously with a 10 second timeout and its stderr
 * discarded. Any failure (spawn, timeout, parsing) yields an empty list.
 *
 * @returns {string[]} Enabled server names, or `[]` on failure.
 */
function discoverEnabledCodexMcpServers() {
  const execOptions = {
    encoding: 'utf8',
    stdio: ['ignore', 'pipe', 'ignore'],
    timeout: 10_000,
  };

  try {
    const listing = execFileSync('codex', ['mcp', 'list'], execOptions);
    return parseEnabledCodexMcpServers(listing);
  } catch {
    return [];
  }
}
214
+
215
/**
 * Extract server names from tabular listing text.
 *
 * A row counts when, after trimming, it is non-empty, does not begin with the
 * word `Name` or `WARNING`, and contains the standalone word `enabled`. The
 * name is the row's first whitespace-delimited token.
 *
 * @param {string} [output] - Raw listing text.
 * @returns {string[]} Names in the order they appear.
 */
function parseEnabledCodexMcpServers(output = '') {
  const isHeaderOrWarning = (row) => /^Name\b/.test(row) || /^WARNING\b/.test(row);

  return String(output || '')
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((row) => row && !isHeaderOrWarning(row) && /\benabled\b/.test(row))
    .map((row) => row.split(/\s+/)[0])
    .filter(Boolean);
}
226
+
227
/**
 * Render one segment of a dotted config key.
 *
 * Segments made only of letters, digits and underscores (not starting with a
 * digit) are returned unchanged; anything else is wrapped via `JSON.stringify`.
 *
 * @param {*} value - Segment value; falsy values become the empty string.
 * @returns {string} Bare or quoted segment.
 */
function tomlPathSegment(value) {
  const segment = String(value || '');
  if (/^[A-Za-z_][A-Za-z0-9_]*$/.test(segment)) {
    return segment;
  }
  return JSON.stringify(segment);
}
231
+
232
/**
 * Parse newline-delimited JSON events from codex stdout.
 *
 * Lines that are blank, not valid JSON, or that parse to something other than
 * an object (e.g. `null`, a number, a string) are skipped. For the rest:
 *  - `turn.completed` increments the turn count and adds its token usage;
 *  - `error` records a message;
 *  - `item.completed` with an `item` is classified into a tool call, when the
 *    classifier recognises it.
 *
 * Token counts that are not finite numbers are ignored rather than turning the
 * running totals into NaN.
 *
 * @param {string} [stdout] - Raw stdout text.
 * @returns {{codexEvents: object[], codexErrors: Array, toolCalls: string[],
 *   toolCallDetails: object[], turns: number,
 *   usage: {inputTokens: ?number, cachedInputTokens: ?number, outputTokens: ?number}}}
 *   Usage totals of zero are reported as `null`.
 */
function parseCodexJsonEvents(stdout = '') {
  const events = [];
  const toolCalls = [];
  const toolCallDetails = [];
  const errors = [];
  const usage = {
    inputTokens: 0,
    cachedInputTokens: 0,
    outputTokens: 0,
  };
  let turns = 0;

  // Coerce a token count, treating missing or non-numeric values as zero.
  const tokenCount = (value) => {
    const parsed = Number(value ?? 0);
    return Number.isFinite(parsed) ? parsed : 0;
  };

  for (const line of String(stdout || '').split(/\r?\n/)) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let event;
    try {
      event = JSON.parse(trimmed);
    } catch {
      continue;
    }
    // `JSON.parse('null')` succeeds; property access on it would throw.
    if (event === null || typeof event !== 'object') continue;
    events.push(event);

    if (event.type === 'turn.completed') {
      turns += 1;
      const eventUsage = event.usage || event.turn?.usage || {};
      usage.inputTokens += tokenCount(eventUsage.input_tokens ?? eventUsage.inputTokens);
      usage.cachedInputTokens += tokenCount(eventUsage.cached_input_tokens ?? eventUsage.cachedInputTokens);
      usage.outputTokens += tokenCount(eventUsage.output_tokens ?? eventUsage.outputTokens);
    }

    if (event.type === 'error') {
      errors.push(event.message || event.error?.message || event.error || 'codex json error');
    }

    if (event.type !== 'item.completed' || !event.item) continue;
    const detail = classifyCodexItem(event.item);
    if (!detail) continue;
    toolCalls.push(detail.name);
    toolCallDetails.push(detail);
  }

  return {
    codexEvents: events,
    codexErrors: errors,
    toolCalls,
    toolCallDetails,
    turns,
    usage: {
      inputTokens: usage.inputTokens || null,
      cachedInputTokens: usage.cachedInputTokens || null,
      outputTokens: usage.outputTokens || null,
    },
  };
}
287
+
288
/**
 * Parse human-formatted codex output (the non-JSON mode).
 *
 * Every line of the form `exec <command>` or `shell <command>` (case
 * insensitive, leading whitespace allowed) is classified as a tool call.
 * `turns` is 1 when at least one tool call was found, otherwise 0. Only the
 * output token count is populated, taken from the token-count parser.
 *
 * @param {string} [output] - Raw formatted output.
 * @returns {{toolCalls: string[], toolCallDetails: object[], turns: number,
 *   usage: {inputTokens: null, cachedInputTokens: null, outputTokens: ?number}}}
 */
function parseCodexFormattedOutput(output = '') {
  const toolCallDetails = String(output || '')
    .split(/\r?\n/)
    .map((line) => line.match(/^\s*(?:exec|shell)\s+(.+)$/i))
    .filter(Boolean)
    .map((found) => classifyCodexCommand(found[1]));
  const toolCalls = toolCallDetails.map((detail) => detail.name);

  return {
    toolCalls,
    toolCallDetails,
    turns: toolCalls.length ? 1 : 0,
    usage: {
      inputTokens: null,
      cachedInputTokens: null,
      outputTokens: parseCodexTokenCount(output),
    },
  };
}
309
+
310
/**
 * Map a completed codex item onto a tool-call descriptor.
 *
 * The item's `type` (or `item_type`) is lower-cased and matched in this order:
 * command/exec/shell items are delegated to the command classifier; then
 * types containing `file`, `mcp`, and `web_search`/`web-search` produce
 * `edit_file`, `mcp_tool_call` and `web_search` descriptors respectively.
 *
 * @param {object} [item] - Item payload from an `item.completed` event.
 * @returns {?object} Descriptor, or `null` when the item type is not recognised.
 */
function classifyCodexItem(item = {}) {
  const kind = String(item.type || item.item_type || '').toLowerCase();

  if (kind === 'exec' || kind === 'shell' || kind.includes('command')) {
    const commandText = item.command || item.cmd || item.text || item.arguments?.cmd || '';
    return classifyCodexCommand(commandText, item);
  }

  const source = 'codex-cli';
  const codexType = item.type || null;
  const status = item.status || null;

  if (kind.includes('file')) {
    const path = item.path || item.file || item.file_path || null;
    return { name: 'edit_file', source, codexType, path, status };
  }

  if (kind.includes('mcp')) {
    const tool = item.name || item.tool || null;
    return { name: 'mcp_tool_call', source, codexType, tool, status };
  }

  if (kind.includes('web_search') || kind.includes('web-search')) {
    return { name: 'web_search', source, codexType, status };
  }

  return null;
}
343
+
344
/**
 * Classify a shell command string into a coarse tool name.
 *
 * Checks are applied to the lower-cased command in priority order:
 *  1. known test invocations            -> `run_shell`
 *  2. patch-style writes or redirection -> `edit_file`
 *  3. `rg` / `grep`                     -> `grep_files`
 *  4. `find` / `ls` / `tree`            -> `list_directory`
 *  5. `sed -n`, `cat`, `head`, ...      -> `read_file`
 * Anything else is `run_shell`. These are keyword heuristics, not a shell parse.
 *
 * @param {string} [command] - Raw command text.
 * @param {object} [item] - Originating item; `type` and `status` are copied through.
 * @returns {{name: string, source: string, codexType: string, command: string, status: ?string}}
 */
function classifyCodexCommand(command = '', item = {}) {
  const raw = String(command || '');
  const lowered = raw.toLowerCase();

  const pickName = () => {
    if (/\b(npm\s+test|node\s+test\.js|pytest|python\s+-m\s+pytest|make\s+test)\b/.test(lowered)) {
      return 'run_shell';
    }
    const patchLike = /\b(apply_patch|git\s+apply|perl\s+-0?pi|python[^\n]*write_text|writefilesync)\b/.test(lowered);
    const writesViaRedirection = /\b(tee|cat|printf|echo)\b[\s\S]*[>]/.test(lowered);
    if (patchLike || writesViaRedirection) return 'edit_file';
    if (/\b(rg|grep)\b/.test(lowered)) return 'grep_files';
    if (/\b(find|ls|tree)\b/.test(lowered)) return 'list_directory';
    if (/\b(sed\s+-n|cat|head|tail|awk|nl)\b/.test(lowered)) return 'read_file';
    return 'run_shell';
  };

  return {
    name: pickName(),
    source: 'codex-cli',
    codexType: item.type || 'command_execution',
    command: raw,
    status: item.status || null,
  };
}
369
+
370
/**
 * Run one benchmark through the Codex CLI baseline and score the outcome.
 *
 * Steps, in order: create a sandbox from the benchmark's fixture; if the test
 * command is allow-listed, count tests before the run; optionally return early
 * for a dry run; execute codex; collect modified files; re-run and re-count the
 * tests; score the result. A run is successful only when codex exited cleanly
 * without reported errors, at least one file changed when changes were
 * expected, and the allow-listed tests did not fail. Unsuccessful runs get a
 * composite score of 0 with a `_zeroReason`.
 *
 * The sandbox is cleaned up unless `options.keepFailures` is set and the run
 * failed (or threw).
 *
 * @param {object} benchmark - Benchmark definition (`id`, `prompt`, `agentExpectations`).
 * @param {object} [options] - Run options (`model`, `timeoutMs`, `dryRun`,
 *   `keepFailures`, `runCodexExec` override, codex flag pass-throughs, ...).
 * @returns {Promise<object>} Result record; exceptions inside the run are
 *   reported in the record's `error` field rather than thrown.
 */
async function runCodexCliBaselineBenchmark(benchmark, options = {}) {
  const RUNNER = 'codex-cli-baseline';
  const expectations = benchmark.agentExpectations || {};
  const { testCommand } = expectations;
  const fixtureName = expectations.projectFixture || 'express-basic';
  const timeoutMs = options.timeoutMs || DEFAULT_TIMEOUT_MS;
  const started = Date.now();

  // Leading fields shared by every record this function returns.
  const identity = () => ({
    benchmarkId: benchmark.id,
    runner: RUNNER,
    provider: RUNNER,
    model: options.model || null,
  });

  let sandboxDir = null;
  let keepSandbox = false;

  try {
    sandboxDir = setupSandbox(fixtureName);
    const canValidate = testCommandAllowed(testCommand);

    let testsBefore = null;
    let totalTests = null;
    if (canValidate) {
      const baseline = countTests(sandboxDir, testCommand);
      testsBefore = baseline.passed;
      totalTests = baseline.total;
    }

    if (options.dryRun) {
      return {
        ...identity(),
        success: true,
        status: 'dry_run_ok',
        sandboxCreated: sandboxDir,
        latencyMs: Date.now() - started,
      };
    }

    const execCodex = options.runCodexExec || runCodexExec;
    const codex = await execCodex({
      prompt: buildCodexPrompt(benchmark),
      cwd: sandboxDir,
      model: options.model,
      timeoutMs,
      useEnvOpenAIKey: !!options.useEnvOpenAIKey,
      fullAuto: options.fullAuto !== false,
      sandbox: options.sandbox || null,
      dangerouslyBypassSandbox: !!options.dangerouslyBypassSandbox,
      json: options.json !== false,
      allowMcp: !!options.allowMcp,
      disableMcpServers: options.disableMcpServers || null,
      configOverrides: options.configOverrides || [],
    });

    const actualFileChanges = await getModifiedFiles(sandboxDir);

    let testsPassed = null;
    let testsAfter = null;
    if (canValidate) {
      try {
        execFileSync('sh', ['-c', testCommand], {
          cwd: sandboxDir,
          timeout: options.testTimeoutMs || 30_000,
          stdio: 'pipe',
        });
        testsPassed = true;
      } catch {
        testsPassed = false;
      }
      const afterCounts = countTests(sandboxDir, testCommand);
      testsAfter = afterCounts.passed;
      totalTests ??= afterCounts.total;
    }

    const expectedFileChanges = expectations.expectedFileChanges || [];
    const missingExpectedWork = expectedFileChanges.length > 0 && actualFileChanges.length === 0;
    const testRegression = !!(testCommand && testsPassed === false);

    const hasCodexErrors = Array.isArray(codex.codexErrors) && codex.codexErrors.length;
    const codexErrors = hasCodexErrors ? codex.codexErrors.join('; ') : null;

    let exitError;
    if (codex.timedOut) {
      exitError = 'codex exec timed out';
    } else if (codex.code === 0) {
      exitError = codexErrors;
    } else {
      exitError = `codex exec exited ${codex.code}${codex.signal ? ` (${codex.signal})` : ''}: ${(codex.stderr || codex.stdout || '').slice(-1000)}`;
    }

    const success = !exitError && !missingExpectedWork && !testRegression;
    const actualToolCalls = codex.toolCalls || [];
    const toolCallDetails = codex.toolCallDetails || [];
    const usage = codex.usage || {};
    const actualTurns = codex.turns || 1;
    const fullOutput = codex.output || codex.stdout || '';

    let score = scoreAgentResult(benchmark, {
      actualToolCalls,
      actualFileChanges,
      actualTurns,
      testsPassed,
      output: fullOutput,
      success,
      sandboxDir,
      costDollars: null,
      testsBefore,
      testsAfter,
      totalTests,
      toolCallDetails,
    });

    if (!success) {
      let zeroReason = 'no_file_changes';
      if (exitError) {
        zeroReason = 'codex_error';
      } else if (testRegression) {
        zeroReason = 'tests_failed';
      }
      score = {
        composite: 0,
        dimensions: {
          ...(score.dimensions || {}),
          _zeroed: true,
          _zeroReason: zeroReason,
        },
      };
    }

    keepSandbox = !!options.keepFailures && !success;
    return {
      ...identity(),
      success,
      score,
      latencyMs: codex.latencyMs || (Date.now() - started),
      actualToolCalls,
      actualFileChanges,
      actualTurns,
      testsPassed,
      testsBefore,
      testsAfter,
      totalTests,
      inputTokens: usage.inputTokens ?? null,
      outputTokens: usage.outputTokens ?? parseCodexTokenCount(codex.stdout),
      dimensionsJson: JSON.stringify(score.dimensions || {}),
      output: fullOutput.slice(0, 2000),
      stderr: filterNonFatalCodexStderr(codex.stderr || '').slice(0, 2000),
      error: exitError,
      sandboxDir: keepSandbox ? sandboxDir : null,
    };
  } catch (err) {
    keepSandbox = !!options.keepFailures;
    return {
      ...identity(),
      success: false,
      score: { composite: 0, dimensions: { _zeroed: true, _zeroReason: 'exception' } },
      latencyMs: Date.now() - started,
      error: err.message,
      sandboxDir: keepSandbox ? sandboxDir : null,
    };
  } finally {
    if (sandboxDir && !keepSandbox) cleanupSandbox(sandboxDir);
  }
}
523
+
524
/**
 * Pull the number that follows a `tokens used` line out of codex output.
 *
 * The count must be on the line after the (case-insensitive) label; thousands
 * separators (commas) are stripped before parsing.
 *
 * @param {string} [stdout] - Raw output text.
 * @returns {?number} Parsed count, or `null` when the label/number pair is absent.
 */
function parseCodexTokenCount(stdout = '') {
  const found = /tokens used\s*\n\s*([\d,]+)/i.exec(String(stdout || ''));
  if (!found) return null;
  const digits = found[1].replace(/,/g, '');
  return parseInt(digits, 10);
}
528
+
529
/**
 * Drop known-noise lines from codex stderr.
 *
 * Lines mentioning the stdin notice, `opentelemetry_sdk`, or
 * `BatchSpanProcessor` are removed; the rest are re-joined with `\n` and the
 * whole result is trimmed.
 *
 * @param {string} [stderr] - Raw stderr text.
 * @returns {string} Filtered, trimmed stderr.
 */
function filterNonFatalCodexStderr(stderr = '') {
  const noisePatterns = [
    /Reading additional input from stdin/,
    /opentelemetry_sdk/,
    /BatchSpanProcessor/,
  ];

  const kept = [];
  for (const line of String(stderr || '').split(/\r?\n/)) {
    const isNoise = noisePatterns.some((pattern) => pattern.test(line));
    if (!isNoise) kept.push(line);
  }
  return kept.join('\n').trim();
}
541
+
542
/**
 * Aggregate a list of benchmark results.
 *
 * Failures are bucketed by `score.dimensions._zeroReason`, falling back to the
 * result's `error`, then to `'unknown'`. Missing composite scores count as 0
 * in the average.
 *
 * @param {object[]} [results] - Result records.
 * @returns {{total: number, passed: number, failed: number, avgComposite: number,
 *   failures: Object<string, number>}}
 */
function summarizeBaselineResults(results = []) {
  const total = results.length;
  const failures = {};
  let passed = 0;
  let compositeSum = 0;

  for (const entry of results) {
    compositeSum += entry.score?.composite || 0;
    if (entry.success) {
      passed += 1;
      continue;
    }
    const reason = entry.score?.dimensions?._zeroReason || entry.error || 'unknown';
    failures[reason] = (failures[reason] || 0) + 1;
  }

  const avgComposite = total ? compositeSum / total : 0;
  return { total, passed, failed: total - passed, avgComposite, failures };
}
557
+
558
/**
 * Persist one result record as pretty-printed JSON.
 *
 * The file name is `<ISO timestamp with ':' and '.' replaced by '-'>-<safe id>.json`
 * inside `resultsDir`, which is created if needed.
 *
 * @param {object} result - Record to serialise; `benchmarkId` feeds the file name.
 * @param {object} [options]
 * @param {string} [options.resultsDir=DEFAULT_RESULTS_DIR]
 * @returns {string} Path of the written file.
 */
function writeBaselineArtifact(result, { resultsDir = DEFAULT_RESULTS_DIR } = {}) {
  fs.mkdirSync(resultsDir, { recursive: true });

  const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
  const label = safeId(result.benchmarkId || 'codex-baseline');
  const target = path.join(resultsDir, `${timestamp}-${label}.json`);

  fs.writeFileSync(target, `${JSON.stringify(result, null, 2)}\n`);
  return target;
}
565
+
566
/**
 * Record a baseline result through `brain.insertBenchmarkResult`.
 *
 * Does nothing when `brain` is missing or lacks `insertBenchmarkResult`.
 * The row is marked trusted only when the run succeeded, the tests passed, and
 * no error was recorded. Both token counts are forwarded from the result.
 *
 * @param {object} params
 * @param {object} params.brain - Storage object exposing `insertBenchmarkResult`.
 * @param {string} params.runId
 * @param {object} params.benchmark - Benchmark definition.
 * @param {object} params.result - Result record for the benchmark run.
 * @param {string} [params.model] - Defaults to `'codex-default'` in the stored row.
 * @param {number} [params.timeoutMs]
 * @returns {void}
 */
function storeBaselineResult({ brain, runId, benchmark, result, model, timeoutMs }) {
  if (!brain || typeof brain.insertBenchmarkResult !== 'function') return;

  const provider = 'codex-cli-baseline';
  const resolvedModel = model || 'codex-default';
  const scoringMethod = benchmark.agentExpectations?.testCommand
    ? 'codex-cli-baseline+tests'
    : 'codex-cli-baseline';
  const trusted = result.success === true && result.testsPassed === true && !result.error;
  // Fresh object per use so the row and the decoration context never share a reference.
  const buildRunConfig = () => ({ timeoutMs, scoringMethod, externalRunner: 'codex-cli' });

  brain.insertBenchmarkResult(decorateBenchmarkResult({
    runId,
    suite: 'coding-agent',
    promptId: benchmark.id,
    taskType: 'coding-agent',
    difficulty: benchmark.difficulty,
    provider,
    model: resolvedModel,
    prompt: benchmark.prompt,
    response: result.output || '',
    traitScore: null,
    matchedTraits: [],
    compositeScore: result.score?.composite || 0,
    latencyMs: result.latencyMs,
    error: result.error,
    timestamp: result.timestamp,
    costDollars: null,
    testsBefore: result.testsBefore ?? null,
    testsAfter: result.testsAfter ?? null,
    totalTests: result.totalTests ?? null,
    dimensionsJson: result.dimensionsJson || null,
    // Previously hard-coded to null, discarding the count carried by the result.
    inputTokens: result.inputTokens ?? null,
    outputTokens: result.outputTokens ?? null,
    scorerVersion: DEFAULT_SCORER_VERSION,
    scoringMethod,
    trusted,
    runConfig: buildRunConfig(),
  }, {
    suite: 'coding-agent',
    benchmark,
    runId,
    provider,
    model: resolvedModel,
    scoringMethod,
    scorerVersion: DEFAULT_SCORER_VERSION,
    trusted,
    runConfig: buildRunConfig(),
  }));
}
610
+
611
/**
 * Turn an arbitrary value into a short file-name-safe token.
 *
 * Runs of characters other than letters, digits, `_`, `.` and `-` collapse to
 * a single `-`; the result is capped at 80 characters. Falsy input becomes
 * `'unknown'`.
 *
 * @param {*} value
 * @returns {string}
 */
function safeId(value) {
  const text = String(value || 'unknown');
  const sanitized = text.replace(/[^a-z0-9_.-]+/gi, '-');
  return sanitized.slice(0, 80);
}
614
+
615
+ module.exports = {
616
+ DEFAULT_RESULTS_DIR,
617
+ buildCodexExecArgs,
618
+ buildMcpDisableConfigOverrides,
619
+ buildCodexPrompt,
620
+ codexEnv,
621
+ classifyCodexCommand,
622
+ classifyCodexItem,
623
+ discoverEnabledCodexMcpServers,
624
+ filterNonFatalCodexStderr,
625
+ parseEnabledCodexMcpServers,
626
+ parseCodexTokenCount,
627
+ parseCodexJsonEvents,
628
+ runCodexCliBaselineBenchmark,
629
+ runCodexExec,
630
+ summarizeBaselineResults,
631
+ writeBaselineArtifact,
632
+ storeBaselineResult,
633
+ };
@@ -31,6 +31,7 @@ if (!bench) { console.error('No benchmark:', benchId); process.exit(1); }
31
31
  cwd: dir,
32
32
  timeoutMs: 120000,
33
33
  mode: 'build',
34
+ persistTranscript: false,
34
35
  onProgress: (e) => {
35
36
  if (e.message) {
36
37
  const detail = e.detail ? ` ${JSON.stringify(e.detail).slice(0, 120)}` : '';
@@ -5,7 +5,7 @@ const path = require('path');
5
5
  const fs = require('fs');
6
6
 
7
7
  const { pLimit, getAvailableProviders } = require('./head-to-head');
8
- const { runAgentBenchmark, runMultiTurnBenchmark } = require('./agent-runner');
8
+ const { runAgentBenchmark, runMultiTurnBenchmark, isTrustedAgentResult } = require('./agent-runner');
9
9
  const { createClient } = require('../llm/client');
10
10
  const { createAnthropicFromEnv } = require('../llm/anthropic');
11
11
  const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
@@ -566,7 +566,7 @@ class EvalOrchestrator extends EventEmitter {
566
566
  outputTokens: result.outputTokens ?? null,
567
567
  scorerVersion: DEFAULT_SCORER_VERSION,
568
568
  scoringMethod,
569
- trusted: !result.error && result.testsPassed === true,
569
+ trusted: isTrustedAgentResult(result),
570
570
  runConfig: {
571
571
  timeoutMs: this.timeoutMs,
572
572
  concurrency: this.concurrency,
@@ -581,7 +581,7 @@ class EvalOrchestrator extends EventEmitter {
581
581
  model: item.model,
582
582
  scoringMethod,
583
583
  scorerVersion: DEFAULT_SCORER_VERSION,
584
- trusted: !result.error && result.testsPassed === true,
584
+ trusted: isTrustedAgentResult(result),
585
585
  runConfig: {
586
586
  timeoutMs: this.timeoutMs,
587
587
  concurrency: this.concurrency,