create-walle 0.9.13 → 0.9.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -3
- package/bin/create-walle.js +232 -32
- package/bin/mcp-inject.js +18 -53
- package/package.json +3 -1
- package/template/claude-task-manager/api-prompts.js +11 -2
- package/template/claude-task-manager/approval-agent.js +7 -0
- package/template/claude-task-manager/db.js +94 -75
- package/template/claude-task-manager/docs/session-standup-command-center-design.md +242 -0
- package/template/claude-task-manager/docs/session-tooltip-freshness-design.md +224 -0
- package/template/claude-task-manager/docs/session-ux-issue-review-2026-05-01.md +369 -0
- package/template/claude-task-manager/fuzzy-utils.js +10 -2
- package/template/claude-task-manager/git-utils.js +140 -10
- package/template/claude-task-manager/lib/agent-capabilities.js +1 -1
- package/template/claude-task-manager/lib/agent-presets.js +38 -5
- package/template/claude-task-manager/lib/codex-terminal-final.js +53 -0
- package/template/claude-task-manager/lib/ctm-session-context-api.js +222 -0
- package/template/claude-task-manager/lib/session-diagnostics.js +56 -0
- package/template/claude-task-manager/lib/session-history.js +309 -16
- package/template/claude-task-manager/lib/session-standup.js +409 -0
- package/template/claude-task-manager/lib/session-stream.js +253 -20
- package/template/claude-task-manager/lib/standup-attention.js +200 -0
- package/template/claude-task-manager/lib/status-hooks.js +8 -2
- package/template/claude-task-manager/lib/update-telemetry.js +114 -0
- package/template/claude-task-manager/lib/walle-ctm-history.js +49 -6
- package/template/claude-task-manager/lib/walle-default-model.js +55 -0
- package/template/claude-task-manager/lib/walle-mcp-auto-config.js +66 -0
- package/template/claude-task-manager/lib/walle-supervisor.js +86 -19
- package/template/claude-task-manager/lib/walle-transcript.js +1 -3
- package/template/claude-task-manager/lib/worktree-cwd.js +82 -0
- package/template/claude-task-manager/package.json +1 -0
- package/template/claude-task-manager/providers/codex-mcp.js +104 -0
- package/template/claude-task-manager/providers/index.js +2 -0
- package/template/claude-task-manager/public/css/setup.css +2 -1
- package/template/claude-task-manager/public/css/walle.css +71 -0
- package/template/claude-task-manager/public/index.html +2388 -429
- package/template/claude-task-manager/public/js/message-renderer.js +314 -35
- package/template/claude-task-manager/public/js/session-search-utils.js +185 -3
- package/template/claude-task-manager/public/js/session-status-precedence.js +125 -0
- package/template/claude-task-manager/public/js/setup.js +62 -19
- package/template/claude-task-manager/public/js/stream-view.js +396 -55
- package/template/claude-task-manager/public/js/terminal-restore-state.js +57 -0
- package/template/claude-task-manager/public/js/walle-session.js +234 -26
- package/template/claude-task-manager/public/js/walle.js +143 -2
- package/template/claude-task-manager/server.js +1402 -433
- package/template/claude-task-manager/session-integrity.js +77 -28
- package/template/claude-task-manager/workers/approval-widget-validator.js +15 -5
- package/template/claude-task-manager/workers/scrollback-worker.js +5 -6
- package/template/claude-task-manager/workers/state-detectors/codex.js +6 -0
- package/template/package.json +1 -1
- package/template/wall-e/agent-runners/claude-code.js +2 -0
- package/template/wall-e/agent.js +63 -8
- package/template/wall-e/api-walle.js +330 -52
- package/template/wall-e/brain.js +291 -42
- package/template/wall-e/chat.js +172 -15
- package/template/wall-e/coding/compaction-service.js +19 -5
- package/template/wall-e/coding/stream-processor.js +22 -2
- package/template/wall-e/coding/workspace-replay.js +1 -4
- package/template/wall-e/coding-orchestrator.js +250 -80
- package/template/wall-e/compat.js +0 -28
- package/template/wall-e/context/context-builder.js +3 -1
- package/template/wall-e/embeddings.js +2 -7
- package/template/wall-e/eval/agent-runner.js +30 -9
- package/template/wall-e/eval/benchmark-generator.js +21 -1
- package/template/wall-e/eval/benchmarks/chat-eval.json +66 -6
- package/template/wall-e/eval/benchmarks/coding-agent.json +0 -596
- package/template/wall-e/eval/cc-replay.js +1 -0
- package/template/wall-e/eval/codex-cli-baseline.js +633 -0
- package/template/wall-e/eval/debug-agent003.js +1 -0
- package/template/wall-e/eval/eval-orchestrator.js +3 -3
- package/template/wall-e/eval/run-agent-benchmarks.js +11 -3
- package/template/wall-e/eval/run-codex-cli-baseline.js +177 -0
- package/template/wall-e/eval/run-model-comparison.js +1 -0
- package/template/wall-e/eval/swebench-adapter.js +1 -0
- package/template/wall-e/evaluation/quorum-evaluator.js +0 -1
- package/template/wall-e/extraction/knowledge-extractor.js +1 -2
- package/template/wall-e/lib/mcp-integration.js +336 -0
- package/template/wall-e/llm/ollama.js +47 -8
- package/template/wall-e/llm/ollama.plugin.json +1 -1
- package/template/wall-e/llm/tool-adapter.js +1 -0
- package/template/wall-e/loops/ingest.js +42 -8
- package/template/wall-e/loops/initiative.js +87 -2
- package/template/wall-e/mcp-server.js +872 -19
- package/template/wall-e/memory/ctm-context-client.js +230 -0
- package/template/wall-e/memory/ctm-session-context.js +1376 -0
- package/template/wall-e/prompts/coding/memory-protocol.md +6 -0
- package/template/wall-e/server.js +30 -1
- package/template/wall-e/skills/_bundled/memory-search/SKILL.md +8 -0
- package/template/wall-e/skills/_bundled/scan-ctm-sessions/SKILL.md +20 -0
- package/template/wall-e/skills/_bundled/scan-ctm-sessions/run.js +43 -0
- package/template/wall-e/skills/_bundled/slack-mentions/run.js +471 -188
- package/template/wall-e/skills/skill-planner.js +86 -4
- package/template/wall-e/slack/socket-mode-listener.js +276 -0
- package/template/wall-e/telemetry.js +70 -2
- package/template/wall-e/tools/builtin-middleware.js +55 -2
- package/template/wall-e/tools/shell-policy.js +1 -1
- package/template/wall-e/tools/slack-owner.js +104 -0
- package/template/website/index.html +4 -4
- package/template/builder-journal.md +0 -17
|
@@ -0,0 +1,633 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const crypto = require('crypto');
|
|
4
|
+
const fs = require('fs');
|
|
5
|
+
const os = require('os');
|
|
6
|
+
const path = require('path');
|
|
7
|
+
const { execFile, execFileSync, spawn } = require('child_process');
|
|
8
|
+
const { promisify } = require('util');
|
|
9
|
+
|
|
10
|
+
const execFileAsync = promisify(execFile);
|
|
11
|
+
|
|
12
|
+
const {
|
|
13
|
+
cleanupSandbox,
|
|
14
|
+
countTests,
|
|
15
|
+
scoreAgentResult,
|
|
16
|
+
setupSandbox,
|
|
17
|
+
} = require('./agent-runner');
|
|
18
|
+
const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
|
|
19
|
+
|
|
20
|
+
// Default directory for baseline result artifacts, located under the current user's home directory.
const DEFAULT_RESULTS_DIR = path.join(os.homedir(), '.walle', 'eval-results', 'codex-cli-baseline');
// Default time budget for a single `codex exec` run, in milliseconds (10 minutes).
const DEFAULT_TIMEOUT_MS = 600_000;
// Exact-match allow-list of test commands; only a command matching this pattern is executed by this module.
const ALLOWED_TEST_COMMANDS = /^(npm test|node test\.js|pytest|python -m pytest|make test)$/;
|
|
23
|
+
|
|
24
|
+
/**
 * Compose the prompt text handed to `codex exec` for one benchmark.
 *
 * The result is a newline-joined list of: the benchmark's own prompt (when
 * non-empty), two fixed baseline instructions, and optional hints derived
 * from `benchmark.agentExpectations` (test command, expected file changes).
 * Note that no blank separator line is emitted between the sections.
 *
 * @param {object} [benchmark] - Benchmark definition (`prompt`, `agentExpectations`).
 * @returns {string} The prompt to pass to the Codex CLI.
 */
function buildCodexPrompt(benchmark = {}) {
  const { testCommand, expectedFileChanges } = benchmark.agentExpectations || {};
  const lines = [];

  if (benchmark.prompt) {
    lines.push(benchmark.prompt);
  }
  lines.push(
    'You are running as an external Codex CLI baseline for a Wall-E coding-agent benchmark.',
    'Work only in this repository. Do not commit changes. Make the requested code changes directly.',
  );
  if (testCommand) {
    lines.push(`After editing, run this validation command if practical: ${testCommand}`);
  }
  if (Array.isArray(expectedFileChanges) && expectedFileChanges.length > 0) {
    lines.push(`Expected changed file(s), when appropriate: ${expectedFileChanges.join(', ')}`);
  }

  return lines.join('\n');
}
|
|
40
|
+
|
|
41
|
+
/**
 * Build the argv (excluding the binary name) for a `codex exec` invocation.
 *
 * Flag order: fixed prefix, optional `-m`, optional `--json`, one `-c` per
 * non-empty config override, at most one execution-mode flag, then the prompt.
 * The execution mode is chosen by precedence: `dangerouslyBypassSandbox`,
 * then `fullAuto`, then an explicit `sandbox` value.
 *
 * @param {object} [options]
 * @param {string} [options.prompt] - Prompt text; an empty string is passed when missing.
 * @param {string} options.cwd - Working directory passed via `-C`.
 * @param {string} [options.model] - Model name passed via `-m`.
 * @param {string} options.outFile - Path passed to `--output-last-message`.
 * @param {boolean} [options.fullAuto=true]
 * @param {?string} [options.sandbox=null]
 * @param {boolean} [options.json=true]
 * @param {boolean} [options.dangerouslyBypassSandbox=false]
 * @param {?Iterable<string>} [options.configOverrides=[]]
 * @returns {string[]} Argument vector.
 * @throws {Error} When `cwd` or `outFile` is missing.
 */
function buildCodexExecArgs({
  prompt,
  cwd,
  model,
  outFile,
  fullAuto = true,
  sandbox = null,
  json = true,
  dangerouslyBypassSandbox = false,
  configOverrides = [],
} = {}) {
  if (!cwd) throw new Error('cwd is required');
  if (!outFile) throw new Error('outFile is required');

  const overrideFlags = [...(configOverrides || [])]
    .filter(Boolean)
    .flatMap((entry) => ['-c', entry]);

  let modeFlags = [];
  if (dangerouslyBypassSandbox) {
    modeFlags = ['--dangerously-bypass-approvals-and-sandbox'];
  } else if (fullAuto) {
    modeFlags = ['--full-auto'];
  } else if (sandbox) {
    modeFlags = ['--sandbox', sandbox];
  }

  return [
    'exec',
    '--skip-git-repo-check',
    '--ephemeral',
    '--output-last-message', outFile,
    '-C', cwd,
    ...(model ? ['-m', model] : []),
    ...(json ? ['--json'] : []),
    ...overrideFlags,
    ...modeFlags,
    prompt || '',
  ];
}
|
|
76
|
+
|
|
77
|
+
/**
 * Derive the environment for the Codex child process from a base environment.
 *
 * The base environment is copied, never mutated. `OPENAI_API_KEY` is removed
 * from the copy unless `useEnvOpenAIKey` is truthy, and `OTEL_SDK_DISABLED`
 * is set to `'true'` when the base environment has no truthy value for it.
 *
 * @param {object} [options]
 * @param {boolean} [options.useEnvOpenAIKey=false] - Keep the inherited API key.
 * @param {object} [options.env=process.env] - Base environment.
 * @returns {object} A new environment object for the child process.
 */
function codexEnv({ useEnvOpenAIKey = false, env = process.env } = {}) {
  const childEnv = { ...env };
  if (!useEnvOpenAIKey) {
    delete childEnv.OPENAI_API_KEY;
  }
  childEnv.OTEL_SDK_DISABLED ||= 'true';
  return childEnv;
}
|
|
83
|
+
|
|
84
|
+
/**
 * Run `codex exec` once and collect its final message, raw streams and telemetry.
 *
 * A private temp directory holds the `--output-last-message` file. Everything
 * that can throw after the directory exists (argument validation, MCP
 * discovery, spawning) runs inside the `try`, so the directory is always
 * removed, including when `cwd` is missing and argument building throws.
 *
 * @param {object} [options]
 * @param {string} [options.prompt] - Prompt passed to Codex.
 * @param {string} options.cwd - Working directory for Codex (required).
 * @param {string} [options.model] - Model name.
 * @param {number} [options.timeoutMs=DEFAULT_TIMEOUT_MS] - Kill the run after this long.
 * @param {boolean} [options.useEnvOpenAIKey=false] - Keep `OPENAI_API_KEY` in the child env.
 * @param {boolean} [options.fullAuto=true]
 * @param {?string} [options.sandbox=null]
 * @param {boolean} [options.dangerouslyBypassSandbox=false]
 * @param {boolean} [options.json=true] - Request JSON events and parse telemetry from stdout.
 * @param {boolean} [options.allowMcp=false] - When false, enabled MCP servers are disabled via `-c` overrides.
 * @param {?string[]} [options.disableMcpServers=null] - Explicit server list; skips discovery.
 * @param {?string[]} [options.configOverrides=[]] - Extra `-c` overrides; `null` is treated as empty.
 * @returns {Promise<object>} Spawn result (`code`, `signal`, `timedOut`, `stdout`, `stderr`)
 *   merged with `args`, parsed telemetry, `output` and `latencyMs`.
 * @throws {Error} When `cwd` is missing or the process cannot be spawned.
 */
async function runCodexExec({
  prompt,
  cwd,
  model,
  timeoutMs = DEFAULT_TIMEOUT_MS,
  useEnvOpenAIKey = false,
  fullAuto = true,
  sandbox = null,
  dangerouslyBypassSandbox = false,
  json = true,
  allowMcp = false,
  disableMcpServers = null,
  configOverrides = [],
} = {}) {
  const tmpDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'walle-codex-baseline-'));

  try {
    const outFile = path.join(tmpDir, 'last-message.txt');
    const effectiveConfigOverrides = [
      // Default parameters do not cover an explicit `null`; treat it as "no overrides".
      ...(configOverrides || []),
      ...buildMcpDisableConfigOverrides({ allowMcp, disableMcpServers }),
    ];
    const args = buildCodexExecArgs({
      prompt,
      cwd,
      model,
      outFile,
      fullAuto,
      sandbox,
      json,
      dangerouslyBypassSandbox,
      configOverrides: effectiveConfigOverrides,
    });

    // Latency covers the Codex process itself, not argument building or MCP discovery.
    const started = Date.now();
    const result = await spawnCodex(args, {
      cwd,
      env: codexEnv({ useEnvOpenAIKey }),
      timeoutMs,
    });

    let output = '';
    try {
      output = fs.readFileSync(outFile, 'utf8');
    } catch {
      // Best effort: Codex may exit without writing a final message; fall back to stdout below.
    }

    // JSON mode reports events on stdout; formatted mode is parsed from stderr (or stdout when stderr is empty).
    const telemetry = json
      ? parseCodexJsonEvents(result.stdout)
      : parseCodexFormattedOutput(result.stderr || result.stdout);

    return {
      ...result,
      args,
      ...telemetry,
      output: output || result.stdout || '',
      latencyMs: Date.now() - started,
    };
  } finally {
    fs.rmSync(tmpDir, { recursive: true, force: true });
  }
}
|
|
137
|
+
|
|
138
|
+
/**
 * Spawn the Codex CLI and buffer its stdout/stderr until it exits.
 *
 * Streams are decoded with `setEncoding('utf8')` so a multi-byte character
 * split across two pipe chunks is reassembled instead of being turned into
 * replacement characters. On timeout the child receives SIGTERM and, if it
 * has still not exited after `killGraceMs`, SIGKILL, so the returned promise
 * cannot stay pending because the child ignores SIGTERM.
 *
 * @param {string[]} args - Arguments passed to the binary.
 * @param {object} [options]
 * @param {string} [options.cwd] - Working directory of the child.
 * @param {object} [options.env] - Environment of the child.
 * @param {number} [options.timeoutMs] - Time budget; falls back to DEFAULT_TIMEOUT_MS when falsy.
 * @param {string} [options.command='codex'] - Binary to execute.
 * @param {number} [options.killGraceMs=5000] - Delay between SIGTERM and SIGKILL after a timeout.
 * @returns {Promise<{code: ?number, signal: ?string, timedOut: boolean, stdout: string, stderr: string}>}
 *   Resolves when the child closes; rejects when it cannot be spawned.
 */
function spawnCodex(args, { cwd, env, timeoutMs, command = 'codex', killGraceMs = 5_000 } = {}) {
  return new Promise((resolve, reject) => {
    let proc;
    try {
      proc = spawn(command, args, {
        cwd,
        env,
        stdio: ['ignore', 'pipe', 'pipe'],
      });
    } catch (err) {
      reject(err);
      return;
    }

    let stdout = '';
    let stderr = '';
    let timedOut = false;
    let killTimer = null;

    const timer = setTimeout(() => {
      timedOut = true;
      try { proc.kill('SIGTERM'); } catch { /* child already gone */ }
      // Escalate if the child ignores SIGTERM; cleared again as soon as it closes.
      killTimer = setTimeout(() => {
        try { proc.kill('SIGKILL'); } catch { /* child already gone */ }
      }, killGraceMs);
      if (typeof killTimer.unref === 'function') killTimer.unref();
    }, timeoutMs || DEFAULT_TIMEOUT_MS);
    // Do not let the watchdog alone keep the event loop alive.
    if (typeof timer.unref === 'function') timer.unref();

    const clearTimers = () => {
      clearTimeout(timer);
      if (killTimer) clearTimeout(killTimer);
    };

    // The streams can be missing when spawning failed; the 'error' event then rejects.
    proc.stdout?.setEncoding('utf8');
    proc.stderr?.setEncoding('utf8');
    proc.stdout?.on('data', (chunk) => { stdout += chunk; });
    proc.stderr?.on('data', (chunk) => { stderr += chunk; });

    proc.on('error', (err) => {
      clearTimers();
      reject(err);
    });
    proc.on('close', (code, signal) => {
      clearTimers();
      resolve({
        code,
        signal,
        timedOut,
        stdout,
        stderr,
      });
    });
  });
}
|
|
179
|
+
|
|
180
|
+
/**
 * List files that differ from HEAD in a git working tree, followed by
 * untracked files that are not ignored.
 *
 * @param {string} dir - Directory in which the git commands run.
 * @returns {Promise<string[]>} Paths as printed by git; an empty array when
 *   either git command fails.
 */
async function getModifiedFiles(dir) {
  const gitLines = async (gitArgs) => {
    const { stdout } = await execFileAsync('git', gitArgs, { cwd: dir });
    return stdout.trim().split('\n');
  };

  try {
    const changed = await gitLines(['diff', '--name-only', 'HEAD']);
    const untracked = await gitLines(['ls-files', '--others', '--exclude-standard']);
    // Splitting an empty listing yields [''], so blank entries are dropped here.
    return changed.concat(untracked).filter(Boolean);
  } catch {
    return [];
  }
}
|
|
189
|
+
|
|
190
|
+
/**
 * Report whether a test command is on the allow-list.
 *
 * @param {?string} command - Candidate test command.
 * @returns {boolean} True only for a non-empty command matching ALLOWED_TEST_COMMANDS.
 */
function testCommandAllowed(command) {
  if (!command) {
    return false;
  }
  return ALLOWED_TEST_COMMANDS.test(command);
}
|
|
193
|
+
|
|
194
|
+
/**
 * Produce the config overrides that switch MCP servers off for a baseline run.
 *
 * @param {object} [options]
 * @param {boolean} [options.allowMcp=false] - When truthy, nothing is disabled.
 * @param {?Iterable<string>} [options.disableMcpServers=null] - Servers to disable;
 *   when falsy, the enabled servers are discovered via the Codex CLI.
 * @returns {string[]} One `mcp_servers.<name>.enabled=false` entry per distinct, non-empty name.
 */
function buildMcpDisableConfigOverrides({ allowMcp = false, disableMcpServers = null } = {}) {
  if (allowMcp) return [];

  const candidates = disableMcpServers || discoverEnabledCodexMcpServers();
  const overrides = [];
  // The Set removes duplicates while keeping first-seen order.
  for (const serverName of new Set(candidates)) {
    if (!serverName) continue;
    overrides.push(`mcp_servers.${tomlPathSegment(serverName)}.enabled=false`);
  }
  return overrides;
}
|
|
201
|
+
|
|
202
|
+
/**
 * Ask the Codex CLI (`codex mcp list`) which MCP servers are enabled.
 *
 * Best effort: any failure (binary missing, non-zero exit, the 10 second
 * timeout) yields an empty list.
 *
 * @returns {string[]} Names of enabled servers, or an empty array on failure.
 */
function discoverEnabledCodexMcpServers() {
  const execOptions = {
    encoding: 'utf8',
    // stderr is discarded; only the stdout listing is parsed.
    stdio: ['ignore', 'pipe', 'ignore'],
    timeout: 10_000,
  };
  try {
    return parseEnabledCodexMcpServers(execFileSync('codex', ['mcp', 'list'], execOptions));
  } catch {
    return [];
  }
}
|
|
214
|
+
|
|
215
|
+
/**
 * Extract the names of enabled servers from `codex mcp list` text output.
 *
 * Blank rows and rows starting with `Name` or `WARNING` are ignored. A
 * remaining row counts only if it contains the standalone word `enabled`;
 * its first whitespace-delimited token is taken as the server name.
 *
 * @param {string} [output] - Raw listing text.
 * @returns {string[]} Server names in listing order (duplicates are kept).
 */
function parseEnabledCodexMcpServers(output = '') {
  return String(output || '')
    .split(/\r?\n/)
    .map((row) => row.trim())
    .filter((row) => row && !/^Name\b/.test(row) && !/^WARNING\b/.test(row))
    .filter((row) => /\benabled\b/.test(row))
    .map((row) => row.split(/\s+/)[0])
    .filter(Boolean);
}
|
|
226
|
+
|
|
227
|
+
/**
 * Render one key of a dotted config path.
 *
 * Names made only of letters, digits and underscores (not starting with a
 * digit) are emitted bare; anything else is emitted as a double-quoted,
 * JSON-escaped string.
 *
 * @param {*} value - Key name; falsy values become the empty string.
 * @returns {string} Bare or quoted key.
 */
function tomlPathSegment(value) {
  const segment = String(value || '');
  const isBareKey = /^[A-Za-z_][A-Za-z0-9_]*$/.test(segment);
  if (isBareKey) {
    return segment;
  }
  return JSON.stringify(segment);
}
|
|
231
|
+
|
|
232
|
+
/**
 * Parse the JSON-lines event stream printed by `codex exec --json`.
 *
 * Lines that are blank, not valid JSON, or valid JSON that is not an object
 * (for example `null` or a bare number) are skipped; previously such a line
 * raised a TypeError on `event.type` and aborted the whole parse.
 *
 * @param {string} [stdout] - Raw stdout of the Codex process.
 * @returns {{codexEvents: object[], codexErrors: Array, toolCalls: string[],
 *   toolCallDetails: object[], turns: number,
 *   usage: {inputTokens: ?number, cachedInputTokens: ?number, outputTokens: ?number}}}
 *   Token totals are summed over `turn.completed` events; a total of zero is reported as null.
 */
function parseCodexJsonEvents(stdout = '') {
  const events = [];
  const toolCalls = [];
  const toolCallDetails = [];
  const errors = [];
  const usage = {
    inputTokens: 0,
    cachedInputTokens: 0,
    outputTokens: 0,
  };
  let turns = 0;

  for (const line of String(stdout || '').split(/\r?\n/)) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let event;
    try {
      event = JSON.parse(trimmed);
    } catch {
      // Non-JSON noise interleaved with the event stream is ignored.
      continue;
    }
    // JSON.parse also accepts `null`, numbers and strings; only objects are events.
    if (event === null || typeof event !== 'object') continue;
    events.push(event);

    if (event.type === 'turn.completed') {
      turns += 1;
      // Usage may sit on the event or under `turn`, in snake_case or camelCase.
      const eventUsage = event.usage || event.turn?.usage || {};
      usage.inputTokens += Number(eventUsage.input_tokens ?? eventUsage.inputTokens ?? 0);
      usage.cachedInputTokens += Number(eventUsage.cached_input_tokens ?? eventUsage.cachedInputTokens ?? 0);
      usage.outputTokens += Number(eventUsage.output_tokens ?? eventUsage.outputTokens ?? 0);
    }

    if (event.type === 'error') {
      errors.push(event.message || event.error?.message || event.error || 'codex json error');
    }

    // Only completed items are classified into tool calls.
    if (event.type !== 'item.completed' || !event.item) continue;
    const detail = classifyCodexItem(event.item);
    if (!detail) continue;
    toolCalls.push(detail.name);
    toolCallDetails.push(detail);
  }

  return {
    codexEvents: events,
    codexErrors: errors,
    toolCalls,
    toolCallDetails,
    turns,
    usage: {
      inputTokens: usage.inputTokens || null,
      cachedInputTokens: usage.cachedInputTokens || null,
      outputTokens: usage.outputTokens || null,
    },
  };
}
|
|
287
|
+
|
|
288
|
+
/**
 * Derive coarse telemetry from Codex's human-readable (non-JSON) output.
 *
 * Every line of the form `exec <command>` or `shell <command>` (case
 * insensitive, leading whitespace allowed) is classified as one tool call.
 * Token usage is limited to the count parsed from a "tokens used" footer.
 *
 * @param {string} [output] - Formatted Codex output.
 * @returns {{toolCalls: string[], toolCallDetails: object[], turns: number,
 *   usage: {inputTokens: null, cachedInputTokens: null, outputTokens: ?number}}}
 *   `turns` is 1 when at least one command was seen, otherwise 0.
 */
function parseCodexFormattedOutput(output = '') {
  const toolCallDetails = String(output || '')
    .split(/\r?\n/)
    .map((row) => row.match(/^\s*(?:exec|shell)\s+(.+)$/i))
    .filter(Boolean)
    .map((found) => classifyCodexCommand(found[1]));
  const toolCalls = toolCallDetails.map((detail) => detail.name);

  return {
    toolCalls,
    toolCallDetails,
    turns: toolCalls.length ? 1 : 0,
    usage: {
      inputTokens: null,
      cachedInputTokens: null,
      outputTokens: parseCodexTokenCount(output),
    },
  };
}
|
|
309
|
+
|
|
310
|
+
/**
 * Map a completed Codex event item onto a tool-call record.
 *
 * The lower-cased item type (`type` or `item_type`) is checked in this order:
 * command-like items (delegated to classifyCodexCommand), then types
 * containing `file`, `mcp`, and `web_search`/`web-search`.
 *
 * @param {object} [item] - The `item` payload of an `item.completed` event.
 * @returns {?object} A tool-call detail record, or null for unrecognised item types.
 */
function classifyCodexItem(item = {}) {
  const kind = String(item.type || item.item_type || '').toLowerCase();
  const mentions = (fragment) => kind.includes(fragment);

  if (mentions('command') || kind === 'exec' || kind === 'shell') {
    const commandText = item.command || item.cmd || item.text || item.arguments?.cmd || '';
    return classifyCodexCommand(commandText, item);
  }

  // Fields shared by every non-command record.
  const origin = { source: 'codex-cli', codexType: item.type || null };
  const status = item.status || null;

  if (mentions('file')) {
    return {
      name: 'edit_file',
      ...origin,
      path: item.path || item.file || item.file_path || null,
      status,
    };
  }
  if (mentions('mcp')) {
    return {
      name: 'mcp_tool_call',
      ...origin,
      tool: item.name || item.tool || null,
      status,
    };
  }
  if (mentions('web_search') || mentions('web-search')) {
    return { name: 'web_search', ...origin, status };
  }
  return null;
}
|
|
343
|
+
|
|
344
|
+
/**
 * Classify a shell command run by Codex into a coarse tool name.
 *
 * The lower-cased command is matched against an ordered rule list and the
 * first match wins: known test commands (`run_shell`), patch-style writes
 * and output redirection (`edit_file`), search tools (`grep_files`),
 * directory listing (`list_directory`), file viewers (`read_file`).
 * Anything else is `run_shell`.
 *
 * @param {string} [command] - Command line as reported by Codex.
 * @param {object} [item] - Originating event item; supplies `type` and `status`.
 * @returns {{name: string, source: string, codexType: string, command: string, status: ?string}}
 */
function classifyCodexCommand(command = '', item = {}) {
  const raw = String(command || '');
  const text = raw.toLowerCase();

  // Order matters: a test command is never reclassified as an edit or a read.
  const rules = [
    ['run_shell', /\b(npm\s+test|node\s+test\.js|pytest|python\s+-m\s+pytest|make\s+test)\b/],
    ['edit_file', /\b(apply_patch|git\s+apply|perl\s+-0?pi|python[^\n]*write_text|writefilesync)\b/],
    ['edit_file', /\b(tee|cat|printf|echo)\b[\s\S]*[>]/],
    ['grep_files', /\b(rg|grep)\b/],
    ['list_directory', /\b(find|ls|tree)\b/],
    ['read_file', /\b(sed\s+-n|cat|head|tail|awk|nl)\b/],
  ];
  const matched = rules.find(([, pattern]) => pattern.test(text));

  return {
    name: matched ? matched[0] : 'run_shell',
    source: 'codex-cli',
    codexType: item.type || 'command_execution',
    command: raw,
    status: item.status || null,
  };
}
|
|
369
|
+
|
|
370
|
+
/**
 * Run one coding-agent benchmark through the Codex CLI and score the outcome.
 *
 * Steps: create a sandbox from the benchmark's fixture, count tests before the
 * run (allow-listed test commands only), invoke Codex, collect changed files,
 * re-run and re-count the tests, then score. Failures inside the `try` are
 * converted into a zero-scored result object instead of being thrown.
 *
 * @param {object} benchmark - Benchmark definition (`id`, `prompt`, `agentExpectations`).
 * @param {object} [options] - Run options. Read here: `timeoutMs`, `dryRun`, `model`,
 *   `runCodexExec` (replacement for the real runner), `useEnvOpenAIKey`, `fullAuto`,
 *   `sandbox`, `dangerouslyBypassSandbox`, `json`, `allowMcp`, `disableMcpServers`,
 *   `configOverrides`, `testTimeoutMs`, `keepFailures`.
 * @returns {Promise<object>} Result record with `success`, `score`, test counts,
 *   token usage, truncated output/stderr, `error`, and `sandboxDir` (non-null only
 *   when the sandbox was kept).
 */
async function runCodexCliBaselineBenchmark(benchmark, options = {}) {
  const expectations = benchmark.agentExpectations || {};
  const fixtureName = expectations.projectFixture || 'express-basic';
  const timeoutMs = options.timeoutMs || DEFAULT_TIMEOUT_MS;
  let sandboxDir = null;
  let keepSandbox = false;
  const started = Date.now();

  try {
    sandboxDir = setupSandbox(fixtureName);
    let testsBefore = null;
    let totalTests = null;
    // Baseline test counts are only taken for allow-listed test commands.
    if (testCommandAllowed(expectations.testCommand)) {
      const beforeCounts = countTests(sandboxDir, expectations.testCommand);
      testsBefore = beforeCounts.passed;
      totalTests = beforeCounts.total;
    }

    // Dry run: stop before invoking Codex. keepSandbox is still false here, so the
    // finally block passes the sandbox to cleanupSandbox after this return.
    if (options.dryRun) {
      return {
        benchmarkId: benchmark.id,
        runner: 'codex-cli-baseline',
        provider: 'codex-cli-baseline',
        model: options.model || null,
        success: true,
        status: 'dry_run_ok',
        sandboxCreated: sandboxDir,
        latencyMs: Date.now() - started,
      };
    }

    const prompt = buildCodexPrompt(benchmark);
    // options.runCodexExec lets a caller substitute the runner.
    const codex = await (options.runCodexExec || runCodexExec)({
      prompt,
      cwd: sandboxDir,
      model: options.model,
      timeoutMs,
      useEnvOpenAIKey: !!options.useEnvOpenAIKey,
      fullAuto: options.fullAuto !== false,
      sandbox: options.sandbox || null,
      dangerouslyBypassSandbox: !!options.dangerouslyBypassSandbox,
      json: options.json !== false,
      allowMcp: !!options.allowMcp,
      disableMcpServers: options.disableMcpServers || null,
      configOverrides: options.configOverrides || [],
    });

    const actualFileChanges = await getModifiedFiles(sandboxDir);
    let testsPassed = null;
    let testsAfter = null;
    if (testCommandAllowed(expectations.testCommand)) {
      // Pass/fail comes from the exit status of the test command; a timeout or
      // non-zero exit throws and is recorded as a failure.
      try {
        execFileSync('sh', ['-c', expectations.testCommand], {
          cwd: sandboxDir,
          timeout: options.testTimeoutMs || 30_000,
          stdio: 'pipe',
        });
        testsPassed = true;
      } catch {
        testsPassed = false;
      }
      const afterCounts = countTests(sandboxDir, expectations.testCommand);
      testsAfter = afterCounts.passed;
      if (totalTests == null) totalTests = afterCounts.total;
    }

    const expectedFileChanges = expectations.expectedFileChanges || [];
    // Only checks that something changed, not that the expected files were the ones touched.
    const missingExpectedWork = expectedFileChanges.length > 0 && actualFileChanges.length === 0;
    const testRegression = !!(expectations.testCommand && testsPassed === false);
    const codexErrors = Array.isArray(codex.codexErrors) && codex.codexErrors.length
      ? codex.codexErrors.join('; ')
      : null;
    // Error precedence: timeout, then non-zero exit (with the tail of stderr/stdout),
    // then error events reported by a run that exited 0.
    const exitError = codex.timedOut
      ? 'codex exec timed out'
      : codex.code === 0
        ? codexErrors
        : `codex exec exited ${codex.code}${codex.signal ? ` (${codex.signal})` : ''}: ${(codex.stderr || codex.stdout || '').slice(-1000)}`;
    const success = !exitError && !missingExpectedWork && !testRegression;
    const actualToolCalls = codex.toolCalls || [];
    const toolCallDetails = codex.toolCallDetails || [];
    const usage = codex.usage || {};

    let score = scoreAgentResult(benchmark, {
      actualToolCalls,
      actualFileChanges,
      actualTurns: codex.turns || 1,
      testsPassed,
      output: codex.output || codex.stdout || '',
      success,
      sandboxDir,
      costDollars: null,
      testsBefore,
      testsAfter,
      totalTests,
      toolCallDetails,
    });

    // A failed run scores zero; the computed dimensions are kept and the reason is recorded.
    if (!success) {
      score = {
        composite: 0,
        dimensions: {
          ...(score.dimensions || {}),
          _zeroed: true,
          _zeroReason: exitError
            ? 'codex_error'
            : testRegression
              ? 'tests_failed'
              : 'no_file_changes',
        },
      };
    }

    // Keep the sandbox only for failed runs, and only when requested.
    keepSandbox = !!options.keepFailures && !success;
    return {
      benchmarkId: benchmark.id,
      runner: 'codex-cli-baseline',
      provider: 'codex-cli-baseline',
      model: options.model || null,
      success,
      score,
      latencyMs: codex.latencyMs || (Date.now() - started),
      actualToolCalls,
      actualFileChanges,
      actualTurns: codex.turns || 1,
      testsPassed,
      testsBefore,
      testsAfter,
      totalTests,
      inputTokens: usage.inputTokens ?? null,
      // Falls back to the "tokens used" footer when the telemetry has no output token count.
      outputTokens: usage.outputTokens ?? parseCodexTokenCount(codex.stdout),
      dimensionsJson: JSON.stringify(score.dimensions || {}),
      // Output and stderr are truncated to 2000 characters.
      output: (codex.output || codex.stdout || '').slice(0, 2000),
      stderr: filterNonFatalCodexStderr(codex.stderr || '').slice(0, 2000),
      error: exitError,
      sandboxDir: keepSandbox ? sandboxDir : null,
    };
  } catch (err) {
    // Any exception (sandbox setup, runner, scoring) becomes a zero-scored result.
    keepSandbox = !!options.keepFailures;
    return {
      benchmarkId: benchmark.id,
      runner: 'codex-cli-baseline',
      provider: 'codex-cli-baseline',
      model: options.model || null,
      success: false,
      score: { composite: 0, dimensions: { _zeroed: true, _zeroReason: 'exception' } },
      latencyMs: Date.now() - started,
      error: err.message,
      sandboxDir: keepSandbox ? sandboxDir : null,
    };
  } finally {
    // Runs after every return above; keepSandbox has been decided by then.
    if (sandboxDir && !keepSandbox) cleanupSandbox(sandboxDir);
  }
}
|
|
523
|
+
|
|
524
|
+
/**
 * Read the token total from a "tokens used" footer, where the number
 * (optionally with thousands separators) appears on the following line.
 *
 * @param {string} [stdout] - Text to search.
 * @returns {?number} The parsed count, or null when no footer is present.
 */
function parseCodexTokenCount(stdout = '') {
  const text = String(stdout || '');
  const found = /tokens used\s*\n\s*([\d,]+)/i.exec(text);
  if (!found) {
    return null;
  }
  const digits = found[1].split(',').join('');
  return parseInt(digits, 10);
}
|
|
528
|
+
|
|
529
|
+
/**
 * Strip known harmless lines from Codex stderr: the stdin notice and
 * OpenTelemetry / span-processor messages.
 *
 * @param {string} [stderr] - Raw stderr text.
 * @returns {string} Remaining lines joined with `\n`, trimmed.
 */
function filterNonFatalCodexStderr(stderr = '') {
  const noisePatterns = [
    /Reading additional input from stdin/,
    /opentelemetry_sdk/,
    /BatchSpanProcessor/,
  ];
  const kept = [];
  for (const row of String(stderr || '').split(/\r?\n/)) {
    if (noisePatterns.some((pattern) => pattern.test(row))) continue;
    kept.push(row);
  }
  return kept.join('\n').trim();
}
|
|
541
|
+
|
|
542
|
+
/**
 * Aggregate a list of baseline results into pass/fail counts, the mean
 * composite score and a histogram of failure reasons.
 *
 * The failure reason is the score's `_zeroReason`, else the result's
 * `error`, else `'unknown'`.
 *
 * @param {object[]} [results] - Results as returned by the baseline runner.
 * @returns {{total: number, passed: number, failed: number, avgComposite: number, failures: object}}
 */
function summarizeBaselineResults(results = []) {
  const total = results.length;
  const failures = {};
  let passed = 0;
  let compositeSum = 0;

  for (const entry of results) {
    compositeSum += entry.score?.composite || 0;
    if (entry.success) {
      passed += 1;
      continue;
    }
    const reason = entry.score?.dimensions?._zeroReason || entry.error || 'unknown';
    failures[reason] = (failures[reason] || 0) + 1;
  }

  // An empty list averages to 0 rather than NaN.
  const avgComposite = total ? compositeSum / total : 0;
  return { total, passed, failed: total - passed, avgComposite, failures };
}
|
|
557
|
+
|
|
558
|
+
/**
 * Persist one baseline result as a pretty-printed JSON file.
 *
 * The directory is created when missing. The file name is an ISO timestamp
 * (with `:` and `.` replaced by `-`) followed by the sanitised benchmark id.
 *
 * @param {object} result - Result to serialise.
 * @param {object} [options]
 * @param {string} [options.resultsDir=DEFAULT_RESULTS_DIR] - Target directory.
 * @returns {string} Path of the written file.
 */
function writeBaselineArtifact(result, { resultsDir = DEFAULT_RESULTS_DIR } = {}) {
  fs.mkdirSync(resultsDir, { recursive: true });

  const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
  const idPart = safeId(result.benchmarkId || 'codex-baseline');
  const target = path.join(resultsDir, `${timestamp}-${idPart}.json`);

  const payload = `${JSON.stringify(result, null, 2)}\n`;
  fs.writeFileSync(target, payload);
  return target;
}
|
|
565
|
+
|
|
566
|
+
/**
 * Record a baseline result through `brain.insertBenchmarkResult`.
 *
 * Does nothing when `brain` is missing or has no `insertBenchmarkResult`
 * function. A row is marked `trusted` only when the run succeeded, the test
 * command passed and no error was recorded.
 *
 * The input token count measured by the runner is now forwarded; it was
 * previously hard-coded to null even though `result.inputTokens` is populated
 * alongside `result.outputTokens`.
 *
 * @param {object} params
 * @param {object} params.brain - Store exposing `insertBenchmarkResult(row)`.
 * @param {string} params.runId - Identifier of the evaluation run.
 * @param {object} params.benchmark - Benchmark definition.
 * @param {object} params.result - Result returned by the baseline runner.
 * @param {string} [params.model] - Model name; recorded as `'codex-default'` when absent.
 * @param {number} [params.timeoutMs] - Time budget recorded in the run configuration.
 * @returns {void}
 */
function storeBaselineResult({ brain, runId, benchmark, result, model, timeoutMs }) {
  if (!brain || typeof brain.insertBenchmarkResult !== 'function') return;

  const scoringMethod = benchmark.agentExpectations?.testCommand
    ? 'codex-cli-baseline+tests'
    : 'codex-cli-baseline';
  const provider = 'codex-cli-baseline';
  const resolvedModel = model || 'codex-default';
  const trusted = result.success === true && result.testsPassed === true && !result.error;
  // Each consumer gets its own object, as before, in case the decorator mutates it.
  const buildRunConfig = () => ({ timeoutMs, scoringMethod, externalRunner: 'codex-cli' });

  brain.insertBenchmarkResult(decorateBenchmarkResult({
    runId,
    suite: 'coding-agent',
    promptId: benchmark.id,
    taskType: 'coding-agent',
    difficulty: benchmark.difficulty,
    provider,
    model: resolvedModel,
    prompt: benchmark.prompt,
    response: result.output || '',
    traitScore: null,
    matchedTraits: [],
    compositeScore: result.score?.composite || 0,
    latencyMs: result.latencyMs,
    error: result.error,
    timestamp: result.timestamp,
    costDollars: null,
    testsBefore: result.testsBefore ?? null,
    testsAfter: result.testsAfter ?? null,
    totalTests: result.totalTests ?? null,
    dimensionsJson: result.dimensionsJson || null,
    inputTokens: result.inputTokens ?? null,
    outputTokens: result.outputTokens ?? null,
    scorerVersion: DEFAULT_SCORER_VERSION,
    scoringMethod,
    trusted,
    runConfig: buildRunConfig(),
  }, {
    suite: 'coding-agent',
    benchmark,
    runId,
    provider,
    model: resolvedModel,
    scoringMethod,
    scorerVersion: DEFAULT_SCORER_VERSION,
    trusted,
    runConfig: buildRunConfig(),
  }));
}
|
|
610
|
+
|
|
611
|
+
/**
 * Turn an arbitrary identifier into a file-name-safe token.
 *
 * Runs of characters other than letters, digits, `_`, `.` and `-` collapse
 * into a single `-`, and the result is capped at 80 characters.
 *
 * @param {*} value - Identifier; falsy values become `'unknown'`.
 * @returns {string} Sanitised identifier.
 */
function safeId(value) {
  const text = String(value || 'unknown');
  const sanitized = text.replace(/[^a-z0-9_.-]+/gi, '-');
  return sanitized.substring(0, 80);
}
|
|
614
|
+
|
|
615
|
+
// Public API of the Codex CLI baseline runner: the benchmark entry point, the
// artifact/result persistence helpers, and the argument-building and parsing
// helpers they are built from.
module.exports = {
  DEFAULT_RESULTS_DIR,
  buildCodexExecArgs,
  buildMcpDisableConfigOverrides,
  buildCodexPrompt,
  codexEnv,
  classifyCodexCommand,
  classifyCodexItem,
  discoverEnabledCodexMcpServers,
  filterNonFatalCodexStderr,
  parseEnabledCodexMcpServers,
  parseCodexTokenCount,
  parseCodexJsonEvents,
  runCodexCliBaselineBenchmark,
  runCodexExec,
  summarizeBaselineResults,
  writeBaselineArtifact,
  storeBaselineResult,
};
|
|
@@ -31,6 +31,7 @@ if (!bench) { console.error('No benchmark:', benchId); process.exit(1); }
|
|
|
31
31
|
cwd: dir,
|
|
32
32
|
timeoutMs: 120000,
|
|
33
33
|
mode: 'build',
|
|
34
|
+
persistTranscript: false,
|
|
34
35
|
onProgress: (e) => {
|
|
35
36
|
if (e.message) {
|
|
36
37
|
const detail = e.detail ? ` ${JSON.stringify(e.detail).slice(0, 120)}` : '';
|
|
@@ -5,7 +5,7 @@ const path = require('path');
|
|
|
5
5
|
const fs = require('fs');
|
|
6
6
|
|
|
7
7
|
const { pLimit, getAvailableProviders } = require('./head-to-head');
|
|
8
|
-
const { runAgentBenchmark, runMultiTurnBenchmark } = require('./agent-runner');
|
|
8
|
+
const { runAgentBenchmark, runMultiTurnBenchmark, isTrustedAgentResult } = require('./agent-runner');
|
|
9
9
|
const { createClient } = require('../llm/client');
|
|
10
10
|
const { createAnthropicFromEnv } = require('../llm/anthropic');
|
|
11
11
|
const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
|
|
@@ -566,7 +566,7 @@ class EvalOrchestrator extends EventEmitter {
|
|
|
566
566
|
outputTokens: result.outputTokens ?? null,
|
|
567
567
|
scorerVersion: DEFAULT_SCORER_VERSION,
|
|
568
568
|
scoringMethod,
|
|
569
|
-
trusted:
|
|
569
|
+
trusted: isTrustedAgentResult(result),
|
|
570
570
|
runConfig: {
|
|
571
571
|
timeoutMs: this.timeoutMs,
|
|
572
572
|
concurrency: this.concurrency,
|
|
@@ -581,7 +581,7 @@ class EvalOrchestrator extends EventEmitter {
|
|
|
581
581
|
model: item.model,
|
|
582
582
|
scoringMethod,
|
|
583
583
|
scorerVersion: DEFAULT_SCORER_VERSION,
|
|
584
|
-
trusted:
|
|
584
|
+
trusted: isTrustedAgentResult(result),
|
|
585
585
|
runConfig: {
|
|
586
586
|
timeoutMs: this.timeoutMs,
|
|
587
587
|
concurrency: this.concurrency,
|