create-walle 0.9.11 → 0.9.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/package.json +2 -2
- package/template/bin/dev.sh +7 -1
- package/template/bin/setup.js +53 -9
- package/template/bin/sync-images.js +53 -0
- package/template/builder-journal.md +17 -0
- package/template/claude-task-manager/api-prompts.js +98 -13
- package/template/claude-task-manager/api-reviews.js +82 -5
- package/template/claude-task-manager/db.js +32 -5
- package/template/claude-task-manager/docs/session-capture-foundation-design.md +1273 -0
- package/template/claude-task-manager/lib/claude-desktop-sessions.js +696 -0
- package/template/claude-task-manager/lib/coding-agent-models.js +49 -1
- package/template/claude-task-manager/lib/session-capture.js +421 -0
- package/template/claude-task-manager/lib/session-history.js +135 -15
- package/template/claude-task-manager/lib/session-jobs.js +10 -5
- package/template/claude-task-manager/lib/session-stream.js +87 -19
- package/template/claude-task-manager/lib/setup-provider-config.js +115 -0
- package/template/claude-task-manager/lib/walle-ctm-history.js +72 -0
- package/template/claude-task-manager/lib/walle-session-context.js +61 -0
- package/template/claude-task-manager/lib/walle-transcript.js +176 -0
- package/template/claude-task-manager/public/css/setup.css +35 -8
- package/template/claude-task-manager/public/css/walle-session.css +56 -0
- package/template/claude-task-manager/public/css/walle.css +120 -0
- package/template/claude-task-manager/public/index.html +814 -181
- package/template/claude-task-manager/public/js/message-renderer.js +148 -19
- package/template/claude-task-manager/public/js/reviews.js +120 -62
- package/template/claude-task-manager/public/js/setup.js +75 -31
- package/template/claude-task-manager/public/js/stream-view.js +115 -55
- package/template/claude-task-manager/public/js/walle-session.js +84 -2
- package/template/claude-task-manager/public/js/walle.js +308 -54
- package/template/claude-task-manager/server.js +1092 -146
- package/template/claude-task-manager/session-integrity.js +181 -54
- package/template/claude-task-manager/session-utils.js +123 -41
- package/template/claude-task-manager/workers/state-detectors/codex.js +5 -2
- package/template/package.json +1 -1
- package/template/wall-e/adapters/ctm.js +39 -18
- package/template/wall-e/agent-runners/contract.js +17 -0
- package/template/wall-e/agent-runners/index.js +22 -0
- package/template/wall-e/agent-runtime/harness.js +212 -0
- package/template/wall-e/agent-runtime/index.js +8 -0
- package/template/wall-e/agent-runtime/registry.js +67 -0
- package/template/wall-e/agent-runtime/session-store.js +179 -0
- package/template/wall-e/agent-runtime/spawn.js +208 -0
- package/template/wall-e/api-walle.js +174 -7
- package/template/wall-e/brain.js +266 -28
- package/template/wall-e/channels/policy.js +88 -0
- package/template/wall-e/channels/registry.js +15 -1
- package/template/wall-e/channels/reply-dispatcher.js +70 -0
- package/template/wall-e/channels/session-bindings.js +51 -0
- package/template/wall-e/chat/code-review-context.js +29 -0
- package/template/wall-e/chat.js +188 -42
- package/template/wall-e/coding/acp-adapter.js +188 -0
- package/template/wall-e/coding/agent-catalog.js +129 -0
- package/template/wall-e/coding/compaction-service.js +247 -0
- package/template/wall-e/coding/execution-trace.js +3 -0
- package/template/wall-e/coding/instruction-service.js +224 -0
- package/template/wall-e/coding/model-message.js +67 -0
- package/template/wall-e/coding/permission-rules-store.js +111 -0
- package/template/wall-e/coding/permission-service.js +266 -0
- package/template/wall-e/coding/prompt-bundle.js +67 -0
- package/template/wall-e/coding/prompt-runtime.js +243 -0
- package/template/wall-e/coding/provider-transform.js +188 -0
- package/template/wall-e/coding/runtime-mode.js +132 -0
- package/template/wall-e/coding/snapshot-service.js +155 -0
- package/template/wall-e/coding/stream-processor.js +268 -0
- package/template/wall-e/coding/task-tool.js +255 -0
- package/template/wall-e/coding/tool-registry.js +361 -0
- package/template/wall-e/coding/transcript-writer.js +143 -0
- package/template/wall-e/coding/workspace-replay.js +324 -0
- package/template/wall-e/coding-context.js +4 -22
- package/template/wall-e/coding-orchestrator.js +307 -18
- package/template/wall-e/coding-prompts.js +44 -3
- package/template/wall-e/context/context-builder.js +43 -1
- package/template/wall-e/context/topic-matcher.js +1 -1
- package/template/wall-e/eval/agent-runner.js +59 -13
- package/template/wall-e/eval/benchmarks/memory-retrieval.json +155 -57
- package/template/wall-e/eval/benchmarks.js +100 -16
- package/template/wall-e/eval/eval-orchestrator.js +218 -8
- package/template/wall-e/eval/harvester.js +62 -5
- package/template/wall-e/eval/head-to-head.js +23 -2
- package/template/wall-e/eval/humaneval-adapter.js +30 -5
- package/template/wall-e/eval/livecodebench-adapter.js +29 -5
- package/template/wall-e/eval/manifest.js +186 -0
- package/template/wall-e/eval/run-agent-benchmarks.js +66 -2
- package/template/wall-e/eval/session-retrieval-benchmark.js +150 -0
- package/template/wall-e/eval/session-transcripts.js +57 -4
- package/template/wall-e/eval/swebench-adapter.js +109 -3
- package/template/wall-e/evaluation/agent-router.js +53 -1
- package/template/wall-e/evaluation/coding-quorum.js +48 -1
- package/template/wall-e/evaluation/router.js +4 -2
- package/template/wall-e/evaluation/tier-selector.js +11 -1
- package/template/wall-e/extraction/contradiction.js +2 -2
- package/template/wall-e/extraction/indexer.js +2 -1
- package/template/wall-e/extraction/knowledge-extractor.js +2 -2
- package/template/wall-e/hooks/cli.js +92 -0
- package/template/wall-e/hooks/discovery.js +119 -0
- package/template/wall-e/hooks/index.js +7 -0
- package/template/wall-e/hooks/manifest.js +55 -0
- package/template/wall-e/hooks/runtime.js +84 -0
- package/template/wall-e/hooks/session-memory.js +225 -0
- package/template/wall-e/http/auth.js +6 -2
- package/template/wall-e/http/chat-api.js +54 -8
- package/template/wall-e/integrations/claude-plugin/hooks/hooks.json +27 -0
- package/template/wall-e/integrations/claude-plugin/hooks/walle-precompact-hook.sh +5 -0
- package/template/wall-e/integrations/claude-plugin/hooks/walle-stop-hook.sh +5 -0
- package/template/wall-e/integrations/codex-plugin/hooks/walle-hook.sh +7 -0
- package/template/wall-e/integrations/codex-plugin/hooks.json +37 -0
- package/template/wall-e/listening/calendar.js +3 -1
- package/template/wall-e/llm/client.js +64 -10
- package/template/wall-e/llm/google.js +39 -5
- package/template/wall-e/llm/ollama.js +1 -1
- package/template/wall-e/llm/ollama.plugin.json +1 -1
- package/template/wall-e/llm/provider-availability.js +10 -0
- package/template/wall-e/llm/provider-error.js +269 -0
- package/template/wall-e/llm/tool-adapter.js +48 -12
- package/template/wall-e/loops/boot.js +2 -1
- package/template/wall-e/loops/initiative.js +2 -2
- package/template/wall-e/loops/tasks.js +8 -47
- package/template/wall-e/loops/workspace-prompts.js +20 -0
- package/template/wall-e/mcp-server.js +442 -1
- package/template/wall-e/memory/session-ingest-service.js +159 -0
- package/template/wall-e/memory/source-indexer.js +289 -0
- package/template/wall-e/plugins/discovery.js +83 -0
- package/template/wall-e/plugins/manifest-loader.js +50 -10
- package/template/wall-e/plugins/manifest-schema.js +69 -0
- package/template/wall-e/plugins/model-catalog.js +55 -0
- package/template/wall-e/prompts/coding/base.txt +2 -0
- package/template/wall-e/prompts/coding/deepseek.txt +1 -0
- package/template/wall-e/prompts/coding/memory-protocol.md +9 -0
- package/template/wall-e/prompts/coding/plan.txt +1 -0
- package/template/wall-e/runtime/execution-trace.js +220 -0
- package/template/wall-e/security/audit.js +266 -0
- package/template/wall-e/security/ssrf.js +236 -0
- package/template/wall-e/session-files.js +303 -0
- package/template/wall-e/skills/_bundled/slack-backfill/SKILL.md +3 -0
- package/template/wall-e/skills/_bundled/slack-sync/SKILL.md +3 -0
- package/template/wall-e/skills/internal-skill-registry.js +2 -2
- package/template/wall-e/skills/script-skill-runner.js +143 -0
- package/template/wall-e/skills/skill-executor.js +5 -6
- package/template/wall-e/skills/skill-fallback.js +3 -1
- package/template/wall-e/skills/skill-harness-registry.js +7 -8
- package/template/wall-e/skills/skill-planner.js +52 -4
- package/template/wall-e/skills/slack-ingest.js +11 -3
- package/template/wall-e/sources/base.js +90 -0
- package/template/wall-e/sources/builtin.js +33 -0
- package/template/wall-e/sources/claude-code-jsonl.js +78 -0
- package/template/wall-e/sources/codex-jsonl.js +125 -0
- package/template/wall-e/sources/coding-session-utils.js +117 -0
- package/template/wall-e/sources/contract-suite.js +59 -0
- package/template/wall-e/sources/gemini-jsonl.js +85 -0
- package/template/wall-e/sources/index.js +9 -0
- package/template/wall-e/sources/jsonl-utils.js +181 -0
- package/template/wall-e/sources/record-types.js +252 -0
- package/template/wall-e/sources/registry.js +92 -0
- package/template/wall-e/sources/transforms.js +100 -0
- package/template/wall-e/sources/walle-jsonl.js +108 -0
- package/template/wall-e/tools/coding-middleware.js +31 -1
- package/template/wall-e/tools/file-tracker.js +25 -1
- package/template/wall-e/tools/local-tools.js +75 -47
- package/template/wall-e/tools/session-sharing.js +68 -1
- package/template/wall-e/tools/shell-analyzer.js +1 -1
- package/template/wall-e/tools/shell-policy.js +47 -0
- package/template/wall-e/tools/snapshot.js +42 -0
- package/template/wall-e/training/harvester.js +62 -5
- package/template/wall-e/utils/repair.js +253 -1
- package/template/website/index.html +3 -3
- package/template/wall-e/skills/_bundled/slack-mentions/.watched-threads.json +0 -18
|
@@ -6,6 +6,7 @@ const crypto = require('crypto');
|
|
|
6
6
|
const { execFileSync, execFile } = require('child_process');
|
|
7
7
|
const { promisify } = require('util');
|
|
8
8
|
const execFileAsync = promisify(execFile);
|
|
9
|
+
const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
|
|
9
10
|
|
|
10
11
|
const DEFAULT_TIMEOUT_MS = 600_000; // 10 minutes — coding agents can take long
|
|
11
12
|
const FIXTURES_DIR = path.join(__dirname, 'fixtures');
|
|
@@ -106,11 +107,18 @@ async function runAgentBenchmark(benchmark, options = {}) {
|
|
|
106
107
|
provider,
|
|
107
108
|
model,
|
|
108
109
|
mode: 'build',
|
|
110
|
+
benchmark: true,
|
|
111
|
+
headless: true,
|
|
112
|
+
headlessPolicy: 'allow',
|
|
113
|
+
permissionTimeoutMs: 0,
|
|
114
|
+
});
|
|
115
|
+
let timeoutHandle;
|
|
116
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
117
|
+
timeoutHandle = setTimeout(() => reject(new Error('Hard timeout exceeded')), effectiveTimeout + 60000); // +1min grace
|
|
118
|
+
if (typeof timeoutHandle.unref === 'function') timeoutHandle.unref();
|
|
109
119
|
});
|
|
110
|
-
const timeoutPromise = new Promise((_, reject) =>
|
|
111
|
-
setTimeout(() => reject(new Error('Hard timeout exceeded')), effectiveTimeout + 60000) // +1min grace
|
|
112
|
-
);
|
|
113
120
|
const result = await Promise.race([agentPromise, timeoutPromise]);
|
|
121
|
+
if (timeoutHandle) clearTimeout(timeoutHandle);
|
|
114
122
|
|
|
115
123
|
const latencyMs = Date.now() - startTime;
|
|
116
124
|
|
|
@@ -145,6 +153,21 @@ async function runAgentBenchmark(benchmark, options = {}) {
|
|
|
145
153
|
if (totalTests === null) totalTests = afterCounts.total;
|
|
146
154
|
}
|
|
147
155
|
|
|
156
|
+
const inputTokens = usage.inputTokens ?? usage.input ?? 0;
|
|
157
|
+
const expectedFileChanges = expectations.expectedFileChanges || [];
|
|
158
|
+
const missingExpectedWork = expectedFileChanges.length > 0 && actualFileChanges.length === 0;
|
|
159
|
+
const testRegression = (expectations.testCommand && testsPassed === false);
|
|
160
|
+
const rawError = result.stderr || result.error || null;
|
|
161
|
+
const validatedByTests = Boolean(
|
|
162
|
+
expectations.testCommand &&
|
|
163
|
+
testsPassed === true &&
|
|
164
|
+
actualFileChanges.length > 0
|
|
165
|
+
);
|
|
166
|
+
const fatalError = rawError && !validatedByTests ? rawError : null;
|
|
167
|
+
const noEffort = (actualToolCalls.length === 0) || (inputTokens === 0) || missingExpectedWork;
|
|
168
|
+
const hadError = !!fatalError;
|
|
169
|
+
const validatedSuccess = Boolean(result.success || validatedByTests) && !hadError && !noEffort && !testRegression;
|
|
170
|
+
|
|
148
171
|
// Score the result
|
|
149
172
|
let score = scoreAgentResult(benchmark, {
|
|
150
173
|
actualToolCalls,
|
|
@@ -152,7 +175,7 @@ async function runAgentBenchmark(benchmark, options = {}) {
|
|
|
152
175
|
actualTurns,
|
|
153
176
|
testsPassed,
|
|
154
177
|
output: result.output || '',
|
|
155
|
-
success:
|
|
178
|
+
success: validatedSuccess,
|
|
156
179
|
sandboxDir,
|
|
157
180
|
costDollars,
|
|
158
181
|
testsBefore,
|
|
@@ -167,21 +190,23 @@ async function runAgentBenchmark(benchmark, options = {}) {
|
|
|
167
190
|
// through process-metric weights (turnEconomy, errorHandling, costEfficiency).
|
|
168
191
|
// That inflated past failure-investigation thresholds and reported FAIL as
|
|
169
192
|
// PASS. Cap explicitly here.
|
|
170
|
-
const inputTokens = usage.inputTokens ?? usage.input ?? 0;
|
|
171
|
-
const hadError = !!(result.stderr || result.error);
|
|
172
|
-
const noEffort = (actualToolCalls.length === 0) || (inputTokens === 0);
|
|
173
|
-
const testRegression = (expectations.testCommand && testsPassed === false);
|
|
174
193
|
if (hadError || noEffort || testRegression) {
|
|
175
194
|
score = {
|
|
176
195
|
composite: 0,
|
|
177
196
|
dimensions: { ...(score.dimensions || {}), _zeroed: true,
|
|
178
|
-
_zeroReason: hadError
|
|
197
|
+
_zeroReason: hadError
|
|
198
|
+
? 'error'
|
|
199
|
+
: testRegression
|
|
200
|
+
? 'tests_failed'
|
|
201
|
+
: missingExpectedWork
|
|
202
|
+
? 'no_file_changes'
|
|
203
|
+
: 'no_effort' },
|
|
179
204
|
};
|
|
180
205
|
}
|
|
181
206
|
|
|
182
207
|
return {
|
|
183
208
|
benchmarkId: benchmark.id,
|
|
184
|
-
success:
|
|
209
|
+
success: validatedSuccess,
|
|
185
210
|
score,
|
|
186
211
|
latencyMs,
|
|
187
212
|
actualToolCalls,
|
|
@@ -196,7 +221,7 @@ async function runAgentBenchmark(benchmark, options = {}) {
|
|
|
196
221
|
outputTokens: usage.outputTokens ?? usage.output ?? null,
|
|
197
222
|
dimensionsJson: JSON.stringify(score.dimensions || {}),
|
|
198
223
|
output: (result.output || '').slice(0, 2000),
|
|
199
|
-
error:
|
|
224
|
+
error: fatalError,
|
|
200
225
|
};
|
|
201
226
|
} catch (err) {
|
|
202
227
|
return {
|
|
@@ -304,6 +329,10 @@ async function runMultiTurnBenchmark(benchmark, options = {}) {
|
|
|
304
329
|
provider,
|
|
305
330
|
model,
|
|
306
331
|
mode: 'build',
|
|
332
|
+
benchmark: true,
|
|
333
|
+
headless: true,
|
|
334
|
+
headlessPolicy: 'allow',
|
|
335
|
+
permissionTimeoutMs: 0,
|
|
307
336
|
messages, // pass accumulated conversation
|
|
308
337
|
});
|
|
309
338
|
|
|
@@ -449,7 +478,10 @@ async function runAgentBenchmarkSuite(options = {}) {
|
|
|
449
478
|
// Store result
|
|
450
479
|
if (brain && typeof brain.insertBenchmarkResult === 'function') {
|
|
451
480
|
try {
|
|
452
|
-
|
|
481
|
+
const scoringMethod = benchmark.agentExpectations?.testCommand
|
|
482
|
+
? 'agent-rubric+tests'
|
|
483
|
+
: 'agent-rubric';
|
|
484
|
+
brain.insertBenchmarkResult(decorateBenchmarkResult({
|
|
453
485
|
runId,
|
|
454
486
|
suite: 'coding-agent',
|
|
455
487
|
promptId: benchmark.id,
|
|
@@ -473,7 +505,21 @@ async function runAgentBenchmarkSuite(options = {}) {
|
|
|
473
505
|
dimensionsJson: result.dimensionsJson || null,
|
|
474
506
|
inputTokens: result.inputTokens ?? null,
|
|
475
507
|
outputTokens: result.outputTokens ?? null,
|
|
476
|
-
|
|
508
|
+
scorerVersion: DEFAULT_SCORER_VERSION,
|
|
509
|
+
scoringMethod,
|
|
510
|
+
trusted: !result.error && result.testsPassed === true,
|
|
511
|
+
runConfig: { timeoutMs, scoringMethod },
|
|
512
|
+
}, {
|
|
513
|
+
suite: 'coding-agent',
|
|
514
|
+
benchmark,
|
|
515
|
+
runId,
|
|
516
|
+
provider: provider?.type || 'default',
|
|
517
|
+
model: resolveModelName(model),
|
|
518
|
+
scoringMethod,
|
|
519
|
+
scorerVersion: DEFAULT_SCORER_VERSION,
|
|
520
|
+
trusted: !result.error && result.testsPassed === true,
|
|
521
|
+
runConfig: { timeoutMs, scoringMethod },
|
|
522
|
+
}));
|
|
477
523
|
} catch { /* non-fatal */ }
|
|
478
524
|
}
|
|
479
525
|
}
|
|
@@ -1,82 +1,180 @@
|
|
|
1
1
|
[
|
|
2
2
|
{
|
|
3
|
-
"id": "
|
|
4
|
-
"prompt": "
|
|
5
|
-
"taskType": "memory-retrieval",
|
|
6
|
-
"difficulty": "easy",
|
|
7
|
-
"expectedTraits": ["references context", "mentions Rust", "mentions Phoenix", "accurate extraction"],
|
|
8
|
-
"tags": ["recall", "simple"]
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "memory-002",
|
|
12
|
-
"prompt": "Context: Yesterday the user mentioned they have a meeting with the design team on Thursday at 2pm. They also need to submit the Q3 report by Friday. Their manager's name is Sarah Chen.\n\nQuestion: What are the user's upcoming deadlines and meetings?",
|
|
13
|
-
"taskType": "memory-retrieval",
|
|
14
|
-
"difficulty": "easy",
|
|
15
|
-
"expectedTraits": ["references context", "mentions Thursday meeting", "mentions Friday report", "accurate extraction"],
|
|
16
|
-
"tags": ["recall", "schedule"]
|
|
17
|
-
},
|
|
18
|
-
{
|
|
19
|
-
"id": "memory-003",
|
|
20
|
-
"prompt": "Context: The user's tech stack includes Next.js for frontend, FastAPI for backend, PostgreSQL for the database, and Redis for caching. They deploy on AWS using ECS. The frontend is hosted on Vercel.\n\nQuestion: If the user needs to add a new API endpoint, which framework and language should they use based on their stack?",
|
|
3
|
+
"id": "session-recall-001",
|
|
4
|
+
"prompt": "Use Wall-E memory to answer: in the prior Codex parser session, which file was changed and what test command proved the fix?",
|
|
21
5
|
"taskType": "memory-retrieval",
|
|
22
6
|
"difficulty": "medium",
|
|
23
|
-
"expectedTraits": ["
|
|
24
|
-
"tags": ["
|
|
7
|
+
"expectedTraits": ["searches session memory", "mentions parser.js", "mentions node --test", "cites session id"],
|
|
8
|
+
"tags": ["session-recall", "coding", "sanitized-real-shape"],
|
|
9
|
+
"retrieval": {
|
|
10
|
+
"query": "parser src/parser.js node",
|
|
11
|
+
"expectedSourceIds": ["codex:sanitized-parser"],
|
|
12
|
+
"expectedSnippets": ["src/parser.js", "node --test tests/parser.test.js"],
|
|
13
|
+
"seedMemories": [
|
|
14
|
+
{
|
|
15
|
+
"source": "codex-jsonl",
|
|
16
|
+
"source_id": "codex:sanitized-parser:exchange:1",
|
|
17
|
+
"memory_type": "coding_session_exchange",
|
|
18
|
+
"timestamp": "2026-04-28T09:15:00.000Z",
|
|
19
|
+
"cwd": "/repo/app",
|
|
20
|
+
"content": "Q: Fix the parser crash when quoted values include commas.\nA: Decision: keep the tokenizer stateful instead of adding a regex split. Files: src/parser.js, tests/parser.test.js. Command: node --test tests/parser.test.js passed.",
|
|
21
|
+
"metadata": {
|
|
22
|
+
"sourceId": "codex:sanitized-parser",
|
|
23
|
+
"cwd": "/repo/app",
|
|
24
|
+
"gitBranch": "fix/parser-quoted-values",
|
|
25
|
+
"filesEdited": ["src/parser.js", "tests/parser.test.js"],
|
|
26
|
+
"commands": ["node --test tests/parser.test.js"]
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
]
|
|
30
|
+
}
|
|
25
31
|
},
|
|
26
32
|
{
|
|
27
|
-
"id": "
|
|
28
|
-
"prompt": "
|
|
33
|
+
"id": "session-recall-002",
|
|
34
|
+
"prompt": "Use Wall-E memory to answer: why did the prior Claude session reject the batch write approach for the queue worker?",
|
|
29
35
|
"taskType": "memory-retrieval",
|
|
30
36
|
"difficulty": "medium",
|
|
31
|
-
"expectedTraits": ["
|
|
32
|
-
"tags": ["
|
|
37
|
+
"expectedTraits": ["searches session memory", "mentions lock contention", "mentions queue-worker.js", "cites session id"],
|
|
38
|
+
"tags": ["decision", "failure", "sanitized-real-shape"],
|
|
39
|
+
"retrieval": {
|
|
40
|
+
"query": "queue worker lock contention",
|
|
41
|
+
"expectedSourceIds": ["claude:sanitized-queue"],
|
|
42
|
+
"expectedSnippets": ["lock contention", "queue-worker.js"],
|
|
43
|
+
"seedMemories": [
|
|
44
|
+
{
|
|
45
|
+
"source": "claude-code-jsonl",
|
|
46
|
+
"source_id": "claude:sanitized-queue:exchange:4",
|
|
47
|
+
"memory_type": "coding_session_exchange",
|
|
48
|
+
"timestamp": "2026-04-27T17:22:00.000Z",
|
|
49
|
+
"cwd": "/repo/service",
|
|
50
|
+
"content": "Q: Speed up queue persistence.\nA: Blocker: batch writes increased SQLite lock contention under concurrent workers. Decision: keep single-row retry with jitter. Files: workers/queue-worker.js, tests/queue-worker.test.js. Command: npm test -- queue-worker passed.",
|
|
51
|
+
"metadata": {
|
|
52
|
+
"sourceId": "claude:sanitized-queue",
|
|
53
|
+
"cwd": "/repo/service",
|
|
54
|
+
"gitBranch": "fix/queue-locking",
|
|
55
|
+
"filesEdited": ["workers/queue-worker.js", "tests/queue-worker.test.js"],
|
|
56
|
+
"commands": ["npm test -- queue-worker"]
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
]
|
|
60
|
+
}
|
|
33
61
|
},
|
|
34
62
|
{
|
|
35
|
-
"id": "
|
|
36
|
-
"prompt": "
|
|
63
|
+
"id": "session-recall-003",
|
|
64
|
+
"prompt": "Use Wall-E memory to answer: which browser test covered the transcript blank-space regression?",
|
|
37
65
|
"taskType": "memory-retrieval",
|
|
38
66
|
"difficulty": "medium",
|
|
39
|
-
"expectedTraits": ["
|
|
40
|
-
"tags": ["
|
|
67
|
+
"expectedTraits": ["searches session memory", "mentions codex-blank-space.spec.js", "mentions blank gap", "cites session id"],
|
|
68
|
+
"tags": ["ui-regression", "browser-test", "sanitized-real-shape"],
|
|
69
|
+
"retrieval": {
|
|
70
|
+
"query": "blank-gap codex-blank-space.spec.js",
|
|
71
|
+
"expectedSourceIds": ["walle:sanitized-terminal-ui"],
|
|
72
|
+
"expectedSnippets": ["codex-blank-space.spec.js", "completed-turn blank-gap compaction"],
|
|
73
|
+
"seedMemories": [
|
|
74
|
+
{
|
|
75
|
+
"source": "walle-jsonl",
|
|
76
|
+
"source_id": "walle:sanitized-terminal-ui:assistant:12",
|
|
77
|
+
"memory_type": "coding_session_assistant_message",
|
|
78
|
+
"timestamp": "2026-04-29T11:05:00.000Z",
|
|
79
|
+
"cwd": "/repo/tools",
|
|
80
|
+
"content": "Decision: fix completed-turn blank-gap compaction in the Codex terminal renderer. Files: claude-task-manager/public/session-stream.js, claude-task-manager/tests/codex-blank-space.spec.js. Command: npx playwright test claude-task-manager/tests/codex-blank-space.spec.js passed.",
|
|
81
|
+
"metadata": {
|
|
82
|
+
"sourceId": "walle:sanitized-terminal-ui",
|
|
83
|
+
"cwd": "/repo/tools",
|
|
84
|
+
"gitBranch": "fix/codex-terminal-blank-gap",
|
|
85
|
+
"filesEdited": ["claude-task-manager/public/session-stream.js", "claude-task-manager/tests/codex-blank-space.spec.js"],
|
|
86
|
+
"commands": ["npx playwright test claude-task-manager/tests/codex-blank-space.spec.js"]
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
]
|
|
90
|
+
}
|
|
41
91
|
},
|
|
42
92
|
{
|
|
43
|
-
"id": "
|
|
44
|
-
"prompt": "
|
|
45
|
-
"taskType": "memory-retrieval",
|
|
46
|
-
"difficulty": "medium",
|
|
47
|
-
"expectedTraits": ["references context", "mentions OrderService", "mentions port 3002", "mentions AuthService for SLA", "accurate extraction"],
|
|
48
|
-
"tags": ["multi-fact", "architecture"]
|
|
49
|
-
},
|
|
50
|
-
{
|
|
51
|
-
"id": "memory-007",
|
|
52
|
-
"prompt": "Context: The user mentioned they tried three different approaches to solve a caching problem: 1) Redis with 5-minute TTL (too many cache misses), 2) In-memory LRU cache (worked but lost on restart), 3) Redis with write-through strategy (current solution, working well).\n\nQuestion: What caching approach is the user currently using and why did they reject the alternatives?",
|
|
53
|
-
"taskType": "memory-retrieval",
|
|
54
|
-
"difficulty": "medium",
|
|
55
|
-
"expectedTraits": ["references context", "mentions write-through", "explains rejected approaches", "accurate extraction"],
|
|
56
|
-
"tags": ["history", "decision"]
|
|
57
|
-
},
|
|
58
|
-
{
|
|
59
|
-
"id": "memory-008",
|
|
60
|
-
"prompt": "Context: The user has NOT mentioned anything about their deployment pipeline or CI/CD setup.\n\nQuestion: What CI/CD tool does the user use for their deployments?",
|
|
93
|
+
"id": "session-recall-004",
|
|
94
|
+
"prompt": "Use Wall-E memory to answer: what was the next step after the model routing quorum session?",
|
|
61
95
|
"taskType": "memory-retrieval",
|
|
62
96
|
"difficulty": "hard",
|
|
63
|
-
"expectedTraits": ["
|
|
64
|
-
"tags": ["
|
|
97
|
+
"expectedTraits": ["searches diary", "mentions router inputs", "mentions evaluation", "cites diary/session id"],
|
|
98
|
+
"tags": ["diary", "handoff", "sanitized-real-shape"],
|
|
99
|
+
"retrieval": {
|
|
100
|
+
"query": "model routing quorum next step router inputs evaluation diary",
|
|
101
|
+
"expectedSourceIds": ["diary:walle:sanitized-quorum:stop"],
|
|
102
|
+
"expectedSnippets": ["feed quorum results into routing", "run trusted evaluation"],
|
|
103
|
+
"seedMemories": [
|
|
104
|
+
{
|
|
105
|
+
"source": "walle-diary",
|
|
106
|
+
"source_id": "diary:walle:sanitized-quorum:stop",
|
|
107
|
+
"memory_type": "agent_diary",
|
|
108
|
+
"timestamp": "2026-04-26T20:30:00.000Z",
|
|
109
|
+
"cwd": "/repo/tools",
|
|
110
|
+
"content": "Agent diary for walle session sanitized-quorum (stop)\nSummary: feed quorum results into routing instead of leaving them as a side report.\nChanged files: wall-e/evaluation/coding-quorum.js; wall-e/routing/model-router.js\nDecisions: use reviewer/quorum/security data as router inputs.\nNext steps: run trusted evaluation on coding-agent-real cases.",
|
|
111
|
+
"metadata": {
|
|
112
|
+
"sourceId": "diary:walle:sanitized-quorum:stop",
|
|
113
|
+
"sessionId": "sanitized-quorum",
|
|
114
|
+
"agent": "walle",
|
|
115
|
+
"event": "stop",
|
|
116
|
+
"cwd": "/repo/tools",
|
|
117
|
+
"changed_files": ["wall-e/evaluation/coding-quorum.js", "wall-e/routing/model-router.js"],
|
|
118
|
+
"next_steps": ["run trusted evaluation on coding-agent-real cases"]
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
]
|
|
122
|
+
}
|
|
65
123
|
},
|
|
66
124
|
{
|
|
67
|
-
"id": "
|
|
68
|
-
"prompt": "
|
|
125
|
+
"id": "session-recall-005",
|
|
126
|
+
"prompt": "Use Wall-E memory to answer: which source adapter handled Gemini JSONL and what privacy class did it use?",
|
|
69
127
|
"taskType": "memory-retrieval",
|
|
70
128
|
"difficulty": "medium",
|
|
71
|
-
"expectedTraits": ["
|
|
72
|
-
"tags": ["
|
|
129
|
+
"expectedTraits": ["searches session memory", "mentions gemini-jsonl", "mentions pii_potential", "cites session id"],
|
|
130
|
+
"tags": ["source-adapter", "privacy", "sanitized-real-shape"],
|
|
131
|
+
"retrieval": {
|
|
132
|
+
"query": "Gemini JSONL source adapter privacy class pii_potential",
|
|
133
|
+
"expectedSourceIds": ["codex:sanitized-source-adapters"],
|
|
134
|
+
"expectedSnippets": ["gemini-jsonl", "pii_potential"],
|
|
135
|
+
"seedMemories": [
|
|
136
|
+
{
|
|
137
|
+
"source": "codex-jsonl",
|
|
138
|
+
"source_id": "codex:sanitized-source-adapters:assistant:8",
|
|
139
|
+
"memory_type": "coding_session_assistant_message",
|
|
140
|
+
"timestamp": "2026-04-29T14:02:00.000Z",
|
|
141
|
+
"cwd": "/repo/tools",
|
|
142
|
+
"content": "Decision: register claude-code-jsonl, codex-jsonl, gemini-jsonl, and walle-jsonl as first-party source adapters. The default privacy class for coding session adapters is pii_potential because local transcripts can include personal context.",
|
|
143
|
+
"metadata": {
|
|
144
|
+
"sourceId": "codex:sanitized-source-adapters",
|
|
145
|
+
"cwd": "/repo/tools",
|
|
146
|
+
"filesEdited": ["wall-e/sources/gemini-jsonl.js", "wall-e/sources/builtin.js"]
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
]
|
|
150
|
+
}
|
|
73
151
|
},
|
|
74
152
|
{
|
|
75
|
-
"id": "
|
|
76
|
-
"prompt": "
|
|
153
|
+
"id": "session-recall-006",
|
|
154
|
+
"prompt": "Use Wall-E memory to answer honestly: do we have a remembered decision about replacing SQLite with ChromaDB?",
|
|
77
155
|
"taskType": "memory-retrieval",
|
|
78
|
-
"difficulty": "
|
|
79
|
-
"expectedTraits": ["
|
|
80
|
-
"tags": ["recall", "
|
|
156
|
+
"difficulty": "hard",
|
|
157
|
+
"expectedTraits": ["searches session memory", "says do not replace SQLite", "mentions sqlite-vec", "does not hallucinate approval"],
|
|
158
|
+
"tags": ["negative-recall", "architecture", "sanitized-real-shape"],
|
|
159
|
+
"retrieval": {
|
|
160
|
+
"query": "replace SQLite with ChromaDB sqlite-vec decision",
|
|
161
|
+
"expectedSourceIds": ["claude:sanitized-memory-architecture"],
|
|
162
|
+
"expectedSnippets": ["Keep SQLite plus sqlite-vec", "Do not adopt ChromaDB"],
|
|
163
|
+
"seedMemories": [
|
|
164
|
+
{
|
|
165
|
+
"source": "claude-code-jsonl",
|
|
166
|
+
"source_id": "claude:sanitized-memory-architecture:exchange:2",
|
|
167
|
+
"memory_type": "coding_session_exchange",
|
|
168
|
+
"timestamp": "2026-04-29T13:45:00.000Z",
|
|
169
|
+
"cwd": "/repo/tools",
|
|
170
|
+
"content": "Q: Should Wall-E adopt ChromaDB from the reference project?\nA: Decision: Keep SQLite plus sqlite-vec. Do not adopt ChromaDB; port the source-adapter semantics and retrieval tests instead.",
|
|
171
|
+
"metadata": {
|
|
172
|
+
"sourceId": "claude:sanitized-memory-architecture",
|
|
173
|
+
"cwd": "/repo/tools",
|
|
174
|
+
"gitBranch": "feat/session-memory-protocol"
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
]
|
|
178
|
+
}
|
|
81
179
|
}
|
|
82
180
|
]
|
|
@@ -4,6 +4,7 @@ const fs = require('fs');
|
|
|
4
4
|
const path = require('path');
|
|
5
5
|
const crypto = require('crypto');
|
|
6
6
|
const { createClient } = require('../llm/client');
|
|
7
|
+
const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
|
|
7
8
|
|
|
8
9
|
// ============================================================
|
|
9
10
|
// Constants
|
|
@@ -161,6 +162,23 @@ const TRAIT_MATCHERS = {
|
|
|
161
162
|
'mentions conventional commits': (r) => /conventional\s+commit|feat:|fix:|chore:/i.test(r),
|
|
162
163
|
'mentions no force push to main': (r) => /force\s+push|--force|never.*push.*main|no.*force/i.test(r),
|
|
163
164
|
'lists all five preferences': (r) => /dark\s+mode|timezone|America\/Los_Angeles|English|daily\s+digest|verbose/i.test(r),
|
|
165
|
+
'searches session memory': (r) => /session|memory|remember|transcript|source|found/i.test(r),
|
|
166
|
+
'mentions parser.js': (r) => /parser\.js/i.test(r),
|
|
167
|
+
'mentions node --test': (r) => /node\s+--test/i.test(r),
|
|
168
|
+
'cites session id': (r) => /(?:session|source)[\s_-]?id|codex:sanitized|claude:sanitized|walle:sanitized|sanitized-[\w-]+/i.test(r),
|
|
169
|
+
'mentions lock contention': (r) => /lock\s+contention/i.test(r),
|
|
170
|
+
'mentions queue-worker.js': (r) => /queue-worker\.js/i.test(r),
|
|
171
|
+
'mentions codex-blank-space.spec.js': (r) => /codex-blank-space\.spec\.js/i.test(r),
|
|
172
|
+
'mentions blank gap': (r) => /blank[-\s]?gap/i.test(r),
|
|
173
|
+
'searches diary': (r) => /diary|agent diary|remember|memory|source/i.test(r),
|
|
174
|
+
'mentions router inputs': (r) => /router\s+inputs|routing.*inputs/i.test(r),
|
|
175
|
+
'mentions evaluation': (r) => /evaluation|eval|trusted\s+evaluation/i.test(r),
|
|
176
|
+
'cites diary/session id': (r) => /diary|session[\s_-]?id|sanitized-quorum|source[\s_-]?id/i.test(r),
|
|
177
|
+
'mentions gemini-jsonl': (r) => /gemini-jsonl/i.test(r),
|
|
178
|
+
'mentions pii_potential': (r) => /pii_potential/i.test(r),
|
|
179
|
+
'says do not replace SQLite': (r) => /do\s+not\s+(?:adopt|replace|use).*SQLite|keep\s+SQLite|SQLite.*not\s+replace/i.test(r),
|
|
180
|
+
'mentions sqlite-vec': (r) => /sqlite-vec/i.test(r),
|
|
181
|
+
'does not hallucinate approval': (r) => /do\s+not\s+(?:adopt|replace)|no\s+approval|not\s+approved|rejected|keep\s+SQLite/i.test(r),
|
|
164
182
|
|
|
165
183
|
// --- Coding-agent traits ---
|
|
166
184
|
'uses edit over write': (r) => /edit_file|apply_patch|multi_edit/i.test(r) && !/write_file/i.test(r),
|
|
@@ -214,6 +232,15 @@ const TRAIT_MATCHERS = {
|
|
|
214
232
|
'asks clarifying questions': (r) => /ask_user|AskUserQuestion/i.test(r),
|
|
215
233
|
};
|
|
216
234
|
|
|
235
|
+
const UNSCORABLE_TRAITS = new Set([
|
|
236
|
+
'accurate',
|
|
237
|
+
'correct solution',
|
|
238
|
+
'references context',
|
|
239
|
+
'accurate extraction',
|
|
240
|
+
'not hallucinated',
|
|
241
|
+
'does not hallucinate',
|
|
242
|
+
]);
|
|
243
|
+
|
|
217
244
|
// ============================================================
|
|
218
245
|
// Suite loading
|
|
219
246
|
// ============================================================
|
|
@@ -298,6 +325,12 @@ function loadBenchmarkSuite(suiteName) {
|
|
|
298
325
|
if (!hasTraits && !hasReplyChecks && !hasToolChecks && !hasMockTools && !hasAgentExpectations && !isEdgeCase) {
|
|
299
326
|
throw new Error(`Benchmark entry ${entry.id} has no scoring signal (expectedTraits / expectedInReply / expectedTools / mockToolResults / agentExpectations)`);
|
|
300
327
|
}
|
|
328
|
+
if (hasTraits) {
|
|
329
|
+
const unknownTraits = entry.expectedTraits.filter(t => !TRAIT_MATCHERS[t] && !UNSCORABLE_TRAITS.has(t));
|
|
330
|
+
if (unknownTraits.length) {
|
|
331
|
+
throw new Error(`Benchmark entry ${entry.id} has unknown expectedTraits: ${unknownTraits.join(', ')}`);
|
|
332
|
+
}
|
|
333
|
+
}
|
|
301
334
|
}
|
|
302
335
|
|
|
303
336
|
return { name: suiteName, prompts };
|
|
@@ -315,16 +348,41 @@ function loadBenchmarkSuite(suiteName) {
|
|
|
315
348
|
*/
|
|
316
349
|
function scoreTrait(response, trait) {
|
|
317
350
|
if (!response || typeof response !== 'string') return false;
|
|
351
|
+
if (UNSCORABLE_TRAITS.has(trait)) return false;
|
|
318
352
|
const matcher = TRAIT_MATCHERS[trait];
|
|
319
|
-
if (!matcher)
|
|
320
|
-
// Unknown trait — fallback: search for the trait keywords in the response
|
|
321
|
-
const keywords = trait.toLowerCase().split(/\s+/);
|
|
322
|
-
const lower = response.toLowerCase();
|
|
323
|
-
return keywords.some((kw) => kw.length > 3 && lower.includes(kw));
|
|
324
|
-
}
|
|
353
|
+
if (!matcher) return false;
|
|
325
354
|
return matcher(response);
|
|
326
355
|
}
|
|
327
356
|
|
|
357
|
+
function scoreTraitsDetailed(response, expectedTraits) {
|
|
358
|
+
const detail = {
|
|
359
|
+
score: 0,
|
|
360
|
+
matched: [],
|
|
361
|
+
missed: [],
|
|
362
|
+
unscored: [],
|
|
363
|
+
unknown: [],
|
|
364
|
+
scoredCount: 0,
|
|
365
|
+
};
|
|
366
|
+
if (!expectedTraits || expectedTraits.length === 0) return detail;
|
|
367
|
+
|
|
368
|
+
for (const trait of expectedTraits) {
|
|
369
|
+
if (UNSCORABLE_TRAITS.has(trait)) {
|
|
370
|
+
detail.unscored.push(trait);
|
|
371
|
+
continue;
|
|
372
|
+
}
|
|
373
|
+
if (!TRAIT_MATCHERS[trait]) {
|
|
374
|
+
detail.unknown.push(trait);
|
|
375
|
+
continue;
|
|
376
|
+
}
|
|
377
|
+
detail.scoredCount++;
|
|
378
|
+
if (scoreTrait(response, trait)) detail.matched.push(trait);
|
|
379
|
+
else detail.missed.push(trait);
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
detail.score = detail.scoredCount > 0 ? detail.matched.length / detail.scoredCount : 0;
|
|
383
|
+
return detail;
|
|
384
|
+
}
|
|
385
|
+
|
|
328
386
|
/**
|
|
329
387
|
* Score a response against multiple expected traits.
|
|
330
388
|
* @param {string} response - LLM response text
|
|
@@ -332,9 +390,7 @@ function scoreTrait(response, trait) {
|
|
|
332
390
|
* @returns {number} 0.0 to 1.0 based on percentage of traits matched
|
|
333
391
|
*/
|
|
334
392
|
function scoreTraits(response, expectedTraits) {
|
|
335
|
-
|
|
336
|
-
const matched = expectedTraits.filter((t) => scoreTrait(response, t)).length;
|
|
337
|
-
return matched / expectedTraits.length;
|
|
393
|
+
return scoreTraitsDetailed(response, expectedTraits).score;
|
|
338
394
|
}
|
|
339
395
|
|
|
340
396
|
// ============================================================
|
|
@@ -407,11 +463,13 @@ async function runBenchmark(brain, { suite, providers, judgeFn, timeoutMs = DEFA
|
|
|
407
463
|
providerScores[providerKey].errors++;
|
|
408
464
|
}
|
|
409
465
|
|
|
410
|
-
// Score traits
|
|
411
|
-
|
|
412
|
-
const
|
|
413
|
-
?
|
|
414
|
-
:
|
|
466
|
+
// Score traits. Some dataset traits are intentionally marked unscorable:
|
|
467
|
+
// they document desired behavior but must not inflate automatic scores.
|
|
468
|
+
const traitDetail = response
|
|
469
|
+
? scoreTraitsDetailed(response, entry.expectedTraits)
|
|
470
|
+
: scoreTraitsDetailed('', entry.expectedTraits);
|
|
471
|
+
const traitScore = traitDetail.score;
|
|
472
|
+
const matchedTraits = traitDetail.matched;
|
|
415
473
|
|
|
416
474
|
// Optional LLM judge
|
|
417
475
|
let judgeScore = null;
|
|
@@ -430,6 +488,9 @@ async function runBenchmark(brain, { suite, providers, judgeFn, timeoutMs = DEFA
|
|
|
430
488
|
const compositeScore = judgeScore != null
|
|
431
489
|
? traitScore * 0.6 + judgeScore * 0.4
|
|
432
490
|
: traitScore;
|
|
491
|
+
const scoringMethod = judgeScore != null
|
|
492
|
+
? 'trait+judge'
|
|
493
|
+
: traitDetail.scoredCount > 0 ? 'traits' : 'unscored-traits';
|
|
433
494
|
|
|
434
495
|
providerScores[providerKey].total += compositeScore;
|
|
435
496
|
providerScores[providerKey].count++;
|
|
@@ -443,7 +504,7 @@ async function runBenchmark(brain, { suite, providers, judgeFn, timeoutMs = DEFA
|
|
|
443
504
|
...(judgeScore != null ? { codeQuality: judgeScore } : {}),
|
|
444
505
|
};
|
|
445
506
|
|
|
446
|
-
const resultEntry = {
|
|
507
|
+
const resultEntry = decorateBenchmarkResult({
|
|
447
508
|
runId,
|
|
448
509
|
suite,
|
|
449
510
|
promptId: entry.id,
|
|
@@ -464,9 +525,30 @@ async function runBenchmark(brain, { suite, providers, judgeFn, timeoutMs = DEFA
|
|
|
464
525
|
outputTokens: usage?.output ?? usage?.completion_tokens ?? null,
|
|
465
526
|
genTokPerSec: usage?.genTokPerSec ?? null,
|
|
466
527
|
dimensionsJson: JSON.stringify(dimensions),
|
|
528
|
+
modelMetadataJson: JSON.stringify({
|
|
529
|
+
matchedTraits,
|
|
530
|
+
missedTraits: traitDetail.missed,
|
|
531
|
+
unscoredTraits: traitDetail.unscored,
|
|
532
|
+
unknownTraits: traitDetail.unknown,
|
|
533
|
+
scoredTraitCount: traitDetail.scoredCount,
|
|
534
|
+
}),
|
|
467
535
|
error,
|
|
536
|
+
scorerVersion: DEFAULT_SCORER_VERSION,
|
|
537
|
+
scoringMethod,
|
|
538
|
+
trusted: !error && judgeScore != null,
|
|
539
|
+
runConfig: { timeoutMs },
|
|
468
540
|
timestamp: new Date().toISOString(),
|
|
469
|
-
}
|
|
541
|
+
}, {
|
|
542
|
+
suite,
|
|
543
|
+
benchmark: entry,
|
|
544
|
+
runId,
|
|
545
|
+
provider: provider.type,
|
|
546
|
+
model: provider.model,
|
|
547
|
+
scoringMethod,
|
|
548
|
+
scorerVersion: DEFAULT_SCORER_VERSION,
|
|
549
|
+
trusted: !error && judgeScore != null,
|
|
550
|
+
runConfig: { timeoutMs },
|
|
551
|
+
});
|
|
470
552
|
|
|
471
553
|
results.push(resultEntry);
|
|
472
554
|
|
|
@@ -571,8 +653,10 @@ module.exports = {
|
|
|
571
653
|
loadAllBenchmarks,
|
|
572
654
|
scoreTrait,
|
|
573
655
|
scoreTraits,
|
|
656
|
+
scoreTraitsDetailed,
|
|
574
657
|
runBenchmark,
|
|
575
658
|
getBenchmarkLeaderboard,
|
|
576
659
|
TRAIT_MATCHERS,
|
|
660
|
+
UNSCORABLE_TRAITS,
|
|
577
661
|
BENCHMARKS_DIR,
|
|
578
662
|
};
|