create-walle 0.9.11 → 0.9.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167)
  1. package/README.md +3 -3
  2. package/package.json +2 -2
  3. package/template/bin/dev.sh +7 -1
  4. package/template/bin/setup.js +53 -9
  5. package/template/bin/sync-images.js +53 -0
  6. package/template/builder-journal.md +17 -0
  7. package/template/claude-task-manager/api-prompts.js +98 -13
  8. package/template/claude-task-manager/api-reviews.js +82 -5
  9. package/template/claude-task-manager/db.js +32 -5
  10. package/template/claude-task-manager/docs/session-capture-foundation-design.md +1273 -0
  11. package/template/claude-task-manager/lib/claude-desktop-sessions.js +696 -0
  12. package/template/claude-task-manager/lib/coding-agent-models.js +49 -1
  13. package/template/claude-task-manager/lib/session-capture.js +421 -0
  14. package/template/claude-task-manager/lib/session-history.js +135 -15
  15. package/template/claude-task-manager/lib/session-jobs.js +10 -5
  16. package/template/claude-task-manager/lib/session-stream.js +87 -19
  17. package/template/claude-task-manager/lib/setup-provider-config.js +115 -0
  18. package/template/claude-task-manager/lib/walle-ctm-history.js +72 -0
  19. package/template/claude-task-manager/lib/walle-session-context.js +61 -0
  20. package/template/claude-task-manager/lib/walle-transcript.js +176 -0
  21. package/template/claude-task-manager/public/css/setup.css +35 -8
  22. package/template/claude-task-manager/public/css/walle-session.css +56 -0
  23. package/template/claude-task-manager/public/css/walle.css +120 -0
  24. package/template/claude-task-manager/public/index.html +814 -181
  25. package/template/claude-task-manager/public/js/message-renderer.js +148 -19
  26. package/template/claude-task-manager/public/js/reviews.js +120 -62
  27. package/template/claude-task-manager/public/js/setup.js +75 -31
  28. package/template/claude-task-manager/public/js/stream-view.js +115 -55
  29. package/template/claude-task-manager/public/js/walle-session.js +84 -2
  30. package/template/claude-task-manager/public/js/walle.js +308 -54
  31. package/template/claude-task-manager/server.js +1092 -146
  32. package/template/claude-task-manager/session-integrity.js +181 -54
  33. package/template/claude-task-manager/session-utils.js +123 -41
  34. package/template/claude-task-manager/workers/state-detectors/codex.js +5 -2
  35. package/template/package.json +1 -1
  36. package/template/wall-e/adapters/ctm.js +39 -18
  37. package/template/wall-e/agent-runners/contract.js +17 -0
  38. package/template/wall-e/agent-runners/index.js +22 -0
  39. package/template/wall-e/agent-runtime/harness.js +212 -0
  40. package/template/wall-e/agent-runtime/index.js +8 -0
  41. package/template/wall-e/agent-runtime/registry.js +67 -0
  42. package/template/wall-e/agent-runtime/session-store.js +179 -0
  43. package/template/wall-e/agent-runtime/spawn.js +208 -0
  44. package/template/wall-e/api-walle.js +174 -7
  45. package/template/wall-e/brain.js +266 -28
  46. package/template/wall-e/channels/policy.js +88 -0
  47. package/template/wall-e/channels/registry.js +15 -1
  48. package/template/wall-e/channels/reply-dispatcher.js +70 -0
  49. package/template/wall-e/channels/session-bindings.js +51 -0
  50. package/template/wall-e/chat/code-review-context.js +29 -0
  51. package/template/wall-e/chat.js +188 -42
  52. package/template/wall-e/coding/acp-adapter.js +188 -0
  53. package/template/wall-e/coding/agent-catalog.js +129 -0
  54. package/template/wall-e/coding/compaction-service.js +247 -0
  55. package/template/wall-e/coding/execution-trace.js +3 -0
  56. package/template/wall-e/coding/instruction-service.js +224 -0
  57. package/template/wall-e/coding/model-message.js +67 -0
  58. package/template/wall-e/coding/permission-rules-store.js +111 -0
  59. package/template/wall-e/coding/permission-service.js +266 -0
  60. package/template/wall-e/coding/prompt-bundle.js +67 -0
  61. package/template/wall-e/coding/prompt-runtime.js +243 -0
  62. package/template/wall-e/coding/provider-transform.js +188 -0
  63. package/template/wall-e/coding/runtime-mode.js +132 -0
  64. package/template/wall-e/coding/snapshot-service.js +155 -0
  65. package/template/wall-e/coding/stream-processor.js +268 -0
  66. package/template/wall-e/coding/task-tool.js +255 -0
  67. package/template/wall-e/coding/tool-registry.js +361 -0
  68. package/template/wall-e/coding/transcript-writer.js +143 -0
  69. package/template/wall-e/coding/workspace-replay.js +324 -0
  70. package/template/wall-e/coding-context.js +4 -22
  71. package/template/wall-e/coding-orchestrator.js +307 -18
  72. package/template/wall-e/coding-prompts.js +44 -3
  73. package/template/wall-e/context/context-builder.js +43 -1
  74. package/template/wall-e/context/topic-matcher.js +1 -1
  75. package/template/wall-e/eval/agent-runner.js +59 -13
  76. package/template/wall-e/eval/benchmarks/memory-retrieval.json +155 -57
  77. package/template/wall-e/eval/benchmarks.js +100 -16
  78. package/template/wall-e/eval/eval-orchestrator.js +218 -8
  79. package/template/wall-e/eval/harvester.js +62 -5
  80. package/template/wall-e/eval/head-to-head.js +23 -2
  81. package/template/wall-e/eval/humaneval-adapter.js +30 -5
  82. package/template/wall-e/eval/livecodebench-adapter.js +29 -5
  83. package/template/wall-e/eval/manifest.js +186 -0
  84. package/template/wall-e/eval/run-agent-benchmarks.js +66 -2
  85. package/template/wall-e/eval/session-retrieval-benchmark.js +150 -0
  86. package/template/wall-e/eval/session-transcripts.js +57 -4
  87. package/template/wall-e/eval/swebench-adapter.js +109 -3
  88. package/template/wall-e/evaluation/agent-router.js +53 -1
  89. package/template/wall-e/evaluation/coding-quorum.js +48 -1
  90. package/template/wall-e/evaluation/router.js +4 -2
  91. package/template/wall-e/evaluation/tier-selector.js +11 -1
  92. package/template/wall-e/extraction/contradiction.js +2 -2
  93. package/template/wall-e/extraction/indexer.js +2 -1
  94. package/template/wall-e/extraction/knowledge-extractor.js +2 -2
  95. package/template/wall-e/hooks/cli.js +92 -0
  96. package/template/wall-e/hooks/discovery.js +119 -0
  97. package/template/wall-e/hooks/index.js +7 -0
  98. package/template/wall-e/hooks/manifest.js +55 -0
  99. package/template/wall-e/hooks/runtime.js +84 -0
  100. package/template/wall-e/hooks/session-memory.js +225 -0
  101. package/template/wall-e/http/auth.js +6 -2
  102. package/template/wall-e/http/chat-api.js +54 -8
  103. package/template/wall-e/integrations/claude-plugin/hooks/hooks.json +27 -0
  104. package/template/wall-e/integrations/claude-plugin/hooks/walle-precompact-hook.sh +5 -0
  105. package/template/wall-e/integrations/claude-plugin/hooks/walle-stop-hook.sh +5 -0
  106. package/template/wall-e/integrations/codex-plugin/hooks/walle-hook.sh +7 -0
  107. package/template/wall-e/integrations/codex-plugin/hooks.json +37 -0
  108. package/template/wall-e/listening/calendar.js +3 -1
  109. package/template/wall-e/llm/client.js +64 -10
  110. package/template/wall-e/llm/google.js +39 -5
  111. package/template/wall-e/llm/ollama.js +1 -1
  112. package/template/wall-e/llm/ollama.plugin.json +1 -1
  113. package/template/wall-e/llm/provider-availability.js +10 -0
  114. package/template/wall-e/llm/provider-error.js +269 -0
  115. package/template/wall-e/llm/tool-adapter.js +48 -12
  116. package/template/wall-e/loops/boot.js +2 -1
  117. package/template/wall-e/loops/initiative.js +2 -2
  118. package/template/wall-e/loops/tasks.js +8 -47
  119. package/template/wall-e/loops/workspace-prompts.js +20 -0
  120. package/template/wall-e/mcp-server.js +442 -1
  121. package/template/wall-e/memory/session-ingest-service.js +159 -0
  122. package/template/wall-e/memory/source-indexer.js +289 -0
  123. package/template/wall-e/plugins/discovery.js +83 -0
  124. package/template/wall-e/plugins/manifest-loader.js +50 -10
  125. package/template/wall-e/plugins/manifest-schema.js +69 -0
  126. package/template/wall-e/plugins/model-catalog.js +55 -0
  127. package/template/wall-e/prompts/coding/base.txt +2 -0
  128. package/template/wall-e/prompts/coding/deepseek.txt +1 -0
  129. package/template/wall-e/prompts/coding/memory-protocol.md +9 -0
  130. package/template/wall-e/prompts/coding/plan.txt +1 -0
  131. package/template/wall-e/runtime/execution-trace.js +220 -0
  132. package/template/wall-e/security/audit.js +266 -0
  133. package/template/wall-e/security/ssrf.js +236 -0
  134. package/template/wall-e/session-files.js +303 -0
  135. package/template/wall-e/skills/_bundled/slack-backfill/SKILL.md +3 -0
  136. package/template/wall-e/skills/_bundled/slack-sync/SKILL.md +3 -0
  137. package/template/wall-e/skills/internal-skill-registry.js +2 -2
  138. package/template/wall-e/skills/script-skill-runner.js +143 -0
  139. package/template/wall-e/skills/skill-executor.js +5 -6
  140. package/template/wall-e/skills/skill-fallback.js +3 -1
  141. package/template/wall-e/skills/skill-harness-registry.js +7 -8
  142. package/template/wall-e/skills/skill-planner.js +52 -4
  143. package/template/wall-e/skills/slack-ingest.js +11 -3
  144. package/template/wall-e/sources/base.js +90 -0
  145. package/template/wall-e/sources/builtin.js +33 -0
  146. package/template/wall-e/sources/claude-code-jsonl.js +78 -0
  147. package/template/wall-e/sources/codex-jsonl.js +125 -0
  148. package/template/wall-e/sources/coding-session-utils.js +117 -0
  149. package/template/wall-e/sources/contract-suite.js +59 -0
  150. package/template/wall-e/sources/gemini-jsonl.js +85 -0
  151. package/template/wall-e/sources/index.js +9 -0
  152. package/template/wall-e/sources/jsonl-utils.js +181 -0
  153. package/template/wall-e/sources/record-types.js +252 -0
  154. package/template/wall-e/sources/registry.js +92 -0
  155. package/template/wall-e/sources/transforms.js +100 -0
  156. package/template/wall-e/sources/walle-jsonl.js +108 -0
  157. package/template/wall-e/tools/coding-middleware.js +31 -1
  158. package/template/wall-e/tools/file-tracker.js +25 -1
  159. package/template/wall-e/tools/local-tools.js +75 -47
  160. package/template/wall-e/tools/session-sharing.js +68 -1
  161. package/template/wall-e/tools/shell-analyzer.js +1 -1
  162. package/template/wall-e/tools/shell-policy.js +47 -0
  163. package/template/wall-e/tools/snapshot.js +42 -0
  164. package/template/wall-e/training/harvester.js +62 -5
  165. package/template/wall-e/utils/repair.js +253 -1
  166. package/template/website/index.html +3 -3
  167. package/template/wall-e/skills/_bundled/slack-mentions/.watched-threads.json +0 -18
@@ -6,6 +6,7 @@ const crypto = require('crypto');
6
6
  const { execFileSync, execFile } = require('child_process');
7
7
  const { promisify } = require('util');
8
8
  const execFileAsync = promisify(execFile);
9
+ const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
9
10
 
10
11
  const DEFAULT_TIMEOUT_MS = 600_000; // 10 minutes — coding agents can take long
11
12
  const FIXTURES_DIR = path.join(__dirname, 'fixtures');
@@ -106,11 +107,18 @@ async function runAgentBenchmark(benchmark, options = {}) {
106
107
  provider,
107
108
  model,
108
109
  mode: 'build',
110
+ benchmark: true,
111
+ headless: true,
112
+ headlessPolicy: 'allow',
113
+ permissionTimeoutMs: 0,
114
+ });
115
+ let timeoutHandle;
116
+ const timeoutPromise = new Promise((_, reject) => {
117
+ timeoutHandle = setTimeout(() => reject(new Error('Hard timeout exceeded')), effectiveTimeout + 60000); // +1min grace
118
+ if (typeof timeoutHandle.unref === 'function') timeoutHandle.unref();
109
119
  });
110
- const timeoutPromise = new Promise((_, reject) =>
111
- setTimeout(() => reject(new Error('Hard timeout exceeded')), effectiveTimeout + 60000) // +1min grace
112
- );
113
120
  const result = await Promise.race([agentPromise, timeoutPromise]);
121
+ if (timeoutHandle) clearTimeout(timeoutHandle);
114
122
 
115
123
  const latencyMs = Date.now() - startTime;
116
124
 
@@ -145,6 +153,21 @@ async function runAgentBenchmark(benchmark, options = {}) {
145
153
  if (totalTests === null) totalTests = afterCounts.total;
146
154
  }
147
155
 
156
+ const inputTokens = usage.inputTokens ?? usage.input ?? 0;
157
+ const expectedFileChanges = expectations.expectedFileChanges || [];
158
+ const missingExpectedWork = expectedFileChanges.length > 0 && actualFileChanges.length === 0;
159
+ const testRegression = (expectations.testCommand && testsPassed === false);
160
+ const rawError = result.stderr || result.error || null;
161
+ const validatedByTests = Boolean(
162
+ expectations.testCommand &&
163
+ testsPassed === true &&
164
+ actualFileChanges.length > 0
165
+ );
166
+ const fatalError = rawError && !validatedByTests ? rawError : null;
167
+ const noEffort = (actualToolCalls.length === 0) || (inputTokens === 0) || missingExpectedWork;
168
+ const hadError = !!fatalError;
169
+ const validatedSuccess = Boolean(result.success || validatedByTests) && !hadError && !noEffort && !testRegression;
170
+
148
171
  // Score the result
149
172
  let score = scoreAgentResult(benchmark, {
150
173
  actualToolCalls,
@@ -152,7 +175,7 @@ async function runAgentBenchmark(benchmark, options = {}) {
152
175
  actualTurns,
153
176
  testsPassed,
154
177
  output: result.output || '',
155
- success: result.success,
178
+ success: validatedSuccess,
156
179
  sandboxDir,
157
180
  costDollars,
158
181
  testsBefore,
@@ -167,21 +190,23 @@ async function runAgentBenchmark(benchmark, options = {}) {
167
190
  // through process-metric weights (turnEconomy, errorHandling, costEfficiency).
168
191
  // That inflated past failure-investigation thresholds and reported FAIL as
169
192
  // PASS. Cap explicitly here.
170
- const inputTokens = usage.inputTokens ?? usage.input ?? 0;
171
- const hadError = !!(result.stderr || result.error);
172
- const noEffort = (actualToolCalls.length === 0) || (inputTokens === 0);
173
- const testRegression = (expectations.testCommand && testsPassed === false);
174
193
  if (hadError || noEffort || testRegression) {
175
194
  score = {
176
195
  composite: 0,
177
196
  dimensions: { ...(score.dimensions || {}), _zeroed: true,
178
- _zeroReason: hadError ? 'error' : noEffort ? 'no_effort' : 'tests_failed' },
197
+ _zeroReason: hadError
198
+ ? 'error'
199
+ : testRegression
200
+ ? 'tests_failed'
201
+ : missingExpectedWork
202
+ ? 'no_file_changes'
203
+ : 'no_effort' },
179
204
  };
180
205
  }
181
206
 
182
207
  return {
183
208
  benchmarkId: benchmark.id,
184
- success: result.success,
209
+ success: validatedSuccess,
185
210
  score,
186
211
  latencyMs,
187
212
  actualToolCalls,
@@ -196,7 +221,7 @@ async function runAgentBenchmark(benchmark, options = {}) {
196
221
  outputTokens: usage.outputTokens ?? usage.output ?? null,
197
222
  dimensionsJson: JSON.stringify(score.dimensions || {}),
198
223
  output: (result.output || '').slice(0, 2000),
199
- error: result.stderr || result.error || null,
224
+ error: fatalError,
200
225
  };
201
226
  } catch (err) {
202
227
  return {
@@ -304,6 +329,10 @@ async function runMultiTurnBenchmark(benchmark, options = {}) {
304
329
  provider,
305
330
  model,
306
331
  mode: 'build',
332
+ benchmark: true,
333
+ headless: true,
334
+ headlessPolicy: 'allow',
335
+ permissionTimeoutMs: 0,
307
336
  messages, // pass accumulated conversation
308
337
  });
309
338
 
@@ -449,7 +478,10 @@ async function runAgentBenchmarkSuite(options = {}) {
449
478
  // Store result
450
479
  if (brain && typeof brain.insertBenchmarkResult === 'function') {
451
480
  try {
452
- brain.insertBenchmarkResult({
481
+ const scoringMethod = benchmark.agentExpectations?.testCommand
482
+ ? 'agent-rubric+tests'
483
+ : 'agent-rubric';
484
+ brain.insertBenchmarkResult(decorateBenchmarkResult({
453
485
  runId,
454
486
  suite: 'coding-agent',
455
487
  promptId: benchmark.id,
@@ -473,7 +505,21 @@ async function runAgentBenchmarkSuite(options = {}) {
473
505
  dimensionsJson: result.dimensionsJson || null,
474
506
  inputTokens: result.inputTokens ?? null,
475
507
  outputTokens: result.outputTokens ?? null,
476
- });
508
+ scorerVersion: DEFAULT_SCORER_VERSION,
509
+ scoringMethod,
510
+ trusted: !result.error && result.testsPassed === true,
511
+ runConfig: { timeoutMs, scoringMethod },
512
+ }, {
513
+ suite: 'coding-agent',
514
+ benchmark,
515
+ runId,
516
+ provider: provider?.type || 'default',
517
+ model: resolveModelName(model),
518
+ scoringMethod,
519
+ scorerVersion: DEFAULT_SCORER_VERSION,
520
+ trusted: !result.error && result.testsPassed === true,
521
+ runConfig: { timeoutMs, scoringMethod },
522
+ }));
477
523
  } catch { /* non-fatal */ }
478
524
  }
479
525
  }
@@ -1,82 +1,180 @@
1
1
  [
2
2
  {
3
- "id": "memory-001",
4
- "prompt": "Context: The user's preferred programming language is Rust. They work at Acme Corp as a senior engineer. Their current project is called 'Phoenix' which is a real-time data pipeline.\n\nQuestion: What programming language does the user prefer and what is their current project?",
5
- "taskType": "memory-retrieval",
6
- "difficulty": "easy",
7
- "expectedTraits": ["references context", "mentions Rust", "mentions Phoenix", "accurate extraction"],
8
- "tags": ["recall", "simple"]
9
- },
10
- {
11
- "id": "memory-002",
12
- "prompt": "Context: Yesterday the user mentioned they have a meeting with the design team on Thursday at 2pm. They also need to submit the Q3 report by Friday. Their manager's name is Sarah Chen.\n\nQuestion: What are the user's upcoming deadlines and meetings?",
13
- "taskType": "memory-retrieval",
14
- "difficulty": "easy",
15
- "expectedTraits": ["references context", "mentions Thursday meeting", "mentions Friday report", "accurate extraction"],
16
- "tags": ["recall", "schedule"]
17
- },
18
- {
19
- "id": "memory-003",
20
- "prompt": "Context: The user's tech stack includes Next.js for frontend, FastAPI for backend, PostgreSQL for the database, and Redis for caching. They deploy on AWS using ECS. The frontend is hosted on Vercel.\n\nQuestion: If the user needs to add a new API endpoint, which framework and language should they use based on their stack?",
3
+ "id": "session-recall-001",
4
+ "prompt": "Use Wall-E memory to answer: in the prior Codex parser session, which file was changed and what test command proved the fix?",
21
5
  "taskType": "memory-retrieval",
22
6
  "difficulty": "medium",
23
- "expectedTraits": ["references context", "mentions FastAPI", "mentions Python", "not hallucinated"],
24
- "tags": ["inference", "tech-stack"]
7
+ "expectedTraits": ["searches session memory", "mentions parser.js", "mentions node --test", "cites session id"],
8
+ "tags": ["session-recall", "coding", "sanitized-real-shape"],
9
+ "retrieval": {
10
+ "query": "parser src/parser.js node",
11
+ "expectedSourceIds": ["codex:sanitized-parser"],
12
+ "expectedSnippets": ["src/parser.js", "node --test tests/parser.test.js"],
13
+ "seedMemories": [
14
+ {
15
+ "source": "codex-jsonl",
16
+ "source_id": "codex:sanitized-parser:exchange:1",
17
+ "memory_type": "coding_session_exchange",
18
+ "timestamp": "2026-04-28T09:15:00.000Z",
19
+ "cwd": "/repo/app",
20
+ "content": "Q: Fix the parser crash when quoted values include commas.\nA: Decision: keep the tokenizer stateful instead of adding a regex split. Files: src/parser.js, tests/parser.test.js. Command: node --test tests/parser.test.js passed.",
21
+ "metadata": {
22
+ "sourceId": "codex:sanitized-parser",
23
+ "cwd": "/repo/app",
24
+ "gitBranch": "fix/parser-quoted-values",
25
+ "filesEdited": ["src/parser.js", "tests/parser.test.js"],
26
+ "commands": ["node --test tests/parser.test.js"]
27
+ }
28
+ }
29
+ ]
30
+ }
25
31
  },
26
32
  {
27
- "id": "memory-004",
28
- "prompt": "Context: The user previously said they don't like using ORMs because they had bad experiences with ActiveRecord in a previous Rails project. They prefer writing raw SQL or using query builders like Knex.js.\n\nQuestion: The user is starting a new Node.js project and needs database access. What approach should you recommend and why?",
33
+ "id": "session-recall-002",
34
+ "prompt": "Use Wall-E memory to answer: why did the prior Claude session reject the batch write approach for the queue worker?",
29
35
  "taskType": "memory-retrieval",
30
36
  "difficulty": "medium",
31
- "expectedTraits": ["references context", "recommends Knex or raw SQL", "mentions ORM preference", "respects user preference"],
32
- "tags": ["preference", "recommendation"]
37
+ "expectedTraits": ["searches session memory", "mentions lock contention", "mentions queue-worker.js", "cites session id"],
38
+ "tags": ["decision", "failure", "sanitized-real-shape"],
39
+ "retrieval": {
40
+ "query": "queue worker lock contention",
41
+ "expectedSourceIds": ["claude:sanitized-queue"],
42
+ "expectedSnippets": ["lock contention", "queue-worker.js"],
43
+ "seedMemories": [
44
+ {
45
+ "source": "claude-code-jsonl",
46
+ "source_id": "claude:sanitized-queue:exchange:4",
47
+ "memory_type": "coding_session_exchange",
48
+ "timestamp": "2026-04-27T17:22:00.000Z",
49
+ "cwd": "/repo/service",
50
+ "content": "Q: Speed up queue persistence.\nA: Blocker: batch writes increased SQLite lock contention under concurrent workers. Decision: keep single-row retry with jitter. Files: workers/queue-worker.js, tests/queue-worker.test.js. Command: npm test -- queue-worker passed.",
51
+ "metadata": {
52
+ "sourceId": "claude:sanitized-queue",
53
+ "cwd": "/repo/service",
54
+ "gitBranch": "fix/queue-locking",
55
+ "filesEdited": ["workers/queue-worker.js", "tests/queue-worker.test.js"],
56
+ "commands": ["npm test -- queue-worker"]
57
+ }
58
+ }
59
+ ]
60
+ }
33
61
  },
34
62
  {
35
- "id": "memory-005",
36
- "prompt": "Context: The user's company has a strict no-cloud policy for sensitive data. All PII must stay on-premises. They use MinIO as an S3-compatible object store and Harbor for container registry.\n\nQuestion: The user wants to add log aggregation. What should they consider given their constraints?",
63
+ "id": "session-recall-003",
64
+ "prompt": "Use Wall-E memory to answer: which browser test covered the transcript blank-space regression?",
37
65
  "taskType": "memory-retrieval",
38
66
  "difficulty": "medium",
39
- "expectedTraits": ["references context", "mentions on-premises requirement", "suggests self-hosted options", "not hallucinated"],
40
- "tags": ["constraint", "infrastructure"]
67
+ "expectedTraits": ["searches session memory", "mentions codex-blank-space.spec.js", "mentions blank gap", "cites session id"],
68
+ "tags": ["ui-regression", "browser-test", "sanitized-real-shape"],
69
+ "retrieval": {
70
+ "query": "blank-gap codex-blank-space.spec.js",
71
+ "expectedSourceIds": ["walle:sanitized-terminal-ui"],
72
+ "expectedSnippets": ["codex-blank-space.spec.js", "completed-turn blank-gap compaction"],
73
+ "seedMemories": [
74
+ {
75
+ "source": "walle-jsonl",
76
+ "source_id": "walle:sanitized-terminal-ui:assistant:12",
77
+ "memory_type": "coding_session_assistant_message",
78
+ "timestamp": "2026-04-29T11:05:00.000Z",
79
+ "cwd": "/repo/tools",
80
+ "content": "Decision: fix completed-turn blank-gap compaction in the Codex terminal renderer. Files: claude-task-manager/public/session-stream.js, claude-task-manager/tests/codex-blank-space.spec.js. Command: npx playwright test claude-task-manager/tests/codex-blank-space.spec.js passed.",
81
+ "metadata": {
82
+ "sourceId": "walle:sanitized-terminal-ui",
83
+ "cwd": "/repo/tools",
84
+ "gitBranch": "fix/codex-terminal-blank-gap",
85
+ "filesEdited": ["claude-task-manager/public/session-stream.js", "claude-task-manager/tests/codex-blank-space.spec.js"],
86
+ "commands": ["npx playwright test claude-task-manager/tests/codex-blank-space.spec.js"]
87
+ }
88
+ }
89
+ ]
90
+ }
41
91
  },
42
92
  {
43
- "id": "memory-006",
44
- "prompt": "Context: In a previous conversation, the user described three microservices: AuthService (handles login, JWT tokens, runs on port 3001), OrderService (processes orders, talks to Stripe, port 3002), and NotificationService (sends emails via SendGrid, port 3003). AuthService is the most critical.\n\nQuestion: Which service handles payments and what port does it run on? Also, which service should have the highest uptime SLA?",
45
- "taskType": "memory-retrieval",
46
- "difficulty": "medium",
47
- "expectedTraits": ["references context", "mentions OrderService", "mentions port 3002", "mentions AuthService for SLA", "accurate extraction"],
48
- "tags": ["multi-fact", "architecture"]
49
- },
50
- {
51
- "id": "memory-007",
52
- "prompt": "Context: The user mentioned they tried three different approaches to solve a caching problem: 1) Redis with 5-minute TTL (too many cache misses), 2) In-memory LRU cache (worked but lost on restart), 3) Redis with write-through strategy (current solution, working well).\n\nQuestion: What caching approach is the user currently using and why did they reject the alternatives?",
53
- "taskType": "memory-retrieval",
54
- "difficulty": "medium",
55
- "expectedTraits": ["references context", "mentions write-through", "explains rejected approaches", "accurate extraction"],
56
- "tags": ["history", "decision"]
57
- },
58
- {
59
- "id": "memory-008",
60
- "prompt": "Context: The user has NOT mentioned anything about their deployment pipeline or CI/CD setup.\n\nQuestion: What CI/CD tool does the user use for their deployments?",
93
+ "id": "session-recall-004",
94
+ "prompt": "Use Wall-E memory to answer: what was the next step after the model routing quorum session?",
61
95
  "taskType": "memory-retrieval",
62
96
  "difficulty": "hard",
63
- "expectedTraits": ["acknowledges missing info", "does not hallucinate", "asks for clarification", "not hallucinated"],
64
- "tags": ["negative-recall", "honesty"]
97
+ "expectedTraits": ["searches diary", "mentions router inputs", "mentions evaluation", "cites diary/session id"],
98
+ "tags": ["diary", "handoff", "sanitized-real-shape"],
99
+ "retrieval": {
100
+ "query": "model routing quorum next step router inputs evaluation diary",
101
+ "expectedSourceIds": ["diary:walle:sanitized-quorum:stop"],
102
+ "expectedSnippets": ["feed quorum results into routing", "run trusted evaluation"],
103
+ "seedMemories": [
104
+ {
105
+ "source": "walle-diary",
106
+ "source_id": "diary:walle:sanitized-quorum:stop",
107
+ "memory_type": "agent_diary",
108
+ "timestamp": "2026-04-26T20:30:00.000Z",
109
+ "cwd": "/repo/tools",
110
+ "content": "Agent diary for walle session sanitized-quorum (stop)\nSummary: feed quorum results into routing instead of leaving them as a side report.\nChanged files: wall-e/evaluation/coding-quorum.js; wall-e/routing/model-router.js\nDecisions: use reviewer/quorum/security data as router inputs.\nNext steps: run trusted evaluation on coding-agent-real cases.",
111
+ "metadata": {
112
+ "sourceId": "diary:walle:sanitized-quorum:stop",
113
+ "sessionId": "sanitized-quorum",
114
+ "agent": "walle",
115
+ "event": "stop",
116
+ "cwd": "/repo/tools",
117
+ "changed_files": ["wall-e/evaluation/coding-quorum.js", "wall-e/routing/model-router.js"],
118
+ "next_steps": ["run trusted evaluation on coding-agent-real cases"]
119
+ }
120
+ }
121
+ ]
122
+ }
65
123
  },
66
124
  {
67
- "id": "memory-009",
68
- "prompt": "Context: The user's team follows these conventions: branch naming is `type/JIRA-123-description`, commit messages use conventional commits (feat:, fix:, chore:), PRs require 2 approvals, and they use squash merging. They also said never to force push to main.\n\nQuestion: I want to create a new feature branch for ticket PROJ-456 about adding user avatars. What should I name it and what rules should I follow?",
125
+ "id": "session-recall-005",
126
+ "prompt": "Use Wall-E memory to answer: which source adapter handled Gemini JSONL and what privacy class did it use?",
69
127
  "taskType": "memory-retrieval",
70
128
  "difficulty": "medium",
71
- "expectedTraits": ["references context", "correct branch name format", "mentions conventional commits", "mentions no force push to main"],
72
- "tags": ["conventions", "git"]
129
+ "expectedTraits": ["searches session memory", "mentions gemini-jsonl", "mentions pii_potential", "cites session id"],
130
+ "tags": ["source-adapter", "privacy", "sanitized-real-shape"],
131
+ "retrieval": {
132
+ "query": "Gemini JSONL source adapter privacy class pii_potential",
133
+ "expectedSourceIds": ["codex:sanitized-source-adapters"],
134
+ "expectedSnippets": ["gemini-jsonl", "pii_potential"],
135
+ "seedMemories": [
136
+ {
137
+ "source": "codex-jsonl",
138
+ "source_id": "codex:sanitized-source-adapters:assistant:8",
139
+ "memory_type": "coding_session_assistant_message",
140
+ "timestamp": "2026-04-29T14:02:00.000Z",
141
+ "cwd": "/repo/tools",
142
+ "content": "Decision: register claude-code-jsonl, codex-jsonl, gemini-jsonl, and walle-jsonl as first-party source adapters. The default privacy class for coding session adapters is pii_potential because local transcripts can include personal context.",
143
+ "metadata": {
144
+ "sourceId": "codex:sanitized-source-adapters",
145
+ "cwd": "/repo/tools",
146
+ "filesEdited": ["wall-e/sources/gemini-jsonl.js", "wall-e/sources/builtin.js"]
147
+ }
148
+ }
149
+ ]
150
+ }
73
151
  },
74
152
  {
75
- "id": "memory-010",
76
- "prompt": "Context: The user previously set these preferences: dark mode enabled, timezone is America/Los_Angeles, preferred language is English, notification frequency is daily digest, and they want verbose logging during development but minimal logging in production.\n\nQuestion: Summarize all of the user's configuration preferences.",
153
+ "id": "session-recall-006",
154
+ "prompt": "Use Wall-E memory to answer honestly: do we have a remembered decision about replacing SQLite with ChromaDB?",
77
155
  "taskType": "memory-retrieval",
78
- "difficulty": "easy",
79
- "expectedTraits": ["references context", "lists all five preferences", "accurate extraction", "not hallucinated"],
80
- "tags": ["recall", "comprehensive"]
156
+ "difficulty": "hard",
157
+ "expectedTraits": ["searches session memory", "says do not replace SQLite", "mentions sqlite-vec", "does not hallucinate approval"],
158
+ "tags": ["negative-recall", "architecture", "sanitized-real-shape"],
159
+ "retrieval": {
160
+ "query": "replace SQLite with ChromaDB sqlite-vec decision",
161
+ "expectedSourceIds": ["claude:sanitized-memory-architecture"],
162
+ "expectedSnippets": ["Keep SQLite plus sqlite-vec", "Do not adopt ChromaDB"],
163
+ "seedMemories": [
164
+ {
165
+ "source": "claude-code-jsonl",
166
+ "source_id": "claude:sanitized-memory-architecture:exchange:2",
167
+ "memory_type": "coding_session_exchange",
168
+ "timestamp": "2026-04-29T13:45:00.000Z",
169
+ "cwd": "/repo/tools",
170
+ "content": "Q: Should Wall-E adopt ChromaDB from the reference project?\nA: Decision: Keep SQLite plus sqlite-vec. Do not adopt ChromaDB; port the source-adapter semantics and retrieval tests instead.",
171
+ "metadata": {
172
+ "sourceId": "claude:sanitized-memory-architecture",
173
+ "cwd": "/repo/tools",
174
+ "gitBranch": "feat/session-memory-protocol"
175
+ }
176
+ }
177
+ ]
178
+ }
81
179
  }
82
180
  ]
@@ -4,6 +4,7 @@ const fs = require('fs');
4
4
  const path = require('path');
5
5
  const crypto = require('crypto');
6
6
  const { createClient } = require('../llm/client');
7
+ const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
7
8
 
8
9
  // ============================================================
9
10
  // Constants
@@ -161,6 +162,23 @@ const TRAIT_MATCHERS = {
161
162
  'mentions conventional commits': (r) => /conventional\s+commit|feat:|fix:|chore:/i.test(r),
162
163
  'mentions no force push to main': (r) => /force\s+push|--force|never.*push.*main|no.*force/i.test(r),
163
164
  'lists all five preferences': (r) => /dark\s+mode|timezone|America\/Los_Angeles|English|daily\s+digest|verbose/i.test(r),
165
+ 'searches session memory': (r) => /session|memory|remember|transcript|source|found/i.test(r),
166
+ 'mentions parser.js': (r) => /parser\.js/i.test(r),
167
+ 'mentions node --test': (r) => /node\s+--test/i.test(r),
168
+ 'cites session id': (r) => /(?:session|source)[\s_-]?id|codex:sanitized|claude:sanitized|walle:sanitized|sanitized-[\w-]+/i.test(r),
169
+ 'mentions lock contention': (r) => /lock\s+contention/i.test(r),
170
+ 'mentions queue-worker.js': (r) => /queue-worker\.js/i.test(r),
171
+ 'mentions codex-blank-space.spec.js': (r) => /codex-blank-space\.spec\.js/i.test(r),
172
+ 'mentions blank gap': (r) => /blank[-\s]?gap/i.test(r),
173
+ 'searches diary': (r) => /diary|agent diary|remember|memory|source/i.test(r),
174
+ 'mentions router inputs': (r) => /router\s+inputs|routing.*inputs/i.test(r),
175
+ 'mentions evaluation': (r) => /evaluation|eval|trusted\s+evaluation/i.test(r),
176
+ 'cites diary/session id': (r) => /diary|session[\s_-]?id|sanitized-quorum|source[\s_-]?id/i.test(r),
177
+ 'mentions gemini-jsonl': (r) => /gemini-jsonl/i.test(r),
178
+ 'mentions pii_potential': (r) => /pii_potential/i.test(r),
179
+ 'says do not replace SQLite': (r) => /do\s+not\s+(?:adopt|replace|use).*SQLite|keep\s+SQLite|SQLite.*not\s+replace/i.test(r),
180
+ 'mentions sqlite-vec': (r) => /sqlite-vec/i.test(r),
181
+ 'does not hallucinate approval': (r) => /do\s+not\s+(?:adopt|replace)|no\s+approval|not\s+approved|rejected|keep\s+SQLite/i.test(r),
164
182
 
165
183
  // --- Coding-agent traits ---
166
184
  'uses edit over write': (r) => /edit_file|apply_patch|multi_edit/i.test(r) && !/write_file/i.test(r),
@@ -214,6 +232,15 @@ const TRAIT_MATCHERS = {
214
232
  'asks clarifying questions': (r) => /ask_user|AskUserQuestion/i.test(r),
215
233
  };
216
234
 
235
+ const UNSCORABLE_TRAITS = new Set([
236
+ 'accurate',
237
+ 'correct solution',
238
+ 'references context',
239
+ 'accurate extraction',
240
+ 'not hallucinated',
241
+ 'does not hallucinate',
242
+ ]);
243
+
217
244
  // ============================================================
218
245
  // Suite loading
219
246
  // ============================================================
@@ -298,6 +325,12 @@ function loadBenchmarkSuite(suiteName) {
298
325
  if (!hasTraits && !hasReplyChecks && !hasToolChecks && !hasMockTools && !hasAgentExpectations && !isEdgeCase) {
299
326
  throw new Error(`Benchmark entry ${entry.id} has no scoring signal (expectedTraits / expectedInReply / expectedTools / mockToolResults / agentExpectations)`);
300
327
  }
328
+ if (hasTraits) {
329
+ const unknownTraits = entry.expectedTraits.filter(t => !TRAIT_MATCHERS[t] && !UNSCORABLE_TRAITS.has(t));
330
+ if (unknownTraits.length) {
331
+ throw new Error(`Benchmark entry ${entry.id} has unknown expectedTraits: ${unknownTraits.join(', ')}`);
332
+ }
333
+ }
301
334
  }
302
335
 
303
336
  return { name: suiteName, prompts };
@@ -315,16 +348,41 @@ function loadBenchmarkSuite(suiteName) {
315
348
  */
316
349
  function scoreTrait(response, trait) {
317
350
  if (!response || typeof response !== 'string') return false;
351
+ if (UNSCORABLE_TRAITS.has(trait)) return false;
318
352
  const matcher = TRAIT_MATCHERS[trait];
319
- if (!matcher) {
320
- // Unknown trait — fallback: search for the trait keywords in the response
321
- const keywords = trait.toLowerCase().split(/\s+/);
322
- const lower = response.toLowerCase();
323
- return keywords.some((kw) => kw.length > 3 && lower.includes(kw));
324
- }
353
+ if (!matcher) return false;
325
354
  return matcher(response);
326
355
  }
327
356
 
357
+ function scoreTraitsDetailed(response, expectedTraits) {
358
+ const detail = {
359
+ score: 0,
360
+ matched: [],
361
+ missed: [],
362
+ unscored: [],
363
+ unknown: [],
364
+ scoredCount: 0,
365
+ };
366
+ if (!expectedTraits || expectedTraits.length === 0) return detail;
367
+
368
+ for (const trait of expectedTraits) {
369
+ if (UNSCORABLE_TRAITS.has(trait)) {
370
+ detail.unscored.push(trait);
371
+ continue;
372
+ }
373
+ if (!TRAIT_MATCHERS[trait]) {
374
+ detail.unknown.push(trait);
375
+ continue;
376
+ }
377
+ detail.scoredCount++;
378
+ if (scoreTrait(response, trait)) detail.matched.push(trait);
379
+ else detail.missed.push(trait);
380
+ }
381
+
382
+ detail.score = detail.scoredCount > 0 ? detail.matched.length / detail.scoredCount : 0;
383
+ return detail;
384
+ }
385
+
328
386
  /**
329
387
  * Score a response against multiple expected traits.
330
388
  * @param {string} response - LLM response text
@@ -332,9 +390,7 @@ function scoreTrait(response, trait) {
332
390
  * @returns {number} 0.0 to 1.0 based on percentage of traits matched
333
391
  */
334
392
  function scoreTraits(response, expectedTraits) {
335
- if (!expectedTraits || expectedTraits.length === 0) return 0;
336
- const matched = expectedTraits.filter((t) => scoreTrait(response, t)).length;
337
- return matched / expectedTraits.length;
393
+ return scoreTraitsDetailed(response, expectedTraits).score;
338
394
  }
339
395
 
340
396
  // ============================================================
@@ -407,11 +463,13 @@ async function runBenchmark(brain, { suite, providers, judgeFn, timeoutMs = DEFA
407
463
  providerScores[providerKey].errors++;
408
464
  }
409
465
 
410
- // Score traits
411
- const traitScore = response ? scoreTraits(response, entry.expectedTraits) : 0;
412
- const matchedTraits = response
413
- ? entry.expectedTraits.filter((t) => scoreTrait(response, t))
414
- : [];
466
+ // Score traits. Some dataset traits are intentionally marked unscorable:
467
+ // they document desired behavior but must not inflate automatic scores.
468
+ const traitDetail = response
469
+ ? scoreTraitsDetailed(response, entry.expectedTraits)
470
+ : scoreTraitsDetailed('', entry.expectedTraits);
471
+ const traitScore = traitDetail.score;
472
+ const matchedTraits = traitDetail.matched;
415
473
 
416
474
  // Optional LLM judge
417
475
  let judgeScore = null;
@@ -430,6 +488,9 @@ async function runBenchmark(brain, { suite, providers, judgeFn, timeoutMs = DEFA
430
488
  const compositeScore = judgeScore != null
431
489
  ? traitScore * 0.6 + judgeScore * 0.4
432
490
  : traitScore;
491
+ const scoringMethod = judgeScore != null
492
+ ? 'trait+judge'
493
+ : traitDetail.scoredCount > 0 ? 'traits' : 'unscored-traits';
433
494
 
434
495
  providerScores[providerKey].total += compositeScore;
435
496
  providerScores[providerKey].count++;
@@ -443,7 +504,7 @@ async function runBenchmark(brain, { suite, providers, judgeFn, timeoutMs = DEFA
443
504
  ...(judgeScore != null ? { codeQuality: judgeScore } : {}),
444
505
  };
445
506
 
446
- const resultEntry = {
507
+ const resultEntry = decorateBenchmarkResult({
447
508
  runId,
448
509
  suite,
449
510
  promptId: entry.id,
@@ -464,9 +525,30 @@ async function runBenchmark(brain, { suite, providers, judgeFn, timeoutMs = DEFA
464
525
  outputTokens: usage?.output ?? usage?.completion_tokens ?? null,
465
526
  genTokPerSec: usage?.genTokPerSec ?? null,
466
527
  dimensionsJson: JSON.stringify(dimensions),
528
+ modelMetadataJson: JSON.stringify({
529
+ matchedTraits,
530
+ missedTraits: traitDetail.missed,
531
+ unscoredTraits: traitDetail.unscored,
532
+ unknownTraits: traitDetail.unknown,
533
+ scoredTraitCount: traitDetail.scoredCount,
534
+ }),
467
535
  error,
536
+ scorerVersion: DEFAULT_SCORER_VERSION,
537
+ scoringMethod,
538
+ trusted: !error && judgeScore != null,
539
+ runConfig: { timeoutMs },
468
540
  timestamp: new Date().toISOString(),
469
- };
541
+ }, {
542
+ suite,
543
+ benchmark: entry,
544
+ runId,
545
+ provider: provider.type,
546
+ model: provider.model,
547
+ scoringMethod,
548
+ scorerVersion: DEFAULT_SCORER_VERSION,
549
+ trusted: !error && judgeScore != null,
550
+ runConfig: { timeoutMs },
551
+ });
470
552
 
471
553
  results.push(resultEntry);
472
554
 
@@ -571,8 +653,10 @@ module.exports = {
571
653
  loadAllBenchmarks,
572
654
  scoreTrait,
573
655
  scoreTraits,
656
+ scoreTraitsDetailed,
574
657
  runBenchmark,
575
658
  getBenchmarkLeaderboard,
576
659
  TRAIT_MATCHERS,
660
+ UNSCORABLE_TRAITS,
577
661
  BENCHMARKS_DIR,
578
662
  };