npm - create-walle - Versions diffs - 0.9.11 → 0.9.13 - Mend

create-walle 0.9.11 → 0.9.13

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (167)

package/README.md +3 -3
package/package.json +2 -2
package/template/bin/dev.sh +7 -1
package/template/bin/setup.js +53 -9
package/template/bin/sync-images.js +53 -0
package/template/builder-journal.md +17 -0
package/template/claude-task-manager/api-prompts.js +98 -13
package/template/claude-task-manager/api-reviews.js +82 -5
package/template/claude-task-manager/db.js +32 -5
package/template/claude-task-manager/docs/session-capture-foundation-design.md +1273 -0
package/template/claude-task-manager/lib/claude-desktop-sessions.js +696 -0
package/template/claude-task-manager/lib/coding-agent-models.js +49 -1
package/template/claude-task-manager/lib/session-capture.js +421 -0
package/template/claude-task-manager/lib/session-history.js +135 -15
package/template/claude-task-manager/lib/session-jobs.js +10 -5
package/template/claude-task-manager/lib/session-stream.js +87 -19
package/template/claude-task-manager/lib/setup-provider-config.js +115 -0
package/template/claude-task-manager/lib/walle-ctm-history.js +72 -0
package/template/claude-task-manager/lib/walle-session-context.js +61 -0
package/template/claude-task-manager/lib/walle-transcript.js +176 -0
package/template/claude-task-manager/public/css/setup.css +35 -8
package/template/claude-task-manager/public/css/walle-session.css +56 -0
package/template/claude-task-manager/public/css/walle.css +120 -0
package/template/claude-task-manager/public/index.html +814 -181
package/template/claude-task-manager/public/js/message-renderer.js +148 -19
package/template/claude-task-manager/public/js/reviews.js +120 -62
package/template/claude-task-manager/public/js/setup.js +75 -31
package/template/claude-task-manager/public/js/stream-view.js +115 -55
package/template/claude-task-manager/public/js/walle-session.js +84 -2
package/template/claude-task-manager/public/js/walle.js +308 -54
package/template/claude-task-manager/server.js +1092 -146
package/template/claude-task-manager/session-integrity.js +181 -54
package/template/claude-task-manager/session-utils.js +123 -41
package/template/claude-task-manager/workers/state-detectors/codex.js +5 -2
package/template/package.json +1 -1
package/template/wall-e/adapters/ctm.js +39 -18
package/template/wall-e/agent-runners/contract.js +17 -0
package/template/wall-e/agent-runners/index.js +22 -0
package/template/wall-e/agent-runtime/harness.js +212 -0
package/template/wall-e/agent-runtime/index.js +8 -0
package/template/wall-e/agent-runtime/registry.js +67 -0
package/template/wall-e/agent-runtime/session-store.js +179 -0
package/template/wall-e/agent-runtime/spawn.js +208 -0
package/template/wall-e/api-walle.js +174 -7
package/template/wall-e/brain.js +266 -28
package/template/wall-e/channels/policy.js +88 -0
package/template/wall-e/channels/registry.js +15 -1
package/template/wall-e/channels/reply-dispatcher.js +70 -0
package/template/wall-e/channels/session-bindings.js +51 -0
package/template/wall-e/chat/code-review-context.js +29 -0
package/template/wall-e/chat.js +188 -42
package/template/wall-e/coding/acp-adapter.js +188 -0
package/template/wall-e/coding/agent-catalog.js +129 -0
package/template/wall-e/coding/compaction-service.js +247 -0
package/template/wall-e/coding/execution-trace.js +3 -0
package/template/wall-e/coding/instruction-service.js +224 -0
package/template/wall-e/coding/model-message.js +67 -0
package/template/wall-e/coding/permission-rules-store.js +111 -0
package/template/wall-e/coding/permission-service.js +266 -0
package/template/wall-e/coding/prompt-bundle.js +67 -0
package/template/wall-e/coding/prompt-runtime.js +243 -0
package/template/wall-e/coding/provider-transform.js +188 -0
package/template/wall-e/coding/runtime-mode.js +132 -0
package/template/wall-e/coding/snapshot-service.js +155 -0
package/template/wall-e/coding/stream-processor.js +268 -0
package/template/wall-e/coding/task-tool.js +255 -0
package/template/wall-e/coding/tool-registry.js +361 -0
package/template/wall-e/coding/transcript-writer.js +143 -0
package/template/wall-e/coding/workspace-replay.js +324 -0
package/template/wall-e/coding-context.js +4 -22
package/template/wall-e/coding-orchestrator.js +307 -18
package/template/wall-e/coding-prompts.js +44 -3
package/template/wall-e/context/context-builder.js +43 -1
package/template/wall-e/context/topic-matcher.js +1 -1
package/template/wall-e/eval/agent-runner.js +59 -13
package/template/wall-e/eval/benchmarks/memory-retrieval.json +155 -57
package/template/wall-e/eval/benchmarks.js +100 -16
package/template/wall-e/eval/eval-orchestrator.js +218 -8
package/template/wall-e/eval/harvester.js +62 -5
package/template/wall-e/eval/head-to-head.js +23 -2
package/template/wall-e/eval/humaneval-adapter.js +30 -5
package/template/wall-e/eval/livecodebench-adapter.js +29 -5
package/template/wall-e/eval/manifest.js +186 -0
package/template/wall-e/eval/run-agent-benchmarks.js +66 -2
package/template/wall-e/eval/session-retrieval-benchmark.js +150 -0
package/template/wall-e/eval/session-transcripts.js +57 -4
package/template/wall-e/eval/swebench-adapter.js +109 -3
package/template/wall-e/evaluation/agent-router.js +53 -1
package/template/wall-e/evaluation/coding-quorum.js +48 -1
package/template/wall-e/evaluation/router.js +4 -2
package/template/wall-e/evaluation/tier-selector.js +11 -1
package/template/wall-e/extraction/contradiction.js +2 -2
package/template/wall-e/extraction/indexer.js +2 -1
package/template/wall-e/extraction/knowledge-extractor.js +2 -2
package/template/wall-e/hooks/cli.js +92 -0
package/template/wall-e/hooks/discovery.js +119 -0
package/template/wall-e/hooks/index.js +7 -0
package/template/wall-e/hooks/manifest.js +55 -0
package/template/wall-e/hooks/runtime.js +84 -0
package/template/wall-e/hooks/session-memory.js +225 -0
package/template/wall-e/http/auth.js +6 -2
package/template/wall-e/http/chat-api.js +54 -8
package/template/wall-e/integrations/claude-plugin/hooks/hooks.json +27 -0
package/template/wall-e/integrations/claude-plugin/hooks/walle-precompact-hook.sh +5 -0
package/template/wall-e/integrations/claude-plugin/hooks/walle-stop-hook.sh +5 -0
package/template/wall-e/integrations/codex-plugin/hooks/walle-hook.sh +7 -0
package/template/wall-e/integrations/codex-plugin/hooks.json +37 -0
package/template/wall-e/listening/calendar.js +3 -1
package/template/wall-e/llm/client.js +64 -10
package/template/wall-e/llm/google.js +39 -5
package/template/wall-e/llm/ollama.js +1 -1
package/template/wall-e/llm/ollama.plugin.json +1 -1
package/template/wall-e/llm/provider-availability.js +10 -0
package/template/wall-e/llm/provider-error.js +269 -0
package/template/wall-e/llm/tool-adapter.js +48 -12
package/template/wall-e/loops/boot.js +2 -1
package/template/wall-e/loops/initiative.js +2 -2
package/template/wall-e/loops/tasks.js +8 -47
package/template/wall-e/loops/workspace-prompts.js +20 -0
package/template/wall-e/mcp-server.js +442 -1
package/template/wall-e/memory/session-ingest-service.js +159 -0
package/template/wall-e/memory/source-indexer.js +289 -0
package/template/wall-e/plugins/discovery.js +83 -0
package/template/wall-e/plugins/manifest-loader.js +50 -10
package/template/wall-e/plugins/manifest-schema.js +69 -0
package/template/wall-e/plugins/model-catalog.js +55 -0
package/template/wall-e/prompts/coding/base.txt +2 -0
package/template/wall-e/prompts/coding/deepseek.txt +1 -0
package/template/wall-e/prompts/coding/memory-protocol.md +9 -0
package/template/wall-e/prompts/coding/plan.txt +1 -0
package/template/wall-e/runtime/execution-trace.js +220 -0
package/template/wall-e/security/audit.js +266 -0
package/template/wall-e/security/ssrf.js +236 -0
package/template/wall-e/session-files.js +303 -0
package/template/wall-e/skills/_bundled/slack-backfill/SKILL.md +3 -0
package/template/wall-e/skills/_bundled/slack-sync/SKILL.md +3 -0
package/template/wall-e/skills/internal-skill-registry.js +2 -2
package/template/wall-e/skills/script-skill-runner.js +143 -0
package/template/wall-e/skills/skill-executor.js +5 -6
package/template/wall-e/skills/skill-fallback.js +3 -1
package/template/wall-e/skills/skill-harness-registry.js +7 -8
package/template/wall-e/skills/skill-planner.js +52 -4
package/template/wall-e/skills/slack-ingest.js +11 -3
package/template/wall-e/sources/base.js +90 -0
package/template/wall-e/sources/builtin.js +33 -0
package/template/wall-e/sources/claude-code-jsonl.js +78 -0
package/template/wall-e/sources/codex-jsonl.js +125 -0
package/template/wall-e/sources/coding-session-utils.js +117 -0
package/template/wall-e/sources/contract-suite.js +59 -0
package/template/wall-e/sources/gemini-jsonl.js +85 -0
package/template/wall-e/sources/index.js +9 -0
package/template/wall-e/sources/jsonl-utils.js +181 -0
package/template/wall-e/sources/record-types.js +252 -0
package/template/wall-e/sources/registry.js +92 -0
package/template/wall-e/sources/transforms.js +100 -0
package/template/wall-e/sources/walle-jsonl.js +108 -0
package/template/wall-e/tools/coding-middleware.js +31 -1
package/template/wall-e/tools/file-tracker.js +25 -1
package/template/wall-e/tools/local-tools.js +75 -47
package/template/wall-e/tools/session-sharing.js +68 -1
package/template/wall-e/tools/shell-analyzer.js +1 -1
package/template/wall-e/tools/shell-policy.js +47 -0
package/template/wall-e/tools/snapshot.js +42 -0
package/template/wall-e/training/harvester.js +62 -5
package/template/wall-e/utils/repair.js +253 -1
package/template/website/index.html +3 -3
package/template/wall-e/skills/_bundled/slack-mentions/.watched-threads.json +0 -18

package/template/wall-e/eval/agent-runner.js CHANGED Viewed

@@ -6,6 +6,7 @@ const crypto = require('crypto');
 const { execFileSync, execFile } = require('child_process');
 const { promisify } = require('util');
 const execFileAsync = promisify(execFile);
+const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
 const DEFAULT_TIMEOUT_MS = 600_000; // 10 minutes — coding agents can take long
 const FIXTURES_DIR = path.join(__dirname, 'fixtures');
@@ -106,11 +107,18 @@ async function runAgentBenchmark(benchmark, options = {}) {
       provider,
       model,
       mode: 'build',
+      benchmark: true,
+      headless: true,
+      headlessPolicy: 'allow',
+      permissionTimeoutMs: 0,
+    });
+    let timeoutHandle;
+    const timeoutPromise = new Promise((_, reject) => {
+      timeoutHandle = setTimeout(() => reject(new Error('Hard timeout exceeded')), effectiveTimeout + 60000); // +1min grace
+      if (typeof timeoutHandle.unref === 'function') timeoutHandle.unref();
     });
-    const timeoutPromise = new Promise((_, reject) =>
-      setTimeout(() => reject(new Error('Hard timeout exceeded')), effectiveTimeout + 60000) // +1min grace
-    );
     const result = await Promise.race([agentPromise, timeoutPromise]);
+    if (timeoutHandle) clearTimeout(timeoutHandle);
     const latencyMs = Date.now() - startTime;
@@ -145,6 +153,21 @@ async function runAgentBenchmark(benchmark, options = {}) {
       if (totalTests === null) totalTests = afterCounts.total;
     }
+    const inputTokens = usage.inputTokens ?? usage.input ?? 0;
+    const expectedFileChanges = expectations.expectedFileChanges || [];
+    const missingExpectedWork = expectedFileChanges.length > 0 && actualFileChanges.length === 0;
+    const testRegression = (expectations.testCommand && testsPassed === false);
+    const rawError = result.stderr || result.error || null;
+    const validatedByTests = Boolean(
+      expectations.testCommand &&
+      testsPassed === true &&
+      actualFileChanges.length > 0
+    );
+    const fatalError = rawError && !validatedByTests ? rawError : null;
+    const noEffort = (actualToolCalls.length === 0) || (inputTokens === 0) || missingExpectedWork;
+    const hadError = !!fatalError;
+    const validatedSuccess = Boolean(result.success || validatedByTests) && !hadError && !noEffort && !testRegression;
     // Score the result
     let score = scoreAgentResult(benchmark, {
       actualToolCalls,
@@ -152,7 +175,7 @@ async function runAgentBenchmark(benchmark, options = {}) {
       actualTurns,
       testsPassed,
       output: result.output || '',
-      success: result.success,
+      success: validatedSuccess,
       sandboxDir,
       costDollars,
       testsBefore,
@@ -167,21 +190,23 @@ async function runAgentBenchmark(benchmark, options = {}) {
     // through process-metric weights (turnEconomy, errorHandling, costEfficiency).
     // That inflated past failure-investigation thresholds and reported FAIL as
     // PASS. Cap explicitly here.
-    const inputTokens = usage.inputTokens ?? usage.input ?? 0;
-    const hadError = !!(result.stderr || result.error);
-    const noEffort = (actualToolCalls.length === 0) || (inputTokens === 0);
-    const testRegression = (expectations.testCommand && testsPassed === false);
     if (hadError || noEffort || testRegression) {
       score = {
         composite: 0,
         dimensions: { ...(score.dimensions || {}), _zeroed: true,
-          _zeroReason: hadError ? 'error' : noEffort ? 'no_effort' : 'tests_failed' },
+          _zeroReason: hadError
+            ? 'error'
+            : testRegression
+              ? 'tests_failed'
+              : missingExpectedWork
+                ? 'no_file_changes'
+                : 'no_effort' },
       };
     }
     return {
       benchmarkId: benchmark.id,
-      success: result.success,
+      success: validatedSuccess,
       score,
       latencyMs,
       actualToolCalls,
@@ -196,7 +221,7 @@ async function runAgentBenchmark(benchmark, options = {}) {
       outputTokens: usage.outputTokens ?? usage.output ?? null,
       dimensionsJson: JSON.stringify(score.dimensions || {}),
       output: (result.output || '').slice(0, 2000),
-      error: result.stderr || result.error || null,
+      error: fatalError,
     };
   } catch (err) {
     return {
@@ -304,6 +329,10 @@ async function runMultiTurnBenchmark(benchmark, options = {}) {
         provider,
         model,
         mode: 'build',
+        benchmark: true,
+        headless: true,
+        headlessPolicy: 'allow',
+        permissionTimeoutMs: 0,
         messages, // pass accumulated conversation
       });
@@ -449,7 +478,10 @@ async function runAgentBenchmarkSuite(options = {}) {
     // Store result
     if (brain && typeof brain.insertBenchmarkResult === 'function') {
       try {
-        brain.insertBenchmarkResult({
+        const scoringMethod = benchmark.agentExpectations?.testCommand
+          ? 'agent-rubric+tests'
+          : 'agent-rubric';
+        brain.insertBenchmarkResult(decorateBenchmarkResult({
           runId,
           suite: 'coding-agent',
           promptId: benchmark.id,
@@ -473,7 +505,21 @@ async function runAgentBenchmarkSuite(options = {}) {
           dimensionsJson: result.dimensionsJson || null,
           inputTokens: result.inputTokens ?? null,
           outputTokens: result.outputTokens ?? null,
-        });
+          scorerVersion: DEFAULT_SCORER_VERSION,
+          scoringMethod,
+          trusted: !result.error && result.testsPassed === true,
+          runConfig: { timeoutMs, scoringMethod },
+        }, {
+          suite: 'coding-agent',
+          benchmark,
+          runId,
+          provider: provider?.type || 'default',
+          model: resolveModelName(model),
+          scoringMethod,
+          scorerVersion: DEFAULT_SCORER_VERSION,
+          trusted: !result.error && result.testsPassed === true,
+          runConfig: { timeoutMs, scoringMethod },
+        }));
       } catch { /* non-fatal */ }
     }
   }

package/template/wall-e/eval/benchmarks/memory-retrieval.json CHANGED Viewed

@@ -1,82 +1,180 @@
 [
   {
-    "id": "memory-001",
-    "prompt": "Context: The user's preferred programming language is Rust. They work at Acme Corp as a senior engineer. Their current project is called 'Phoenix' which is a real-time data pipeline.\n\nQuestion: What programming language does the user prefer and what is their current project?",
-    "taskType": "memory-retrieval",
-    "difficulty": "easy",
-    "expectedTraits": ["references context", "mentions Rust", "mentions Phoenix", "accurate extraction"],
-    "tags": ["recall", "simple"]
-  },
-  {
-    "id": "memory-002",
-    "prompt": "Context: Yesterday the user mentioned they have a meeting with the design team on Thursday at 2pm. They also need to submit the Q3 report by Friday. Their manager's name is Sarah Chen.\n\nQuestion: What are the user's upcoming deadlines and meetings?",
-    "taskType": "memory-retrieval",
-    "difficulty": "easy",
-    "expectedTraits": ["references context", "mentions Thursday meeting", "mentions Friday report", "accurate extraction"],
-    "tags": ["recall", "schedule"]
-  },
-  {
-    "id": "memory-003",
-    "prompt": "Context: The user's tech stack includes Next.js for frontend, FastAPI for backend, PostgreSQL for the database, and Redis for caching. They deploy on AWS using ECS. The frontend is hosted on Vercel.\n\nQuestion: If the user needs to add a new API endpoint, which framework and language should they use based on their stack?",
+    "id": "session-recall-001",
+    "prompt": "Use Wall-E memory to answer: in the prior Codex parser session, which file was changed and what test command proved the fix?",
     "taskType": "memory-retrieval",
     "difficulty": "medium",
-    "expectedTraits": ["references context", "mentions FastAPI", "mentions Python", "not hallucinated"],
-    "tags": ["inference", "tech-stack"]
+    "expectedTraits": ["searches session memory", "mentions parser.js", "mentions node --test", "cites session id"],
+    "tags": ["session-recall", "coding", "sanitized-real-shape"],
+    "retrieval": {
+      "query": "parser src/parser.js node",
+      "expectedSourceIds": ["codex:sanitized-parser"],
+      "expectedSnippets": ["src/parser.js", "node --test tests/parser.test.js"],
+      "seedMemories": [
+        {
+          "source": "codex-jsonl",
+          "source_id": "codex:sanitized-parser:exchange:1",
+          "memory_type": "coding_session_exchange",
+          "timestamp": "2026-04-28T09:15:00.000Z",
+          "cwd": "/repo/app",
+          "content": "Q: Fix the parser crash when quoted values include commas.\nA: Decision: keep the tokenizer stateful instead of adding a regex split. Files: src/parser.js, tests/parser.test.js. Command: node --test tests/parser.test.js passed.",
+          "metadata": {
+            "sourceId": "codex:sanitized-parser",
+            "cwd": "/repo/app",
+            "gitBranch": "fix/parser-quoted-values",
+            "filesEdited": ["src/parser.js", "tests/parser.test.js"],
+            "commands": ["node --test tests/parser.test.js"]
+          }
+        }
+      ]
+    }
   },
   {
-    "id": "memory-004",
-    "prompt": "Context: The user previously said they don't like using ORMs because they had bad experiences with ActiveRecord in a previous Rails project. They prefer writing raw SQL or using query builders like Knex.js.\n\nQuestion: The user is starting a new Node.js project and needs database access. What approach should you recommend and why?",
+    "id": "session-recall-002",
+    "prompt": "Use Wall-E memory to answer: why did the prior Claude session reject the batch write approach for the queue worker?",
     "taskType": "memory-retrieval",
     "difficulty": "medium",
-    "expectedTraits": ["references context", "recommends Knex or raw SQL", "mentions ORM preference", "respects user preference"],
-    "tags": ["preference", "recommendation"]
+    "expectedTraits": ["searches session memory", "mentions lock contention", "mentions queue-worker.js", "cites session id"],
+    "tags": ["decision", "failure", "sanitized-real-shape"],
+    "retrieval": {
+      "query": "queue worker lock contention",
+      "expectedSourceIds": ["claude:sanitized-queue"],
+      "expectedSnippets": ["lock contention", "queue-worker.js"],
+      "seedMemories": [
+        {
+          "source": "claude-code-jsonl",
+          "source_id": "claude:sanitized-queue:exchange:4",
+          "memory_type": "coding_session_exchange",
+          "timestamp": "2026-04-27T17:22:00.000Z",
+          "cwd": "/repo/service",
+          "content": "Q: Speed up queue persistence.\nA: Blocker: batch writes increased SQLite lock contention under concurrent workers. Decision: keep single-row retry with jitter. Files: workers/queue-worker.js, tests/queue-worker.test.js. Command: npm test -- queue-worker passed.",
+          "metadata": {
+            "sourceId": "claude:sanitized-queue",
+            "cwd": "/repo/service",
+            "gitBranch": "fix/queue-locking",
+            "filesEdited": ["workers/queue-worker.js", "tests/queue-worker.test.js"],
+            "commands": ["npm test -- queue-worker"]
+          }
+        }
+      ]
+    }
   },
   {
-    "id": "memory-005",
-    "prompt": "Context: The user's company has a strict no-cloud policy for sensitive data. All PII must stay on-premises. They use MinIO as an S3-compatible object store and Harbor for container registry.\n\nQuestion: The user wants to add log aggregation. What should they consider given their constraints?",
+    "id": "session-recall-003",
+    "prompt": "Use Wall-E memory to answer: which browser test covered the transcript blank-space regression?",
     "taskType": "memory-retrieval",
     "difficulty": "medium",
-    "expectedTraits": ["references context", "mentions on-premises requirement", "suggests self-hosted options", "not hallucinated"],
-    "tags": ["constraint", "infrastructure"]
+    "expectedTraits": ["searches session memory", "mentions codex-blank-space.spec.js", "mentions blank gap", "cites session id"],
+    "tags": ["ui-regression", "browser-test", "sanitized-real-shape"],
+    "retrieval": {
+      "query": "blank-gap codex-blank-space.spec.js",
+      "expectedSourceIds": ["walle:sanitized-terminal-ui"],
+      "expectedSnippets": ["codex-blank-space.spec.js", "completed-turn blank-gap compaction"],
+      "seedMemories": [
+        {
+          "source": "walle-jsonl",
+          "source_id": "walle:sanitized-terminal-ui:assistant:12",
+          "memory_type": "coding_session_assistant_message",
+          "timestamp": "2026-04-29T11:05:00.000Z",
+          "cwd": "/repo/tools",
+          "content": "Decision: fix completed-turn blank-gap compaction in the Codex terminal renderer. Files: claude-task-manager/public/session-stream.js, claude-task-manager/tests/codex-blank-space.spec.js. Command: npx playwright test claude-task-manager/tests/codex-blank-space.spec.js passed.",
+          "metadata": {
+            "sourceId": "walle:sanitized-terminal-ui",
+            "cwd": "/repo/tools",
+            "gitBranch": "fix/codex-terminal-blank-gap",
+            "filesEdited": ["claude-task-manager/public/session-stream.js", "claude-task-manager/tests/codex-blank-space.spec.js"],
+            "commands": ["npx playwright test claude-task-manager/tests/codex-blank-space.spec.js"]
+          }
+        }
+      ]
+    }
   },
   {
-    "id": "memory-006",
-    "prompt": "Context: In a previous conversation, the user described three microservices: AuthService (handles login, JWT tokens, runs on port 3001), OrderService (processes orders, talks to Stripe, port 3002), and NotificationService (sends emails via SendGrid, port 3003). AuthService is the most critical.\n\nQuestion: Which service handles payments and what port does it run on? Also, which service should have the highest uptime SLA?",
-    "taskType": "memory-retrieval",
-    "difficulty": "medium",
-    "expectedTraits": ["references context", "mentions OrderService", "mentions port 3002", "mentions AuthService for SLA", "accurate extraction"],
-    "tags": ["multi-fact", "architecture"]
-  },
-  {
-    "id": "memory-007",
-    "prompt": "Context: The user mentioned they tried three different approaches to solve a caching problem: 1) Redis with 5-minute TTL (too many cache misses), 2) In-memory LRU cache (worked but lost on restart), 3) Redis with write-through strategy (current solution, working well).\n\nQuestion: What caching approach is the user currently using and why did they reject the alternatives?",
-    "taskType": "memory-retrieval",
-    "difficulty": "medium",
-    "expectedTraits": ["references context", "mentions write-through", "explains rejected approaches", "accurate extraction"],
-    "tags": ["history", "decision"]
-  },
-  {
-    "id": "memory-008",
-    "prompt": "Context: The user has NOT mentioned anything about their deployment pipeline or CI/CD setup.\n\nQuestion: What CI/CD tool does the user use for their deployments?",
+    "id": "session-recall-004",
+    "prompt": "Use Wall-E memory to answer: what was the next step after the model routing quorum session?",
     "taskType": "memory-retrieval",
     "difficulty": "hard",
-    "expectedTraits": ["acknowledges missing info", "does not hallucinate", "asks for clarification", "not hallucinated"],
-    "tags": ["negative-recall", "honesty"]
+    "expectedTraits": ["searches diary", "mentions router inputs", "mentions evaluation", "cites diary/session id"],
+    "tags": ["diary", "handoff", "sanitized-real-shape"],
+    "retrieval": {
+      "query": "model routing quorum next step router inputs evaluation diary",
+      "expectedSourceIds": ["diary:walle:sanitized-quorum:stop"],
+      "expectedSnippets": ["feed quorum results into routing", "run trusted evaluation"],
+      "seedMemories": [
+        {
+          "source": "walle-diary",
+          "source_id": "diary:walle:sanitized-quorum:stop",
+          "memory_type": "agent_diary",
+          "timestamp": "2026-04-26T20:30:00.000Z",
+          "cwd": "/repo/tools",
+          "content": "Agent diary for walle session sanitized-quorum (stop)\nSummary: feed quorum results into routing instead of leaving them as a side report.\nChanged files: wall-e/evaluation/coding-quorum.js; wall-e/routing/model-router.js\nDecisions: use reviewer/quorum/security data as router inputs.\nNext steps: run trusted evaluation on coding-agent-real cases.",
+          "metadata": {
+            "sourceId": "diary:walle:sanitized-quorum:stop",
+            "sessionId": "sanitized-quorum",
+            "agent": "walle",
+            "event": "stop",
+            "cwd": "/repo/tools",
+            "changed_files": ["wall-e/evaluation/coding-quorum.js", "wall-e/routing/model-router.js"],
+            "next_steps": ["run trusted evaluation on coding-agent-real cases"]
+          }
+        }
+      ]
+    }
   },
   {
-    "id": "memory-009",
-    "prompt": "Context: The user's team follows these conventions: branch naming is `type/JIRA-123-description`, commit messages use conventional commits (feat:, fix:, chore:), PRs require 2 approvals, and they use squash merging. They also said never to force push to main.\n\nQuestion: I want to create a new feature branch for ticket PROJ-456 about adding user avatars. What should I name it and what rules should I follow?",
+    "id": "session-recall-005",
+    "prompt": "Use Wall-E memory to answer: which source adapter handled Gemini JSONL and what privacy class did it use?",
     "taskType": "memory-retrieval",
     "difficulty": "medium",
-    "expectedTraits": ["references context", "correct branch name format", "mentions conventional commits", "mentions no force push to main"],
-    "tags": ["conventions", "git"]
+    "expectedTraits": ["searches session memory", "mentions gemini-jsonl", "mentions pii_potential", "cites session id"],
+    "tags": ["source-adapter", "privacy", "sanitized-real-shape"],
+    "retrieval": {
+      "query": "Gemini JSONL source adapter privacy class pii_potential",
+      "expectedSourceIds": ["codex:sanitized-source-adapters"],
+      "expectedSnippets": ["gemini-jsonl", "pii_potential"],
+      "seedMemories": [
+        {
+          "source": "codex-jsonl",
+          "source_id": "codex:sanitized-source-adapters:assistant:8",
+          "memory_type": "coding_session_assistant_message",
+          "timestamp": "2026-04-29T14:02:00.000Z",
+          "cwd": "/repo/tools",
+          "content": "Decision: register claude-code-jsonl, codex-jsonl, gemini-jsonl, and walle-jsonl as first-party source adapters. The default privacy class for coding session adapters is pii_potential because local transcripts can include personal context.",
+          "metadata": {
+            "sourceId": "codex:sanitized-source-adapters",
+            "cwd": "/repo/tools",
+            "filesEdited": ["wall-e/sources/gemini-jsonl.js", "wall-e/sources/builtin.js"]
+          }
+        }
+      ]
+    }
   },
   {
-    "id": "memory-010",
-    "prompt": "Context: The user previously set these preferences: dark mode enabled, timezone is America/Los_Angeles, preferred language is English, notification frequency is daily digest, and they want verbose logging during development but minimal logging in production.\n\nQuestion: Summarize all of the user's configuration preferences.",
+    "id": "session-recall-006",
+    "prompt": "Use Wall-E memory to answer honestly: do we have a remembered decision about replacing SQLite with ChromaDB?",
     "taskType": "memory-retrieval",
-    "difficulty": "easy",
-    "expectedTraits": ["references context", "lists all five preferences", "accurate extraction", "not hallucinated"],
-    "tags": ["recall", "comprehensive"]
+    "difficulty": "hard",
+    "expectedTraits": ["searches session memory", "says do not replace SQLite", "mentions sqlite-vec", "does not hallucinate approval"],
+    "tags": ["negative-recall", "architecture", "sanitized-real-shape"],
+    "retrieval": {
+      "query": "replace SQLite with ChromaDB sqlite-vec decision",
+      "expectedSourceIds": ["claude:sanitized-memory-architecture"],
+      "expectedSnippets": ["Keep SQLite plus sqlite-vec", "Do not adopt ChromaDB"],
+      "seedMemories": [
+        {
+          "source": "claude-code-jsonl",
+          "source_id": "claude:sanitized-memory-architecture:exchange:2",
+          "memory_type": "coding_session_exchange",
+          "timestamp": "2026-04-29T13:45:00.000Z",
+          "cwd": "/repo/tools",
+          "content": "Q: Should Wall-E adopt ChromaDB from the reference project?\nA: Decision: Keep SQLite plus sqlite-vec. Do not adopt ChromaDB; port the source-adapter semantics and retrieval tests instead.",
+          "metadata": {
+            "sourceId": "claude:sanitized-memory-architecture",
+            "cwd": "/repo/tools",
+            "gitBranch": "feat/session-memory-protocol"
+          }
+        }
+      ]
+    }
   }
 ]

package/template/wall-e/eval/benchmarks.js CHANGED Viewed

@@ -4,6 +4,7 @@ const fs = require('fs');
 const path = require('path');
 const crypto = require('crypto');
 const { createClient } = require('../llm/client');
+const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
 // ============================================================
 // Constants
@@ -161,6 +162,23 @@ const TRAIT_MATCHERS = {
   'mentions conventional commits': (r) => /conventional\s+commit|feat:|fix:|chore:/i.test(r),
   'mentions no force push to main': (r) => /force\s+push|--force|never.*push.*main|no.*force/i.test(r),
   'lists all five preferences': (r) => /dark\s+mode|timezone|America\/Los_Angeles|English|daily\s+digest|verbose/i.test(r),
+  'searches session memory':  (r) => /session|memory|remember|transcript|source|found/i.test(r),
+  'mentions parser.js':       (r) => /parser\.js/i.test(r),
+  'mentions node --test':     (r) => /node\s+--test/i.test(r),
+  'cites session id':         (r) => /(?:session|source)[\s_-]?id|codex:sanitized|claude:sanitized|walle:sanitized|sanitized-[\w-]+/i.test(r),
+  'mentions lock contention': (r) => /lock\s+contention/i.test(r),
+  'mentions queue-worker.js': (r) => /queue-worker\.js/i.test(r),
+  'mentions codex-blank-space.spec.js': (r) => /codex-blank-space\.spec\.js/i.test(r),
+  'mentions blank gap':       (r) => /blank[-\s]?gap/i.test(r),
+  'searches diary':           (r) => /diary|agent diary|remember|memory|source/i.test(r),
+  'mentions router inputs':   (r) => /router\s+inputs|routing.*inputs/i.test(r),
+  'mentions evaluation':      (r) => /evaluation|eval|trusted\s+evaluation/i.test(r),
+  'cites diary/session id':   (r) => /diary|session[\s_-]?id|sanitized-quorum|source[\s_-]?id/i.test(r),
+  'mentions gemini-jsonl':    (r) => /gemini-jsonl/i.test(r),
+  'mentions pii_potential':   (r) => /pii_potential/i.test(r),
+  'says do not replace SQLite': (r) => /do\s+not\s+(?:adopt|replace|use).*SQLite|keep\s+SQLite|SQLite.*not\s+replace/i.test(r),
+  'mentions sqlite-vec':      (r) => /sqlite-vec/i.test(r),
+  'does not hallucinate approval': (r) => /do\s+not\s+(?:adopt|replace)|no\s+approval|not\s+approved|rejected|keep\s+SQLite/i.test(r),
   // --- Coding-agent traits ---
   'uses edit over write':       (r) => /edit_file|apply_patch|multi_edit/i.test(r) && !/write_file/i.test(r),
@@ -214,6 +232,15 @@ const TRAIT_MATCHERS = {
   'asks clarifying questions':  (r) => /ask_user|AskUserQuestion/i.test(r),
 };
+// Traits that dataset entries may list but that are never scored automatically.
+// Suite validation accepts them, scoreTrait() always returns false for them, and
+// scoreTraitsDetailed() reports them under `unscored` without counting them
+// toward the score.
+const UNSCORABLE_TRAITS = new Set([
+  'accurate',
+  'correct solution',
+  'references context',
+  'accurate extraction',
+  'not hallucinated',
+  'does not hallucinate',
+]);
 // ============================================================
 // Suite loading
 // ============================================================
@@ -298,6 +325,12 @@ function loadBenchmarkSuite(suiteName) {
     if (!hasTraits && !hasReplyChecks && !hasToolChecks && !hasMockTools && !hasAgentExpectations && !isEdgeCase) {
       throw new Error(`Benchmark entry ${entry.id} has no scoring signal (expectedTraits / expectedInReply / expectedTools / mockToolResults / agentExpectations)`);
     }
+    if (hasTraits) {
+      const unknownTraits = entry.expectedTraits.filter(t => !TRAIT_MATCHERS[t] && !UNSCORABLE_TRAITS.has(t));
+      if (unknownTraits.length) {
+        throw new Error(`Benchmark entry ${entry.id} has unknown expectedTraits: ${unknownTraits.join(', ')}`);
+      }
+    }
   }
   return { name: suiteName, prompts };
@@ -315,16 +348,41 @@ function loadBenchmarkSuite(suiteName) {
  */
 function scoreTrait(response, trait) {
   if (!response || typeof response !== 'string') return false;
+  if (UNSCORABLE_TRAITS.has(trait)) return false;
   const matcher = TRAIT_MATCHERS[trait];
-  if (!matcher) {
-    // Unknown trait — fallback: search for the trait keywords in the response
-    const keywords = trait.toLowerCase().split(/\s+/);
-    const lower = response.toLowerCase();
-    return keywords.some((kw) => kw.length > 3 && lower.includes(kw));
-  }
+  if (!matcher) return false;
   return matcher(response);
 }
+/**
+ * Score a response against expected traits and report how each trait was classified.
+ *
+ * Traits in UNSCORABLE_TRAITS are listed under `unscored`; traits with no entry
+ * of their own in TRAIT_MATCHERS are listed under `unknown`. Neither group
+ * counts toward the score.
+ *
+ * @param {string} response - LLM response text
+ * @param {string[]} expectedTraits - Trait names to check
+ * @returns {{score: number, matched: string[], missed: string[], unscored: string[], unknown: string[], scoredCount: number}}
+ *   `score` is matched.length / scoredCount, or 0 when no trait was scorable.
+ */
+function scoreTraitsDetailed(response, expectedTraits) {
+  const detail = {
+    score: 0,
+    matched: [],
+    missed: [],
+    unscored: [],
+    unknown: [],
+    scoredCount: 0,
+  };
+  if (!expectedTraits || expectedTraits.length === 0) return detail;
+  for (const trait of expectedTraits) {
+    if (UNSCORABLE_TRAITS.has(trait)) {
+      detail.unscored.push(trait);
+      continue;
+    }
+    // Own-property check: a plain TRAIT_MATCHERS[trait] lookup would also find
+    // inherited Object.prototype members such as "constructor" or "toString"
+    // and score them as if they were registered matchers.
+    if (!Object.prototype.hasOwnProperty.call(TRAIT_MATCHERS, trait)) {
+      detail.unknown.push(trait);
+      continue;
+    }
+    detail.scoredCount++;
+    if (scoreTrait(response, trait)) detail.matched.push(trait);
+    else detail.missed.push(trait);
+  }
+  detail.score = detail.scoredCount > 0 ? detail.matched.length / detail.scoredCount : 0;
+  return detail;
+}
 /**
  * Score a response against multiple expected traits.
  * @param {string} response - LLM response text
@@ -332,9 +390,7 @@ function scoreTrait(response, trait) {
  * @returns {number} 0.0 to 1.0 based on percentage of traits matched
  */
 function scoreTraits(response, expectedTraits) {
-  if (!expectedTraits || expectedTraits.length === 0) return 0;
-  const matched = expectedTraits.filter((t) => scoreTrait(response, t)).length;
-  return matched / expectedTraits.length;
+  return scoreTraitsDetailed(response, expectedTraits).score;
 }
 // ============================================================
@@ -407,11 +463,13 @@ async function runBenchmark(brain, { suite, providers, judgeFn, timeoutMs = DEFA
         providerScores[providerKey].errors++;
       }
-      // Score traits
-      const traitScore = response ? scoreTraits(response, entry.expectedTraits) : 0;
-      const matchedTraits = response
-        ? entry.expectedTraits.filter((t) => scoreTrait(response, t))
-        : [];
+      // Score traits. Some dataset traits are intentionally marked unscorable:
+      // they document desired behavior but must not inflate automatic scores.
+      const traitDetail = response
+        ? scoreTraitsDetailed(response, entry.expectedTraits)
+        : scoreTraitsDetailed('', entry.expectedTraits);
+      const traitScore = traitDetail.score;
+      const matchedTraits = traitDetail.matched;
       // Optional LLM judge
       let judgeScore = null;
@@ -430,6 +488,9 @@ async function runBenchmark(brain, { suite, providers, judgeFn, timeoutMs = DEFA
       const compositeScore = judgeScore != null
         ? traitScore * 0.6 + judgeScore * 0.4
         : traitScore;
+      const scoringMethod = judgeScore != null
+        ? 'trait+judge'
+        : traitDetail.scoredCount > 0 ? 'traits' : 'unscored-traits';
       providerScores[providerKey].total += compositeScore;
       providerScores[providerKey].count++;
@@ -443,7 +504,7 @@ async function runBenchmark(brain, { suite, providers, judgeFn, timeoutMs = DEFA
         ...(judgeScore != null ? { codeQuality: judgeScore } : {}),
       };
-      const resultEntry = {
+      const resultEntry = decorateBenchmarkResult({
         runId,
         suite,
         promptId: entry.id,
@@ -464,9 +525,30 @@ async function runBenchmark(brain, { suite, providers, judgeFn, timeoutMs = DEFA
         outputTokens: usage?.output ?? usage?.completion_tokens ?? null,
         genTokPerSec: usage?.genTokPerSec ?? null,
         dimensionsJson: JSON.stringify(dimensions),
+        modelMetadataJson: JSON.stringify({
+          matchedTraits,
+          missedTraits: traitDetail.missed,
+          unscoredTraits: traitDetail.unscored,
+          unknownTraits: traitDetail.unknown,
+          scoredTraitCount: traitDetail.scoredCount,
+        }),
         error,
+        scorerVersion: DEFAULT_SCORER_VERSION,
+        scoringMethod,
+        trusted: !error && judgeScore != null,
+        runConfig: { timeoutMs },
         timestamp: new Date().toISOString(),
-      };
+      }, {
+        suite,
+        benchmark: entry,
+        runId,
+        provider: provider.type,
+        model: provider.model,
+        scoringMethod,
+        scorerVersion: DEFAULT_SCORER_VERSION,
+        trusted: !error && judgeScore != null,
+        runConfig: { timeoutMs },
+      });
       results.push(resultEntry);
@@ -571,8 +653,10 @@ module.exports = {
   loadAllBenchmarks,
   scoreTrait,
   scoreTraits,
+  scoreTraitsDetailed,
   runBenchmark,
   getBenchmarkLeaderboard,
   TRAIT_MATCHERS,
+  UNSCORABLE_TRAITS,
   BENCHMARKS_DIR,
 };