npm - create-walle - Versions diffs - 0.9.11 → 0.9.13 - Mend

create-walle 0.9.11 → 0.9.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (167)

package/README.md +3 -3
package/package.json +2 -2
package/template/bin/dev.sh +7 -1
package/template/bin/setup.js +53 -9
package/template/bin/sync-images.js +53 -0
package/template/builder-journal.md +17 -0
package/template/claude-task-manager/api-prompts.js +98 -13
package/template/claude-task-manager/api-reviews.js +82 -5
package/template/claude-task-manager/db.js +32 -5
package/template/claude-task-manager/docs/session-capture-foundation-design.md +1273 -0
package/template/claude-task-manager/lib/claude-desktop-sessions.js +696 -0
package/template/claude-task-manager/lib/coding-agent-models.js +49 -1
package/template/claude-task-manager/lib/session-capture.js +421 -0
package/template/claude-task-manager/lib/session-history.js +135 -15
package/template/claude-task-manager/lib/session-jobs.js +10 -5
package/template/claude-task-manager/lib/session-stream.js +87 -19
package/template/claude-task-manager/lib/setup-provider-config.js +115 -0
package/template/claude-task-manager/lib/walle-ctm-history.js +72 -0
package/template/claude-task-manager/lib/walle-session-context.js +61 -0
package/template/claude-task-manager/lib/walle-transcript.js +176 -0
package/template/claude-task-manager/public/css/setup.css +35 -8
package/template/claude-task-manager/public/css/walle-session.css +56 -0
package/template/claude-task-manager/public/css/walle.css +120 -0
package/template/claude-task-manager/public/index.html +814 -181
package/template/claude-task-manager/public/js/message-renderer.js +148 -19
package/template/claude-task-manager/public/js/reviews.js +120 -62
package/template/claude-task-manager/public/js/setup.js +75 -31
package/template/claude-task-manager/public/js/stream-view.js +115 -55
package/template/claude-task-manager/public/js/walle-session.js +84 -2
package/template/claude-task-manager/public/js/walle.js +308 -54
package/template/claude-task-manager/server.js +1092 -146
package/template/claude-task-manager/session-integrity.js +181 -54
package/template/claude-task-manager/session-utils.js +123 -41
package/template/claude-task-manager/workers/state-detectors/codex.js +5 -2
package/template/package.json +1 -1
package/template/wall-e/adapters/ctm.js +39 -18
package/template/wall-e/agent-runners/contract.js +17 -0
package/template/wall-e/agent-runners/index.js +22 -0
package/template/wall-e/agent-runtime/harness.js +212 -0
package/template/wall-e/agent-runtime/index.js +8 -0
package/template/wall-e/agent-runtime/registry.js +67 -0
package/template/wall-e/agent-runtime/session-store.js +179 -0
package/template/wall-e/agent-runtime/spawn.js +208 -0
package/template/wall-e/api-walle.js +174 -7
package/template/wall-e/brain.js +266 -28
package/template/wall-e/channels/policy.js +88 -0
package/template/wall-e/channels/registry.js +15 -1
package/template/wall-e/channels/reply-dispatcher.js +70 -0
package/template/wall-e/channels/session-bindings.js +51 -0
package/template/wall-e/chat/code-review-context.js +29 -0
package/template/wall-e/chat.js +188 -42
package/template/wall-e/coding/acp-adapter.js +188 -0
package/template/wall-e/coding/agent-catalog.js +129 -0
package/template/wall-e/coding/compaction-service.js +247 -0
package/template/wall-e/coding/execution-trace.js +3 -0
package/template/wall-e/coding/instruction-service.js +224 -0
package/template/wall-e/coding/model-message.js +67 -0
package/template/wall-e/coding/permission-rules-store.js +111 -0
package/template/wall-e/coding/permission-service.js +266 -0
package/template/wall-e/coding/prompt-bundle.js +67 -0
package/template/wall-e/coding/prompt-runtime.js +243 -0
package/template/wall-e/coding/provider-transform.js +188 -0
package/template/wall-e/coding/runtime-mode.js +132 -0
package/template/wall-e/coding/snapshot-service.js +155 -0
package/template/wall-e/coding/stream-processor.js +268 -0
package/template/wall-e/coding/task-tool.js +255 -0
package/template/wall-e/coding/tool-registry.js +361 -0
package/template/wall-e/coding/transcript-writer.js +143 -0
package/template/wall-e/coding/workspace-replay.js +324 -0
package/template/wall-e/coding-context.js +4 -22
package/template/wall-e/coding-orchestrator.js +307 -18
package/template/wall-e/coding-prompts.js +44 -3
package/template/wall-e/context/context-builder.js +43 -1
package/template/wall-e/context/topic-matcher.js +1 -1
package/template/wall-e/eval/agent-runner.js +59 -13
package/template/wall-e/eval/benchmarks/memory-retrieval.json +155 -57
package/template/wall-e/eval/benchmarks.js +100 -16
package/template/wall-e/eval/eval-orchestrator.js +218 -8
package/template/wall-e/eval/harvester.js +62 -5
package/template/wall-e/eval/head-to-head.js +23 -2
package/template/wall-e/eval/humaneval-adapter.js +30 -5
package/template/wall-e/eval/livecodebench-adapter.js +29 -5
package/template/wall-e/eval/manifest.js +186 -0
package/template/wall-e/eval/run-agent-benchmarks.js +66 -2
package/template/wall-e/eval/session-retrieval-benchmark.js +150 -0
package/template/wall-e/eval/session-transcripts.js +57 -4
package/template/wall-e/eval/swebench-adapter.js +109 -3
package/template/wall-e/evaluation/agent-router.js +53 -1
package/template/wall-e/evaluation/coding-quorum.js +48 -1
package/template/wall-e/evaluation/router.js +4 -2
package/template/wall-e/evaluation/tier-selector.js +11 -1
package/template/wall-e/extraction/contradiction.js +2 -2
package/template/wall-e/extraction/indexer.js +2 -1
package/template/wall-e/extraction/knowledge-extractor.js +2 -2
package/template/wall-e/hooks/cli.js +92 -0
package/template/wall-e/hooks/discovery.js +119 -0
package/template/wall-e/hooks/index.js +7 -0
package/template/wall-e/hooks/manifest.js +55 -0
package/template/wall-e/hooks/runtime.js +84 -0
package/template/wall-e/hooks/session-memory.js +225 -0
package/template/wall-e/http/auth.js +6 -2
package/template/wall-e/http/chat-api.js +54 -8
package/template/wall-e/integrations/claude-plugin/hooks/hooks.json +27 -0
package/template/wall-e/integrations/claude-plugin/hooks/walle-precompact-hook.sh +5 -0
package/template/wall-e/integrations/claude-plugin/hooks/walle-stop-hook.sh +5 -0
package/template/wall-e/integrations/codex-plugin/hooks/walle-hook.sh +7 -0
package/template/wall-e/integrations/codex-plugin/hooks.json +37 -0
package/template/wall-e/listening/calendar.js +3 -1
package/template/wall-e/llm/client.js +64 -10
package/template/wall-e/llm/google.js +39 -5
package/template/wall-e/llm/ollama.js +1 -1
package/template/wall-e/llm/ollama.plugin.json +1 -1
package/template/wall-e/llm/provider-availability.js +10 -0
package/template/wall-e/llm/provider-error.js +269 -0
package/template/wall-e/llm/tool-adapter.js +48 -12
package/template/wall-e/loops/boot.js +2 -1
package/template/wall-e/loops/initiative.js +2 -2
package/template/wall-e/loops/tasks.js +8 -47
package/template/wall-e/loops/workspace-prompts.js +20 -0
package/template/wall-e/mcp-server.js +442 -1
package/template/wall-e/memory/session-ingest-service.js +159 -0
package/template/wall-e/memory/source-indexer.js +289 -0
package/template/wall-e/plugins/discovery.js +83 -0
package/template/wall-e/plugins/manifest-loader.js +50 -10
package/template/wall-e/plugins/manifest-schema.js +69 -0
package/template/wall-e/plugins/model-catalog.js +55 -0
package/template/wall-e/prompts/coding/base.txt +2 -0
package/template/wall-e/prompts/coding/deepseek.txt +1 -0
package/template/wall-e/prompts/coding/memory-protocol.md +9 -0
package/template/wall-e/prompts/coding/plan.txt +1 -0
package/template/wall-e/runtime/execution-trace.js +220 -0
package/template/wall-e/security/audit.js +266 -0
package/template/wall-e/security/ssrf.js +236 -0
package/template/wall-e/session-files.js +303 -0
package/template/wall-e/skills/_bundled/slack-backfill/SKILL.md +3 -0
package/template/wall-e/skills/_bundled/slack-sync/SKILL.md +3 -0
package/template/wall-e/skills/internal-skill-registry.js +2 -2
package/template/wall-e/skills/script-skill-runner.js +143 -0
package/template/wall-e/skills/skill-executor.js +5 -6
package/template/wall-e/skills/skill-fallback.js +3 -1
package/template/wall-e/skills/skill-harness-registry.js +7 -8
package/template/wall-e/skills/skill-planner.js +52 -4
package/template/wall-e/skills/slack-ingest.js +11 -3
package/template/wall-e/sources/base.js +90 -0
package/template/wall-e/sources/builtin.js +33 -0
package/template/wall-e/sources/claude-code-jsonl.js +78 -0
package/template/wall-e/sources/codex-jsonl.js +125 -0
package/template/wall-e/sources/coding-session-utils.js +117 -0
package/template/wall-e/sources/contract-suite.js +59 -0
package/template/wall-e/sources/gemini-jsonl.js +85 -0
package/template/wall-e/sources/index.js +9 -0
package/template/wall-e/sources/jsonl-utils.js +181 -0
package/template/wall-e/sources/record-types.js +252 -0
package/template/wall-e/sources/registry.js +92 -0
package/template/wall-e/sources/transforms.js +100 -0
package/template/wall-e/sources/walle-jsonl.js +108 -0
package/template/wall-e/tools/coding-middleware.js +31 -1
package/template/wall-e/tools/file-tracker.js +25 -1
package/template/wall-e/tools/local-tools.js +75 -47
package/template/wall-e/tools/session-sharing.js +68 -1
package/template/wall-e/tools/shell-analyzer.js +1 -1
package/template/wall-e/tools/shell-policy.js +47 -0
package/template/wall-e/tools/snapshot.js +42 -0
package/template/wall-e/training/harvester.js +62 -5
package/template/wall-e/utils/repair.js +253 -1
package/template/website/index.html +3 -3
package/template/wall-e/skills/_bundled/slack-mentions/.watched-threads.json +0 -18

package/template/wall-e/eval/manifest.js ADDED

@@ -0,0 +1,186 @@
+'use strict';
+const crypto = require('crypto');
+const path = require('path');
+const { execFileSync } = require('child_process');
+const DEFAULT_DATASET_VERSION = 'local-v1';
+const DEFAULT_SCORER_VERSION = 'wall-e-eval-v2';
+const DEFAULT_EVALUATOR_VERSION = 'wall-e-evaluator-v2';
+let cachedRepoSha;
+function stableStringify(value) {
+  if (value === null || typeof value !== 'object') return JSON.stringify(value);
+  if (Array.isArray(value)) return '[' + value.map(stableStringify).join(',') + ']';
+  return '{' + Object.keys(value).sort().map((key) => (
+    JSON.stringify(key) + ':' + stableStringify(value[key])
+  )).join(',') + '}';
+}
+function sha256(value) {
+  return crypto.createHash('sha256').update(String(value ?? '')).digest('hex');
+}
+function hashObject(value) {
+  return sha256(stableStringify(value));
+}
+function getRepoSha(cwd = path.resolve(__dirname, '..')) {
+  if (cachedRepoSha !== undefined) return cachedRepoSha;
+  try {
+    cachedRepoSha = execFileSync('git', ['rev-parse', 'HEAD'], {
+      cwd,
+      encoding: 'utf8',
+      stdio: ['ignore', 'pipe', 'ignore'],
+    }).trim() || null;
+  } catch {
+    cachedRepoSha = null;
+  }
+  return cachedRepoSha;
+}
+function safeJson(value) {
+  if (value == null) return null;
+  if (typeof value === 'string') return value;
+  try { return JSON.stringify(value); } catch { return null; }
+}
+function pickRunConfig(config = {}) {
+  const allowed = [
+    'temperature', 'seed', 'maxTokens', 'timeoutMs', 'concurrency',
+    'budgetDollars', 'suite', 'taskType', 'scoringMethod',
+  ];
+  const out = {};
+  for (const key of allowed) {
+    if (config[key] !== undefined) out[key] = config[key];
+  }
+  return out;
+}
+function samplePayloadForHash(benchmark = {}) {
+  return {
+    id: benchmark.id || benchmark.promptId || benchmark.benchmark_id || null,
+    prompt: benchmark.prompt || '',
+    taskType: benchmark.taskType || null,
+    difficulty: benchmark.difficulty || null,
+    expectedTraits: benchmark.expectedTraits || null,
+    expectedInReply: benchmark.expectedInReply || null,
+    agentExpectations: benchmark.agentExpectations || null,
+  };
+}
+function buildEvalManifest({
+  suite,
+  benchmark = {},
+  runId,
+  provider,
+  model,
+  runConfig = {},
+  scorerVersion = DEFAULT_SCORER_VERSION,
+  evaluatorVersion = DEFAULT_EVALUATOR_VERSION,
+  scoringMethod,
+  artifactPath,
+  trusted,
+} = {}) {
+  const sampleId = benchmark.sampleId || benchmark.id || benchmark.promptId || benchmark.benchmark_id || null;
+  const datasetVersion = benchmark.datasetVersion || `${suite || 'unknown'}:${DEFAULT_DATASET_VERSION}`;
+  const datasetHash = benchmark.datasetHash || hashObject({
+    suite: suite || 'unknown',
+    datasetVersion,
+    sample: samplePayloadForHash(benchmark),
+  });
+  const promptHash = sha256(benchmark.prompt || '');
+  const sanitizedConfig = pickRunConfig({ ...runConfig, suite, scoringMethod });
+  const repoSha = getRepoSha();
+  const manifest = {
+    runId: runId || null,
+    suite: suite || null,
+    datasetVersion,
+    datasetHash,
+    sampleId,
+    promptHash,
+    provider: provider || null,
+    model: model || null,
+    modelSnapshot: benchmark.modelSnapshot || model || null,
+    scorerVersion,
+    evaluatorVersion,
+    scoringMethod: scoringMethod || null,
+    repoSha,
+    runConfig: sanitizedConfig,
+    artifactPath: artifactPath || null,
+    trusted: trusted === undefined ? null : !!trusted,
+  };
+  return {
+    sampleId,
+    datasetVersion,
+    datasetHash,
+    promptHash,
+    repoSha,
+    scorerVersion,
+    evaluatorVersion,
+    scoringMethod: scoringMethod || null,
+    runConfigJson: safeJson(sanitizedConfig),
+    evalManifestJson: safeJson(manifest),
+    artifactPath: artifactPath || null,
+    modelSnapshot: benchmark.modelSnapshot || model || null,
+    temperature: sanitizedConfig.temperature ?? null,
+    seed: sanitizedConfig.seed ?? null,
+  };
+}
+function decorateBenchmarkResult(entry = {}, context = {}) {
+  const suite = entry.suite || context.suite;
+  const benchmark = {
+    ...(context.benchmark || {}),
+    id: entry.promptId || entry.benchmark_id || context.benchmark?.id,
+    prompt: entry.prompt || context.benchmark?.prompt,
+    taskType: entry.taskType || context.benchmark?.taskType,
+    difficulty: entry.difficulty || context.benchmark?.difficulty,
+  };
+  const manifest = buildEvalManifest({
+    suite,
+    benchmark,
+    runId: entry.runId || context.runId,
+    provider: entry.provider || context.provider,
+    model: entry.model || context.model,
+    runConfig: context.runConfig || {},
+    scorerVersion: entry.scorerVersion || context.scorerVersion,
+    evaluatorVersion: entry.evaluatorVersion || context.evaluatorVersion,
+    scoringMethod: entry.scoringMethod || context.scoringMethod,
+    artifactPath: entry.artifactPath || context.artifactPath,
+    trusted: entry.trusted ?? context.trusted,
+  });
+  return {
+    ...entry,
+    sampleId: entry.sampleId || manifest.sampleId,
+    datasetVersion: entry.datasetVersion || manifest.datasetVersion,
+    datasetHash: entry.datasetHash || manifest.datasetHash,
+    promptHash: entry.promptHash || manifest.promptHash,
+    repoSha: entry.repoSha || manifest.repoSha,
+    scorerVersion: entry.scorerVersion || manifest.scorerVersion,
+    evaluatorVersion: entry.evaluatorVersion || manifest.evaluatorVersion,
+    scoringMethod: entry.scoringMethod || manifest.scoringMethod,
+    runConfigJson: entry.runConfigJson || manifest.runConfigJson,
+    evalManifestJson: entry.evalManifestJson || manifest.evalManifestJson,
+    artifactPath: entry.artifactPath || manifest.artifactPath,
+    modelSnapshot: entry.modelSnapshot || manifest.modelSnapshot,
+    temperature: entry.temperature ?? manifest.temperature,
+    seed: entry.seed ?? manifest.seed,
+  };
+}
+module.exports = {
+  DEFAULT_DATASET_VERSION,
+  DEFAULT_SCORER_VERSION,
+  DEFAULT_EVALUATOR_VERSION,
+  stableStringify,
+  sha256,
+  hashObject,
+  getRepoSha,
+  buildEvalManifest,
+  decorateBenchmarkResult,
+};

package/template/wall-e/eval/run-agent-benchmarks.js CHANGED

@@ -25,9 +25,11 @@ try {
 } catch {}
 const path = require('path');
+const crypto = require('crypto');
 process.chdir(path.join(__dirname, '..'));
-const { setupSandbox, cleanupSandbox, runAgentBenchmark, runAgentBenchmarkSuite } = require('./agent-runner');
+const { setupSandbox, cleanupSandbox, runAgentBenchmark, runAgentBenchmarkSuite, resolveModelName } = require('./agent-runner');
+const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
 const benchmarks = require('./benchmarks/coding-agent.json');
 /**
@@ -227,6 +229,7 @@ async function main() {
   console.log(`Running ${selectedBenchmarks.length} benchmarks...\n`);
+  const runId = crypto.randomUUID();
   const results = [];
   for (const benchmark of selectedBenchmarks) {
     const startTime = Date.now();
@@ -244,7 +247,10 @@ async function main() {
       });
       const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
+      result.runId = runId;
+      result.timestamp = new Date().toISOString();
       results.push(result);
+      storeBenchmarkResult({ brain, runId, benchmark, result, provider, modelId, timeoutMs: 600000 });
       console.log(`  Success: ${result.success}`);
       console.log(`  Score: ${(result.score?.composite || 0).toFixed(3)}`);
@@ -263,7 +269,16 @@ async function main() {
       if (result.error) console.log(`  Error: ${result.error}`);
     } catch (err) {
       console.error(`  EXCEPTION: ${err.message}`);
-      results.push({ benchmarkId: benchmark.id, success: false, error: err.message, score: { composite: 0 } });
+      const result = {
+        benchmarkId: benchmark.id,
+        success: false,
+        error: err.message,
+        score: { composite: 0 },
+        runId,
+        timestamp: new Date().toISOString(),
+      };
+      results.push(result);
+      storeBenchmarkResult({ brain, runId, benchmark, result, provider, modelId, timeoutMs: 600000 });
     }
     console.log('');
   }
@@ -312,3 +327,52 @@ main().catch(err => {
   console.error('Fatal error:', err);
   process.exit(1);
 });
+function storeBenchmarkResult({ brain, runId, benchmark, result, provider, modelId, timeoutMs }) {
+  if (!brain || typeof brain.insertBenchmarkResult !== 'function') return;
+  try {
+    const scoringMethod = benchmark.agentExpectations?.testCommand
+      ? 'agent-rubric+tests'
+      : 'agent-rubric';
+    brain.insertBenchmarkResult(decorateBenchmarkResult({
+      runId,
+      suite: 'coding-agent',
+      promptId: benchmark.id,
+      taskType: 'coding-agent',
+      difficulty: benchmark.difficulty,
+      provider: provider?.type || 'default',
+      model: resolveModelName(modelId),
+      prompt: benchmark.prompt,
+      response: result.output || '',
+      traitScore: null,
+      matchedTraits: [],
+      compositeScore: result.score?.composite || 0,
+      latencyMs: result.latencyMs,
+      error: result.error,
+      timestamp: result.timestamp,
+      costDollars: result.costDollars || null,
+      testsBefore: result.testsBefore ?? null,
+      testsAfter: result.testsAfter ?? null,
+      totalTests: result.totalTests ?? null,
+      dimensionsJson: result.dimensionsJson || null,
+      inputTokens: result.inputTokens ?? null,
+      outputTokens: result.outputTokens ?? null,
+      scorerVersion: DEFAULT_SCORER_VERSION,
+      scoringMethod,
+      trusted: !result.error && result.testsPassed === true,
+      runConfig: { timeoutMs, scoringMethod },
+    }, {
+      suite: 'coding-agent',
+      benchmark,
+      runId,
+      provider: provider?.type || 'default',
+      model: resolveModelName(modelId),
+      scoringMethod,
+      scorerVersion: DEFAULT_SCORER_VERSION,
+      trusted: !result.error && result.testsPassed === true,
+      runConfig: { timeoutMs, scoringMethod },
+    }));
+  } catch (err) {
+    console.warn(`  [WARN] Failed to store benchmark result: ${err.message}`);
+  }
+}

package/template/wall-e/eval/session-retrieval-benchmark.js ADDED

@@ -0,0 +1,150 @@
+'use strict';
+const fs = require('node:fs');
+const path = require('node:path');
+const brainDefault = require('../brain');
+const { indexMemory } = require('../memory/source-indexer');
+const DEFAULT_CASES_PATH = path.join(__dirname, 'benchmarks', 'memory-retrieval.json');
+function loadMemoryRetrievalCases(filePath = DEFAULT_CASES_PATH) {
+  const parsed = JSON.parse(fs.readFileSync(filePath, 'utf8'));
+  return Array.isArray(parsed) ? parsed : [];
+}
+function seedBenchmarkMemories({ brain = brainDefault, cases = [] } = {}) {
+  let inserted = 0;
+  let indexed = 0;
+  for (const bench of cases) {
+    const memories = bench.retrieval?.seedMemories || [];
+    for (const seed of memories) {
+      const result = brain.insertMemory({
+        source: seed.source || 'codex-jsonl',
+        source_id: seed.source_id,
+        source_channel: seed.cwd || '',
+        memory_type: seed.memory_type || 'coding_session_exchange',
+        direction: seed.direction || 'exchange',
+        subject: seed.subject || seed.source_id,
+        content: seed.content,
+        content_raw: seed.content,
+        metadata: JSON.stringify(seed.metadata || {}),
+        importance: seed.importance ?? 0.7,
+        timestamp: seed.timestamp || new Date().toISOString(),
+      });
+      if (!result) continue;
+      inserted++;
+      indexMemory({
+        ...seed,
+        id: result.id,
+        source: seed.source || 'codex-jsonl',
+        source_id: seed.source_id,
+        source_channel: seed.cwd || '',
+        memory_type: seed.memory_type || 'coding_session_exchange',
+        content: seed.content,
+        metadata: JSON.stringify(seed.metadata || {}),
+        timestamp: seed.timestamp || new Date().toISOString(),
+      }, { brain });
+      indexed++;
+    }
+  }
+  return { inserted, indexed };
+}
+function searchRetrievalCase({ brain = brainDefault, query, limit = 10 } = {}) {
+  const max = Math.min(Math.max(Number(limit) || 10, 1), 50);
+  const direct = brain.searchMemories({ query, limit: max * 3 });
+  let indexed = [];
+  try {
+    const rows = brain.searchMemoryIndex({ query, limit: max * 3 });
+    indexed = hydrateIndexRows(brain, rows);
+  } catch {}
+  return mergeById(direct, indexed).slice(0, max);
+}
+function scoreRetrievalCase(bench, results, { ks = [5, 10] } = {}) {
+  const expected = new Set(bench.retrieval?.expectedSourceIds || []);
+  const sourceIds = results.map(resultSourceId);
+  const out = {
+    id: bench.id,
+    query: bench.retrieval?.query || '',
+    expected: [...expected],
+    returned: sourceIds,
+  };
+  for (const k of ks) {
+    out[`hit_at_${k}`] = sourceIds.slice(0, k).some((id) => expected.has(id));
+  }
+  return out;
+}
+function runMemoryRetrievalBenchmark({ brain = brainDefault, cases = loadMemoryRetrievalCases(), seed = false, limit = 10 } = {}) {
+  if (seed) seedBenchmarkMemories({ brain, cases });
+  const results = [];
+  for (const bench of cases) {
+    const query = bench.retrieval?.query || bench.prompt || '';
+    const hits = searchRetrievalCase({ brain, query, limit });
+    results.push(scoreRetrievalCase(bench, hits));
+  }
+  return summarizeRetrievalResults(results);
+}
+function summarizeRetrievalResults(results) {
+  const total = results.length || 1;
+  const hitAt5 = results.filter((result) => result.hit_at_5).length;
+  const hitAt10 = results.filter((result) => result.hit_at_10).length;
+  return {
+    total: results.length,
+    recall_at_5: hitAt5 / total,
+    recall_at_10: hitAt10 / total,
+    results,
+  };
+}
+function hydrateIndexRows(brain, rows) {
+  if (!rows?.length) return [];
+  const ids = [...new Set(rows.map((row) => row.memory_id).filter(Boolean))];
+  if (!ids.length) return [];
+  const placeholders = ids.map(() => '?').join(',');
+  return brain.getDb().prepare(`
+    SELECT * FROM memories
+    WHERE archived_at IS NULL AND id IN (${placeholders})
+  `).all(...ids);
+}
+function mergeById(...groups) {
+  const seen = new Set();
+  const merged = [];
+  for (const group of groups) {
+    for (const item of group || []) {
+      if (!item?.id || seen.has(item.id)) continue;
+      seen.add(item.id);
+      merged.push(item);
+    }
+  }
+  return merged;
+}
+function resultSourceId(result = {}) {
+  try {
+    const metadata = JSON.parse(result.metadata || '{}');
+    if (metadata?.sourceId) return metadata.sourceId;
+  } catch {}
+  const sourceId = String(result.source_id || '');
+  const parts = sourceId.split(':');
+  return parts.length > 2 ? parts.slice(0, 2).join(':') : sourceId;
+}
+if (require.main === module) {
+  brainDefault.initDb();
+  const summary = runMemoryRetrievalBenchmark({ seed: process.argv.includes('--seed') });
+  console.log(JSON.stringify(summary, null, 2));
+}
+module.exports = {
+  loadMemoryRetrievalCases,
+  resultSourceId,
+  runMemoryRetrievalBenchmark,
+  scoreRetrievalCase,
+  searchRetrievalCase,
+  seedBenchmarkMemories,
+  summarizeRetrievalResults,
+};

package/template/wall-e/eval/session-transcripts.js CHANGED

@@ -13,6 +13,12 @@ const {
 const DEFAULT_TRANSCRIPT_ROOTS = {
   claude: path.join(os.homedir(), '.claude', 'projects'),
   codex: path.join(os.homedir(), '.codex', 'sessions'),
+  walle: process.env.WALLE_SESSIONS_DIR
+    || process.env.WALL_E_SESSIONS_DIR
+    || process.env.WALL_E_SESSION_DIR
+    || process.env.WALLE_SESSION_DIR
+    || (process.env.WALLE_DEV_DIR ? path.join(process.env.WALLE_DEV_DIR, 'sessions') : '')
+    || path.join(process.env.WALL_E_DATA_DIR || path.join(os.homedir(), '.walle'), 'sessions'),
 };
 const MIN_PROMPT_CHARS = 20;
@@ -21,9 +27,12 @@ function detectTranscriptSource(jsonlPath, events = null) {
   const normalized = path.normalize(jsonlPath || '');
   if (normalized.includes(`${path.sep}.claude${path.sep}projects${path.sep}`)) return 'claude';
   if (normalized.includes(`${path.sep}.codex${path.sep}sessions${path.sep}`)) return 'codex';
+  if (normalized.includes(`${path.sep}.walle${path.sep}sessions${path.sep}`)) return 'walle';
   const sample = events || readJsonlEvents(jsonlPath).slice(0, 50);
   for (const evt of sample) {
+    if (evt?.walle?.schema === 'wall-e-session-v1') return 'walle';
+    if (evt?.provider === 'walle' || evt?.type === 'walle_part') return 'walle';
     if (evt?.type === 'session_meta' && evt.payload?.originator === 'codex-tui') return 'codex';
     if (evt?.type === 'turn_context' && evt.payload?.cwd) return 'codex';
     if (evt?.type === 'response_item' && evt.payload) return 'codex';
@@ -57,8 +66,9 @@ function parseTranscriptJsonl(jsonlPath, { repoPath = null, minPromptChars = MIN
   const source = detectTranscriptSource(jsonlPath, events);
   let session = null;
-  if (source === 'claude') session = parseClaudeTranscript(jsonlPath, events, minPromptChars);
+  if (source === 'claude') session = parseClaudeTranscript(jsonlPath, events, minPromptChars, 'claude');
   else if (source === 'codex') session = parseCodexTranscript(jsonlPath, events, minPromptChars);
+  else if (source === 'walle') session = parseWalleTranscript(jsonlPath, events, minPromptChars);
   else session = parseUnknownTranscript(jsonlPath, events, minPromptChars);
   if (!session || !session.cwd) return null;
@@ -77,8 +87,8 @@ function readJsonlEvents(jsonlPath) {
   return events;
 }
-function parseClaudeTranscript(jsonlPath, events, minPromptChars) {
-  const session = baseSession(jsonlPath, 'claude');
+function parseClaudeTranscript(jsonlPath, events, minPromptChars, source = 'claude') {
+  const session = baseSession(jsonlPath, source);
   const editedFiles = new Set();
   for (const evt of events) {
@@ -152,6 +162,48 @@ function parseCodexTranscript(jsonlPath, events, minPromptChars) {
   return session;
 }
+function parseWalleTranscript(jsonlPath, events, minPromptChars) {
+  const session = baseSession(jsonlPath, 'walle');
+  const editedFiles = new Set();
+  for (const evt of events) {
+    const ts = evt.timestamp || null;
+    if (ts) setSessionTime(session, ts);
+    if (evt.cwd && !session.cwd) session.cwd = evt.cwd;
+    if (evt.gitBranch && !session.gitBranch) session.gitBranch = evt.gitBranch;
+    if (evt.type === 'session_meta') {
+      session.sessionId = evt.sessionId || session.sessionId;
+      if (evt.cwd && !session.cwd) session.cwd = evt.cwd;
+      if (evt.gitBranch && !session.gitBranch) session.gitBranch = evt.gitBranch;
+      continue;
+    }
+    if (evt.type === 'user') {
+      const text = cleanUserText(extractMessageText(evt.message || evt));
+      if (isReplayableUserText(text, minPromptChars)) session.userMessages.push(text);
+      continue;
+    }
+    if (evt.type === 'assistant') {
+      const text = extractMessageText(evt.message || evt);
+      if (text) session.assistantMessages.push(text);
+      for (const call of harvestExtractToolCalls(evt.message || evt)) {
+        addToolCall(session, editedFiles, call.name, call.input || {}, ts);
+      }
+      continue;
+    }
+    if (evt.type === 'walle_part' && evt.partType === 'tool_call') {
+      const data = evt.data || {};
+      addToolCall(session, editedFiles, data.name || data.tool || 'tool', data.input || {}, ts);
+    }
+  }
+  finishSession(session, editedFiles, events.length);
+  return session;
+}
 function parseUnknownTranscript(jsonlPath, events, minPromptChars) {
   const session = baseSession(jsonlPath, 'unknown');
   const editedFiles = new Set();
@@ -434,7 +486,8 @@ function resolveRoots(roots, source) {
   if (roots) return Array.isArray(roots) ? roots : [roots];
   if (source === 'claude') return [DEFAULT_TRANSCRIPT_ROOTS.claude];
   if (source === 'codex') return [DEFAULT_TRANSCRIPT_ROOTS.codex];
-  return [DEFAULT_TRANSCRIPT_ROOTS.claude, DEFAULT_TRANSCRIPT_ROOTS.codex];
+  if (source === 'walle') return [DEFAULT_TRANSCRIPT_ROOTS.walle];
+  return [DEFAULT_TRANSCRIPT_ROOTS.claude, DEFAULT_TRANSCRIPT_ROOTS.codex, DEFAULT_TRANSCRIPT_ROOTS.walle];
 }
 module.exports = {

package/template/wall-e/eval/swebench-adapter.js CHANGED

@@ -15,6 +15,9 @@ const fs = require('fs');
 const path = require('path');
 const os = require('os');
 const https = require('https');
+const crypto = require('crypto');
+const { resolveModelName } = require('./agent-runner');
+const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
 const CACHE_DIR = path.join(os.homedir(), '.walle', 'swebench-cache');
 const DATASET_URL =
@@ -148,6 +151,7 @@ async function runSWEBenchTask(task, options = {}) {
     let agentResult = null;
     if (runAgentLoop) {
       try {
+        let timeoutHandle;
         agentResult = await Promise.race([
           runAgentLoop(mapped.prompt, {
             brain,
@@ -155,10 +159,12 @@ async function runSWEBenchTask(task, options = {}) {
             model,
             maxTurns: 30,
           }),
-          new Promise((_, reject) =>
-            setTimeout(() => reject(new Error('Agent timeout')), timeoutMs)
-          ),
+          new Promise((_, reject) => {
+            timeoutHandle = setTimeout(() => reject(new Error('Agent timeout')), timeoutMs);
+            if (typeof timeoutHandle.unref === 'function') timeoutHandle.unref();
+          }),
         ]);
+        if (timeoutHandle) clearTimeout(timeoutHandle);
       } catch (err) {
         return {
           taskId: mapped.id,
@@ -229,10 +235,110 @@ async function loadCuratedSubset() {
   return JSON.parse(fs.readFileSync(filePath, 'utf8'));
 }
+async function runSWEBenchSuite(options = {}) {
+  const {
+    brain,
+    runAgentLoop,
+    provider,
+    providerType,
+    model,
+    maxTasks,
+    signal,
+    timeoutMs,
+    runId: providedRunId,
+  } = options;
+  const allTasks = await loadCuratedSubset();
+  const tasks = maxTasks ? allTasks.slice(0, maxTasks) : allTasks;
+  const runId = providedRunId || crypto.randomUUID();
+  const results = [];
+  let totalPassed = 0;
+  for (const rawTask of tasks) {
+    if (signal?.aborted) break;
+    const mapped = mapTaskToPrompt(rawTask);
+    const result = await runSWEBenchTask(mapped, {
+      brain,
+      runAgentLoop,
+      provider,
+      model,
+      timeoutMs,
+    });
+    results.push(result);
+    if (result.success) totalPassed++;
+    if (brain && typeof brain.insertBenchmarkResult === 'function') {
+      try {
+        const scoringMethod = 'swebench-docker-tests';
+        brain.insertBenchmarkResult(decorateBenchmarkResult({
+          runId,
+          suite: 'swebench-lite',
+          promptId: mapped.id,
+          taskType: 'coding-agent',
+          difficulty: rawTask.difficulty || 'medium',
+          provider: providerType || 'unknown',
+          model: resolveModelName(model),
+          prompt: mapped.prompt,
+          response: result.testOutput || '',
+          traitScore: null,
+          compositeScore: result.score?.composite || 0,
+          latencyMs: result.elapsedMs || null,
+          error: result.error || null,
+          testsBefore: null,
+          testsAfter: result.success ? 1 : 0,
+          totalTests: result.error ? 0 : 1,
+          dimensionsJson: JSON.stringify(result.score?.dimensions || {}),
+          modelMetadataJson: JSON.stringify({
+            repo: mapped.repo,
+            baseCommit: mapped.baseCommit,
+            agentTurns: result.agentTurns || 0,
+            testOutput: result.testOutput || null,
+          }),
+          datasetVersion: 'swebench-lite:curated-30',
+          scorerVersion: DEFAULT_SCORER_VERSION,
+          scoringMethod,
+          trusted: !result.error,
+          runConfig: { maxTasks, timeoutMs, scoringMethod },
+        }, {
+          suite: 'swebench-lite',
+          benchmark: {
+            id: mapped.id,
+            prompt: mapped.prompt,
+            taskType: 'coding-agent',
+            difficulty: rawTask.difficulty || 'medium',
+            datasetVersion: 'swebench-lite:curated-30',
+          },
+          runId,
+          provider: providerType || 'unknown',
+          model: resolveModelName(model),
+          scorerVersion: DEFAULT_SCORER_VERSION,
+          scoringMethod,
+          trusted: !result.error,
+          runConfig: { maxTasks, timeoutMs, scoringMethod },
+        }));
+      } catch {}
+    }
+  }
+  return {
+    runId,
+    suite: 'swebench-lite',
+    model: resolveModelName(model),
+    totalTasks: tasks.length,
+    passed: totalPassed,
+    passAt1: tasks.length > 0 ? totalPassed / tasks.length : 0,
+    avgScore: results.reduce((s, r) => s + (r.score?.composite || 0), 0) / Math.max(results.length, 1),
+    totalCost: results.reduce((s, r) => s + (r.costDollars || 0), 0),
+    results,
+  };
+}
 module.exports = {
   downloadDataset,
   mapTaskToPrompt,
   runSWEBenchTask,
+  runSWEBenchSuite,
   loadCuratedSubset,
   CACHE_DIR,
 };