npm - create-walle - Versions diffs - 0.9.11 → 0.9.13 - Mend

create-walle 0.9.11 → 0.9.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (167)

package/README.md +3 -3
package/package.json +2 -2
package/template/bin/dev.sh +7 -1
package/template/bin/setup.js +53 -9
package/template/bin/sync-images.js +53 -0
package/template/builder-journal.md +17 -0
package/template/claude-task-manager/api-prompts.js +98 -13
package/template/claude-task-manager/api-reviews.js +82 -5
package/template/claude-task-manager/db.js +32 -5
package/template/claude-task-manager/docs/session-capture-foundation-design.md +1273 -0
package/template/claude-task-manager/lib/claude-desktop-sessions.js +696 -0
package/template/claude-task-manager/lib/coding-agent-models.js +49 -1
package/template/claude-task-manager/lib/session-capture.js +421 -0
package/template/claude-task-manager/lib/session-history.js +135 -15
package/template/claude-task-manager/lib/session-jobs.js +10 -5
package/template/claude-task-manager/lib/session-stream.js +87 -19
package/template/claude-task-manager/lib/setup-provider-config.js +115 -0
package/template/claude-task-manager/lib/walle-ctm-history.js +72 -0
package/template/claude-task-manager/lib/walle-session-context.js +61 -0
package/template/claude-task-manager/lib/walle-transcript.js +176 -0
package/template/claude-task-manager/public/css/setup.css +35 -8
package/template/claude-task-manager/public/css/walle-session.css +56 -0
package/template/claude-task-manager/public/css/walle.css +120 -0
package/template/claude-task-manager/public/index.html +814 -181
package/template/claude-task-manager/public/js/message-renderer.js +148 -19
package/template/claude-task-manager/public/js/reviews.js +120 -62
package/template/claude-task-manager/public/js/setup.js +75 -31
package/template/claude-task-manager/public/js/stream-view.js +115 -55
package/template/claude-task-manager/public/js/walle-session.js +84 -2
package/template/claude-task-manager/public/js/walle.js +308 -54
package/template/claude-task-manager/server.js +1092 -146
package/template/claude-task-manager/session-integrity.js +181 -54
package/template/claude-task-manager/session-utils.js +123 -41
package/template/claude-task-manager/workers/state-detectors/codex.js +5 -2
package/template/package.json +1 -1
package/template/wall-e/adapters/ctm.js +39 -18
package/template/wall-e/agent-runners/contract.js +17 -0
package/template/wall-e/agent-runners/index.js +22 -0
package/template/wall-e/agent-runtime/harness.js +212 -0
package/template/wall-e/agent-runtime/index.js +8 -0
package/template/wall-e/agent-runtime/registry.js +67 -0
package/template/wall-e/agent-runtime/session-store.js +179 -0
package/template/wall-e/agent-runtime/spawn.js +208 -0
package/template/wall-e/api-walle.js +174 -7
package/template/wall-e/brain.js +266 -28
package/template/wall-e/channels/policy.js +88 -0
package/template/wall-e/channels/registry.js +15 -1
package/template/wall-e/channels/reply-dispatcher.js +70 -0
package/template/wall-e/channels/session-bindings.js +51 -0
package/template/wall-e/chat/code-review-context.js +29 -0
package/template/wall-e/chat.js +188 -42
package/template/wall-e/coding/acp-adapter.js +188 -0
package/template/wall-e/coding/agent-catalog.js +129 -0
package/template/wall-e/coding/compaction-service.js +247 -0
package/template/wall-e/coding/execution-trace.js +3 -0
package/template/wall-e/coding/instruction-service.js +224 -0
package/template/wall-e/coding/model-message.js +67 -0
package/template/wall-e/coding/permission-rules-store.js +111 -0
package/template/wall-e/coding/permission-service.js +266 -0
package/template/wall-e/coding/prompt-bundle.js +67 -0
package/template/wall-e/coding/prompt-runtime.js +243 -0
package/template/wall-e/coding/provider-transform.js +188 -0
package/template/wall-e/coding/runtime-mode.js +132 -0
package/template/wall-e/coding/snapshot-service.js +155 -0
package/template/wall-e/coding/stream-processor.js +268 -0
package/template/wall-e/coding/task-tool.js +255 -0
package/template/wall-e/coding/tool-registry.js +361 -0
package/template/wall-e/coding/transcript-writer.js +143 -0
package/template/wall-e/coding/workspace-replay.js +324 -0
package/template/wall-e/coding-context.js +4 -22
package/template/wall-e/coding-orchestrator.js +307 -18
package/template/wall-e/coding-prompts.js +44 -3
package/template/wall-e/context/context-builder.js +43 -1
package/template/wall-e/context/topic-matcher.js +1 -1
package/template/wall-e/eval/agent-runner.js +59 -13
package/template/wall-e/eval/benchmarks/memory-retrieval.json +155 -57
package/template/wall-e/eval/benchmarks.js +100 -16
package/template/wall-e/eval/eval-orchestrator.js +218 -8
package/template/wall-e/eval/harvester.js +62 -5
package/template/wall-e/eval/head-to-head.js +23 -2
package/template/wall-e/eval/humaneval-adapter.js +30 -5
package/template/wall-e/eval/livecodebench-adapter.js +29 -5
package/template/wall-e/eval/manifest.js +186 -0
package/template/wall-e/eval/run-agent-benchmarks.js +66 -2
package/template/wall-e/eval/session-retrieval-benchmark.js +150 -0
package/template/wall-e/eval/session-transcripts.js +57 -4
package/template/wall-e/eval/swebench-adapter.js +109 -3
package/template/wall-e/evaluation/agent-router.js +53 -1
package/template/wall-e/evaluation/coding-quorum.js +48 -1
package/template/wall-e/evaluation/router.js +4 -2
package/template/wall-e/evaluation/tier-selector.js +11 -1
package/template/wall-e/extraction/contradiction.js +2 -2
package/template/wall-e/extraction/indexer.js +2 -1
package/template/wall-e/extraction/knowledge-extractor.js +2 -2
package/template/wall-e/hooks/cli.js +92 -0
package/template/wall-e/hooks/discovery.js +119 -0
package/template/wall-e/hooks/index.js +7 -0
package/template/wall-e/hooks/manifest.js +55 -0
package/template/wall-e/hooks/runtime.js +84 -0
package/template/wall-e/hooks/session-memory.js +225 -0
package/template/wall-e/http/auth.js +6 -2
package/template/wall-e/http/chat-api.js +54 -8
package/template/wall-e/integrations/claude-plugin/hooks/hooks.json +27 -0
package/template/wall-e/integrations/claude-plugin/hooks/walle-precompact-hook.sh +5 -0
package/template/wall-e/integrations/claude-plugin/hooks/walle-stop-hook.sh +5 -0
package/template/wall-e/integrations/codex-plugin/hooks/walle-hook.sh +7 -0
package/template/wall-e/integrations/codex-plugin/hooks.json +37 -0
package/template/wall-e/listening/calendar.js +3 -1
package/template/wall-e/llm/client.js +64 -10
package/template/wall-e/llm/google.js +39 -5
package/template/wall-e/llm/ollama.js +1 -1
package/template/wall-e/llm/ollama.plugin.json +1 -1
package/template/wall-e/llm/provider-availability.js +10 -0
package/template/wall-e/llm/provider-error.js +269 -0
package/template/wall-e/llm/tool-adapter.js +48 -12
package/template/wall-e/loops/boot.js +2 -1
package/template/wall-e/loops/initiative.js +2 -2
package/template/wall-e/loops/tasks.js +8 -47
package/template/wall-e/loops/workspace-prompts.js +20 -0
package/template/wall-e/mcp-server.js +442 -1
package/template/wall-e/memory/session-ingest-service.js +159 -0
package/template/wall-e/memory/source-indexer.js +289 -0
package/template/wall-e/plugins/discovery.js +83 -0
package/template/wall-e/plugins/manifest-loader.js +50 -10
package/template/wall-e/plugins/manifest-schema.js +69 -0
package/template/wall-e/plugins/model-catalog.js +55 -0
package/template/wall-e/prompts/coding/base.txt +2 -0
package/template/wall-e/prompts/coding/deepseek.txt +1 -0
package/template/wall-e/prompts/coding/memory-protocol.md +9 -0
package/template/wall-e/prompts/coding/plan.txt +1 -0
package/template/wall-e/runtime/execution-trace.js +220 -0
package/template/wall-e/security/audit.js +266 -0
package/template/wall-e/security/ssrf.js +236 -0
package/template/wall-e/session-files.js +303 -0
package/template/wall-e/skills/_bundled/slack-backfill/SKILL.md +3 -0
package/template/wall-e/skills/_bundled/slack-sync/SKILL.md +3 -0
package/template/wall-e/skills/internal-skill-registry.js +2 -2
package/template/wall-e/skills/script-skill-runner.js +143 -0
package/template/wall-e/skills/skill-executor.js +5 -6
package/template/wall-e/skills/skill-fallback.js +3 -1
package/template/wall-e/skills/skill-harness-registry.js +7 -8
package/template/wall-e/skills/skill-planner.js +52 -4
package/template/wall-e/skills/slack-ingest.js +11 -3
package/template/wall-e/sources/base.js +90 -0
package/template/wall-e/sources/builtin.js +33 -0
package/template/wall-e/sources/claude-code-jsonl.js +78 -0
package/template/wall-e/sources/codex-jsonl.js +125 -0
package/template/wall-e/sources/coding-session-utils.js +117 -0
package/template/wall-e/sources/contract-suite.js +59 -0
package/template/wall-e/sources/gemini-jsonl.js +85 -0
package/template/wall-e/sources/index.js +9 -0
package/template/wall-e/sources/jsonl-utils.js +181 -0
package/template/wall-e/sources/record-types.js +252 -0
package/template/wall-e/sources/registry.js +92 -0
package/template/wall-e/sources/transforms.js +100 -0
package/template/wall-e/sources/walle-jsonl.js +108 -0
package/template/wall-e/tools/coding-middleware.js +31 -1
package/template/wall-e/tools/file-tracker.js +25 -1
package/template/wall-e/tools/local-tools.js +75 -47
package/template/wall-e/tools/session-sharing.js +68 -1
package/template/wall-e/tools/shell-analyzer.js +1 -1
package/template/wall-e/tools/shell-policy.js +47 -0
package/template/wall-e/tools/snapshot.js +42 -0
package/template/wall-e/training/harvester.js +62 -5
package/template/wall-e/utils/repair.js +253 -1
package/template/website/index.html +3 -3
package/template/wall-e/skills/_bundled/slack-mentions/.watched-threads.json +0 -18

package/template/wall-e/eval/eval-orchestrator.js CHANGED

@@ -8,6 +8,7 @@ const { pLimit, getAvailableProviders } = require('./head-to-head');
 const { runAgentBenchmark, runMultiTurnBenchmark } = require('./agent-runner');
 const { createClient } = require('../llm/client');
 const { createAnthropicFromEnv } = require('../llm/anthropic');
+const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
 // ============================================================
 // Benchmark suite loader
@@ -22,14 +23,89 @@ const SUITE_FILES = {
   'chat-eval':    'chat-eval.json',
   'reasoning':    'reasoning.json',
   'memory-retrieval': 'memory-retrieval.json',
-  'humaneval-plus': null,    // handled by humaneval-adapter.js (no JSON file)
-  'livecodebench':  null,    // handled by livecodebench-adapter.js (no JSON file)
+};
+const ADAPTER_SUITE_METADATA = {
+  'humaneval-plus': {
+    name: 'humaneval-plus',
+    count: null,
+    taskTypes: ['coding'],
+    difficulties: ['easy', 'medium', 'hard'],
+    adapter: true,
+    description: 'EvalPlus HumanEval+ Python function-generation tasks',
+  },
+  livecodebench: {
+    name: 'livecodebench',
+    count: null,
+    taskTypes: ['coding'],
+    difficulties: ['easy', 'medium', 'hard'],
+    adapter: true,
+    description: 'LiveCodeBench code-generation tasks with date filtering',
+  },
+  'swebench-lite': {
+    name: 'swebench-lite',
+    count: 30,
+    taskTypes: ['coding-agent'],
+    difficulties: ['medium', 'hard'],
+    adapter: true,
+    description: 'Curated SWE-bench Lite issue-fixing tasks',
+  },
+};
+function isAdapterSuite(suiteName) {
+  return !!ADAPTER_SUITE_METADATA[suiteName];
+}
+function listAdapterSuites() {
+  return Object.values(ADAPTER_SUITE_METADATA).map((s) => ({ ...s }));
+}
+const DEFAULT_ADAPTER_RUNNERS = {
+  'humaneval-plus': async ({ brain, providerInfo, model, runId, maxTasks, signal }) => {
+    const { runHumanEvalSuite } = require('./humaneval-adapter');
+    return runHumanEvalSuite({
+      brain,
+      providerType: providerInfo.provider,
+      config: { apiKey: providerInfo.apiKey, baseUrl: providerInfo.baseUrl },
+      model,
+      runId,
+      maxTasks,
+      signal,
+    });
+  },
+  livecodebench: async ({ brain, providerInfo, model, runId, maxTasks, afterDate, signal }) => {
+    const { runLiveCodeBenchSuite } = require('./livecodebench-adapter');
+    return runLiveCodeBenchSuite({
+      brain,
+      providerType: providerInfo.provider,
+      config: { apiKey: providerInfo.apiKey, baseUrl: providerInfo.baseUrl },
+      model,
+      runId,
+      maxTasks,
+      afterDate,
+      signal,
+    });
+  },
+  'swebench-lite': async ({ brain, providerInfo, model, runId, maxTasks, signal, runAgentLoop, timeoutMs }) => {
+    const { runSWEBenchSuite } = require('./swebench-adapter');
+    return runSWEBenchSuite({
+      brain,
+      provider: providerInfo.client,
+      providerType: providerInfo.provider,
+      model,
+      runId,
+      maxTasks,
+      signal,
+      runAgentLoop,
+      timeoutMs,
+    });
+  },
 };
 function loadSuite(suiteName) {
+  if (isAdapterSuite(suiteName)) return [];
   if (!(suiteName in SUITE_FILES)) throw new Error(`Unknown suite: ${suiteName}`);
   const file = SUITE_FILES[suiteName];
-  if (!file) return []; // adapter-based suite (humaneval-plus, livecodebench) — loaded externally
   const filePath = path.join(BENCHMARKS_DIR, file);
   if (!fs.existsSync(filePath)) throw new Error(`Suite file not found: ${filePath}`);
   return JSON.parse(fs.readFileSync(filePath, 'utf8'));
@@ -83,6 +159,7 @@ class EvalOrchestrator extends EventEmitter {
     this.totalSpent = 0;
     this.running = false;
     this.aborted = false;
+    this.adapterRunners = { ...DEFAULT_ADAPTER_RUNNERS, ...(options.adapterRunners || {}) };
   }
   /**
@@ -94,12 +171,16 @@ class EvalOrchestrator extends EventEmitter {
    * @param {Function} params.runAgentLoop - Agent loop function for coding benchmarks
    * @returns {Promise<object>} Run summary
    */
-  async run({ suite, models, benchmarkIds, runAgentLoop }) {
+  async run({ suite, models, benchmarkIds, runAgentLoop, maxTasks, afterDate }) {
     if (this.running) throw new Error('Orchestrator is already running');
     this.running = true;
     this.aborted = false;
     try {
+      if (isAdapterSuite(suite)) {
+        return await this._runAdapterSuite({ suite, models, benchmarkIds, runAgentLoop, maxTasks, afterDate });
+      }
       // 1. Load benchmarks
       const benchmarks = suite === 'all' ? loadAllSuites() : loadSuite(suite);
       const filtered = benchmarkIds && benchmarkIds.length > 0
@@ -330,7 +411,7 @@ class EvalOrchestrator extends EventEmitter {
         } else {
           client = createClient(providerType, { apiKey, baseUrl });
         }
-        map[model] = { client, provider: providerType, registryId };
+        map[model] = { client, provider: providerType, registryId, apiKey, baseUrl };
       } catch (err) {
         this.emit('error', { benchmarkId: null, model, error: `Failed to create client: ${err.message}` });
       }
@@ -338,6 +419,102 @@ class EvalOrchestrator extends EventEmitter {
     return map;
   }
+  async _runAdapterSuite({ suite, models, benchmarkIds, runAgentLoop, maxTasks, afterDate }) {
+    const runner = this.adapterRunners[suite];
+    if (!runner) throw new Error(`No adapter runner configured for suite: ${suite}`);
+    if (!models || models.length === 0) {
+      const summary = { runId: this.runId, status: 'error', error: 'No models specified', models: {}, totalBenchmarks: 0, totalSpent: 0 };
+      this.emit('error', { benchmarkId: null, model: null, error: 'No models specified. Provide at least one model.' });
+      this.emit('run-complete', { runId: this.runId, summary });
+      return summary;
+    }
+    const providerMap = this._resolveProviders(models);
+    const adapterMaxTasks = maxTasks || (benchmarkIds && benchmarkIds.length ? benchmarkIds.length : undefined);
+    const limit = pLimit(this.concurrency);
+    const modelResults = {};
+    const orchestrator = this;
+    const adapterSignal = { get aborted() { return orchestrator.aborted; } };
+    const tasks = models.map((model) => limit(async () => {
+      const providerInfo = providerMap[model];
+      if (!providerInfo) {
+        this.emit('error', { benchmarkId: suite, model, error: `No provider found for model: ${model}` });
+        return null;
+      }
+      this.emit('benchmark-start', {
+        benchmarkId: suite,
+        model,
+        startedAt: new Date().toISOString(),
+      });
+      let result;
+      try {
+        result = await runner({
+          brain: this.brain,
+          providerInfo,
+          model,
+          runId: this.runId,
+          maxTasks: adapterMaxTasks,
+          afterDate,
+          signal: adapterSignal,
+          runAgentLoop,
+          timeoutMs: this.timeoutMs,
+        });
+      } catch (err) {
+        result = {
+          suite,
+          model,
+          totalTasks: 0,
+          avgScore: 0,
+          totalCost: 0,
+          error: err.message,
+          results: [],
+        };
+        this.emit('error', { benchmarkId: suite, model, error: err.message });
+      }
+      const totalCost = result.totalCost || 0;
+      if (!isLocalModel(providerInfo.provider)) {
+        this.spent[model] = (this.spent[model] || 0) + totalCost;
+        this.totalSpent += totalCost;
+      }
+      modelResults[model] = result;
+      this.emit('model-complete', {
+        model,
+        avgScore: Math.round((result.avgScore || 0) * 1000) / 1000,
+        totalCost: Math.round(totalCost * 1_000_000) / 1_000_000,
+        benchmarksRun: result.totalTasks || result.results?.length || 0,
+      });
+      return result;
+    }));
+    await Promise.all(tasks);
+    const summary = {
+      runId: this.runId,
+      status: this.aborted ? 'aborted' : 'complete',
+      totalBenchmarks: Object.values(modelResults).reduce((s, r) => s + (r.totalTasks || r.results?.length || 0), 0),
+      totalSpent: Math.round(this.totalSpent * 1_000_000) / 1_000_000,
+      models: {},
+      suite,
+      adapter: true,
+    };
+    for (const [model, result] of Object.entries(modelResults)) {
+      summary.models[model] = {
+        avgScore: Math.round((result.avgScore || 0) * 1000) / 1000,
+        totalCost: Math.round((result.totalCost || 0) * 1_000_000) / 1_000_000,
+        benchmarksRun: result.totalTasks || result.results?.length || 0,
+        errors: (result.results || []).filter(r => r.error).length + (result.error ? 1 : 0),
+      };
+    }
+    this.emit('run-complete', { runId: this.runId, summary });
+    return summary;
+  }
   /**
    * Get set of completed benchmark keys for this runId (for resume).
    */
@@ -363,7 +540,10 @@ class EvalOrchestrator extends EventEmitter {
     if (!this.brain || typeof this.brain.insertBenchmarkResult !== 'function') return;
     try {
-      this.brain.insertBenchmarkResult({
+      const scoringMethod = item.benchmark.agentExpectations?.testCommand
+        ? 'agent-rubric+tests'
+        : 'agent-rubric';
+      this.brain.insertBenchmarkResult(decorateBenchmarkResult({
         runId: this.runId,
         suite: item.benchmark._suite || 'coding-agent',
         promptId: item.benchmark.id,
@@ -384,7 +564,31 @@ class EvalOrchestrator extends EventEmitter {
         dimensionsJson: result.dimensionsJson || null,
         inputTokens: result.inputTokens ?? null,
         outputTokens: result.outputTokens ?? null,
-      });
+        scorerVersion: DEFAULT_SCORER_VERSION,
+        scoringMethod,
+        trusted: !result.error && result.testsPassed === true,
+        runConfig: {
+          timeoutMs: this.timeoutMs,
+          concurrency: this.concurrency,
+          budgetDollars: this.budgetDollars,
+          scoringMethod,
+        },
+      }, {
+        suite: item.benchmark._suite || 'coding-agent',
+        benchmark: item.benchmark,
+        runId: this.runId,
+        provider: item.provider.provider,
+        model: item.model,
+        scoringMethod,
+        scorerVersion: DEFAULT_SCORER_VERSION,
+        trusted: !result.error && result.testsPassed === true,
+        runConfig: {
+          timeoutMs: this.timeoutMs,
+          concurrency: this.concurrency,
+          budgetDollars: this.budgetDollars,
+          scoringMethod,
+        },
+      }));
     } catch { /* non-fatal */ }
   }
@@ -427,4 +631,10 @@ class EvalOrchestrator extends EventEmitter {
   }
 }
-module.exports = { EvalOrchestrator };
+module.exports = {
+  EvalOrchestrator,
+  ADAPTER_SUITE_METADATA,
+  isAdapterSuite,
+  listAdapterSuites,
+  loadSuite,
+};

package/template/wall-e/eval/harvester.js CHANGED

@@ -5,6 +5,17 @@ const path = require('path');
 const { createHash } = require('crypto');
 const { execFileSync } = require('child_process');
+let claudeDesktopSessions = null;
+function getClaudeDesktopSessions() {
+  if (claudeDesktopSessions) return claudeDesktopSessions;
+  try {
+    claudeDesktopSessions = require('../../claude-task-manager/lib/claude-desktop-sessions');
+  } catch {
+    return null;
+  }
+  return claudeDesktopSessions;
+}
 // --- Task type classification ---
 function classifyTaskType(content) {
@@ -86,6 +97,46 @@ async function harvestClaudeCodeSessions(since) {
   return samples;
 }
+// --- Claude Desktop Session Harvesting ---
+async function harvestClaudeDesktopSessions(since) {
+  const reader = getClaudeDesktopSessions();
+  if (!reader) return [];
+  const sessions = reader.listSessions();
+  const samples = [];
+  for (const session of sessions) {
+    if (since && session.updatedAt && session.updatedAt <= since) continue;
+    const messages = Array.isArray(session.messages) ? session.messages : [];
+    for (let i = 0; i < messages.length - 1; i++) {
+      const userMsg = messages[i];
+      const assistantMsg = messages[i + 1];
+      if (userMsg.role !== 'user' || assistantMsg.role !== 'assistant') continue;
+      const userContent = userMsg.text || '';
+      const assistantContent = assistantMsg.text || '';
+      if (!userContent || userContent.length < 20) continue;
+      if (!assistantContent || assistantContent.length < 20) continue;
+      samples.push({
+        id: contentHash('claude-desktop', `${session.uuid}:${i}:${userContent}`),
+        source: 'claude-desktop',
+        session_id: session.uuid,
+        timestamp: userMsg.timestamp || session.updatedAt || session.createdAt || new Date().toISOString(),
+        task_type: classifyTaskType(userContent),
+        prompt: userContent,
+        response: assistantContent,
+        tool_calls: [],
+        outcome: 'unknown',
+        outcome_signal: { git_committed: false, git_diff: null, task_status: null, user_corrected: false },
+        model: session.model || 'unknown',
+        quality_label: 0.5,
+      });
+    }
+  }
+  return samples;
+}
 // --- Codex Session Harvesting ---
 async function harvestCodexSessions(since) {
@@ -144,12 +195,13 @@ async function harvestCodexSessions(since) {
 // --- CTM Session Harvesting ---
-async function harvestCtmSessions(since) {
-  const dataDir = process.env.WALL_E_DATA_DIR || path.join(process.env.HOME, '.walle', 'data');
+async function harvestCtmSessions(since, dataDirOverride = null) {
+  const dataDir = dataDirOverride || process.env.WALL_E_DATA_DIR || path.join(process.env.HOME, '.walle', 'data');
   const ctmDbPath = path.join(dataDir, 'task-manager.db');
   if (!fs.existsSync(ctmDbPath)) return [];
-  const Database = require('better-sqlite3');
+  let Database;
+  try { Database = require('better-sqlite3'); } catch { return []; }
   let ctmDb;
   try {
     ctmDb = new Database(ctmDbPath, { readonly: true, fileMustExist: true });
@@ -534,10 +586,11 @@ async function runHarvest({ incremental = true, brain, dataDir } = {}) {
   // Harvest from each source
   const claudeSamples = await harvestClaudeCodeSessions(getSince('claude-code'));
+  const claudeDesktopSamples = await harvestClaudeDesktopSessions(getSince('claude-desktop'));
   const codexSamples = await harvestCodexSessions(getSince('codex'));
   const chatSamples = await harvestWalleChat(brain, getSince('walle-chat'));
   const taskSamples = await harvestWalleTasks(brain, getSince('walle-task'));
-  const ctmSamples = await harvestCtmSessions(getSince('ctm-sessions'));
+  const ctmSamples = await harvestCtmSessions(getSince('ctm-sessions'), dataDir);
   // Harvest coding agent sessions and store in brain
   const codingAgentSessions = await harvestCodingAgentSessions(getSince('coding-agent'));
@@ -549,7 +602,7 @@ async function runHarvest({ incremental = true, brain, dataDir } = {}) {
     if (typeof brain.insertCodingSession === 'function') brain.insertCodingSession(session);
   }
-  allSamples.push(...claudeSamples, ...codexSamples, ...chatSamples, ...taskSamples, ...ctmSamples);
+  allSamples.push(...claudeSamples, ...claudeDesktopSamples, ...codexSamples, ...chatSamples, ...taskSamples, ...ctmSamples);
   // Deduplicate by content hash
   const seen = new Set();
@@ -584,6 +637,9 @@ async function runHarvest({ incremental = true, brain, dataDir } = {}) {
     if (claudeSamples.length > 0) {
       brain.updateHarvestState('claude-code', { lastProcessedAt: now, totalHarvested: claudeSamples.length });
     }
+    if (claudeDesktopSamples.length > 0) {
+      brain.updateHarvestState('claude-desktop', { lastProcessedAt: now, totalHarvested: claudeDesktopSamples.length });
+    }
     if (codexSamples.length > 0) {
       brain.updateHarvestState('codex', { lastProcessedAt: now, totalHarvested: codexSamples.length });
     }
@@ -610,6 +666,7 @@ module.exports = {
   classifyTaskType,
   contentHash,
   harvestClaudeCodeSessions,
+  harvestClaudeDesktopSessions,
   harvestCodexSessions,
   harvestCtmSessions,
   harvestWalleChat,

package/template/wall-e/eval/head-to-head.js CHANGED

@@ -3,7 +3,7 @@
 const { randomUUID } = require('node:crypto');
 const { createClient } = require('../llm/client');
 const { heuristicScore } = require('./evaluator');
-const { scoreTraits } = require('./benchmarks');
+const { scoreTraitsDetailed, TRAIT_MATCHERS, UNSCORABLE_TRAITS } = require('./benchmarks');
 // ============================================================
 // Concurrency limiter (inline, no external dependency)
@@ -122,7 +122,28 @@ function getAvailableProviders(brain) {
 function traitScore(response, expectedTraits) {
   if (!expectedTraits || expectedTraits.length === 0) return null;
-  return scoreTraits(response, expectedTraits);
+  const text = String(response || '').toLowerCase();
+  let matched = 0;
+  let scored = 0;
+  const knownTraits = [];
+  for (const trait of expectedTraits) {
+    if (UNSCORABLE_TRAITS.has(trait)) continue;
+    if (TRAIT_MATCHERS[trait]) {
+      knownTraits.push(trait);
+      continue;
+    }
+    scored++;
+    if (text.includes(String(trait).toLowerCase())) matched++;
+  }
+  if (knownTraits.length > 0) {
+    const detail = scoreTraitsDetailed(response, knownTraits);
+    matched += detail.matched.length;
+    scored += detail.scoredCount;
+  }
+  return scored > 0 ? matched / scored : 0;
 }
 // ============================================================

package/template/wall-e/eval/humaneval-adapter.js CHANGED

@@ -6,6 +6,7 @@ const crypto = require('crypto');
 const { execFileSync } = require('child_process');
 const { createClient } = require('../llm/client');
 const { resolveModelName } = require('./agent-runner');
+const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
 const SUITE_NAME = 'humaneval-plus';
 const CACHE_DIR = path.join(os.homedir(), '.walle', 'eval-cache');
@@ -192,7 +193,8 @@ async function runHumanEvalTask(task, options = {}) {
     costDollars,
     response: response.slice(0, 2000),
     code: code.slice(0, 2000),
-    error: error || testError || null,
+    error: error || null,
+    testError: testError || null,
     usage,
   };
 }
@@ -222,13 +224,13 @@ function scoreHumanEvalQuality(code) {
  * @returns {Promise<object>} Suite results
  */
 async function runHumanEvalSuite(options = {}) {
-  const { brain, providerType, config, model, maxTasks, signal } = options;
+  const { brain, providerType, config, model, maxTasks, signal, runId: providedRunId } = options;
   const allTasks = await loadHumanEvalDataset();
   const tasks = maxTasks ? allTasks.slice(0, maxTasks) : allTasks;
   const client = createClient(providerType || 'anthropic', config || {});
-  const runId = crypto.randomUUID();
+  const runId = providedRunId || crypto.randomUUID();
   const results = [];
   let totalPassed = 0;
@@ -247,7 +249,8 @@ async function runHumanEvalSuite(options = {}) {
     // Store in brain
     if (brain && typeof brain.insertBenchmarkResult === 'function') {
       try {
-        brain.insertBenchmarkResult({
+        const scoringMethod = 'executable-tests';
+        brain.insertBenchmarkResult(decorateBenchmarkResult({
           runId,
           suite: SUITE_NAME,
           promptId: task.task_id,
@@ -266,7 +269,29 @@ async function runHumanEvalSuite(options = {}) {
           testsAfter: result.passed ? 1 : 0,
           totalTests: 1,
           dimensionsJson: JSON.stringify(result.score.dimensions),
-        });
+          modelMetadataJson: JSON.stringify({ testError: result.testError || null }),
+          datasetVersion: 'humaneval-plus:evalplus-master',
+          scorerVersion: DEFAULT_SCORER_VERSION,
+          scoringMethod,
+          trusted: !result.error,
+          runConfig: { maxTasks, scoringMethod },
+        }, {
+          suite: SUITE_NAME,
+          benchmark: {
+            id: task.task_id,
+            prompt: task.prompt,
+            taskType: 'coding',
+            difficulty: taskDifficulty(task.task_id),
+            datasetVersion: 'humaneval-plus:evalplus-master',
+          },
+          runId,
+          provider: providerType || 'unknown',
+          model: resolveModelName(model),
+          scorerVersion: DEFAULT_SCORER_VERSION,
+          scoringMethod,
+          trusted: !result.error,
+          runConfig: { maxTasks, scoringMethod },
+        }));
       } catch {}
     }
   }

package/template/wall-e/eval/livecodebench-adapter.js CHANGED

@@ -7,6 +7,7 @@ const crypto = require('crypto');
 const { execFileSync } = require('child_process');
 const { createClient } = require('../llm/client');
 const { resolveModelName } = require('./agent-runner');
+const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
 const SUITE_NAME = 'livecodebench';
 const CACHE_DIR = path.join(os.homedir(), '.walle', 'eval-cache');
@@ -187,7 +188,7 @@ async function runLiveCodeBenchTask(task, options = {}) {
  * Run the full LiveCodeBench suite.
  */
 async function runLiveCodeBenchSuite(options = {}) {
-  const { brain, providerType, config, model, maxTasks, signal, afterDate } = options;
+  const { brain, providerType, config, model, maxTasks, signal, afterDate, runId: providedRunId } = options;
   let allTasks = await loadLiveCodeBenchDataset();
@@ -201,7 +202,7 @@ async function runLiveCodeBenchSuite(options = {}) {
   const tasks = maxTasks ? allTasks.slice(0, maxTasks) : allTasks;
   const client = createClient(providerType || 'anthropic', config || {});
-  const runId = crypto.randomUUID();
+  const runId = providedRunId || crypto.randomUUID();
   const results = [];
   let totalPassed = 0;
@@ -219,7 +220,9 @@ async function runLiveCodeBenchSuite(options = {}) {
     // Store in brain
     if (brain && typeof brain.insertBenchmarkResult === 'function') {
       try {
-        brain.insertBenchmarkResult({
+        const scoringMethod = 'executable-tests';
+        const prompt = task.question_content || task.prompt || task.description || '';
+        brain.insertBenchmarkResult(decorateBenchmarkResult({
           runId,
           suite: SUITE_NAME,
           promptId: result.taskId,
@@ -227,7 +230,7 @@ async function runLiveCodeBenchSuite(options = {}) {
           difficulty: taskDifficulty(task),
           provider: providerType || 'unknown',
           model: resolveModelName(model),
-          prompt: (task.question_content || '').slice(0, 2000),
+          prompt: prompt.slice(0, 2000),
           response: result.response || '',
           traitScore: null,
           compositeScore: result.score.composite,
@@ -238,7 +241,28 @@ async function runLiveCodeBenchSuite(options = {}) {
           testsAfter: result.passedCases || 0,
           totalTests: result.totalCases || 0,
           dimensionsJson: JSON.stringify(result.score.dimensions),
-        });
+          datasetVersion: 'livecodebench:release_v6',
+          scorerVersion: DEFAULT_SCORER_VERSION,
+          scoringMethod,
+          trusted: !result.error,
+          runConfig: { maxTasks, afterDate, scoringMethod },
+        }, {
+          suite: SUITE_NAME,
+          benchmark: {
+            id: result.taskId,
+            prompt,
+            taskType: 'coding',
+            difficulty: taskDifficulty(task),
+            datasetVersion: 'livecodebench:release_v6',
+          },
+          runId,
+          provider: providerType || 'unknown',
+          model: resolveModelName(model),
+          scorerVersion: DEFAULT_SCORER_VERSION,
+          scoringMethod,
+          trusted: !result.error,
+          runConfig: { maxTasks, afterDate, scoringMethod },
+        }));
       } catch {}
     }
   }