npm - create-walle - Versions diffs - 0.9.21 → 0.9.23 - Mend

create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (500) hide show

package/template/wall-e/eval/benchmarks.js DELETED Viewed

@@ -1,669 +0,0 @@
-'use strict';
-const fs = require('fs');
-const path = require('path');
-const crypto = require('crypto');
-const { createClient } = require('../llm/client');
-const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
-// ============================================================
-// Constants
-// ============================================================
-const BENCHMARKS_DIR = path.join(__dirname, 'benchmarks'); // directory next to this module that holds the <suite>.json files
-const VALID_TASK_TYPES = ['coding', 'chat', 'reasoning', 'memory-retrieval', 'coding-agent']; // accepted values for an entry's taskType
-const VALID_DIFFICULTIES = ['easy', 'medium', 'hard']; // accepted values for an entry's difficulty
-const DEFAULT_TIMEOUT_MS = 60_000; // per-prompt timeout used by runBenchmark when timeoutMs is omitted
-// ============================================================
-// Trait scoring — regex/heuristic checks
-// ============================================================
-/**
- * Mapping from trait name to a predicate that tests a response for that trait.
- * Each predicate receives the response text and returns true if the response
- * exhibits the trait. All checks are regex/heuristic: they look for surface
- * signals in the text and do not verify that the response is actually correct.
- *
- * Entries that always return true are placeholders for traits that cannot be
- * checked heuristically; they are listed in UNSCORABLE_TRAITS, so scoreTrait
- * and scoreTraitsDetailed never consult them.
- */
-const TRAIT_MATCHERS = {
-  // --- Coding traits ---
-  'has code block':           (r) => /```[\s\S]*?```/.test(r),
-  'defines function':         (r) => /\b(function\s+\w+|const\s+\w+\s*=\s*(async\s+)?\(|def\s+\w+\s*\()/i.test(r),
-  'uses setTimeout':          (r) => /setTimeout/i.test(r),
-  'uses clearTimeout':        (r) => /clearTimeout/i.test(r),
-  'uses csv module':          (r) => /\bcsv\b/i.test(r),
-  'returns list':             (r) => /\breturn\b.*\[|list|List/i.test(r),
-  'handles headers':          (r) => /header|column|field/i.test(r),
-  'uses async/await':         (r) => /async\s+|await\s+/i.test(r),
-  'has retry logic':          (r) => /retry|retries|attempt/i.test(r),
-  'has backoff':              (r) => /backoff|exponential|delay\s*\*|Math\.pow/i.test(r),
-  'uses dynamic programming': (r) => /\bdp\b|dynamic\s+program|table|matrix|memo/i.test(r),
-  'uses binary search':       (r) => /binary\s+search|low.*high|mid|left.*right/i.test(r),
-  'handles not found':        (r) => /-1|not\s+found|None/i.test(r),
-  'returns number':           (r) => /return\s+\d|return\s+\w+\s*(;|\n)/i.test(r),
-  'identifies null check missing': (r) => /null|undefined|optional\s+chaining|\?\.|guard|check/i.test(r),
-  'adds optional chaining or guard': (r) => /\?\.|&&|if\s*\(|guard|check/i.test(r),
-  'checks response status':   (r) => /response\.(ok|status)|\.ok\b|status\s*(===|!==|==|!=)/i.test(r),
-  'identifies missing extend/concatenation': (r) => /extend|concat|\+\s*=|result\s*\+|\.extend|\.concat/i.test(r),
-  'fixes recursive call':     (r) => /result\s*(=|\+=|\.extend|\.concat)|return.*flatten/i.test(r),
-  'explains the bug':         (r) => /bug|issue|problem|because|result.*lost|not.*captured|discard/i.test(r),
-  'uses Map or linked list':  (r) => /\bMap\b|linked\s*list|doubly/i.test(r),
-  'implements eviction':      (r) => /evict|delete|remove.*oldest|remove.*least/i.test(r),
-  // Requires a reader (get) AND a writer (put or set); a single alternation
-  // would pass on a response that only ever calls get().
-  'has get and put methods':  (r) => /\bget\s*\(/i.test(r) && /\b(?:put|set)\s*\(/i.test(r),
-  'uses decorator pattern':   (r) => /def\s+\w+\s*\(\s*func|@\w+|wrapper|wraps/i.test(r),
-  'tracks timestamps':        (r) => /time|timestamp|datetime|clock/i.test(r),
-  'raises exception':         (r) => /raise|throw|Error|Exception/i.test(r),
-  'handles arrays':           (r) => /Array\.isArray|instanceof\s+Array|\barray\b.*concat|\.concat/i.test(r),
-  'handles nested objects':   (r) => /typeof.*object|recursive|nested|deep/i.test(r),
-  'handles primitives':       (r) => /string|number|boolean|primitive|typeof/i.test(r),
-  'separates validation':     (r) => /validat|middleware|schema|joi|zod|check.*input/i.test(r),
-  'extracts service layer':   (r) => /service|layer|separate.*concern|module|extract/i.test(r),
-  'adds try-catch':           (r) => /try\s*\{|catch\s*\(|error\s*handl/i.test(r),
-  'improves structure':       (r) => /refactor|separate|extract|clean|modular/i.test(r),
-  // All four of on/off/once/emit must appear; matching any one of them is not
-  // "all four".
-  'implements all four methods': (r) => [
-    /\bon\s*\(.*\)/i,
-    /\boff\s*\(.*\)/i,
-    /\bonce\s*\(.*\)/i,
-    /\bemit\s*\(.*\)/i,
-  ].every((methodPattern) => methodPattern.test(r)),
-  'handles once correctly':   (r) => /once|removeListener|off.*after|single\s*invocation/i.test(r),
-  'mentions max listeners or cleanup': (r) => /max.*listener|cleanup|leak|removeAll|memory/i.test(r),
-  'has concurrency limit':    (r) => /concurren|parallel.*limit|max.*running|semaphore/i.test(r),
-  'has priority support':     (r) => /priorit|queue.*sort|high.*low|urgency/i.test(r),
-  'has cancellation':         (r) => /cancel|abort|remove.*pending|clear.*queue/i.test(r),
-  'has jsdoc':                (r) => /\/\*\*[\s\S]*?\*\/|@param|@returns/i.test(r),
-  'uses context manager':     (r) => /__enter__|__exit__|with\s+|contextmanager/i.test(r),
-  'handles rollback':         (r) => /rollback|ROLLBACK/i.test(r),
-  'supports nesting':         (r) => /savepoint|nested|SAVEPOINT/i.test(r),
-  'mentions heap snapshots':  (r) => /heap\s*snapshot|--inspect|heapdump|v8/i.test(r),
-  'identifies unbounded cache': (r) => /unbounded|grow.*without|no.*eviction|plain\s*object.*cache/i.test(r),
-  'suggests WeakMap or LRU':  (r) => /WeakMap|LRU|lru-cache|bounded/i.test(r),
-  'mentions event listener leaks': (r) => /event\s*listener.*leak|removeListener|off\(|maxListeners/i.test(r),
-  'handles empty strings':        (r) => /empty\s*string|''|""|\.length\s*===?\s*0|!str/i.test(r),
-  // --- Chat traits ---
-  'greeting':                 (r) => /\b(hi|hello|hey|welcome|greetings)\b/i.test(r),
-  'lists capabilities':       (r) => /can\s+help|able\s+to|assist.*with|I\s+can/i.test(r),
-  'friendly tone':            (r) => /glad|happy|great|wonderful|!\s|welcome/i.test(r),
-  'invites follow-up':        (r) => /feel\s+free|let\s+me\s+know|any.*question|don't\s+hesitate|anything\s+else/i.test(r),
-  'empathetic tone':          (r) => /understand|sounds.*tough|sorry\s+to\s+hear|that.*challenging|must\s+be/i.test(r),
-  'actionable advice':        (r) => /try\s|consider|suggest|recommend|step|could\s|should\s|here'?s?\s*(what|how)/i.test(r),
-  'suggests prioritization':  (r) => /prioriti|important.*first|urgent|rank|triage|eisenhower/i.test(r),
-  'concise':                  (r) => r.length < 2000,
-  'uses analogy or simple language': (r) => /like\s+a|think\s+of\s+it|imagine|analogy|simply\s+put|in\s+other\s+words/i.test(r),
-  'mentions HTTP methods':    (r) => /GET|POST|PUT|DELETE|PATCH|HTTP\s+method/i.test(r),
-  'accurate':                 (_r) => true, // placeholder: accuracy can't be verified heuristically (unscorable)
-  'clear distinction':        (r) => /difference|while|whereas|on\s+the\s+other\s+hand|in\s+contrast|unlike/i.test(r),
-  'uses examples':            (r) => /for\s+example|such\s+as|e\.g\.|instance|like\s+\w+/i.test(r),
-  'suggests ownership':       (r) => /own\s+up|take\s+responsib|acknowledge|admit|transparent/i.test(r),
-  'constructive framing':     (r) => /learn|grow|opportunity|moving\s+forward|improve|next\s+time/i.test(r),
-  'recommends specific language': (r) => /JavaScript|TypeScript|Python|HTML|CSS/i.test(r),
-  'explains reasoning':       (r) => /because|reason|since|this\s+is\s+why|due\s+to/i.test(r),
-  'mentions learning resources': (r) => /tutorial|course|documentation|freeCodeCamp|MDN|book|resource|Udemy|YouTube/i.test(r),
-  'acknowledges thanks':      (r) => /you're\s+welcome|glad|happy\s+to|my\s+pleasure|no\s+problem|anytime/i.test(r),
-  'acknowledges disagreement': (r) => /valid\s+point|understand.*perspective|fair\s+point|you're\s+right|good\s+point|that\s+makes\s+sense/i.test(r),
-  'validates their point':    (r) => /valid|good\s+point|makes\s+sense|right|agree|fair/i.test(r),
-  'non-defensive':            (r) => !/wrong|incorrect|actually\s+no|you\s+should\s+have/i.test(r),
-  'constructive':             (r) => /consider|suggest|might|could|option|alternative/i.test(r),
-  'lists pros and cons':      (r) => /pro|con|advantage|disadvantage|benefit|drawback|\+\s|✓|✗|-\s/i.test(r),
-  'covers both sides':        (r) => /remote.*office|office.*remote|both|on\s+the\s+other|however/i.test(r),
-  'uses structure':           (r) => /^(\s*[-*]\s|\s*\d+[\.\)]\s|#{1,3}\s)/m.test(r),
-  'celebrates achievement':   (r) => /congrat|awesome|amazing|fantastic|great\s+job|well\s+done|exciting/i.test(r),
-  'enthusiastic tone':        (r) => /!|exciting|love|fantastic|wonderful|awesome|amazing/i.test(r),
-  'encouraging':              (r) => /keep\s+going|great\s+start|proud|milestone|first\s+of\s+many/i.test(r),
-  // --- Reasoning traits ---
-  'multiple options considered': (r) => /option|alternative|approach|choice|comparison|versus|vs\./i.test(r),
-  'pros and cons':            (r) => /pro|con|advantage|disadvantage|trade-?off|benefit|drawback/i.test(r),
-  'numbered steps':           (r) => /^\s*\d+[\.\)]\s/m.test(r),
-  'step-by-step':             (r) => /step\s+\d|first.*then.*finally|step-by-step|^\s*\d+[\.\)]/mi.test(r),
-  'conclusion':               (r) => /recommend|conclusion|therefore|in\s+summary|overall|my\s+suggestion|I('d|\s+would)\s+(go|choose|recommend|suggest)/i.test(r),
-  'considers use case fit':   (r) => /use\s+case|depends\s+on|your\s+scenario|for\s+your|requirements/i.test(r),
-  'considers team size':      (r) => /team\s+size|small\s+team|5\s+developer|developer|staffing/i.test(r),
-  'correct solution':         (_r) => true, // placeholder: a heuristic can't verify the solution (unscorable)
-  'correct conclusion':       (r) => /conclude|conclusion|therefore|so\s+the\s+answer|result\s+is/i.test(r),
-  'explains why':             (r) => /because|reason|since|this\s+is\s+why|due\s+to|explains/i.test(r),
-  'explains constraints':     (r) => /constraint|rule|cannot|must\s+not|only\s+holds/i.test(r),
-  'checks logs first':        (r) => /log|logging|check.*log|grep.*log|tail/i.test(r),
-  'considers deployment changes': (r) => /deploy|rollback|diff|last\s+week|recent\s+change|what\s+changed/i.test(r),
-  'systematic approach':      (r) => /systematic|methodical|step.*step|first.*then|diagnos/i.test(r),
-  'mentions monitoring':      (r) => /monitor|alert|metric|dashboard|grafana|datadog|observ/i.test(r),
-  'identifies logical fallacy': (r) => /fallacy|cannot\s+conclude|does\s+not\s+follow|invalid|not\s+necessarily/i.test(r),
-  'phased approach':          (r) => /phase\s+\d|stage\s+\d|first\s+phase|phase\s+1|incremental/i.test(r),
-  'considers risk':           (r) => /risk|careful|fallback|rollback|gradual|safety/i.test(r),
-  'mentions strangler pattern': (r) => /strangler|strangler\s+fig|facade|proxy.*route|incremental.*migrat/i.test(r),
-  'realistic timeline':       (r) => /week|month|sprint|timeline|3\s+months|quarter/i.test(r),
-  'considers age and risk':   (r) => /age|30|risk\s+tolerance|time\s+horizon|young|long.term/i.test(r),
-  'specific allocation':      (r) => /\d+\s*%|percent|allocation|split|ratio/i.test(r),
-  'uses heat trick':          (r) => /heat|warm|hot|temperature|touch|feel/i.test(r),
-  'explains logic':           (r) => /because|therefore|since|so\s+we\s+know|this\s+means/i.test(r),
-  'considers scale':          (r) => /scale|50K|growth|users|traffic/i.test(r),
-  'shows calculation':        (r) => /\d+\s*[\+\-\*\/\%]\s*\d+|=\s*\d+|P\s*\(|probability/i.test(r),
-  'uses inclusion-exclusion': (r) => /inclusion.exclusion|union|P\(A\s*(∪|or|OR|\|).*B\)|60\s*\+\s*50\s*-\s*30|80/i.test(r),
-  'correct answer':           (r) => /20\s*%|0\.2|20\s+percent/i.test(r),
-  // --- Memory-retrieval traits ---
-  'references context':       (_r) => true, // placeholder: the real check is the specific facts (unscorable)
-  'mentions Rust':            (r) => /\bRust\b/i.test(r),
-  'mentions Phoenix':         (r) => /\bPhoenix\b/i.test(r),
-  'accurate extraction':      (_r) => true, // placeholder: can't auto-verify (unscorable)
-  'mentions Thursday meeting': (r) => /Thursday|Thurs/i.test(r),
-  'mentions Friday report':   (r) => /Friday|Fri/i.test(r),
-  'mentions FastAPI':         (r) => /FastAPI/i.test(r),
-  'mentions Python':          (r) => /\bPython\b/i.test(r),
-  'not hallucinated':         (_r) => true, // placeholder: can't auto-detect hallucination (unscorable)
-  'recommends Knex or raw SQL': (r) => /Knex|raw\s+SQL|query\s+builder/i.test(r),
-  'mentions ORM preference':  (r) => /ORM|ActiveRecord|prefer|dislike/i.test(r),
-  'respects user preference': (r) => /prefer|based\s+on|you\s+(mentioned|said|noted)|previous/i.test(r),
-  'mentions on-premises requirement': (r) => /on.prem|no.cloud|self.host|local|sensitive\s+data/i.test(r),
-  'suggests self-hosted options': (r) => /ELK|Elasticsearch|Loki|Grafana|Graylog|Fluentd|self.host/i.test(r),
-  'mentions OrderService':    (r) => /OrderService/i.test(r),
-  'mentions port 3002':       (r) => /3002/.test(r),
-  'mentions AuthService for SLA': (r) => /AuthService.*(?:critical|SLA|uptime|highest)|(?:critical|SLA|uptime|highest).*AuthService/i.test(r),
-  'mentions write-through':   (r) => /write.through/i.test(r),
-  'explains rejected approaches': (r) => /cache\s+miss|lost\s+on\s+restart|TTL|in.memory/i.test(r),
-  'acknowledges missing info': (r) => /not\s+mention|no\s+information|haven't\s+shared|don't\s+have|not\s+specified|unclear/i.test(r),
-  'does not hallucinate':     (_r) => true, // placeholder (unscorable)
-  'asks for clarification':   (r) => /could\s+you|what.*use|which.*tool|tell\s+me|please\s+share|can\s+you\s+share|\?/i.test(r),
-  'correct branch name format': (r) => /feat\/PROJ-456/i.test(r),
-  'mentions conventional commits': (r) => /conventional\s+commit|feat:|fix:|chore:/i.test(r),
-  'mentions no force push to main': (r) => /force\s+push|--force|never.*push.*main|no.*force/i.test(r),
-  'lists all five preferences': (r) => /dark\s+mode|timezone|America\/Los_Angeles|English|daily\s+digest|verbose/i.test(r),
-  'searches session memory':  (r) => /session|memory|remember|transcript|source|found/i.test(r),
-  'mentions parser.js':       (r) => /parser\.js/i.test(r),
-  'mentions node --test':     (r) => /node\s+--test/i.test(r),
-  'cites session id':         (r) => /(?:session|source)[\s_-]?id|codex:sanitized|claude:sanitized|walle:sanitized|sanitized-[\w-]+/i.test(r),
-  'mentions lock contention': (r) => /lock\s+contention/i.test(r),
-  'mentions queue-worker.js': (r) => /queue-worker\.js/i.test(r),
-  'mentions codex-blank-space.spec.js': (r) => /codex-blank-space\.spec\.js/i.test(r),
-  'mentions blank gap':       (r) => /blank[-\s]?gap/i.test(r),
-  'searches diary':           (r) => /diary|agent diary|remember|memory|source/i.test(r),
-  'mentions router inputs':   (r) => /router\s+inputs|routing.*inputs/i.test(r),
-  'mentions evaluation':      (r) => /evaluation|eval|trusted\s+evaluation/i.test(r),
-  'cites diary/session id':   (r) => /diary|session[\s_-]?id|sanitized-quorum|source[\s_-]?id/i.test(r),
-  'mentions gemini-jsonl':    (r) => /gemini-jsonl/i.test(r),
-  'mentions pii_potential':   (r) => /pii_potential/i.test(r),
-  'says do not replace SQLite': (r) => /do\s+not\s+(?:adopt|replace|use).*SQLite|keep\s+SQLite|SQLite.*not\s+replace/i.test(r),
-  'mentions sqlite-vec':      (r) => /sqlite-vec/i.test(r),
-  'does not hallucinate approval': (r) => /do\s+not\s+(?:adopt|replace)|no\s+approval|not\s+approved|rejected|keep\s+SQLite/i.test(r),
-  'uses Wall-E memory':       (r) => /Wall-?E|memory|remember|source|retriev|context/i.test(r),
-  'mentions direct':          (r) => /\bdirect\b|concise|straightforward/i.test(r),
-  'mentions evidence':        (r) => /evidence|cite|source|verified|proof/i.test(r),
-  'mentions thorough verification': (r) => /thorough\s+verification|verify|validated|test|evidence/i.test(r),
-  'does not search public web first': (r) => /before\s+public\s+web|not\s+(?:search|use).*public|Wall-?E.*first|memory.*first/i.test(r),
-  'mentions colleague context': (r) => /colleague|work\s+context|planning|team\s+strategy|prioriti[sz]ation/i.test(r),
-  'cites memory evidence':    (r) => /source|source[_\s-]?id|memory|evidence|person:sanitized|sanitized-casey/i.test(r),
-  // --- Coding-agent traits ---
-  'uses edit over write':       (r) => /edit_file|apply_patch|multi_edit/i.test(r) && !/write_file/i.test(r),
-  'reads before writing':       (r) => {
-    // Ordering is judged by the first occurrence of each tool name in the text.
-    const readIdx = r.search(/read_file|glob|grep_files/i);
-    const writeIdx = r.search(/write_file|edit_file|apply_patch/i);
-    return readIdx >= 0 && writeIdx >= 0 && readIdx < writeIdx;
-  },
-  'runs tests after changes':   (r) => {
-    const editIdx = r.search(/edit_file|write_file|apply_patch/i);
-    const testIdx = r.search(/npm test|pytest|run_shell.*test/i);
-    return editIdx >= 0 && testIdx >= 0 && editIdx < testIdx;
-  },
-  'uses LSP diagnostics':       (r) => /lsp_diagnostics|lsp_symbols/i.test(r),
-  'plans before executing':     (r) => {
-    const planIdx = r.search(/update_todos|plan|step\s*1/i);
-    const execIdx = r.search(/edit_file|write_file/i);
-    return planIdx >= 0 && execIdx >= 0 && planIdx < execIdx;
-  },
-  'efficient tool use':         (r) => {
-    // At least one tool call, but no more than 15 in total.
-    const toolCalls = (r.match(/\b(read_file|write_file|edit_file|run_shell|glob|grep_files)\b/gi) || []);
-    return toolCalls.length > 0 && toolCalls.length <= 15;
-  },
-  'creates todos':              (r) => /update_todos/i.test(r),
-  'uses glob for discovery':    (r) => /glob/i.test(r),
-  'uses grep for search':       (r) => /grep_files/i.test(r),
-  'handles errors gracefully':  (r) => /error|catch|try|failed|retry/i.test(r),
-  'multi-file coordination':    (r) => {
-    // Collect the distinct quoted paths that follow a file tool name; two or
-    // more distinct paths count as multi-file work.
-    const files = new Set((r.match(/(?:read_file|edit_file|write_file).*?['"]([^'"]+)['"]/gi) || []).map(m => m.match(/['"]([^'"]+)['"]/)?.[1]).filter(Boolean));
-    return files.size >= 2;
-  },
-  // --- Tool diversity traits (coding-agent) ---
-  'uses lsp tools':             (r) => /lsp_references|lsp_definition|lsp_diagnostics|lsp_symbols|lsp_hover|lsp_implementation/i.test(r),
-  'uses grep before edit':      (r) => {
-    const grepIdx = r.search(/grep_files|Grep/i);
-    const editIdx = r.search(/edit_file|Edit/i);
-    return grepIdx >= 0 && editIdx >= 0 && grepIdx < editIdx;
-  },
-  'minimal file writes':        (r) => {
-    const writes = (r.match(/\b(write_file|Write)\b/gi) || []).length;
-    const edits = (r.match(/\b(edit_file|Edit)\b/gi) || []).length;
-    // Prefer edits over writes; penalize if writes > edits
-    return edits > 0 && writes <= edits;
-  },
-  'uses search before write':   (r) => {
-    const searchIdx = r.search(/grep_files|glob|Grep|Glob/i);
-    const writeIdx = r.search(/write_file|edit_file|Write|Edit/i);
-    return searchIdx >= 0 && writeIdx >= 0 && searchIdx < writeIdx;
-  },
-  'asks clarifying questions':  (r) => /ask_user|AskUserQuestion/i.test(r),
-};
-const UNSCORABLE_TRAITS = new Set([ // traits excluded from automatic scoring: scoreTrait returns false for them and scoreTraitsDetailed reports them under `unscored`
-  'accurate',
-  'correct solution',
-  'references context',
-  'accurate extraction',
-  'not hallucinated',
-  'does not hallucinate',
-]);
-// ============================================================
-// Suite loading
-// ============================================================
-// Suites with a non-standard schema that needs a dedicated loader rather
-// than the generic validator below (e.g. SWE-bench files use the upstream
-// Princeton/SWE-bench schema with `instance_id` instead of `id`).
-const ALT_SCHEMA_SUITE_PREFIXES = ['swebench-', 'swebench_'];
-/**
- * List available benchmark suite names.
- *
- * A suite is any `<name>.json` file directly inside BENCHMARKS_DIR whose name
- * does not start with one of ALT_SCHEMA_SUITE_PREFIXES. Returns an empty array
- * when the directory does not exist.
- * @returns {string[]} Suite names without the `.json` extension
- */
-function listBenchmarkSuites() {
-  if (!fs.existsSync(BENCHMARKS_DIR)) return [];
-  const suiteNames = [];
-  for (const fileName of fs.readdirSync(BENCHMARKS_DIR)) {
-    if (!fileName.endsWith('.json')) continue;
-    const suiteName = fileName.slice(0, -'.json'.length);
-    const usesAltSchema = ALT_SCHEMA_SUITE_PREFIXES.some((prefix) => suiteName.startsWith(prefix));
-    if (!usesAltSchema) suiteNames.push(suiteName);
-  }
-  return suiteNames;
-}
-/**
- * Load and validate a benchmark suite by name.
- *
- * Reads `<suiteName>.json` from BENCHMARKS_DIR, drops in-file metadata
- * entries, and validates every remaining entry (id, prompt, taskType,
- * difficulty, at least one scoring signal, and known trait names).
- *
- * @param {string} suiteName - e.g. 'coding', 'chat'
- * @returns {{ name: string, prompts: object[] }}
- * @throws {Error} If the file is missing, is not valid JSON, is not a
- *   non-empty array, contains no benchmark entries once metadata is removed,
- *   or any entry fails validation.
- */
-function loadBenchmarkSuite(suiteName) {
-  const filePath = path.join(BENCHMARKS_DIR, `${suiteName}.json`);
-  if (!fs.existsSync(filePath)) {
-    throw new Error(`Benchmark suite not found: ${suiteName} (looked in ${filePath})`);
-  }
-  const raw = fs.readFileSync(filePath, 'utf-8');
-  let parsed;
-  try {
-    parsed = JSON.parse(raw);
-  } catch (e) {
-    throw new Error(`Invalid JSON in benchmark suite ${suiteName}: ${e.message}`);
-  }
-  if (!Array.isArray(parsed) || parsed.length === 0) {
-    throw new Error(`Benchmark suite ${suiteName} must be a non-empty array`);
-  }
-  // Skip in-file metadata entries (e.g. {"_comment": "Section A: ..."} markers
-  // used to annotate sections of the JSON). They are not benchmarks.
-  const prompts = parsed.filter((entry) => entry && typeof entry === 'object' && entry._comment === undefined);
-  // The non-empty check above ran before filtering, so a file made up only of
-  // metadata markers would otherwise load as a suite with zero prompts.
-  if (prompts.length === 0) {
-    throw new Error(`Benchmark suite ${suiteName} contains no benchmark entries`);
-  }
-  for (const entry of prompts) {
-    if (!entry.id || typeof entry.id !== 'string') {
-      throw new Error(`Benchmark entry missing valid 'id' in suite ${suiteName}`);
-    }
-    // prompt must be a string; empty string is allowed for explicit
-    // edge-case tests (e.g. ce-I1 tagged ["edge-case","empty"] — the
-    // empty input IS the test).
-    if (typeof entry.prompt !== 'string') {
-      throw new Error(`Benchmark entry ${entry.id} missing valid 'prompt'`);
-    }
-    if (!VALID_TASK_TYPES.includes(entry.taskType)) {
-      throw new Error(`Benchmark entry ${entry.id} has invalid taskType: ${entry.taskType}`);
-    }
-    if (!VALID_DIFFICULTIES.includes(entry.difficulty)) {
-      throw new Error(`Benchmark entry ${entry.id} has invalid difficulty: ${entry.difficulty}`);
-    }
-    // Scoring signal must come from at least one of:
-    //   - expectedTraits (regex/heuristic matchers)
-    //   - expectedInReply / forbiddenInReply (substring/regex on the reply)
-    //   - expectedTools / forbiddenTools (tool-routing scoring)
-    //   - mockToolResults (tool-call shape scoring)
-    //   - agentExpectations (coding-agent: expected tools/files/tests)
-    //   - edge-case / adversarial marker (the absence/refusal is the test)
-    const hasTraits = Array.isArray(entry.expectedTraits) && entry.expectedTraits.length > 0;
-    const hasReplyChecks = (Array.isArray(entry.expectedInReply) && entry.expectedInReply.length > 0) ||
-                           (Array.isArray(entry.forbiddenInReply) && entry.forbiddenInReply.length > 0);
-    const hasToolChecks = (Array.isArray(entry.expectedTools) && entry.expectedTools.length > 0) ||
-                          (Array.isArray(entry.forbiddenTools) && entry.forbiddenTools.length > 0);
-    const hasMockTools = entry.mockToolResults && typeof entry.mockToolResults === 'object' &&
-                          Object.keys(entry.mockToolResults).length > 0;
-    const hasAgentExpectations = entry.agentExpectations && typeof entry.agentExpectations === 'object';
-    const isEdgeCase = entry.category === 'edge-case' || entry.category === 'adversarial' ||
-                       (Array.isArray(entry.tags) && (entry.tags.includes('edge-case') || entry.tags.includes('adversarial')));
-    if (!hasTraits && !hasReplyChecks && !hasToolChecks && !hasMockTools && !hasAgentExpectations && !isEdgeCase) {
-      throw new Error(`Benchmark entry ${entry.id} has no scoring signal (expectedTraits / expectedInReply / expectedTools / mockToolResults / agentExpectations)`);
-    }
-    if (hasTraits) {
-      // Own-property check: a plain `TRAIT_MATCHERS[t]` lookup is truthy for
-      // inherited names such as 'toString' or 'constructor', which would let
-      // those slip through as if they were known traits.
-      const unknownTraits = entry.expectedTraits.filter(
-        (t) => !Object.prototype.hasOwnProperty.call(TRAIT_MATCHERS, t) && !UNSCORABLE_TRAITS.has(t),
-      );
-      if (unknownTraits.length) {
-        throw new Error(`Benchmark entry ${entry.id} has unknown expectedTraits: ${unknownTraits.join(', ')}`);
-      }
-    }
-  }
-  return { name: suiteName, prompts };
-}
-// ============================================================
-// Trait scoring
-// ============================================================
-/**
- * Check if a response exhibits a single trait.
- *
- * Returns false for a missing, empty, or non-string response, for any trait
- * listed in UNSCORABLE_TRAITS, and for trait names that have no matcher.
- * @param {string} response - LLM response text
- * @param {string} trait - Trait name to check
- * @returns {boolean}
- */
-function scoreTrait(response, trait) {
-  if (!response || typeof response !== 'string') return false;
-  if (UNSCORABLE_TRAITS.has(trait)) return false;
-  // Look up own properties only. A bare `TRAIT_MATCHERS[trait]` also finds
-  // inherited members (e.g. 'toString', 'constructor'), which would then be
-  // invoked as if they were matchers and return a non-boolean.
-  if (!Object.prototype.hasOwnProperty.call(TRAIT_MATCHERS, trait)) return false;
-  const matcher = TRAIT_MATCHERS[trait];
-  if (typeof matcher !== 'function') return false;
-  return matcher(response);
-}
-/**
- * Score a response against a list of expected traits and report how each
- * trait was classified.
- *
- * Traits in UNSCORABLE_TRAITS go to `unscored`; names with no matcher go to
- * `unknown`. Neither counts toward the score. Every other trait is scored and
- * lands in `matched` or `missed`.
- * @param {string} response - LLM response text
- * @param {string[]} expectedTraits - Trait names to check
- * @returns {{ score: number, matched: string[], missed: string[], unscored: string[], unknown: string[], scoredCount: number }}
- *   `score` is matched / scoredCount, or 0 when nothing was scored.
- */
-function scoreTraitsDetailed(response, expectedTraits) {
-  const detail = {
-    score: 0,
-    matched: [],
-    missed: [],
-    unscored: [],
-    unknown: [],
-    scoredCount: 0,
-  };
-  if (!expectedTraits || expectedTraits.length === 0) return detail;
-  for (const trait of expectedTraits) {
-    if (UNSCORABLE_TRAITS.has(trait)) {
-      detail.unscored.push(trait);
-      continue;
-    }
-    // Own-property check so inherited names such as 'toString' are reported
-    // as unknown instead of being counted as scored traits.
-    if (!Object.prototype.hasOwnProperty.call(TRAIT_MATCHERS, trait)) {
-      detail.unknown.push(trait);
-      continue;
-    }
-    detail.scoredCount++;
-    if (scoreTrait(response, trait)) detail.matched.push(trait);
-    else detail.missed.push(trait);
-  }
-  detail.score = detail.scoredCount > 0 ? detail.matched.length / detail.scoredCount : 0;
-  return detail;
-}
-/**
- * Compute the fraction of expected traits that a response exhibits.
- * Thin wrapper that returns only the `score` field of scoreTraitsDetailed.
- * @param {string} response - LLM response text
- * @param {string[]} expectedTraits - Array of trait names
- * @returns {number} 0.0 to 1.0 based on percentage of traits matched
- */
-function scoreTraits(response, expectedTraits) {
-  const { score } = scoreTraitsDetailed(response, expectedTraits);
-  return score;
-}
-// ============================================================
-// Benchmark runner
-// ============================================================
-/**
- * Run a benchmark suite against one or more providers.
- *
- * Prompts are run sequentially, and for each prompt every provider is called
- * in turn. A failed provider call is recorded on the result (`error`) and in
- * the leaderboard's `errors` count; it does not abort the run.
- *
- * @param {object} brain - Brain instance (must have insertBenchmarkResult method)
- * @param {object} options
- * @param {string} options.suite - Suite name to run
- * @param {Array<{type: string, model: string, config?: object}>} options.providers - Providers to test
- * @param {Function} [options.judgeFn] - Optional LLM-as-judge function: (prompt, response) => { score, feedback }
- * @param {number} [options.timeoutMs] - Per-prompt timeout in ms (default: 60000)
- * @param {AbortSignal} [options.signal] - Abort signal to cancel the run
- * @returns {Promise<{ runId: string, results: object[], leaderboard: object }>}
- * @throws {Error} If `suite` or `providers` is missing, or the suite fails to load.
- */
-async function runBenchmark(brain, { suite, providers, judgeFn, timeoutMs = DEFAULT_TIMEOUT_MS, signal } = {}) {
-  if (!suite) throw new Error('suite is required');
-  if (!providers || providers.length === 0) throw new Error('providers is required');
-  const runId = crypto.randomUUID();
-  const { prompts } = loadBenchmarkSuite(suite);
-  const results = [];
-  const providerScores = {}; // { providerKey: { total, count, errors } }
-  for (const entry of prompts) {
-    if (signal?.aborted) break;
-    for (const provider of providers) {
-      if (signal?.aborted) break;
-      const providerKey = `${provider.type}/${provider.model}`;
-      if (!providerScores[providerKey]) {
-        providerScores[providerKey] = { total: 0, count: 0, errors: 0 };
-      }
-      let response = null;
-      let latencyMs = 0;
-      let error = null;
-      let usage = null;
-      let timer;
-      const abortCtl = new AbortController();
-      // Named so it can be detached again once this call settles.
-      const onExternalAbort = () => abortCtl.abort();
-      try {
-        const client = createClient(provider.type, provider.config || {});
-        timer = setTimeout(() => abortCtl.abort(), timeoutMs);
-        // Combine external signal with timeout
-        if (signal) {
-          signal.addEventListener('abort', onExternalAbort, { once: true });
-        }
-        const startTime = Date.now();
-        const result = await client.chat({
-          model: provider.model,
-          messages: [{ role: 'user', content: entry.prompt }],
-          maxTokens: 2048,
-          signal: abortCtl.signal,
-        });
-        latencyMs = Date.now() - startTime;
-        response = result.content || '';
-        usage = result.usage || null;
-      } catch (err) {
-        // `err` may be any thrown value, including null/undefined.
-        error = err?.message || String(err);
-        providerScores[providerKey].errors++;
-      } finally {
-        clearTimeout(timer);
-        // Without this, one listener per prompt × provider accumulates on the
-        // caller's signal for the whole run.
-        if (signal && typeof signal.removeEventListener === 'function') {
-          signal.removeEventListener('abort', onExternalAbort);
-        }
-      }
-      // Score traits. Some dataset traits are intentionally marked unscorable:
-      // they document desired behavior but must not inflate automatic scores.
-      // A failed or empty response is scored as the empty string.
-      const traitDetail = scoreTraitsDetailed(response || '', entry.expectedTraits);
-      const traitScore = traitDetail.score;
-      const matchedTraits = traitDetail.matched;
-      // Optional LLM judge
-      let judgeScore = null;
-      let judgeFeedback = null;
-      if (judgeFn && response) {
-        try {
-          const judgeResult = await judgeFn(entry.prompt, response);
-          judgeScore = judgeResult.score;
-          judgeFeedback = judgeResult.feedback;
-        } catch (_err) {
-          // judge failure is non-fatal
-        }
-      }
-      // Composite score: trait score (weight 0.6) + judge score (weight 0.4) when judge available
-      const compositeScore = judgeScore != null
-        ? traitScore * 0.6 + judgeScore * 0.4
-        : traitScore;
-      const scoringMethod = judgeScore != null
-        ? 'trait+judge'
-        : traitDetail.scoredCount > 0 ? 'traits' : 'unscored-traits';
-      // A result is only marked trusted when the call succeeded and a judge scored it.
-      const trusted = !error && judgeScore != null;
-      providerScores[providerKey].total += compositeScore;
-      providerScores[providerKey].count++;
-      // Map chat scoring onto the dimension rubric so the leaderboard's
-      // dimensional view has data to show. Chat is single-turn — agent-loop
-      // dims (toolEfficiency, turnEconomy, iterativeRefinement, etc.) don't
-      // apply, so leave those undefined. Aggregator skips undefined dims.
-      const dimensions = {
-        correctness: traitScore,
-        ...(judgeScore != null ? { codeQuality: judgeScore } : {}),
-      };
-      const resultEntry = decorateBenchmarkResult({
-        runId,
-        suite,
-        promptId: entry.id,
-        taskType: entry.taskType,
-        difficulty: entry.difficulty,
-        provider: provider.type,
-        model: provider.model,
-        prompt: entry.prompt,
-        response,
-        traitScore,
-        matchedTraits,
-        judgeScore,
-        judgeFeedback,
-        compositeScore,
-        latencyMs,
-        usage,
-        inputTokens: usage?.input ?? usage?.prompt_tokens ?? null,
-        outputTokens: usage?.output ?? usage?.completion_tokens ?? null,
-        genTokPerSec: usage?.genTokPerSec ?? null,
-        dimensionsJson: JSON.stringify(dimensions),
-        modelMetadataJson: JSON.stringify({
-          matchedTraits,
-          missedTraits: traitDetail.missed,
-          unscoredTraits: traitDetail.unscored,
-          unknownTraits: traitDetail.unknown,
-          scoredTraitCount: traitDetail.scoredCount,
-        }),
-        error,
-        scorerVersion: DEFAULT_SCORER_VERSION,
-        scoringMethod,
-        trusted,
-        runConfig: { timeoutMs },
-        timestamp: new Date().toISOString(),
-      }, {
-        suite,
-        benchmark: entry,
-        runId,
-        provider: provider.type,
-        model: provider.model,
-        scoringMethod,
-        scorerVersion: DEFAULT_SCORER_VERSION,
-        trusted,
-        runConfig: { timeoutMs },
-      });
-      results.push(resultEntry);
-      // Persist to brain
-      if (brain && typeof brain.insertBenchmarkResult === 'function') {
-        try {
-          brain.insertBenchmarkResult(resultEntry);
-        } catch (_err) {
-          // storage failure is non-fatal
-        }
-      }
-    }
-  }
-  // Build leaderboard
-  const leaderboard = {};
-  for (const [key, data] of Object.entries(providerScores)) {
-    leaderboard[key] = {
-      avgScore: data.count > 0 ? data.total / data.count : 0,
-      totalPrompts: data.count,
-      errors: data.errors,
-    };
-  }
-  return { runId, results, leaderboard };
-}
-// ============================================================
-// Leaderboard aggregation
-// ============================================================
-/**
- * Aggregate stored benchmark results into a per-provider/model leaderboard.
- *
- * Expected brain method signature:
- *   brain.getBenchmarkResults({ suite, days }) => Array<{ provider, model, compositeScore, error }>
- *
- * Every stored row counts toward `totalRuns` and the average; a row with a
- * missing or falsy compositeScore contributes 0. Rows with a truthy `error`
- * are additionally counted in `errors`.
- *
- * @param {object} brain - Brain instance
- * @param {object} options
- * @param {string} [options.suite] - Filter by suite name
- * @param {number} [options.days] - Filter to last N days
- * @returns {object} Leaderboard: { 'provider/model': { avgScore, totalRuns, errors } }
- * @throws {Error} If `brain` does not provide a getBenchmarkResults function.
- */
-function getBenchmarkLeaderboard(brain, { suite, days } = {}) {
-  const canQuery = Boolean(brain) && typeof brain.getBenchmarkResults === 'function';
-  if (!canQuery) {
-    throw new Error('brain.getBenchmarkResults is required');
-  }
-  // Running tallies keyed by 'provider/model', kept in first-seen order.
-  const tallies = new Map();
-  for (const row of brain.getBenchmarkResults({ suite, days })) {
-    const key = `${row.provider}/${row.model}`;
-    let tally = tallies.get(key);
-    if (tally === undefined) {
-      tally = { total: 0, count: 0, errors: 0 };
-      tallies.set(key, tally);
-    }
-    if (row.error) {
-      tally.errors += 1;
-    }
-    tally.total += row.compositeScore || 0;
-    tally.count += 1;
-  }
-  const leaderboard = {};
-  for (const [key, { total, count, errors }] of tallies) {
-    leaderboard[key] = {
-      avgScore: count > 0 ? total / count : 0,
-      totalRuns: count,
-      errors,
-    };
-  }
-  return leaderboard;
-}
-/**
- * Collect the benchmark entries of every suite returned by listBenchmarkSuites
- * into one flat array, tagging each entry with the suite it came from via a
- * `_suite` property (set on the entry object itself). Entries keep their own
- * `difficulty` field. Suites that fail to load are skipped.
- * Used by aggregator for difficulty-based task type mapping.
- * @returns {Array<Object>}
- */
-function loadAllBenchmarks() {
-  const collected = [];
-  listBenchmarkSuites().forEach((suiteName) => {
-    try {
-      loadBenchmarkSuite(suiteName).prompts.forEach((benchmark) => {
-        benchmark._suite = suiteName;
-        collected.push(benchmark);
-      });
-    } catch { /* skip malformed suites */ }
-  });
-  return collected;
-}
-// ============================================================
-// Exports
-// ============================================================
-module.exports = { // suite loading, trait scoring, the runner, leaderboard aggregation, plus the matcher table and constants
-  listBenchmarkSuites,
-  loadBenchmarkSuite,
-  loadAllBenchmarks,
-  scoreTrait,
-  scoreTraits,
-  scoreTraitsDetailed,
-  runBenchmark,
-  getBenchmarkLeaderboard,
-  TRAIT_MATCHERS,
-  UNSCORABLE_TRAITS,
-  BENCHMARKS_DIR,
-};