npm - create-walle - Versions diffs - 0.9.21 → 0.9.23 - Mend

create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (500) hide show

package/template/wall-e/eval/run-coding-agent-real.js DELETED Viewed

@@ -1,187 +0,0 @@
-#!/usr/bin/env node
-'use strict';
-try {
-  const envPath = require('path').resolve(__dirname, '..', '..', '.env');
-  const lines = require('fs').readFileSync(envPath, 'utf8').split('\n');
-  for (const line of lines) {
-    const match = line.match(/^([A-Z_]+)=(.*)$/);
-    if (match && !process.env[match[1]]) process.env[match[1]] = match[2];
-  }
-} catch {}
-const path = require('path');
-process.chdir(path.join(__dirname, '..'));
-const {
-  DEFAULT_REAL_CATALOG_PATH,
-  buildReplayCatalog,
-  createProviderClient,
-  defaultReplayTimeoutMs,
-  loadCatalog,
-  preflightProvider,
-  runReplayEntry,
-  summarizeResults,
-} = require('./coding-agent-real');
-async function main() {
-  const args = parseArgs(process.argv.slice(2));
-  const catalogPath = path.resolve(args.catalog || DEFAULT_REAL_CATALOG_PATH);
-  const dryRun = !!args['dry-run'];
-  const limit = args.limit ? Number(args.limit) : 20;
-  if (args.help) {
-    printHelp();
-    return;
-  }
-  if (args.reap) {
-    const repoPath = path.resolve(args.repo || process.cwd());
-    const result = buildReplayCatalog({
-      repoPath,
-      source: args.source || 'all',
-      roots: args.root,
-      sinceDays: args['since-days'] ? Number(args['since-days']) : 14,
-      limit,
-      catalogPath,
-      requireEdits: args['require-edits'] !== 'false',
-      requireCodingIntent: args['require-coding-intent'] !== 'false',
-    });
-    console.log(`Reaped ${result.added.length} real replay entries into ${result.catalogPath}`);
-    console.log(`Scanned: ${result.scanned}; catalog total: ${result.total}`);
-    console.log(`Skipped: ${JSON.stringify(result.skipped)}`);
-    if (!dryRun && !args.run) return;
-  }
-  const catalog = loadCatalog(catalogPath);
-  const selected = selectEntries(catalog, {
-    id: args.id,
-    limit,
-    source: args.source,
-  });
-  if (!selected.length) {
-    console.error(`No real replay entries selected from ${catalogPath}`);
-    process.exitCode = 1;
-    return;
-  }
-  const providerType = args.provider || null;
-  initBrain();
-  const providerCheck = preflightProvider(providerType, { dryRun });
-  console.log(`Catalog: ${catalogPath}`);
-  console.log(`Entries: ${selected.length}`);
-  console.log(`Mode: ${dryRun ? 'dry-run' : 'real model run'}`);
-  console.log(`Provider preflight: ${providerCheck.status}${providerCheck.providerType ? ` (${providerCheck.providerType})` : ''}`);
-  if (!providerCheck.ok) {
-    console.error(`Provider is not ready: ${providerCheck.status}`);
-    process.exitCode = 1;
-    return;
-  }
-  let runAgentLoop = null;
-  let provider = null;
-  if (!dryRun) {
-    const orchestrator = require('../coding-orchestrator');
-    runAgentLoop = orchestrator.runAgentLoop;
-    provider = createProviderClient(providerType);
-  }
-  const results = [];
-  for (const entry of selected) {
-    console.log(`\n--- ${entry.id} (${entry.difficulty || 'unknown'}) ---`);
-    console.log(`Prompt: ${(entry.prompt || '').replace(/\s+/g, ' ').slice(0, 140)}`);
-    const result = await runReplayEntry(entry, {
-      runAgentLoop,
-      provider,
-      model: args.model || null,
-      dryRun,
-      record: !!args.record,
-      resultsDir: args['results-dir'],
-      keepFailures: !!args['keep-failures'],
-      timeoutMs: args.timeout ? Number(args.timeout) : defaultReplayTimeoutMs(entry),
-    });
-    results.push(result);
-    const score = result.replay?.scores?.composite;
-    console.log(`Success: ${result.success}`);
-    console.log(`Preflight: ${result.preflight.status}`);
-    if (typeof score === 'number') console.log(`Score: ${score.toFixed(3)}`);
-    if (result.failureType) console.log(`Failure: ${result.failureType}`);
-    if (result.artifactPath) console.log(`Artifact: ${result.artifactPath}`);
-  }
-  const summary = summarizeResults(results);
-  console.log('\n=== Summary ===');
-  console.log(`Passed: ${summary.passed}/${summary.total}`);
-  if (summary.avgComposite != null) console.log(`Avg composite: ${summary.avgComposite.toFixed(3)}`);
-  console.log(`Failures: ${JSON.stringify(summary.failureCounts)}`);
-  if (summary.failed > 0) process.exitCode = 1;
-}
-function initBrain() {
-  try {
-    const brain = require('../brain');
-    brain.initDb();
-    return brain;
-  } catch (err) {
-    console.warn(`Brain not available: ${err.message}`);
-    return null;
-  }
-}
-function selectEntries(catalog, { id, limit, source } = {}) {
-  let entries = catalog;
-  if (id) entries = entries.filter((entry) => entry.id === id);
-  if (source && source !== 'all') entries = entries.filter((entry) => entry.realReplay?.source === source);
-  return entries.slice(0, limit || entries.length);
-}
-function parseArgs(argv) {
-  const out = {};
-  for (let i = 0; i < argv.length; i += 1) {
-    const arg = argv[i];
-    if (!arg.startsWith('--')) continue;
-    const key = arg.slice(2);
-    if (['reap', 'dry-run', 'record', 'keep-failures', 'run', 'help'].includes(key)) {
-      out[key] = true;
-    } else {
-      out[key] = argv[i + 1];
-      i += 1;
-    }
-  }
-  return out;
-}
-function printHelp() {
-  console.log(`Usage:
-  node eval/run-coding-agent-real.js --reap --repo /path/to/repo --source claude|codex|all
-  node eval/run-coding-agent-real.js --dry-run --catalog eval/benchmarks/coding-agent-real.json
-  node eval/run-coding-agent-real.js --id agent-real-codex-... --provider openai --model gpt-5.4-mini --record
-Options:
-  --reap                 Harvest Claude/Codex JSONL sessions into the real catalog
-  --run                  After --reap, run the selected entries too
-  --repo <path>          Repo path used to filter transcript cwd values
-  --root <path>          Transcript root override
-  --source <value>       claude, codex, or all
-  --since-days <n>       Transcript mtime window for reap
-  --limit <n>            Entry limit
-  --catalog <path>       Catalog JSON path
-  --dry-run              Verify catalog preflight and sandbox creation only
-  --provider <type>      openai, deepseek, moonshot, anthropic, google, ollama, mlx, claude-cli, codex-cli
-  --model <id>           Model override for runAgentLoop
-  --record               Write result JSON artifacts
-  --results-dir <path>   Artifact directory
-  --keep-failures        Leave failed replay sandbox worktrees for inspection
-  --require-edits false  Allow non-editing sessions into the catalog
-  --require-coding-intent false
-                         Allow non-standalone prompts into the catalog
-`);
-}
-main()
-  .then(() => process.exit(process.exitCode || 0))
-  .catch((err) => {
-    console.error(err.stack || err.message);
-    process.exit(1);
-  });

package/template/wall-e/eval/run-eval.js DELETED Viewed

@@ -1,435 +0,0 @@
-'use strict';
-/**
- * CLI entry point for running the full eval pipeline across all models.
- *
- * Usage:
- *   node eval/run-eval.js                           # all models, all benchmarks
- *   node eval/run-eval.js --suite coding-agent       # specific suite
- *   node eval/run-eval.js --models gemma4:e4b,gemma4:26b  # specific models
- *   node eval/run-eval.js --id agent-001             # single benchmark
- *   node eval/run-eval.js --budget 5.0               # cost cap in dollars
- *   node eval/run-eval.js --timeout 300000           # per-benchmark timeout ms
- *   node eval/run-eval.js --concurrency 1            # parallel benchmarks
- *   node eval/run-eval.js --dry-run                  # list work items without running
- *   node eval/run-eval.js --check-providers           # test provider health and exit
- */
-const fs = require('fs');
-const path = require('path');
-const { EvalOrchestrator } = require('./eval-orchestrator');
-// ── Default model roster ──
-const DEFAULT_MODELS = [
-  // Frontier
-  'claude-opus-4-7',
-  'gpt-5.5',
-  'deepseek-v4-pro',
-  'kimi-k2.6',
-  // Daily
-  'claude-sonnet-4-6',
-  'gpt-5.4',
-  'gemini-2.5-pro',
-  'kimi-k2.5',
-  // Budget
-  'claude-haiku-4-5-20251001',
-  'gpt-5.4-mini',
-  'gemini-2.5-flash',
-  'deepseek-v4-flash',
-  // Local (Ollama)
-  'gemma4:e4b',
-  'gemma4:26b',
-];
-// ── Arg parsing ──
-const args = process.argv.slice(2);
-function getArg(flag, fallback) {
-  const i = args.indexOf(flag);
-  return i >= 0 && args[i + 1] ? args[i + 1] : fallback;
-}
-const hasFlag = (flag) => args.includes(flag);
-const suite = getArg('--suite', 'coding-agent');
-const modelArg = getArg('--models', null);
-const models = modelArg ? modelArg.split(',').map(s => s.trim()) : DEFAULT_MODELS;
-const benchmarkId = getArg('--id', null);
-const budget = parseFloat(getArg('--budget', '10.0'));
-const timeout = parseInt(getArg('--timeout', '600000'), 10);
-const concurrency = parseInt(getArg('--concurrency', '2'), 10);
-const dryRun = hasFlag('--dry-run');
-const resumeId = getArg('--resume', null);
-const checkProviders = hasFlag('--check-providers');
-/**
- * Pre-flight check: test each provider with a minimal API call.
- * Returns map of providerType -> { ok: boolean, error?: string, model?: string }
- */
-async function checkProviderHealth(brain) {
-  const results = {};
-  const { createClient } = require('../llm/client');
-  const { createAnthropicFromEnv } = require('../llm/anthropic');
-  const providers = [
-    { type: 'anthropic', testModel: 'claude-haiku-4-5-20251001', envKeys: ['ANTHROPIC_API_KEY', 'ANTHROPIC_AUTH_TOKEN', 'PORTKEY_API_KEY'] },
-    { type: 'openai', testModel: 'gpt-5.4-mini', envKeys: ['OPENAI_API_KEY'] },
-    { type: 'google', testModel: 'gemini-2.5-flash', envKeys: ['GOOGLE_API_KEY', 'GEMINI_API_KEY'] },
-    { type: 'deepseek', testModel: 'deepseek-v4-flash', envKeys: ['DEEPSEEK_API_KEY'] },
-    { type: 'moonshot', testModel: 'kimi-k2.6', envKeys: ['MOONSHOT_API_KEY'] },
-  ];
-  // Check cloud providers
-  for (const prov of providers) {
-    let hasKey = false;
-    let apiKey = null;
-    // Check brain DB for stored key
-    if (brain) {
-      try {
-        const row = brain.getDb().prepare(
-          'SELECT api_key_encrypted FROM model_providers WHERE type = ? AND enabled = 1 AND api_key_encrypted IS NOT NULL LIMIT 1'
-        ).get(prov.type);
-        if (row && row.api_key_encrypted) {
-          hasKey = true;
-          try { apiKey = brain.decrypt(row.api_key_encrypted); } catch {}
-          if (!apiKey) apiKey = row.api_key_encrypted; // plaintext fallback
-        }
-      } catch {}
-    }
-    // Check env vars
-    if (!hasKey) {
-      for (const key of prov.envKeys) {
-        if (process.env[key]) {
-          hasKey = true;
-          apiKey = process.env[key];
-          break;
-        }
-      }
-    }
-    if (!hasKey) {
-      results[prov.type] = { ok: false, error: 'No API key configured' };
-      continue;
-    }
-    // Make a minimal test call
-    try {
-      let client;
-      if (prov.type === 'anthropic') {
-        client = createAnthropicFromEnv();
-      } else {
-        client = createClient(prov.type, { apiKey });
-      }
-      await client.chat({
-        model: prov.testModel,
-        messages: [{ role: 'user', content: 'hi' }],
-        maxTokens: 1,
-      });
-      results[prov.type] = { ok: true, model: prov.testModel };
-    } catch (err) {
-      const msg = err.message || String(err);
-      let diagnosis = msg;
-      // Gemini CLI OAuth tokens (ya29.*) use Google's private Code Assist API
-      // (cloudcode-pa.googleapis.com) and cannot auth with the public Gemini API.
-      if (apiKey && apiKey.startsWith('ya29.')) {
-        diagnosis = `Stored key is a Gemini CLI OAuth token (ya29.*). These tokens use ` +
-          `Google's private Code Assist API and don't work with the public Gemini API. ` +
-          `Get a Gemini API key from https://aistudio.google.com/apikey`;
-      } else if (msg.includes('429') || msg.includes('quota') || msg.includes('rate')) {
-        const url = prov.type === 'openai'
-          ? 'https://platform.openai.com/account/billing'
-          : prov.type === 'google'
-            ? 'https://aistudio.google.com/apikey'
-            : prov.type === 'moonshot'
-              ? 'https://platform.kimi.ai/console'
-              : '';
-        diagnosis = `Billing quota exceeded (HTTP 429).${url ? ` Check billing at ${url}` : ''}`;
-      } else if (msg.includes('401') || msg.includes('unauthorized') || msg.includes('invalid_api_key')) {
-        diagnosis = `Invalid API key (HTTP 401). Check your API key configuration.`;
-      } else if (msg.includes('API_KEY_INVALID') || msg.includes('INVALID_ARGUMENT')) {
-        diagnosis = `Invalid API key. Get a Gemini API key from https://aistudio.google.com/apikey`;
-      }
-      results[prov.type] = { ok: false, error: diagnosis };
-    }
-  }
-  // Check Ollama
-  try {
-    const resp = await fetch('http://localhost:11434/api/tags', { signal: AbortSignal.timeout(3000) });
-    if (resp.ok) {
-      const data = await resp.json();
-      const modelCount = (data.models || []).length;
-      results['ollama'] = { ok: true, model: `${modelCount} models available` };
-    } else {
-      results['ollama'] = { ok: false, error: `Ollama returned ${resp.status}` };
-    }
-  } catch {
-    results['ollama'] = { ok: false, error: 'Ollama not running (localhost:11434)' };
-  }
-  return results;
-}
-(async () => {
-  // Load brain
-  let brain = null;
-  try {
-    brain = require('../brain');
-    brain.initDb();
-    console.log('[eval] Brain loaded');
-  } catch (err) {
-    console.warn(`[eval] Brain not available: ${err.message}`);
-  }
-  // Handle --check-providers
-  if (checkProviders) {
-    console.log('\n[eval] Checking provider health...\n');
-    const health = await checkProviderHealth(brain);
-    for (const [provider, status] of Object.entries(health)) {
-      const icon = status.ok ? 'OK' : 'FAIL';
-      const detail = status.ok ? `(${status.model})` : `-- ${status.error}`;
-      const label = provider.charAt(0).toUpperCase() + provider.slice(1);
-      console.log(`  ${label}: ${icon} ${detail}`);
-    }
-    const okCount = Object.values(health).filter(s => s.ok).length;
-    const totalCount = Object.keys(health).length;
-    console.log(`\n[eval] ${okCount}/${totalCount} providers healthy`);
-    if (brain) brain.closeDb(true);
-    process.exit(okCount > 0 ? 0 : 1);
-  }
-  // Load coding orchestrator for runAgentLoop
-  let runAgentLoop;
-  try {
-    const codingOrch = require('../coding-orchestrator');
-    runAgentLoop = codingOrch.runAgentLoop;
-    console.log('[eval] Coding orchestrator loaded');
-  } catch (err) {
-    console.error(`[eval] Failed to load coding-orchestrator: ${err.message}`);
-    process.exit(1);
-  }
-  // Check which models are actually available
-  console.log(`\n[eval] Model roster (${models.length}):`);
-  const availableModels = [];
-  // Cache Ollama model list (one fetch instead of per-model)
-  let ollamaModels = null;
-  try {
-    const resp = await fetch('http://localhost:11434/api/tags', { signal: AbortSignal.timeout(3000) });
-    if (resp.ok) {
-      const data = await resp.json();
-      ollamaModels = new Set((data.models || []).map(m => m.name));
-    }
-  } catch { /* Ollama not running */ }
-  for (const model of models) {
-    const isOllama = model.includes(':');
-    if (isOllama) {
-      const found = ollamaModels && ollamaModels.has(model);
-      const reason = ollamaModels ? (found ? '' : ' — NOT FOUND') : ' — NOT RUNNING';
-      console.log(`  ${found ? '+' : '-'} ${model} (ollama${reason})`);
-      if (found) availableModels.push(model);
-    } else {
-      // Cloud model — check if brain has API key for its provider
-      let providerType = 'anthropic';
-      if (model.startsWith('gpt-')) providerType = 'openai';
-      else if (model.startsWith('gemini-')) providerType = 'google';
-      else if (model.startsWith('deepseek-')) providerType = 'deepseek';
-      else if (model.startsWith('kimi-') || model.startsWith('moonshot-')) providerType = 'moonshot';
-      let hasKey = false;
-      if (brain) {
-        try {
-          const row = brain.getDb().prepare(
-            'SELECT id FROM model_providers WHERE type = ? AND enabled = 1 AND api_key_encrypted IS NOT NULL LIMIT 1'
-          ).get(providerType);
-          hasKey = !!row;
-        } catch {}
-      }
-      // Also check env vars (including Portkey/gateway for Anthropic)
-      if (!hasKey) {
-        const envMap = {
-          anthropic: ['ANTHROPIC_API_KEY', 'ANTHROPIC_AUTH_TOKEN', 'PORTKEY_API_KEY'],
-          openai: ['OPENAI_API_KEY'],
-          google: ['GOOGLE_API_KEY', 'GEMINI_API_KEY'],
-          deepseek: ['DEEPSEEK_API_KEY'],
-          moonshot: ['MOONSHOT_API_KEY'],
-        };
-        hasKey = (envMap[providerType] || []).some(k => !!process.env[k]);
-      }
-      console.log(`  ${hasKey ? '+' : '-'} ${model} (${providerType}${hasKey ? '' : ' — NO API KEY'})`);
-      if (hasKey) availableModels.push(model);
-    }
-  }
-  if (availableModels.length === 0) {
-    console.error('\n[eval] No models available. Configure API keys in setup or start Ollama.');
-    process.exit(1);
-  }
-  console.log(`\n[eval] Available: ${availableModels.length}/${models.length} models`);
-  // Load benchmarks to show work items
-  const BENCHMARKS_DIR = path.join(__dirname, 'benchmarks');
-  let benchmarks;
-  if (suite === 'all') {
-    benchmarks = [];
-    for (const fname of fs.readdirSync(BENCHMARKS_DIR).filter(n => n.endsWith('.json'))) {
-      try {
-        const items = JSON.parse(fs.readFileSync(path.join(BENCHMARKS_DIR, fname), 'utf8'));
-        benchmarks.push(...items);
-      } catch {}
-    }
-  } else {
-    const file = path.join(BENCHMARKS_DIR, `${suite}.json`);
-    if (!fs.existsSync(file)) {
-      console.error(`[eval] Suite not found: ${suite} (looked in ${file})`);
-      process.exit(1);
-    }
-    benchmarks = JSON.parse(fs.readFileSync(file, 'utf8'));
-  }
-  if (benchmarkId) {
-    benchmarks = benchmarks.filter(b => b.id === benchmarkId);
-  }
-  const totalWork = availableModels.length * benchmarks.length;
-  console.log(`[eval] Suite: ${suite} (${benchmarks.length} benchmarks)`);
-  console.log(`[eval] Total work items: ${totalWork} (${availableModels.length} models x ${benchmarks.length} benchmarks)`);
-  console.log(`[eval] Budget: $${budget.toFixed(2)}, Timeout: ${timeout / 1000}s/benchmark, Concurrency: ${concurrency}`);
-  if (dryRun) {
-    console.log('\n[eval] DRY RUN — would execute:');
-    for (const model of availableModels) {
-      for (const b of benchmarks) {
-        console.log(`  ${model} x ${b.id} (${b.difficulty})`);
-      }
-    }
-    process.exit(0);
-  }
-  // Pre-flight: warn about unhealthy cloud providers
-  const cloudModels = availableModels.filter(m => !m.includes(':'));
-  if (cloudModels.length > 0) {
-    const health = await checkProviderHealth(brain);
-    const unhealthy = Object.entries(health).filter(([, s]) => !s.ok);
-    if (unhealthy.length > 0) {
-      console.log('\n[eval] Provider warnings:');
-      for (const [provider, status] of unhealthy) {
-        const affectedModels = availableModels.filter(m => {
-          if (provider === 'anthropic') return m.startsWith('claude-');
-          if (provider === 'openai') return m.startsWith('gpt-') || m.startsWith('o1') || m.startsWith('o3') || m.startsWith('o4');
-          if (provider === 'google') return m.startsWith('gemini-');
-          if (provider === 'deepseek') return m.startsWith('deepseek-');
-          if (provider === 'moonshot') return m.startsWith('kimi-') || m.startsWith('moonshot-');
-          return false;
-        });
-        if (affectedModels.length > 0) {
-          console.warn(`  ${provider}: ${status.error} (skipping ${affectedModels.length} models: ${affectedModels.join(', ')})`);
-          // Remove unhealthy models from availableModels
-          for (const m of affectedModels) {
-            const idx = availableModels.indexOf(m);
-            if (idx >= 0) availableModels.splice(idx, 1);
-          }
-        }
-      }
-      if (availableModels.length === 0) {
-        console.error('\n[eval] No healthy models remaining after pre-flight check.');
-        if (brain) brain.closeDb(true);
-        process.exit(1);
-      }
-      console.log(`[eval] Proceeding with ${availableModels.length} healthy models\n`);
-    }
-  }
-  // Create and run orchestrator
-  const orch = new EvalOrchestrator({
-    concurrency,
-    budgetDollars: budget,
-    timeoutMs: timeout,
-    brain,
-    runId: resumeId || undefined,
-  });
-  // Wire up events for live progress
-  orch.on('benchmark-start', ({ benchmarkId: bid, model }) => {
-    console.log(`\n[START] ${model} x ${bid}`);
-  });
-  orch.on('benchmark-complete', ({ benchmarkId: bid, model, composite, costDollars, elapsed }) => {
-    console.log(`[DONE]  ${model} x ${bid}: score=${composite.toFixed(3)} cost=$${(costDollars || 0).toFixed(6)} time=${(elapsed / 1000).toFixed(1)}s`);
-  });
-  orch.on('model-complete', ({ model, avgScore, totalCost, benchmarksRun }) => {
-    console.log(`\n[MODEL] ${model}: avg=${avgScore.toFixed(3)} cost=$${totalCost.toFixed(6)} runs=${benchmarksRun}`);
-  });
-  orch.on('budget-warning', ({ spent, budget: bgt, remaining, model }) => {
-    const ctx = model ? ` (${model})` : '';
-    console.warn(`\n[BUDGET] $${spent.toFixed(4)} / $${bgt.toFixed(2)} spent${ctx}, remaining: $${(remaining || 0).toFixed(4)}`);
-  });
-  orch.on('error', ({ benchmarkId: bid, model, error }) => {
-    console.error(`[ERROR] ${model || '?'} x ${bid || '?'}: ${error}`);
-  });
-  // Handle Ctrl+C gracefully
-  process.on('SIGINT', () => {
-    console.log('\n[eval] Aborting... (run can be resumed with --resume ' + orch.runId + ')');
-    orch.abort();
-  });
-  console.log(`\n[eval] Starting run ${orch.runId}...\n`);
-  const startTime = Date.now();
-  const summary = await orch.run({
-    suite,
-    models: availableModels,
-    benchmarkIds: benchmarkId ? [benchmarkId] : undefined,
-    runAgentLoop,
-  });
-  const totalElapsed = ((Date.now() - startTime) / 1000).toFixed(0);
-  console.log('\n' + '='.repeat(60));
-  console.log('EVAL RUN COMPLETE');
-  console.log('='.repeat(60));
-  console.log(`Run ID:     ${summary.runId}`);
-  console.log(`Status:     ${summary.status}`);
-  console.log(`Benchmarks: ${summary.totalBenchmarks}`);
-  console.log(`Total cost: $${summary.totalSpent.toFixed(6)}`);
-  console.log(`Elapsed:    ${totalElapsed}s`);
-  console.log('');
-  // Leaderboard
-  const sorted = Object.entries(summary.models)
-    .sort(([, a], [, b]) => b.avgScore - a.avgScore);
-  console.log('LEADERBOARD:');
-  console.log('-'.repeat(60));
-  console.log('Rank  Model                          Avg Score  Cost      Errors');
-  console.log('-'.repeat(60));
-  sorted.forEach(([model, stats], i) => {
-    const name = model.padEnd(30);
-    const score = stats.avgScore.toFixed(3).padStart(9);
-    const cost = ('$' + stats.totalCost.toFixed(4)).padStart(9);
-    const errors = String(stats.errors).padStart(6);
-    console.log(`${String(i + 1).padStart(4)}  ${name} ${score} ${cost} ${errors}`);
-  });
-  console.log('-'.repeat(60));
-  if (summary.status === 'aborted') {
-    console.log(`\nResume with: node eval/run-eval.js --resume ${summary.runId}`);
-  }
-  if (brain) brain.closeDb(true);
-})();