npm - create-walle - Versions diffs - 0.9.21 → 0.9.23 - Mend

create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (500) hide show

package/template/wall-e/eval/run-agent-benchmarks.js DELETED Viewed

@@ -1,386 +0,0 @@
-#!/usr/bin/env node
-'use strict';
-/**
- * CLI runner for coding-agent benchmarks.
- * Usage:
- *   node eval/run-agent-benchmarks.js                                # run all (uses .env defaults)
- *   node eval/run-agent-benchmarks.js --id agent-001                 # run single benchmark
- *   node eval/run-agent-benchmarks.js --dry-run                      # test sandbox setup only
- *   node eval/run-agent-benchmarks.js --provider ollama --model gemma4:e4b
- *   node eval/run-agent-benchmarks.js --suite coding-agent           # default suite
- *   node eval/run-agent-benchmarks.js --suite swebench-30            # curated 30-task SWE-bench subset
- *   node eval/run-agent-benchmarks.js --suite swebench-lite          # full SWE-bench Lite
- *   node eval/run-agent-benchmarks.js --suite all                    # coding-agent + swebench-30
- */
-// Load .env from project root (same as CTM server.js) for API keys
-try {
-  const envPath = require('path').resolve(__dirname, '..', '..', '.env');
-  const lines = require('fs').readFileSync(envPath, 'utf8').split('\n');
-  for (const line of lines) {
-    const match = line.match(/^([A-Z_]+)=(.*)$/);
-    if (match && !process.env[match[1]]) process.env[match[1]] = match[2];
-  }
-} catch {}
-const path = require('path');
-const crypto = require('crypto');
-process.chdir(path.join(__dirname, '..'));
-const { setupSandbox, cleanupSandbox, runAgentBenchmark, runAgentBenchmarkSuite, resolveModelName, isTrustedAgentResult } = require('./agent-runner');
-const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
-const benchmarks = require('./benchmarks/coding-agent.json');
-/**
- * Run a SWE-bench suite (swebench-30 or swebench-lite).
- * Checks Docker availability first and skips with a message if not present.
- */
-async function runSWEBenchSuite(suite, { dryRun, filterId, providerType, modelId } = {}) {
-  const { loadCuratedSubset, downloadDataset, mapTaskToPrompt, runSWEBenchTask } = require('./swebench-adapter');
-  const { isDockerAvailable } = require('./swebench-docker');
-  const suiteName = suite === 'swebench-30' ? 'SWE-bench Lite (curated 30)' : 'SWE-bench Lite (full)';
-  console.log(`=== Wall-E ${suiteName} Runner ===`);
-  // Check Docker availability
-  const dockerOk = await isDockerAvailable();
-  if (!dockerOk) {
-    console.log('');
-    console.log('Docker is not available. SWE-bench tasks require Docker for sandboxed execution.');
-    console.log('Install Docker and ensure the daemon is running, then retry.');
-    console.log('');
-    console.log('Skipping SWE-bench suite.');
-    return;
-  }
-  // Load tasks
-  let tasks;
-  if (suite === 'swebench-30') {
-    tasks = await loadCuratedSubset();
-    console.log(`Tasks: ${tasks.length} (curated subset)`);
-  } else {
-    tasks = await downloadDataset();
-    console.log(`Tasks: ${tasks.length} (full SWE-bench Lite)`);
-  }
-  if (filterId) {
-    tasks = tasks.filter(t => t.instance_id === filterId);
-    console.log(`Filtered to: ${tasks.length} tasks`);
-  }
-  if (dryRun) {
-    console.log('Mode: dry-run');
-    console.log('');
-    for (const task of tasks.slice(0, 5)) {
-      const mapped = mapTaskToPrompt(task);
-      console.log(`  ${mapped.id} (${task.difficulty || 'unknown'}) - ${task.repo}`);
-      console.log(`    Prompt: ${mapped.prompt.slice(0, 80)}...`);
-    }
-    if (tasks.length > 5) console.log(`  ... and ${tasks.length - 5} more`);
-    return;
-  }
-  if (providerType) console.log(`Provider: ${providerType}`);
-  if (modelId) console.log(`Model: ${modelId}`);
-  console.log('');
-  const results = [];
-  for (const task of tasks) {
-    const mapped = mapTaskToPrompt(task);
-    console.log(`--- ${mapped.id} (${task.difficulty || 'unknown'}) ---`);
-    console.log(`  Repo: ${task.repo}`);
-    const result = await runSWEBenchTask(task, {
-      brain: null,
-      provider: null,
-      model: modelId,
-      timeoutMs: 900_000,
-    });
-    results.push(result);
-    console.log(`  Success: ${result.success}`);
-    console.log(`  Score: ${(result.score?.composite || 0).toFixed(3)}`);
-    if (result.error) console.log(`  Error: ${result.error}`);
-    console.log('');
-  }
-  // Summary
-  console.log('=== SWE-BENCH SUMMARY ===');
-  const succeeded = results.filter(r => r.success).length;
-  console.log(`Passed: ${succeeded}/${results.length}`);
-  if (results.length > 0) {
-    const avgScore = results.reduce((sum, r) => sum + (r.score?.composite || 0), 0) / results.length;
-    console.log(`Average score: ${avgScore.toFixed(3)}`);
-  }
-}
-async function main() {
-  const args = process.argv.slice(2);
-  const dryRun = args.includes('--dry-run');
-  const idIdx = args.indexOf('--id');
-  const filterId = idIdx >= 0 ? args[idIdx + 1] : null;
-  const providerIdx = args.indexOf('--provider');
-  const providerType = providerIdx >= 0 ? args[providerIdx + 1] : null;
-  const modelIdx = args.indexOf('--model');
-  const modelId = modelIdx >= 0 ? args[modelIdx + 1] : null;
-  const suiteIdx = args.indexOf('--suite');
-  const suite = suiteIdx >= 0 ? args[suiteIdx + 1] : 'coding-agent';
-  // --- SWE-bench suites: delegate to swebench adapter ---
-  if (suite === 'swebench-30' || suite === 'swebench-lite') {
-    await runSWEBenchSuite(suite, { dryRun, filterId, providerType, modelId });
-    return;
-  }
-  if (suite === 'all') {
-    // Run coding-agent first, then swebench-30
-    // Fall through below for coding-agent, then run swebench-30 at the end
-  }
-  console.log('=== Wall-E Coding Agent Benchmark Runner ===');
-  console.log(`Suite: ${suite === 'all' ? 'all (coding-agent + swebench-30)' : suite}`);
-  console.log(`Benchmarks: ${benchmarks.length} total`);
-  if (filterId) console.log(`Filtering: ${filterId}`);
-  if (dryRun) console.log('Mode: dry-run (sandbox setup only)');
-  if (providerType) console.log(`Provider: ${providerType}`);
-  if (modelId) console.log(`Model: ${modelId}`);
-  console.log('');
-  // Dry run: just test sandbox setup/teardown
-  if (dryRun) {
-    console.log('--- Dry Run: Testing Sandbox Setup ---');
-    for (const fixtureName of ['express-basic', 'express-buggy', 'express-rename-data', 'node-cli', 'monorepo-basic', 'fullstack-app', 'wall-e-subset']) {
-      try {
-        const dir = setupSandbox(fixtureName);
-        console.log(`  [OK] ${fixtureName} -> ${dir}`);
-        // Run test command if available
-        const { execFileSync } = require('child_process');
-        try {
-          execFileSync('node', ['test.js'], { cwd: dir, timeout: 10000, stdio: 'pipe' });
-          console.log(`  [OK] ${fixtureName} tests pass`);
-        } catch (e) {
-          console.log(`  [WARN] ${fixtureName} tests: ${e.message.split('\n')[0]}`);
-        }
-        cleanupSandbox(dir);
-        console.log(`  [OK] ${fixtureName} cleanup`);
-      } catch (err) {
-        console.error(`  [FAIL] ${fixtureName}: ${err.message}`);
-      }
-    }
-    return;
-  }
-  // Full run: needs runAgentLoop
-  let runAgentLoop;
-  try {
-    const orchestrator = require('../coding-orchestrator');
-    runAgentLoop = orchestrator.runAgentLoop;
-  } catch (err) {
-    console.error(`Failed to load coding-orchestrator: ${err.message}`);
-    console.error('Ensure you are running from the wall-e directory with dependencies installed.');
-    process.exit(1);
-  }
-  // Load brain first — needed for API keys and result storage
-  let brain = null;
-  try {
-    brain = require('../brain');
-    brain.initDb();
-    console.log('Brain loaded (providers + result storage)');
-  } catch (err) {
-    console.log(`Brain not available: ${err.message}`);
-  }
-  // Resolve provider — use brain DB keys, fall back to env vars
-  let provider = null;
-  if (providerType) {
-    try {
-      const { createClient } = require('../llm/client');
-      // getDefaultClient reads keys from brain DB, but we need a specific provider
-      let apiKey = null;
-      if (brain) {
-        try {
-          const row = brain.getDb().prepare(
-            'SELECT api_key_encrypted FROM model_providers WHERE type = ? AND enabled = 1 AND api_key_encrypted IS NOT NULL LIMIT 1'
-          ).get(providerType);
-          apiKey = row?.api_key_encrypted;
-        } catch {}
-      }
-      // Fall back to env vars
-      if (!apiKey) {
-        apiKey = process.env.ANTHROPIC_API_KEY || process.env.OPENAI_API_KEY || process.env.GOOGLE_API_KEY;
-      }
-      provider = createClient(providerType, { apiKey });
-      console.log(`Provider: ${providerType} (key from ${apiKey ? 'brain DB' : 'env'})`);
-    } catch (err) {
-      console.warn(`Could not create ${providerType} client: ${err.message}, using default`);
-    }
-  }
-  const selectedBenchmarks = filterId
-    ? benchmarks.filter(b => b.id === filterId)
-    : benchmarks;
-  if (selectedBenchmarks.length === 0) {
-    console.error(`No benchmark found with id: ${filterId}`);
-    process.exit(1);
-  }
-  console.log(`Running ${selectedBenchmarks.length} benchmarks...\n`);
-  const runId = crypto.randomUUID();
-  const results = [];
-  for (const benchmark of selectedBenchmarks) {
-    const startTime = Date.now();
-    console.log(`--- ${benchmark.id} (${benchmark.difficulty}) ---`);
-    console.log(`  Prompt: ${benchmark.prompt.slice(0, 100)}...`);
-    console.log(`  Fixture: ${benchmark.agentExpectations?.projectFixture || 'express-basic'}`);
-    try {
-      const result = await runAgentBenchmark(benchmark, {
-        runAgentLoop,
-        brain,
-        timeoutMs: 600000,
-        provider,
-        model: modelId,
-      });
-      const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
-      result.runId = runId;
-      result.timestamp = new Date().toISOString();
-      results.push(result);
-      storeBenchmarkResult({ brain, runId, benchmark, result, provider, modelId, timeoutMs: 600000 });
-      console.log(`  Success: ${result.success}`);
-      console.log(`  Score: ${(result.score?.composite || 0).toFixed(3)}`);
-      if (result.score?.dimensions) {
-        const d = result.score.dimensions;
-        console.log(`    Tool efficiency: ${(d.toolEfficiency || 0).toFixed(3)}`);
-        console.log(`    Correctness:     ${(d.correctness || 0).toFixed(3)}`);
-        console.log(`    Plan quality:    ${(d.planQuality || 0).toFixed(3)}`);
-        console.log(`    Turn economy:    ${(d.turnEconomy || 0).toFixed(3)}`);
-        console.log(`    Error handling:  ${(d.errorHandling || 0).toFixed(3)}`);
-      }
-      console.log(`  Tools: ${(result.actualToolCalls || []).join(', ') || 'none'}`);
-      console.log(`  Files: ${(result.actualFileChanges || []).join(', ') || 'none'}`);
-      console.log(`  Tests: ${result.testsPassed === null ? 'N/A' : result.testsPassed}`);
-      console.log(`  Time: ${elapsed}s`);
-      if (result.error) console.log(`  Error: ${result.error}`);
-    } catch (err) {
-      console.error(`  EXCEPTION: ${err.message}`);
-      const result = {
-        benchmarkId: benchmark.id,
-        success: false,
-        error: err.message,
-        score: { composite: 0 },
-        runId,
-        timestamp: new Date().toISOString(),
-      };
-      results.push(result);
-      storeBenchmarkResult({ brain, runId, benchmark, result, provider, modelId, timeoutMs: 600000 });
-    }
-    console.log('');
-  }
-  // Summary
-  console.log('=== SUMMARY ===');
-  const succeeded = results.filter(r => r.success).length;
-  const avgScore = results.reduce((sum, r) => sum + (r.score?.composite || 0), 0) / results.length;
-  console.log(`Passed: ${succeeded}/${results.length}`);
-  console.log(`Average composite score: ${avgScore.toFixed(3)}`);
-  console.log('');
-  // Per-difficulty breakdown
-  for (const diff of ['easy', 'medium', 'hard']) {
-    const subset = results.filter((r, i) => selectedBenchmarks[i]?.difficulty === diff);
-    if (subset.length === 0) continue;
-    const avg = subset.reduce((sum, r) => sum + (r.score?.composite || 0), 0) / subset.length;
-    const passed = subset.filter(r => r.success).length;
-    console.log(`  ${diff}: ${passed}/${subset.length} passed, avg score ${avg.toFixed(3)}`);
-  }
-  // Detect regressions if brain is available
-  if (brain) {
-    try {
-      const { detectRegressions } = require('./agent-runner');
-      const regressions = detectRegressions(brain, results);
-      if (regressions.length > 0) {
-        console.log(`\n!!! ${regressions.length} REGRESSIONS DETECTED !!!`);
-        for (const r of regressions) {
-          console.log(`  ${r.benchmarkId}: dropped ${r.dropPct}% (baseline ${r.baselineAvg.toFixed(3)} -> ${r.currentScore.toFixed(3)})`);
-        }
-      }
-    } catch {}
-  }
-  // If suite is 'all', also run swebench-30
-  if (suite === 'all') {
-    console.log('\n');
-    await runSWEBenchSuite('swebench-30', { dryRun, filterId, providerType, modelId });
-  }
-  process.exit(succeeded === results.length ? 0 : 1);
-}
-main().catch(err => {
-  console.error('Fatal error:', err);
-  process.exit(1);
-});
-function storeBenchmarkResult({ brain, runId, benchmark, result, provider, modelId, timeoutMs }) {
-  if (!brain || typeof brain.insertBenchmarkResult !== 'function') return;
-  try {
-    const scoringMethod = benchmark.agentExpectations?.testCommand
-      ? 'agent-rubric+tests'
-      : 'agent-rubric';
-    const trusted = isTrustedBenchmarkResult(result);
-    brain.insertBenchmarkResult(decorateBenchmarkResult({
-      runId,
-      suite: 'coding-agent',
-      promptId: benchmark.id,
-      taskType: 'coding-agent',
-      difficulty: benchmark.difficulty,
-      provider: provider?.type || 'default',
-      model: resolveModelName(modelId),
-      prompt: benchmark.prompt,
-      response: result.output || '',
-      traitScore: null,
-      matchedTraits: [],
-      compositeScore: result.score?.composite || 0,
-      latencyMs: result.latencyMs,
-      error: result.error,
-      timestamp: result.timestamp,
-      costDollars: result.costDollars || null,
-      testsBefore: result.testsBefore ?? null,
-      testsAfter: result.testsAfter ?? null,
-      totalTests: result.totalTests ?? null,
-      dimensionsJson: result.dimensionsJson || null,
-      inputTokens: result.inputTokens ?? null,
-      outputTokens: result.outputTokens ?? null,
-      scorerVersion: DEFAULT_SCORER_VERSION,
-      scoringMethod,
-      trusted,
-      runConfig: { timeoutMs, scoringMethod },
-    }, {
-      suite: 'coding-agent',
-      benchmark,
-      runId,
-      provider: provider?.type || 'default',
-      model: resolveModelName(modelId),
-      scoringMethod,
-      scorerVersion: DEFAULT_SCORER_VERSION,
-      trusted,
-      runConfig: { timeoutMs, scoringMethod },
-    }));
-  } catch (err) {
-    console.warn(`  [WARN] Failed to store benchmark result: ${err.message}`);
-  }
-}
-function isTrustedBenchmarkResult(result = {}) {
-  return !result.error &&
-    result.testsPassed === true &&
-    (result.score?.composite || 0) > 0 &&
-    result.score?.dimensions?._zeroed !== true;
-}

package/template/wall-e/eval/run-codex-cli-baseline.js DELETED Viewed

@@ -1,177 +0,0 @@
-#!/usr/bin/env node
-'use strict';
-try {
-  const envPath = require('path').resolve(__dirname, '..', '..', '.env');
-  const lines = require('fs').readFileSync(envPath, 'utf8').split('\n');
-  for (const line of lines) {
-    const match = line.match(/^([A-Z_]+)=(.*)$/);
-    if (match && !process.env[match[1]]) process.env[match[1]] = match[2];
-  }
-} catch {}
-const crypto = require('crypto');
-const path = require('path');
-process.chdir(path.join(__dirname, '..'));
-const benchmarks = require('./benchmarks/coding-agent.json');
-const {
-  DEFAULT_RESULTS_DIR,
-  runCodexCliBaselineBenchmark,
-  storeBaselineResult,
-  summarizeBaselineResults,
-  writeBaselineArtifact,
-} = require('./codex-cli-baseline');
-async function main() {
-  const args = parseArgs(process.argv.slice(2));
-  if (args.help) {
-    printHelp();
-    return;
-  }
-  const selected = selectBenchmarks({
-    id: args.id,
-    limit: args.limit ? Number(args.limit) : null,
-    difficulty: args.difficulty,
-  });
-  if (!selected.length) {
-    console.error('No benchmarks selected');
-    process.exitCode = 1;
-    return;
-  }
-  const timeoutMs = args.timeout ? Number(args.timeout) : 600_000;
-  const runId = crypto.randomUUID();
-  const dryRun = !!args['dry-run'];
-  const model = args.model || null;
-  const resultsDir = args['results-dir'] || DEFAULT_RESULTS_DIR;
-  const brain = initBrain();
-  console.log('=== Codex CLI Baseline Runner ===');
-  console.log(`Run: ${runId}`);
-  console.log(`Benchmarks: ${selected.length}/${benchmarks.length}`);
-  console.log(`Mode: ${dryRun ? 'dry-run' : 'codex exec baseline'}`);
-  console.log(`Model: ${model || 'codex default'}`);
-  console.log(`Auth: ${args['use-env-openai-key'] ? 'env OPENAI_API_KEY allowed' : 'ChatGPT/Codex auth preferred (OPENAI_API_KEY stripped)'}`);
-  console.log('');
-  const results = [];
-  for (const benchmark of selected) {
-    console.log(`--- ${benchmark.id} (${benchmark.difficulty || 'unknown'}) ---`);
-    console.log(`  Prompt: ${String(benchmark.prompt || '').replace(/\s+/g, ' ').slice(0, 120)}`);
-    console.log(`  Fixture: ${benchmark.agentExpectations?.projectFixture || 'express-basic'}`);
-    const started = Date.now();
-    const result = await runCodexCliBaselineBenchmark(benchmark, {
-      dryRun,
-      model,
-      timeoutMs,
-      keepFailures: !!args['keep-failures'],
-      useEnvOpenAIKey: !!args['use-env-openai-key'],
-      dangerouslyBypassSandbox: !!args['dangerously-bypass-sandbox'],
-      allowMcp: !!args['allow-mcp'],
-      disableMcpServers: normalizeListArg(args['disable-mcp']),
-    });
-    result.runId = runId;
-    result.timestamp = new Date().toISOString();
-    results.push(result);
-    if (args.record) {
-      result.artifactPath = writeBaselineArtifact(result, { resultsDir });
-    }
-    storeBaselineResult({ brain, runId, benchmark, result, model, timeoutMs });
-    console.log(`  Success: ${result.success}`);
-    if (result.status) console.log(`  Status: ${result.status}`);
-    if (result.score) console.log(`  Score: ${(result.score.composite || 0).toFixed(3)}`);
-    console.log(`  Files: ${(result.actualFileChanges || []).join(', ') || 'none'}`);
-    console.log(`  Tests: ${result.testsPassed == null ? 'N/A' : result.testsPassed}`);
-    console.log(`  Time: ${((Date.now() - started) / 1000).toFixed(1)}s`);
-    if (result.error) console.log(`  Error: ${String(result.error).split('\n')[0]}`);
-    if (result.sandboxDir) console.log(`  Kept sandbox: ${result.sandboxDir}`);
-    if (result.artifactPath) console.log(`  Artifact: ${result.artifactPath}`);
-    console.log('');
-  }
-  const summary = summarizeBaselineResults(results);
-  console.log('=== SUMMARY ===');
-  console.log(`Passed: ${summary.passed}/${summary.total}`);
-  console.log(`Average composite score: ${summary.avgComposite.toFixed(3)}`);
-  console.log(`Failures: ${JSON.stringify(summary.failures)}`);
-  if (!dryRun && summary.failed > 0) process.exitCode = 1;
-}
-function initBrain() {
-  try {
-    const brain = require('../brain');
-    brain.initDb();
-    return brain;
-  } catch (err) {
-    console.warn(`Brain not available: ${err.message}`);
-    return null;
-  }
-}
-function selectBenchmarks({ id, limit, difficulty } = {}) {
-  let selected = benchmarks;
-  if (id) selected = selected.filter((benchmark) => benchmark.id === id);
-  if (difficulty) selected = selected.filter((benchmark) => benchmark.difficulty === difficulty);
-  if (limit) selected = selected.slice(0, limit);
-  return selected;
-}
-function parseArgs(argv) {
-  const out = {};
-  for (let i = 0; i < argv.length; i += 1) {
-    const arg = argv[i];
-    if (!arg.startsWith('--')) continue;
-    const key = arg.slice(2);
-    if (['dry-run', 'record', 'keep-failures', 'use-env-openai-key', 'dangerously-bypass-sandbox', 'allow-mcp', 'help'].includes(key)) {
-      out[key] = true;
-    } else {
-      if (out[key] === undefined) out[key] = argv[i + 1];
-      else if (Array.isArray(out[key])) out[key].push(argv[i + 1]);
-      else out[key] = [out[key], argv[i + 1]];
-      i += 1;
-    }
-  }
-  return out;
-}
-function normalizeListArg(value) {
-  if (value == null) return null;
-  const values = Array.isArray(value) ? value : [value];
-  return values
-    .flatMap((entry) => String(entry || '').split(','))
-    .map((entry) => entry.trim())
-    .filter(Boolean);
-}
-function printHelp() {
-  console.log(`Usage:
-  node eval/run-codex-cli-baseline.js --id agent-001 --model gpt-5.5
-  node eval/run-codex-cli-baseline.js --limit 5 --model gpt-5.5 --record
-  node eval/run-codex-cli-baseline.js --dry-run --difficulty easy
-Options:
-  --id <id>                    Run one benchmark id
-  --limit <n>                  Limit selected benchmarks
-  --difficulty <easy|medium|hard>
-  --model <id>                 Forwarded verbatim to: codex exec -m <id>
-  --timeout <ms>               Per-benchmark timeout (default 600000)
-  --record                     Write result JSON artifacts
-  --results-dir <path>         Artifact directory
-  --keep-failures              Keep failed sandbox directories
-  --use-env-openai-key         Let codex inherit OPENAI_API_KEY instead of preferring ChatGPT auth
-  --allow-mcp                  Let codex load enabled MCP servers from local config
-  --disable-mcp <name[,name]>  Explicit MCP server names to disable; defaults to auto-discover enabled servers
-  --dangerously-bypass-sandbox Pass codex's bypass flag. Use only inside an external sandbox.
-  --dry-run                    Verify fixture setup only; does not invoke codex
-`);
-}
-main().catch((err) => {
-  console.error(err.stack || err.message);
-  process.exit(1);
-});