npm - create-walle - Versions diffs - 0.9.21 → 0.9.23 - Mend

create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (500) hide show

package/template/wall-e/eval/agent-runner.js DELETED Viewed

@@ -1,772 +0,0 @@
-'use strict';
-const fs = require('fs');
-const path = require('path');
-const os = require('os');
-const crypto = require('crypto');
-const { execFileSync, execFile } = require('child_process');
-const { promisify } = require('util');
-const execFileAsync = promisify(execFile);
-const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
-const { testCommandAllowed } = require('./allowed-test-commands');
-const { createArtifactTranscript, exportBenchmarkArtifacts } = require('./meta-harness/trace-exporter');
-const DEFAULT_TIMEOUT_MS = 600_000; // 10 minutes — coding agents can take long
-const FIXTURES_DIR = path.join(__dirname, 'fixtures');
-// Lazily resolved cost estimator; stays undefined until the first lookup.
-let _estimateProviderCost;
-/**
- * Return the provider cost estimator, loading it on first use.
- *
- * The estimator is read from the `estimateProviderCost` export of
- * `./head-to-head`. If that module cannot be loaded, or the export is not a
- * function, a stub that always returns 0 is cached instead so callers can
- * invoke the returned value unconditionally.
- *
- * @returns {Function} the cached estimator, or the zero-cost stub
- */
-function getEstimateProviderCost() {
-  if (typeof _estimateProviderCost !== 'function') {
-    let loaded;
-    try {
-      loaded = require('./head-to-head').estimateProviderCost;
-    } catch {
-      // Cost estimation is optional: fall through to the stub below.
-    }
-    _estimateProviderCost = typeof loaded === 'function' ? loaded : () => 0;
-  }
-  return _estimateProviderCost;
-}
-/**
- * Set up a sandboxed project from a fixture.
- * Copies the fixture into a fresh directory under <home>/.walle/bench-sandbox
- * and commits it to a new git repository (git setup is best-effort).
- *
- * @param {string} fixtureName - directory name under FIXTURES_DIR
- * @returns {string} path of the newly created sandbox directory
- * @throws {Error} if the fixture does not exist or cannot be copied
- */
-function setupSandbox(fixtureName) {
-  const fixtureDir = path.join(FIXTURES_DIR, fixtureName);
-  if (!fs.existsSync(fixtureDir)) {
-    throw new Error(`Fixture not found: ${fixtureName}`);
-  }
-  // Use HOME-based temp dir because local-tools restricts file access to $HOME
-  const homeDir = process.env.HOME || os.homedir();
-  const tmpDir = path.join(homeDir, '.walle', 'bench-sandbox', `bench-${crypto.randomUUID().slice(0, 8)}`);
-  try {
-    fs.mkdirSync(tmpDir, { recursive: true });
-    // Copy fixture files recursively
-    copyDirSync(fixtureDir, tmpDir);
-  } catch (err) {
-    // Callers only learn the sandbox path from the return value, so a
-    // half-populated directory would never be cleaned up. Remove it here
-    // before surfacing the original error.
-    try {
-      fs.rmSync(tmpDir, { recursive: true, force: true });
-    } catch { /* best-effort cleanup */ }
-    throw err;
-  }
-  // Init git
-  const gitIdentity = {
-    ...process.env,
-    GIT_AUTHOR_NAME: 'test',
-    GIT_AUTHOR_EMAIL: 'test@test.com',
-    GIT_COMMITTER_NAME: 'test',
-    GIT_COMMITTER_EMAIL: 'test@test.com',
-  };
-  try {
-    execFileSync('git', ['init'], { cwd: tmpDir, stdio: 'pipe' });
-    execFileSync('git', ['add', '.'], { cwd: tmpDir, stdio: 'pipe' });
-    execFileSync('git', ['commit', '-m', 'Initial fixture'], { cwd: tmpDir, stdio: 'pipe', env: gitIdentity });
-  } catch {
-    // git init is best-effort
-  }
-  return tmpDir;
-}
-/**
- * Recursively copy the contents of `src` into `dest`, creating `dest` first.
- * Directory entries are descended into; every other entry is copied with
- * fs.copyFileSync.
- */
-function copyDirSync(src, dest) {
-  fs.mkdirSync(dest, { recursive: true });
-  const children = fs.readdirSync(src, { withFileTypes: true });
-  children.forEach((child) => {
-    const from = path.join(src, child.name);
-    const to = path.join(dest, child.name);
-    if (!child.isDirectory()) {
-      fs.copyFileSync(from, to);
-      return;
-    }
-    copyDirSync(from, to);
-  });
-}
-/**
- * Remove a sandbox directory and everything beneath it.
- * Any failure is ignored: cleanup is best-effort.
- */
-function cleanupSandbox(dir) {
-  const removal = { recursive: true, force: true };
-  try {
-    fs.rmSync(dir, removal);
-  } catch {
-    // best-effort cleanup
-  }
-}
-/**
- * Run a single agent benchmark, re-running it when an attempt fails with a
- * retryable harness error.
- *
- * @param {object} benchmark - Benchmark entry with agentExpectations
- * @param {object} options - { runAgentLoop, brain, timeoutMs, provider, model,
- *   retryHarnessFailures, maxHarnessAttempts }. `retryHarnessFailures: false`
- *   forces a single attempt; otherwise `maxHarnessAttempts` (default 2) caps
- *   the number of attempts.
- * @returns {Promise<object>} Result with scores and metadata. `attempts` is
- *   always set; after a retry `latencyMs` spans all attempts and
- *   `previousErrors` lists the errors of the earlier attempts.
- */
-async function runAgentBenchmark(benchmark, options = {}) {
-  // Normalise the cap to a whole number >= 1. A non-numeric value would
-  // otherwise become NaN (the loop would never run and null would be
-  // returned), and a fractional value would let the final attempt's own
-  // error leak into previousErrors.
-  const requestedAttempts = Number(options.maxHarnessAttempts || 2);
-  const attemptCap = Number.isFinite(requestedAttempts)
-    ? Math.max(1, Math.floor(requestedAttempts))
-    : 2;
-  const maxAttempts = options.retryHarnessFailures === false ? 1 : attemptCap;
-  const started = Date.now();
-  const previousErrors = [];
-  let lastResult = null;
-  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
-    lastResult = await runAgentBenchmarkAttempt(benchmark, { ...options, _attempt: attempt });
-    lastResult.attempts = attempt;
-    if (attempt > 1) {
-      lastResult.latencyMs = Date.now() - started;
-      lastResult.previousErrors = previousErrors.slice();
-    }
-    if (!shouldRetryAgentBenchmarkResult(lastResult, attempt, maxAttempts)) return lastResult;
-    previousErrors.push(lastResult.error || 'retryable harness failure');
-  }
-  // Defensive fallback: the retry predicate stops on the final attempt, so
-  // the loop normally returns before reaching this point.
-  if (lastResult) {
-    lastResult.latencyMs = Date.now() - started;
-    lastResult.previousErrors = previousErrors.slice();
-  }
-  return lastResult;
-}
-/**
- * Execute one attempt of a single-prompt agent benchmark.
- *
- * Creates a sandbox from the benchmark's fixture, runs `runAgentLoop` raced
- * against a hard timeout, collects tool calls / file changes / test results,
- * scores the run and optionally exports artifacts. The sandbox is removed in
- * the `finally` block regardless of outcome.
- *
- * @param {object} benchmark - Benchmark entry; reads `id`, `prompt` and `agentExpectations`
- * @param {object} options - { runAgentLoop, timeoutMs, provider, model, artifactDir }
- * @returns {Promise<object>} Result object. Anything thrown inside the `try`
- *   is converted into a failed result with a composite score of 0.
- * @throws {Error} if `runAgentLoop` is not supplied (checked before the `try`)
- */
-async function runAgentBenchmarkAttempt(benchmark, options = {}) {
-  const { runAgentLoop, timeoutMs = DEFAULT_TIMEOUT_MS, provider, model, artifactDir } = options;
-  if (!runAgentLoop) throw new Error('runAgentLoop function is required');
-  const expectations = benchmark.agentExpectations || {};
-  const fixtureName = expectations.projectFixture || 'express-basic';
-  let sandboxDir;
-  let artifactContext = null;
-  let exportedArtifact = null;
-  const startTime = Date.now();
-  const startedAt = new Date(startTime).toISOString();
-  try {
-    sandboxDir = setupSandbox(fixtureName);
-    if (artifactDir) {
-      artifactContext = createArtifactTranscript({
-        artifactDir,
-        cwd: sandboxDir,
-        label: String(benchmark.prompt || '').slice(0, 160),
-        modelId: model || '',
-        modelProvider: provider?.type || String(provider || ''),
-      });
-    }
-    // Count tests before agent run
-    let testsBefore = null;
-    let totalTests = null;
-    if (testCommandAllowed(expectations.testCommand)) {
-      const testCounts = countTests(sandboxDir, expectations.testCommand);
-      testsBefore = testCounts.passed;
-      totalTests = testCounts.total;
-    }
-    // Run the agent loop with hard timeout safety net
-    const maxTurns = expectations.maxTurns || 20;
-    // Budget of 30 s per allowed turn; the agent gets the smaller of the
-    // caller's timeout and this budget.
-    const turnBudgetTimeout = maxTurns * 30000;
-    const effectiveTimeout = Math.min(timeoutMs || turnBudgetTimeout, turnBudgetTimeout);
-    const agentPromise = runAgentLoop(benchmark.prompt, {
-      cwd: sandboxDir,
-      timeoutMs: effectiveTimeout,
-      maxTurns,
-      provider,
-      model,
-      mode: 'build',
-      benchmark: true,
-      headless: true,
-      headlessPolicy: 'allow',
-      permissionTimeoutMs: 0,
-      persistTranscript: artifactContext ? true : false,
-      transcript: artifactContext?.transcript || null,
-    });
-    let timeoutHandle;
-    const timeoutPromise = new Promise((_, reject) => {
-      timeoutHandle = setTimeout(() => reject(new Error('Hard timeout exceeded')), effectiveTimeout + 60000); // +1min grace
-      // unref so a still-pending timer does not by itself keep the process alive
-      if (typeof timeoutHandle.unref === 'function') timeoutHandle.unref();
-    });
-    const result = await Promise.race([agentPromise, timeoutPromise]);
-    // NOTE(review): only reached when the race resolves; if it rejects, the
-    // (unref'd) timer is left pending until it fires.
-    if (timeoutHandle) clearTimeout(timeoutHandle);
-    const latencyMs = Date.now() - startTime;
-    // Estimate cost from LLM usage
-    const usage = result.usage || {};
-    const estimateCost = getEstimateProviderCost();
-    const costDollars = estimateCost(usage, provider?.type || provider || 'anthropic', model);
-    // Collect actual results
-    const actualToolCalls = extractToolCalls(result);
-    const toolCallDetails = extractToolCallDetails(result);
-    const actualFileChanges = await getModifiedFiles(sandboxDir);
-    // A runner id together with file changes counts as work even when no
-    // tool calls or token usage were reported.
-    const externalRunnerId = result.runnerId || result.fallback?.runnerId || null;
-    const externalRunnerWork = Boolean(externalRunnerId && actualFileChanges.length > 0);
-    const actualTurns = (result.log || []).length || actualToolCalls.length || (externalRunnerId ? 1 : 0);
-    // Run test command if specified (validate against allowlist)
-    let testsPassed = null;
-    let testsAfter = null;
-    if (testCommandAllowed(expectations.testCommand)) {
-      try {
-        execFileSync('sh', ['-c', expectations.testCommand], {
-          cwd: sandboxDir,
-          timeout: 30000,
-          stdio: 'pipe',
-        });
-        testsPassed = true;
-      } catch {
-        // Non-zero exit or the 30 s timeout both count as failing tests.
-        testsPassed = false;
-      }
-      // Count tests after agent run
-      const afterCounts = countTests(sandboxDir, expectations.testCommand);
-      testsAfter = afterCounts.passed;
-      if (totalTests === null) totalTests = afterCounts.total;
-    }
-    const inputTokens = usage.inputTokens ?? usage.input ?? 0;
-    const expectedFileChanges = expectations.expectedFileChanges || [];
-    const missingExpectedWork = expectedFileChanges.length > 0 && actualFileChanges.length === 0;
-    // True when any tool call name looks like a file-mutating tool.
-    const attemptedFileChange = actualToolCalls.some((call) => {
-      const name = typeof call === 'string' ? call : call?.name;
-      return /edit|write|patch|create|delete|modify/i.test(String(name || ''));
-    });
-    const testRegression = (expectations.testCommand && testsPassed === false);
-    const rawError = result.stderr || result.error || null;
-    const validatedByTests = Boolean(
-      expectations.testCommand &&
-      testsPassed === true &&
-      actualFileChanges.length > 0
-    );
-    // An error reported by the agent is not treated as fatal when the run
-    // changed files and the test command passed.
-    const fatalError = rawError && !validatedByTests ? rawError : null;
-    const noEffort = (actualToolCalls.length === 0 && !externalRunnerWork) ||
-      (inputTokens === 0 && !externalRunnerWork) ||
-      missingExpectedWork;
-    const hadError = !!fatalError;
-    const validatedSuccess = Boolean(result.success || validatedByTests || externalRunnerWork) && !hadError && !noEffort && !testRegression;
-    // Score the result
-    let score = scoreAgentResult(benchmark, {
-      actualToolCalls,
-      actualFileChanges,
-      actualTurns,
-      testsPassed,
-      output: result.output || '',
-      success: validatedSuccess,
-      sandboxDir,
-      costDollars,
-      testsBefore,
-      testsAfter,
-      totalTests,
-      toolCallDetails,
-    });
-    // Hard-zero floor: prevent runs that didn't actually do anything from
-    // scoring above 0. Without these gates, an agent that hits a 401 / makes
-    // zero tool calls / leaves tests broken still landed at composite ≈ 0.4
-    // through process-metric weights (turnEconomy, errorHandling, costEfficiency).
-    // That inflated past failure-investigation thresholds and reported FAIL as
-    // PASS. Cap explicitly here.
-    if (hadError || noEffort || testRegression) {
-      score = {
-        composite: 0,
-        dimensions: { ...(score.dimensions || {}), _zeroed: true,
-          _zeroReason: hadError
-            ? 'error'
-            : testRegression
-              ? 'tests_failed'
-              : missingExpectedWork
-                ? attemptedFileChange ? 'missing_expected_changes' : 'no_file_changes'
-                : 'no_effort' },
-      };
-    }
-    const finalResult = {
-      benchmarkId: benchmark.id,
-      success: validatedSuccess,
-      score,
-      latencyMs,
-      actualToolCalls,
-      actualFileChanges,
-      actualTurns,
-      testsPassed,
-      costDollars,
-      testsBefore,
-      testsAfter,
-      totalTests,
-      inputTokens: usage.inputTokens ?? usage.input ?? null,
-      outputTokens: usage.outputTokens ?? usage.output ?? null,
-      dimensionsJson: JSON.stringify(score.dimensions || {}),
-      output: (result.output || '').slice(0, 2000),
-      error: fatalError,
-    };
-    if (artifactDir) {
-      exportedArtifact = exportBenchmarkArtifacts({
-        artifactDir,
-        sandboxDir,
-        benchmark,
-        result: { ...finalResult, sessionId: result.sessionId || null, rawResult: result },
-        artifactContext,
-        startedAt,
-      });
-      finalResult.artifactPath = exportedArtifact.artifactDir;
-      finalResult.transcriptPath = exportedArtifact.transcriptPath;
-    }
-    return finalResult;
-  } catch (err) {
-    // Any failure above (sandbox setup, agent rejection, hard timeout, ...)
-    // becomes a zero-scored result instead of a thrown error.
-    const errorResult = {
-      benchmarkId: benchmark.id,
-      success: false,
-      score: { composite: 0, dimensions: {} },
-      latencyMs: Date.now() - startTime,
-      error: err.message,
-    };
-    // Artifacts are exported only when the sandbox was actually created.
-    if (artifactDir && sandboxDir) {
-      exportedArtifact = exportBenchmarkArtifacts({
-        artifactDir,
-        sandboxDir,
-        benchmark,
-        result: errorResult,
-        artifactContext,
-        startedAt,
-      });
-      errorResult.artifactPath = exportedArtifact.artifactDir;
-      errorResult.transcriptPath = exportedArtifact.transcriptPath;
-    }
-    return errorResult;
-  } finally {
-    if (sandboxDir) cleanupSandbox(sandboxDir);
-  }
-}
-/**
- * Decide whether a benchmark attempt should be re-run.
- * Only a failed result with attempts remaining and a retryable harness error
- * qualifies.
- */
-function shouldRetryAgentBenchmarkResult(result, attempt, maxAttempts) {
-  const settled = !result || result.success;
-  if (settled) return false;
-  if (attempt >= maxAttempts) return false;
-  return isRetryableHarnessFailure(result.error);
-}
-/**
- * Report whether an error's text matches one of the known transient harness
- * failures (aborts, timeouts, edit conflicts, SIGTERM'd commands).
- */
-function isRetryableHarnessFailure(error) {
-  const retryablePattern = /operation was aborted|AbortError|Hard timeout exceeded|modified since last read|edit-conflict|edit_file no-op|Command failed: SIGTERM|timed out/i;
-  const message = String(error || '');
-  return retryablePattern.test(message);
-}
-/**
- * Extract tool call names from an agent result.
- *
- * Reads `result.toolCalls` when it is non-empty; otherwise walks `result.log`
- * and collects each entry's `toolCalls` / `toolCall`. This mirrors the
- * fallback order of extractToolCallDetails so names and details describe the
- * same calls (an empty `toolCalls` array no longer hides calls recorded in
- * `log`).
- *
- * @param {object|null|undefined} result - agent result
- * @returns {Array} one item per call: `call.name` when truthy, otherwise the
- *   raw entry (e.g. a bare string)
- */
-function extractToolCalls(result) {
-  if (!result) return [];
-  const nameOf = (call) => call?.name || call;
-  if (result.toolCalls && result.toolCalls.length > 0) {
-    return result.toolCalls.map(nameOf);
-  }
-  if (!result.log) return [];
-  const names = [];
-  for (const entry of result.log) {
-    if (entry?.toolCalls) {
-      for (const call of entry.toolCalls) names.push(nameOf(call));
-    } else if (entry?.toolCall) {
-      names.push(nameOf(entry.toolCall));
-    }
-  }
-  return names;
-}
-/**
- * List files changed in a sandbox relative to HEAD: tracked files reported by
- * `git diff --name-only HEAD` followed by untracked, non-ignored files.
- * Resolves to an empty array if either git command fails.
- */
-async function getModifiedFiles(dir) {
-  const gitLines = async (args) => {
-    const { stdout } = await execFileAsync('git', args, { cwd: dir });
-    return stdout.trim().split('\n');
-  };
-  try {
-    const tracked = await gitLines(['diff', '--name-only', 'HEAD']);
-    const untracked = await gitLines(['ls-files', '--others', '--exclude-standard']);
-    return tracked.concat(untracked).filter(Boolean);
-  } catch {
-    return [];
-  }
-}
-/**
- * Score an agent run across multiple dimensions.
- * Builds the scorer input from the benchmark's expectations and the observed
- * run, then delegates to computeAgentScore from ./agent-scorer.
- *
- * @param {object} benchmark - benchmark entry; `agentExpectations` is optional
- * @param {object} actual - observed tool calls, file changes, turns, test and cost data
- * @returns {*} whatever computeAgentScore returns
- */
-function scoreAgentResult(benchmark, actual) {
-  const expected = benchmark.agentExpectations || {};
-  const { computeAgentScore } = require('./agent-scorer');
-  const {
-    actualToolCalls,
-    actualFileChanges,
-    actualTurns,
-    testsPassed,
-    success,
-    output,
-    sandboxDir,
-    costDollars,
-    testsBefore,
-    testsAfter,
-    totalTests,
-    toolCallDetails,
-  } = actual;
-  const scorerInput = {
-    actualToolCalls: actualToolCalls || [],
-    expectedToolCalls: expected.expectedToolCalls || [],
-    forbiddenToolCalls: expected.forbiddenToolCalls || [],
-    testsPassed: testsPassed ?? null,
-    success: success || false,
-    output: output || '',
-    actualFiles: actualFileChanges || [],
-    expectedFiles: expected.expectedFileChanges || [],
-    actualTurns: actualTurns || 0,
-    maxTurns: expected.maxTurns || 20,
-    consecutiveErrors: 0,
-    // Enhanced dimensions
-    sandboxDir: sandboxDir || null,
-    costDollars: costDollars ?? null,
-    testsBefore: testsBefore ?? null,
-    testsAfter: testsAfter ?? null,
-    totalTests: totalTests ?? null,
-    toolCallDetails: toolCallDetails || null,
-    shouldAskUser: expected.shouldAskUser || false,
-  };
-  return computeAgentScore(scorerInput);
-}
-/**
- * A result is trusted only when it succeeded, carries no error and its test
- * command passed.
- */
-function isTrustedAgentResult(result = {}) {
-  if (result.success !== true) return false;
-  if (result.error) return false;
-  return result.testsPassed === true;
-}
-/**
- * Run a multi-turn benchmark — sends each turn's prompt sequentially,
- * accumulating conversation context. Scores after the final turn.
- *
- * @param {object} benchmark - Benchmark entry; reads `id`, `turns` and `agentExpectations`
- * @param {object} options - { runAgentLoop, timeoutMs, provider, model }
- * @returns {Promise<object>} Result flagged `multiTurn: true`. Anything thrown
- *   inside the `try` is converted into a failed result with composite 0.
- * @throws {Error} if `runAgentLoop` is not supplied (checked before the `try`)
- */
-async function runMultiTurnBenchmark(benchmark, options = {}) {
-  const { runAgentLoop, timeoutMs = DEFAULT_TIMEOUT_MS, provider, model } = options;
-  if (!runAgentLoop) throw new Error('runAgentLoop function is required');
-  const expectations = benchmark.agentExpectations || {};
-  const fixtureName = expectations.projectFixture || 'express-basic';
-  const turns = benchmark.turns || [];
-  let sandboxDir;
-  const startTime = Date.now();
-  try {
-    sandboxDir = setupSandbox(fixtureName);
-    const messages = [];
-    let lastResult = null;
-    let allToolCalls = [];
-    let allToolCallDetails = [];
-    let totalUsage = { inputTokens: 0, outputTokens: 0 };
-    let totalTurns = 0;
-    // Turns run strictly in order; each call receives the shared `messages`
-    // array, which grows with every user prompt and assistant output.
-    for (const turn of turns) {
-      messages.push({ role: 'user', content: turn.prompt });
-      lastResult = await runAgentLoop(turn.prompt, {
-        cwd: sandboxDir,
-        timeoutMs: timeoutMs || (expectations.maxTurns || 20) * 30000,
-        provider,
-        model,
-        mode: 'build',
-        benchmark: true,
-        headless: true,
-        headlessPolicy: 'allow',
-        permissionTimeoutMs: 0,
-        persistTranscript: false,
-        messages, // pass accumulated conversation
-      });
-      allToolCalls.push(...extractToolCalls(lastResult));
-      allToolCallDetails.push(...extractToolCallDetails(lastResult));
-      // A turn with no log entries still counts as one turn.
-      totalTurns += (lastResult.log || []).length || 1;
-      const turnUsage = lastResult.usage || {};
-      totalUsage.inputTokens += turnUsage.inputTokens || turnUsage.input || 0;
-      totalUsage.outputTokens += turnUsage.outputTokens || turnUsage.output || 0;
-      if (lastResult.output) {
-        messages.push({ role: 'assistant', content: lastResult.output });
-      }
-    }
-    const latencyMs = Date.now() - startTime;
-    const estimateCost = getEstimateProviderCost();
-    const costDollars = estimateCost(totalUsage, provider?.type || provider || 'anthropic', model);
-    const actualFileChanges = await getModifiedFiles(sandboxDir);
-    // Runner id and success/error flags are taken from the final turn only.
-    const externalRunnerId = lastResult?.runnerId || lastResult?.fallback?.runnerId || null;
-    const externalRunnerWork = Boolean(externalRunnerId && actualFileChanges.length > 0);
-    const actualTurns = totalTurns || (externalRunnerId ? 1 : 0);
-    let testsPassed = null;
-    let testsAfter = null;
-    // NOTE(review): testsBefore is never assigned in this function, so it is
-    // always null in the score input and in the returned result.
-    let testsBefore = null;
-    let totalTests = null;
-    if (testCommandAllowed(expectations.testCommand)) {
-      try {
-        execFileSync('sh', ['-c', expectations.testCommand], { cwd: sandboxDir, timeout: 30000, stdio: 'pipe' });
-        testsPassed = true;
-      } catch {
-        // Non-zero exit or the 30 s timeout both count as failing tests.
-        testsPassed = false;
-      }
-      const afterCounts = countTests(sandboxDir, expectations.testCommand);
-      testsAfter = afterCounts.passed;
-      totalTests = afterCounts.total;
-    }
-    let score = scoreAgentResult(benchmark, {
-      actualToolCalls: allToolCalls,
-      actualFileChanges,
-      actualTurns,
-      testsPassed,
-      output: lastResult?.output || '',
-      success: lastResult?.success || false,
-      sandboxDir,
-      costDollars,
-      testsBefore,
-      testsAfter,
-      totalTests,
-      toolCallDetails: allToolCallDetails,
-    });
-    // Same hard-zero floor as single-turn — see runAgentBenchmark for rationale.
-    const inputTokens = totalUsage.inputTokens ?? 0;
-    const hadError = !!(lastResult?.stderr || lastResult?.error);
-    const noEffort = (allToolCalls.length === 0 && !externalRunnerWork) ||
-      (inputTokens === 0 && !externalRunnerWork);
-    const testRegression = (expectations.testCommand && testsPassed === false);
-    if (hadError || noEffort || testRegression) {
-      score = {
-        composite: 0,
-        dimensions: { ...(score.dimensions || {}), _zeroed: true,
-          _zeroReason: hadError ? 'error' : noEffort ? 'no_effort' : 'tests_failed' },
-      };
-    }
-    // NOTE(review): `success` below is the final turn's own flag and is not
-    // adjusted when the score was zeroed above.
-    return {
-      benchmarkId: benchmark.id,
-      multiTurn: true,
-      turnsCompleted: turns.length,
-      success: lastResult?.success || false,
-      score,
-      latencyMs,
-      actualToolCalls: allToolCalls,
-      actualFileChanges,
-      actualTurns,
-      testsPassed,
-      costDollars,
-      testsBefore,
-      testsAfter,
-      totalTests,
-      inputTokens: totalUsage.inputTokens ?? null,
-      outputTokens: totalUsage.outputTokens ?? null,
-      dimensionsJson: JSON.stringify(score.dimensions || {}),
-      output: (lastResult?.output || '').slice(0, 2000),
-      error: lastResult?.stderr || lastResult?.error || null,
-    };
-  } catch (err) {
-    // Any failure above becomes a zero-scored result instead of a thrown error.
-    return {
-      benchmarkId: benchmark.id,
-      multiTurn: true,
-      success: false,
-      score: { composite: 0, dimensions: {} },
-      latencyMs: Date.now() - startTime,
-      error: err.message,
-    };
-  } finally {
-    if (sandboxDir) cleanupSandbox(sandboxDir);
-  }
-}
-/**
- * Resolve the model name that will be recorded for a run.
- * Precedence: explicit `model`, then WALLE_MODEL_COMPLEX, then WALLE_MODEL,
- * then the built-in default. Mirrors the resolution logic in
- * coding-orchestrator.js.
- */
-function resolveModelName(model) {
-  const { WALLE_MODEL_COMPLEX, WALLE_MODEL } = process.env;
-  const candidates = [model, WALLE_MODEL_COMPLEX, WALLE_MODEL];
-  return candidates.find(Boolean) || 'claude-haiku-4-5-20251001';
-}
-/**
- * Run the full coding-agent benchmark suite.
- *
- * Loads ./benchmarks/coding-agent.json, runs every entry sequentially
- * (multi-turn entries via runMultiTurnBenchmark, the rest via
- * runAgentBenchmark), stores each result through `brain.insertBenchmarkResult`
- * when available, and returns a summary with the average composite score.
- *
- * @param {object} options - { brain, runAgentLoop, provider, model, timeoutMs, signal }
- * @returns {Promise<object>} { runId, suite, totalBenchmarks, completed, avgScore, results }
- * @throws {Error} if the benchmark definitions cannot be loaded
- */
-async function runAgentBenchmarkSuite(options = {}) {
-  const { brain, runAgentLoop, provider, model, timeoutMs, signal } = options;
-  let benchmarks;
-  try {
-    benchmarks = require('./benchmarks/coding-agent.json');
-  } catch (err) {
-    throw new Error(`Failed to load coding-agent benchmarks: ${err.message}`);
-  }
-  // One id shared by every result of this suite run.
-  const runId = crypto.randomUUID();
-  const results = [];
-  for (const benchmark of benchmarks) {
-    // NOTE(review): `signal` is only checked between benchmarks; it is not
-    // passed to the runner, and neither is an `artifactDir` option.
-    if (signal?.aborted) break;
-    const runner = benchmark.multiTurn ? runMultiTurnBenchmark : runAgentBenchmark;
-    const result = await runner(benchmark, {
-      runAgentLoop,
-      brain,
-      timeoutMs,
-      provider,
-      model,
-    });
-    result.runId = runId;
-    result.timestamp = new Date().toISOString();
-    results.push(result);
-    // Store result
-    if (brain && typeof brain.insertBenchmarkResult === 'function') {
-      try {
-        const scoringMethod = benchmark.agentExpectations?.testCommand
-          ? 'agent-rubric+tests'
-          : 'agent-rubric';
-        brain.insertBenchmarkResult(decorateBenchmarkResult({
-          runId,
-          suite: 'coding-agent',
-          promptId: benchmark.id,
-          taskType: 'coding-agent',
-          difficulty: benchmark.difficulty,
-          provider: provider?.type || 'default',
-          model: resolveModelName(model),
-          prompt: benchmark.prompt,
-          response: result.output || '',
-          traitScore: null,
-          matchedTraits: [],
-          compositeScore: result.score?.composite || 0,
-          latencyMs: result.latencyMs,
-          error: result.error,
-          timestamp: result.timestamp,
-          // Enhanced metrics
-          costDollars: result.costDollars || null,
-          testsBefore: result.testsBefore ?? null,
-          testsAfter: result.testsAfter ?? null,
-          totalTests: result.totalTests ?? null,
-          dimensionsJson: result.dimensionsJson || null,
-          inputTokens: result.inputTokens ?? null,
-          outputTokens: result.outputTokens ?? null,
-          scorerVersion: DEFAULT_SCORER_VERSION,
-          scoringMethod,
-          trusted: isTrustedAgentResult(result),
-          runConfig: { timeoutMs, scoringMethod },
-        }, {
-          suite: 'coding-agent',
-          benchmark,
-          runId,
-          provider: provider?.type || 'default',
-          model: resolveModelName(model),
-          scoringMethod,
-          scorerVersion: DEFAULT_SCORER_VERSION,
-          trusted: isTrustedAgentResult(result),
-          runConfig: { timeoutMs, scoringMethod },
-        }));
-      } catch { /* non-fatal */ }
-    }
-  }
-  // Compute summary
-  // Results without a composite score contribute 0 to the average.
-  const avgScore = results.length > 0
-    ? results.reduce((sum, r) => sum + (r.score?.composite || 0), 0) / results.length
-    : 0;
-  return {
-    runId,
-    suite: 'coding-agent',
-    totalBenchmarks: benchmarks.length,
-    completed: results.length,
-    avgScore,
-    results,
-  };
-}
-/**
- * Detect regressions by comparing current results against stored baselines.
- *
- * The baseline for a benchmark is the mean composite score of its stored
- * coding-agent results from the last 30 days; benchmarks with fewer than three
- * stored results are skipped. A regression is reported when the current score
- * is more than `thresholdPct` percent below that mean.
- *
- * @param {object} brain - store exposing getBenchmarkResults({ suite, days })
- * @param {Array<object>} currentResults - results carrying `benchmarkId` and `score.composite`
- * @param {object} [opts]
- * @param {number} [opts.thresholdPct=10] - minimum percentage drop to report
- * @returns {Array<{benchmarkId: *, baselineAvg: number, currentScore: number, dropPct: number}>}
- */
-function detectRegressions(brain, currentResults, { thresholdPct = 10 } = {}) {
-  if (!brain || typeof brain.getBenchmarkResults !== 'function') return [];
-  const historical = brain.getBenchmarkResults({ suite: 'coding-agent', days: 30 });
-  if (!Array.isArray(historical) || historical.length === 0) return [];
-  // Group historical scores by benchmark id. A Map (keyed by the stringified
-  // id) is used instead of a plain object so ids such as "toString" or
-  // "constructor" cannot collide with Object.prototype members.
-  const baselines = new Map();
-  for (const h of historical) {
-    const key = String(h.benchmark_id || h.promptId);
-    const score = h.composite_score || h.compositeScore || 0;
-    const scores = baselines.get(key);
-    if (scores) scores.push(score);
-    else baselines.set(key, [score]);
-  }
-  const regressions = [];
-  for (const result of currentResults || []) {
-    const baseline = baselines.get(String(result.benchmarkId));
-    if (!baseline || baseline.length < 3) continue;
-    const avgBaseline = baseline.reduce((a, b) => a + b, 0) / baseline.length;
-    // A zero baseline cannot regress, and dividing by it would yield NaN.
-    if (avgBaseline === 0) continue;
-    const currentScore = result.score?.composite || 0;
-    const dropPct = ((avgBaseline - currentScore) / avgBaseline) * 100;
-    if (dropPct > thresholdPct) {
-      regressions.push({
-        benchmarkId: result.benchmarkId,
-        baselineAvg: avgBaseline,
-        currentScore,
-        dropPct: Math.round(dropPct),
-      });
-    }
-  }
-  return regressions;
-}
-/**
- * Count passing/total tests by running the test command through `sh -c`
- * (stderr merged into stdout, exit status ignored) and parsing the output.
- * Best-effort — returns { passed: null, total: null } when no known format
- * matches or the command cannot be run.
- *
- * Formats are tried in this order:
- *   1. node:test — "# pass N" together with "# tests N"
- *   2. "N passing" with an optional "N failing"
- *   3. "N passed" with an optional "N failed"
- */
-function countTests(cwd, testCommand) {
-  const unknown = () => ({ passed: null, total: null });
-  // First capture group of `pattern` as an integer, or null when absent.
-  const firstInt = (text, pattern) => {
-    const hit = text.match(pattern);
-    return hit ? parseInt(hit[1], 10) : null;
-  };
-  try {
-    const output = execFileSync('sh', ['-c', testCommand + ' 2>&1 || true'], {
-      cwd,
-      timeout: 30000,
-      stdio: ['pipe', 'pipe', 'pipe'],
-    }).toString();
-    // node:test reports both numbers explicitly; both must be present.
-    const tapPassed = firstInt(output, /# pass\s+(\d+)/);
-    const tapTotal = firstInt(output, /# tests\s+(\d+)/);
-    if (tapPassed !== null && tapTotal !== null) {
-      return { passed: tapPassed, total: tapTotal };
-    }
-    // Remaining formats give a pass count and, optionally, a failure count.
-    const passFailFormats = [
-      [/(\d+)\s+passing/, /(\d+)\s+failing/],
-      [/(\d+)\s+passed/, /(\d+)\s+failed/],
-    ];
-    for (const [passPattern, failPattern] of passFailFormats) {
-      const passed = firstInt(output, passPattern);
-      if (passed === null) continue;
-      const failed = firstInt(output, failPattern) ?? 0;
-      return { passed, total: passed + failed };
-    }
-    return unknown();
-  } catch {
-    return unknown();
-  }
-}
-/**
- * Extract detailed tool call info (name + args + result) from an agent result.
- *
- * Reads `result.toolCalls` when it is non-empty; otherwise walks `result.log`
- * and collects each entry's `toolCalls` / `toolCall`. Every call is normalised
- * the same way in both branches: a bare string becomes the call's name (so
- * names agree with extractToolCalls), `args` falls back to `input`, `result`
- * falls back to `output`, and missing values become '' / {}.
- *
- * @param {object|null|undefined} result - agent result
- * @returns {Array<{name: string, args: *, result: *}>}
- */
-function extractToolCallDetails(result) {
-  if (!result) return [];
-  const toDetail = (call) => {
-    if (typeof call === 'string') return { name: call, args: {}, result: '' };
-    const fields = call || {};
-    return {
-      name: fields.name || '',
-      args: fields.args || fields.input || {},
-      result: fields.result || fields.output || '',
-    };
-  };
-  if (result.toolCalls && result.toolCalls.length > 0) {
-    return result.toolCalls.map(toDetail);
-  }
-  if (!result.log) return [];
-  const details = [];
-  for (const entry of result.log) {
-    if (entry?.toolCalls) {
-      for (const call of entry.toolCalls) details.push(toDetail(call));
-    } else if (entry?.toolCall) {
-      details.push(toDetail(entry.toolCall));
-    }
-  }
-  return details;
-}
-// Public surface of the agent benchmark runner.
-module.exports = {
-  setupSandbox,
-  cleanupSandbox,
-  runAgentBenchmark,
-  runMultiTurnBenchmark,
-  runAgentBenchmarkSuite,
-  scoreAgentResult,
-  isTrustedAgentResult,
-  extractToolCalls,
-  extractToolCallDetails,
-  countTests,
-  detectRegressions,
-  isRetryableHarnessFailure,
-  // Re-exported from ./allowed-test-commands.
-  testCommandAllowed,
-  resolveModelName,
-  FIXTURES_DIR,
-};