npm - create-walle - Versions diffs - 0.9.21 → 0.9.23 - Mend

create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (500) hide show

package/template/wall-e/eval/agent-scorer.js DELETED Viewed

@@ -1,461 +0,0 @@
-'use strict';
-const { execFileSync } = require('child_process');
-const path = require('path');
-/**
- * Score coding agent performance across multiple dimensions.
- * Used for both benchmark evaluation and post-session evaluation.
- *
- * 11 dimensions (weights sum to 1.0):
- *   correctness(0.25), toolEfficiency(0.10), diffAccuracy(0.10),
- *   costEfficiency(0.10), planQuality(0.08), errorHandling(0.08),
- *   turnEconomy(0.07), codeQuality(0.07), partialProgress(0.05),
- *   contextManagement(0.05), iterativeRefinement(0.05)
- */
/**
 * Default weight of each scoring dimension in the composite score.
 * The eleven weights sum to 1.0.
 *
 * Frozen because this object is used directly as the default `weights`
 * argument of `computeAgentScore`; an accidental in-place mutation would
 * otherwise silently change every later score. Callers that need different
 * weights should pass their own object (e.g. `{ ...DIMENSION_WEIGHTS, x: 0.2 }`).
 */
const DIMENSION_WEIGHTS = Object.freeze({
  correctness: 0.25,
  toolEfficiency: 0.10,
  diffAccuracy: 0.10,
  costEfficiency: 0.10,
  planQuality: 0.08,
  errorHandling: 0.08,
  turnEconomy: 0.07,
  codeQuality: 0.07,
  partialProgress: 0.05,
  contextManagement: 0.05,
  iterativeRefinement: 0.05,
});
/**
 * Compute the tool efficiency score.
 * Rewards use of the expected tools and penalizes forbidden tools and heavy
 * repetition.
 *
 * Entries may be plain tool names or objects with a `name` property; both are
 * reduced to a name before comparison so that object-shaped calls are matched
 * against `expected` / `forbidden` the same way string entries are.
 *
 * @param {Array<string|{name: string}>} actual - tool calls the agent made
 * @param {Array<string|{name: string}>} [expected] - tools the agent should use
 * @param {Array<string|{name: string}>} [forbidden] - tools the agent must not use
 * @returns {number} score in [0, 1]; 0.1 when no tool was called
 */
function scoreToolEfficiency(actual, expected = [], forbidden = []) {
  if (!actual || actual.length === 0) return 0.1;

  const toName = (call) => (typeof call === 'string' ? call : (call && call.name) || '');
  const actualNames = actual.map(toName);
  // `|| []` covers an explicit null, which default parameters do not.
  const expectedNames = (expected || []).map(toName);
  const forbiddenNames = new Set((forbidden || []).map(toName));
  const actualSet = new Set(actualNames);

  // Base score for any tool usage; replaced by the coverage of expected tools
  // when expectations are provided.
  let score = 0.5;
  if (expectedNames.length > 0) {
    const overlap = expectedNames.filter((name) => actualSet.has(name)).length;
    score = overlap / expectedNames.length;
  }

  // Any forbidden tool halves the score.
  if (actualNames.some((name) => forbiddenNames.has(name))) score *= 0.5;

  // Fewer than 30% distinct calls means too many repeated calls.
  if (actualSet.size / actualNames.length < 0.3) score *= 0.7;

  return Math.min(1, Math.max(0, score));
}
/**
 * Rate correctness from the test outcome, the reported success flag, the
 * produced output and whether any file was modified.
 *
 * A passing test run with no modified files scores low: the tests passed
 * without the agent doing any work.
 *
 * @param {object} opts
 * @param {boolean|null} opts.testsPassed - true / false, anything else = unknown
 * @param {boolean} opts.success - whether the agent reported success
 * @param {string} opts.output - agent output text
 * @param {string[]} opts.actualFiles - files the agent modified
 * @returns {number} one of 1.0, 0.7, 0.4, 0.3, 0.2, 0.1
 */
function scoreCorrectness({ testsPassed, success, output, actualFiles }) {
  const touchedFiles = Boolean(actualFiles && actualFiles.length > 0);

  // Known test outcome takes priority over everything else.
  if (testsPassed === true) {
    return touchedFiles ? 1.0 : 0.2;
  }
  if (testsPassed === false) {
    return 0.2;
  }

  // No test signal: fall back to the self-reported success flag.
  // "Success" without any change is treated as suspicious.
  if (success) {
    return touchedFiles ? 0.7 : 0.3;
  }

  // Neither tests nor success: a substantial output earns a little credit.
  return output && output.length > 100 ? 0.4 : 0.1;
}
/**
 * Score plan quality from todo usage, planning language in the output, and
 * whether exploration happened before the first write.
 *
 * Starts at 0.3 and adds:
 *   +0.3 if an `update_todos` call is present,
 *   +0.2 if the output mentions "step N", "phase N" or "plan:",
 *   +0.2 if the first read/glob/grep call precedes the first write/edit call.
 *
 * Entries may be tool names or objects with a `name`; null entries and
 * entries without a name are tolerated and treated as unnamed calls.
 *
 * @param {Array<string|{name: string}>} toolCalls - tool calls in call order
 * @param {string} output - agent output text
 * @returns {number} score in [0.3, 1]
 */
function scorePlanQuality(toolCalls, output) {
  const names = (Array.isArray(toolCalls) ? toolCalls : []).map(
    (call) => (typeof call === 'string' ? call : (call && call.name) || ''),
  );

  let score = 0.3; // baseline
  if (names.includes('update_todos')) score += 0.3;

  // Structured planning language in the output.
  if (/step\s*\d|phase\s*\d|plan:/i.test(output || '')) score += 0.2;

  // Reading or searching before the first write.
  const firstRead = names.findIndex((name) => /read_file|glob|grep/.test(name));
  const firstWrite = names.findIndex((name) => /write_file|edit_file/.test(name));
  if (firstRead >= 0 && firstWrite >= 0 && firstRead < firstWrite) score += 0.2;

  return Math.min(1, score);
}
/**
 * Measure how closely the set of modified files matches the expected set,
 * as the F1 score (harmonic mean of precision and recall) over file names.
 *
 * @param {string[]} actualFiles - files the agent modified
 * @param {string[]} expectedFiles - files that were expected to change
 * @returns {number} 0.5 when there are no expectations, 0.1 when nothing was
 *   modified or nothing matched, otherwise the F1 score
 */
function scoreDiffAccuracy(actualFiles, expectedFiles) {
  if (!expectedFiles || expectedFiles.length === 0) {
    return 0.5;
  }
  if (!actualFiles || actualFiles.length === 0) {
    return 0.1;
  }

  const wanted = new Set(expectedFiles);
  const touched = new Set(actualFiles);

  // Count expected files that were actually modified.
  let hits = 0;
  for (const file of wanted) {
    if (touched.has(file)) hits += 1;
  }

  const precision = touched.size > 0 ? hits / touched.size : 0;
  const recall = wanted.size > 0 ? hits / wanted.size : 0;
  const denominator = precision + recall;

  // No overlap at all: floor score instead of dividing by zero.
  return denominator === 0 ? 0.1 : 2 * (precision * recall) / denominator;
}
/**
 * Rate how economically the agent used its turns: the smaller the share of
 * the turn budget consumed, the higher the score.
 *
 * @param {number} actualTurns - turns the agent used
 * @param {number} [maxTurns=20] - turn budget
 * @returns {number} 0 for no turns, 1.0 up to half the budget, 0.7 within the
 *   budget, 0.3 up to 1.5x the budget, 0.1 beyond that
 */
function scoreTurnEconomy(actualTurns, maxTurns = 20) {
  if (actualTurns <= 0) return 0;

  // [upper bound on turns, score], checked from strictest to loosest.
  const tiers = [
    [maxTurns * 0.5, 1.0],
    [maxTurns, 0.7],
    [maxTurns * 1.5, 0.3],
  ];
  for (const [limit, tierScore] of tiers) {
    if (actualTurns <= limit) return tierScore;
  }
  return 0.1;
}
/**
 * Score error handling: doom-loop avoidance and recovery from errors.
 *
 * Starts at 0.7, subtracts 0.2 once if the same named tool is called three
 * times in a row, and subtracts 0.3 if there were 3+ consecutive errors.
 *
 * Entries may be tool names or objects with a `name`. Null entries and
 * entries without a name are treated as unnamed calls; unnamed calls never
 * count as a repetition, because nothing shows they are the same tool.
 *
 * @param {Array<string|{name: string}>} toolCalls - tool calls in call order
 * @param {number} [consecutiveErrors=0] - longest run of consecutive errors
 * @returns {number} score in [0, 1]
 */
function scoreErrorHandling(toolCalls, consecutiveErrors = 0) {
  const names = (Array.isArray(toolCalls) ? toolCalls : []).map(
    (call) => (typeof call === 'string' ? call : (call && call.name) || ''),
  );

  let score = 0.7; // baseline

  // Doom loop: the same named tool three times in a row (penalized once).
  const hasDoomLoop = names.some(
    (name, i) => i >= 2 && name !== '' && name === names[i - 1] && name === names[i - 2],
  );
  if (hasDoomLoop) score -= 0.2;

  // Excessive consecutive errors.
  if (consecutiveErrors >= 3) score -= 0.3;

  return Math.max(0, Math.min(1, score));
}
/**
 * Score code quality using eslint static analysis on modified files.
 * Falls back to a heuristic (0.7 on success, 0.3 otherwise) if there is no
 * sandbox, no modified JS/TS file, or eslint produced no usable JSON report.
 *
 * The lint rules are passed with `--rule`. ESLint's `-c/--config` flag takes
 * a path to a config file, so handing it inline JSON makes the run fail and
 * forces the fallback on every call. `--env node,es2021` is set so that
 * `no-undef` does not flag Node globals and modern syntax parses.
 * NOTE(review): `--no-eslintrc` and `--env` belong to ESLint's eslintrc mode
 * (ESLint 8 and earlier, to my knowledge) — confirm against the ESLint
 * version that `npx` resolves in the sandbox.
 *
 * The JSON report is read whether eslint exits 0 or non-zero, so warnings
 * (which alone do not cause a non-zero exit) still lower the score.
 *
 * @param {object} opts
 * @param {string} opts.sandboxDir - directory containing modified files
 * @param {string[]} opts.actualFiles - list of modified file paths (relative to sandboxDir)
 * @param {boolean} opts.success - whether the agent reported success
 * @returns {number} score in [0, 1]
 */
function scoreCodeQuality({ sandboxDir, actualFiles, success } = {}) {
  const fallback = success ? 0.7 : 0.3;
  if (!sandboxDir || !actualFiles || actualFiles.length === 0) {
    return fallback;
  }

  // Filter to JS/TS files only — eslint won't help for other types
  const jsFiles = actualFiles
    .filter((f) => typeof f === 'string' && /\.(js|ts|mjs|cjs)$/.test(f))
    .map((f) => path.resolve(sandboxDir, f));
  if (jsFiles.length === 0) {
    return fallback;
  }

  // Turn eslint's JSON report into a score; null when it is not a report.
  const scoreFromReport = (raw) => {
    const text = raw ? raw.toString().trim() : '';
    if (!text.startsWith('[')) return null;
    try {
      const report = JSON.parse(text);
      if (!Array.isArray(report)) return null;
      let errors = 0;
      let warnings = 0;
      for (const file of report) {
        errors += (file && file.errorCount) || 0;
        warnings += (file && file.warningCount) || 0;
      }
      // -0.1 per error, -0.03 per warning
      return Math.min(1, Math.max(0, 1.0 - (errors * 0.1 + warnings * 0.03)));
    } catch (parseError) {
      // Not valid JSON after all — the caller decides what that means.
      return null;
    }
  };

  let stdout;
  try {
    stdout = execFileSync('npx', [
      'eslint', '--format', 'json', '--no-eslintrc',
      '--env', 'node,es2021',
      '--rule', 'no-undef: error',
      '--rule', 'no-unused-vars: warn',
      ...jsFiles,
    ], {
      cwd: sandboxDir,
      timeout: 15000,
      stdio: ['pipe', 'pipe', 'pipe'],
      env: { ...process.env, NODE_ENV: 'test' },
    });
  } catch (err) {
    // eslint exits non-zero if there are errors — stdout still has the JSON.
    // A missing eslint, a timeout or a usage error leaves no report, which
    // falls back to the heuristic.
    const issueScore = scoreFromReport(err.stdout);
    return issueScore === null ? fallback : issueScore;
  }

  // Exit 0 means no errors, but the report may still list warnings.
  const cleanScore = scoreFromReport(stdout);
  return cleanScore === null ? 1.0 : cleanScore;
}
/**
 * Score cost efficiency — quality per dollar spent.
 * Higher quality at lower cost = better score.
 * The raw ratio is qualityScore / max(costDollars, 0.001).
 *
 * A missing cost (undefined / null) or a cost that is not a number (NaN)
 * yields the neutral score 0.5 instead of propagating NaN into the result.
 *
 * @param {number} qualityScore - composite quality score (0-1) from other dimensions
 * @param {number} costDollars - actual cost in USD
 * @param {number} [cohortMaxRatio] - max quality/cost ratio in cohort for normalization
 * @returns {number} score in [0, 1]
 */
function scoreCostEfficiency(qualityScore, costDollars, cohortMaxRatio = 0) {
  if (costDollars === undefined || costDollars === null) return 0.5; // neutral if no cost data
  const cost = Number(costDollars);
  if (Number.isNaN(cost)) return 0.5; // unusable cost data is treated like no data

  // A non-numeric quality counts as zero quality.
  const quality = Number(qualityScore) || 0;
  // Costs below a tenth of a cent (including 0 and negatives) are floored to
  // avoid division by zero and runaway ratios.
  const ratio = quality / Math.max(cost, 0.001);

  if (cohortMaxRatio > 0) {
    return Math.min(1, Math.max(0, ratio / cohortMaxRatio));
  }
  // Without cohort context, use heuristic: ratio of 100 (good quality for $0.01) = 1.0
  // ratio of 1 ($1 for score 1.0) = low
  return Math.max(0, Math.min(1, ratio / 100));
}
/**
 * Rate partial progress as the share of the test suite the agent fixed:
 * (testsAfter - testsBefore) / totalTests, clamped to [0, 1].
 * A regression (negative fix rate) therefore scores 0.
 *
 * @param {number} testsBefore - tests passing before agent run
 * @param {number} testsAfter - tests passing after agent run
 * @param {number} totalTests - total number of tests
 * @returns {number} score in [0, 1]; 0.5 (neutral) when any input is missing
 *   or totalTests is not positive
 */
function scorePartialProgress(testsBefore, testsAfter, totalTests) {
  const NEUTRAL = 0.5;
  const isMissing = (value) => value === undefined || value === null;

  if (isMissing(totalTests) || totalTests <= 0) return NEUTRAL;
  if (isMissing(testsBefore) || isMissing(testsAfter)) return NEUTRAL;

  const gained = testsAfter - testsBefore;
  const fixRate = gained / totalTests;
  return Math.min(1, Math.max(0, fixRate));
}
/**
 * Score context management — penalize redundant file reads, reward targeted access.
 *
 * Starts at 0.7 when at least one read call is present, subtracts 0.1 for
 * every read of the same file beyond the second, and adds up to 0.3 in
 * proportion to the share of reads that used offset / limit / line_range.
 *
 * Reads whose file path is unknown (bare tool-name strings, or calls without
 * a path argument) count as reads but are left out of the re-read penalty:
 * nothing shows that they targeted the same file. Counts are kept in a Map so
 * that paths such as "constructor" cannot collide with inherited object
 * properties.
 *
 * @param {Array} toolCallDetails - array of { name, args } objects with full
 *   call info; bare tool-name strings are also accepted
 * @returns {number} score in [0, 1]; 0.5 (neutral) when there are no calls or no reads
 */
function scoreContextManagement(toolCallDetails) {
  if (!toolCallDetails || toolCallDetails.length === 0) return 0.5; // neutral

  const readsByFile = new Map(); // file path -> read count
  let targetedReads = 0;
  let totalReads = 0;

  for (const call of toolCallDetails) {
    const isRecord = typeof call === 'object' && call !== null;
    const name = typeof call === 'string' ? call : (isRecord && call.name) || '';
    if (!/read_file|Read/.test(name)) continue;

    const args = (isRecord && call.args) || {};
    totalReads += 1;

    const filePath = args.file_path || args.path || args.file;
    if (filePath) {
      readsByFile.set(filePath, (readsByFile.get(filePath) || 0) + 1);
    }

    // Reward targeted reads (using offset/limit)
    if (args.offset !== undefined || args.limit !== undefined || args.line_range) {
      targetedReads += 1;
    }
  }

  if (totalReads === 0) return 0.5; // no reads = neutral

  let score = 0.7; // baseline
  // Penalize re-reading same file >2 times: -0.1 per extra re-read beyond 2
  for (const count of readsByFile.values()) {
    if (count > 2) score -= 0.1 * (count - 2);
  }
  // Up to +0.3 when every read is targeted
  score += (targetedReads / totalReads) * 0.3;

  return Math.min(1, Math.max(0, score));
}
/**
 * Score iterative refinement — reward edit -> test fail -> re-edit -> test pass sequences.
 *
 * For every edit call (except one in the last position) the next shell call
 * within the following 3 calls is taken as its test run and counted as one
 * edit->test pair. If that run's result text matches /error|fail|exception/i,
 * the function looks for a re-edit within the next 2 calls and then a shell
 * call within the 2 calls after that; a cycle is counted when that second
 * run's result does not match the failure pattern.
 *
 * Score = 0.5 + 0.5 * (refinement cycles / edit->test pairs).
 *
 * Entries may be bare tool names or { name, args, result } objects; null
 * entries are tolerated and treated as unnamed calls with an empty result.
 *
 * @param {Array} toolCallDetails - array of { name, args, result } objects
 * @returns {number} score in [0, 1]; 0.5 (neutral) with fewer than 3 calls or
 *   no edit->test pair
 */
function scoreIterativeRefinement(toolCallDetails) {
  if (!toolCallDetails || toolCallDetails.length < 3) return 0.5; // neutral — not enough data

  const EDIT_CALL = /^(edit_file|write_file|Edit|Write)$/;
  const SHELL_CALL = /^(run_shell|bash|Bash)$/;
  const FAILURE_TEXT = /error|fail|exception/i;

  const calls = toolCallDetails.map((entry) => {
    const isRecord = typeof entry === 'object' && entry !== null;
    const name = typeof entry === 'string' ? entry : (isRecord && entry.name) || '';
    const result = (isRecord && entry.result) || '';
    return { name, result: typeof result === 'string' ? result : JSON.stringify(result) };
  });

  // Index of the first call in [from, from + span) whose name matches, or -1.
  const findWithin = (from, span, pattern) => {
    const end = Math.min(from + span, calls.length);
    for (let idx = from; idx < end; idx++) {
      if (pattern.test(calls[idx].name)) return idx;
    }
    return -1;
  };

  let refinementCycles = 0;
  let totalEditTestPairs = 0;

  for (let i = 0; i < calls.length - 1; i++) {
    if (!EDIT_CALL.test(calls[i].name)) continue;

    // First test/run shortly after this edit.
    const testIdx = findWithin(i + 1, 3, SHELL_CALL);
    if (testIdx < 0) continue;
    totalEditTestPairs++;
    if (!FAILURE_TEXT.test(calls[testIdx].result)) continue; // passed first time

    // Re-edit after the failed test, then the run that follows it.
    const reEditIdx = findWithin(testIdx + 1, 2, EDIT_CALL);
    if (reEditIdx < 0) continue;
    const reTestIdx = findWithin(reEditIdx + 1, 2, SHELL_CALL);
    if (reTestIdx >= 0 && !FAILURE_TEXT.test(calls[reTestIdx].result)) {
      refinementCycles++;
    }
  }

  if (totalEditTestPairs === 0) return 0.5; // no edit->test pairs = neutral

  // Base 0.5 + bonus for the share of pairs that ended in a successful refinement.
  const score = 0.5 + (refinementCycles / Math.max(totalEditTestPairs, 1)) * 0.5;
  return Math.min(1, Math.max(0, score));
}
/**
 * Score ambiguity handling — agent should ask for clarification before writing
 * when the prompt is vague/ambiguous.
 *
 * Only `expectations.shouldAskUser` is read. A missing tool-call list, null
 * entries and a null `expectations` object are tolerated instead of throwing.
 *
 * @param {Array<string|{name: string}>} actualToolCalls - tool calls (names or objects with a name)
 * @param {object} expectations - benchmark agentExpectations (shouldAskUser, forbiddenToolCalls)
 * @returns {number} 0.5 when this is not an ambiguity benchmark; otherwise
 *   1.0 (asked, did not write), 0.6 (neither asked nor wrote),
 *   0.4 (asked and wrote) or 0.0 (wrote without asking)
 */
function scoreAmbiguityHandling(actualToolCalls, expectations = {}) {
  if (!expectations || !expectations.shouldAskUser) return 0.5; // not an ambiguity benchmark

  const names = (Array.isArray(actualToolCalls) ? actualToolCalls : []).map(
    (call) => (typeof call === 'string' ? call : (call && call.name) || ''),
  );
  const askedUser = names.some((name) => /^(ask_user|AskUserQuestion)$/i.test(name));
  const wroteFiles = names.some((name) => /^(write_file|edit_file|Write|Edit)$/i.test(name));

  if (askedUser) {
    // Clarifying without writing is ideal; asking and still writing is a mixed signal.
    return wroteFiles ? 0.4 : 1.0;
  }
  // Not asking is tolerable only if nothing was written blindly.
  return wroteFiles ? 0.0 : 0.6;
}
/**
 * Combine the individual dimension scores into one weighted composite.
 *
 * Seven dimensions are scored directly from the inputs. Their plain average
 * is then used as the quality input of the cost-efficiency dimension, and the
 * remaining three dimensions are scored afterwards. For ambiguity benchmarks
 * (`shouldAskUser`), the ambiguity-handling score is added and also replaces
 * the correctness score.
 *
 * @param {object} [opts] - scoring inputs; every field has a default
 * @returns {{composite: number, dimensions: object}} the weighted sum over
 *   `weights`, clamped to [0, 1], and the per-dimension scores
 */
function computeAgentScore({
  actualToolCalls = [],
  expectedToolCalls = [],
  forbiddenToolCalls = [],
  testsPassed = null,
  success = false,
  output = '',
  actualFiles = [],
  expectedFiles = [],
  actualTurns = 0,
  maxTurns = 20,
  consecutiveErrors = 0,
  // New params for enhanced dimensions
  sandboxDir = null,
  costDollars = null,
  testsBefore = null,
  testsAfter = null,
  totalTests = null,
  toolCallDetails = null,  // array of { name, args, result } for context/refinement scoring
  shouldAskUser = false,   // true for ambiguity benchmarks
  weights = DIMENSION_WEIGHTS,
} = {}) {
  // Dimensions that depend only on the raw inputs.
  const dimensions = {};
  dimensions.toolEfficiency = scoreToolEfficiency(actualToolCalls, expectedToolCalls, forbiddenToolCalls);
  dimensions.correctness = scoreCorrectness({ testsPassed, success, output, actualFiles });
  dimensions.planQuality = scorePlanQuality(actualToolCalls, output);
  dimensions.diffAccuracy = scoreDiffAccuracy(actualFiles, expectedFiles);
  dimensions.turnEconomy = scoreTurnEconomy(actualTurns, maxTurns);
  dimensions.errorHandling = scoreErrorHandling(actualToolCalls, consecutiveErrors);
  dimensions.codeQuality = scoreCodeQuality({ sandboxDir, actualFiles, success });

  // Preliminary quality for cost efficiency: plain average of the
  // non-cost dimensions scored above.
  const qualityInputs = [
    'correctness',
    'toolEfficiency',
    'diffAccuracy',
    'planQuality',
    'turnEconomy',
    'errorHandling',
    'codeQuality',
  ];
  let qualityTotal = 0;
  for (const key of qualityInputs) {
    qualityTotal += dimensions[key];
  }
  const qualityScore = qualityTotal / qualityInputs.length;

  // Remaining dimensions. Context management falls back to the plain tool
  // call list when no detailed call records were supplied.
  dimensions.costEfficiency = scoreCostEfficiency(qualityScore, costDollars);
  dimensions.partialProgress = scorePartialProgress(testsBefore, testsAfter, totalTests);
  dimensions.contextManagement = scoreContextManagement(toolCallDetails || actualToolCalls);
  dimensions.iterativeRefinement = scoreIterativeRefinement(toolCallDetails);

  // Ambiguity benchmarks: ambiguity handling stands in for correctness.
  if (shouldAskUser) {
    const ambiguity = scoreAmbiguityHandling(actualToolCalls, { shouldAskUser });
    dimensions.ambiguityHandling = ambiguity;
    dimensions.correctness = ambiguity;
  }

  // Weighted sum; a dimension or weight that is missing contributes nothing.
  const weighted = Object.entries(weights).reduce(
    (total, [dim, weight]) => total + (dimensions[dim] || 0) * (weight || 0),
    0,
  );

  return { composite: Math.min(1, Math.max(0, weighted)), dimensions };
}
-module.exports = {
-  DIMENSION_WEIGHTS,
-  scoreToolEfficiency,
-  scoreCorrectness,
-  scorePlanQuality,
-  scoreDiffAccuracy,
-  scoreTurnEconomy,
-  scoreErrorHandling,
-  scoreCodeQuality,
-  scoreCostEfficiency,
-  scorePartialProgress,
-  scoreContextManagement,
-  scoreIterativeRefinement,
-  scoreAmbiguityHandling,
-  computeAgentScore,
-};