npm - create-walle - Versions diffs - 0.9.21 → 0.9.23 - Mend

create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (500) hide show

package/template/wall-e/eval/aggregator.js DELETED Viewed

@@ -1,414 +0,0 @@
-'use strict';
-/**
- * Score aggregator — rolls up shadow eval scores into model_task_scores.
- * Computes rolling window stats, trends, and win rates.
- * Phase 6: also blends benchmark data (40%) with shadow data (60%).
- */
-const SHADOW_WEIGHT = 0.6; // share of the blended score taken from shadow-eval results
-const BENCHMARK_WEIGHT = 0.4; // share of the blended score taken from benchmark runs
-/**
- * Task-type lists handed out by mapBenchmarkToTaskTypes.
- * `easy` is chosen on difficulty alone; `medium_debug` and `hard_multi` also
- * require a keyword match on the benchmark id.
- */
-const DIFFICULTY_TASK_MAP = {
-  easy: ['coding:exploration', 'coding:generation'],
-  medium_debug: ['coding:debugging'],
-  hard_multi: ['coding:refactoring', 'coding:planning'],
-};
-/**
- * Resolve the coding task types that a benchmark run counts toward.
- *
- * @param {string} difficulty - Benchmark difficulty label ('easy', 'medium', 'hard')
- * @param {string} [benchmarkId] - Benchmark id, matched case-insensitively against keywords
- * @returns {string[]} Task types for the benchmark; unknown difficulties map to generation
- */
-function mapBenchmarkToTaskTypes(difficulty, benchmarkId) {
-  const normalizedId = (benchmarkId || '').toLowerCase();
-  switch (difficulty) {
-    case 'easy':
-      return DIFFICULTY_TASK_MAP.easy;
-    case 'medium':
-      // Debug-flavoured ids count as debugging; any other medium benchmark is generation
-      return /debug|fix|patch|bug/.test(normalizedId)
-        ? DIFFICULTY_TASK_MAP.medium_debug
-        : ['coding:generation'];
-    case 'hard':
-      // Multi-file / refactor ids get the refactoring+planning pair; otherwise planning only
-      return /multi.?file|refactor|architect/.test(normalizedId)
-        ? DIFFICULTY_TASK_MAP.hard_multi
-        : ['coding:planning'];
-    default:
-      return ['coding:generation'];
-  }
-}
-/**
- * Classify score movement by comparing the 7-day average with the 30-day average.
- * A gap of more than 0.05 in either direction is a trend; anything smaller, or a
- * missing average, is reported as stable.
- * @param {number|null} score7d
- * @param {number|null} score30d
- * @returns {'improving'|'stable'|'declining'}
- */
-function detectTrend(score7d, score30d) {
-  const missingWindow = score7d == null || score30d == null;
-  if (missingWindow) return 'stable';
-  const delta = score7d - score30d;
-  if (delta > 0.05) {
-    return 'improving';
-  }
-  return delta < -0.05 ? 'declining' : 'stable';
-}
-/**
- * Summarise a model's benchmark runs that count toward one task type.
- * Reads eval_benchmark_runs inside the rolling window, keeps the runs whose
- * benchmark maps (via difficulty and id keywords) to the requested task type,
- * and averages their composite scores.
- *
- * @param {Object} brain - Brain module
- * @param {string} model - Model name
- * @param {string} taskType - Task type (e.g. 'coding:generation')
- * @param {number} [windowDays=30] - Rolling window in days
- * @returns {{ totalEvals: number, avgScore: number, score7d: number|null, score30d: number|null }}
- */
-function aggregateBenchmarkScores(brain, model, taskType, windowDays = 30) {
-  const emptyResult = () => ({ totalEvals: 0, avgScore: 0, score7d: null, score30d: null });
-  const round3 = (value) => Math.round(value * 1000) / 1000;
-  const meanScore = (runs) => runs.reduce((sum, run) => sum + run.composite_score, 0) / runs.length;
-  const db = brain.getDb();
-  // Nothing to aggregate until the benchmark table has been created
-  const benchTable = db.prepare(
-    "SELECT name FROM sqlite_master WHERE type='table' AND name='eval_benchmark_runs'"
-  ).get();
-  if (!benchTable) return emptyResult();
-  const windowCutoff = new Date(Date.now() - windowDays * 86400000).toISOString();
-  const weekCutoff = new Date(Date.now() - 7 * 86400000).toISOString();
-  // Successful, scored runs for this model inside the window
-  const runs = db.prepare(`
-    SELECT benchmark_id, composite_score, created_at
-    FROM eval_benchmark_runs
-    WHERE model = ? AND error IS NULL AND composite_score IS NOT NULL AND created_at >= ?
-  `).all(model, windowCutoff);
-  if (runs.length === 0) return emptyResult();
-  // Benchmark definitions supply the difficulty label; a failed load leaves the list empty
-  let definitions = [];
-  try {
-    definitions = require('./benchmarks').loadAllBenchmarks();
-  } catch (_) {
-    definitions = [];
-  }
-  const defsById = new Map();
-  for (const definition of definitions) defsById.set(definition.id, definition);
-  // Runs without a known definition are treated as 'medium'
-  const relevant = runs.filter(({ benchmark_id: benchmarkId }) => {
-    const difficulty = defsById.get(benchmarkId)?.difficulty || 'medium';
-    return mapBenchmarkToTaskTypes(difficulty, benchmarkId).includes(taskType);
-  });
-  if (relevant.length === 0) return emptyResult();
-  const windowAverage = round3(meanScore(relevant));
-  const recent = relevant.filter((run) => run.created_at >= weekCutoff);
-  return {
-    totalEvals: relevant.length,
-    avgScore: windowAverage,
-    score7d: recent.length > 0 ? round3(meanScore(recent)) : null,
-    score30d: windowAverage,
-  };
-}
-/**
- * Work out which provider serves a model. A provider other than 'unknown'
- * recorded in eval_benchmark_runs wins; if the lookup fails or finds nothing,
- * the provider is guessed from the model-name prefix, defaulting to 'ollama'.
- */
-function resolveProvider(brain, model) {
-  try {
-    const { provider } = brain.getDb().prepare(
-      "SELECT provider FROM eval_benchmark_runs WHERE model = ? AND provider != 'unknown' LIMIT 1"
-    ).get(model) ?? {};
-    if (provider) return provider;
-  } catch (_) { /* table may not exist */ }
-  const prefixProviders = [
-    [/^claude-/, 'anthropic'],
-    [/^gpt-/, 'openai'],
-    [/^gemini-/, 'google'],
-  ];
-  const hit = prefixProviders.find(([prefix]) => prefix.test(model));
-  return hit ? hit[1] : 'ollama';
-}
-/**
- * Build the blended score summary for one (model, task type) pair.
- * Shadow-eval stats are read from shadow_results. When benchmark runs also
- * exist, averages are mixed using SHADOW_WEIGHT and BENCHMARK_WEIGHT; when only
- * one source has data, that source is used unweighted.
- *
- * @param {Object} brain - Brain module
- * @param {string} model - Shadow model name
- * @param {string} taskType - Task type
- * @param {number} [windowDays=30] - Rolling window in days
- * @returns {Object} Aggregated stats
- */
-function aggregateScores(brain, model, taskType, windowDays = 30) {
-  const round3 = (value) => Math.round(value * 1000) / 1000;
-  const blend = (shadowValue, benchValue) => SHADOW_WEIGHT * shadowValue + BENCHMARK_WEIGHT * benchValue;
-  const db = brain.getDb();
-  const windowCutoff = new Date(Date.now() - windowDays * 86400000).toISOString();
-  const weekCutoff = new Date(Date.now() - 7 * 86400000).toISOString();
-  // Shadow stats across the whole rolling window
-  const windowStats = db.prepare(`
-    SELECT
-      COUNT(*) AS total_evals,
-      AVG(shadow_score) AS avg_score,
-      SUM(CASE WHEN shadow_score >= 0.8 * COALESCE(primary_score, 1.0) THEN 1 ELSE 0 END) AS win_count,
-      SUM(CASE WHEN shadow_score >= COALESCE(primary_score, 1.0) THEN 1 ELSE 0 END) AS strong_win_count,
-      AVG(shadow_latency_ms) AS avg_latency_ms,
-      MAX(evaluated_at) AS last_eval_at
-    FROM shadow_results
-    WHERE shadow_model = ? AND task_type = ?
-      AND evaluated_at IS NOT NULL
-      AND created_at >= ?
-  `).get(model, taskType, windowCutoff);
-  // Shadow average restricted to the trailing 7 days
-  const weekStats = db.prepare(`
-    SELECT AVG(shadow_score) AS avg_score
-    FROM shadow_results
-    WHERE shadow_model = ? AND task_type = ?
-      AND evaluated_at IS NOT NULL
-      AND created_at >= ?
-  `).get(model, taskType, weekCutoff);
-  const shadow7d = weekStats?.avg_score ?? null;
-  const shadow30d = windowStats?.avg_score ?? null;
-  const shadowEvals = windowStats?.total_evals || 0;
-  const bench = aggregateBenchmarkScores(brain, model, taskType, windowDays);
-  const hasShadow = shadowEvals > 0;
-  const hasBench = bench.totalEvals > 0;
-  let blended;
-  if (hasShadow && hasBench) {
-    // Both sources present: weight them; the 7-day figure is only weighted
-    // when both sides have one, otherwise whichever side exists is used as-is
-    blended = {
-      avg: blend(shadow30d || 0, bench.avgScore),
-      week: (shadow7d != null && bench.score7d != null)
-        ? blend(shadow7d, bench.score7d)
-        : shadow7d ?? bench.score7d,
-      window: blend(shadow30d || 0, bench.score30d),
-    };
-  } else if (hasBench) {
-    // Benchmark data only: full weight
-    blended = { avg: bench.avgScore, week: bench.score7d, window: bench.score30d };
-  } else {
-    // Shadow data only, or no data at all
-    blended = { avg: shadow30d || 0, week: shadow7d, window: shadow30d };
-  }
-  const trend = detectTrend(blended.week, blended.window);
-  return {
-    model,
-    taskType,
-    provider: resolveProvider(brain, model),
-    totalEvals: shadowEvals + bench.totalEvals,
-    avgScore: round3(blended.avg || 0),
-    winCount: windowStats?.win_count || 0,
-    strongWinCount: windowStats?.strong_win_count || 0,
-    avgLatencyMs: Math.round(windowStats?.avg_latency_ms || 0),
-    score7d: blended.week != null ? round3(blended.week) : null,
-    score30d: blended.window != null ? round3(blended.window) : null,
-    trend,
-    lastEvalAt: windowStats?.last_eval_at || null,
-  };
-}
-/**
- * List every distinct (model, task_type) pair in shadow_results that has an
- * evaluation timestamp and a non-null shadow model.
- */
-function getDistinctModelTaskPairs(brain) {
-  const db = brain.getDb();
-  const evaluatedPairs = db.prepare(`
-    SELECT DISTINCT shadow_model AS model, task_type
-    FROM shadow_results
-    WHERE evaluated_at IS NOT NULL AND shadow_model IS NOT NULL
-  `);
-  return evaluatedPairs.all();
-}
-/**
- * Derive distinct (model, task_type) pairs from benchmark runs by mapping each
- * run's benchmark through its difficulty. The result can include pairs that
- * have no rows in shadow_results.
- */
-function getDistinctBenchmarkModelTaskPairs(brain) {
-  const db = brain.getDb();
-  const benchTable = db.prepare(
-    "SELECT name FROM sqlite_master WHERE type='table' AND name='eval_benchmark_runs'"
-  ).get();
-  if (!benchTable) return [];
-  const runs = db.prepare(`
-    SELECT DISTINCT model, benchmark_id FROM eval_benchmark_runs
-    WHERE error IS NULL AND composite_score IS NOT NULL AND model != 'unknown'
-  `).all();
-  // A failed definition load leaves the list empty, so every run falls back to 'medium'
-  let definitions = [];
-  try {
-    definitions = require('./benchmarks').loadAllBenchmarks();
-  } catch (_) {
-    definitions = [];
-  }
-  const defsById = new Map();
-  for (const definition of definitions) defsById.set(definition.id, definition);
-  // Keyed by "model::taskType" so each pair is emitted once, in first-seen order
-  const pairsByKey = new Map();
-  for (const run of runs) {
-    const difficulty = defsById.get(run.benchmark_id)?.difficulty || 'medium';
-    for (const taskType of mapBenchmarkToTaskTypes(difficulty, run.benchmark_id)) {
-      const key = `${run.model}::${taskType}`;
-      if (pairsByKey.has(key)) continue;
-      pairsByKey.set(key, { model: run.model, task_type: taskType });
-    }
-  }
-  return [...pairsByKey.values()];
-}
-/**
- * Recompute and store model_task_scores for every known (model, task type) pair.
- * Pairs come from both shadow_results and benchmark runs, so a model that only
- * has benchmark data still gets a score row.
- * @param {Object} brain - Brain module
- * @returns {{ updated: number, pairs: Array }}
- */
-function updateAllModelTaskScores(brain) {
-  const shadowPairs = getDistinctModelTaskPairs(brain);
-  const benchPairs = getDistinctBenchmarkModelTaskPairs(brain);
-  // Keep the first occurrence of each model::taskType key
-  const seenKeys = new Set();
-  const uniquePairs = [...shadowPairs, ...benchPairs].filter((pair) => {
-    const key = `${pair.model}::${pair.task_type}`;
-    if (seenKeys.has(key)) return false;
-    seenKeys.add(key);
-    return true;
-  });
-  const pairs = [];
-  for (const pair of uniquePairs) {
-    const model = pair.model;
-    const taskType = pair.task_type;
-    const stats = aggregateScores(brain, model, taskType);
-    brain.upsertModelTaskScore(model, taskType, stats);
-    pairs.push({ model, taskType, avgScore: stats.avgScore, totalEvals: stats.totalEvals, trend: stats.trend });
-  }
-  return { updated: pairs.length, pairs };
-}
-/**
- * Summarise coding agent sessions per classified type.
- * Only sessions with a positive significance score are counted.
- * @param {Object} brain - Brain module
- * @returns {{ updated: number, agentSessions: number, byType?: Array }} `updated` is the
- *   number of classified-type groups, `agentSessions` the total sessions across them;
- *   `byType` (the per-type rows) is omitted when the sessions table does not exist
- */
-function aggregateCodingAgentScores(brain) {
-  const db = brain.getDb();
-  const sessionsTable = db.prepare(
-    "SELECT name FROM sqlite_master WHERE type='table' AND name='coding_agent_sessions'"
-  ).get();
-  if (!sessionsTable) return { updated: 0, agentSessions: 0 };
-  // One row per classified_type with its session count and mean significance
-  const byType = db.prepare(`
-    SELECT
-      classified_type,
-      COUNT(*) as total,
-      AVG(significance_score) as avg_significance
-    FROM coding_agent_sessions
-    WHERE significance_score > 0
-    GROUP BY classified_type
-  `).all();
-  let agentSessions = 0;
-  for (const group of byType) agentSessions += group.total;
-  return { updated: byType.length, agentSessions, byType };
-}
-/**
- * Data-driven difficulty calibration.
- * For each benchmark_id with at least 50 successful, scored runs, compares the
- * average composite score with the benchmark's current difficulty label and
- * reports the benchmarks whose label looks wrong.
- *
- * @param {Object} brain - Brain module
- * @returns {Array<{ benchmarkId: string, currentDifficulty: string, suggestedDifficulty: string, avgScore: number, runCount: number }>}
- */
-function calibrateDifficulty(brain) {
-  const db = brain.getDb();
-  const benchTable = db.prepare(
-    "SELECT name FROM sqlite_master WHERE type='table' AND name='eval_benchmark_runs'"
-  ).get();
-  if (!benchTable) return [];
-  const calibrated = db.prepare(`
-    SELECT benchmark_id, AVG(composite_score) AS actual_difficulty, COUNT(*) AS run_count
-    FROM eval_benchmark_runs
-    WHERE error IS NULL AND composite_score IS NOT NULL
-    GROUP BY benchmark_id
-    HAVING COUNT(*) >= 50
-  `).all();
-  if (calibrated.length === 0) return [];
-  // Current labels come from the benchmark definitions; a failed load leaves
-  // the list empty, so every benchmark is labelled 'unknown' and skipped
-  let definitions = [];
-  try {
-    definitions = require('./benchmarks').loadAllBenchmarks();
-  } catch (_) {
-    definitions = [];
-  }
-  const defsById = new Map();
-  for (const definition of definitions) defsById.set(definition.id, definition);
-  // High average (>= 0.85) steps the label down one level: hard → medium, medium → easy.
-  // Low average (< 0.5) on easy or medium jumps straight to hard.
-  const relabel = (label, avg) => {
-    const scoresHigh = avg >= 0.85;
-    const scoresLow = avg < 0.5;
-    if (scoresHigh && label === 'hard') return 'medium';
-    if (scoresHigh && label === 'medium') return 'easy';
-    if (scoresLow && (label === 'easy' || label === 'medium')) return 'hard';
-    return label;
-  };
-  const suggestions = [];
-  for (const row of calibrated) {
-    const currentDifficulty = defsById.get(row.benchmark_id)?.difficulty || 'unknown';
-    const suggestedDifficulty = relabel(currentDifficulty, row.actual_difficulty);
-    if (suggestedDifficulty === currentDifficulty) continue;
-    suggestions.push({
-      benchmarkId: row.benchmark_id,
-      currentDifficulty,
-      suggestedDifficulty,
-      avgScore: Math.round(row.actual_difficulty * 1000) / 1000,
-      runCount: row.run_count,
-    });
-  }
-  return suggestions;
-}
-module.exports = { // resolveProvider and getDistinctBenchmarkModelTaskPairs are not exported
-  detectTrend,
-  aggregateScores,
-  aggregateBenchmarkScores,
-  getDistinctModelTaskPairs,
-  updateAllModelTaskScores,
-  aggregateCodingAgentScores,
-  calibrateDifficulty,
-  mapBenchmarkToTaskTypes,
-  SHADOW_WEIGHT,
-  BENCHMARK_WEIGHT,
-};

package/template/wall-e/eval/allowed-test-commands.js DELETED Viewed

@@ -1,34 +0,0 @@
-'use strict';
-// Allow-list of test/lint/typecheck commands. Each pattern is anchored at both
-// ends and must match the whole whitespace-normalized command. Argument
-// values are limited to explicit character classes so that shell
-// metacharacters ($, `, ;, |, &, <, >, quotes, parentheses) can never appear
-// in an accepted command.
-const ALLOWED_TEST_COMMAND_PATTERNS = [
-  /^npm test(?:\s+--\s*[\w./:= -]+)?$/,
-  /^npm run (?:test|test:[\w:-]+|typecheck|lint)(?:\s+--\s*[\w./:= -]+)?$/,
-  /^pnpm test(?:\s+--\s*[\w./:= -]+)?$/,
-  /^pnpm run (?:test|test:[\w:-]+|typecheck|lint)(?:\s+--\s*[\w./:= -]+)?$/,
-  /^yarn test(?:\s+[\w./:= -]+)?$/,
-  /^bun test(?:\s+[\w./:= -]+)?$/,
-  /^node test\.js$/,
-  /^node --test(?:\s+[\w./-]+)*$/,
-  /^pytest(?:\s+[\w./:-]+)*$/,
-  /^python -m pytest(?:\s+[\w./:-]+)*$/,
-  /^make test$/,
-  // The option value uses a restricted class rather than \S+, which would
-  // accept values such as $(cmd), `cmd` or a;b.
-  /^tsc --noEmit(?:\s+--[\w:-]+(?:[= ][\w./:=@,-]+)?)?$/,
-  /^npx tsc --noEmit(?:\s+--[\w:-]+(?:[= ][\w./:=@,-]+)?)?$/,
-  /^go test(?:\s+(?:\.|\.\/\.\.\.|[\w./-]+))*$/,
-  /^cargo test(?:\s+[\w./:-]+)*$/,
-];
-/**
- * Canonicalise a command string: falsy input becomes '', surrounding
- * whitespace is removed and every internal whitespace run collapses to one space.
- */
-function normalizeTestCommand(command) {
-  const trimmed = String(command || '').trim();
-  return trimmed.split(/\s+/).join(' ');
-}
-/**
- * Report whether a command, once normalized, fully matches one of the
- * allow-listed patterns. Empty commands are never allowed.
- */
-function testCommandAllowed(command) {
-  const candidate = normalizeTestCommand(command);
-  if (!candidate) return false;
-  for (const pattern of ALLOWED_TEST_COMMAND_PATTERNS) {
-    if (pattern.test(candidate)) return true;
-  }
-  return false;
-}
-module.exports = { // the pattern list is exported alongside the two helpers
-  ALLOWED_TEST_COMMAND_PATTERNS,
-  normalizeTestCommand,
-  testCommandAllowed,
-};

package/template/wall-e/eval/benchmark-generator.js DELETED Viewed

@@ -1,113 +0,0 @@
-'use strict';
-const crypto = require('crypto');
-const fs = require('fs');
-const path = require('path');
-/**
- * Classify a coding agent session into a task type from keywords in its prompt.
- * Rules are tried in order and the first keyword hit wins; matching is by
- * substring, and a prompt with no keyword is classed as generation.
- */
-function classifyCodingType(session) {
-  const text = (session.prompt || '').toLowerCase();
-  const rules = [
-    [/plan|design|architect/i, 'coding:planning'],
-    [/debug|fix|bug|error|failing/i, 'coding:debugging'],
-    [/refactor|extract|rename|reorganize/i, 'coding:refactoring'],
-    [/review|assess|check/i, 'coding:review'],
-    [/test|spec|coverage/i, 'coding:testing'],
-  ];
-  const firstHit = rules.find(([keywords]) => keywords.test(text));
-  return firstHit ? firstHit[1] : 'coding:generation';
-}
-/**
- * Decide whether a session prompt can be replayed as a standalone benchmark.
- * A prompt qualifies only if it is at least 25 characters long (after
- * trimming), does not open with assistant-style narration or a bare
- * continuation/acknowledgement, and mentions a coding task keyword.
- *
- * @param {*} prompt - Candidate prompt; coerced to a string, nullish becomes ''
- * @returns {boolean} true when the prompt looks like a self-contained user task
- */
-function isReplayableBenchmarkPrompt(prompt) {
-  const text = String(prompt || '').trim();
-  if (text.length < 25) return false;
-  // Session-mined prompts must be the user's task, not the assistant's first
-  // progress narration. Assistant prose turns the benchmark into "continue the
-  // previous assistant's work", which is not replayable from a fresh sandbox.
-  // `sure\b` (rather than `sure[, ]` followed by \b) is needed so the common
-  // "Sure, I can ..." opener is caught: there is no word boundary between a
-  // comma and the space after it.
-  if (/^(i('|’)ll|i will|i can|i('|’)m going to|let me|sure\b|happy to help|i('|’)ll help|i('|’)ll start)\b/i.test(text)) {
-    return false;
-  }
-  // Bare continuations and acknowledgements carry no task of their own
-  if (/^\s*(go ahead|continue|proceed|do it|yes|yep|ok|okay|thanks|thank you)\b/i.test(text)) {
-    return false;
-  }
-  return /\b(fix|implement|add|change|update|refactor|test|debug|make|write|delete|remove|harden|wire|bug|failing|error|regression|feature|endpoint|api|ui|server|component|code review|review.*code)\b/i.test(text);
-}
-/**
- * Turn a recorded coding agent session into a benchmark entry.
- * Yields null when the prompt is not replayable or when the id derived from
- * the prompt hash is already present in existingIds.
- */
-function sessionToBenchmark(session, existingIds = new Set()) {
-  if (!isReplayableBenchmarkPrompt(session.prompt)) return null;
-  const promptDigest = crypto.createHash('sha256').update(session.prompt || '').digest('hex');
-  const id = `agent-session-${promptDigest.slice(0, 8)}`;
-  if (existingIds.has(id)) return null; // dedup
-  const type = classifyCodingType(session);
-  // Tool calls may be plain names or objects carrying a `name`
-  const toolNames = (session.tool_calls || []).map((call) => {
-    if (typeof call === 'string') return call;
-    return call.name || '';
-  });
-  const usedTool = (pattern) => toolNames.some((name) => pattern.test(name));
-  // Expected traits are inferred from the tools the session actually used
-  const traitChecks = [
-    [usedTool(/read_file|glob|grep/), 'reads before writing'],
-    [usedTool(/edit_file/) && !toolNames.includes('write_file'), 'uses edit over write'],
-    [usedTool(/test|npm test/), 'runs tests after changes'],
-    [usedTool(/update_todos/), 'plans before executing'],
-  ];
-  const traits = traitChecks.filter(([applies]) => applies).map(([, label]) => label);
-  if (traits.length === 0) traits.push('has code block');
-  let difficulty = 'easy';
-  if (session.turns > 10) {
-    difficulty = 'hard';
-  } else if (session.turns > 5) {
-    difficulty = 'medium';
-  }
-  return {
-    id,
-    prompt: session.prompt,
-    taskType: 'coding-agent',
-    difficulty,
-    expectedTraits: traits,
-    agentExpectations: {
-      expectedToolCalls: [...new Set(toolNames)].slice(0, 10),
-      // Allow twice the recorded turn count, capped at 50
-      maxTurns: Math.min((session.turns || 1) * 2, 50),
-      expectedFileChanges: session.files_modified || [],
-    },
-    sourceSessionId: session.session_id,
-    classifiedType: type,
-    expectedDiff: session.git_diff || null,
-    complexityIndicator: Array.isArray(session.files_modified) ? session.files_modified.length : 0,
-  };
-}
-/**
- * Generate benchmarks from significant coding agent sessions stored in the brain DB.
- * Sessions already flagged as converted are skipped, and ids already present in
- * benchmarks/coding-agent.json (or produced earlier in the same call) are not repeated.
- * Each session that yields a benchmark is marked as generated.
- *
- * @param {object} brain - Brain module (must have getCodingSessions, markBenchmarkGenerated)
- * @param {object} opts - Options: minSignificance (default 0.5), limit (default 50)
- * @returns {Promise<Array>} Newly generated benchmark objects
- */
-async function generateBenchmarks(brain, { minSignificance = 0.5, limit = 50 } = {}) {
-  const pending = brain
-    .getCodingSessions({ minSignificance, limit })
-    .filter((session) => !session.benchmark_generated);
-  // Seed the dedup set from the benchmark file when it can be read and parsed
-  const knownIds = new Set();
-  try {
-    const benchmarkFile = path.join(__dirname, 'benchmarks', 'coding-agent.json');
-    const stored = JSON.parse(fs.readFileSync(benchmarkFile, 'utf8'));
-    stored.forEach((entry) => knownIds.add(entry.id));
-  } catch { /* no existing file or parse error */ }
-  const created = [];
-  for (const session of pending) {
-    const benchmark = sessionToBenchmark(session, knownIds);
-    if (!benchmark) continue;
-    created.push(benchmark);
-    knownIds.add(benchmark.id);
-    brain.markBenchmarkGenerated(session.id);
-  }
-  return created;
-}
-module.exports = { classifyCodingType, isReplayableBenchmarkPrompt, sessionToBenchmark, generateBenchmarks }; // every function defined in this file is exported