npm - create-walle - Versions diffs - 0.9.21 → 0.9.23 - Mend

create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (500) hide show

package/template/wall-e/eval/head-to-head.js DELETED Viewed

@@ -1,388 +0,0 @@
-'use strict';
-const { randomUUID } = require('node:crypto');
-const { createClient } = require('../llm/client');
-const { heuristicScore } = require('./evaluator');
-const { scoreTraitsDetailed, TRAIT_MATCHERS, UNSCORABLE_TRAITS } = require('./benchmarks');
-// ============================================================
-// Concurrency limiter (inline, no external dependency)
-// ============================================================
/**
 * Create a concurrency limiter.
 *
 * The returned `limit(fn)` schedules `fn` so that at most `concurrency`
 * scheduled functions are in flight at once; the rest wait in FIFO order.
 *
 * @param {number} concurrency - Maximum number of tasks allowed to run at once.
 *   Values that are not a positive number fall back to 1 so queued work can
 *   never be stranded behind a limit that no task is able to satisfy.
 * @returns {(fn: () => any) => Promise<any>} Scheduler that settles with the
 *   outcome of `fn` (its return value, resolved promise value, or thrown error).
 */
function pLimit(concurrency) {
  const parsed = Number(concurrency);
  const maxActive = parsed > 0 ? parsed : 1;
  let active = 0;
  const queue = [];

  // Start the next queued task if a slot is free.
  function next() {
    if (queue.length > 0 && active < maxActive) queue.shift()();
  }

  return function limit(fn) {
    return new Promise((resolve, reject) => {
      queue.push(() => {
        active++;
        // Invoke fn inside a promise executor: fn still starts synchronously,
        // but a synchronous throw or a non-promise return value is converted
        // into a settled promise, so the slot is always released below.
        new Promise((settle) => settle(fn()))
          .then(resolve, reject)
          .finally(() => {
            active--;
            next();
          });
      });
      next();
    });
  };
}
-// ============================================================
-// Cost estimation
-// ============================================================
const COST_TABLE = {
  // Per-model costs ($/1M tokens)
  'claude-opus-4-6':          { input: 5.0,  output: 25.0 },
  'claude-sonnet-4-6':        { input: 3.0,  output: 15.0 },
  'claude-haiku-4-5-20251001': { input: 0.25, output: 1.25 },
  'claude-haiku-4-5':         { input: 0.25, output: 1.25 },
  // OpenAI GPT-5.x uses the provider-level fallback until the pricing sync
  // records exact per-model rates.
  'gemini-2.5-pro':           { input: 2.0,  output: 12.0 },
  'gemini-2.5-flash':         { input: 0.08, output: 0.30 },
  // DeepSeek V4 - OpenAI-compatible API, $/1M tokens.
  'deepseek-v4-flash':        { input: 0.14,  output: 0.28 },
  'deepseek-v4-pro':          { input: 1.74,  output: 3.48 },
  // Moonshot / Kimi - OpenAI-compatible API, $/1M tokens.
  'kimi-k2.6':                 { input: 0.95,  output: 4.00 },
  'kimi-k2.5':                 { input: 0.60,  output: 3.00 },
  'kimi-k2-0905-preview':      { input: 0.60,  output: 2.50 },
  // Local models = free
  'gemma4:e4b':               { input: 0,    output: 0 },
  'gemma4:26b':               { input: 0,    output: 0 },
  'qwen2.5:7b-instruct-q4_K_M': { input: 0, output: 0 },
  // Provider-level fallbacks (for backward compat)
  anthropic: { input: 3.0,  output: 15.0 },
  openai:    { input: 2.0,  output: 10.0 },
  google:    { input: 1.5,  output: 8.0 },
  deepseek:  { input: 0.30, output: 0.60 },
  moonshot:  { input: 0.95, output: 4.00 },
  ollama:    { input: 0,    output: 0 },
  mlx:       { input: 0,    output: 0 },
};

// Look up a price entry by its own key only, so that names inherited from
// Object.prototype ("constructor", "toString", ...) are never used as rates.
function resolveCostRates(key) {
  return key && Object.hasOwn(COST_TABLE, key) ? COST_TABLE[key] : null;
}

// Coerce a token count to a non-negative finite number (anything else is 0).
function toTokenCount(value) {
  const count = Number(value);
  return Number.isFinite(count) && count > 0 ? count : 0;
}

/**
 * Estimate cost in dollars for a provider call.
 *
 * Rates are resolved in order: exact model ID, then provider type, then the
 * `anthropic` provider entry as the final fallback.
 *
 * @param {{ inputTokens?: number, outputTokens?: number, input_tokens?: number, output_tokens?: number }} usage
 *   Token usage; camelCase keys are preferred, snake_case keys are accepted too.
 * @param {string} providerType
 * @param {string} [model] - Optional model ID for per-model cost lookup
 * @returns {number} Cost in dollars, rounded to 6 decimal places (0 when `usage` is missing).
 */
function estimateProviderCost(usage, providerType, model) {
  if (!usage) return 0;
  const rates = resolveCostRates(model)
    || resolveCostRates(providerType)
    || COST_TABLE.anthropic;
  const inputTokens = toTokenCount(usage.inputTokens || usage.input_tokens);
  const outputTokens = toTokenCount(usage.outputTokens || usage.output_tokens);
  const inputCost = (inputTokens / 1_000_000) * rates.input;
  const outputCost = (outputTokens / 1_000_000) * rates.output;
  return Math.round((inputCost + outputCost) * 1_000_000) / 1_000_000;
}
-// ============================================================
-// Provider discovery
-// ============================================================
/**
 * List every usable (provider type, model) pair from the brain's model registry.
 *
 * A model is usable when it is enabled, its provider exists and is enabled,
 * and the provider either carries a stored key or is a keyless type
 * (`ollama` / `mlx`). Pairs are de-duplicated on `type:model_id`; the first
 * registry row wins.
 *
 * @param {Object} brain - Must expose `listAllModels()` and `getModelProvider(id)`.
 * @returns {Array<{ provider: string, model: string, apiKey: string|null, baseUrl: string|null, registryId: string }>}
 *   `apiKey` is copied verbatim from the provider row's `api_key_encrypted` field.
 */
function getAvailableProviders(brain) {
  const keylessTypes = ['ollama', 'mlx'];
  const unique = new Map();

  for (const entry of brain.listAllModels()) {
    if (!entry.enabled) continue;

    const owner = brain.getModelProvider(entry.provider_id);
    if (!(owner && owner.enabled)) continue;

    // Remote providers are skipped unless a key is stored for them.
    const isKeyless = keylessTypes.includes(owner.type);
    if (!isKeyless && !owner.api_key_encrypted) continue;

    const dedupeKey = `${owner.type}:${entry.model_id}`;
    if (unique.has(dedupeKey)) continue;

    unique.set(dedupeKey, {
      provider: owner.type,
      model: entry.model_id,
      apiKey: owner.api_key_encrypted || null,
      baseUrl: owner.base_url || null,
      registryId: entry.id,
    });
  }

  return [...unique.values()];
}
-// ============================================================
-// Trait matching
-// ============================================================
/**
 * Score how many of the expected traits a response exhibits.
 *
 * Traits are handled in three ways:
 *  - traits in `UNSCORABLE_TRAITS` are skipped entirely;
 *  - traits that have their own entry in `TRAIT_MATCHERS` are delegated to
 *    `scoreTraitsDetailed`;
 *  - any other trait counts as matched when it appears in the response as a
 *    case-insensitive substring.
 *
 * @param {string} response - Model output to inspect.
 * @param {string[]|null} expectedTraits - Traits the response should show.
 * @returns {number|null} Fraction of scorable traits matched (0..1), or `null`
 *   when there is nothing to score: no traits were given, or every given
 *   trait is unscorable. Returning `null` (not 0) keeps callers from counting
 *   an unmeasurable trait list as a failed one.
 */
function traitScore(response, expectedTraits) {
  if (!expectedTraits || expectedTraits.length === 0) return null;
  const text = String(response || '').toLowerCase();
  let matched = 0;
  let scored = 0;
  const knownTraits = [];
  for (const trait of expectedTraits) {
    if (UNSCORABLE_TRAITS.has(trait)) continue;
    // Own-key check: a free-text trait such as "constructor" must not be
    // mistaken for a matcher through Object.prototype.
    if (Object.hasOwn(TRAIT_MATCHERS, trait) && TRAIT_MATCHERS[trait]) {
      knownTraits.push(trait);
      continue;
    }
    scored++;
    if (text.includes(String(trait).toLowerCase())) matched++;
  }
  if (knownTraits.length > 0) {
    const detail = scoreTraitsDetailed(response, knownTraits);
    matched += detail.matched.length;
    scored += detail.scoredCount;
  }
  return scored > 0 ? matched / scored : null;
}
-// ============================================================
-// Head-to-head runner
-// ============================================================
/**
 * Run head-to-head evaluation across multiple providers.
 *
 * Every prompt is sent to every provider (bounded by `concurrency`). Each
 * response gets a heuristic score, an optional trait score and an optional
 * judge score; these are combined into a weighted composite
 * (heuristic 1.0, trait 1.5, judge 2.0) rounded to 3 decimals. Prompts are
 * processed one after another; providers run in parallel within a prompt.
 *
 * A provider failure never rejects the run: it yields a result row with
 * `error` set to a non-empty message and zeroed scores.
 *
 * @param {Object} brain - Brain module for DB access. When it exposes
 *   `insertModelEvaluation`, each successful result with a `registryId` is stored.
 * @param {Object} options
 * @param {Array|string} options.prompts - Array of { prompt, taskType, expectedTraits? } (or plain strings), or single prompt string
 * @param {Array} [options.providers] - Provider filter; if omitted, use all from getAvailableProviders
 * @param {Function} [options.judgeFn] - LLM judge: (prompt, response, taskType) => { score, reasoning }.
 *   `score` is read on a 0-10 scale and clamped to 0..1; a failing judge or a
 *   non-numeric score contributes no judge component.
 * @param {number} [options.concurrency=3] - Max parallel provider calls
 * @param {Function} [options.clientFactory] - Override createClient (for testing)
 * @returns {Promise<{ runId: string, results: Array, leaderboard: Array }>}
 */
async function runHeadToHead(brain, options = {}) {
  const runId = randomUUID();
  const limit = pLimit(options.concurrency || 3);
  const factory = options.clientFactory || createClient;
  const emptyRun = () => ({ runId, results: [], leaderboard: [] });

  // Normalize prompts
  let prompts = options.prompts;
  if (typeof prompts === 'string') {
    prompts = [{ prompt: prompts, taskType: 'chat' }];
  }
  if (!Array.isArray(prompts) || prompts.length === 0) return emptyRun();

  // Resolve providers
  const providers = options.providers || getAvailableProviders(brain);
  if (!Array.isArray(providers) || providers.length === 0) return emptyRun();

  const allResults = [];
  for (const item of prompts) {
    // A prompt entry is either a bare string or a spec object; anything else
    // (including null) is treated as a spec with no fields.
    const spec = item !== null && typeof item === 'object' ? item : {};
    const prompt = typeof item === 'string' ? item : spec.prompt;
    const taskType = spec.taskType || 'chat';
    const expectedTraits = spec.expectedTraits || null;

    const providerPromises = providers.map((prov) =>
      limit(async () => {
        const start = Date.now();
        // Fields shared by success and failure rows.
        const identity = {
          runId,
          provider: prov.provider,
          model: prov.model,
          registryId: prov.registryId,
          prompt,
          taskType,
        };
        try {
          const client = factory(prov.provider, {
            apiKey: prov.apiKey,
            baseUrl: prov.baseUrl,
          });
          const response = await client.chat({
            model: prov.model,
            messages: [{ role: 'user', content: prompt }],
            maxTokens: 4096,
          });
          const latencyMs = Date.now() - start;
          const content = typeof response.content === 'string'
            ? response.content
            : (response.text || JSON.stringify(response.content) || '');
          const usage = response.usage || {};

          // Score independently (no reference/primary response)
          const hScore = heuristicScore(null, content, taskType);
          const tScore = traitScore(content, expectedTraits);

          let judgeResult = null;
          if (options.judgeFn) {
            try {
              judgeResult = await options.judgeFn(prompt, content, taskType);
            } catch (_) {
              // Judge is optional: a failing judge just contributes no score.
              judgeResult = null;
            }
          }
          // Judge scores arrive on a 0-10 scale. Clamp to 0..1 and drop
          // non-numeric values so they cannot turn the composite into NaN
          // or push it below zero.
          let judgeScore = null;
          if (judgeResult) {
            const rawJudge = Number(judgeResult.score) / 10;
            if (Number.isFinite(rawJudge)) {
              judgeScore = Math.min(1, Math.max(0, rawJudge));
            }
          }

          // Composite: weighted average of available scores
          const components = [{ score: hScore, weight: 1.0 }];
          if (tScore !== null) components.push({ score: tScore, weight: 1.5 });
          if (judgeScore !== null) components.push({ score: judgeScore, weight: 2.0 });
          const totalWeight = components.reduce((s, c) => s + c.weight, 0);
          const weightedSum = components.reduce((s, c) => s + c.score * c.weight, 0);
          const composite = Math.round((weightedSum / totalWeight) * 1000) / 1000;

          const inputTokens = usage.input_tokens || usage.inputTokens || usage.input || 0;
          const outputTokens = usage.output_tokens || usage.outputTokens || usage.output || 0;
          const cost = estimateProviderCost(
            { inputTokens, outputTokens },
            prov.provider,
            prov.model
          );

          // Store in brain if insertModelEvaluation is available
          if (brain.insertModelEvaluation && prov.registryId) {
            try {
              brain.insertModelEvaluation({
                modelRegistryId: prov.registryId,
                taskType,
                qualityScore: composite,
                latencyMs,
                inputTokens,
                outputTokens,
                costEstimate: cost,
                wasSelected: false,
                quorumId: runId,
              });
            } catch (_) {
              // Non-critical — don't fail the run
            }
          }

          return {
            ...identity,
            content,
            scores: {
              heuristic: hScore,
              trait: tScore,
              judge: judgeScore,
              composite,
            },
            latencyMs,
            cost,
            genTokPerSec: usage.genTokPerSec || null,
            error: null,
          };
        } catch (err) {
          // `error` must be a non-empty string even for non-Error throwables,
          // because buildLeaderboard tells failures from successes by its truthiness.
          const message = (err && err.message) || String(err) || 'unknown error';
          return {
            ...identity,
            content: null,
            scores: { heuristic: 0, trait: null, judge: null, composite: 0 },
            latencyMs: Date.now() - start,
            cost: 0,
            genTokPerSec: null,
            error: message,
          };
        }
      })
    );
    const promptResults = await Promise.all(providerPromises);
    allResults.push(...promptResults);
  }

  const leaderboard = buildLeaderboard(allResults);
  return { runId, results: allResults, leaderboard };
}
-// ============================================================
-// Leaderboard
-// ============================================================
/**
 * Aggregate per-prompt results into a ranked leaderboard, one row per
 * provider/model pair.
 *
 * Averages (composite, latency) are taken over successful runs only; cost is
 * summed over every run. Rows are ordered by average composite (highest
 * first), ties broken by lower average latency, and ranked from 1.
 *
 * @param {Array} results - Per-prompt per-provider results from runHeadToHead
 * @returns {Array<{ provider: string, model: string, avgComposite: number, avgLatencyMs: number, totalCost: number, runs: number, errors: number, rank: number }>}
 */
function buildLeaderboard(results) {
  // Accumulate raw totals per "provider:model".
  const totals = new Map();
  for (const row of results) {
    const groupKey = `${row.provider}:${row.model}`;
    let bucket = totals.get(groupKey);
    if (bucket === undefined) {
      bucket = {
        provider: row.provider,
        model: row.model,
        totalComposite: 0,
        totalLatency: 0,
        totalCost: 0,
        runs: 0,
        errors: 0,
      };
      totals.set(groupKey, bucket);
    }
    bucket.runs += 1;
    if (row.error) {
      bucket.errors += 1;
    } else {
      bucket.totalComposite += row.scores.composite;
      bucket.totalLatency += row.latencyMs;
    }
    bucket.totalCost += row.cost;
  }

  // Turn totals into averaged summary rows.
  const board = [];
  for (const bucket of totals.values()) {
    const succeeded = bucket.runs - bucket.errors;
    let avgComposite = 0;
    let avgLatencyMs = 0;
    if (succeeded > 0) {
      avgComposite = Math.round((bucket.totalComposite / succeeded) * 1000) / 1000;
      avgLatencyMs = Math.round(bucket.totalLatency / succeeded);
    }
    board.push({
      provider: bucket.provider,
      model: bucket.model,
      avgComposite,
      avgLatencyMs,
      totalCost: Math.round(bucket.totalCost * 1_000_000) / 1_000_000,
      runs: bucket.runs,
      errors: bucket.errors,
    });
  }

  // Best composite first; faster model wins a tie.
  board.sort((left, right) => (
    right.avgComposite !== left.avgComposite
      ? right.avgComposite - left.avgComposite
      : left.avgLatencyMs - right.avgLatencyMs
  ));

  let position = 0;
  for (const row of board) {
    position += 1;
    row.rank = position;
  }
  return board;
}
-module.exports = {
-  pLimit,
-  estimateProviderCost,
-  getAvailableProviders,
-  traitScore,
-  runHeadToHead,
-  buildLeaderboard,
-};

package/template/wall-e/eval/humaneval-adapter.js DELETED Viewed

@@ -1,321 +0,0 @@
-'use strict';
-const fs = require('fs');
-const path = require('path');
-const os = require('os');
-const crypto = require('crypto');
-const { execFileSync } = require('child_process');
-const { createClient } = require('../llm/client');
-const { resolveModelName } = require('./agent-runner');
-const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
-const SUITE_NAME = 'humaneval-plus';
-const CACHE_DIR = path.join(os.homedir(), '.walle', 'eval-cache');
-const CACHE_FILE = path.join(CACHE_DIR, 'humaneval-plus.json');
-const CACHE_MAX_AGE_MS = 7 * 24 * 60 * 60 * 1000; // 7 days
-// EvalPlus HumanEval+ dataset URL (JSON format)
-const DATASET_URL = 'https://raw.githubusercontent.com/evalplus/evalplus/master/evalplus/data/humaneval.json';
/**
 * Load the HumanEval+ dataset, using a local cache when it is fresh.
 *
 * The cache file is used when it is younger than CACHE_MAX_AGE_MS and parses
 * to a non-empty array. Otherwise the dataset is downloaded from DATASET_URL
 * (30 s timeout) and the cache is refreshed. Caching is best-effort: a cache
 * read or write failure never prevents a successful download from being returned.
 *
 * @returns {Promise<Array>} Array of HumanEval tasks. When the downloaded JSON
 *   is an object keyed by task id, each entry becomes `{ task_id, ...task }`.
 * @throws {Error} When the download fails, or the payload is not a non-empty
 *   array/object of tasks.
 */
async function loadHumanEvalDataset() {
  // Check cache freshness
  try {
    if (fs.existsSync(CACHE_FILE)) {
      const stat = fs.statSync(CACHE_FILE);
      if (Date.now() - stat.mtimeMs < CACHE_MAX_AGE_MS) {
        const data = JSON.parse(fs.readFileSync(CACHE_FILE, 'utf8'));
        if (Array.isArray(data) && data.length > 0) return data;
      }
    }
  } catch { /* unreadable or corrupt cache: fall through and re-download */ }

  // Download dataset
  console.log('[humaneval] Downloading HumanEval+ dataset...');
  const resp = await fetch(DATASET_URL, { signal: AbortSignal.timeout(30000) });
  if (!resp.ok) {
    throw new Error(`Failed to download HumanEval+ dataset: ${resp.status} ${resp.statusText}`);
  }
  const raw = await resp.json();

  // Dataset can be object keyed by task_id or array
  let tasks;
  if (Array.isArray(raw)) {
    tasks = raw;
  } else if (raw !== null && typeof raw === 'object') {
    // `raw !== null` matters: typeof null is 'object', and Object.entries(null) throws.
    tasks = Object.entries(raw).map(([id, task]) => ({
      task_id: id,
      ...task,
    }));
  } else {
    throw new Error('Unexpected HumanEval+ dataset format');
  }
  if (tasks.length === 0) {
    throw new Error('HumanEval+ dataset is empty');
  }

  // Cache: write to a temp file and rename, so an interrupted write cannot
  // leave a truncated cache file behind.
  try {
    fs.mkdirSync(CACHE_DIR, { recursive: true });
    const tmpFile = `${CACHE_FILE}.${process.pid}.tmp`;
    fs.writeFileSync(tmpFile, JSON.stringify(tasks, null, 2));
    fs.renameSync(tmpFile, CACHE_FILE);
    console.log(`[humaneval] Cached ${tasks.length} tasks`);
  } catch (err) {
    console.warn(`[humaneval] Could not cache dataset: ${err.message}`);
  }
  return tasks;
}
/**
 * Extract code from an LLM response, stripping markdown fences.
 *
 * Resolution order:
 *  1. the body of the first complete fenced block, whatever its language tag
 *     (```python, ```py, ```python3, untagged, ...);
 *  2. everything after an opening fence that is never closed (e.g. a response
 *     cut off by the token limit);
 *  3. the text from the first `def` / `class` keyword onward;
 *  4. the trimmed response itself.
 *
 * @param {string} response - Raw model output.
 * @returns {string} Extracted code, trimmed; '' for an empty/missing response.
 */
function extractCode(response) {
  if (!response) return '';
  const text = typeof response === 'string' ? response : String(response);

  // Complete fenced block. The info string after the backticks is ignored,
  // so any language tag is accepted.
  const fenceMatch = text.match(/```[^\n`]*\n([\s\S]*?)```/);
  if (fenceMatch) return fenceMatch[1].trim();

  // Opening fence without a closing one: keep the whole remainder so leading
  // imports are not lost to the def/class heuristic below.
  const openFenceMatch = text.match(/```[^\n`]*\n([\s\S]*)$/);
  if (openFenceMatch) return openFenceMatch[1].trim();

  // If no fence, try to find a function definition
  const funcMatch = text.match(/((?:def|class)\s+[\s\S]*)/);
  if (funcMatch) return funcMatch[1].trim();

  // Return as-is
  return text.trim();
}
/**
 * Map difficulty based on task index (rough heuristic).
 *
 * The index is the last run of digits in the id (e.g. "HumanEval/42" -> 42):
 * below 50 is 'easy', below 120 is 'medium', everything else 'hard'.
 * Ids without any digits (or a missing id) are classed as 'hard'.
 *
 * @param {string|number} taskId - Task identifier.
 * @returns {'easy'|'medium'|'hard'}
 */
function taskDifficulty(taskId) {
  // Use only the trailing digit group so digits elsewhere in the id
  // (prefixes, version numbers) are not glued onto the index.
  const match = String(taskId ?? '').match(/(\d+)\D*$/);
  const num = match ? Number.parseInt(match[1], 10) : NaN;
  // Comparisons with NaN are false, so an unparseable id falls through to 'hard'.
  if (num < 50) return 'easy';
  if (num < 120) return 'medium';
  return 'hard';
}
// Flatten a chat result's content into plain text. Strings pass through;
// arrays are joined from string parts or `.text` fields
// (NOTE(review): assumes block-style content carries its text in `.text` — confirm
// against the client implementations); anything else is JSON-stringified.
function humanEvalResponseText(content) {
  if (typeof content === 'string') return content;
  if (content === null || content === undefined) return '';
  if (Array.isArray(content)) {
    return content
      .map((part) => (typeof part === 'string' ? part : (part && part.text) || ''))
      .join('');
  }
  return JSON.stringify(content) || '';
}

// Collect top-level import lines from the task prompt that the generated code
// does not already contain, so names the prompt imported stay defined when the
// model returns only the function.
function humanEvalPromptImports(prompt, code) {
  const present = new Set(code.split('\n').map((line) => line.trim()));
  return String(prompt || '')
    .split('\n')
    .filter((line) => /^(?:import\s+\S|from\s+\S+\s+import\s+\S)/.test(line))
    .filter((line) => !present.has(line.trim()));
}

/**
 * Run a single HumanEval task.
 *
 * Asks the model to complete `task.prompt`, extracts the code from the reply,
 * appends the task's test harness and executes the result with `python3`
 * (30 s timeout) in a throwaway temp directory. The task passes when the
 * interpreter exits successfully.
 *
 * NOTE(review): the generated Python is executed directly on the host with no
 * sandbox — only run this against models/environments you trust.
 *
 * @param {object} task - HumanEval task object (`task_id`, `prompt`, `test`, `entry_point`)
 * @param {object} options - { provider (client instance), model, providerType, config }
 * @returns {Promise<object>} Result with score and metadata. A failed model
 *   call returns a row with `error` set; a failed test run sets `testError`.
 * @throws {Error} When neither `provider` nor `providerType` is given.
 */
async function runHumanEvalTask(task, options = {}) {
  const { provider, model, providerType, config } = options;
  const startTime = Date.now();
  const modelId = model || 'claude-haiku-4-5-20251001';

  let client = provider;
  if (!client && providerType) {
    client = createClient(providerType, config || {});
  }
  if (!client) throw new Error('provider or providerType is required');

  const taskPrompt = `Complete the following Python function. Return ONLY the complete function implementation, nothing else.\n\n${task.prompt}`;

  let response = '';
  let usage = null;
  try {
    const result = await client.chat({
      model: modelId,
      messages: [{ role: 'user', content: taskPrompt }],
      maxTokens: 1024,
    });
    // Normalise inside the try: non-string content must not crash the task.
    response = humanEvalResponseText(result.content);
    usage = result.usage || null;
  } catch (err) {
    return {
      taskId: task.task_id,
      passed: false,
      score: { composite: 0, dimensions: {} },
      latencyMs: Date.now() - startTime,
      error: (err && err.message) || String(err),
      response: '',
    };
  }
  const latencyMs = Date.now() - startTime;
  const code = extractCode(response);

  // Write code + tests to temp file and run
  let passed = false;
  let testError = null;
  if (code) {
    let tmpDir = null;
    try {
      // mkdtemp creates a uniquely named directory atomically.
      tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'humaneval-'));
      const tmpFile = path.join(tmpDir, 'solution.py');

      // Build test file: prompt imports + generated code + test harness
      const testCode = task.test || '';
      const entryPoint = task.entry_point || '';
      const preamble = humanEvalPromptImports(task.prompt, code);
      let fullCode = [...preamble, code].join('\n') + '\n\n' + testCode;
      if (entryPoint && testCode.includes('check(')) {
        fullCode += `\ncheck(${entryPoint})\n`;
      }
      fs.writeFileSync(tmpFile, fullCode);
      execFileSync('python3', [tmpFile], {
        timeout: 30000,
        stdio: 'pipe',
        cwd: tmpDir,
      });
      passed = true;
    } catch (err) {
      testError = (err.stderr ? err.stderr.toString() : err.message).slice(0, 500);
    } finally {
      if (tmpDir) {
        try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch { /* best-effort cleanup */ }
      }
    }
  }

  // Score dimensions
  const dimensions = {
    correctness: passed ? 1.0 : 0.0,
    codeQuality: scoreHumanEvalQuality(code),
  };
  const composite = dimensions.correctness * 0.8 + dimensions.codeQuality * 0.2;

  // Estimate cost. Usage keys differ between clients, so normalise them, and
  // pass the model so per-model rates are used instead of the provider default.
  let costDollars = 0;
  try {
    const { estimateProviderCost } = require('./head-to-head');
    const counts = usage || {};
    costDollars = estimateProviderCost(
      {
        inputTokens: counts.input_tokens || counts.inputTokens || counts.input || 0,
        outputTokens: counts.output_tokens || counts.outputTokens || counts.output || 0,
      },
      providerType || 'anthropic',
      modelId
    );
  } catch { /* cost is informational only; keep 0 when it cannot be estimated */ }

  return {
    taskId: task.task_id,
    passed,
    score: { composite, dimensions },
    latencyMs,
    costDollars,
    response: response.slice(0, 2000),
    code: code.slice(0, 2000),
    error: null,
    testError: testError || null,
    usage,
  };
}
/**
 * Heuristic code-quality score in the range 0..1.
 *
 * Empty code scores 0. Otherwise the score starts at 0.5 and earns a bonus
 * for each signal found: a triple-quoted string (+0.15), an `if` that looks
 * like an edge-case check (+0.15), more than two lines (+0.1), and no
 * `print` (+0.1). The total is capped at 1.
 *
 * @param {string} code - Source text to score.
 * @returns {number}
 */
function scoreHumanEvalQuality(code) {
  if (!code) return 0;

  const lineCount = code.split('\n').length;
  // [signal present?, bonus] — evaluated and summed in this order.
  const signals = [
    [/"""[\s\S]*?"""|'''[\s\S]*?'''/.test(code), 0.15], // has docstring
    [/if\s+.*(?:None|not\s|len\(|==\s*0)/.test(code), 0.15], // edge case handling
    [lineCount > 2, 0.1], // non-trivial
    [!/\bprint\b/.test(code), 0.1], // no debug prints
  ];

  let total = 0.5;
  for (const [present, bonus] of signals) {
    if (present) total += bonus;
  }
  return Math.min(1, total);
}
/**
 * Run the full HumanEval+ suite.
 *
 * Tasks run sequentially. After each task the result is stored through
 * `brain.insertBenchmarkResult` when a brain providing it is supplied;
 * storage failures are logged and never stop the run.
 *
 * @param {object} options
 * @param {object} [options.brain] - Brain instance for storing results
 * @param {string} options.providerType - Provider type (anthropic, openai, etc.)
 * @param {object} [options.config] - Provider config (apiKey, baseUrl)
 * @param {string} options.model - Model ID
 * @param {number} [options.maxTasks] - Limit number of tasks (default: all)
 * @param {AbortSignal} [options.signal] - Abort signal, checked before each task
 * @param {string} [options.runId] - Run identifier to reuse (default: a fresh UUID)
 * @returns {Promise<object>} Suite results. `totalTasks` is the number of tasks
 *   planned, `completedTasks` the number actually executed; `passAt1` and
 *   `avgScore` are computed over executed tasks, so an aborted run is not
 *   penalised for tasks it never ran.
 */
async function runHumanEvalSuite(options = {}) {
  const { brain, providerType, config, model, maxTasks, signal, runId: providedRunId } = options;
  const allTasks = await loadHumanEvalDataset();
  const tasks = maxTasks ? allTasks.slice(0, maxTasks) : allTasks;
  const client = createClient(providerType || 'anthropic', config || {});
  const runId = providedRunId || crypto.randomUUID();

  // Values shared by every stored row.
  const providerName = providerType || 'unknown';
  const modelName = resolveModelName(model);
  const scoringMethod = 'executable-tests';
  const datasetVersion = 'humaneval-plus:evalplus-master';
  const canStore = Boolean(brain) && typeof brain.insertBenchmarkResult === 'function';

  const results = [];
  let totalPassed = 0;
  console.log(`[humaneval] Running ${tasks.length} tasks with ${model || 'default'}...`);

  for (const task of tasks) {
    if (signal?.aborted) break;
    const result = await runHumanEvalTask(task, { provider: client, model, providerType, config });
    results.push(result);
    if (result.passed) totalPassed++;
    console.log(`  ${result.passed ? 'PASS' : 'FAIL'} ${task.task_id} (${result.latencyMs}ms)`);

    // Store in brain
    if (canStore) {
      try {
        const difficulty = taskDifficulty(task.task_id);
        const trusted = !result.error;
        const runConfig = { maxTasks, scoringMethod };
        brain.insertBenchmarkResult(decorateBenchmarkResult({
          runId,
          suite: SUITE_NAME,
          promptId: task.task_id,
          taskType: 'coding',
          difficulty,
          provider: providerName,
          model: modelName,
          prompt: task.prompt,
          response: result.response || '',
          traitScore: null,
          compositeScore: result.score.composite,
          latencyMs: result.latencyMs,
          error: result.error || null,
          costDollars: result.costDollars || null,
          testsBefore: null,
          testsAfter: result.passed ? 1 : 0,
          totalTests: 1,
          dimensionsJson: JSON.stringify(result.score.dimensions),
          modelMetadataJson: JSON.stringify({ testError: result.testError || null }),
          datasetVersion,
          scorerVersion: DEFAULT_SCORER_VERSION,
          scoringMethod,
          trusted,
          runConfig,
        }, {
          suite: SUITE_NAME,
          benchmark: {
            id: task.task_id,
            prompt: task.prompt,
            taskType: 'coding',
            difficulty,
            datasetVersion,
          },
          runId,
          provider: providerName,
          model: modelName,
          scorerVersion: DEFAULT_SCORER_VERSION,
          scoringMethod,
          trusted,
          runConfig,
        }));
      } catch (err) {
        // Storage is best-effort, but say so instead of failing silently.
        console.warn(`[humaneval] Could not store result for ${task.task_id}: ${(err && err.message) || err}`);
      }
    }
  }

  // Use the number of executed tasks as the denominator: after an abort,
  // tasks that never ran must not count as failures.
  const completed = results.length;
  const passAt1 = completed > 0 ? totalPassed / completed : 0;
  return {
    runId,
    suite: SUITE_NAME,
    model: modelName,
    totalTasks: tasks.length,
    completedTasks: completed,
    passed: totalPassed,
    passAt1,
    avgScore: results.reduce((s, r) => s + r.score.composite, 0) / Math.max(completed, 1),
    totalCost: results.reduce((s, r) => s + (r.costDollars || 0), 0),
    results,
  };
}
-module.exports = {
-  SUITE_NAME,
-  loadHumanEvalDataset,
-  extractCode,
-  runHumanEvalTask,
-  runHumanEvalSuite,
-  scoreHumanEvalQuality,
-};