npm - create-walle - Versions diffs - 0.9.21 → 0.9.23 - Mend

create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (500) hide show

package/template/wall-e/eval/evaluate.js DELETED Viewed

@@ -1,202 +0,0 @@
-'use strict';
-const { execFile } = require('node:child_process');
-const { promisify } = require('node:util');
-const execFileAsync = promisify(execFile);
-// ---------------------------------------------------------------------------
-// Built-in benchmark prompts
-// ---------------------------------------------------------------------------
-// Default benchmark set, written as [prompt, type] pairs and expanded into
-// the { prompt, type } records the evaluation loop consumes.
-const BENCHMARK_PROMPTS = [
-  ['Explain the difference between a promise and a callback in JavaScript.', 'coding'],
-  ['Summarize the key points of effective code review.', 'coding'],
-  ['What are the pros and cons of microservices vs monolith?', 'reasoning'],
-  ['Write a function to find the longest palindrome in a string.', 'coding'],
-  ['What should I prioritize when debugging a production outage?', 'reasoning'],
-].map(([prompt, type]) => ({ prompt, type }));
-// ---------------------------------------------------------------------------
-// Scoring helpers
-// ---------------------------------------------------------------------------
-/**
- * Score a single response on a 0-1 scale.
- *
- * Components: a flat 0.4 for any non-blank response, up to 0.3 for length
- * (linear, saturating at 500 characters) and up to 0.3 for latency (linear
- * decay from full credit at 0 ms to no credit at 10 s).
- *
- * @param {string} response - Model output; blank or non-string input scores 0.
- * @param {number} latencyMs - Elapsed time in milliseconds.
- * @returns {number} Score in [0, 1], rounded to three decimals.
- */
-function scoreResponse(response, latencyMs) {
-  if (typeof response !== 'string' || !response.trim()) return 0;
-  // Coherence: flat credit for producing any non-blank text.
-  const coherenceScore = 0.4;
-  // Length: longer is treated as more detailed, capped at 500 characters.
-  const lengthScore = Math.min(response.length / 500, 1) * 0.3;
-  // Latency: faster is better. The factor is clamped to [0, 1] so a negative
-  // latency cannot push the total above 1, and a missing or non-finite
-  // latency earns no latency credit instead of turning the score into NaN.
-  const latencyFactor = Number.isFinite(latencyMs)
-    ? Math.min(Math.max(1 - latencyMs / 10000, 0), 1)
-    : 0;
-  const latencyScore = latencyFactor * 0.3;
-  const score = coherenceScore + lengthScore + latencyScore;
-  return Math.round(score * 1000) / 1000;
-}
-// ---------------------------------------------------------------------------
-// Model runner
-// ---------------------------------------------------------------------------
-/**
- * Send one prompt to an Ollama model via `ollama run` and time the call.
- * Failures are not thrown: they resolve to an empty response plus the
- * error message.
- * @param {string} model - Ollama model name
- * @param {string} prompt - The prompt text
- * @returns {Promise<{ response: string, latencyMs: number, error?: string }>}
- */
-async function runOllamaPrompt(model, prompt) {
-  const startedAt = Date.now();
-  const elapsedMs = () => Date.now() - startedAt;
-  const execOptions = { timeout: 30000, maxBuffer: 1024 * 1024 };
-  try {
-    const output = await execFileAsync('ollama', ['run', model, prompt], execOptions);
-    return { response: output.stdout.trim(), latencyMs: elapsedMs() };
-  } catch (failure) {
-    return { response: '', latencyMs: elapsedMs(), error: failure.message };
-  }
-}
-// ---------------------------------------------------------------------------
-// Main entry point
-// ---------------------------------------------------------------------------
-/**
- * Evaluate fine-tuned models against a baseline and deploy the best performer.
- *
- * Every prompt is run against each fine-tuned model and the baseline. A
- * fine-tuned model wins a prompt when its score is strictly higher than the
- * baseline's. The model with the highest win rate (ties broken by average
- * score) becomes the winner only when that win rate is strictly above
- * `winThreshold`; the winner is then handed to deployModel.
- *
- * @param {Object} options
- * @param {string[]} options.fineTunedModels - Ollama model names to evaluate
- * @param {Array<string|{prompt: string, type: string}>} [options.benchmarkPrompts] -
- *   Test prompts, as plain strings or { prompt, type } objects (defaults to BENCHMARK_PROMPTS)
- * @param {string} [options.baseline] - Baseline model name (default: 'llama3.2:1b')
- * @param {number} [options.winThreshold] - Win rate a model must exceed to be deployed (default: 0.6)
- * @param {Object} [deps] - { runPrompt }; the whole object is also forwarded to deployModel
- * @returns {Promise<{ results: Array, winner: string|null, deployed: boolean }>}
- */
-async function evaluateAndDeploy(options, deps = {}) {
-  const {
-    fineTunedModels = [],
-    benchmarkPrompts,
-    baseline = 'llama3.2:1b',
-    winThreshold = 0.6,
-  } = options || {};
-  if (!fineTunedModels.length) {
-    return { results: [], winner: null, deployed: false };
-  }
-  // The documented input is a list of strings while the built-in set uses
-  // { prompt, type } objects; normalize so both shapes are evaluated correctly.
-  const prompts = Array.from(benchmarkPrompts || BENCHMARK_PROMPTS, (entry) => (
-    typeof entry === 'string' ? { prompt: entry, type: undefined } : entry
-  ));
-  const runPrompt = deps.runPrompt || runOllamaPrompt;
-  // All models to evaluate: fine-tuned + baseline. De-duplicated so a baseline
-  // that is also listed as fine-tuned is not run twice per prompt.
-  const allModels = [...new Set([...fineTunedModels, baseline])];
-  const results = [];
-  // Run each prompt against all models, sequentially.
-  for (const { prompt, type } of prompts) {
-    const promptResult = { prompt, type, scores: {} };
-    for (const model of allModels) {
-      try {
-        const { response, latencyMs } = await runPrompt(model, prompt);
-        const score = scoreResponse(response, latencyMs);
-        promptResult.scores[model] = { score, latencyMs, responseLength: (response || '').length };
-      } catch (err) {
-        // A failing run counts as a zero score rather than aborting the evaluation.
-        promptResult.scores[model] = { score: 0, latencyMs: 0, responseLength: 0, error: err.message };
-      }
-    }
-    results.push(promptResult);
-  }
-  // Compare each fine-tuned model against baseline
-  let winner = null;
-  let bestWinRate = 0;
-  let bestAvgScore = 0;
-  const comparisons = results.length;
-  for (const model of fineTunedModels) {
-    let wins = 0;
-    let totalScore = 0;
-    for (const r of results) {
-      const modelScore = r.scores[model]?.score || 0;
-      const baselineScore = r.scores[baseline]?.score || 0;
-      totalScore += modelScore;
-      if (modelScore > baselineScore) wins++;
-    }
-    const winRate = comparisons > 0 ? wins / comparisons : 0;
-    const avgScore = comparisons > 0 ? totalScore / comparisons : 0;
-    // Pick model with highest win rate; break ties by average score
-    if (winRate > bestWinRate || (winRate === bestWinRate && avgScore > bestAvgScore)) {
-      bestWinRate = winRate;
-      bestAvgScore = avgScore;
-      if (winRate > winThreshold) {
-        winner = model;
-      }
-    }
-  }
-  // Deploy winner if found
-  let deployed = false;
-  if (winner) {
-    try {
-      await deployModel(winner, deps);
-      deployed = true;
-    } catch {
-      // Deliberate best-effort: a failed deployment is reported through
-      // `deployed: false` while the evaluation results are still returned.
-      deployed = false;
-    }
-  }
-  return { results, winner, deployed };
-}
-// ---------------------------------------------------------------------------
-// Deployment
-// ---------------------------------------------------------------------------
-/**
- * Copy a model to a `walle-` prefixed Ollama alias and, when a brain with
- * `getDb` is supplied, record the alias in its model_registry table.
- * @param {string} modelName - The model to deploy
- * @param {Object} deps - { brain, execDeploy }
- */
-async function deployModel(modelName, deps = {}) {
-  const runDeploy = deps.execDeploy || execFileAsync;
-  const registryBrain = deps.brain || null;
-  // Alias name: '/' and ':' in the model name become '-'.
-  const walleAlias = `walle-${modelName.replace(/[/:]/g, '-')}`;
-  await runDeploy('ollama', ['cp', modelName, walleAlias], { timeout: 60000 });
-  // Registration is optional and only attempted when a brain exposes getDb.
-  if (!registryBrain || !registryBrain.getDb) return;
-  try {
-    const insertStatement = registryBrain.getDb().prepare(`
-        INSERT OR REPLACE INTO model_registry (name, base_model, status, deployed_at)
-        VALUES (?, ?, 'active', datetime('now'))
-      `);
-    insertStatement.run(walleAlias, modelName);
-  } catch {
-    // model_registry table may not exist yet -- non-fatal
-  }
-}
-// Public API: the evaluation entry point, the deployment helper, the scoring
-// function and the built-in prompt set. runOllamaPrompt is not exported.
-module.exports = {
-  evaluateAndDeploy,
-  deployModel,
-  scoreResponse,
-  BENCHMARK_PROMPTS,
-};

package/template/wall-e/eval/evaluator.js DELETED Viewed

@@ -1,373 +0,0 @@
-'use strict';
-/**
- * Shadow evaluation framework — scores shadow model outputs against primary.
- * Three methods: heuristic (free), embedding similarity (cheap), LLM-as-judge (expensive).
- */
-// --- Heuristic Scoring (free, instant) ---
-/**
- * Task-specific score (0 - 0.6) for coding responses: up to 0.3 for matching
- * the primary's use of fenced code blocks and up to 0.3 for the share of the
- * primary's code keywords that also appear in the shadow.
- */
-function codingHeuristic(primary, shadow) {
-  const fence = /```/;
-  const primaryFenced = fence.test(primary);
-  const shadowFenced = fence.test(shadow);
-  // Primary without code: neutral 0.15. Primary with code: 0.3 only when the
-  // shadow also contains a code block.
-  let total;
-  if (!primaryFenced) {
-    total = 0.15;
-  } else {
-    total = shadowFenced ? 0.3 : 0;
-  }
-  const keywordPattern = /\b(function|const|let|var|class|import|export|return|if|for|while|async|await)\b/g;
-  const collectKeywords = (text) => new Set(
-    (text.match(keywordPattern) || []).map((word) => word.toLowerCase()),
-  );
-  const expected = collectKeywords(primary);
-  const found = collectKeywords(shadow);
-  if (expected.size > 0 && found.size > 0) {
-    let shared = 0;
-    for (const word of found) {
-      if (expected.has(word)) shared += 1;
-    }
-    total += 0.3 * (shared / Math.max(expected.size, 1));
-  }
-  return Math.min(total, 0.6);
-}
-/**
- * Task-specific score (0 - 0.6) for Q&A responses, based on how many of the
- * primary's words (longer than three characters) the shadow also uses.
- * Half of the primary's vocabulary is enough for the full score.
- */
-function qaHeuristic(primary, shadow) {
-  const significantWords = (text) => new Set(
-    text.toLowerCase().split(/\W+/).filter((word) => word.length > 3),
-  );
-  const expected = significantWords(primary);
-  const found = significantWords(shadow);
-  if (expected.size === 0 || found.size === 0) return 0;
-  let shared = 0;
-  for (const word of found) {
-    if (expected.has(word)) shared += 1;
-  }
-  const overlapRate = shared / Math.max(expected.size, 1);
-  // scale so 50% overlap = full score
-  return Math.min(0.6 * Math.min(overlapRate * 2, 1), 0.6);
-}
-/**
- * Task-specific score (0 - 0.6) for planning responses, combining three
- * signals: list/heading structure, vocabulary overlap with the primary, and
- * length relative to the primary.
- */
-function planningHeuristic(primary, shadow) {
-  // A line starting with a markdown heading, a bullet, or a numbered step.
-  const structureMarker = /^(#{1,3}\s|[\-*]\s|\d+[\.\)]\s)/m;
-  const primaryStructured = structureMarker.test(primary);
-  const shadowStructured = structureMarker.test(shadow);
-  let total = 0;
-  // Structure only earns credit when the shadow has it; more when both do.
-  if (shadowStructured) {
-    total += primaryStructured ? 0.15 : 0.1;
-  }
-  const significantWords = (text) => new Set(
-    text.toLowerCase().split(/\W+/).filter((word) => word.length > 3),
-  );
-  const expected = significantWords(primary);
-  const found = significantWords(shadow);
-  if (expected.size > 0 && found.size > 0) {
-    let shared = 0;
-    for (const word of found) {
-      if (expected.has(word)) shared += 1;
-    }
-    const overlapRate = shared / Math.max(expected.size, 1);
-    total += 0.25 * Math.min(overlapRate * 2, 1);
-  }
-  // Thoroughness: one step at 40% of the primary's length, another at 70%.
-  for (const fraction of [0.4, 0.7]) {
-    if (shadow.length >= primary.length * fraction) total += 0.1;
-  }
-  return Math.min(total, 0.6);
-}
-/**
- * Fallback task score (0 - 0.6): vocabulary overlap between primary and
- * shadow as a rough similarity proxy. Returns a neutral 0.3 when either
- * text has no words longer than three characters.
- */
-function generalHeuristic(primary, shadow) {
-  const significantWords = (text) => new Set(
-    text.toLowerCase().split(/\W+/).filter((word) => word.length > 3),
-  );
-  const expected = significantWords(primary);
-  const found = significantWords(shadow);
-  const nothingToCompare = expected.size === 0 || found.size === 0;
-  if (nothingToCompare) return 0.3;
-  let shared = 0;
-  for (const word of found) {
-    if (expected.has(word)) shared += 1;
-  }
-  const overlapRate = shared / Math.max(expected.size, 1);
-  return Math.min(0.6 * overlapRate * 2, 0.6);
-}
-/**
- * Heuristic scoring of a shadow response against the primary (free, instant).
- * Adds 0.2 for a non-blank shadow, up to 0.2 for a plausible length ratio,
- * and up to 0.6 from the heuristic matching `taskType`.
- * @returns {number} 0.0 - 1.0, rounded to three decimals
- */
-function heuristicScore(primary, shadow, taskType) {
-  const hasShadowText = Boolean(shadow) && Boolean(shadow.trim());
-  if (!hasShadowText) return 0;
-  // Non-empty response: 0.2
-  let total = 0.2;
-  // Length ratio: 0.2 inside 0.3x-3x of the primary, 0.1 inside 0.1x-5x.
-  // Without a primary to compare against, a flat 0.1 is granted.
-  if (!primary) {
-    total += 0.1;
-  } else {
-    const ratio = shadow.length / Math.max(primary.length, 1);
-    const isIdeal = ratio >= 0.3 && ratio <= 3.0;
-    const isAcceptable = ratio >= 0.1 && ratio <= 5.0;
-    if (isIdeal) {
-      total += 0.2;
-    } else if (isAcceptable) {
-      total += 0.1;
-    }
-  }
-  // Task-specific scoring: up to 0.6
-  const reference = primary || '';
-  switch (taskType) {
-    case 'coding':
-      total += codingHeuristic(reference, shadow);
-      break;
-    case 'planning':
-      total += planningHeuristic(reference, shadow);
-      break;
-    case 'qa':
-      total += qaHeuristic(reference, shadow);
-      break;
-    default:
-      total += generalHeuristic(reference, shadow);
-  }
-  return Math.min(Math.round(total * 1000) / 1000, 1.0);
-}
-// --- Embedding Similarity (cheap, ~100ms) ---
-/**
- * Cosine similarity of two equal-length numeric vectors.
- * Returns 0 for missing, empty, or mismatched-length inputs and for
- * zero-magnitude vectors.
- */
-function cosineSimilarity(a, b) {
-  const comparable = a && b && a.length === b.length && a.length !== 0;
-  if (!comparable) return 0;
-  let dotProduct = 0;
-  let squaredA = 0;
-  let squaredB = 0;
-  let index = 0;
-  while (index < a.length) {
-    const left = a[index];
-    const right = b[index];
-    dotProduct += left * right;
-    squaredA += left * left;
-    squaredB += right * right;
-    index += 1;
-  }
-  const magnitude = Math.sqrt(squaredA) * Math.sqrt(squaredB);
-  if (magnitude === 0) return 0;
-  return dotProduct / magnitude;
-}
-/**
- * Request an embedding for `text` from the local Ollama embed endpoint.
- * Only the first 2048 characters are sent and the request is aborted after
- * 10 seconds. Any failure resolves to null.
- * @returns {number[]|null}
- */
-async function getEmbedding(text, model = 'nomic-embed-text') {
-  try {
-    const requestInit = {
-      method: 'POST',
-      headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({ model, input: text.slice(0, 2048) }),
-      signal: AbortSignal.timeout(10000),
-    };
-    const reply = await fetch('http://localhost:11434/api/embed', requestInit);
-    if (!reply.ok) return null;
-    const { embeddings } = await reply.json();
-    return embeddings?.[0] || null;
-  } catch {
-    return null;
-  }
-}
-/**
- * Cosine similarity between the embeddings of the primary and shadow
- * responses, rounded to three decimals. Both embeddings are requested
- * concurrently through `embedFn` (getEmbedding when omitted).
- * @returns {number|null} similarity, or null if an embedding is unavailable
- */
-async function embeddingSimilarity(primary, shadow, embedFn) {
-  const embed = embedFn || getEmbedding;
-  try {
-    const vectors = await Promise.all([embed(primary), embed(shadow)]);
-    const primaryVector = vectors[0];
-    const shadowVector = vectors[1];
-    if (primaryVector && shadowVector) {
-      return Math.round(cosineSimilarity(primaryVector, shadowVector) * 1000) / 1000;
-    }
-    return null;
-  } catch {
-    return null;
-  }
-}
-// --- LLM-as-Judge (expensive, ~5s) ---
-// Module-level rate-limit state: number of judge calls made on the date held
-// in _judgeCountDate (the YYYY-MM-DD prefix of the UTC ISO timestamp).
-let _judgeDailyCount = 0;
-let _judgeCountDate = '';
-// Zero the counter the first time it is consulted on a new date.
-function resetJudgeCountIfNewDay() {
-  const currentDay = new Date().toISOString().slice(0, 10);
-  if (_judgeCountDate === currentDay) return;
-  _judgeDailyCount = 0;
-  _judgeCountDate = currentDay;
-}
-/**
- * Decide whether this result should go to the LLM judge.
- * Never once the daily count has reached `maxPerDay` (default 50); always
- * for planning and coding; otherwise by random sampling at a per-task rate.
- */
-function shouldJudge(taskType, maxPerDay = 50) {
-  resetJudgeCountIfNewDay();
-  const budgetLeft = _judgeDailyCount < maxPerDay;
-  if (!budgetLeft) return false;
-  const alwaysJudged = taskType === 'planning' || taskType === 'coding';
-  if (alwaysJudged) return true;
-  // Sampling rates for everything else; unlisted task types use 10%.
-  const sampleRates = { chat: 0.1, qa: 0.1, 'slack-reply': 0.2 };
-  return Math.random() < (sampleRates[taskType] || 0.1);
-}
-/**
- * LLM-as-Judge — blind A/B comparison of the primary (A) and shadow (B)
- * responses. Each call counts against the daily judge budget, including
- * calls whose judge fails.
- *
- * Missing or non-string inputs are treated as text (null/undefined become
- * an empty string) so a result without a primary response yields a verdict
- * or null instead of throwing. Judge scores are read on a 0-10 scale and
- * normalized into [0, 1]; non-numeric scores count as 0.
- *
- * @param {string} prompt - Original prompt (first 1000 characters are used)
- * @param {string} primary - Primary response (first 2000 characters are used)
- * @param {string} shadow - Shadow response (first 2000 characters are used)
- * @param {string} taskType - Task type shown to the judge
- * @param {Function} [judgeFn] - async (judgePrompt) => { a_score, b_score, reasoning } | null
- * @returns {{ primaryScore: number, shadowScore: number, reasoning: string }|null}
- */
-async function llmJudge(prompt, primary, shadow, taskType, judgeFn) {
-  resetJudgeCountIfNewDay();
-  _judgeDailyCount++;
-  // Coerce before slicing: `.slice` on null/undefined would throw outside the
-  // try block and abort the caller's whole evaluation.
-  const asText = (value) => (typeof value === 'string' ? value : String(value ?? ''));
-  // Normalize a 0-10 judge score into [0, 1]; anything non-numeric becomes 0.
-  const toUnitScore = (raw) => {
-    const scaled = Number(raw) / 10;
-    return Number.isFinite(scaled) ? Math.min(Math.max(scaled, 0), 1) : 0;
-  };
-  const judgePrompt = `You are evaluating two AI responses to the same prompt.
-Rate each response on a 0-10 scale for: accuracy, helpfulness, completeness.
-Then compute an overall score (average of the three).
-Task type: ${taskType}
-Prompt: ${asText(prompt).slice(0, 1000)}
-Response A:
-${asText(primary).slice(0, 2000)}
-Response B:
-${asText(shadow).slice(0, 2000)}
-Output ONLY valid JSON (no markdown): { "a_score": N, "b_score": N, "reasoning": "..." }`;
-  try {
-    const judge = judgeFn || defaultJudgeFn;
-    const result = await judge(judgePrompt);
-    if (!result) return null;
-    return {
-      primaryScore: toUnitScore(result.a_score),
-      shadowScore: toUnitScore(result.b_score),
-      reasoning: result.reasoning || '',
-    };
-  } catch {
-    // Judge failures are non-fatal; the caller falls back to other methods.
-    return null;
-  }
-}
-/**
- * Default judge: sends the judging prompt through the project's default LLM
- * client and parses the first-'{'-to-last-'}' span of the reply as JSON.
- * Resolves to null when the client is unavailable, the reply contains no
- * JSON object, or parsing fails.
- */
-async function defaultJudgeFn(prompt) {
-  try {
-    const { getDefaultClient } = require('../llm/client');
-    const llmClient = getDefaultClient();
-    const request = {
-      model: 'claude-haiku-4-5-20251001',
-      messages: [{ role: 'user', content: prompt }],
-      maxTokens: 500,
-    };
-    const reply = await llmClient.chat(request);
-    let replyText;
-    if (typeof reply.content === 'string') {
-      replyText = reply.content;
-    } else {
-      replyText = reply.text || '';
-    }
-    // Greedy match: from the first opening brace to the last closing brace.
-    const jsonSpan = replyText.match(/\{[\s\S]*\}/);
-    return jsonSpan ? JSON.parse(jsonSpan[0]) : null;
-  } catch {
-    return null;
-  }
-}
-// --- Composite Scoring ---
-/**
- * Weighted average of whichever scores are available: heuristic (weight 1.0),
- * embedding similarity (1.5) and the judge's shadow score (2.0). Returns 0
- * when none are available. `taskType` is accepted but not used.
- */
-function computeCompositeEvalScore(heuristic, embedding, judge, taskType) {
-  const weighted = [];
-  if (heuristic != null) weighted.push([heuristic, 1.0]);
-  if (embedding != null) weighted.push([embedding, 1.5]);
-  if (judge != null) weighted.push([judge.shadowScore, 2.0]);
-  if (weighted.length === 0) return 0;
-  let weightTotal = 0;
-  let scoreTotal = 0;
-  for (const [score, weight] of weighted) {
-    weightTotal += weight;
-    scoreTotal += score * weight;
-  }
-  return Math.round((scoreTotal / weightTotal) * 1000) / 1000;
-}
-// --- Main Evaluation Pipeline ---
-/**
- * Score one shadow result and persist the outcome through
- * `brain.updateShadowResultEval`. The primary is always scored 1.0; the
- * shadow score is the composite of the heuristic, the embedding similarity
- * (unless `deps.embedFn === false`) and, when sampled, the LLM judge
- * (unless `deps.judgeFn === false`).
- * @param {Object} result - Row with id, task_type, prompt, primary_response, shadow_response
- * @param {Object} brain - Brain module
- * @param {Object} [deps] - { embedFn, judgeFn, maxJudgePerDay } overrides for testing
- * @returns {Object} { primaryScore, shadowScore, evalMethod }
- */
-async function evaluateShadowResult(result, brain, deps = {}) {
-  const taskType = result.task_type;
-  const prompt = result.prompt;
-  const primary = result.primary_response;
-  const shadow = result.shadow_response;
-  // No shadow output at all: record a zero and skip every scoring method.
-  if (!shadow) {
-    const noResponse = { primaryScore: 1.0, shadowScore: 0, evalMethod: 'no-response' };
-    brain.updateShadowResultEval(result.id, { ...noResponse });
-    return noResponse;
-  }
-  const methods = ['heuristic'];
-  const hScore = heuristicScore(primary, shadow, taskType);
-  let eScore = null;
-  if (deps.embedFn !== false) {
-    eScore = await embeddingSimilarity(primary, shadow, deps.embedFn || undefined);
-  }
-  if (eScore != null) methods.push('embedding');
-  let jResult = null;
-  const judgeEnabled = deps.judgeFn !== false;
-  if (judgeEnabled && shouldJudge(taskType, deps.maxJudgePerDay)) {
-    jResult = await llmJudge(prompt, primary, shadow, taskType, deps.judgeFn || undefined);
-  }
-  if (jResult != null) methods.push('judge');
-  const shadowScore = computeCompositeEvalScore(hScore, eScore, jResult, taskType);
-  const evalMethod = methods.join('+');
-  brain.updateShadowResultEval(result.id, {
-    primaryScore: 1.0,
-    shadowScore,
-    evalMethod,
-    judgeModel: jResult ? 'claude-haiku-4-5-20251001' : null,
-    judgeReasoning: jResult?.reasoning || null,
-  });
-  return { primaryScore: 1.0, shadowScore, evalMethod };
-}
-/**
- * Evaluate pending shadow results one after another and report how many
- * succeeded together with their mean shadow score (three decimals).
- * A result whose evaluation throws is logged and left out of the totals.
- * @param {Object} brain - Brain module
- * @param {Object} [options]
- * @param {number} [options.limit=100]
- * @param {Object} [options.deps] - Dependency injection
- * @returns {{ evaluated: number, avgScore: number }}
- */
-async function runBatchEval(brain, options = {}) {
-  const limit = options.limit || 100;
-  const deps = options.deps || {};
-  const pending = brain.getShadowResults({ evaluated: false, limit });
-  const tally = { sum: 0, evaluated: 0 };
-  for (const row of pending) {
-    try {
-      const outcome = await evaluateShadowResult(row, brain, deps);
-      tally.sum += outcome.shadowScore;
-      tally.evaluated += 1;
-    } catch (err) {
-      console.error(`[evaluator] Error evaluating ${row.id}:`, err.message);
-    }
-  }
-  let avgScore = 0;
-  if (tally.evaluated > 0) {
-    avgScore = Math.round((tally.sum / tally.evaluated) * 1000) / 1000;
-  }
-  return { evaluated: tally.evaluated, avgScore };
-}
-// Public API of the shadow evaluator.
-// NOTE(review): planningHeuristic is defined in this file and used by
-// heuristicScore but, unlike codingHeuristic/qaHeuristic/generalHeuristic,
-// is not exported here — confirm whether that omission is intentional.
-module.exports = {
-  heuristicScore,
-  codingHeuristic,
-  qaHeuristic,
-  generalHeuristic,
-  cosineSimilarity,
-  embeddingSimilarity,
-  shouldJudge,
-  llmJudge,
-  computeCompositeEvalScore,
-  evaluateShadowResult,
-  runBatchEval,
-  // Exported for testing
-  getEmbedding,
-};