npm - create-walle - Versions diffs - 0.9.21 → 0.9.23 - Mend

create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (500)

package/template/wall-e/eval/eval-orchestrator.js (DELETED)

@@ -1,642 +0,0 @@
-'use strict';
-const { EventEmitter } = require('events');
-const crypto = require('crypto');
-const path = require('path');
-const fs = require('fs');
-const { pLimit, getAvailableProviders } = require('./head-to-head');
-const { runAgentBenchmark, runMultiTurnBenchmark, isTrustedAgentResult } = require('./agent-runner');
-const { createClient } = require('../llm/client');
-const { createAnthropicFromEnv } = require('../llm/anthropic');
-const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
-// ============================================================
-// Benchmark suite loader: JSON-backed suites are read from BENCHMARKS_DIR by loadSuite() and loadAllSuites()
-// ============================================================
-const BENCHMARKS_DIR = path.join(__dirname, 'benchmarks'); // "benchmarks" directory located next to this module
-const SUITE_FILES = { // suite name -> JSON file name inside BENCHMARKS_DIR
-  'coding-agent': 'coding-agent.json',
-  'coding':       'coding.json',
-  'chat':         'chat.json',
-  'chat-eval':    'chat-eval.json',
-  'reasoning':    'reasoning.json',
-  'memory-retrieval': 'memory-retrieval.json',
-};
-const ADAPTER_SUITE_METADATA = { // descriptors for suites that run through an adapter runner instead of a SUITE_FILES JSON file
-  'humaneval-plus': {
-    name: 'humaneval-plus',
-    count: null, // NOTE(review): null presumably means the task count is not known up front — confirm
-    taskTypes: ['coding'],
-    difficulties: ['easy', 'medium', 'hard'],
-    adapter: true,
-    description: 'EvalPlus HumanEval+ Python function-generation tasks',
-  },
-  livecodebench: {
-    name: 'livecodebench',
-    count: null, // NOTE(review): same open question as above — confirm what null stands for
-    taskTypes: ['coding'],
-    difficulties: ['easy', 'medium', 'hard'],
-    adapter: true,
-    description: 'LiveCodeBench code-generation tasks with date filtering',
-  },
-  'swebench-lite': {
-    name: 'swebench-lite',
-    count: 30,
-    taskTypes: ['coding-agent'],
-    difficulties: ['medium', 'hard'],
-    adapter: true,
-    description: 'Curated SWE-bench Lite issue-fixing tasks',
-  },
-};
-function isAdapterSuite(suiteName) { // true when ADAPTER_SUITE_METADATA[suiteName] is truthy
-  return !!ADAPTER_SUITE_METADATA[suiteName]; // NOTE(review): plain-object lookup, so inherited keys such as 'constructor' or 'toString' also yield true
-}
-function listAdapterSuites() { // returns one copy of every adapter-suite descriptor
-  return Object.values(ADAPTER_SUITE_METADATA).map((s) => ({ ...s })); // shallow copy: the taskTypes/difficulties arrays are still shared with the constant
-}
-const DEFAULT_ADAPTER_RUNNERS = { // suite name -> async runner; each adapter module is required only when its runner is invoked
-  'humaneval-plus': async ({ brain, providerInfo, model, runId, maxTasks, signal }) => {
-    const { runHumanEvalSuite } = require('./humaneval-adapter');
-    return runHumanEvalSuite({
-      brain,
-      providerType: providerInfo.provider,
-      config: { apiKey: providerInfo.apiKey, baseUrl: providerInfo.baseUrl }, // credentials are handed over; the pre-built providerInfo.client is not
-      model,
-      runId,
-      maxTasks,
-      signal,
-    });
-  },
-  livecodebench: async ({ brain, providerInfo, model, runId, maxTasks, afterDate, signal }) => {
-    const { runLiveCodeBenchSuite } = require('./livecodebench-adapter');
-    return runLiveCodeBenchSuite({
-      brain,
-      providerType: providerInfo.provider,
-      config: { apiKey: providerInfo.apiKey, baseUrl: providerInfo.baseUrl },
-      model,
-      runId,
-      maxTasks,
-      afterDate, // the only default runner that forwards afterDate
-      signal,
-    });
-  },
-  'swebench-lite': async ({ brain, providerInfo, model, runId, maxTasks, signal, runAgentLoop, timeoutMs }) => {
-    const { runSWEBenchSuite } = require('./swebench-adapter');
-    return runSWEBenchSuite({
-      brain,
-      provider: providerInfo.client, // unlike the two runners above, this one receives the already-created client
-      providerType: providerInfo.provider,
-      model,
-      runId,
-      maxTasks,
-      signal,
-      runAgentLoop,
-      timeoutMs,
-    });
-  },
-};
-function loadSuite(suiteName) { // returns the parsed JSON of one suite file; throws for unknown names and missing files
-  if (isAdapterSuite(suiteName)) return []; // adapter suites have no JSON file, so they load as an empty list
-  if (!(suiteName in SUITE_FILES)) throw new Error(`Unknown suite: ${suiteName}`);
-  const file = SUITE_FILES[suiteName];
-  const filePath = path.join(BENCHMARKS_DIR, file);
-  if (!fs.existsSync(filePath)) throw new Error(`Suite file not found: ${filePath}`);
-  return JSON.parse(fs.readFileSync(filePath, 'utf8')); // no try/catch here: a malformed file surfaces as the JSON.parse error
-}
-function loadAllSuites() { // concatenates every readable SUITE_FILES suite; adapter suites are not part of SUITE_FILES and are never included
-  const all = [];
-  for (const [suite, file] of Object.entries(SUITE_FILES)) {
-    const filePath = path.join(BENCHMARKS_DIR, file);
-    if (!fs.existsSync(filePath)) continue; // unlike loadSuite(), a missing file is skipped instead of thrown
-    try {
-      const benchmarks = JSON.parse(fs.readFileSync(filePath, 'utf8'));
-      for (const b of benchmarks) {
-        b._suite = suite; // tag each benchmark with the suite it came from
-        all.push(b);
-      }
-    } catch { /* skip malformed suites; entries pushed before the failure stay in the result */ }
-  }
-  return all;
-}
-// ============================================================
-// Local model detection (Ollama/MLX = free): callers in this file skip cost tracking and budget gating for these providers
-// ============================================================
-function isLocalModel(provider) { // provider is the provider type string, e.g. 'ollama'
-  return provider === 'ollama' || provider === 'mlx';
-}
-// ============================================================
-// EvalOrchestrator: runs benchmark suites across models and reports progress through EventEmitter events
-// ============================================================
-class EvalOrchestrator extends EventEmitter { // emits: benchmark-start, benchmark-complete, model-complete, run-complete, budget-warning, error
-  /**
-   * Store the run configuration and start with empty spend tracking.
-   * The numeric options below are defaulted with `||`, so 0 (like any falsy value) falls back to the default.
-   * @param {object} options
-   * @param {number} [options.concurrency=2] - Max concurrent benchmark runs
-   * @param {number} [options.budgetDollars=10.0] - Total budget cap in dollars
-   * @param {number} [options.timeoutMs=600000] - Per-benchmark timeout
-   * @param {object} [options.brain] - Brain module for DB access
-   * @param {string} [options.runId] - Resume a previous run, or auto-generated
-   * @param {object} [options.adapterRunners] - Suite name -> runner, merged over DEFAULT_ADAPTER_RUNNERS
-   */
-  constructor(options = {}) {
-    super();
-    this.concurrency = options.concurrency || 2;
-    this.budgetDollars = options.budgetDollars || 10.0;
-    this.timeoutMs = options.timeoutMs || 600000;
-    this.brain = options.brain || null;
-    this.runId = options.runId || crypto.randomUUID();
-    this.spent = {};   // model -> dollars spent
-    this.totalSpent = 0;
-    this.running = false;
-    this.aborted = false;
-    this.adapterRunners = { ...DEFAULT_ADAPTER_RUNNERS, ...(options.adapterRunners || {}) }; // caller-supplied runners win over defaults with the same suite name
-  }
-  /**
-   * Run benchmarks across models.
-   * Adapter suites (see isAdapterSuite) are delegated to _runAdapterSuite(). Any other suite is loaded
-   * from JSON, expanded into one work item per (model, benchmark) pair and executed under the
-   * concurrency limit, with cost tracking and budget checks after every item.
-   * NOTE(review): this method emits 'error' events. Per the Node.js EventEmitter contract an 'error'
-   * emitted with no listener attached is thrown, which would reject the run — confirm that callers
-   * always attach an 'error' listener.
-   * @param {object} params
-   * @param {string} params.suite - "coding-agent", "all", etc.
-   * @param {Array<string>} params.models - Model IDs to test
-   * @param {Array<string>} [params.benchmarkIds] - Filter to specific benchmark IDs
-   * @param {Function} params.runAgentLoop - Agent loop function for coding benchmarks
-   * @param {number} [params.maxTasks] - Only forwarded to adapter suites
-   * @param {*} [params.afterDate] - Only forwarded to adapter suites, passed through unchanged
-   * @returns {Promise<object>} Run summary
-   * @throws {Error} If this instance is already running, or if loadSuite() rejects the suite name
-   */
-  async run({ suite, models, benchmarkIds, runAgentLoop, maxTasks, afterDate }) {
-    if (this.running) throw new Error('Orchestrator is already running');
-    this.running = true;
-    this.aborted = false;
-    try {
-      if (isAdapterSuite(suite)) {
-        return await this._runAdapterSuite({ suite, models, benchmarkIds, runAgentLoop, maxTasks, afterDate });
-      }
-      // 1. Load benchmarks
-      const benchmarks = suite === 'all' ? loadAllSuites() : loadSuite(suite);
-      const filtered = benchmarkIds && benchmarkIds.length > 0
-        ? benchmarks.filter(b => benchmarkIds.includes(b.id))
-        : benchmarks;
-      if (filtered.length === 0) {
-        const summary = { runId: this.runId, status: 'complete', models: {}, totalBenchmarks: 0, totalSpent: 0 };
-        this.emit('run-complete', { runId: this.runId, summary });
-        return summary;
-      }
-      // 2. Validate models
-      if (!models || models.length === 0) {
-        const summary = { runId: this.runId, status: 'error', error: 'No models specified', models: {}, totalBenchmarks: 0, totalSpent: 0 };
-        this.emit('error', { benchmarkId: null, model: null, error: 'No models specified. Provide at least one model.' });
-        this.emit('run-complete', { runId: this.runId, summary });
-        return summary;
-      }
-      // 3. Resolve providers for each model
-      const providerMap = this._resolveProviders(models);
-      // 4. Get completed benchmarks for resume support
-      const completed = this._getCompletedBenchmarks();
-      // 5. Per-model budget
-      const perModelBudget = this.budgetDollars / models.length; // even split over every requested model, including ones that end up without a provider
-      // 6. Build work items
-      const workItems = [];
-      for (const model of models) {
-        const prov = providerMap[model];
-        if (!prov) {
-          this.emit('error', { benchmarkId: null, model, error: `No provider found for model: ${model}` });
-          continue;
-        }
-        for (const benchmark of filtered) {
-          const key = `${this.runId}:${model}:${benchmark.id}`;
-          if (completed.has(key)) continue; // skip already-done (resume)
-          workItems.push({ benchmark, model, provider: prov });
-        }
-      }
-      // 7. Run with concurrency limiter
-      const limit = pLimit(this.concurrency);
-      const results = [];
-      const modelResults = {}; // model -> [result, ...]
-      const promises = workItems.map(item => limit(async () => {
-        if (this.aborted) return null; // items still queued when abort() is called become no-ops
-        // Check budget before starting
-        const modelKey = item.model;
-        if (!isLocalModel(item.provider.provider) && (this.spent[modelKey] || 0) >= perModelBudget) { // local providers are never budget-gated
-          this.emit('error', {
-            benchmarkId: item.benchmark.id,
-            model: modelKey,
-            error: `Per-model budget exceeded ($${(this.spent[modelKey] || 0).toFixed(4)} >= $${perModelBudget.toFixed(2)})`,
-          });
-          return null;
-        }
-        const startedAt = new Date().toISOString();
-        this.emit('benchmark-start', {
-          benchmarkId: item.benchmark.id,
-          model: modelKey,
-          startedAt,
-        });
-        const startTime = Date.now();
-        let result;
-        try {
-          const runner = item.benchmark.multiTurn ? runMultiTurnBenchmark : runAgentBenchmark;
-          result = await runner(item.benchmark, {
-            runAgentLoop,
-            brain: this.brain,
-            timeoutMs: this.timeoutMs,
-            provider: item.provider.client,
-            model: modelKey,
-          });
-        } catch (err) {
-          result = { // a thrown runner is converted into a failed, zero-score, zero-cost result
-            benchmarkId: item.benchmark.id,
-            success: false,
-            score: { composite: 0, dimensions: {} },
-            latencyMs: Date.now() - startTime,
-            costDollars: 0,
-            error: err.message,
-          };
-        }
-        const elapsed = Date.now() - startTime;
-        const costDollars = result.costDollars || 0;
-        // Track cost
-        if (!isLocalModel(item.provider.provider)) {
-          this.spent[modelKey] = (this.spent[modelKey] || 0) + costDollars;
-          this.totalSpent += costDollars;
-        }
-        // Store result in brain
-        this._storeResult(item, result);
-        // Emit benchmark-complete
-        this.emit('benchmark-complete', {
-          benchmarkId: item.benchmark.id,
-          model: modelKey,
-          composite: result.score?.composite || 0,
-          dimensions: result.score?.dimensions || {},
-          costDollars,
-          elapsed,
-        });
-        // Track per-model
-        if (!modelResults[modelKey]) modelResults[modelKey] = [];
-        modelResults[modelKey].push(result);
-        results.push(result);
-        // Budget checks
-        this._checkBudget(perModelBudget); // calls abort() once total spend reaches the overall budget
-        return result;
-      }));
-      await Promise.all(promises);
-      // 8. Emit model-complete for each model
-      for (const model of models) {
-        const mrs = modelResults[model] || [];
-        if (mrs.length === 0) continue;
-        const avgScore = mrs.reduce((s, r) => s + (r.score?.composite || 0), 0) / mrs.length;
-        const totalCost = mrs.reduce((s, r) => s + (r.costDollars || 0), 0);
-        this.emit('model-complete', {
-          model,
-          avgScore: Math.round(avgScore * 1000) / 1000,
-          totalCost: Math.round(totalCost * 1_000_000) / 1_000_000,
-          benchmarksRun: mrs.length,
-        });
-      }
-      // 9. Summary
-      const summary = {
-        runId: this.runId,
-        status: this.aborted ? 'aborted' : 'complete',
-        totalBenchmarks: results.length,
-        totalSpent: Math.round(this.totalSpent * 1_000_000) / 1_000_000,
-        models: {},
-      };
-      for (const model of models) {
-        const mrs = modelResults[model] || [];
-        if (mrs.length === 0) continue;
-        summary.models[model] = {
-          avgScore: Math.round((mrs.reduce((s, r) => s + (r.score?.composite || 0), 0) / mrs.length) * 1000) / 1000,
-          totalCost: Math.round(mrs.reduce((s, r) => s + (r.costDollars || 0), 0) * 1_000_000) / 1_000_000,
-          benchmarksRun: mrs.length,
-          errors: mrs.filter(r => r.error).length,
-        };
-      }
-      this.emit('run-complete', { runId: this.runId, summary });
-      return summary;
-    } finally {
-      this.running = false; // released on every exit path, including thrown errors
-    }
-  }
-  /**
-   * Abort a running evaluation.
-   * Sets the aborted flag and emits an 'error' event with the text 'Run aborted by user'.
-   * In run(), work items that have not started yet return null; items already in flight are not
-   * interrupted here. Adapter runners receive a signal object whose `aborted` getter reads this flag.
-   * NOTE(review): emitting 'error' with no listener attached throws per the Node.js EventEmitter
-   * contract — confirm callers always register an 'error' listener before aborting.
-   */
-  abort() {
-    this.aborted = true;
-    this.emit('error', { benchmarkId: null, model: null, error: 'Run aborted by user' });
-  }
-  /**
-   * Resume a previous run — re-runs with the same runId, skipping completed benchmarks.
-   * This is a plain delegation to run(); the skipping happens there via _getCompletedBenchmarks().
-   * @param {object} params - Same as run() params
-   * @returns {Promise<object>} Run summary, as returned by run()
-   */
-  async resume(params) {
-    // runId is already set (from constructor), completed benchmarks will be skipped
-    return this.run(params);
-  }
-  // ---- Internal helpers ----
-  /**
-   * Resolve provider info and create LLM clients for each model.
-   * The provider type comes from a matching entry of getAvailableProviders(brain) when there is one,
-   * otherwise it is inferred from the model name. Models whose provider cannot be determined, or
-   * whose client creation throws, are left out of the returned map.
-   * Returns map of model -> { client, provider (type string), registryId, apiKey, baseUrl }
-   */
-  _resolveProviders(models) {
-    const map = {};
-    const available = this.brain ? getAvailableProviders(this.brain) : [];
-    // Infer provider type from model name
-    const inferProvider = (model) => {
-      if (model.startsWith('claude-')) return 'anthropic';
-      if (model.startsWith('gpt-') || model.startsWith('o1') || model.startsWith('o3') || model.startsWith('o4')) return 'openai';
-      if (model.startsWith('gemini-')) return 'google';
-      if (model.startsWith('deepseek-')) return 'deepseek';
-      if (model.startsWith('kimi-') || model.startsWith('moonshot-')) return 'moonshot';
-      if (model.includes(':')) return 'ollama'; // any remaining name containing ':' is treated as an Ollama model
-      return null;
-    };
-    // Env-var fallback for providers not registered in brain DB.
-    const envKeyFor = (providerType) => {
-      if (providerType === 'openai')   return process.env.OPENAI_API_KEY;
-      if (providerType === 'google')   return process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY;
-      if (providerType === 'deepseek') return process.env.DEEPSEEK_API_KEY;
-      if (providerType === 'moonshot') return process.env.MOONSHOT_API_KEY;
-      return null;
-    };
-    for (const model of models) {
-      const match = available.find(p => p.model === model);
-      let providerType = match?.provider || inferProvider(model);
-      if (!providerType) continue; // no event is emitted here; the callers report the missing provider
-      const apiKey = match?.apiKey || envKeyFor(providerType) || null;
-      const baseUrl = match?.baseUrl || null;
-      const registryId = match?.registryId || null;
-      try {
-        let client;
-        if (providerType === 'anthropic' && !apiKey) {
-          // Use env-based factory which picks up Portkey gateway config
-          client = createAnthropicFromEnv();
-        } else {
-          client = createClient(providerType, { apiKey, baseUrl });
-        }
-        map[model] = { client, provider: providerType, registryId, apiKey, baseUrl };
-      } catch (err) {
-        this.emit('error', { benchmarkId: null, model, error: `Failed to create client: ${err.message}` });
-      }
-    }
-    return map;
-  }
-  async _runAdapterSuite({ suite, models, benchmarkIds, runAgentLoop, maxTasks, afterDate }) { // adapter path of run(): one runner call per model, then an aggregated summary
-    const runner = this.adapterRunners[suite];
-    if (!runner) throw new Error(`No adapter runner configured for suite: ${suite}`);
-    if (!models || models.length === 0) {
-      const summary = { runId: this.runId, status: 'error', error: 'No models specified', models: {}, totalBenchmarks: 0, totalSpent: 0 };
-      this.emit('error', { benchmarkId: null, model: null, error: 'No models specified. Provide at least one model.' });
-      this.emit('run-complete', { runId: this.runId, summary });
-      return summary;
-    }
-    const providerMap = this._resolveProviders(models);
-    const adapterMaxTasks = maxTasks || (benchmarkIds && benchmarkIds.length ? benchmarkIds.length : undefined); // explicit maxTasks wins; otherwise only the count of benchmarkIds is used — the ids themselves are not passed to the runner
-    const limit = pLimit(this.concurrency);
-    const modelResults = {};
-    const orchestrator = this;
-    const adapterSignal = { get aborted() { return orchestrator.aborted; } }; // getter, so readers always see the current abort flag
-    const tasks = models.map((model) => limit(async () => {
-      const providerInfo = providerMap[model];
-      if (!providerInfo) {
-        this.emit('error', { benchmarkId: suite, model, error: `No provider found for model: ${model}` });
-        return null;
-      }
-      this.emit('benchmark-start', {
-        benchmarkId: suite, // the suite name stands in for a benchmark id on this path
-        model,
-        startedAt: new Date().toISOString(),
-      });
-      let result;
-      try {
-        result = await runner({
-          brain: this.brain,
-          providerInfo,
-          model,
-          runId: this.runId,
-          maxTasks: adapterMaxTasks,
-          afterDate,
-          signal: adapterSignal,
-          runAgentLoop,
-          timeoutMs: this.timeoutMs,
-        });
-      } catch (err) {
-        result = { // a thrown runner becomes an empty, zero-score result carrying the error message
-          suite,
-          model,
-          totalTasks: 0,
-          avgScore: 0,
-          totalCost: 0,
-          error: err.message,
-          results: [],
-        };
-        this.emit('error', { benchmarkId: suite, model, error: err.message });
-      }
-      const totalCost = result.totalCost || 0;
-      if (!isLocalModel(providerInfo.provider)) { // NOTE(review): spend is recorded, but this path never calls _checkBudget(), so no budget warning or abort is triggered here
-        this.spent[model] = (this.spent[model] || 0) + totalCost;
-        this.totalSpent += totalCost;
-      }
-      modelResults[model] = result;
-      this.emit('model-complete', {
-        model,
-        avgScore: Math.round((result.avgScore || 0) * 1000) / 1000,
-        totalCost: Math.round(totalCost * 1_000_000) / 1_000_000,
-        benchmarksRun: result.totalTasks || result.results?.length || 0,
-      });
-      return result;
-    }));
-    await Promise.all(tasks);
-    const summary = {
-      runId: this.runId,
-      status: this.aborted ? 'aborted' : 'complete',
-      totalBenchmarks: Object.values(modelResults).reduce((s, r) => s + (r.totalTasks || r.results?.length || 0), 0),
-      totalSpent: Math.round(this.totalSpent * 1_000_000) / 1_000_000,
-      models: {},
-      suite,
-      adapter: true,
-    };
-    for (const [model, result] of Object.entries(modelResults)) {
-      summary.models[model] = {
-        avgScore: Math.round((result.avgScore || 0) * 1000) / 1000,
-        totalCost: Math.round((result.totalCost || 0) * 1_000_000) / 1_000_000,
-        benchmarksRun: result.totalTasks || result.results?.length || 0,
-        errors: (result.results || []).filter(r => r.error).length + (result.error ? 1 : 0), // per-task errors plus one for a suite-level failure
-      };
-    }
-    this.emit('run-complete', { runId: this.runId, summary });
-    return summary;
-  }
-  /**
-   * Get set of completed benchmark keys for this runId (for resume).
-   * Keys have the form `${runId}:${model}:${benchmark_id}`, matching the keys built in run().
-   * Rows that carry an error are not counted as completed, so they are attempted again on resume.
-   * Returns an empty set when there is no brain, no getBenchmarkResults(), or the lookup throws.
-   */
-  _getCompletedBenchmarks() {
-    const completed = new Set();
-    if (!this.brain || typeof this.brain.getBenchmarkResults !== 'function') return completed;
-    try {
-      const results = this.brain.getBenchmarkResults({}); // queried with an empty filter; rows of other runs are dropped by the run_id check below
-      for (const r of results) {
-        if (r.run_id === this.runId && !r.error) {
-          completed.add(`${this.runId}:${r.model}:${r.benchmark_id}`);
-        }
-      }
-    } catch { /* non-fatal */ }
-    return completed;
-  }
-  /**
-   * Store a benchmark result in the brain DB.
-   * Does nothing when there is no brain or it has no insertBenchmarkResult(). Any exception raised
-   * while building or inserting the row is swallowed, so a failed write never fails the run.
-   * @param {object} item - Work item from run(): { benchmark, model, provider }
-   * @param {object} result - Result object produced by the benchmark runner (or the error fallback)
-   */
-  _storeResult(item, result) {
-    if (!this.brain || typeof this.brain.insertBenchmarkResult !== 'function') return;
-    try {
-      const scoringMethod = item.benchmark.agentExpectations?.testCommand
-        ? 'agent-rubric+tests'
-        : 'agent-rubric';
-      this.brain.insertBenchmarkResult(decorateBenchmarkResult({
-        runId: this.runId,
-        suite: item.benchmark._suite || 'coding-agent', // _suite is only set by loadAllSuites(); single-suite runs fall back to 'coding-agent'
-        promptId: item.benchmark.id,
-        taskType: item.benchmark.taskType || 'coding-agent',
-        difficulty: item.benchmark.difficulty,
-        provider: item.provider.provider,
-        model: item.model,
-        prompt: item.benchmark.prompt,
-        response: result.output || '',
-        traitScore: null,
-        compositeScore: result.score?.composite || 0,
-        latencyMs: result.latencyMs,
-        error: result.error || null,
-        costDollars: result.costDollars || null, // NOTE(review): `||` stores a genuine cost of 0 as null, unlike the `??` fields below — confirm this is intended
-        testsBefore: result.testsBefore ?? null,
-        testsAfter: result.testsAfter ?? null,
-        totalTests: result.totalTests ?? null,
-        dimensionsJson: result.dimensionsJson || null,
-        inputTokens: result.inputTokens ?? null,
-        outputTokens: result.outputTokens ?? null,
-        scorerVersion: DEFAULT_SCORER_VERSION,
-        scoringMethod,
-        trusted: isTrustedAgentResult(result),
-        runConfig: {
-          timeoutMs: this.timeoutMs,
-          concurrency: this.concurrency,
-          budgetDollars: this.budgetDollars,
-          scoringMethod,
-        },
-      }, {
-        suite: item.benchmark._suite || 'coding-agent',
-        benchmark: item.benchmark,
-        runId: this.runId,
-        provider: item.provider.provider,
-        model: item.model,
-        scoringMethod,
-        scorerVersion: DEFAULT_SCORER_VERSION,
-        trusted: isTrustedAgentResult(result),
-        runConfig: {
-          timeoutMs: this.timeoutMs,
-          concurrency: this.concurrency,
-          budgetDollars: this.budgetDollars,
-          scoringMethod,
-        },
-      }));
-    } catch { /* non-fatal */ }
-  }
-  /**
-   * Check budget and emit warnings/stops.
-   * - Total spend at or above 100% of the budget: emits 'budget-warning' and calls abort().
-   * - Total spend at or above 80%: emits 'budget-warning' with the rounded remaining amount.
-   * - Every model whose spend has reached perModelBudget: emits a 'budget-warning' carrying `model`.
-   * The per-model loop keeps no record of earlier warnings, so a model that is over budget is
-   * reported again on every later call.
-   * @param {number} perModelBudget - Dollar cap per model, as computed in run()
-   */
-  _checkBudget(perModelBudget) {
-    const remaining = this.budgetDollars - this.totalSpent;
-    const pctSpent = this.totalSpent / this.budgetDollars;
-    if (pctSpent >= 1.0) {
-      this.emit('budget-warning', {
-        spent: this.totalSpent,
-        budget: this.budgetDollars,
-        remaining: 0,
-      });
-      this.abort();
-      return; // per-model warnings are skipped once the whole run is being stopped
-    }
-    if (pctSpent >= 0.8) {
-      this.emit('budget-warning', {
-        spent: Math.round(this.totalSpent * 1_000_000) / 1_000_000,
-        budget: this.budgetDollars,
-        remaining: Math.round(remaining * 1_000_000) / 1_000_000,
-      });
-    }
-    // Check per-model budgets
-    for (const [model, spent] of Object.entries(this.spent)) {
-      if (spent >= perModelBudget) {
-        this.emit('budget-warning', {
-          spent: Math.round(spent * 1_000_000) / 1_000_000,
-          budget: perModelBudget,
-          remaining: 0,
-          model,
-        });
-      }
-    }
-  }
-}
-module.exports = { // public API; loadAllSuites, isLocalModel and DEFAULT_ADAPTER_RUNNERS are not exported
-  EvalOrchestrator,
-  ADAPTER_SUITE_METADATA,
-  isAdapterSuite,
-  listAdapterSuites,
-  loadSuite,
-};