npm - create-walle - Versions diffs - 0.9.21 → 0.9.23 - Mend

create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (500) hide show

package/template/wall-e/eval/meta-harness/proposer-runner.js DELETED Viewed

@@ -1,110 +0,0 @@
-'use strict';
-const fs = require('node:fs');
-const path = require('node:path');
-function buildProposerPrompt({ iteration, store, domainSpec, frontier = {} } = {}) {
-  if (!store) throw new Error('store is required');
-  if (!domainSpec) throw new Error('domainSpec is required');
-  const allowed = (domainSpec.allowedCandidateSurfaces || []).join(', ');
-  const heldoutPolicy = domainSpec.leakagePolicy?.proposerCanReadHeldout
-    ? 'Held-out artifacts are visible.'
-    : 'Do not read, infer, or hardcode held-out task IDs or held-out results.';
-  return [
-    `You are the Meta-Harness proposer for domain ${domainSpec.id}.`,
-    `Iteration: ${iteration}`,
-    '',
-    'Read these run-local artifacts before writing candidates:',
-    '- run_manifest.json',
-    '- frontier.json',
-    '- experience.jsonl',
-    '- reports/*.md when present',
-    '- iterations/*/candidates/*/tasks/*/{result.json,diff.patch,git-status.txt,workspace-manifest.json}',
-    '',
-    `Allowed candidate surfaces: ${allowed}`,
-    heldoutPolicy,
-    '',
-    'Candidate module contract:',
-    '- CommonJS module exporting { manifest, apply(baseHarness, context) }.',
-    '- manifest.id must be safe and unique.',
-    '- manifest.hypothesis must be falsifiable and trace-grounded.',
-    '- manifest.surfaces must only use allowed candidate surfaces.',
-    '- apply() must return a harness overlay; do not mutate production Wall-E source.',
-    '',
-    'Write candidate files inside candidates/<candidate-id>/candidate.js.',
-    'Then write pending_eval.json at the run root with this exact shape:',
-    '{ "candidates": [{ "id": "...", "candidatePath": "candidates/<id>/candidate.js", "hypothesis": "...", "surfaces": ["..."] }] }',
-    '',
-    'Avoid parameter sweeps, task-name hardcoding, scorer changes, and no-op candidates.',
-    `Current best: ${frontier.bestCandidate?.candidateId || 'none'} score=${frontier.bestCandidate?.aggregateScore ?? 'n/a'}`,
-  ].join('\n');
-}
-async function runProposer({
-  runAgentLoop,
-  store,
-  domainSpec,
-  frontier = {},
-  iteration,
-  provider = null,
-  model = null,
-  timeoutMs = 600000,
-} = {}) {
-  if (typeof runAgentLoop !== 'function') throw new Error('runAgentLoop is required');
-  store.ensureRun();
-  const prompt = buildProposerPrompt({ iteration, store, domainSpec, frontier });
-  const result = await runAgentLoop(prompt, {
-    cwd: store.runDir,
-    provider,
-    model,
-    timeoutMs,
-    maxTurns: 12,
-    mode: 'build',
-    benchmark: true,
-    headless: true,
-    headlessPolicy: 'allow',
-    permissionTimeoutMs: 0,
-    persistTranscript: true,
-  });
-  const proposals = readPendingEval(store);
-  store.appendExperience({
-    event: 'proposer_ran',
-    iteration,
-    success: result.success === true,
-    proposalCount: proposals.length,
-    sessionId: result.sessionId || null,
-    error: result.error || result.stderr || null,
-  });
-  return { result, proposals, prompt };
-}
-function readPendingEval(store) {
-  const pendingPath = path.join(store.runDir, 'pending_eval.json');
-  if (!fs.existsSync(pendingPath)) return [];
-  const pending = JSON.parse(fs.readFileSync(pendingPath, 'utf8'));
-  const raw = Array.isArray(pending) ? pending : pending.candidates;
-  if (!Array.isArray(raw)) throw new Error('pending_eval.json must contain a candidates array');
-  return raw.map((entry) => normalizeProposal(entry, store));
-}
-function normalizeProposal(entry, store) {
-  if (!entry || typeof entry !== 'object') throw new Error('pending candidate entry must be an object');
-  if (!entry.candidatePath) throw new Error(`pending candidate ${entry.id || '<unknown>'} missing candidatePath`);
-  const candidatePath = path.isAbsolute(entry.candidatePath)
-    ? entry.candidatePath
-    : path.resolve(store.runDir, entry.candidatePath);
-  if (!candidatePath.startsWith(store.runDir + path.sep)) {
-    throw new Error(`pending candidate path escapes run directory: ${entry.candidatePath}`);
-  }
-  return {
-    ...entry,
-    candidatePath,
-  };
-}
-module.exports = {
-  buildProposerPrompt,
-  runProposer,
-  readPendingEval,
-  normalizeProposal,
-};

package/template/wall-e/eval/meta-harness/reporting.js DELETED Viewed

@@ -1,58 +0,0 @@
-'use strict';
-const path = require('node:path');
-function writeIterationReport({ store, iteration, frontier, summaries = [] } = {}) {
-  store.ensureRun();
-  const lines = [
-    `# Meta-Harness Iteration ${iteration}`,
-    '',
-    `Best candidate: ${frontier?.bestCandidate?.candidateId || 'none'}`,
-    `Best score: ${frontier?.bestCandidate?.aggregateScore ?? 'n/a'}`,
-    '',
-    '## Candidate Summaries',
-    '',
-  ];
-  for (const summary of summaries) {
-    lines.push(`- ${summary.candidateId} [${summary.split}]: score=${formatScore(summary.aggregateScore)}, tasks=${summary.taskCount}, success=${summary.successCount}`);
-  }
-  if (summaries.length === 0) lines.push('- No candidates evaluated.');
-  lines.push('', '## Next Proposer Checklist', '');
-  lines.push('- Read this report, frontier.json, and experience.jsonl before proposing.');
-  lines.push('- Inspect raw task artifacts for failures, especially diff.patch and result.json.');
-  lines.push('- Propose only trace-grounded candidate mechanisms using allowed surfaces.');
-  const rel = path.join('reports', `iteration-${String(iteration).padStart(3, '0')}.md`);
-  store.writeText(rel, lines.join('\n') + '\n');
-  return path.join(store.runDir, rel);
-}
-function writeRunSummary({ store, frontier, summaries = [] } = {}) {
-  store.ensureRun();
-  const lines = [
-    '# Meta-Harness Run Summary',
-    '',
-    `Best candidate: ${frontier?.bestCandidate?.candidateId || 'none'}`,
-    `Best score: ${frontier?.bestCandidate?.aggregateScore ?? 'n/a'}`,
-    `Evaluated summaries: ${summaries.length}`,
-    '',
-    '## Frontier History',
-    '',
-  ];
-  for (const item of frontier?.history || []) {
-    lines.push(`- iter ${item.iteration} ${item.candidateId} [${item.split}]: score=${formatScore(item.aggregateScore)}`);
-  }
-  if (!frontier?.history?.length) lines.push('- No frontier entries.');
-  store.writeText(path.join('reports', 'run-summary.md'), lines.join('\n') + '\n');
-  return path.join(store.runDir, 'reports', 'run-summary.md');
-}
-function formatScore(value) {
-  const n = Number(value);
-  return Number.isFinite(n) ? n.toFixed(4) : 'n/a';
-}
-module.exports = {
-  writeIterationReport,
-  writeRunSummary,
-  formatScore,
-};

package/template/wall-e/eval/meta-harness/telemetry.js DELETED Viewed

@@ -1,27 +0,0 @@
-'use strict';
-function recordMetaHarnessTelemetry(store, event = {}) {
-  const row = {
-    timestamp: typeof store?.now === 'function' ? store.now() : new Date().toISOString(),
-    subsystem: 'meta-harness',
-    ...event,
-  };
-  try {
-    if (store?.ensureRun) store.ensureRun();
-    if (store?.appendJsonl) store.appendJsonl('telemetry.jsonl', row);
-  } catch (err) {
-    try {
-      console.warn(`[meta-harness telemetry] failed to write event ${event.type || event.event || 'unknown'}: ${err.message}`);
-    } catch {}
-  }
-  return row;
-}
-function createTelemetryEmitter(store, defaults = {}) {
-  return (event = {}) => recordMetaHarnessTelemetry(store, { ...defaults, ...event });
-}
-module.exports = {
-  recordMetaHarnessTelemetry,
-  createTelemetryEmitter,
-};

package/template/wall-e/eval/meta-harness/validation.js DELETED Viewed

@@ -1,81 +0,0 @@
-'use strict';
-const fs = require('node:fs');
-const path = require('node:path');
-const { SAFE_ID_RE } = require('./domain-spec');
-function loadCandidateModule(filePath) {
-  const absolutePath = path.resolve(filePath);
-  if (!fs.existsSync(absolutePath)) throw new Error(`candidate file not found: ${absolutePath}`);
-  delete require.cache[require.resolve(absolutePath)];
-  return require(absolutePath);
-}
-function validateCandidateModule(candidateModule, opts = {}) {
-  const normalized = normalizeCandidateModule(candidateModule);
-  if (!normalized || typeof normalized !== 'object') {
-    throw new Error('candidate module must export an object');
-  }
-  validateCandidateManifest(normalized.manifest, opts);
-  if (typeof normalized.apply !== 'function') {
-    throw new Error(`candidate ${normalized.manifest.id} must export apply(baseHarness, context)`);
-  }
-  return normalized;
-}
-function validateCandidateManifest(manifest, opts = {}) {
-  if (!manifest || typeof manifest !== 'object' || Array.isArray(manifest)) {
-    throw new Error('candidate manifest must be an object');
-  }
-  requireString(manifest, 'id', 'candidate manifest id');
-  if (!SAFE_ID_RE.test(manifest.id)) throw new Error(`candidate id is not safe: ${manifest.id}`);
-  requireString(manifest, 'hypothesis', `candidate ${manifest.id} hypothesis`);
-  const surfaces = requireArray(manifest, 'surfaces', `candidate ${manifest.id} surfaces`);
-  if (surfaces.length === 0) throw new Error(`candidate ${manifest.id} surfaces must not be empty`);
-  for (const surface of surfaces) {
-    if (typeof surface !== 'string' || !SAFE_ID_RE.test(surface)) {
-      throw new Error(`candidate ${manifest.id} has unsafe surface: ${surface}`);
-    }
-  }
-  if (opts.domainSpec) {
-    assertAllowedSurfaces(surfaces, opts.domainSpec.allowedCandidateSurfaces || [], manifest.id);
-  }
-  return true;
-}
-function assertAllowedSurfaces(surfaces, allowedSurfaces, candidateId = 'candidate') {
-  const allowed = new Set(allowedSurfaces || []);
-  const forbidden = surfaces.filter((surface) => !allowed.has(surface));
-  if (forbidden.length > 0) {
-    throw new Error(`${candidateId} uses forbidden candidate surface(s): ${forbidden.join(', ')}`);
-  }
-  return true;
-}
-function normalizeCandidateModule(candidateModule) {
-  if (!candidateModule || typeof candidateModule !== 'object') return candidateModule;
-  return candidateModule.default || candidateModule.candidate || candidateModule;
-}
-function requireString(obj, key, label = key) {
-  const value = obj[key];
-  if (typeof value !== 'string' || value.trim() === '') {
-    throw new Error(`${label} must be a non-empty string`);
-  }
-  return value;
-}
-function requireArray(obj, key, label = key) {
-  const value = obj[key];
-  if (!Array.isArray(value)) throw new Error(`${label} must be an array`);
-  return value;
-}
-module.exports = {
-  loadCandidateModule,
-  validateCandidateModule,
-  validateCandidateManifest,
-  assertAllowedSurfaces,
-  normalizeCandidateModule,
-};

package/template/wall-e/eval/promoter.js DELETED Viewed

@@ -1,228 +0,0 @@
-'use strict';
-/**
- * Promotion detector — checks if shadow models are ready for promotion.
- * Creates brain tasks and briefing items when models meet criteria.
- */
-const PROMOTION_CRITERIA = {
-  minEvals: 50,
-  minAvgScore: 0.75,
-  minWinRate: 0.6,
-  validTrends: ['improving', 'stable'],
-  minDaysSinceAlert: 7,
-};
-/**
- * Format a promotion alert as markdown.
- */
-function formatPromotionAlert(candidate) {
-  const winRate = candidate.total_evals > 0
-    ? (candidate.win_count / candidate.total_evals * 100).toFixed(1)
-    : '0.0';
-  return [
-    `## Model Promotion: ${candidate.model}`,
-    '',
-    `**Task type:** ${candidate.task_type}`,
-    `**Evaluations:** ${candidate.total_evals}`,
-    `**Average score:** ${(candidate.avg_score * 100).toFixed(1)}%`,
-    `**Win rate:** ${winRate}%`,
-    `**Strong wins:** ${candidate.strong_win_count}`,
-    `**Avg latency:** ${Math.round(candidate.avg_latency_ms || 0)}ms`,
-    `**Trend:** ${candidate.trend}`,
-    `**Current rollout:** ${((candidate.rollout_pct || 0) * 100).toFixed(0)}%`,
-    '',
-    '### Recommendation',
-    buildRecommendation(candidate),
-    '',
-    '### Actions',
-    '- **Approve**: Set as default model for this task type, start gradual rollout',
-    '- **Trial**: Start at 10% rollout for 7 days',
-    '- **Reject**: Suppress alerts for 30 days',
-  ].join('\n');
-}
-/**
- * Build a recommendation based on candidate stats.
- */
-function buildRecommendation(candidate) {
-  const winRate = candidate.total_evals > 0
-    ? candidate.win_count / candidate.total_evals
-    : 0;
-  if (candidate.avg_score >= 0.9 && winRate >= 0.8 && candidate.trend === 'improving') {
-    return 'Strong candidate for full promotion. High quality and improving trend.';
-  }
-  if (candidate.avg_score >= 0.8 && winRate >= 0.7) {
-    return 'Good candidate. Consider a 30% trial rollout to validate at scale.';
-  }
-  return 'Meets minimum criteria. Recommend starting with a 10% trial.';
-}
-/**
- * Check for models eligible for promotion.
- * @param {Object} brain - Brain module
- * @returns {{ candidates: Array, alertsCreated: number }}
- */
-function checkPromotions(brain) {
-  const promotionCandidates = brain.getPromotionCandidates();
-  let alertsCreated = 0;
-  for (const candidate of promotionCandidates) {
-    // Skip if already fully promoted
-    if (candidate.rollout_pct >= 1.0) continue;
-    // Skip if recently alerted
-    if (candidate.last_promotion_alert) {
-      const daysSince = (Date.now() - new Date(candidate.last_promotion_alert).getTime()) / 86400000;
-      if (daysSince < PROMOTION_CRITERIA.minDaysSinceAlert) continue;
-    }
-    // Create brain task for user review
-    try {
-      brain.insertTask({
-        title: `Model promotion: ${candidate.model} for ${candidate.task_type}`,
-        description: formatPromotionAlert(candidate),
-        priority: 'high',
-        type: 'once',
-        execution: 'manual',
-        source: 'training-pipeline',
-      });
-    } catch (err) {
-      console.error(`[promoter] Failed to create task for ${candidate.model}:`, err.message);
-      continue;
-    }
-    // Add briefing item
-    try {
-      if (brain.insertBriefingItem) {
-        brain.insertBriefingItem({
-          title: `Shadow model ${candidate.model} ready for promotion (${candidate.task_type})`,
-          category: 'training',
-          urgency: 'medium',
-          context: JSON.stringify({
-            model: candidate.model,
-            taskType: candidate.task_type,
-            avgScore: candidate.avg_score,
-            totalEvals: candidate.total_evals,
-            trend: candidate.trend,
-          }),
-        });
-      }
-    } catch (err) {
-      console.error(`[promoter] Failed to create briefing for ${candidate.model}:`, err.message);
-    }
-    // Update last_promotion_alert timestamp
-    try {
-      brain.upsertModelTaskScore(candidate.model, candidate.task_type, {
-        provider: candidate.provider,
-        totalEvals: candidate.total_evals,
-        avgScore: candidate.avg_score,
-        winCount: candidate.win_count,
-        strongWinCount: candidate.strong_win_count,
-        avgLatencyMs: candidate.avg_latency_ms,
-        score7d: candidate.score_7d,
-        score30d: candidate.score_30d,
-        trend: candidate.trend,
-        lastEvalAt: candidate.last_eval_at,
-      });
-      // Set last_promotion_alert directly
-      brain.getDb().prepare('UPDATE model_task_scores SET last_promotion_alert = datetime("now") WHERE model = ? AND task_type = ?')
-        .run(candidate.model, candidate.task_type);
-    } catch {}
-    alertsCreated++;
-  }
-  return { candidates: promotionCandidates, alertsCreated };
-}
-const DEMOTION_CRITERIA = {
-  maxScore7d: 0.5,
-  minEvals: 30,
-};
-/**
- * Check for models that should be demoted based on declining performance.
- * Models with 7-day rolling avg < 0.5 and 30+ evals get flagged.
- *
- * @param {Object} brain - Brain module
- * @returns {{ demotions: Array, alertsCreated: number }}
- */
-function checkDemotions(brain) {
-  const db = brain.getDb();
-  let alertsCreated = 0;
-  // Find models with poor recent performance
-  const demotionCandidates = db.prepare(`
-    SELECT model, task_type, provider, total_evals, avg_score,
-      score_7d, score_30d, trend, rollout_pct, last_eval_at
-    FROM model_task_scores
-    WHERE score_7d IS NOT NULL
-      AND score_7d < ?
-      AND total_evals >= ?
-  `).all(DEMOTION_CRITERIA.maxScore7d, DEMOTION_CRITERIA.minEvals);
-  for (const candidate of demotionCandidates) {
-    const dropPct = candidate.score_30d
-      ? Math.round((1 - candidate.score_7d / candidate.score_30d) * 100 * 10) / 10
-      : 0;
-    // Create regression alert with demotion context
-    const alertId = `demotion::${candidate.model}::${candidate.task_type}::${new Date().toISOString().slice(0, 10)}`;
-    try {
-      db.prepare(`
-        INSERT OR IGNORE INTO eval_regression_alerts
-          (id, benchmark_id, model, provider, baseline_avg, current_score, drop_pct)
-        VALUES (?, ?, ?, ?, ?, ?, ?)
-      `).run(
-        alertId,
-        `demotion:${candidate.task_type}`,
-        candidate.model,
-        candidate.provider || 'unknown',
-        candidate.score_30d || candidate.avg_score,
-        candidate.score_7d,
-        dropPct
-      );
-    } catch (err) {
-      console.error(`[promoter] Failed to create demotion alert for ${candidate.model}:`, err.message);
-      continue;
-    }
-    // Create briefing item
-    try {
-      if (brain.insertBriefingItem) {
-        brain.insertBriefingItem({
-          title: `Model demotion candidate: ${candidate.model} (${candidate.task_type})`,
-          category: 'training',
-          urgency: 'high',
-          context: JSON.stringify({
-            model: candidate.model,
-            taskType: candidate.task_type,
-            score7d: candidate.score_7d,
-            score30d: candidate.score_30d,
-            totalEvals: candidate.total_evals,
-            trend: candidate.trend,
-            dropPct,
-          }),
-        });
-      }
-    } catch (err) {
-      console.error(`[promoter] Failed to create demotion briefing for ${candidate.model}:`, err.message);
-    }
-    alertsCreated++;
-  }
-  return { demotions: demotionCandidates, alertsCreated };
-}
-module.exports = {
-  checkPromotions,
-  checkDemotions,
-  formatPromotionAlert,
-  buildRecommendation,
-  PROMOTION_CRITERIA,
-  DEMOTION_CRITERIA,
-};

package/template/wall-e/eval/provider-normalizer.js DELETED Viewed

@@ -1,33 +0,0 @@
-'use strict';
-function providerString(provider) {
-  if (provider && typeof provider === 'object') {
-    if (typeof provider.type === 'string') return provider.type.trim();
-    if (typeof provider.provider === 'string') return provider.provider.trim();
-  }
-  if (typeof provider === 'string') return provider.trim();
-  return '';
-}
-function inferProviderFromModel(model) {
-  const value = typeof model === 'string' ? model.trim().toLowerCase() : '';
-  if (!value) return null;
-  if (value.startsWith('deepseek-') || value.startsWith('deepseek/')) return 'deepseek';
-  if (value.startsWith('kimi-') || value.startsWith('moonshot-') || value.startsWith('moonshot/')) return 'moonshot';
-  if (value.startsWith('claude-')) return 'anthropic';
-  if (value.startsWith('gemini-') || value.startsWith('google/')) return 'google';
-  if (/^(gpt-|chatgpt-|o[134](?:-|$))/.test(value)) return 'openai';
-  return null;
-}
-function normalizeEvalProvider(provider, model, { fallback = 'default' } = {}) {
-  const explicit = providerString(provider);
-  if (explicit && explicit !== 'default') return explicit;
-  return inferProviderFromModel(model) || explicit || fallback;
-}
-module.exports = {
-  inferProviderFromModel,
-  normalizeEvalProvider,
-  providerString,
-};