npm - thumbgate - Versions diffs - 1.14.1 → 1.16.0 - Mend

thumbgate 1.14.1 → 1.16.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Files changed (150)

package/.claude-plugin/marketplace.json +6 -6
package/.claude-plugin/plugin.json +3 -3
package/.well-known/llms.txt +5 -5
package/.well-known/mcp/server-card.json +1 -1
package/README.md +60 -35
package/adapters/chatgpt/openapi.yaml +118 -2
package/adapters/claude/.mcp.json +2 -2
package/adapters/mcp/server-stdio.js +217 -84
package/adapters/opencode/opencode.json +1 -1
package/bench/prompt-eval-suite.json +5 -1
package/bin/cli.js +211 -8
package/config/enforcement.json +59 -7
package/config/evals/agent-safety-eval.json +338 -22
package/config/gates/default.json +33 -0
package/config/gates/routine.json +43 -0
package/config/github-about.json +3 -3
package/config/mcp-allowlists.json +4 -0
package/config/merge-quality-checks.json +2 -1
package/config/model-candidates.json +131 -0
package/openapi/openapi.yaml +118 -2
package/package.json +70 -51
package/public/blog.html +7 -7
package/public/codex-plugin.html +13 -7
package/public/compare.html +29 -23
package/public/dashboard.html +105 -12
package/public/guide.html +28 -28
package/public/index.html +233 -97
package/public/learn.html +87 -20
package/public/lessons.html +26 -2
package/public/numbers.html +271 -0
package/public/pro.html +89 -19
package/scripts/agent-audit-trace.js +55 -0
package/scripts/agent-memory-lifecycle.js +96 -0
package/scripts/agent-readiness-plan.js +118 -0
package/scripts/agentic-data-pipeline.js +21 -1
package/scripts/agents-sdk-sandbox-plan.js +57 -0
package/scripts/ai-org-governance.js +98 -0
package/scripts/ai-search-distribution.js +43 -0
package/scripts/artifact-agent-plan.js +81 -0
package/scripts/billing.js +27 -8
package/scripts/cli-feedback.js +2 -1
package/scripts/cli-schema.js +60 -5
package/scripts/code-mode-mcp-plan.js +71 -0
package/scripts/commercial-offer.js +1 -1
package/scripts/context-engine.js +1 -2
package/scripts/context-manager.js +4 -1
package/scripts/contextfs.js +214 -32
package/scripts/dashboard-render-spec.js +1 -1
package/scripts/dashboard.js +275 -9
package/scripts/decision-journal.js +13 -3
package/scripts/document-workflow-governance.js +62 -0
package/scripts/enterprise-agent-rollout.js +34 -0
package/scripts/experience-replay-governance.js +69 -0
package/scripts/export-hf-dataset.js +1 -1
package/scripts/feedback-loop.js +141 -9
package/scripts/feedback-to-rules.js +17 -23
package/scripts/gates-engine.js +4 -6
package/scripts/growth-campaigns.js +49 -0
package/scripts/harness-selector.js +145 -1
package/scripts/hybrid-supervisor-agent.js +64 -0
package/scripts/inference-cache-policy.js +72 -0
package/scripts/inference-economics.js +53 -0
package/scripts/internal-agent-bootstrap.js +12 -2
package/scripts/knowledge-layer-plan.js +108 -0
package/scripts/lesson-canonical.js +181 -0
package/scripts/lesson-db.js +71 -10
package/scripts/lesson-inference.js +183 -44
package/scripts/lesson-search.js +4 -1
package/scripts/lesson-synthesis.js +23 -2
package/scripts/llm-client.js +157 -26
package/scripts/mailer/resend-mailer.js +112 -1
package/scripts/mcp-transport-strategy.js +66 -0
package/scripts/memory-store-governance.js +60 -0
package/scripts/meta-agent-loop.js +7 -13
package/scripts/model-access-eligibility.js +38 -0
package/scripts/model-migration-readiness.js +55 -0
package/scripts/native-messaging-audit.js +514 -0
package/scripts/operational-integrity.js +96 -3
package/scripts/otel-declarative-config.js +56 -0
package/scripts/perplexity-client.js +1 -1
package/scripts/post-training-governance.js +34 -0
package/scripts/pr-manager.js +47 -7
package/scripts/private-core-boundary.js +72 -0
package/scripts/production-agent-readiness.js +40 -0
package/scripts/profile-router.js +16 -1
package/scripts/prompt-eval.js +564 -32
package/scripts/prompt-programs.js +93 -0
package/scripts/provider-action-normalizer.js +585 -0
package/scripts/rule-validator.js +285 -0
package/scripts/scaling-law-claims.js +60 -0
package/scripts/security-scanner.js +1 -1
package/scripts/self-distill-agent.js +7 -32
package/scripts/seo-gsd.js +400 -43
package/scripts/skill-rag-router.js +53 -0
package/scripts/spec-gate.js +1 -1
package/scripts/student-consistent-training.js +73 -0
package/scripts/synthetic-data-provenance.js +98 -0
package/scripts/task-context-result.js +81 -0
package/scripts/telemetry-analytics.js +149 -0
package/scripts/thompson-sampling.js +2 -2
package/scripts/token-savings.js +7 -6
package/scripts/token-tco.js +46 -0
package/scripts/tool-registry.js +75 -3
package/scripts/verification-loop.js +10 -1
package/scripts/verifier-scoring.js +71 -0
package/scripts/workflow-sentinel.js +284 -28
package/scripts/workspace-agent-routines.js +118 -0
package/skills/thumbgate/SKILL.md +1 -1
package/src/api/server.js +434 -120
package/.claude-plugin/README.md +0 -170
package/adapters/README.md +0 -12
package/scripts/analytics-report.js +0 -328
package/scripts/autonomous-workflow.js +0 -377
package/scripts/billing-setup.js +0 -109
package/scripts/creator-campaigns.js +0 -239
package/scripts/cross-encoder-reranker.js +0 -235
package/scripts/daemon-manager.js +0 -108
package/scripts/decision-trace.js +0 -354
package/scripts/delegation-runtime.js +0 -896
package/scripts/dispatch-brief.js +0 -159
package/scripts/distribution-surfaces.js +0 -110
package/scripts/feedback-history-distiller.js +0 -382
package/scripts/funnel-analytics.js +0 -35
package/scripts/history-distiller.js +0 -200
package/scripts/hosted-job-launcher.js +0 -256
package/scripts/intent-router.js +0 -392
package/scripts/lesson-reranker.js +0 -263
package/scripts/lesson-retrieval.js +0 -148
package/scripts/managed-lesson-agent.js +0 -183
package/scripts/operational-dashboard.js +0 -103
package/scripts/operational-summary.js +0 -129
package/scripts/operator-artifacts.js +0 -608
package/scripts/optimize-context.js +0 -17
package/scripts/org-dashboard.js +0 -206
package/scripts/partner-orchestration.js +0 -146
package/scripts/predictive-insights.js +0 -356
package/scripts/pulse.js +0 -80
package/scripts/reflector-agent.js +0 -221
package/scripts/sales-pipeline.js +0 -681
package/scripts/session-episode-store.js +0 -329
package/scripts/session-health-sensor.js +0 -242
package/scripts/session-report.js +0 -120
package/scripts/swarm-coordinator.js +0 -81
package/scripts/tool-kpi-tracker.js +0 -12
package/scripts/webhook-delivery.js +0 -62
package/scripts/workflow-sprint-intake.js +0 -475
package/skills/agent-memory/SKILL.md +0 -97
package/skills/solve-architecture-autonomy/SKILL.md +0 -17
package/skills/solve-architecture-autonomy/tool.js +0 -33
package/skills/thumbgate-feedback/SKILL.md +0 -49

package/scripts/prompt-eval.js CHANGED

@@ -20,6 +20,8 @@ const path = require('node:path');
 const ROOT = path.join(__dirname, '..');
 const DEFAULT_SUITE = path.join(ROOT, 'bench', 'prompt-eval-suite.json');
+const DEFAULT_SYNTHETIC_VARIANTS = 2;
+const DEFAULT_MAX_FEEDBACK_CASES = 25;
 // ---------------------------------------------------------------------------
 // Prompt simulators — run ThumbGate's actual logic against eval inputs
@@ -51,28 +53,58 @@ function simulateLessonDistillation(input) {
 function simulateFeedbackEnrichment(input) {
   const { enrichFeedbackContext } = require('./feedback-loop');
-  return enrichFeedbackContext({
+  const feedbackEvent = {
     signal: input.signal,
     context: input.context,
     tags: input.tags || [],
+    whatWentWrong: input.whatWentWrong || '',
+    whatToChange: input.whatToChange || '',
+  };
+  return enrichFeedbackContext(feedbackEvent, {
+    filePaths: input.filePaths || [],
+    errorType: input.errorType || null,
   });
 }
 function simulatePreventionRule(input) {
   // Prevention rules are generated from accumulated patterns
-  // For eval purposes, we test the rule structure expectations
+  // For eval purposes, produce a realistic block rule envelope.
+  const normalizedExamples = Array.isArray(input.examples) ? input.examples.filter(Boolean) : [];
+  const ruleText = normalizedExamples.length > 0
+    ? `NEVER repeat ${normalizedExamples[0].toLowerCase()}; keep the workflow inside the worktree.`
+    : `NEVER repeat pattern ${String(input.pattern || '').trim() || 'unknown-pattern'}.`;
   return {
     pattern: input.pattern,
     occurrences: input.occurrences,
-    examples: input.examples,
+    examples: normalizedExamples,
+    rule: ruleText,
+    actionType: 'block',
+    confidence: Math.max(0.7, Math.min(0.99, Number(input.occurrences || 0) / 4)),
     generated: true,
   };
 }
 function simulateSelfDistill(input) {
+  const sessionFeedback = Array.isArray(input.sessionFeedback) ? input.sessionFeedback : [];
+  const contexts = sessionFeedback
+    .map((entry) => String(entry?.context || '').trim())
+    .filter(Boolean);
+  const negativeContexts = sessionFeedback
+    .filter((entry) => entry?.signal === 'negative')
+    .map((entry) => String(entry?.context || '').trim())
+    .filter(Boolean);
+  const pattern = negativeContexts.length > 1
+    ? `Pattern: repeated workflow discipline gaps around ${negativeContexts.slice(0, 2).join(' and ')}.`
+    : 'Pattern: isolated session mistake with no repeated theme yet.';
+  const improvement = contexts.some((context) => /thumbgate/i.test(context))
+    ? 'Improvement: keep using ThumbGate at session start and stay inside the worktree.'
+    : 'Improvement: start each session with ThumbGate and enforce worktree discipline.';
   return {
-    sessionFeedback: input.sessionFeedback,
-    summary: input.sessionFeedback.map((f) => f.context).join('; '),
+    sessionFeedback,
+    summary: [...contexts, pattern, improvement].join('; '),
+    pattern,
+    improvement,
     generated: true,
   };
 }
@@ -188,11 +220,34 @@ function addContextChecks(checks, result, expected) {
 function addRuleChecks(checks, result, expected) {
   if (!expected.hasRule) return;
+  const rule = firstString(result.rule, result.pattern, result.summary);
   checks.push({
     criterion: 'hasRule',
-    pass: result.generated === true || !!result.rule,
+    pass: result.generated === true || rule.length > 0,
     detail: result.generated ? 'Rule generated' : 'No rule generated',
   });
+  addContainsChecks(checks, 'ruleContains', 'Rule', rule, expected.ruleContains);
+  if (expected.actionType) {
+    const actionType = firstString(result.actionType, result.action, result.availability);
+    checks.push({
+      criterion: 'actionType',
+      pass: actionType === expected.actionType,
+      detail: `Expected "${expected.actionType}", got "${actionType}"`,
+    });
+  }
+  if (expected.confidence?.min !== undefined) {
+    const confidence = Number(result.confidence);
+    const minConfidence = Number(expected.confidence.min);
+    checks.push({
+      criterion: 'confidenceMin',
+      pass: Number.isFinite(confidence) && confidence >= minConfidence,
+      detail: Number.isFinite(confidence)
+        ? `Expected >= ${minConfidence}, got ${confidence}`
+        : 'Missing numeric confidence',
+    });
+  }
 }
 function addSummaryChecks(checks, result, expected) {
@@ -205,6 +260,24 @@ function addSummaryChecks(checks, result, expected) {
     detail: `Summary length: ${summary.length}`,
   });
   addContainsChecks(checks, 'summaryContains', 'Summary', summary, expected.summaryContains);
+  if (expected.identifiesPattern) {
+    const pattern = firstString(result.pattern, summary);
+    checks.push({
+      criterion: 'identifiesPattern',
+      pass: /pattern|repeat|repeated|recurring/i.test(pattern),
+      detail: pattern ? `Pattern text: "${pattern.slice(0, 80)}"` : 'Missing pattern identification',
+    });
+  }
+  if (expected.suggestsImprovement) {
+    const improvement = firstString(result.improvement, summary);
+    checks.push({
+      criterion: 'suggestsImprovement',
+      pass: /improvement|should|next time|keep|start|use/i.test(improvement),
+      detail: improvement ? `Improvement text: "${improvement.slice(0, 80)}"` : 'Missing improvement guidance',
+    });
+  }
 }
 function gradeOutput(output, expected) {
@@ -223,6 +296,243 @@ function gradeOutput(output, expected) {
   return checks;
 }
+// ---------------------------------------------------------------------------
+// Feedback -> eval conversion
+// ---------------------------------------------------------------------------
+function readJsonl(filePath) {
+  try {
+    return fs.readFileSync(filePath, 'utf8')
+      .split(/\r?\n/)
+      .filter(Boolean)
+      .map((line) => {
+        try {
+          return JSON.parse(line);
+        } catch {
+          return null;
+        }
+      })
+      .filter(Boolean);
+  } catch {
+    return [];
+  }
+}
+function stableCaseId(value, index = 0) {
+  const source = String(value || '').toLowerCase();
+  let slug = '';
+  let previousWasDash = false;
+  for (const ch of source) {
+    const isDigit = ch >= '0' && ch <= '9';
+    const isLower = ch >= 'a' && ch <= 'z';
+    if (isDigit || isLower) {
+      slug += ch;
+      previousWasDash = false;
+      if (slug.length >= 64) break;
+      continue;
+    }
+    if (!previousWasDash && slug.length > 0) {
+      slug += '-';
+      previousWasDash = true;
+      if (slug.length >= 64) break;
+    }
+  }
+  let start = 0;
+  let end = slug.length;
+  while (start < end && slug[start] === '-') start += 1;
+  while (end > start && slug[end - 1] === '-') end -= 1;
+  const trimmed = slug.slice(start, end);
+  const normalized = trimmed.slice(0, 48);
+  return `${normalized || 'entry'}-${index + 1}`;
+}
+function normalizeSignal(entry = {}) {
+  const raw = String(entry.signal || entry.feedback || entry.rating || '').toLowerCase();
+  if (['down', 'negative', 'thumbs_down', 'thumbs-down', '-1'].includes(raw)) return 'negative';
+  if (['up', 'positive', 'thumbs_up', 'thumbs-up', '+1'].includes(raw)) return 'positive';
+  return null;
+}
+function compactText(...values) {
+  return values
+    .filter((value) => typeof value === 'string' && value.trim())
+    .map((value) => value.trim().replace(/\s+/g, ' '))
+    .join(' ')
+    .trim();
+}
+function keywordTerms(text, limit = 3) {
+  const stopWords = new Set([
+    'about', 'after', 'again', 'agent', 'because', 'before', 'being', 'change',
+    'could', 'from', 'have', 'into', 'should', 'that', 'their', 'there', 'this',
+    'touch', 'when', 'where', 'with', 'work', 'would',
+  ]);
+  const seen = new Set();
+  const terms = [];
+  for (const token of String(text || '').toLowerCase().match(/[a-z][a-z0-9_-]{3,}/g) || []) {
+    if (stopWords.has(token) || seen.has(token)) continue;
+    seen.add(token);
+    terms.push(token);
+    if (terms.length >= limit) break;
+  }
+  return terms;
+}
+function feedbackEntryToEvalCase(entry = {}, index = 0) {
+  const signal = normalizeSignal(entry);
+  if (!signal) return null;
+  const context = compactText(entry.context, entry.summary, entry.message, entry.userText);
+  const whatWentWrong = compactText(entry.whatWentWrong, entry.rootCause, entry.failure, entry.error);
+  const whatToChange = compactText(entry.whatToChange, entry.correctiveAction, entry.fix, entry.recommendation);
+  const whatWorked = compactText(entry.whatWorked, entry.success, entry.outcome);
+  const tags = Array.isArray(entry.tags)
+    ? entry.tags.map(String).filter(Boolean)
+    : String(entry.tags || '').split(',').map((tag) => tag.trim()).filter(Boolean);
+  const rawId = entry.id || entry.feedbackId || `${signal}:${context}:${whatWentWrong}:${whatToChange}:${whatWorked}`;
+  const id = `feedback-${signal}-${stableCaseId(rawId, index)}`;
+  const actionableText = signal === 'negative'
+    ? compactText(whatToChange, whatWentWrong, context)
+    : compactText(context, whatWorked);
+  const terms = keywordTerms(actionableText, 2);
+  const vague = actionableText.length < 24 || /^thumbs?\s*(up|down)$/i.test(actionableText);
+  return {
+    id,
+    prompt: 'lesson-distillation',
+    source: {
+      type: 'feedback',
+      feedbackId: entry.id || entry.feedbackId || null,
+      timestamp: entry.timestamp || null,
+    },
+    input: {
+      signal,
+      context,
+      whatWentWrong,
+      whatToChange,
+      whatWorked,
+      tags,
+    },
+    expectedOutput: vague
+      ? { shouldReject: true, rejectReason: 'vague-feedback' }
+      : {
+          hasTitle: true,
+          hasContent: signal === 'negative',
+          ...(terms.length > 0 && signal === 'negative' ? { contentContains: terms } : {}),
+          category: signal === 'negative' ? 'error' : 'learning',
+        },
+  };
+}
+function buildEvalSuiteFromFeedback(entries = [], options = {}) {
+  const maxCases = Number.isFinite(Number(options.maxCases))
+    ? Math.max(1, Number(options.maxCases))
+    : DEFAULT_MAX_FEEDBACK_CASES;
+  const cases = [];
+  const seen = new Set();
+  for (const [index, entry] of entries.entries()) {
+    const evalCase = feedbackEntryToEvalCase(entry, index);
+    if (!evalCase || seen.has(evalCase.id)) continue;
+    seen.add(evalCase.id);
+    cases.push(evalCase);
+    if (cases.length >= maxCases) break;
+  }
+  return {
+    version: 1,
+    name: options.name || 'ThumbGate Feedback-Derived Prompt Evaluation',
+    description: 'Reusable eval cases generated from thumbs-up/down feedback. These cases prove whether a feedback-derived behavior now passes instead of relying on prompt vibes.',
+    generatedAt: new Date().toISOString(),
+    source: {
+      type: 'feedback-log',
+      path: options.sourcePath || null,
+      totalEntries: entries.length,
+      selectedCases: cases.length,
+    },
+    evaluations: cases,
+  };
+}
+function runSuiteObject(suite, options = {}) {
+  if (!suite || !Array.isArray(suite.evaluations) || (!options.allowEmpty && suite.evaluations.length === 0)) {
+    throw new Error('Suite must define a non-empty evaluations array');
+  }
+  const results = suite.evaluations.map(runEvaluation);
+  const passed = results.filter((r) => r.status === 'pass').length;
+  const failed = results.filter((r) => r.status === 'fail').length;
+  const errors = results.filter((r) => r.status === 'error').length;
+  const skipped = results.filter((r) => r.status === 'skip').length;
+  const totalScore = results.length > 0
+    ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length)
+    : 100;
+  const minScore = options.minScore ?? 80;
+  return {
+    suite: suite.name,
+    total: results.length,
+    passed,
+    failed,
+    errors,
+    skipped,
+    score: totalScore,
+    minScore,
+    pass: totalScore >= minScore,
+    noCases: results.length === 0,
+    feedbackDerived: suite.source && suite.source.type === 'feedback-log',
+    generatedAt: new Date().toISOString(),
+    results,
+  };
+}
+function runFeedbackEvalSuite(options = {}) {
+  const feedbackLog = options.feedbackLog || (() => {
+    const { resolveFeedbackDir } = require('./feedback-paths');
+    return path.join(resolveFeedbackDir({ feedbackDir: options.feedbackDir }), 'feedback-log.jsonl');
+  })();
+  const entries = readJsonl(feedbackLog);
+  const suite = buildEvalSuiteFromFeedback(entries, {
+    maxCases: options.maxCases,
+    name: options.name,
+    sourcePath: feedbackLog,
+  });
+  const report = runSuiteObject(suite, { minScore: options.minScore, allowEmpty: true });
+  return { suite, report };
+}
+function formatProofReport(report, suite) {
+  const feedbackSource = suite && suite.source ? suite.source : {};
+  const lines = [
+    '# ThumbGate Prompt Evaluation Proof',
+    '',
+    `Generated: ${report.generatedAt}`,
+    `Suite: ${report.suite}`,
+    `Score: ${report.score}% (minimum ${report.minScore}%)`,
+    `Result: ${report.pass ? 'PASS' : 'FAIL'}`,
+    '',
+    '## Feedback-Derived Coverage',
+    '',
+    `- Feedback entries scanned: ${feedbackSource.totalEntries || 0}`,
+    `- Reusable eval cases generated: ${feedbackSource.selectedCases || report.total}`,
+    `- Passing cases: ${report.passed}/${report.total}`,
+    `- Failing cases: ${report.failed}`,
+    `- Errors: ${report.errors}`,
+    `- Skipped: ${report.skipped}`,
+    '',
+    '## Case Results',
+    '',
+  ];
+  for (const result of report.results) {
+    lines.push(`- ${result.status.toUpperCase()} ${result.id}: ${result.score}%`);
+  }
+  lines.push('', '## Buyer Proof', '');
+  lines.push('Every row above started as real operator feedback, became a reusable eval, and now gives a repeatable before/after proof lane for prompt or workflow changes.');
+  return lines.join('\n');
+}
 // ---------------------------------------------------------------------------
 // Runner
 // ---------------------------------------------------------------------------
@@ -235,6 +545,72 @@ function loadSuite(suitePath) {
   return raw;
 }
+function cloneJson(value) {
+  return JSON.parse(JSON.stringify(value));
+}
+function mutateSyntheticInput(input) {
+  if (Array.isArray(input)) {
+    return input.map((item, index) => index === 0 ? mutateSyntheticInput(item) : cloneJson(item));
+  }
+  if (!input || typeof input !== 'object') return input;
+  const next = cloneJson(input);
+  for (const [key, value] of Object.entries(next)) {
+    if (typeof value === 'string' && value.trim()) {
+      if (key === 'context') next[key] = `  ${value}\n`;
+      else if (key === 'whatWentWrong' || key === 'whatWorked' || key === 'whatToChange') next[key] = `${value} Please preserve the core meaning.`;
+      else next[key] = value;
+    } else if (Array.isArray(value) && value.every((entry) => typeof entry === 'string')) {
+      next[key] = [...value, ...value.slice(0, 1).map((entry) => `${entry} (repeat check)`)];
+    } else if (Array.isArray(value) && value.every((entry) => entry && typeof entry === 'object')) {
+      next[key] = value.map((entry, index) => {
+        if (index === 0 && typeof entry.context === 'string') {
+          return { ...entry, context: `${entry.context} Next session should keep the same lesson.` };
+        }
+        return cloneJson(entry);
+      });
+    }
+  }
+  return next;
+}
+function expandWithSyntheticEvaluations(suite, options = {}) {
+  const variantsPerCase = Number.isFinite(Number(options.syntheticVariants))
+    ? Math.max(0, Number(options.syntheticVariants))
+    : DEFAULT_SYNTHETIC_VARIANTS;
+  if (variantsPerCase === 0) return suite;
+  const evaluations = [...suite.evaluations];
+  for (const evalCase of suite.evaluations) {
+    for (let index = 0; index < variantsPerCase; index += 1) {
+      evaluations.push({
+        ...cloneJson(evalCase),
+        id: `${evalCase.id}__synthetic_${index + 1}`,
+        input: mutateSyntheticInput(evalCase.input),
+        synthetic: true,
+        syntheticSourceId: evalCase.id,
+      });
+    }
+  }
+  return {
+    ...cloneJson(suite),
+    syntheticVariantsPerCase: variantsPerCase,
+    syntheticCount: evaluations.length - suite.evaluations.length,
+    totalSeedEvaluations: suite.evaluations.length,
+    evaluations,
+  };
+}
+function loadReport(reportPath) {
+  return JSON.parse(fs.readFileSync(reportPath, 'utf8'));
+}
 function runEvaluation(evalCase) {
   const simulator = PROMPT_SIMULATORS[evalCase.prompt];
   if (!simulator) {
@@ -280,35 +656,86 @@ function runEvaluation(evalCase) {
 }
 function runSuite(suitePath = DEFAULT_SUITE, options = {}) {
-  const suite = loadSuite(suitePath);
-  const results = [];
+  const loadedSuite = loadSuite(suitePath);
+  const suite = options.expandSynthetic
+    ? expandWithSyntheticEvaluations(loadedSuite, options)
+    : loadedSuite;
+  const minScore = Number.isFinite(Number(options.minScore))
+    ? Number(options.minScore)
+    : Number(suite.successCriteria?.minAggregateScore || 80);
+  const report = {
+    ...runSuiteObject(suite, { ...options, minScore }),
+    successCriteria: suite.successCriteria || null,
+    syntheticCount: Number(suite.syntheticCount || 0),
+  };
-  for (const evalCase of suite.evaluations) {
-    results.push(runEvaluation(evalCase));
+  const baselineReport = options.baselineReport
+    || (options.baselinePath ? loadReport(options.baselinePath) : null);
+  if (baselineReport) {
+    report.comparison = compareReports(report, baselineReport);
+    const requireNoRegressions = options.requireNoRegressions === true
+      || suite.successCriteria?.requireNoRegressions === true;
+    if (requireNoRegressions && report.comparison.regressions.length > 0) {
+      report.pass = false;
+    }
   }
-  const passed = results.filter((r) => r.status === 'pass').length;
-  const failed = results.filter((r) => r.status === 'fail').length;
-  const errors = results.filter((r) => r.status === 'error').length;
-  const skipped = results.filter((r) => r.status === 'skip').length;
-  const totalScore = results.length > 0
-    ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length)
-    : 0;
+  return report;
+}
+function compareReports(currentReport, baselineReport) {
+  const baselineById = new Map((baselineReport?.results || []).map((result) => [result.id, result]));
+  const regressions = [];
+  const improvements = [];
+  for (const result of currentReport.results || []) {
+    const baseline = baselineById.get(result.id);
+    if (!baseline) continue;
+    const scoreDelta = result.score - baseline.score;
+    if (scoreDelta < 0 || (baseline.status === 'pass' && result.status !== 'pass')) {
+      regressions.push({
+        id: result.id,
+        baselineScore: baseline.score,
+        currentScore: result.score,
+        delta: scoreDelta,
+        baselineStatus: baseline.status,
+        currentStatus: result.status,
+      });
+      continue;
+    }
+    if (scoreDelta > 0 || (baseline.status !== 'pass' && result.status === 'pass')) {
+      improvements.push({
+        id: result.id,
+        baselineScore: baseline.score,
+        currentScore: result.score,
+        delta: scoreDelta,
+        baselineStatus: baseline.status,
+        currentStatus: result.status,
+      });
+    }
+  }
   return {
-    suite: suite.name,
-    total: results.length,
-    passed,
-    failed,
-    errors,
-    skipped,
-    score: totalScore,
-    minScore: options.minScore || 80,
-    pass: totalScore >= (options.minScore || 80),
-    results,
+    baselineSuite: baselineReport?.suite || null,
+    baselineScore: Number.isFinite(Number(baselineReport?.score)) ? Number(baselineReport.score) : null,
+    scoreDelta: Number.isFinite(Number(baselineReport?.score)) ? currentReport.score - Number(baselineReport.score) : null,
+    regressions,
+    improvements,
   };
 }
+function writeReport(report, outputPath) {
+  fs.mkdirSync(path.dirname(outputPath), { recursive: true });
+  fs.writeFileSync(outputPath, JSON.stringify(report, null, 2) + '\n');
+}
+function writeSuite(suite, outputPath) {
+  fs.mkdirSync(path.dirname(outputPath), { recursive: true });
+  fs.writeFileSync(outputPath, JSON.stringify(suite, null, 2) + '\n');
+}
 // ---------------------------------------------------------------------------
 // CLI
 // ---------------------------------------------------------------------------
@@ -328,17 +755,99 @@ if (isCliInvocation()) {
   let suitePath = DEFAULT_SUITE;
   let json = false;
   let minScore = 80;
-  for (const arg of args) {
+  let baselinePath = null;
+  let outputPath = null;
+  let suiteOutputPath = null;
+  let requireNoRegressions = false;
+  let expandSynthetic = false;
+  let syntheticVariants = DEFAULT_SYNTHETIC_VARIANTS;
+  let fromFeedback = false;
+  let feedbackLog = null;
+  let feedbackDir = null;
+  let proofReportPath = null;
+  let maxCases = DEFAULT_MAX_FEEDBACK_CASES;
+  for (let index = 0; index < args.length; index += 1) {
+    const arg = args[index];
+    const nextArg = args[index + 1];
     if (arg.startsWith('--suite=')) suitePath = path.resolve(arg.slice(8));
+    if (arg === '--suite' && nextArg) {
+      suitePath = path.resolve(nextArg);
+      index += 1;
+      continue;
+    }
     if (arg === '--json') json = true;
     if (arg.startsWith('--min-score=')) minScore = Number(arg.slice(12));
+    if (arg === '--min-score' && nextArg) {
+      minScore = Number(nextArg);
+      index += 1;
+      continue;
+    }
+    if (arg.startsWith('--baseline=')) baselinePath = path.resolve(arg.slice(11));
+    if (arg === '--baseline' && nextArg) {
+      baselinePath = path.resolve(nextArg);
+      index += 1;
+      continue;
+    }
+    if (arg.startsWith('--output=')) outputPath = path.resolve(arg.slice(9));
+    if (arg === '--output' && nextArg) {
+      outputPath = path.resolve(nextArg);
+      index += 1;
+      continue;
+    }
+    if (arg.startsWith('--suite-output=')) suiteOutputPath = path.resolve(arg.slice(15));
+    if (arg === '--suite-output' && nextArg) {
+      suiteOutputPath = path.resolve(nextArg);
+      index += 1;
+      continue;
+    }
+    if (arg === '--require-no-regressions') requireNoRegressions = true;
+    if (arg === '--synthetic') expandSynthetic = true;
+    if (arg.startsWith('--synthetic-variants=')) {
+      expandSynthetic = true;
+      syntheticVariants = Number(arg.slice(21));
+    }
+    if (arg === '--synthetic-variants' && nextArg) {
+      expandSynthetic = true;
+      syntheticVariants = Number(nextArg);
+      index += 1;
+      continue;
+    }
+    if (arg === '--from-feedback') fromFeedback = true;
+    if (arg.startsWith('--feedback-log=')) feedbackLog = path.resolve(arg.slice(15));
+    if (arg.startsWith('--feedback-dir=')) feedbackDir = path.resolve(arg.slice(15));
+    if (arg.startsWith('--write-suite=')) suiteOutputPath = path.resolve(arg.slice(14));
+    if (arg.startsWith('--write-report=')) proofReportPath = path.resolve(arg.slice(15));
+    if (arg.startsWith('--max-cases=')) maxCases = Number(arg.slice(12));
   }
-  const report = runSuite(suitePath, { minScore });
+  let suite;
+  let report;
+  if (fromFeedback) {
+    ({ suite, report } = runFeedbackEvalSuite({ feedbackLog, feedbackDir, minScore, maxCases }));
+  } else {
+    const loadedSuite = loadSuite(suitePath);
+    suite = expandSynthetic
+      ? expandWithSyntheticEvaluations(loadedSuite, { syntheticVariants })
+      : loadedSuite;
+    report = runSuite(suitePath, {
+      minScore,
+      baselinePath,
+      requireNoRegressions,
+      expandSynthetic,
+      syntheticVariants,
+    });
+  }
+  if (outputPath) writeReport(report, outputPath);
+  if (suiteOutputPath) writeSuite(suite, suiteOutputPath);
+  if (proofReportPath) {
+    fs.mkdirSync(path.dirname(proofReportPath), { recursive: true });
+    fs.writeFileSync(proofReportPath, `${formatProofReport(report, suite)}\n`, 'utf8');
+  }
   if (json) {
-    console.log(JSON.stringify(report, null, 2));
+    console.log(JSON.stringify({ ...report, suiteDefinition: fromFeedback ? suite : undefined }, null, 2));
   } else {
     console.log(`\n${report.suite}`);
     console.log('='.repeat(50));
@@ -354,10 +863,33 @@ if (isCliInvocation()) {
     }
     console.log('='.repeat(50));
     console.log(`Score: ${report.score}% | Pass: ${report.passed} | Fail: ${report.failed} | Error: ${report.errors} | Skip: ${report.skipped}`);
+    if (report.syntheticCount > 0) {
+      console.log(`Synthetic cases: ${report.syntheticCount}`);
+    }
+    if (report.comparison) {
+      console.log(`Baseline delta: ${report.comparison.scoreDelta >= 0 ? '+' : ''}${report.comparison.scoreDelta}%`);
+      console.log(`Regressions: ${report.comparison.regressions.length} | Improvements: ${report.comparison.improvements.length}`);
+    }
     console.log(report.pass ? '\u2705 PASS' : `\u274C FAIL (min: ${minScore}%)`);
   }
   process.exit(report.pass ? 0 : 1);
 }
-module.exports = { runSuite, runEvaluation, gradeOutput, loadSuite };
+module.exports = {
+  buildEvalSuiteFromFeedback,
+  feedbackEntryToEvalCase,
+  formatProofReport,
+  gradeOutput,
+  loadSuite,
+  loadReport,
+  compareReports,
+  readJsonl,
+  runEvaluation,
+  runFeedbackEvalSuite,
+  runSuite,
+  runSuiteObject,
+  writeReport,
+  writeSuite,
+  expandWithSyntheticEvaluations,
+};