thumbgate 1.14.1 → 1.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. package/.claude-plugin/marketplace.json +6 -6
  2. package/.claude-plugin/plugin.json +3 -3
  3. package/.well-known/llms.txt +5 -5
  4. package/.well-known/mcp/server-card.json +1 -1
  5. package/README.md +60 -35
  6. package/adapters/chatgpt/openapi.yaml +118 -2
  7. package/adapters/claude/.mcp.json +2 -2
  8. package/adapters/mcp/server-stdio.js +217 -84
  9. package/adapters/opencode/opencode.json +1 -1
  10. package/bench/prompt-eval-suite.json +5 -1
  11. package/bin/cli.js +211 -8
  12. package/config/enforcement.json +59 -7
  13. package/config/evals/agent-safety-eval.json +338 -22
  14. package/config/gates/default.json +33 -0
  15. package/config/gates/routine.json +43 -0
  16. package/config/github-about.json +3 -3
  17. package/config/mcp-allowlists.json +4 -0
  18. package/config/merge-quality-checks.json +2 -1
  19. package/config/model-candidates.json +131 -0
  20. package/openapi/openapi.yaml +118 -2
  21. package/package.json +70 -51
  22. package/public/blog.html +7 -7
  23. package/public/codex-plugin.html +13 -7
  24. package/public/compare.html +29 -23
  25. package/public/dashboard.html +105 -12
  26. package/public/guide.html +28 -28
  27. package/public/index.html +233 -97
  28. package/public/learn.html +87 -20
  29. package/public/lessons.html +26 -2
  30. package/public/numbers.html +271 -0
  31. package/public/pro.html +89 -19
  32. package/scripts/agent-audit-trace.js +55 -0
  33. package/scripts/agent-memory-lifecycle.js +96 -0
  34. package/scripts/agent-readiness-plan.js +118 -0
  35. package/scripts/agentic-data-pipeline.js +21 -1
  36. package/scripts/agents-sdk-sandbox-plan.js +57 -0
  37. package/scripts/ai-org-governance.js +98 -0
  38. package/scripts/ai-search-distribution.js +43 -0
  39. package/scripts/artifact-agent-plan.js +81 -0
  40. package/scripts/billing.js +27 -8
  41. package/scripts/cli-feedback.js +2 -1
  42. package/scripts/cli-schema.js +60 -5
  43. package/scripts/code-mode-mcp-plan.js +71 -0
  44. package/scripts/commercial-offer.js +1 -1
  45. package/scripts/context-engine.js +1 -2
  46. package/scripts/context-manager.js +4 -1
  47. package/scripts/contextfs.js +214 -32
  48. package/scripts/dashboard-render-spec.js +1 -1
  49. package/scripts/dashboard.js +275 -9
  50. package/scripts/decision-journal.js +13 -3
  51. package/scripts/document-workflow-governance.js +62 -0
  52. package/scripts/enterprise-agent-rollout.js +34 -0
  53. package/scripts/experience-replay-governance.js +69 -0
  54. package/scripts/export-hf-dataset.js +1 -1
  55. package/scripts/feedback-loop.js +141 -9
  56. package/scripts/feedback-to-rules.js +17 -23
  57. package/scripts/gates-engine.js +4 -6
  58. package/scripts/growth-campaigns.js +49 -0
  59. package/scripts/harness-selector.js +145 -1
  60. package/scripts/hybrid-supervisor-agent.js +64 -0
  61. package/scripts/inference-cache-policy.js +72 -0
  62. package/scripts/inference-economics.js +53 -0
  63. package/scripts/internal-agent-bootstrap.js +12 -2
  64. package/scripts/knowledge-layer-plan.js +108 -0
  65. package/scripts/lesson-canonical.js +181 -0
  66. package/scripts/lesson-db.js +71 -10
  67. package/scripts/lesson-inference.js +183 -44
  68. package/scripts/lesson-search.js +4 -1
  69. package/scripts/lesson-synthesis.js +23 -2
  70. package/scripts/llm-client.js +157 -26
  71. package/scripts/mailer/resend-mailer.js +112 -1
  72. package/scripts/mcp-transport-strategy.js +66 -0
  73. package/scripts/memory-store-governance.js +60 -0
  74. package/scripts/meta-agent-loop.js +7 -13
  75. package/scripts/model-access-eligibility.js +38 -0
  76. package/scripts/model-migration-readiness.js +55 -0
  77. package/scripts/native-messaging-audit.js +514 -0
  78. package/scripts/operational-integrity.js +96 -3
  79. package/scripts/otel-declarative-config.js +56 -0
  80. package/scripts/perplexity-client.js +1 -1
  81. package/scripts/post-training-governance.js +34 -0
  82. package/scripts/pr-manager.js +47 -7
  83. package/scripts/private-core-boundary.js +72 -0
  84. package/scripts/production-agent-readiness.js +40 -0
  85. package/scripts/profile-router.js +16 -1
  86. package/scripts/prompt-eval.js +564 -32
  87. package/scripts/prompt-programs.js +93 -0
  88. package/scripts/provider-action-normalizer.js +585 -0
  89. package/scripts/rule-validator.js +285 -0
  90. package/scripts/scaling-law-claims.js +60 -0
  91. package/scripts/security-scanner.js +1 -1
  92. package/scripts/self-distill-agent.js +7 -32
  93. package/scripts/seo-gsd.js +400 -43
  94. package/scripts/skill-rag-router.js +53 -0
  95. package/scripts/spec-gate.js +1 -1
  96. package/scripts/student-consistent-training.js +73 -0
  97. package/scripts/synthetic-data-provenance.js +98 -0
  98. package/scripts/task-context-result.js +81 -0
  99. package/scripts/telemetry-analytics.js +149 -0
  100. package/scripts/thompson-sampling.js +2 -2
  101. package/scripts/token-savings.js +7 -6
  102. package/scripts/token-tco.js +46 -0
  103. package/scripts/tool-registry.js +75 -3
  104. package/scripts/verification-loop.js +10 -1
  105. package/scripts/verifier-scoring.js +71 -0
  106. package/scripts/workflow-sentinel.js +284 -28
  107. package/scripts/workspace-agent-routines.js +118 -0
  108. package/skills/thumbgate/SKILL.md +1 -1
  109. package/src/api/server.js +434 -120
  110. package/.claude-plugin/README.md +0 -170
  111. package/adapters/README.md +0 -12
  112. package/scripts/analytics-report.js +0 -328
  113. package/scripts/autonomous-workflow.js +0 -377
  114. package/scripts/billing-setup.js +0 -109
  115. package/scripts/creator-campaigns.js +0 -239
  116. package/scripts/cross-encoder-reranker.js +0 -235
  117. package/scripts/daemon-manager.js +0 -108
  118. package/scripts/decision-trace.js +0 -354
  119. package/scripts/delegation-runtime.js +0 -896
  120. package/scripts/dispatch-brief.js +0 -159
  121. package/scripts/distribution-surfaces.js +0 -110
  122. package/scripts/feedback-history-distiller.js +0 -382
  123. package/scripts/funnel-analytics.js +0 -35
  124. package/scripts/history-distiller.js +0 -200
  125. package/scripts/hosted-job-launcher.js +0 -256
  126. package/scripts/intent-router.js +0 -392
  127. package/scripts/lesson-reranker.js +0 -263
  128. package/scripts/lesson-retrieval.js +0 -148
  129. package/scripts/managed-lesson-agent.js +0 -183
  130. package/scripts/operational-dashboard.js +0 -103
  131. package/scripts/operational-summary.js +0 -129
  132. package/scripts/operator-artifacts.js +0 -608
  133. package/scripts/optimize-context.js +0 -17
  134. package/scripts/org-dashboard.js +0 -206
  135. package/scripts/partner-orchestration.js +0 -146
  136. package/scripts/predictive-insights.js +0 -356
  137. package/scripts/pulse.js +0 -80
  138. package/scripts/reflector-agent.js +0 -221
  139. package/scripts/sales-pipeline.js +0 -681
  140. package/scripts/session-episode-store.js +0 -329
  141. package/scripts/session-health-sensor.js +0 -242
  142. package/scripts/session-report.js +0 -120
  143. package/scripts/swarm-coordinator.js +0 -81
  144. package/scripts/tool-kpi-tracker.js +0 -12
  145. package/scripts/webhook-delivery.js +0 -62
  146. package/scripts/workflow-sprint-intake.js +0 -475
  147. package/skills/agent-memory/SKILL.md +0 -97
  148. package/skills/solve-architecture-autonomy/SKILL.md +0 -17
  149. package/skills/solve-architecture-autonomy/tool.js +0 -33
  150. package/skills/thumbgate-feedback/SKILL.md +0 -49
@@ -20,6 +20,8 @@ const path = require('node:path');
20
20
 
21
21
  const ROOT = path.join(__dirname, '..');
22
22
  const DEFAULT_SUITE = path.join(ROOT, 'bench', 'prompt-eval-suite.json');
23
+ const DEFAULT_SYNTHETIC_VARIANTS = 2;
24
+ const DEFAULT_MAX_FEEDBACK_CASES = 25;
23
25
 
24
26
  // ---------------------------------------------------------------------------
25
27
  // Prompt simulators — run ThumbGate's actual logic against eval inputs
@@ -51,28 +53,58 @@ function simulateLessonDistillation(input) {
51
53
 
52
54
  function simulateFeedbackEnrichment(input) {
53
55
  const { enrichFeedbackContext } = require('./feedback-loop');
54
- return enrichFeedbackContext({
56
+ const feedbackEvent = {
55
57
  signal: input.signal,
56
58
  context: input.context,
57
59
  tags: input.tags || [],
60
+ whatWentWrong: input.whatWentWrong || '',
61
+ whatToChange: input.whatToChange || '',
62
+ };
63
+ return enrichFeedbackContext(feedbackEvent, {
64
+ filePaths: input.filePaths || [],
65
+ errorType: input.errorType || null,
58
66
  });
59
67
  }
60
68
 
61
69
  function simulatePreventionRule(input) {
62
70
  // Prevention rules are generated from accumulated patterns
63
- // For eval purposes, we test the rule structure expectations
71
+ // For eval purposes, produce a realistic block rule envelope.
72
+ const normalizedExamples = Array.isArray(input.examples) ? input.examples.filter(Boolean) : [];
73
+ const ruleText = normalizedExamples.length > 0
74
+ ? `NEVER repeat ${normalizedExamples[0].toLowerCase()}; keep the workflow inside the worktree.`
75
+ : `NEVER repeat pattern ${String(input.pattern || '').trim() || 'unknown-pattern'}.`;
64
76
  return {
65
77
  pattern: input.pattern,
66
78
  occurrences: input.occurrences,
67
- examples: input.examples,
79
+ examples: normalizedExamples,
80
+ rule: ruleText,
81
+ actionType: 'block',
82
+ confidence: Math.max(0.7, Math.min(0.99, Number(input.occurrences || 0) / 4)),
68
83
  generated: true,
69
84
  };
70
85
  }
71
86
 
72
87
  function simulateSelfDistill(input) {
88
+ const sessionFeedback = Array.isArray(input.sessionFeedback) ? input.sessionFeedback : [];
89
+ const contexts = sessionFeedback
90
+ .map((entry) => String(entry?.context || '').trim())
91
+ .filter(Boolean);
92
+ const negativeContexts = sessionFeedback
93
+ .filter((entry) => entry?.signal === 'negative')
94
+ .map((entry) => String(entry?.context || '').trim())
95
+ .filter(Boolean);
96
+
97
+ const pattern = negativeContexts.length > 1
98
+ ? `Pattern: repeated workflow discipline gaps around ${negativeContexts.slice(0, 2).join(' and ')}.`
99
+ : 'Pattern: isolated session mistake with no repeated theme yet.';
100
+ const improvement = contexts.some((context) => /thumbgate/i.test(context))
101
+ ? 'Improvement: keep using ThumbGate at session start and stay inside the worktree.'
102
+ : 'Improvement: start each session with ThumbGate and enforce worktree discipline.';
73
103
  return {
74
- sessionFeedback: input.sessionFeedback,
75
- summary: input.sessionFeedback.map((f) => f.context).join('; '),
104
+ sessionFeedback,
105
+ summary: [...contexts, pattern, improvement].join('; '),
106
+ pattern,
107
+ improvement,
76
108
  generated: true,
77
109
  };
78
110
  }
@@ -188,11 +220,34 @@ function addContextChecks(checks, result, expected) {
188
220
  function addRuleChecks(checks, result, expected) {
189
221
  if (!expected.hasRule) return;
190
222
 
223
+ const rule = firstString(result.rule, result.pattern, result.summary);
191
224
  checks.push({
192
225
  criterion: 'hasRule',
193
- pass: result.generated === true || !!result.rule,
226
+ pass: result.generated === true || rule.length > 0,
194
227
  detail: result.generated ? 'Rule generated' : 'No rule generated',
195
228
  });
229
+ addContainsChecks(checks, 'ruleContains', 'Rule', rule, expected.ruleContains);
230
+
231
+ if (expected.actionType) {
232
+ const actionType = firstString(result.actionType, result.action, result.availability);
233
+ checks.push({
234
+ criterion: 'actionType',
235
+ pass: actionType === expected.actionType,
236
+ detail: `Expected "${expected.actionType}", got "${actionType}"`,
237
+ });
238
+ }
239
+
240
+ if (expected.confidence?.min !== undefined) {
241
+ const confidence = Number(result.confidence);
242
+ const minConfidence = Number(expected.confidence.min);
243
+ checks.push({
244
+ criterion: 'confidenceMin',
245
+ pass: Number.isFinite(confidence) && confidence >= minConfidence,
246
+ detail: Number.isFinite(confidence)
247
+ ? `Expected >= ${minConfidence}, got ${confidence}`
248
+ : 'Missing numeric confidence',
249
+ });
250
+ }
196
251
  }
197
252
 
198
253
  function addSummaryChecks(checks, result, expected) {
@@ -205,6 +260,24 @@ function addSummaryChecks(checks, result, expected) {
205
260
  detail: `Summary length: ${summary.length}`,
206
261
  });
207
262
  addContainsChecks(checks, 'summaryContains', 'Summary', summary, expected.summaryContains);
263
+
264
+ if (expected.identifiesPattern) {
265
+ const pattern = firstString(result.pattern, summary);
266
+ checks.push({
267
+ criterion: 'identifiesPattern',
268
+ pass: /pattern|repeat|repeated|recurring/i.test(pattern),
269
+ detail: pattern ? `Pattern text: "${pattern.slice(0, 80)}"` : 'Missing pattern identification',
270
+ });
271
+ }
272
+
273
+ if (expected.suggestsImprovement) {
274
+ const improvement = firstString(result.improvement, summary);
275
+ checks.push({
276
+ criterion: 'suggestsImprovement',
277
+ pass: /improvement|should|next time|keep|start|use/i.test(improvement),
278
+ detail: improvement ? `Improvement text: "${improvement.slice(0, 80)}"` : 'Missing improvement guidance',
279
+ });
280
+ }
208
281
  }
209
282
 
210
283
  function gradeOutput(output, expected) {
@@ -223,6 +296,243 @@ function gradeOutput(output, expected) {
223
296
  return checks;
224
297
  }
225
298
 
299
+ // ---------------------------------------------------------------------------
300
+ // Feedback -> eval conversion
301
+ // ---------------------------------------------------------------------------
302
+
303
+ function readJsonl(filePath) {
304
+ try {
305
+ return fs.readFileSync(filePath, 'utf8')
306
+ .split(/\r?\n/)
307
+ .filter(Boolean)
308
+ .map((line) => {
309
+ try {
310
+ return JSON.parse(line);
311
+ } catch {
312
+ return null;
313
+ }
314
+ })
315
+ .filter(Boolean);
316
+ } catch {
317
+ return [];
318
+ }
319
+ }
320
+
321
+ function stableCaseId(value, index = 0) {
322
+ const source = String(value || '').toLowerCase();
323
+ let slug = '';
324
+ let previousWasDash = false;
325
+ for (const ch of source) {
326
+ const isDigit = ch >= '0' && ch <= '9';
327
+ const isLower = ch >= 'a' && ch <= 'z';
328
+ if (isDigit || isLower) {
329
+ slug += ch;
330
+ previousWasDash = false;
331
+ if (slug.length >= 64) break;
332
+ continue;
333
+ }
334
+ if (!previousWasDash && slug.length > 0) {
335
+ slug += '-';
336
+ previousWasDash = true;
337
+ if (slug.length >= 64) break;
338
+ }
339
+ }
340
+ let start = 0;
341
+ let end = slug.length;
342
+ while (start < end && slug[start] === '-') start += 1;
343
+ while (end > start && slug[end - 1] === '-') end -= 1;
344
+ const trimmed = slug.slice(start, end);
345
+ const normalized = trimmed.slice(0, 48);
346
+ return `${normalized || 'entry'}-${index + 1}`;
347
+ }
348
+
349
+ function normalizeSignal(entry = {}) {
350
+ const raw = String(entry.signal || entry.feedback || entry.rating || '').toLowerCase();
351
+ if (['down', 'negative', 'thumbs_down', 'thumbs-down', '-1'].includes(raw)) return 'negative';
352
+ if (['up', 'positive', 'thumbs_up', 'thumbs-up', '+1'].includes(raw)) return 'positive';
353
+ return null;
354
+ }
355
+
356
+ function compactText(...values) {
357
+ return values
358
+ .filter((value) => typeof value === 'string' && value.trim())
359
+ .map((value) => value.trim().replace(/\s+/g, ' '))
360
+ .join(' ')
361
+ .trim();
362
+ }
363
+
364
+ function keywordTerms(text, limit = 3) {
365
+ const stopWords = new Set([
366
+ 'about', 'after', 'again', 'agent', 'because', 'before', 'being', 'change',
367
+ 'could', 'from', 'have', 'into', 'should', 'that', 'their', 'there', 'this',
368
+ 'touch', 'when', 'where', 'with', 'work', 'would',
369
+ ]);
370
+ const seen = new Set();
371
+ const terms = [];
372
+ for (const token of String(text || '').toLowerCase().match(/[a-z][a-z0-9_-]{3,}/g) || []) {
373
+ if (stopWords.has(token) || seen.has(token)) continue;
374
+ seen.add(token);
375
+ terms.push(token);
376
+ if (terms.length >= limit) break;
377
+ }
378
+ return terms;
379
+ }
380
+
381
+ function feedbackEntryToEvalCase(entry = {}, index = 0) {
382
+ const signal = normalizeSignal(entry);
383
+ if (!signal) return null;
384
+
385
+ const context = compactText(entry.context, entry.summary, entry.message, entry.userText);
386
+ const whatWentWrong = compactText(entry.whatWentWrong, entry.rootCause, entry.failure, entry.error);
387
+ const whatToChange = compactText(entry.whatToChange, entry.correctiveAction, entry.fix, entry.recommendation);
388
+ const whatWorked = compactText(entry.whatWorked, entry.success, entry.outcome);
389
+ const tags = Array.isArray(entry.tags)
390
+ ? entry.tags.map(String).filter(Boolean)
391
+ : String(entry.tags || '').split(',').map((tag) => tag.trim()).filter(Boolean);
392
+ const rawId = entry.id || entry.feedbackId || `${signal}:${context}:${whatWentWrong}:${whatToChange}:${whatWorked}`;
393
+ const id = `feedback-${signal}-${stableCaseId(rawId, index)}`;
394
+ const actionableText = signal === 'negative'
395
+ ? compactText(whatToChange, whatWentWrong, context)
396
+ : compactText(context, whatWorked);
397
+ const terms = keywordTerms(actionableText, 2);
398
+ const vague = actionableText.length < 24 || /^thumbs?\s*(up|down)$/i.test(actionableText);
399
+
400
+ return {
401
+ id,
402
+ prompt: 'lesson-distillation',
403
+ source: {
404
+ type: 'feedback',
405
+ feedbackId: entry.id || entry.feedbackId || null,
406
+ timestamp: entry.timestamp || null,
407
+ },
408
+ input: {
409
+ signal,
410
+ context,
411
+ whatWentWrong,
412
+ whatToChange,
413
+ whatWorked,
414
+ tags,
415
+ },
416
+ expectedOutput: vague
417
+ ? { shouldReject: true, rejectReason: 'vague-feedback' }
418
+ : {
419
+ hasTitle: true,
420
+ hasContent: signal === 'negative',
421
+ ...(terms.length > 0 && signal === 'negative' ? { contentContains: terms } : {}),
422
+ category: signal === 'negative' ? 'error' : 'learning',
423
+ },
424
+ };
425
+ }
426
+
427
+ function buildEvalSuiteFromFeedback(entries = [], options = {}) {
428
+ const maxCases = Number.isFinite(Number(options.maxCases))
429
+ ? Math.max(1, Number(options.maxCases))
430
+ : DEFAULT_MAX_FEEDBACK_CASES;
431
+ const cases = [];
432
+ const seen = new Set();
433
+
434
+ for (const [index, entry] of entries.entries()) {
435
+ const evalCase = feedbackEntryToEvalCase(entry, index);
436
+ if (!evalCase || seen.has(evalCase.id)) continue;
437
+ seen.add(evalCase.id);
438
+ cases.push(evalCase);
439
+ if (cases.length >= maxCases) break;
440
+ }
441
+
442
+ return {
443
+ version: 1,
444
+ name: options.name || 'ThumbGate Feedback-Derived Prompt Evaluation',
445
+ description: 'Reusable eval cases generated from thumbs-up/down feedback. These cases prove whether a feedback-derived behavior now passes instead of relying on prompt vibes.',
446
+ generatedAt: new Date().toISOString(),
447
+ source: {
448
+ type: 'feedback-log',
449
+ path: options.sourcePath || null,
450
+ totalEntries: entries.length,
451
+ selectedCases: cases.length,
452
+ },
453
+ evaluations: cases,
454
+ };
455
+ }
456
+
457
+ function runSuiteObject(suite, options = {}) {
458
+ if (!suite || !Array.isArray(suite.evaluations) || (!options.allowEmpty && suite.evaluations.length === 0)) {
459
+ throw new Error('Suite must define a non-empty evaluations array');
460
+ }
461
+
462
+ const results = suite.evaluations.map(runEvaluation);
463
+ const passed = results.filter((r) => r.status === 'pass').length;
464
+ const failed = results.filter((r) => r.status === 'fail').length;
465
+ const errors = results.filter((r) => r.status === 'error').length;
466
+ const skipped = results.filter((r) => r.status === 'skip').length;
467
+ const totalScore = results.length > 0
468
+ ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length)
469
+ : 100;
470
+ const minScore = options.minScore ?? 80;
471
+
472
+ return {
473
+ suite: suite.name,
474
+ total: results.length,
475
+ passed,
476
+ failed,
477
+ errors,
478
+ skipped,
479
+ score: totalScore,
480
+ minScore,
481
+ pass: totalScore >= minScore,
482
+ noCases: results.length === 0,
483
+ feedbackDerived: suite.source && suite.source.type === 'feedback-log',
484
+ generatedAt: new Date().toISOString(),
485
+ results,
486
+ };
487
+ }
488
+
489
+ function runFeedbackEvalSuite(options = {}) {
490
+ const feedbackLog = options.feedbackLog || (() => {
491
+ const { resolveFeedbackDir } = require('./feedback-paths');
492
+ return path.join(resolveFeedbackDir({ feedbackDir: options.feedbackDir }), 'feedback-log.jsonl');
493
+ })();
494
+ const entries = readJsonl(feedbackLog);
495
+ const suite = buildEvalSuiteFromFeedback(entries, {
496
+ maxCases: options.maxCases,
497
+ name: options.name,
498
+ sourcePath: feedbackLog,
499
+ });
500
+ const report = runSuiteObject(suite, { minScore: options.minScore, allowEmpty: true });
501
+ return { suite, report };
502
+ }
503
+
504
+ function formatProofReport(report, suite) {
505
+ const feedbackSource = suite && suite.source ? suite.source : {};
506
+ const lines = [
507
+ '# ThumbGate Prompt Evaluation Proof',
508
+ '',
509
+ `Generated: ${report.generatedAt}`,
510
+ `Suite: ${report.suite}`,
511
+ `Score: ${report.score}% (minimum ${report.minScore}%)`,
512
+ `Result: ${report.pass ? 'PASS' : 'FAIL'}`,
513
+ '',
514
+ '## Feedback-Derived Coverage',
515
+ '',
516
+ `- Feedback entries scanned: ${feedbackSource.totalEntries || 0}`,
517
+ `- Reusable eval cases generated: ${feedbackSource.selectedCases || report.total}`,
518
+ `- Passing cases: ${report.passed}/${report.total}`,
519
+ `- Failing cases: ${report.failed}`,
520
+ `- Errors: ${report.errors}`,
521
+ `- Skipped: ${report.skipped}`,
522
+ '',
523
+ '## Case Results',
524
+ '',
525
+ ];
526
+
527
+ for (const result of report.results) {
528
+ lines.push(`- ${result.status.toUpperCase()} ${result.id}: ${result.score}%`);
529
+ }
530
+
531
+ lines.push('', '## Buyer Proof', '');
532
+ lines.push('Every row above started as real operator feedback, became a reusable eval, and now gives a repeatable before/after proof lane for prompt or workflow changes.');
533
+ return lines.join('\n');
534
+ }
535
+
226
536
  // ---------------------------------------------------------------------------
227
537
  // Runner
228
538
  // ---------------------------------------------------------------------------
@@ -235,6 +545,72 @@ function loadSuite(suitePath) {
235
545
  return raw;
236
546
  }
237
547
 
548
+ function cloneJson(value) {
549
+ return JSON.parse(JSON.stringify(value));
550
+ }
551
+
552
+ function mutateSyntheticInput(input) {
553
+ if (Array.isArray(input)) {
554
+ return input.map((item, index) => index === 0 ? mutateSyntheticInput(item) : cloneJson(item));
555
+ }
556
+
557
+ if (!input || typeof input !== 'object') return input;
558
+
559
+ const next = cloneJson(input);
560
+
561
+ for (const [key, value] of Object.entries(next)) {
562
+ if (typeof value === 'string' && value.trim()) {
563
+ if (key === 'context') next[key] = ` ${value}\n`;
564
+ else if (key === 'whatWentWrong' || key === 'whatWorked' || key === 'whatToChange') next[key] = `${value} Please preserve the core meaning.`;
565
+ else next[key] = value;
566
+ } else if (Array.isArray(value) && value.every((entry) => typeof entry === 'string')) {
567
+ next[key] = [...value, ...value.slice(0, 1).map((entry) => `${entry} (repeat check)`)];
568
+ } else if (Array.isArray(value) && value.every((entry) => entry && typeof entry === 'object')) {
569
+ next[key] = value.map((entry, index) => {
570
+ if (index === 0 && typeof entry.context === 'string') {
571
+ return { ...entry, context: `${entry.context} Next session should keep the same lesson.` };
572
+ }
573
+ return cloneJson(entry);
574
+ });
575
+ }
576
+ }
577
+
578
+ return next;
579
+ }
580
+
581
+ function expandWithSyntheticEvaluations(suite, options = {}) {
582
+ const variantsPerCase = Number.isFinite(Number(options.syntheticVariants))
583
+ ? Math.max(0, Number(options.syntheticVariants))
584
+ : DEFAULT_SYNTHETIC_VARIANTS;
585
+
586
+ if (variantsPerCase === 0) return suite;
587
+
588
+ const evaluations = [...suite.evaluations];
589
+ for (const evalCase of suite.evaluations) {
590
+ for (let index = 0; index < variantsPerCase; index += 1) {
591
+ evaluations.push({
592
+ ...cloneJson(evalCase),
593
+ id: `${evalCase.id}__synthetic_${index + 1}`,
594
+ input: mutateSyntheticInput(evalCase.input),
595
+ synthetic: true,
596
+ syntheticSourceId: evalCase.id,
597
+ });
598
+ }
599
+ }
600
+
601
+ return {
602
+ ...cloneJson(suite),
603
+ syntheticVariantsPerCase: variantsPerCase,
604
+ syntheticCount: evaluations.length - suite.evaluations.length,
605
+ totalSeedEvaluations: suite.evaluations.length,
606
+ evaluations,
607
+ };
608
+ }
609
+
610
+ function loadReport(reportPath) {
611
+ return JSON.parse(fs.readFileSync(reportPath, 'utf8'));
612
+ }
613
+
238
614
  function runEvaluation(evalCase) {
239
615
  const simulator = PROMPT_SIMULATORS[evalCase.prompt];
240
616
  if (!simulator) {
@@ -280,35 +656,86 @@ function runEvaluation(evalCase) {
280
656
  }
281
657
 
282
658
  function runSuite(suitePath = DEFAULT_SUITE, options = {}) {
283
- const suite = loadSuite(suitePath);
284
- const results = [];
659
+ const loadedSuite = loadSuite(suitePath);
660
+ const suite = options.expandSynthetic
661
+ ? expandWithSyntheticEvaluations(loadedSuite, options)
662
+ : loadedSuite;
663
+ const minScore = Number.isFinite(Number(options.minScore))
664
+ ? Number(options.minScore)
665
+ : Number(suite.successCriteria?.minAggregateScore || 80);
666
+ const report = {
667
+ ...runSuiteObject(suite, { ...options, minScore }),
668
+ successCriteria: suite.successCriteria || null,
669
+ syntheticCount: Number(suite.syntheticCount || 0),
670
+ };
285
671
 
286
- for (const evalCase of suite.evaluations) {
287
- results.push(runEvaluation(evalCase));
672
+ const baselineReport = options.baselineReport
673
+ || (options.baselinePath ? loadReport(options.baselinePath) : null);
674
+ if (baselineReport) {
675
+ report.comparison = compareReports(report, baselineReport);
676
+ const requireNoRegressions = options.requireNoRegressions === true
677
+ || suite.successCriteria?.requireNoRegressions === true;
678
+ if (requireNoRegressions && report.comparison.regressions.length > 0) {
679
+ report.pass = false;
680
+ }
288
681
  }
289
682
 
290
- const passed = results.filter((r) => r.status === 'pass').length;
291
- const failed = results.filter((r) => r.status === 'fail').length;
292
- const errors = results.filter((r) => r.status === 'error').length;
293
- const skipped = results.filter((r) => r.status === 'skip').length;
294
- const totalScore = results.length > 0
295
- ? Math.round(results.reduce((s, r) => s + r.score, 0) / results.length)
296
- : 0;
683
+ return report;
684
+ }
685
+
686
+ function compareReports(currentReport, baselineReport) {
687
+ const baselineById = new Map((baselineReport?.results || []).map((result) => [result.id, result]));
688
+ const regressions = [];
689
+ const improvements = [];
690
+
691
+ for (const result of currentReport.results || []) {
692
+ const baseline = baselineById.get(result.id);
693
+ if (!baseline) continue;
694
+
695
+ const scoreDelta = result.score - baseline.score;
696
+ if (scoreDelta < 0 || (baseline.status === 'pass' && result.status !== 'pass')) {
697
+ regressions.push({
698
+ id: result.id,
699
+ baselineScore: baseline.score,
700
+ currentScore: result.score,
701
+ delta: scoreDelta,
702
+ baselineStatus: baseline.status,
703
+ currentStatus: result.status,
704
+ });
705
+ continue;
706
+ }
707
+
708
+ if (scoreDelta > 0 || (baseline.status !== 'pass' && result.status === 'pass')) {
709
+ improvements.push({
710
+ id: result.id,
711
+ baselineScore: baseline.score,
712
+ currentScore: result.score,
713
+ delta: scoreDelta,
714
+ baselineStatus: baseline.status,
715
+ currentStatus: result.status,
716
+ });
717
+ }
718
+ }
297
719
 
298
720
  return {
299
- suite: suite.name,
300
- total: results.length,
301
- passed,
302
- failed,
303
- errors,
304
- skipped,
305
- score: totalScore,
306
- minScore: options.minScore || 80,
307
- pass: totalScore >= (options.minScore || 80),
308
- results,
721
+ baselineSuite: baselineReport?.suite || null,
722
+ baselineScore: Number.isFinite(Number(baselineReport?.score)) ? Number(baselineReport.score) : null,
723
+ scoreDelta: Number.isFinite(Number(baselineReport?.score)) ? currentReport.score - Number(baselineReport.score) : null,
724
+ regressions,
725
+ improvements,
309
726
  };
310
727
  }
311
728
 
729
+ function writeReport(report, outputPath) {
730
+ fs.mkdirSync(path.dirname(outputPath), { recursive: true });
731
+ fs.writeFileSync(outputPath, JSON.stringify(report, null, 2) + '\n');
732
+ }
733
+
734
+ function writeSuite(suite, outputPath) {
735
+ fs.mkdirSync(path.dirname(outputPath), { recursive: true });
736
+ fs.writeFileSync(outputPath, JSON.stringify(suite, null, 2) + '\n');
737
+ }
738
+
312
739
  // ---------------------------------------------------------------------------
313
740
  // CLI
314
741
  // ---------------------------------------------------------------------------
@@ -328,17 +755,99 @@ if (isCliInvocation()) {
328
755
  let suitePath = DEFAULT_SUITE;
329
756
  let json = false;
330
757
  let minScore = 80;
331
-
332
- for (const arg of args) {
758
+ let baselinePath = null;
759
+ let outputPath = null;
760
+ let suiteOutputPath = null;
761
+ let requireNoRegressions = false;
762
+ let expandSynthetic = false;
763
+ let syntheticVariants = DEFAULT_SYNTHETIC_VARIANTS;
764
+ let fromFeedback = false;
765
+ let feedbackLog = null;
766
+ let feedbackDir = null;
767
+ let proofReportPath = null;
768
+ let maxCases = DEFAULT_MAX_FEEDBACK_CASES;
769
+
770
+ for (let index = 0; index < args.length; index += 1) {
771
+ const arg = args[index];
772
+ const nextArg = args[index + 1];
333
773
  if (arg.startsWith('--suite=')) suitePath = path.resolve(arg.slice(8));
774
+ if (arg === '--suite' && nextArg) {
775
+ suitePath = path.resolve(nextArg);
776
+ index += 1;
777
+ continue;
778
+ }
334
779
  if (arg === '--json') json = true;
335
780
  if (arg.startsWith('--min-score=')) minScore = Number(arg.slice(12));
781
+ if (arg === '--min-score' && nextArg) {
782
+ minScore = Number(nextArg);
783
+ index += 1;
784
+ continue;
785
+ }
786
+ if (arg.startsWith('--baseline=')) baselinePath = path.resolve(arg.slice(11));
787
+ if (arg === '--baseline' && nextArg) {
788
+ baselinePath = path.resolve(nextArg);
789
+ index += 1;
790
+ continue;
791
+ }
792
+ if (arg.startsWith('--output=')) outputPath = path.resolve(arg.slice(9));
793
+ if (arg === '--output' && nextArg) {
794
+ outputPath = path.resolve(nextArg);
795
+ index += 1;
796
+ continue;
797
+ }
798
+ if (arg.startsWith('--suite-output=')) suiteOutputPath = path.resolve(arg.slice(15));
799
+ if (arg === '--suite-output' && nextArg) {
800
+ suiteOutputPath = path.resolve(nextArg);
801
+ index += 1;
802
+ continue;
803
+ }
804
+ if (arg === '--require-no-regressions') requireNoRegressions = true;
805
+ if (arg === '--synthetic') expandSynthetic = true;
806
+ if (arg.startsWith('--synthetic-variants=')) {
807
+ expandSynthetic = true;
808
+ syntheticVariants = Number(arg.slice(21));
809
+ }
810
+ if (arg === '--synthetic-variants' && nextArg) {
811
+ expandSynthetic = true;
812
+ syntheticVariants = Number(nextArg);
813
+ index += 1;
814
+ continue;
815
+ }
816
+ if (arg === '--from-feedback') fromFeedback = true;
817
+ if (arg.startsWith('--feedback-log=')) feedbackLog = path.resolve(arg.slice(15));
818
+ if (arg.startsWith('--feedback-dir=')) feedbackDir = path.resolve(arg.slice(15));
819
+ if (arg.startsWith('--write-suite=')) suiteOutputPath = path.resolve(arg.slice(14));
820
+ if (arg.startsWith('--write-report=')) proofReportPath = path.resolve(arg.slice(15));
821
+ if (arg.startsWith('--max-cases=')) maxCases = Number(arg.slice(12));
336
822
  }
337
823
 
338
- const report = runSuite(suitePath, { minScore });
824
+ let suite;
825
+ let report;
826
+ if (fromFeedback) {
827
+ ({ suite, report } = runFeedbackEvalSuite({ feedbackLog, feedbackDir, minScore, maxCases }));
828
+ } else {
829
+ const loadedSuite = loadSuite(suitePath);
830
+ suite = expandSynthetic
831
+ ? expandWithSyntheticEvaluations(loadedSuite, { syntheticVariants })
832
+ : loadedSuite;
833
+ report = runSuite(suitePath, {
834
+ minScore,
835
+ baselinePath,
836
+ requireNoRegressions,
837
+ expandSynthetic,
838
+ syntheticVariants,
839
+ });
840
+ }
841
+
842
+ if (outputPath) writeReport(report, outputPath);
843
+ if (suiteOutputPath) writeSuite(suite, suiteOutputPath);
844
+ if (proofReportPath) {
845
+ fs.mkdirSync(path.dirname(proofReportPath), { recursive: true });
846
+ fs.writeFileSync(proofReportPath, `${formatProofReport(report, suite)}\n`, 'utf8');
847
+ }
339
848
 
340
849
  if (json) {
341
- console.log(JSON.stringify(report, null, 2));
850
+ console.log(JSON.stringify({ ...report, suiteDefinition: fromFeedback ? suite : undefined }, null, 2));
342
851
  } else {
343
852
  console.log(`\n${report.suite}`);
344
853
  console.log('='.repeat(50));
@@ -354,10 +863,33 @@ if (isCliInvocation()) {
354
863
  }
355
864
  console.log('='.repeat(50));
356
865
  console.log(`Score: ${report.score}% | Pass: ${report.passed} | Fail: ${report.failed} | Error: ${report.errors} | Skip: ${report.skipped}`);
866
+ if (report.syntheticCount > 0) {
867
+ console.log(`Synthetic cases: ${report.syntheticCount}`);
868
+ }
869
+ if (report.comparison) {
870
+ console.log(`Baseline delta: ${report.comparison.scoreDelta >= 0 ? '+' : ''}${report.comparison.scoreDelta}%`);
871
+ console.log(`Regressions: ${report.comparison.regressions.length} | Improvements: ${report.comparison.improvements.length}`);
872
+ }
357
873
  console.log(report.pass ? '\u2705 PASS' : `\u274C FAIL (min: ${minScore}%)`);
358
874
  }
359
875
 
360
876
  process.exit(report.pass ? 0 : 1);
361
877
  }
362
878
 
363
- module.exports = { runSuite, runEvaluation, gradeOutput, loadSuite };
879
+ module.exports = {
880
+ buildEvalSuiteFromFeedback,
881
+ feedbackEntryToEvalCase,
882
+ formatProofReport,
883
+ gradeOutput,
884
+ loadSuite,
885
+ loadReport,
886
+ compareReports,
887
+ readJsonl,
888
+ runEvaluation,
889
+ runFeedbackEvalSuite,
890
+ runSuite,
891
+ runSuiteObject,
892
+ writeReport,
893
+ writeSuite,
894
+ expandWithSyntheticEvaluations,
895
+ };