thumbgate 1.14.1 → 1.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +6 -6
- package/.claude-plugin/plugin.json +3 -3
- package/.well-known/llms.txt +5 -5
- package/.well-known/mcp/server-card.json +1 -1
- package/README.md +60 -35
- package/adapters/chatgpt/openapi.yaml +118 -2
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/mcp/server-stdio.js +217 -84
- package/adapters/opencode/opencode.json +1 -1
- package/bench/prompt-eval-suite.json +5 -1
- package/bin/cli.js +211 -8
- package/config/enforcement.json +59 -7
- package/config/evals/agent-safety-eval.json +338 -22
- package/config/gates/default.json +33 -0
- package/config/gates/routine.json +43 -0
- package/config/github-about.json +3 -3
- package/config/mcp-allowlists.json +4 -0
- package/config/merge-quality-checks.json +2 -1
- package/config/model-candidates.json +131 -0
- package/openapi/openapi.yaml +118 -2
- package/package.json +70 -51
- package/public/blog.html +7 -7
- package/public/codex-plugin.html +13 -7
- package/public/compare.html +29 -23
- package/public/dashboard.html +105 -12
- package/public/guide.html +28 -28
- package/public/index.html +233 -97
- package/public/learn.html +87 -20
- package/public/lessons.html +26 -2
- package/public/numbers.html +271 -0
- package/public/pro.html +89 -19
- package/scripts/agent-audit-trace.js +55 -0
- package/scripts/agent-memory-lifecycle.js +96 -0
- package/scripts/agent-readiness-plan.js +118 -0
- package/scripts/agentic-data-pipeline.js +21 -1
- package/scripts/agents-sdk-sandbox-plan.js +57 -0
- package/scripts/ai-org-governance.js +98 -0
- package/scripts/ai-search-distribution.js +43 -0
- package/scripts/artifact-agent-plan.js +81 -0
- package/scripts/billing.js +27 -8
- package/scripts/cli-feedback.js +2 -1
- package/scripts/cli-schema.js +60 -5
- package/scripts/code-mode-mcp-plan.js +71 -0
- package/scripts/commercial-offer.js +1 -1
- package/scripts/context-engine.js +1 -2
- package/scripts/context-manager.js +4 -1
- package/scripts/contextfs.js +214 -32
- package/scripts/dashboard-render-spec.js +1 -1
- package/scripts/dashboard.js +275 -9
- package/scripts/decision-journal.js +13 -3
- package/scripts/document-workflow-governance.js +62 -0
- package/scripts/enterprise-agent-rollout.js +34 -0
- package/scripts/experience-replay-governance.js +69 -0
- package/scripts/export-hf-dataset.js +1 -1
- package/scripts/feedback-loop.js +141 -9
- package/scripts/feedback-to-rules.js +17 -23
- package/scripts/gates-engine.js +4 -6
- package/scripts/growth-campaigns.js +49 -0
- package/scripts/harness-selector.js +145 -1
- package/scripts/hybrid-supervisor-agent.js +64 -0
- package/scripts/inference-cache-policy.js +72 -0
- package/scripts/inference-economics.js +53 -0
- package/scripts/internal-agent-bootstrap.js +12 -2
- package/scripts/knowledge-layer-plan.js +108 -0
- package/scripts/lesson-canonical.js +181 -0
- package/scripts/lesson-db.js +71 -10
- package/scripts/lesson-inference.js +183 -44
- package/scripts/lesson-search.js +4 -1
- package/scripts/lesson-synthesis.js +23 -2
- package/scripts/llm-client.js +157 -26
- package/scripts/mailer/resend-mailer.js +112 -1
- package/scripts/mcp-transport-strategy.js +66 -0
- package/scripts/memory-store-governance.js +60 -0
- package/scripts/meta-agent-loop.js +7 -13
- package/scripts/model-access-eligibility.js +38 -0
- package/scripts/model-migration-readiness.js +55 -0
- package/scripts/native-messaging-audit.js +514 -0
- package/scripts/operational-integrity.js +96 -3
- package/scripts/otel-declarative-config.js +56 -0
- package/scripts/perplexity-client.js +1 -1
- package/scripts/post-training-governance.js +34 -0
- package/scripts/pr-manager.js +47 -7
- package/scripts/private-core-boundary.js +72 -0
- package/scripts/production-agent-readiness.js +40 -0
- package/scripts/profile-router.js +16 -1
- package/scripts/prompt-eval.js +564 -32
- package/scripts/prompt-programs.js +93 -0
- package/scripts/provider-action-normalizer.js +585 -0
- package/scripts/rule-validator.js +285 -0
- package/scripts/scaling-law-claims.js +60 -0
- package/scripts/security-scanner.js +1 -1
- package/scripts/self-distill-agent.js +7 -32
- package/scripts/seo-gsd.js +400 -43
- package/scripts/skill-rag-router.js +53 -0
- package/scripts/spec-gate.js +1 -1
- package/scripts/student-consistent-training.js +73 -0
- package/scripts/synthetic-data-provenance.js +98 -0
- package/scripts/task-context-result.js +81 -0
- package/scripts/telemetry-analytics.js +149 -0
- package/scripts/thompson-sampling.js +2 -2
- package/scripts/token-savings.js +7 -6
- package/scripts/token-tco.js +46 -0
- package/scripts/tool-registry.js +75 -3
- package/scripts/verification-loop.js +10 -1
- package/scripts/verifier-scoring.js +71 -0
- package/scripts/workflow-sentinel.js +284 -28
- package/scripts/workspace-agent-routines.js +118 -0
- package/skills/thumbgate/SKILL.md +1 -1
- package/src/api/server.js +434 -120
- package/.claude-plugin/README.md +0 -170
- package/adapters/README.md +0 -12
- package/scripts/analytics-report.js +0 -328
- package/scripts/autonomous-workflow.js +0 -377
- package/scripts/billing-setup.js +0 -109
- package/scripts/creator-campaigns.js +0 -239
- package/scripts/cross-encoder-reranker.js +0 -235
- package/scripts/daemon-manager.js +0 -108
- package/scripts/decision-trace.js +0 -354
- package/scripts/delegation-runtime.js +0 -896
- package/scripts/dispatch-brief.js +0 -159
- package/scripts/distribution-surfaces.js +0 -110
- package/scripts/feedback-history-distiller.js +0 -382
- package/scripts/funnel-analytics.js +0 -35
- package/scripts/history-distiller.js +0 -200
- package/scripts/hosted-job-launcher.js +0 -256
- package/scripts/intent-router.js +0 -392
- package/scripts/lesson-reranker.js +0 -263
- package/scripts/lesson-retrieval.js +0 -148
- package/scripts/managed-lesson-agent.js +0 -183
- package/scripts/operational-dashboard.js +0 -103
- package/scripts/operational-summary.js +0 -129
- package/scripts/operator-artifacts.js +0 -608
- package/scripts/optimize-context.js +0 -17
- package/scripts/org-dashboard.js +0 -206
- package/scripts/partner-orchestration.js +0 -146
- package/scripts/predictive-insights.js +0 -356
- package/scripts/pulse.js +0 -80
- package/scripts/reflector-agent.js +0 -221
- package/scripts/sales-pipeline.js +0 -681
- package/scripts/session-episode-store.js +0 -329
- package/scripts/session-health-sensor.js +0 -242
- package/scripts/session-report.js +0 -120
- package/scripts/swarm-coordinator.js +0 -81
- package/scripts/tool-kpi-tracker.js +0 -12
- package/scripts/webhook-delivery.js +0 -62
- package/scripts/workflow-sprint-intake.js +0 -475
- package/skills/agent-memory/SKILL.md +0 -97
- package/skills/solve-architecture-autonomy/SKILL.md +0 -17
- package/skills/solve-architecture-autonomy/tool.js +0 -33
- package/skills/thumbgate-feedback/SKILL.md +0 -49
package/scripts/prompt-eval.js
CHANGED
|
@@ -20,6 +20,8 @@ const path = require('node:path');
|
|
|
20
20
|
|
|
21
21
|
const ROOT = path.join(__dirname, '..');
|
|
22
22
|
const DEFAULT_SUITE = path.join(ROOT, 'bench', 'prompt-eval-suite.json');
|
|
23
|
+
const DEFAULT_SYNTHETIC_VARIANTS = 2;
|
|
24
|
+
const DEFAULT_MAX_FEEDBACK_CASES = 25;
|
|
23
25
|
|
|
24
26
|
// ---------------------------------------------------------------------------
|
|
25
27
|
// Prompt simulators — run ThumbGate's actual logic against eval inputs
|
|
@@ -51,28 +53,58 @@ function simulateLessonDistillation(input) {
|
|
|
51
53
|
|
|
52
54
|
/**
 * Simulates feedback enrichment for an eval case by handing the case input to
 * `enrichFeedbackContext` from `./feedback-loop`.
 *
 * @param {object} input - Eval input. Reads `signal`, `context`, `tags`,
 *   `whatWentWrong`, `whatToChange`, `filePaths` and `errorType`.
 * @returns {*} Whatever `enrichFeedbackContext` returns for the assembled event.
 */
function simulateFeedbackEnrichment(input) {
  // Resolved at call time rather than at module load.
  const { enrichFeedbackContext: enrich } = require('./feedback-loop');
  const { signal, context } = input;

  // Optional fields fall back to empty values so the event shape is stable.
  const event = {
    signal,
    context,
    tags: input.tags || [],
    whatWentWrong: input.whatWentWrong || '',
    whatToChange: input.whatToChange || '',
  };
  const enrichmentOptions = {
    filePaths: input.filePaths || [],
    errorType: input.errorType || null,
  };

  return enrich(event, enrichmentOptions);
}
|
|
60
68
|
|
|
61
69
|
/**
 * Builds a simulated prevention-rule envelope for an eval case.
 *
 * Prevention rules are generated from accumulated patterns; for eval purposes
 * this produces a realistic "block" rule envelope from the case input.
 *
 * @param {object} input - Eval input. Reads `pattern`, `occurrences` and `examples`.
 * @returns {{pattern: *, occurrences: *, examples: Array, rule: string,
 *   actionType: string, confidence: number, generated: boolean}} Rule envelope.
 *   `confidence` is `occurrences / 4` clamped to the range [0.7, 0.99].
 */
function simulatePreventionRule(input) {
  // Drop falsy examples (empty strings, nulls) and tolerate a missing list.
  const normalizedExamples = Array.isArray(input.examples) ? input.examples.filter(Boolean) : [];

  // Coerce the first example to a string so a non-string truthy example
  // (number, object) cannot throw on `.toLowerCase()`.
  const ruleText = normalizedExamples.length > 0
    ? `NEVER repeat ${String(normalizedExamples[0]).toLowerCase()}; keep the workflow inside the worktree.`
    : `NEVER repeat pattern ${String(input.pattern || '').trim() || 'unknown-pattern'}.`;

  // A non-numeric `occurrences` would otherwise propagate NaN through
  // Math.min/Math.max; treat it as zero so the confidence floor applies.
  const ratio = Number(input.occurrences || 0) / 4;
  const confidence = Math.max(0.7, Math.min(0.99, Number.isNaN(ratio) ? 0 : ratio));

  return {
    pattern: input.pattern,
    occurrences: input.occurrences,
    examples: normalizedExamples,
    rule: ruleText,
    actionType: 'block',
    confidence,
    generated: true,
  };
}
|
|
71
86
|
|
|
72
87
|
/**
 * Simulates a self-distillation summary over a session's feedback entries.
 *
 * @param {object} input - Eval input. Reads `sessionFeedback`; anything that is
 *   not an array is treated as an empty list.
 * @returns {{sessionFeedback: Array, summary: string, pattern: string,
 *   improvement: string, generated: boolean}} The feedback list plus a pattern
 *   line, an improvement line, and a '; '-joined summary of all non-empty
 *   contexts followed by those two lines.
 */
function simulateSelfDistill(input) {
  const sessionFeedback = Array.isArray(input.sessionFeedback) ? input.sessionFeedback : [];

  const allContexts = [];
  const negativeContexts = [];
  let mentionsThumbGate = false;

  // Single pass: collect trimmed, non-empty contexts and the negative subset.
  for (const entry of sessionFeedback) {
    const text = String(entry?.context || '').trim();
    if (!text) continue;

    allContexts.push(text);
    if (entry?.signal === 'negative') negativeContexts.push(text);
    if (!mentionsThumbGate && /thumbgate/i.test(text)) mentionsThumbGate = true;
  }

  // A repeated theme needs at least two negative contexts; only the first two are quoted.
  let pattern = 'Pattern: isolated session mistake with no repeated theme yet.';
  if (negativeContexts.length > 1) {
    pattern = `Pattern: repeated workflow discipline gaps around ${negativeContexts.slice(0, 2).join(' and ')}.`;
  }

  let improvement = 'Improvement: start each session with ThumbGate and enforce worktree discipline.';
  if (mentionsThumbGate) {
    improvement = 'Improvement: keep using ThumbGate at session start and stay inside the worktree.';
  }

  return {
    sessionFeedback,
    summary: allContexts.concat(pattern, improvement).join('; '),
    pattern,
    improvement,
    generated: true,
  };
}
|
|
@@ -188,11 +220,34 @@ function addContextChecks(checks, result, expected) {
|
|
|
188
220
|
/**
 * Appends rule-related grading checks to `checks` when the expectation asks
 * for a rule (`expected.hasRule`). Does nothing otherwise.
 *
 * Checks added:
 *  - `hasRule`: passes when `result.generated === true` or a rule string exists.
 *  - `ruleContains`: delegated to `addContainsChecks` with `expected.ruleContains`.
 *  - `actionType`: only when `expected.actionType` is set.
 *  - `confidenceMin`: only when `expected.confidence.min` is defined.
 *
 * @param {Array<object>} checks - Mutated in place; new checks are pushed onto it.
 * @param {object} result - Simulator output being graded.
 * @param {object} expected - Expected-output spec from the eval case.
 * @returns {void}
 */
function addRuleChecks(checks, result, expected) {
  if (!expected.hasRule) return;

  // NOTE(review): `firstString` is defined elsewhere in this file; `.length`
  // below assumes it returns a string (possibly empty) — confirm.
  const rule = firstString(result.rule, result.pattern, result.summary);
  const hasRule = result.generated === true || rule.length > 0;
  checks.push({
    criterion: 'hasRule',
    pass: hasRule,
    // Derive the detail from the same condition as `pass` so a passing check
    // can never be reported as 'No rule generated' (or the reverse).
    detail: hasRule ? 'Rule generated' : 'No rule generated',
  });
  addContainsChecks(checks, 'ruleContains', 'Rule', rule, expected.ruleContains);

  if (expected.actionType) {
    const actionType = firstString(result.actionType, result.action, result.availability);
    checks.push({
      criterion: 'actionType',
      pass: actionType === expected.actionType,
      detail: `Expected "${expected.actionType}", got "${actionType}"`,
    });
  }

  if (expected.confidence?.min !== undefined) {
    const confidence = Number(result.confidence);
    const minConfidence = Number(expected.confidence.min);
    checks.push({
      criterion: 'confidenceMin',
      pass: Number.isFinite(confidence) && confidence >= minConfidence,
      detail: Number.isFinite(confidence)
        ? `Expected >= ${minConfidence}, got ${confidence}`
        : 'Missing numeric confidence',
    });
  }
}
|
|
197
252
|
|
|
198
253
|
function addSummaryChecks(checks, result, expected) {
|
|
@@ -205,6 +260,24 @@ function addSummaryChecks(checks, result, expected) {
|
|
|
205
260
|
detail: `Summary length: ${summary.length}`,
|
|
206
261
|
});
|
|
207
262
|
addContainsChecks(checks, 'summaryContains', 'Summary', summary, expected.summaryContains);
|
|
263
|
+
|
|
264
|
+
if (expected.identifiesPattern) {
|
|
265
|
+
const pattern = firstString(result.pattern, summary);
|
|
266
|
+
checks.push({
|
|
267
|
+
criterion: 'identifiesPattern',
|
|
268
|
+
pass: /pattern|repeat|repeated|recurring/i.test(pattern),
|
|
269
|
+
detail: pattern ? `Pattern text: "${pattern.slice(0, 80)}"` : 'Missing pattern identification',
|
|
270
|
+
});
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
if (expected.suggestsImprovement) {
|
|
274
|
+
const improvement = firstString(result.improvement, summary);
|
|
275
|
+
checks.push({
|
|
276
|
+
criterion: 'suggestsImprovement',
|
|
277
|
+
pass: /improvement|should|next time|keep|start|use/i.test(improvement),
|
|
278
|
+
detail: improvement ? `Improvement text: "${improvement.slice(0, 80)}"` : 'Missing improvement guidance',
|
|
279
|
+
});
|
|
280
|
+
}
|
|
208
281
|
}
|
|
209
282
|
|
|
210
283
|
function gradeOutput(output, expected) {
|
|
@@ -223,6 +296,243 @@ function gradeOutput(output, expected) {
|
|
|
223
296
|
return checks;
|
|
224
297
|
}
|
|
225
298
|
|
|
299
|
+
// ---------------------------------------------------------------------------
|
|
300
|
+
// Feedback -> eval conversion
|
|
301
|
+
// ---------------------------------------------------------------------------
|
|
302
|
+
|
|
303
|
+
/**
 * Reads a JSON Lines file and returns its parsed records.
 *
 * Best-effort by design: an unreadable file yields an empty array, and lines
 * that are empty, fail to parse, or parse to a falsy value are skipped.
 *
 * @param {string} filePath - Path of the JSONL file to read.
 * @returns {Array<*>} Truthy parsed values, in file order.
 */
function readJsonl(filePath) {
  let raw;
  try {
    raw = fs.readFileSync(filePath, 'utf8');
  } catch {
    return [];
  }

  const records = [];
  for (const line of raw.split(/\r?\n/)) {
    if (!line) continue;

    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      continue;
    }

    // Falsy parse results (null, 0, false, "") are dropped as well.
    if (parsed) records.push(parsed);
  }
  return records;
}
|
|
320
|
+
|
|
321
|
+
/**
 * Derives a slug-style case id from an arbitrary value.
 *
 * The value is lower-cased; every run of characters outside `a-z0-9` becomes a
 * single dash; the slug is capped at 64 characters, stripped of edge dashes,
 * then cut to 48 characters. The 1-based index is appended as `-<n>`, and
 * `entry` is used when no slug characters remain.
 *
 * @param {*} value - Source value; falsy values are treated as an empty string.
 * @param {number} [index=0] - Zero-based position; `index + 1` becomes the suffix.
 * @returns {string} Id of the form `<slug>-<index + 1>`.
 */
function stableCaseId(value, index = 0) {
  const lowered = String(value || '').toLowerCase();

  // Collapse non-alphanumeric runs to one dash, never starting with a dash.
  const collapsed = lowered.replace(/[^a-z0-9]+/g, '-').replace(/^-+/, '');

  // Cap first, trim edge dashes second, cut to 48 last — the final cut happens
  // after trimming, so it may leave a trailing dash in place.
  const capped = collapsed.slice(0, 64);
  const withoutEdgeDashes = capped.replace(/^-+/, '').replace(/-+$/, '');
  const normalized = withoutEdgeDashes.slice(0, 48);

  const base = normalized || 'entry';
  return `${base}-${index + 1}`;
}
|
|
348
|
+
|
|
349
|
+
/**
 * Maps a feedback entry's raw signal to 'negative', 'positive', or null.
 *
 * The first truthy value among `entry.signal`, `entry.feedback` and
 * `entry.rating` is stringified and lower-cased before matching.
 *
 * @param {object} [entry={}] - Feedback entry.
 * @returns {'negative'|'positive'|null} Normalized signal, or null when the
 *   raw value is not one of the recognized spellings.
 */
function normalizeSignal(entry = {}) {
  const raw = String(entry.signal || entry.feedback || entry.rating || '').toLowerCase();

  switch (raw) {
    case 'down':
    case 'negative':
    case 'thumbs_down':
    case 'thumbs-down':
    case '-1':
      return 'negative';
    case 'up':
    case 'positive':
    case 'thumbs_up':
    case 'thumbs-up':
    case '+1':
      return 'positive';
    default:
      return null;
  }
}
|
|
355
|
+
|
|
356
|
+
/**
 * Joins the non-blank string arguments into one single-spaced line.
 *
 * Non-string and blank arguments are ignored; each remaining value is trimmed
 * and has its internal whitespace runs collapsed to a single space.
 *
 * @param {...*} values - Candidate text fragments.
 * @returns {string} Fragments joined by a single space ('' when none qualify).
 */
function compactText(...values) {
  const fragments = [];
  for (const value of values) {
    if (typeof value !== 'string') continue;

    const trimmed = value.trim();
    if (!trimmed) continue;

    fragments.push(trimmed.replace(/\s+/g, ' '));
  }
  return fragments.join(' ').trim();
}
|
|
363
|
+
|
|
364
|
+
/**
 * Extracts up to `limit` distinct keyword terms from free text.
 *
 * Tokens are lower-cased matches of `/[a-z][a-z0-9_-]{3,}/` (four or more
 * characters, starting with a letter). Stop words and repeats are skipped, and
 * terms keep their order of first appearance.
 *
 * @param {*} text - Source text; falsy values are treated as empty.
 * @param {number} [limit=3] - Maximum number of terms to return. Values of
 *   zero or below yield an empty list.
 * @returns {string[]} The selected terms.
 */
function keywordTerms(text, limit = 3) {
  // The cap is otherwise only checked after a push, which would let a limit of
  // zero (or a negative limit) return one term.
  if (limit <= 0) return [];

  const stopWords = new Set([
    'about', 'after', 'again', 'agent', 'because', 'before', 'being', 'change',
    'could', 'from', 'have', 'into', 'should', 'that', 'their', 'there', 'this',
    'touch', 'when', 'where', 'with', 'work', 'would',
  ]);
  const candidates = String(text || '').toLowerCase().match(/[a-z][a-z0-9_-]{3,}/g) || [];

  const seen = new Set();
  const terms = [];
  for (const token of candidates) {
    if (stopWords.has(token) || seen.has(token)) continue;
    seen.add(token);
    terms.push(token);
    if (terms.length >= limit) break;
  }
  return terms;
}
|
|
380
|
+
|
|
381
|
+
/**
 * Converts one feedback-log entry into a 'lesson-distillation' eval case.
 *
 * @param {object} [entry={}] - Raw feedback entry. Text is gathered from
 *   several alternative field names per slot (e.g. `context`/`summary`/
 *   `message`/`userText`).
 * @param {number} [index=0] - Position of the entry in its log; feeds the case id.
 * @returns {object|null} The eval case, or null when the entry carries no
 *   recognizable thumbs-up/down signal. Feedback whose actionable text is
 *   shorter than 24 characters, or is just "thumbs up"/"thumbs down", produces
 *   a case that expects rejection as 'vague-feedback'.
 */
function feedbackEntryToEvalCase(entry = {}, index = 0) {
  const signal = normalizeSignal(entry);
  if (!signal) return null;

  const isNegative = signal === 'negative';
  const feedbackId = entry.id || entry.feedbackId;

  const context = compactText(entry.context, entry.summary, entry.message, entry.userText);
  const whatWentWrong = compactText(entry.whatWentWrong, entry.rootCause, entry.failure, entry.error);
  const whatToChange = compactText(entry.whatToChange, entry.correctiveAction, entry.fix, entry.recommendation);
  const whatWorked = compactText(entry.whatWorked, entry.success, entry.outcome);

  // Tags may arrive as an array or as a comma-separated string.
  let tags;
  if (Array.isArray(entry.tags)) {
    tags = entry.tags.map((tag) => String(tag)).filter(Boolean);
  } else {
    tags = String(entry.tags || '')
      .split(',')
      .map((tag) => tag.trim())
      .filter(Boolean);
  }

  // Without an explicit id, the id is derived from the entry's own content.
  const rawId = feedbackId || `${signal}:${context}:${whatWentWrong}:${whatToChange}:${whatWorked}`;

  // Negative feedback is judged on its corrective text first; positive on context.
  const actionableText = isNegative
    ? compactText(whatToChange, whatWentWrong, context)
    : compactText(context, whatWorked);
  const terms = keywordTerms(actionableText, 2);
  const vague = actionableText.length < 24 || /^thumbs?\s*(up|down)$/i.test(actionableText);

  let expectedOutput;
  if (vague) {
    expectedOutput = { shouldReject: true, rejectReason: 'vague-feedback' };
  } else {
    expectedOutput = { hasTitle: true, hasContent: isNegative };
    // Keyword expectations are only attached to negative feedback.
    if (isNegative && terms.length > 0) expectedOutput.contentContains = terms;
    expectedOutput.category = isNegative ? 'error' : 'learning';
  }

  return {
    id: `feedback-${signal}-${stableCaseId(rawId, index)}`,
    prompt: 'lesson-distillation',
    source: {
      type: 'feedback',
      feedbackId: feedbackId || null,
      timestamp: entry.timestamp || null,
    },
    input: {
      signal,
      context,
      whatWentWrong,
      whatToChange,
      whatWorked,
      tags,
    },
    expectedOutput,
  };
}
|
|
426
|
+
|
|
427
|
+
/**
 * Builds an eval suite object from feedback-log entries.
 *
 * @param {Array<object>} [entries=[]] - Feedback entries; null is treated as empty.
 * @param {object} [options={}]
 * @param {number|string} [options.maxCases] - Upper bound on generated cases
 *   (minimum 1). Unset, null, blank or non-numeric values use
 *   DEFAULT_MAX_FEEDBACK_CASES.
 * @param {string} [options.name] - Suite name override.
 * @param {string} [options.sourcePath] - Recorded as `source.path`.
 * @returns {object} Suite with `version`, `name`, `description`, `generatedAt`,
 *   `source` metadata and the `evaluations` array.
 */
function buildEvalSuiteFromFeedback(entries = [], options = {}) {
  const feedbackEntries = entries ?? [];

  // Number(null) and Number('') are both 0, which would be clamped to a single
  // case; treat those as "not specified" instead.
  const requested = options.maxCases;
  const hasRequested = requested !== null && requested !== undefined && requested !== '';
  const maxCases = hasRequested && Number.isFinite(Number(requested))
    ? Math.max(1, Number(requested))
    : DEFAULT_MAX_FEEDBACK_CASES;

  const cases = [];
  const seen = new Set();

  for (const [index, entry] of feedbackEntries.entries()) {
    const evalCase = feedbackEntryToEvalCase(entry, index);
    // NOTE(review): case ids end in the entry's 1-based index, so this id
    // check looks unable to match for repeated feedback at different
    // positions — confirm whether content-based de-duplication was intended.
    if (!evalCase || seen.has(evalCase.id)) continue;
    seen.add(evalCase.id);
    cases.push(evalCase);
    if (cases.length >= maxCases) break;
  }

  return {
    version: 1,
    name: options.name || 'ThumbGate Feedback-Derived Prompt Evaluation',
    description: 'Reusable eval cases generated from thumbs-up/down feedback. These cases prove whether a feedback-derived behavior now passes instead of relying on prompt vibes.',
    generatedAt: new Date().toISOString(),
    source: {
      type: 'feedback-log',
      path: options.sourcePath || null,
      totalEntries: feedbackEntries.length,
      selectedCases: cases.length,
    },
    evaluations: cases,
  };
}
|
|
456
|
+
|
|
457
|
+
/**
 * Runs every evaluation of an in-memory suite and aggregates the results.
 *
 * @param {object} suite - Suite with an `evaluations` array.
 * @param {object} [options={}]
 * @param {number} [options.minScore=80] - Aggregate score needed to pass.
 * @param {boolean} [options.allowEmpty] - Accept a suite with zero evaluations;
 *   such a run scores 100 and is flagged with `noCases: true`.
 * @returns {object} Report with status counts, the rounded mean `score`,
 *   `pass`, `noCases`, `feedbackDerived`, `generatedAt` and per-case `results`.
 * @throws {Error} When `suite.evaluations` is not an array, or is empty
 *   without `allowEmpty`.
 */
function runSuiteObject(suite, options = {}) {
  if (!suite || !Array.isArray(suite.evaluations) || (!options.allowEmpty && suite.evaluations.length === 0)) {
    throw new Error('Suite must define a non-empty evaluations array');
  }

  const results = suite.evaluations.map(runEvaluation);

  // One pass over the results: count each known status and sum the scores.
  const counts = new Map([['pass', 0], ['fail', 0], ['error', 0], ['skip', 0]]);
  let scoreSum = 0;
  for (const result of results) {
    if (counts.has(result.status)) counts.set(result.status, counts.get(result.status) + 1);
    scoreSum += result.score;
  }

  const totalScore = results.length > 0 ? Math.round(scoreSum / results.length) : 100;
  const minScore = options.minScore ?? 80;

  return {
    suite: suite.name,
    total: results.length,
    passed: counts.get('pass'),
    failed: counts.get('fail'),
    errors: counts.get('error'),
    skipped: counts.get('skip'),
    score: totalScore,
    minScore,
    pass: totalScore >= minScore,
    noCases: results.length === 0,
    // Always a boolean; `suite.source && ...` leaked undefined/null when the
    // suite carried no source metadata.
    feedbackDerived: suite.source?.type === 'feedback-log',
    generatedAt: new Date().toISOString(),
    results,
  };
}
|
|
488
|
+
|
|
489
|
+
/**
 * Builds an eval suite from a feedback log and runs it immediately.
 *
 * @param {object} [options={}]
 * @param {string} [options.feedbackLog] - Explicit path to the JSONL log. When
 *   absent, `feedback-log.jsonl` inside the directory returned by
 *   `resolveFeedbackDir` (from `./feedback-paths`) is used.
 * @param {string} [options.feedbackDir] - Passed to `resolveFeedbackDir`.
 * @param {number} [options.maxCases] - Forwarded to the suite builder.
 * @param {string} [options.name] - Suite name override.
 * @param {number} [options.minScore] - Forwarded to the suite runner.
 * @returns {{suite: object, report: object}} The generated suite and its report.
 */
function runFeedbackEvalSuite(options = {}) {
  let feedbackLog = options.feedbackLog;
  if (!feedbackLog) {
    // Only resolve the default feedback directory when no log path was given.
    const { resolveFeedbackDir } = require('./feedback-paths');
    const feedbackDir = resolveFeedbackDir({ feedbackDir: options.feedbackDir });
    feedbackLog = path.join(feedbackDir, 'feedback-log.jsonl');
  }

  const { maxCases, name, minScore } = options;
  const suite = buildEvalSuiteFromFeedback(readJsonl(feedbackLog), {
    maxCases,
    name,
    sourcePath: feedbackLog,
  });

  // An empty log is allowed here rather than treated as an error.
  const report = runSuiteObject(suite, { minScore, allowEmpty: true });
  return { suite, report };
}
|
|
503
|
+
|
|
504
|
+
/**
 * Renders an evaluation report as a Markdown "proof" document.
 *
 * @param {object} report - Report with `generatedAt`, `suite`, `score`,
 *   `minScore`, `pass`, the status counts, `total` and `results`.
 * @param {object} [suite] - Suite whose `source` supplies `totalEntries` and
 *   `selectedCases`; when those are missing the output falls back to 0 and
 *   `report.total` respectively.
 * @returns {string} Markdown text with lines joined by '\n'.
 */
function formatProofReport(report, suite) {
  const feedbackSource = suite && suite.source ? suite.source : {};
  const verdict = report.pass ? 'PASS' : 'FAIL';
  const scannedEntries = feedbackSource.totalEntries || 0;
  const generatedCases = feedbackSource.selectedCases || report.total;

  const header = [
    '# ThumbGate Prompt Evaluation Proof',
    '',
    `Generated: ${report.generatedAt}`,
    `Suite: ${report.suite}`,
    `Score: ${report.score}% (minimum ${report.minScore}%)`,
    `Result: ${verdict}`,
    '',
    '## Feedback-Derived Coverage',
    '',
    `- Feedback entries scanned: ${scannedEntries}`,
    `- Reusable eval cases generated: ${generatedCases}`,
    `- Passing cases: ${report.passed}/${report.total}`,
    `- Failing cases: ${report.failed}`,
    `- Errors: ${report.errors}`,
    `- Skipped: ${report.skipped}`,
    '',
    '## Case Results',
    '',
  ];

  // One bullet per evaluated case: STATUS id: score%.
  const caseRows = report.results.map(
    (result) => `- ${result.status.toUpperCase()} ${result.id}: ${result.score}%`
  );

  const footer = [
    '',
    '## Buyer Proof',
    '',
    'Every row above started as real operator feedback, became a reusable eval, and now gives a repeatable before/after proof lane for prompt or workflow changes.',
  ];

  return [...header, ...caseRows, ...footer].join('\n');
}
|
|
535
|
+
|
|
226
536
|
// ---------------------------------------------------------------------------
|
|
227
537
|
// Runner
|
|
228
538
|
// ---------------------------------------------------------------------------
|
|
@@ -235,6 +545,72 @@ function loadSuite(suitePath) {
|
|
|
235
545
|
return raw;
|
|
236
546
|
}
|
|
237
547
|
|
|
548
|
+
/**
 * Deep-copies a value by round-tripping it through JSON.
 *
 * Only JSON-representable data survives: `undefined` properties and functions
 * are dropped, and Dates come back as strings.
 *
 * @param {*} value - JSON-serializable value.
 * @returns {*} A structurally equal copy sharing no references with `value`.
 */
function cloneJson(value) {
  const serialized = JSON.stringify(value);
  return JSON.parse(serialized);
}
|
|
551
|
+
|
|
552
|
+
/**
 * Produces a lightly perturbed copy of an eval input for synthetic variants.
 *
 * - Arrays: the first element is mutated recursively, the rest are cloned.
 * - Primitives and null are returned unchanged.
 * - Objects are cloned, then each top-level property is adjusted:
 *   - `context` strings are wrapped as ' <value>\n';
 *   - `whatWentWrong` / `whatWorked` / `whatToChange` strings get
 *     ' Please preserve the core meaning.' appended;
 *   - arrays of strings gain a copy of their first item suffixed with
 *     ' (repeat check)';
 *   - arrays of objects get ' Next session should keep the same lesson.'
 *     appended to the first object's string `context`.
 *
 * @param {*} input - Eval case input.
 * @returns {*} The perturbed copy; object inputs are never modified in place.
 */
function mutateSyntheticInput(input) {
  if (Array.isArray(input)) {
    return input.map((item, position) => (position > 0 ? cloneJson(item) : mutateSyntheticInput(item)));
  }

  if (input === null || typeof input !== 'object') return input;

  const mutated = cloneJson(input);
  const appendedFields = new Set(['whatWentWrong', 'whatWorked', 'whatToChange']);

  for (const key of Object.keys(mutated)) {
    const value = mutated[key];

    if (typeof value === 'string' && value.trim()) {
      if (key === 'context') {
        mutated[key] = ` ${value}\n`;
      } else if (appendedFields.has(key)) {
        mutated[key] = `${value} Please preserve the core meaning.`;
      }
      // Any other non-blank string is left exactly as it was.
      continue;
    }

    if (!Array.isArray(value)) continue;

    if (value.every((entry) => typeof entry === 'string')) {
      // Duplicate the first string (if any) with a marker appended.
      const repeated = value.slice(0, 1).map((entry) => `${entry} (repeat check)`);
      mutated[key] = value.concat(repeated);
    } else if (value.every((entry) => entry && typeof entry === 'object')) {
      mutated[key] = value.map((entry, position) => {
        const isFirstWithContext = position === 0 && typeof entry.context === 'string';
        return isFirstWithContext
          ? { ...entry, context: `${entry.context} Next session should keep the same lesson.` }
          : cloneJson(entry);
      });
    }
  }

  return mutated;
}
|
|
580
|
+
|
|
581
|
+
/**
 * Returns a suite extended with synthetic variants of every evaluation.
 *
 * @param {object} suite - Suite with an `evaluations` array.
 * @param {object} [options={}]
 * @param {number} [options.syntheticVariants] - Variants per seed case
 *   (negative values are clamped to 0). Non-numeric or missing values use
 *   DEFAULT_SYNTHETIC_VARIANTS.
 * @returns {object} The original `suite` object when zero variants are
 *   requested; otherwise a copy whose `evaluations` hold the seed cases
 *   followed by the variants, plus `syntheticVariantsPerCase`,
 *   `syntheticCount` and `totalSeedEvaluations`.
 */
function expandWithSyntheticEvaluations(suite, options = {}) {
  const requested = Number(options.syntheticVariants);
  let variantsPerCase;
  if (Number.isFinite(requested)) {
    variantsPerCase = Math.max(0, requested);
  } else {
    variantsPerCase = DEFAULT_SYNTHETIC_VARIANTS;
  }

  if (variantsPerCase === 0) return suite;

  const seeds = suite.evaluations;
  const synthetic = [];
  for (const seed of seeds) {
    // NOTE(review): the variant number is not passed to mutateSyntheticInput,
    // so the variants of one seed appear to differ only by id — confirm intent.
    let variantNumber = 0;
    while (variantNumber < variantsPerCase) {
      variantNumber += 1;
      synthetic.push({
        ...cloneJson(seed),
        id: `${seed.id}__synthetic_${variantNumber}`,
        input: mutateSyntheticInput(seed.input),
        synthetic: true,
        syntheticSourceId: seed.id,
      });
    }
  }

  // Seed cases come first (same object references), variants after them.
  const evaluations = [...seeds, ...synthetic];

  return {
    ...cloneJson(suite),
    syntheticVariantsPerCase: variantsPerCase,
    syntheticCount: evaluations.length - seeds.length,
    totalSeedEvaluations: seeds.length,
    evaluations,
  };
}
|
|
609
|
+
|
|
610
|
+
/**
 * Loads a previously written JSON report from disk.
 *
 * @param {string} reportPath - Path of the JSON report file.
 * @returns {*} The parsed file contents.
 * @throws {Error} When the file cannot be read or does not contain valid JSON.
 */
function loadReport(reportPath) {
  const contents = fs.readFileSync(reportPath, 'utf8');
  return JSON.parse(contents);
}
|
|
613
|
+
|
|
238
614
|
function runEvaluation(evalCase) {
|
|
239
615
|
const simulator = PROMPT_SIMULATORS[evalCase.prompt];
|
|
240
616
|
if (!simulator) {
|
|
@@ -280,35 +656,86 @@ function runEvaluation(evalCase) {
|
|
|
280
656
|
}
|
|
281
657
|
|
|
282
658
|
/**
 * Loads a suite file, optionally expands it with synthetic variants, runs it,
 * and optionally compares the outcome against a baseline report.
 *
 * @param {string} [suitePath=DEFAULT_SUITE] - Path of the suite file.
 * @param {object} [options={}]
 * @param {boolean} [options.expandSynthetic] - Add synthetic variants first.
 * @param {number|string} [options.minScore] - Pass threshold. When unset, null,
 *   blank or non-numeric, the suite's `successCriteria.minAggregateScore` is
 *   used, and 80 when that is missing or non-numeric as well.
 * @param {object} [options.baselineReport] - Baseline report object.
 * @param {string} [options.baselinePath] - Baseline report file, read only
 *   when `baselineReport` is not supplied.
 * @param {boolean} [options.requireNoRegressions] - Fail the run when the
 *   comparison lists any regression (also enabled by
 *   `successCriteria.requireNoRegressions` on the suite).
 * @returns {object} The run report plus `successCriteria`, `syntheticCount`
 *   and, when a baseline is available, `comparison`.
 */
function runSuite(suitePath = DEFAULT_SUITE, options = {}) {
  // Number(null) and Number('') are 0, so unset thresholds must be filtered
  // out before coercion; an explicit 0 remains a valid threshold.
  const toThreshold = (value) => {
    if (value === null || value === undefined || value === '') return null;
    const numeric = Number(value);
    return Number.isFinite(numeric) ? numeric : null;
  };

  const loadedSuite = loadSuite(suitePath);
  const suite = options.expandSynthetic
    ? expandWithSyntheticEvaluations(loadedSuite, options)
    : loadedSuite;

  const minScore = toThreshold(options.minScore)
    ?? toThreshold(suite.successCriteria?.minAggregateScore)
    ?? 80;

  const report = {
    ...runSuiteObject(suite, { ...options, minScore }),
    successCriteria: suite.successCriteria || null,
    syntheticCount: Number(suite.syntheticCount || 0),
  };

  const baselineReport = options.baselineReport
    || (options.baselinePath ? loadReport(options.baselinePath) : null);
  if (baselineReport) {
    report.comparison = compareReports(report, baselineReport);
    const requireNoRegressions = options.requireNoRegressions === true
      || suite.successCriteria?.requireNoRegressions === true;
    // A regression can only turn a pass into a fail, never the reverse.
    if (requireNoRegressions && report.comparison.regressions.length > 0) {
      report.pass = false;
    }
  }

  return report;
}
|
|
685
|
+
|
|
686
|
+
/**
 * Compares a current report with a baseline report, case by case.
 *
 * Cases are matched on `id`; cases absent from the baseline are ignored. A
 * case is a regression when its score dropped or it stopped passing, and an
 * improvement when (not being a regression) its score rose or it started
 * passing.
 *
 * @param {object} currentReport - Report with `score` and `results`.
 * @param {object|null} baselineReport - Earlier report with the same shape.
 * @returns {{baselineSuite: (string|null), baselineScore: (number|null),
 *   scoreDelta: (number|null), regressions: object[], improvements: object[]}}
 *   `baselineScore` and `scoreDelta` are null when the baseline carries no
 *   numeric score.
 */
function compareReports(currentReport, baselineReport) {
  const baselineById = new Map((baselineReport?.results || []).map((result) => [result.id, result]));
  const regressions = [];
  const improvements = [];

  for (const result of currentReport.results || []) {
    const baseline = baselineById.get(result.id);
    if (!baseline) continue;

    const delta = result.score - baseline.score;
    const wasPassing = baseline.status === 'pass';
    const isPassing = result.status === 'pass';
    const change = {
      id: result.id,
      baselineScore: baseline.score,
      currentScore: result.score,
      delta,
      baselineStatus: baseline.status,
      currentStatus: result.status,
    };

    // Regression takes precedence when both conditions could apply.
    if (delta < 0 || (wasPassing && !isPassing)) {
      regressions.push(change);
    } else if (delta > 0 || (!wasPassing && isPassing)) {
      improvements.push(change);
    }
  }

  // Number(null) and Number('') are 0; a baseline without a score must not be
  // reported as a baseline score of 0.
  const rawBaselineScore = baselineReport?.score;
  const hasBaselineScore = rawBaselineScore !== null
    && rawBaselineScore !== undefined
    && rawBaselineScore !== ''
    && Number.isFinite(Number(rawBaselineScore));
  const baselineScore = hasBaselineScore ? Number(rawBaselineScore) : null;

  return {
    baselineSuite: baselineReport?.suite || null,
    baselineScore,
    scoreDelta: hasBaselineScore ? currentReport.score - baselineScore : null,
    regressions,
    improvements,
  };
}
|
|
311
728
|
|
|
729
|
+
/**
 * Writes a report to disk as pretty-printed JSON with a trailing newline,
 * creating the parent directory first if needed.
 *
 * @param {*} report - JSON-serializable report.
 * @param {string} outputPath - Destination file path.
 * @returns {void}
 */
function writeReport(report, outputPath) {
  const targetDir = path.dirname(outputPath);
  fs.mkdirSync(targetDir, { recursive: true });

  const serialized = `${JSON.stringify(report, null, 2)}\n`;
  fs.writeFileSync(outputPath, serialized);
}
|
|
733
|
+
|
|
734
|
+
/**
 * Writes a suite to disk as pretty-printed JSON with a trailing newline,
 * creating the parent directory first if needed.
 *
 * @param {*} suite - JSON-serializable suite.
 * @param {string} outputPath - Destination file path.
 * @returns {void}
 */
function writeSuite(suite, outputPath) {
  const targetDir = path.dirname(outputPath);
  fs.mkdirSync(targetDir, { recursive: true });

  const serialized = `${JSON.stringify(suite, null, 2)}\n`;
  fs.writeFileSync(outputPath, serialized);
}
|
|
738
|
+
|
|
312
739
|
// ---------------------------------------------------------------------------
|
|
313
740
|
// CLI
|
|
314
741
|
// ---------------------------------------------------------------------------
|
|
@@ -328,17 +755,99 @@ if (isCliInvocation()) {
|
|
|
328
755
|
let suitePath = DEFAULT_SUITE;
|
|
329
756
|
let json = false;
|
|
330
757
|
let minScore = 80;
|
|
331
|
-
|
|
332
|
-
|
|
758
|
+
let baselinePath = null;
|
|
759
|
+
let outputPath = null;
|
|
760
|
+
let suiteOutputPath = null;
|
|
761
|
+
let requireNoRegressions = false;
|
|
762
|
+
let expandSynthetic = false;
|
|
763
|
+
let syntheticVariants = DEFAULT_SYNTHETIC_VARIANTS;
|
|
764
|
+
let fromFeedback = false;
|
|
765
|
+
let feedbackLog = null;
|
|
766
|
+
let feedbackDir = null;
|
|
767
|
+
let proofReportPath = null;
|
|
768
|
+
let maxCases = DEFAULT_MAX_FEEDBACK_CASES;
|
|
769
|
+
|
|
770
|
+
for (let index = 0; index < args.length; index += 1) {
|
|
771
|
+
const arg = args[index];
|
|
772
|
+
const nextArg = args[index + 1];
|
|
333
773
|
if (arg.startsWith('--suite=')) suitePath = path.resolve(arg.slice(8));
|
|
774
|
+
if (arg === '--suite' && nextArg) {
|
|
775
|
+
suitePath = path.resolve(nextArg);
|
|
776
|
+
index += 1;
|
|
777
|
+
continue;
|
|
778
|
+
}
|
|
334
779
|
if (arg === '--json') json = true;
|
|
335
780
|
if (arg.startsWith('--min-score=')) minScore = Number(arg.slice(12));
|
|
781
|
+
if (arg === '--min-score' && nextArg) {
|
|
782
|
+
minScore = Number(nextArg);
|
|
783
|
+
index += 1;
|
|
784
|
+
continue;
|
|
785
|
+
}
|
|
786
|
+
if (arg.startsWith('--baseline=')) baselinePath = path.resolve(arg.slice(11));
|
|
787
|
+
if (arg === '--baseline' && nextArg) {
|
|
788
|
+
baselinePath = path.resolve(nextArg);
|
|
789
|
+
index += 1;
|
|
790
|
+
continue;
|
|
791
|
+
}
|
|
792
|
+
if (arg.startsWith('--output=')) outputPath = path.resolve(arg.slice(9));
|
|
793
|
+
if (arg === '--output' && nextArg) {
|
|
794
|
+
outputPath = path.resolve(nextArg);
|
|
795
|
+
index += 1;
|
|
796
|
+
continue;
|
|
797
|
+
}
|
|
798
|
+
if (arg.startsWith('--suite-output=')) suiteOutputPath = path.resolve(arg.slice(15));
|
|
799
|
+
if (arg === '--suite-output' && nextArg) {
|
|
800
|
+
suiteOutputPath = path.resolve(nextArg);
|
|
801
|
+
index += 1;
|
|
802
|
+
continue;
|
|
803
|
+
}
|
|
804
|
+
if (arg === '--require-no-regressions') requireNoRegressions = true;
|
|
805
|
+
if (arg === '--synthetic') expandSynthetic = true;
|
|
806
|
+
if (arg.startsWith('--synthetic-variants=')) {
|
|
807
|
+
expandSynthetic = true;
|
|
808
|
+
syntheticVariants = Number(arg.slice(21));
|
|
809
|
+
}
|
|
810
|
+
if (arg === '--synthetic-variants' && nextArg) {
|
|
811
|
+
expandSynthetic = true;
|
|
812
|
+
syntheticVariants = Number(nextArg);
|
|
813
|
+
index += 1;
|
|
814
|
+
continue;
|
|
815
|
+
}
|
|
816
|
+
if (arg === '--from-feedback') fromFeedback = true;
|
|
817
|
+
if (arg.startsWith('--feedback-log=')) feedbackLog = path.resolve(arg.slice(15));
|
|
818
|
+
if (arg.startsWith('--feedback-dir=')) feedbackDir = path.resolve(arg.slice(15));
|
|
819
|
+
if (arg.startsWith('--write-suite=')) suiteOutputPath = path.resolve(arg.slice(14));
|
|
820
|
+
if (arg.startsWith('--write-report=')) proofReportPath = path.resolve(arg.slice(15));
|
|
821
|
+
if (arg.startsWith('--max-cases=')) maxCases = Number(arg.slice(12));
|
|
336
822
|
}
|
|
337
823
|
|
|
338
|
-
|
|
824
|
+
let suite;
|
|
825
|
+
let report;
|
|
826
|
+
if (fromFeedback) {
|
|
827
|
+
({ suite, report } = runFeedbackEvalSuite({ feedbackLog, feedbackDir, minScore, maxCases }));
|
|
828
|
+
} else {
|
|
829
|
+
const loadedSuite = loadSuite(suitePath);
|
|
830
|
+
suite = expandSynthetic
|
|
831
|
+
? expandWithSyntheticEvaluations(loadedSuite, { syntheticVariants })
|
|
832
|
+
: loadedSuite;
|
|
833
|
+
report = runSuite(suitePath, {
|
|
834
|
+
minScore,
|
|
835
|
+
baselinePath,
|
|
836
|
+
requireNoRegressions,
|
|
837
|
+
expandSynthetic,
|
|
838
|
+
syntheticVariants,
|
|
839
|
+
});
|
|
840
|
+
}
|
|
841
|
+
|
|
842
|
+
if (outputPath) writeReport(report, outputPath);
|
|
843
|
+
if (suiteOutputPath) writeSuite(suite, suiteOutputPath);
|
|
844
|
+
if (proofReportPath) {
|
|
845
|
+
fs.mkdirSync(path.dirname(proofReportPath), { recursive: true });
|
|
846
|
+
fs.writeFileSync(proofReportPath, `${formatProofReport(report, suite)}\n`, 'utf8');
|
|
847
|
+
}
|
|
339
848
|
|
|
340
849
|
if (json) {
|
|
341
|
-
console.log(JSON.stringify(report, null, 2));
|
|
850
|
+
console.log(JSON.stringify({ ...report, suiteDefinition: fromFeedback ? suite : undefined }, null, 2));
|
|
342
851
|
} else {
|
|
343
852
|
console.log(`\n${report.suite}`);
|
|
344
853
|
console.log('='.repeat(50));
|
|
@@ -354,10 +863,33 @@ if (isCliInvocation()) {
|
|
|
354
863
|
}
|
|
355
864
|
console.log('='.repeat(50));
|
|
356
865
|
console.log(`Score: ${report.score}% | Pass: ${report.passed} | Fail: ${report.failed} | Error: ${report.errors} | Skip: ${report.skipped}`);
|
|
866
|
+
if (report.syntheticCount > 0) {
|
|
867
|
+
console.log(`Synthetic cases: ${report.syntheticCount}`);
|
|
868
|
+
}
|
|
869
|
+
if (report.comparison) {
|
|
870
|
+
console.log(`Baseline delta: ${report.comparison.scoreDelta >= 0 ? '+' : ''}${report.comparison.scoreDelta}%`);
|
|
871
|
+
console.log(`Regressions: ${report.comparison.regressions.length} | Improvements: ${report.comparison.improvements.length}`);
|
|
872
|
+
}
|
|
357
873
|
console.log(report.pass ? '\u2705 PASS' : `\u274C FAIL (min: ${minScore}%)`);
|
|
358
874
|
}
|
|
359
875
|
|
|
360
876
|
process.exit(report.pass ? 0 : 1);
|
|
361
877
|
}
|
|
362
878
|
|
|
363
|
-
module.exports = {
|
|
879
|
+
// Programmatic API of this module. The CLI block earlier in the file calls
// several of these directly (loadSuite, runSuite, runFeedbackEvalSuite,
// expandWithSyntheticEvaluations, formatProofReport, writeReport, writeSuite).
module.exports = {
  buildEvalSuiteFromFeedback,
  feedbackEntryToEvalCase,
  formatProofReport,
  gradeOutput,
  loadSuite,
  loadReport,
  compareReports,
  readJsonl,
  runEvaluation,
  runFeedbackEvalSuite,
  runSuite,
  runSuiteObject,
  writeReport,
  writeSuite,
  expandWithSyntheticEvaluations,
};
|