thumbgate 1.5.1 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +1 -1
- package/.well-known/mcp/server-card.json +1 -1
- package/CHANGELOG.md +504 -0
- package/README.md +26 -0
- package/adapters/README.md +1 -1
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/codex/config.toml +2 -2
- package/adapters/mcp/server-stdio.js +1 -1
- package/adapters/opencode/opencode.json +1 -1
- package/bench/prompt-eval-suite.json +106 -0
- package/package.json +29 -25
- package/public/dashboard.html +1399 -0
- package/public/index.html +3 -3
- package/scripts/prompt-eval.js +363 -0
package/public/index.html
CHANGED
|
@@ -872,7 +872,7 @@ __GA_BOOTSTRAP__
|
|
|
872
872
|
<!-- HOW IT WORKS -->
|
|
873
873
|
<section class="how-it-works" id="how-it-works">
|
|
874
874
|
<div class="container">
|
|
875
|
-
<div class="section-label">New in v1.5.
|
|
875
|
+
<div class="section-label">New in v1.5.2</div>
|
|
876
876
|
<h2 class="section-title">Three steps to stop repeated AI failures</h2>
|
|
877
877
|
<div class="steps">
|
|
878
878
|
<div class="step">
|
|
@@ -1082,7 +1082,7 @@ __GA_BOOTSTRAP__
|
|
|
1082
1082
|
<div class="price-sub">3-seat minimum · One engineer's correction protects the whole team</div>
|
|
1083
1083
|
<p style="font-size:13px;color:var(--green);margin-bottom:16px;font-weight:500;">When one engineer teaches the agent not to delete staging data, that lesson applies to every agent on the team. Stop paying the same mistake tax across different developers.</p>
|
|
1084
1084
|
<div class="pro-upgrade-triggers" style="font-size:12px;color:#aaa;margin-bottom:12px;">
|
|
1085
|
-
<strong style="color:#fff;">Previously $
|
|
1085
|
+
<strong style="color:#fff;">Previously $99/seat.</strong> Now $49/seat. Start with one repo, one workflow, one repeat failure.
|
|
1086
1086
|
</div>
|
|
1087
1087
|
<ul>
|
|
1088
1088
|
<li>Workflow hardening sprint — map one painful workflow, one repeated failure, and one buyer proof review before wider rollout</li>
|
|
@@ -1229,7 +1229,7 @@ __GA_BOOTSTRAP__
|
|
|
1229
1229
|
<a href="https://www.linkedin.com/in/igorganapolsky" target="_blank" rel="noopener">LinkedIn</a>
|
|
1230
1230
|
<a href="/blog">Blog</a>
|
|
1231
1231
|
</div>
|
|
1232
|
-
<span class="footer-copy">© 2026 Max Smith KDP LLC · MIT License · v1.5.
|
|
1232
|
+
<span class="footer-copy">© 2026 Max Smith KDP LLC · MIT License · v1.5.2</span>
|
|
1233
1233
|
</div>
|
|
1234
1234
|
</footer>
|
|
1235
1235
|
|
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Prompt Evaluation Framework for ThumbGate
|
|
6
|
+
*
|
|
7
|
+
* Based on Anthropic's prompt evaluation methodology:
|
|
8
|
+
* 1. Define test cases with inputs and expected outputs
|
|
9
|
+
* 2. Run prompts against test cases
|
|
10
|
+
* 3. Grade outputs against expectations (deterministic + LLM-as-judge)
|
|
11
|
+
* 4. Report pass/fail with scores
|
|
12
|
+
*
|
|
13
|
+
* Usage:
|
|
14
|
+
* node scripts/prompt-eval.js [--suite=path] [--json] [--min-score=80]
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
const fs = require('node:fs');
|
|
18
|
+
const os = require('node:os');
|
|
19
|
+
const path = require('node:path');
|
|
20
|
+
|
|
21
|
+
// Directory one level above this script's own directory.
const ROOT = path.resolve(__dirname, '..');

// Suite file used when no explicit suite path is supplied.
const DEFAULT_SUITE = path.join(ROOT, 'bench', 'prompt-eval-suite.json');
|
|
23
|
+
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
// Prompt simulators — run ThumbGate's actual logic against eval inputs
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
/**
 * Run one eval input through `captureFeedback` from ./feedback-loop while
 * THUMBGATE_FEEDBACK_DIR points at a freshly created temporary directory.
 *
 * The environment variable is restored to its exact previous state afterwards
 * (including "set to the empty string" vs. "not set at all"), and the
 * temporary directory is removed on a best-effort basis.
 *
 * @param {object} input - Eval case input. `signal === 'positive'` maps to
 *   'up', anything else to 'down'; `context`, `whatWentWrong`, `whatToChange`,
 *   `whatWorked` and `tags` are forwarded with empty/undefined fallbacks.
 * @returns {*} Whatever `captureFeedback` returns.
 */
function simulateLessonDistillation(input) {
  // Required inside the function, so feedback-loop is only loaded when this
  // simulator actually runs.
  const { captureFeedback } = require('./feedback-loop');
  const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'tg-eval-'));
  const prevDir = process.env.THUMBGATE_FEEDBACK_DIR;
  process.env.THUMBGATE_FEEDBACK_DIR = tmpDir;

  try {
    return captureFeedback({
      signal: input.signal === 'positive' ? 'up' : 'down',
      context: input.context || '',
      whatWentWrong: input.whatWentWrong || undefined,
      whatToChange: input.whatToChange || undefined,
      whatWorked: input.whatWorked || undefined,
      tags: input.tags || [],
    });
  } finally {
    // process.env values are strings, so `undefined` means "was not set".
    // An empty-string value is a legitimate prior state and must be kept.
    if (prevDir === undefined) {
      delete process.env.THUMBGATE_FEEDBACK_DIR;
    } else {
      process.env.THUMBGATE_FEEDBACK_DIR = prevDir;
    }

    try {
      fs.rmSync(tmpDir, { recursive: true, force: true });
    } catch {
      // Best-effort cleanup: a leftover temp directory must not mask the
      // simulator's own result or error.
    }
  }
}
|
|
51
|
+
|
|
52
|
+
/**
 * Forward an eval input to `enrichFeedbackContext` from ./feedback-loop.
 *
 * @param {object} input - Eval case input; only `signal`, `context` and
 *   `tags` are passed on (`tags` falls back to an empty array).
 * @returns {*} Whatever `enrichFeedbackContext` returns.
 */
function simulateFeedbackEnrichment(input) {
  const { enrichFeedbackContext } = require('./feedback-loop');
  const { signal, context, tags } = input;
  const payload = { signal, context, tags: tags || [] };
  return enrichFeedbackContext(payload);
}
|
|
60
|
+
|
|
61
|
+
/**
 * Stand-in for prevention-rule generation: echoes the pattern description
 * back and marks it as generated, so the eval can check rule structure.
 *
 * @param {object} input - Eval case input with `pattern`, `occurrences`
 *   and `examples`.
 * @returns {{pattern: *, occurrences: *, examples: *, generated: boolean}}
 */
function simulatePreventionRule(input) {
  const { pattern, occurrences, examples } = input;
  return { pattern, occurrences, examples, generated: true };
}
|
|
71
|
+
|
|
72
|
+
/**
 * Stand-in for self-distillation: joins the `context` of every session
 * feedback entry with '; ' into a summary and marks the result as generated.
 *
 * @param {object} input - Eval case input with a `sessionFeedback` array.
 * @returns {{sessionFeedback: Array, summary: string, generated: boolean}}
 */
function simulateSelfDistill(input) {
  const { sessionFeedback } = input;
  const contexts = sessionFeedback.map(({ context }) => context);
  const summary = contexts.join('; ');
  return { sessionFeedback, summary, generated: true };
}
|
|
79
|
+
|
|
80
|
+
// Lookup table: eval case `prompt` name -> simulator that produces the output
// to be graded for that case.
const PROMPT_SIMULATORS = {
  'lesson-distillation': simulateLessonDistillation,
  'feedback-enrichment': simulateFeedbackEnrichment,
  'prevention-rule-generation': simulatePreventionRule,
  'self-distillation': simulateSelfDistill,
};
|
|
86
|
+
|
|
87
|
+
// ---------------------------------------------------------------------------
|
|
88
|
+
// Deterministic graders — check output against expected fields
|
|
89
|
+
// ---------------------------------------------------------------------------
|
|
90
|
+
|
|
91
|
+
/**
 * Return the first argument whose type is string (an empty string counts),
 * or '' when none of the arguments is a string.
 *
 * @param {...*} values - Candidates, checked in order.
 * @returns {string}
 */
function firstString(...values) {
  for (let i = 0; i < values.length; i += 1) {
    const candidate = values[i];
    if (typeof candidate === 'string') {
      return candidate;
    }
  }
  return '';
}
|
|
97
|
+
|
|
98
|
+
/**
 * Append one case-insensitive "contains" check per expected term.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {string} prefix - Criterion prefix; criteria are `${prefix}:${term}`.
 * @param {string} label - Human-readable field name used in the detail text.
 * @param {string} content - Text to search in.
 * @param {Iterable<string>|string|null} [terms=[]] - Expected terms. A
 *   missing or null value adds no checks; a single string is treated as one
 *   term rather than being iterated character by character.
 * @returns {void}
 */
function addContainsChecks(checks, prefix, label, content, terms = []) {
  let termList = terms;
  if (terms == null) {
    // The default parameter only covers `undefined`; a suite loaded from
    // JSON can also carry an explicit null here.
    termList = [];
  } else if (typeof terms === 'string') {
    termList = [terms];
  }

  // Lower-case the haystack once instead of once per term.
  const haystack = String(content ?? '').toLowerCase();

  for (const term of termList) {
    const found = haystack.includes(String(term).toLowerCase());
    checks.push({
      criterion: `${prefix}:${term}`,
      pass: found,
      detail: found ? `${label} contains "${term}"` : `${label} missing "${term}"`,
    });
  }
}
|
|
108
|
+
|
|
109
|
+
/**
 * Grade a "should be rejected" expectation.
 *
 * When `expected.shouldReject` is truthy, appends a single 'shouldReject'
 * check that passes if the result has `accepted === false`,
 * `status === 'rejected'` or `actionType === 'no-action'`.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec of the eval case.
 * @returns {boolean} true when the rejection expectation applied (a check was
 *   added), false when `expected.shouldReject` is not set.
 */
function handleRejectExpectation(checks, result, expected) {
  if (!expected.shouldReject) {
    return false;
  }

  let wasRejected = false;
  if (result.accepted === false) {
    wasRejected = true;
  } else if (result.status === 'rejected') {
    wasRejected = true;
  } else if (result.actionType === 'no-action') {
    wasRejected = true;
  }

  let detail = 'Should have rejected but accepted';
  if (wasRejected) {
    detail = 'Correctly rejected vague input';
  }

  checks.push({ criterion: 'shouldReject', pass: wasRejected, detail });
  return true;
}
|
|
122
|
+
|
|
123
|
+
/**
 * When `expected.hasTitle` is set, check that the result carries a non-empty
 * title (`memoryRecord.title` first, then `title`) and that the title
 * contains every term in `expected.titleContains`.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec of the eval case.
 * @returns {void}
 */
function addTitleChecks(checks, result, expected) {
  if (expected.hasTitle) {
    const title = firstString(result.memoryRecord?.title, result.title);
    const present = title.length > 0;

    let detail = 'Missing title';
    if (present) {
      // Only the first 60 characters are echoed into the report.
      detail = `Title: "${title.slice(0, 60)}"`;
    }

    checks.push({ criterion: 'hasTitle', pass: present, detail });
    addContainsChecks(checks, 'titleContains', 'Title', title, expected.titleContains);
  }
}
|
|
134
|
+
|
|
135
|
+
/**
 * When `expected.hasContent` is set, check that the result carries non-empty
 * content (`memoryRecord.content` first, then `content`) and that the content
 * contains every term in `expected.contentContains`.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec of the eval case.
 * @returns {void}
 */
function addContentChecks(checks, result, expected) {
  if (expected.hasContent) {
    const content = firstString(result.memoryRecord?.content, result.content);
    const present = content.length > 0;

    let detail = 'Missing content';
    if (present) {
      detail = `Content length: ${content.length}`;
    }

    checks.push({ criterion: 'hasContent', pass: present, detail });
    addContainsChecks(checks, 'contentContains', 'Content', content, expected.contentContains);
  }
}
|
|
146
|
+
|
|
147
|
+
/**
 * Exact-match checks for `category` and `importance`, in that order.
 *
 * For each of the two fields that has a truthy expected value, the actual
 * value is read from `result.memoryRecord` first, then from `result` itself,
 * and compared with strict equality.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec of the eval case.
 * @returns {void}
 */
function addCategoryChecks(checks, result, expected) {
  for (const field of ['category', 'importance']) {
    const wanted = expected[field];
    if (!wanted) {
      continue;
    }

    const actual = firstString(result.memoryRecord?.[field], result[field]);
    checks.push({
      criterion: field,
      pass: actual === wanted,
      detail: `Expected "${wanted}", got "${actual}"`,
    });
  }
}
|
|
166
|
+
|
|
167
|
+
/**
 * Checks on the enriched context of a result.
 *
 * - `expected.hasDomain`: the domain (`richContext.domain`, then `domain`)
 *   must equal `expected.domain` when that is given, otherwise be non-empty.
 * - `expected.hasOutcome`: the outcome (`richContext.outcomeCategory`, then
 *   `outcome`) must be non-empty and contain every term in
 *   `expected.outcomeContains`.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec of the eval case.
 * @returns {void}
 */
function addContextChecks(checks, result, expected) {
  if (expected.hasDomain) {
    const domain = firstString(result.richContext?.domain, result.domain);

    let domainOk;
    if (expected.domain) {
      domainOk = domain === expected.domain;
    } else {
      domainOk = domain.length > 0;
    }

    checks.push({ criterion: 'domain', pass: domainOk, detail: `Domain: "${domain}"` });
  }

  if (expected.hasOutcome) {
    const outcome = firstString(result.richContext?.outcomeCategory, result.outcome);
    const outcomeOk = outcome.length > 0;

    checks.push({ criterion: 'hasOutcome', pass: outcomeOk, detail: `Outcome: "${outcome}"` });
    addContainsChecks(checks, 'outcomeContains', 'Outcome', outcome, expected.outcomeContains);
  }
}
|
|
187
|
+
|
|
188
|
+
/**
 * When `expected.hasRule` is set, check that the result represents a
 * generated rule: either `generated === true` or a truthy `rule` property.
 *
 * The detail text is derived from the same verdict as `pass`, so a passing
 * check never reports "No rule generated" and a failing one never reports
 * "Rule generated".
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec of the eval case.
 * @returns {void}
 */
function addRuleChecks(checks, result, expected) {
  if (!expected.hasRule) return;

  const hasRule = result.generated === true || Boolean(result.rule);
  checks.push({
    criterion: 'hasRule',
    pass: hasRule,
    detail: hasRule ? 'Rule generated' : 'No rule generated',
  });
}
|
|
197
|
+
|
|
198
|
+
/**
 * When `expected.hasSummary` is set, check that `result.summary` is a
 * non-empty string and contains every term in `expected.summaryContains`.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec of the eval case.
 * @returns {void}
 */
function addSummaryChecks(checks, result, expected) {
  if (expected.hasSummary) {
    const summary = firstString(result.summary);
    const { length } = summary;

    checks.push({
      criterion: 'hasSummary',
      pass: length > 0,
      detail: `Summary length: ${length}`,
    });
    addContainsChecks(checks, 'summaryContains', 'Summary', summary, expected.summaryContains);
  }
}
|
|
209
|
+
|
|
210
|
+
/**
 * Deterministically grade a simulator output against an expected-output spec.
 *
 * If the spec sets `shouldReject`, only the rejection check is produced.
 * Otherwise the title, content, category/importance, context, rule and
 * summary checks are run in that order; each one only contributes checks for
 * the expectations present in the spec.
 *
 * @param {object|null|undefined} output - Simulator output; a falsy value is
 *   graded as an empty object.
 * @param {object|null|undefined} expected - Expected-output spec; a missing
 *   spec is treated as empty and yields no checks instead of throwing.
 * @returns {Array<{criterion: string, pass: boolean, detail: string}>}
 */
function gradeOutput(output, expected) {
  const checks = [];
  const result = output || {};
  // Mirror the `output` defaulting so a case without an expectation block
  // does not crash the grader with a TypeError.
  const spec = expected || {};

  if (handleRejectExpectation(checks, result, spec)) return checks;

  addTitleChecks(checks, result, spec);
  addContentChecks(checks, result, spec);
  addCategoryChecks(checks, result, spec);
  addContextChecks(checks, result, spec);
  addRuleChecks(checks, result, spec);
  addSummaryChecks(checks, result, spec);

  return checks;
}
|
|
225
|
+
|
|
226
|
+
// ---------------------------------------------------------------------------
|
|
227
|
+
// Runner
|
|
228
|
+
// ---------------------------------------------------------------------------
|
|
229
|
+
|
|
230
|
+
/**
 * Read and validate an eval suite file.
 *
 * @param {string} suitePath - Path to a UTF-8 JSON suite file.
 * @returns {object} The parsed suite; `evaluations` is a non-empty array.
 * @throws {Error} 'Suite must define a non-empty evaluations array' when the
 *   parsed JSON has no non-empty `evaluations` array (including when the
 *   document is `null` or a primitive).
 * @throws {SyntaxError} When the file is not valid JSON (from JSON.parse).
 * @throws {Error} File-system errors from fs.readFileSync.
 */
function loadSuite(suitePath) {
  const raw = JSON.parse(fs.readFileSync(suitePath, 'utf8'));
  // `raw?.` guards a JSON document that is literally `null`, which would
  // otherwise surface as an unrelated TypeError.
  const evaluations = raw?.evaluations;
  if (!Array.isArray(evaluations) || evaluations.length === 0) {
    throw new Error('Suite must define a non-empty evaluations array');
  }
  return raw;
}
|
|
237
|
+
|
|
238
|
+
/**
 * Run a single eval case: pick its simulator, execute it, grade the output.
 *
 * @param {object} evalCase - Case with `id`, `prompt`, `input` and
 *   `expectedOutput`.
 * @returns {object} One of:
 *   - `{ id, status: 'skip', reason, checks: [], score: 0 }` when no simulator
 *     is registered for `evalCase.prompt`;
 *   - `{ id, status: 'error', error, checks: [], score: 0 }` when the
 *     simulator throws;
 *   - `{ id, status: 'pass'|'fail', checks, score, passCount, totalChecks }`
 *     otherwise, where `score` is the rounded percentage of passing checks
 *     (0 when there are no checks) and 'pass' requires a score of 100.
 */
function runEvaluation(evalCase) {
  // Own-property lookup only: a prompt named e.g. "constructor" or "toString"
  // must not resolve to something inherited from Object.prototype.
  const simulator = Object.prototype.hasOwnProperty.call(PROMPT_SIMULATORS, evalCase.prompt)
    ? PROMPT_SIMULATORS[evalCase.prompt]
    : undefined;

  if (typeof simulator !== 'function') {
    return {
      id: evalCase.id,
      status: 'skip',
      reason: `No simulator for prompt: ${evalCase.prompt}`,
      checks: [],
      score: 0,
    };
  }

  let output;
  try {
    output = simulator(evalCase.input);
  } catch (err) {
    // Any throw is an error result, even if the thrown value is null or has
    // an empty message; never let such a throw be graded as if it succeeded.
    return {
      id: evalCase.id,
      status: 'error',
      error: err?.message || String(err) || 'Unknown error',
      checks: [],
      score: 0,
    };
  }

  // A case without `expectedOutput` is graded against an empty spec (no
  // checks, score 0) rather than crashing the whole run.
  const checks = gradeOutput(output, evalCase.expectedOutput ?? {});
  const passCount = checks.filter((c) => c.pass).length;
  const score = checks.length > 0 ? Math.round((passCount / checks.length) * 100) : 0;

  return {
    id: evalCase.id,
    status: score === 100 ? 'pass' : 'fail',
    checks,
    score,
    passCount,
    totalChecks: checks.length,
  };
}
|
|
281
|
+
|
|
282
|
+
/**
 * Load a suite, run every evaluation in it and aggregate the results.
 *
 * The overall `score` is the rounded mean of the per-case scores; skipped and
 * errored cases contribute a score of 0 to that mean.
 *
 * @param {string} [suitePath=DEFAULT_SUITE] - Path to the suite JSON file.
 * @param {object} [options={}]
 * @param {number|string} [options.minScore=80] - Minimum overall score needed
 *   for `pass`. Any finite number is honoured, including 0; numeric strings
 *   are converted; anything else falls back to 80.
 * @returns {{suite: *, total: number, passed: number, failed: number,
 *   errors: number, skipped: number, score: number, minScore: number,
 *   pass: boolean, results: Array<object>}}
 * @throws Whatever `loadSuite` throws for an unreadable or invalid suite.
 */
function runSuite(suitePath = DEFAULT_SUITE, options = {}) {
  const DEFAULT_MIN_SCORE = 80;

  const suite = loadSuite(suitePath);
  const results = suite.evaluations.map((evalCase) => runEvaluation(evalCase));

  const countByStatus = (status) => results.filter((r) => r.status === status).length;
  const totalScore = results.length > 0
    ? Math.round(results.reduce((sum, r) => sum + r.score, 0) / results.length)
    : 0;

  // Resolve the threshold explicitly: `||` would turn a deliberate 0 into the
  // default, and a NaN threshold would make every comparison false.
  const rawMinScore = options?.minScore;
  const requested = typeof rawMinScore === 'string' && rawMinScore.trim() !== ''
    ? Number(rawMinScore)
    : rawMinScore;
  const minScore = typeof requested === 'number' && Number.isFinite(requested)
    ? requested
    : DEFAULT_MIN_SCORE;

  return {
    suite: suite.name,
    total: results.length,
    passed: countByStatus('pass'),
    failed: countByStatus('fail'),
    errors: countByStatus('error'),
    skipped: countByStatus('skip'),
    score: totalScore,
    minScore,
    pass: totalScore >= minScore,
    results,
  };
}
|
|
311
|
+
|
|
312
|
+
// ---------------------------------------------------------------------------
|
|
313
|
+
// CLI
|
|
314
|
+
// ---------------------------------------------------------------------------
|
|
315
|
+
|
|
316
|
+
/**
 * Map a result status to the icon printed in the text report.
 *
 * @param {string} status - 'pass', 'skip', or anything else.
 * @returns {string} Check mark for 'pass', skip symbol for 'skip', cross mark
 *   for every other status.
 */
function statusIcon(status) {
  switch (status) {
    case 'pass':
      return '\u2705';
    case 'skip':
      return '\u23ED';
    default:
      return '\u274C';
  }
}
|
|
321
|
+
|
|
322
|
+
/**
 * Tell whether this file is the program's entry point (run with `node`)
 * rather than being loaded through `require()`.
 *
 * Uses Node's documented `require.main === module` test. Comparing
 * `process.argv[1]` with `__filename` is fragile: argv[1] is not
 * symlink-resolved and may lack the `.js` extension, in which case the CLI
 * would silently not run and the process would exit 0.
 *
 * @returns {boolean}
 */
function isCliInvocation() {
  return require.main === module;
}
|
|
325
|
+
|
|
326
|
+
// CLI entry point: parse flags, run the suite, print a JSON or text report and
// exit with 0 when the suite passes, 1 otherwise.
if (isCliInvocation()) {
  let suitePath = DEFAULT_SUITE;
  let json = false;
  let minScore = 80;

  for (const arg of process.argv.slice(2)) {
    if (arg.startsWith('--suite=')) {
      suitePath = path.resolve(arg.slice('--suite='.length));
    } else if (arg === '--json') {
      json = true;
    } else if (arg.startsWith('--min-score=')) {
      const value = arg.slice('--min-score='.length);
      const parsed = Number(value);
      if (value.trim() !== '' && Number.isFinite(parsed)) {
        minScore = parsed;
      } else {
        // Warn on stderr so --json output on stdout stays parseable, and keep
        // the current threshold instead of carrying NaN forward.
        console.error(`Ignoring invalid --min-score value "${value}"; using ${minScore}`);
      }
    }
  }

  const report = runSuite(suitePath, { minScore });

  if (json) {
    console.log(JSON.stringify(report, null, 2));
  } else {
    console.log(`\n${report.suite}`);
    console.log('='.repeat(50));
    for (const r of report.results) {
      const icon = statusIcon(r.status);
      console.log(`${icon} ${r.id} — ${r.score}% (${r.passCount || 0}/${r.totalChecks || 0})`);
      if (r.status === 'fail' || r.status === 'error') {
        for (const c of (r.checks || [])) {
          if (!c.pass) console.log(` \u274C ${c.criterion}: ${c.detail}`);
        }
        if (r.error) console.log(` Error: ${r.error}`);
      }
    }
    console.log('='.repeat(50));
    console.log(`Score: ${report.score}% | Pass: ${report.passed} | Fail: ${report.failed} | Error: ${report.errors} | Skip: ${report.skipped}`);
    // Report the threshold the run actually applied, not the raw flag value.
    console.log(report.pass ? '\u2705 PASS' : `\u274C FAIL (min: ${report.minScore}%)`);
  }

  process.exit(report.pass ? 0 : 1);
}
|
|
362
|
+
|
|
363
|
+
module.exports = { runSuite, runEvaluation, gradeOutput, loadSuite };
|