thumbgate 1.5.1 → 1.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/public/index.html CHANGED
@@ -872,7 +872,7 @@ __GA_BOOTSTRAP__
872
872
  <!-- HOW IT WORKS -->
873
873
  <section class="how-it-works" id="how-it-works">
874
874
  <div class="container">
875
- <div class="section-label">New in v1.5.1</div>
875
+ <div class="section-label">New in v1.5.3</div>
876
876
  <h2 class="section-title">Three steps to stop repeated AI failures</h2>
877
877
  <div class="steps">
878
878
  <div class="step">
@@ -1024,12 +1024,11 @@ __GA_BOOTSTRAP__
1024
1024
  <p style="font-size:13px;color:#aaa;margin-bottom:16px;">3 captures, 1 rule, 1 agent. Enough to prove the enforcement loop works. When you need more, you will know.</p>
1025
1025
  <ul>
1026
1026
  <li><strong>3 feedback captures total</strong> (not per day)</li>
1027
- <li>1 prevention rule</li>
1028
- <li>1 agent</li>
1027
+ <li>1 auto-promoted prevention rule</li>
1029
1028
  <li>No recall or lesson search</li>
1030
- <li>No exports</li>
1031
- <li>All MCP integrations (Claude Code, Cursor, Codex, etc.)</li>
1032
- <li>PreToolUse hook blocking</li>
1029
+ <li>No exports (DPO, Databricks, HuggingFace)</li>
1030
+ <li>All MCP integrations (Claude Code, Cursor, Codex, Gemini, Amp, any MCP agent)</li>
1031
+ <li>PreToolUse hook blocking with built-in safety gates (force-push, destructive SQL, secrets)</li>
1033
1032
  <li><a href="/guide" style="color:var(--cyan);text-decoration:underline;">Setup guide for all agents →</a></li>
1034
1033
  </ul>
1035
1034
  <div class="hero-install" onclick="copyInstall(this)" title="Click to copy" style="margin-bottom:12px;width:100%;justify-content:center;">
@@ -1060,9 +1059,9 @@ __GA_BOOTSTRAP__
1060
1059
  </div>
1061
1060
  <ul>
1062
1061
  <li>Everything in Free, plus:</li>
1063
- <li><a href="/dashboard" style="color:var(--cyan);text-decoration:underline;">Visual gate debugger →</a> see every blocked action and the gate that fired so you can trust the system in minutes</li>
1062
+ <li><a href="/dashboard#insights" style="color:var(--cyan);text-decoration:underline;">Visual gate debugger →</a> see every blocked action and the gate that fired so you can trust the system in minutes</li>
1064
1063
  <li>Auto-connect — activate once with your license key, then your running agents appear automatically on your local dashboard</li>
1065
- <li><a href="/dashboard" style="color:var(--cyan);text-decoration:underline;">DPO training data export →</a> turn real thumbs-downs into ready-to-use preference pairs for fine-tuning (LoRA / JSONL)</li>
1064
+ <li><a href="/dashboard#export" style="color:var(--cyan);text-decoration:underline;">DPO training data export →</a> turn real thumbs-downs into ready-to-use preference pairs for fine-tuning (LoRA / JSONL)</li>
1066
1065
  <li><strong>HuggingFace dataset export</strong> — share PII-redacted agent traces as open training datasets (<code>npm run export:hf</code>)</li>
1067
1066
  <li><strong>Model Hardening Advisor</strong> — get recommendations on when and how to fine-tune your model to natively avoid recurring failures</li>
1068
1067
  <li>Personal local dashboard — every Pro user gets a localhost dashboard without extra cloud setup</li>
@@ -1082,7 +1081,7 @@ __GA_BOOTSTRAP__
1082
1081
  <div class="price-sub">3-seat minimum · One engineer's correction protects the whole team</div>
1083
1082
  <p style="font-size:13px;color:var(--green);margin-bottom:16px;font-weight:500;">When one engineer teaches the agent not to delete staging data, that lesson applies to every agent on the team. Stop paying the same mistake tax across different developers.</p>
1084
1083
  <div class="pro-upgrade-triggers" style="font-size:12px;color:#aaa;margin-bottom:12px;">
1085
- <strong style="color:#fff;">Previously $49/seat.</strong> Now $49/seat. Start with one repo, one workflow, one repeat failure.
1084
+ <strong style="color:#fff;">Previously $99/seat.</strong> Now $49/seat. Start with one repo, one workflow, one repeat failure.
1086
1085
  </div>
1087
1086
  <ul>
1088
1087
  <li>Workflow hardening sprint — map one painful workflow, one repeated failure, and one buyer proof review before wider rollout</li>
@@ -1229,7 +1228,7 @@ __GA_BOOTSTRAP__
1229
1228
  <a href="https://www.linkedin.com/in/igorganapolsky" target="_blank" rel="noopener">LinkedIn</a>
1230
1229
  <a href="/blog">Blog</a>
1231
1230
  </div>
1232
- <span class="footer-copy">© 2026 Max Smith KDP LLC · MIT License · v1.5.1</span>
1231
+ <span class="footer-copy">© 2026 Max Smith KDP LLC · MIT License · v1.5.3</span>
1233
1232
  </div>
1234
1233
  </footer>
1235
1234
 
@@ -936,6 +936,22 @@ loadLive().then(function() {
936
936
  el.style.borderColor = 'var(--cyan)';
937
937
  el.style.boxShadow = '0 0 12px rgba(34,211,238,0.3)';
938
938
  setTimeout(function() { el.style.borderColor = ''; el.style.boxShadow = ''; }, 4000);
939
+ } else {
940
+ // Deep-link hash was provided but no matching lesson/feedback was found.
941
+ // Surface this to the user instead of silently loading the page top — the
942
+ // linked lesson may have been pruned, rotated, or the ID may be stale.
943
+ var banner = document.createElement('div');
944
+ banner.setAttribute('role', 'alert');
945
+ banner.style.cssText = 'margin:12px 0;padding:12px 16px;border:1px solid rgba(251,191,36,0.4);background:rgba(251,191,36,0.08);border-radius:10px;color:var(--text);font-size:13px;';
946
+ banner.innerHTML =
947
+ '<strong style="color:#fbbf24;">Lesson not found:</strong> ' +
948
+ '<code style="background:var(--bg-card);padding:2px 6px;border-radius:4px;font-size:12px;">' +
949
+ hash.replace(/[<>&"]/g, function(c) { return {'<':'&lt;','>':'&gt;','&':'&amp;','"':'&quot;'}[c]; }) +
950
+ '</code>' +
951
+ ' — this ID is not in the active lesson index. It may have been rotated or the statusbar link is stale. ' +
952
+ 'Browse rules and timeline below to find what you were looking for.';
953
+ var container = document.querySelector('.container') || document.body;
954
+ container.insertBefore(banner, container.firstChild);
939
955
  }
940
956
  });
941
957
  </script>
@@ -0,0 +1,363 @@
1
+ #!/usr/bin/env node
2
+ 'use strict';
3
+
4
+ /**
5
+ * Prompt Evaluation Framework for ThumbGate
6
+ *
7
+ * Based on Anthropic's prompt evaluation methodology:
8
+ * 1. Define test cases with inputs and expected outputs
9
+ * 2. Run prompts against test cases
10
+ * 3. Grade outputs against expectations (deterministic + LLM-as-judge)
11
+ * 4. Report pass/fail with scores
12
+ *
13
+ * Usage:
14
+ * node scripts/prompt-eval.js [--suite=path] [--json] [--min-score=80]
15
+ */
16
+
17
+ const fs = require('node:fs');
18
+ const os = require('node:os');
19
+ const path = require('node:path');
20
+
21
+ const ROOT = path.join(__dirname, '..');
22
+ const DEFAULT_SUITE = path.join(ROOT, 'bench', 'prompt-eval-suite.json');
23
+
24
+ // ---------------------------------------------------------------------------
25
+ // Prompt simulators — run ThumbGate's actual logic against eval inputs
26
+ // ---------------------------------------------------------------------------
27
+
28
/**
 * Runs one eval input through `captureFeedback` (loaded from
 * `./feedback-loop`) while `THUMBGATE_FEEDBACK_DIR` points at a throwaway
 * temp directory.
 *
 * NOTE(review): presumably `captureFeedback` writes under
 * `THUMBGATE_FEEDBACK_DIR`, which is why the variable is redirected here —
 * confirm against `./feedback-loop`.
 *
 * @param {object} input - Eval case input. `signal === 'positive'` maps to
 *   `'up'`; every other value maps to `'down'`.
 * @returns {*} Whatever `captureFeedback` returns.
 */
function simulateLessonDistillation(input) {
  const { captureFeedback } = require('./feedback-loop');
  const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'tg-eval-'));
  const prevDir = process.env.THUMBGATE_FEEDBACK_DIR;
  process.env.THUMBGATE_FEEDBACK_DIR = tmpDir;

  try {
    return captureFeedback({
      signal: input.signal === 'positive' ? 'up' : 'down',
      context: input.context || '',
      whatWentWrong: input.whatWentWrong || undefined,
      whatToChange: input.whatToChange || undefined,
      whatWorked: input.whatWorked || undefined,
      tags: input.tags || [],
    });
  } finally {
    // Restore the caller's environment exactly: an unset variable must be
    // deleted (assigning undefined would store the string "undefined"), while
    // a variable that was set — even to an empty string — keeps its value.
    if (prevDir === undefined) {
      delete process.env.THUMBGATE_FEEDBACK_DIR;
    } else {
      process.env.THUMBGATE_FEEDBACK_DIR = prevDir;
    }
    try {
      fs.rmSync(tmpDir, { recursive: true, force: true });
    } catch {
      // Best-effort cleanup: a leftover temp directory must not fail the eval.
    }
  }
}
51
+
52
/**
 * Passes an eval input to `enrichFeedbackContext` from `./feedback-loop`.
 *
 * @param {object} input - Eval case input; `signal` and `context` are
 *   forwarded as-is, and a missing `tags` becomes an empty array.
 * @returns {*} Whatever `enrichFeedbackContext` returns.
 */
function simulateFeedbackEnrichment(input) {
  const { enrichFeedbackContext: enrich } = require('./feedback-loop');
  const { signal, context } = input;
  const tags = input.tags || [];
  return enrich({ signal, context, tags });
}
60
+
61
/**
 * Stand-in for prevention-rule generation: echoes the pattern fields of the
 * input and marks the result as generated, so the graders can check the rule
 * structure without running a real generator.
 *
 * @param {object} input - Eval case input with `pattern`, `occurrences`
 *   and `examples`.
 * @returns {{pattern: *, occurrences: *, examples: *, generated: boolean}}
 */
function simulatePreventionRule(input) {
  const { pattern, occurrences, examples } = input;
  return { pattern, occurrences, examples, generated: true };
}
71
+
72
/**
 * Stand-in for session self-distillation: joins the `context` of every
 * session feedback entry with "; " into a single summary string.
 *
 * @param {object} input - Eval case input; `sessionFeedback` must be an array.
 * @returns {{sessionFeedback: Array, summary: string, generated: boolean}}
 */
function simulateSelfDistill(input) {
  const { sessionFeedback } = input;
  const contexts = sessionFeedback.map((entry) => entry.context);
  const summary = contexts.join('; ');
  return { sessionFeedback, summary, generated: true };
}
79
+
80
// Dispatch table: suite `prompt` name -> simulator function.
// Built on a null prototype so that a suite entry such as
// `"prompt": "constructor"` or `"prompt": "toString"` does not resolve to an
// inherited Object.prototype member and get invoked as if it were a simulator.
// Frozen because the table is a fixed module-level constant.
const PROMPT_SIMULATORS = Object.freeze(Object.assign(Object.create(null), {
  'lesson-distillation': simulateLessonDistillation,
  'feedback-enrichment': simulateFeedbackEnrichment,
  'prevention-rule-generation': simulatePreventionRule,
  'self-distillation': simulateSelfDistill,
}));
86
+
87
+ // ---------------------------------------------------------------------------
88
+ // Deterministic graders — check output against expected fields
89
+ // ---------------------------------------------------------------------------
90
+
91
/**
 * Returns the first argument that is a string (an empty string counts),
 * or '' when none of the arguments is a string.
 *
 * @param {...*} values - Candidates, checked in order.
 * @returns {string}
 */
function firstString(...values) {
  const match = values.find((candidate) => typeof candidate === 'string');
  return match === undefined ? '' : match;
}
97
+
98
/**
 * Appends one case-insensitive "contains" check per expected term.
 *
 * Suite files are hand-written JSON, so `terms` is normalized defensively:
 * `null`/`undefined` or any other non-array yields no checks, a bare string
 * is treated as a single term (rather than iterated character by character),
 * and non-string terms such as numbers are compared by their string form.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {string} prefix - Criterion prefix, e.g. "titleContains".
 * @param {string} label - Human-readable field name used in `detail`.
 * @param {string} content - Text to search.
 * @param {Array<*>|string|null} [terms=[]] - Terms that must appear in `content`.
 * @returns {void}
 */
function addContainsChecks(checks, prefix, label, content, terms = []) {
  let termList = [];
  if (typeof terms === 'string') {
    termList = [terms];
  } else if (Array.isArray(terms)) {
    termList = terms;
  }
  if (termList.length === 0) return;

  // Lower-case the haystack once instead of once per term.
  const haystack = content.toLowerCase();
  for (const term of termList) {
    const found = haystack.includes(String(term).toLowerCase());
    checks.push({
      criterion: `${prefix}:${term}`,
      pass: found,
      detail: found ? `${label} contains "${term}"` : `${label} missing "${term}"`,
    });
  }
}
108
+
109
/**
 * Handles the `shouldReject` expectation. When the case expects a rejection,
 * records a single `shouldReject` check and reports that grading is complete.
 *
 * A result counts as rejected when `accepted === false`, `status` is
 * 'rejected', or `actionType` is 'no-action'.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec of the eval case.
 * @returns {boolean} true when the rejection expectation was evaluated.
 */
function handleRejectExpectation(checks, result, expected) {
  if (expected.shouldReject) {
    const rejectionSignals = [
      result.accepted === false,
      result.status === 'rejected',
      result.actionType === 'no-action',
    ];
    const wasRejected = rejectionSignals.some(Boolean);
    let detail = 'Should have rejected but accepted';
    if (wasRejected) {
      detail = 'Correctly rejected vague input';
    }
    checks.push({ criterion: 'shouldReject', pass: wasRejected, detail });
    return true;
  }
  return false;
}
122
+
123
/**
 * When the case expects a title, records a `hasTitle` check plus one
 * `titleContains:*` check per expected term. The title is read from
 * `result.memoryRecord.title`, falling back to `result.title`.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec of the eval case.
 * @returns {void}
 */
function addTitleChecks(checks, result, expected) {
  if (expected.hasTitle) {
    const title = firstString(result.memoryRecord?.title, result.title);
    const present = title.length > 0;
    // Only the first 60 characters are echoed to keep report lines short.
    const detail = present ? `Title: "${title.slice(0, 60)}"` : 'Missing title';
    checks.push({ criterion: 'hasTitle', pass: present, detail });
    addContainsChecks(checks, 'titleContains', 'Title', title, expected.titleContains);
  }
}
134
+
135
/**
 * When the case expects content, records a `hasContent` check plus one
 * `contentContains:*` check per expected term. The content is read from
 * `result.memoryRecord.content`, falling back to `result.content`.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec of the eval case.
 * @returns {void}
 */
function addContentChecks(checks, result, expected) {
  if (expected.hasContent) {
    const content = firstString(result.memoryRecord?.content, result.content);
    const present = content.length > 0;
    const detail = present ? `Content length: ${content.length}` : 'Missing content';
    checks.push({ criterion: 'hasContent', pass: present, detail });
    addContainsChecks(checks, 'contentContains', 'Content', content, expected.contentContains);
  }
}
146
+
147
/**
 * Records exact-match checks for `category` and `importance`, each only when
 * the case specifies an expected value. Actual values are read from
 * `result.memoryRecord`, falling back to the same field on `result`.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec of the eval case.
 * @returns {void}
 */
function addCategoryChecks(checks, result, expected) {
  // Both fields are graded identically, so drive them from one list.
  for (const field of ['category', 'importance']) {
    const wanted = expected[field];
    if (!wanted) continue;

    const actual = firstString(result.memoryRecord?.[field], result[field]);
    checks.push({
      criterion: field,
      pass: actual === wanted,
      detail: `Expected "${wanted}", got "${actual}"`,
    });
  }
}
166
+
167
/**
 * Records enrichment checks.
 *
 * - `hasDomain`: the domain (`result.richContext.domain`, else
 *   `result.domain`) must equal `expected.domain` when that is given, and
 *   must be non-empty otherwise.
 * - `hasOutcome`: the outcome (`result.richContext.outcomeCategory`, else
 *   `result.outcome`) must be non-empty, plus one `outcomeContains:*` check
 *   per expected term.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec of the eval case.
 * @returns {void}
 */
function addContextChecks(checks, result, expected) {
  const rich = result.richContext;

  if (expected.hasDomain) {
    const domain = firstString(rich?.domain, result.domain);
    let domainOk;
    if (expected.domain) {
      domainOk = domain === expected.domain;
    } else {
      domainOk = domain.length > 0;
    }
    checks.push({ criterion: 'domain', pass: domainOk, detail: `Domain: "${domain}"` });
  }

  if (expected.hasOutcome) {
    const outcome = firstString(rich?.outcomeCategory, result.outcome);
    checks.push({
      criterion: 'hasOutcome',
      pass: outcome.length > 0,
      detail: `Outcome: "${outcome}"`,
    });
    addContainsChecks(checks, 'outcomeContains', 'Outcome', outcome, expected.outcomeContains);
  }
}
187
+
188
/**
 * When the case expects a rule, records a `hasRule` check. A rule counts as
 * present when `result.generated === true` or `result.rule` is truthy.
 *
 * The `detail` text is derived from the same boolean as `pass`, so the two can
 * never disagree (for example a result carrying `rule` but no `generated`
 * flag passes and is reported as "Rule generated").
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec of the eval case.
 * @returns {void}
 */
function addRuleChecks(checks, result, expected) {
  if (!expected.hasRule) return;

  const hasRule = result.generated === true || Boolean(result.rule);
  checks.push({
    criterion: 'hasRule',
    pass: hasRule,
    detail: hasRule ? 'Rule generated' : 'No rule generated',
  });
}
197
+
198
/**
 * When the case expects a summary, records a `hasSummary` check (non-empty
 * `result.summary` string) plus one `summaryContains:*` check per expected
 * term.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec of the eval case.
 * @returns {void}
 */
function addSummaryChecks(checks, result, expected) {
  if (expected.hasSummary) {
    const summary = firstString(result.summary);
    const summaryLength = summary.length;
    checks.push({
      criterion: 'hasSummary',
      pass: summaryLength > 0,
      detail: `Summary length: ${summaryLength}`,
    });
    addContainsChecks(checks, 'summaryContains', 'Summary', summary, expected.summaryContains);
  }
}
209
+
210
/**
 * Grades a simulator output against an eval case's expected-output spec and
 * returns the list of individual checks.
 *
 * A `shouldReject` expectation short-circuits grading: only the rejection
 * check is produced. Otherwise title, content, category/importance,
 * context, rule and summary checks are appended in that order, each only
 * when the spec asks for it.
 *
 * @param {object|null|undefined} output - Simulator output; a falsy value is
 *   graded as an empty object.
 * @param {object|null|undefined} expected - Expected-output spec; a missing
 *   spec is treated as "no expectations" and yields an empty check list
 *   instead of throwing.
 * @returns {Array<{criterion: string, pass: boolean, detail: string}>}
 */
function gradeOutput(output, expected) {
  const checks = [];
  const result = output || {};
  // Suite entries are hand-written JSON; a case without `expectedOutput`
  // must not abort the whole run with a TypeError.
  const spec = expected || {};

  if (handleRejectExpectation(checks, result, spec)) return checks;

  addTitleChecks(checks, result, spec);
  addContentChecks(checks, result, spec);
  addCategoryChecks(checks, result, spec);
  addContextChecks(checks, result, spec);
  addRuleChecks(checks, result, spec);
  addSummaryChecks(checks, result, spec);

  return checks;
}
225
+
226
+ // ---------------------------------------------------------------------------
227
+ // Runner
228
+ // ---------------------------------------------------------------------------
229
+
230
/**
 * Reads and validates an eval suite JSON file.
 *
 * @param {string} suitePath - Path to the suite JSON file.
 * @returns {object} The parsed suite; `evaluations` is a non-empty array.
 * @throws {Error} When the file cannot be read or parsed (errors from
 *   `fs.readFileSync` / `JSON.parse` propagate), or when the parsed document
 *   does not define a non-empty `evaluations` array. A JSON root of `null` or
 *   a primitive is reported through the same validation error.
 */
function loadSuite(suitePath) {
  const raw = JSON.parse(fs.readFileSync(suitePath, 'utf8'));
  // `raw` may legally be null or a primitive (e.g. the file contains `null`),
  // so guard the property access before validating the shape.
  const evaluations = raw == null ? undefined : raw.evaluations;
  if (!Array.isArray(evaluations) || evaluations.length === 0) {
    throw new Error('Suite must define a non-empty evaluations array');
  }
  return raw;
}
237
+
238
/**
 * Runs a single eval case: looks up the simulator for `evalCase.prompt`,
 * executes it on `evalCase.input`, and grades the output against
 * `evalCase.expectedOutput`.
 *
 * @param {object} evalCase - Suite entry with `id`, `prompt`, `input` and
 *   `expectedOutput`.
 * @returns {object} One of:
 *   - `{ id, status: 'skip', reason, checks: [], score: 0 }` when no
 *     simulator is registered for the prompt;
 *   - `{ id, status: 'error', error, checks: [], score: 0 }` when the
 *     simulator throws;
 *   - `{ id, status: 'pass'|'fail', checks, score, passCount, totalChecks }`
 *     otherwise, where `score` is the rounded percentage of passing checks
 *     (0 when there are no checks) and `pass` requires a score of 100.
 */
function runEvaluation(evalCase) {
  const simulator = PROMPT_SIMULATORS[evalCase.prompt];
  if (!simulator) {
    return {
      id: evalCase.id,
      status: 'skip',
      reason: `No simulator for prompt: ${evalCase.prompt}`,
      checks: [],
      score: 0,
    };
  }

  let output;
  try {
    output = simulator(evalCase.input);
  } catch (err) {
    // Return from the catch itself: deciding "did it fail?" from the message
    // text would misclassify a thrown value with an empty message (e.g.
    // `throw ''`) as a success. Also tolerate `throw null` / `throw undefined`.
    return {
      id: evalCase.id,
      status: 'error',
      error: (err && err.message) || String(err),
      checks: [],
      score: 0,
    };
  }

  // A case without `expectedOutput` is graded against an empty spec rather
  // than crashing the run.
  const checks = gradeOutput(output, evalCase.expectedOutput || {});
  const passCount = checks.filter((c) => c.pass).length;
  const score = checks.length > 0 ? Math.round((passCount / checks.length) * 100) : 0;

  return {
    id: evalCase.id,
    status: score === 100 ? 'pass' : 'fail',
    checks,
    score,
    passCount,
    totalChecks: checks.length,
  };
}
281
+
282
/**
 * Loads a suite, runs every eval case, and aggregates the results.
 *
 * The suite score is the rounded mean of all per-case scores, so every case
 * in `results` — whatever its status — counts toward the average.
 *
 * @param {string} [suitePath=DEFAULT_SUITE] - Path to the suite JSON file.
 * @param {object} [options={}] - Run options.
 * @param {number} [options.minScore=80] - Minimum suite score required to
 *   pass. Any finite number is honored, including 0; a missing, empty or
 *   non-numeric value falls back to 80.
 * @returns {{suite: *, total: number, passed: number, failed: number,
 *   errors: number, skipped: number, score: number, minScore: number,
 *   pass: boolean, results: Array<object>}}
 * @throws {Error} Propagates errors from `loadSuite`.
 */
function runSuite(suitePath = DEFAULT_SUITE, options = {}) {
  const suite = loadSuite(suitePath);
  const results = suite.evaluations.map((evalCase) => runEvaluation(evalCase));

  const countByStatus = (status) => results.filter((r) => r.status === status).length;

  const scoreSum = results.reduce((sum, r) => sum + r.score, 0);
  const totalScore = results.length > 0 ? Math.round(scoreSum / results.length) : 0;

  // `||` would silently turn an explicit threshold of 0 into 80, so only fall
  // back to the default when no usable number was supplied.
  const requested = options == null ? undefined : options.minScore;
  const hasThreshold = requested != null
    && requested !== ''
    && Number.isFinite(Number(requested));
  const minScore = hasThreshold ? Number(requested) : 80;

  return {
    suite: suite.name,
    total: results.length,
    passed: countByStatus('pass'),
    failed: countByStatus('fail'),
    errors: countByStatus('error'),
    skipped: countByStatus('skip'),
    score: totalScore,
    minScore,
    pass: totalScore >= minScore,
    results,
  };
}
311
+
312
+ // ---------------------------------------------------------------------------
313
+ // CLI
314
+ // ---------------------------------------------------------------------------
315
+
316
/**
 * Maps a result status to the icon shown in the text report: a check mark
 * for 'pass', a skip symbol for 'skip', and a cross for everything else.
 *
 * @param {string} status - Result status.
 * @returns {string} Single icon character.
 */
function statusIcon(status) {
  switch (status) {
    case 'pass':
      return '\u2705';
    case 'skip':
      return '\u23ED';
    default:
      return '\u274C';
  }
}
321
+
322
/**
 * Reports whether this file is the script named on the command line, by
 * comparing the resolved `process.argv[1]` with `__filename`.
 *
 * @returns {boolean}
 */
function isCliInvocation() {
  const entry = process.argv[1];
  if (!entry) {
    return false;
  }
  return path.resolve(entry) === __filename;
}
325
+
326
// CLI entry point: parse flags, run the suite, print a JSON or text report,
// and exit with 0 when the suite meets the minimum score, 1 otherwise.
if (isCliInvocation()) {
  const SUITE_FLAG = '--suite=';
  const MIN_SCORE_FLAG = '--min-score=';

  let suitePath = DEFAULT_SUITE;
  let json = false;
  let minScore = 80;

  for (const arg of process.argv.slice(2)) {
    if (arg.startsWith(SUITE_FLAG)) {
      suitePath = path.resolve(arg.slice(SUITE_FLAG.length));
    } else if (arg === '--json') {
      json = true;
    } else if (arg.startsWith(MIN_SCORE_FLAG)) {
      const rawValue = arg.slice(MIN_SCORE_FLAG.length);
      const parsed = Number(rawValue);
      if (rawValue.trim() !== '' && Number.isFinite(parsed)) {
        minScore = parsed;
      } else {
        // Keep the default rather than letting NaN leak into the report
        // ("min: NaN%"); warn on stderr so --json output stays parseable.
        console.error(`Ignoring invalid --min-score value "${rawValue}"; using ${minScore}`);
      }
    }
  }

  const report = runSuite(suitePath, { minScore });

  if (json) {
    console.log(JSON.stringify(report, null, 2));
  } else {
    console.log(`\n${report.suite}`);
    console.log('='.repeat(50));
    for (const r of report.results) {
      const icon = statusIcon(r.status);
      console.log(`${icon} ${r.id} — ${r.score}% (${r.passCount || 0}/${r.totalChecks || 0})`);
      if (r.status === 'fail' || r.status === 'error') {
        for (const c of (r.checks || [])) {
          if (!c.pass) console.log(`   \u274C ${c.criterion}: ${c.detail}`);
        }
        if (r.error) console.log(`   Error: ${r.error}`);
      }
    }
    console.log('='.repeat(50));
    console.log(`Score: ${report.score}% | Pass: ${report.passed} | Fail: ${report.failed} | Error: ${report.errors} | Skip: ${report.skipped}`);
    // Print the threshold the run was actually graded against.
    console.log(report.pass ? '\u2705 PASS' : `\u274C FAIL (min: ${report.minScore}%)`);
  }

  process.exit(report.pass ? 0 : 1);
}

module.exports = { runSuite, runEvaluation, gradeOutput, loadSuite };