thumbgate 1.5.0 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,363 @@
1
+ #!/usr/bin/env node
2
+ 'use strict';
3
+
4
+ /**
5
+ * Prompt Evaluation Framework for ThumbGate
6
+ *
7
+ * Based on Anthropic's prompt evaluation methodology:
8
+ * 1. Define test cases with inputs and expected outputs
9
+ * 2. Run prompts against test cases
10
+ * 3. Grade outputs against expectations (deterministic + LLM-as-judge)
11
+ * 4. Report pass/fail with scores
12
+ *
13
+ * Usage:
14
+ * node scripts/prompt-eval.js [--suite=path] [--json] [--min-score=80]
15
+ */
16
+
17
+ const fs = require('node:fs');
18
+ const os = require('node:os');
19
+ const path = require('node:path');
20
+
21
// Directory one level above this script's own directory.
const ROOT = path.resolve(__dirname, '..');
// Suite file used when the caller does not supply a suite path.
const DEFAULT_SUITE = path.join(ROOT, 'bench', 'prompt-eval-suite.json');
23
+
24
+ // ---------------------------------------------------------------------------
25
+ // Prompt simulators — run ThumbGate's actual logic against eval inputs
26
+ // ---------------------------------------------------------------------------
27
+
28
/**
 * Run ThumbGate's captureFeedback against an eval input.
 *
 * While the call runs, THUMBGATE_FEEDBACK_DIR points at a fresh temporary
 * directory (presumably read by captureFeedback to decide where it writes —
 * confirm in ./feedback-loop). The variable is restored and the directory
 * removed afterwards, whether or not captureFeedback throws.
 *
 * @param {object} input - Eval case input: signal, context, whatWentWrong,
 *   whatToChange, whatWorked, tags.
 * @returns {*} Whatever captureFeedback returns.
 */
function simulateLessonDistillation(input) {
  const { captureFeedback } = require('./feedback-loop');
  const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'tg-eval-'));
  const prevDir = process.env.THUMBGATE_FEEDBACK_DIR;
  process.env.THUMBGATE_FEEDBACK_DIR = tmpDir;

  try {
    return captureFeedback({
      // Anything other than 'positive' is treated as a thumbs-down.
      signal: input.signal === 'positive' ? 'up' : 'down',
      context: input.context || '',
      whatWentWrong: input.whatWentWrong || undefined,
      whatToChange: input.whatToChange || undefined,
      whatWorked: input.whatWorked || undefined,
      tags: input.tags || [],
    });
  } finally {
    // Restore the exact prior state: "unset" stays unset, and an explicitly
    // empty value stays empty rather than being deleted.
    if (prevDir === undefined) {
      delete process.env.THUMBGATE_FEEDBACK_DIR;
    } else {
      process.env.THUMBGATE_FEEDBACK_DIR = prevDir;
    }
    // Best-effort cleanup: a leftover temp directory must not fail the eval.
    try {
      fs.rmSync(tmpDir, { recursive: true, force: true });
    } catch {
      /* ignore cleanup failures */
    }
  }
}
51
+
52
/**
 * Pass an eval input through ThumbGate's enrichFeedbackContext.
 *
 * @param {object} input - Eval case input; signal, context and tags are
 *   forwarded (tags falls back to an empty array when falsy).
 * @returns {*} Whatever enrichFeedbackContext returns.
 */
function simulateFeedbackEnrichment(input) {
  const { enrichFeedbackContext } = require('./feedback-loop');
  const { signal, context } = input;
  const tags = input.tags || [];
  const request = { signal, context, tags };
  return enrichFeedbackContext(request);
}
60
+
61
/**
 * Stand-in for prevention-rule generation: echoes the pattern data back and
 * marks it as generated, so the grader can check the expected rule structure.
 *
 * @param {object} input - Eval case input with pattern, occurrences, examples.
 * @returns {{pattern: *, occurrences: *, examples: *, generated: boolean}}
 */
function simulatePreventionRule(input) {
  const { pattern, occurrences, examples } = input;
  return { pattern, occurrences, examples, generated: true };
}
71
+
72
/**
 * Stand-in for self-distillation: joins the context of every session feedback
 * entry into a single "; "-separated summary.
 *
 * @param {object} input - Eval case input; sessionFeedback must be an array
 *   of objects with a context field.
 * @returns {{sessionFeedback: Array, summary: string, generated: boolean}}
 */
function simulateSelfDistill(input) {
  const { sessionFeedback } = input;
  const contexts = sessionFeedback.map(({ context }) => context);
  return {
    sessionFeedback,
    summary: contexts.join('; '),
    generated: true,
  };
}
79
+
80
// Maps a suite's `prompt` name to the simulator that produces its output.
// The table has no prototype, so names such as "constructor" or "toString"
// cannot resolve to inherited Object.prototype functions on lookup.
const PROMPT_SIMULATORS = Object.assign(Object.create(null), {
  'lesson-distillation': simulateLessonDistillation,
  'feedback-enrichment': simulateFeedbackEnrichment,
  'prevention-rule-generation': simulatePreventionRule,
  'self-distillation': simulateSelfDistill,
});
86
+
87
+ // ---------------------------------------------------------------------------
88
+ // Deterministic graders — check output against expected fields
89
+ // ---------------------------------------------------------------------------
90
+
91
/**
 * Return the first argument that is a string (an empty string counts).
 *
 * @param {...*} values - Candidates, checked in order.
 * @returns {string} The first string argument, or '' when there is none.
 */
function firstString(...values) {
  const match = values.find((candidate) => typeof candidate === 'string');
  return match === undefined ? '' : match;
}
97
+
98
/**
 * Append one case-insensitive "contains" check per expected term.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {string} prefix - Criterion prefix; each criterion is `${prefix}:${term}`.
 * @param {string} label - Human-readable field name used in the detail text.
 * @param {string} content - Text searched for each term.
 * @param {Array<string>|string|null} [terms=[]] - Expected terms. null or
 *   undefined means none; a bare string is treated as one term.
 * @returns {void}
 */
function addContainsChecks(checks, prefix, label, content, terms = []) {
  // Suites are JSON, so "no terms" may arrive as null (the default parameter
  // only covers undefined) and a lone term may arrive as a plain string, which
  // would otherwise be iterated character by character.
  let termList = terms;
  if (terms == null) termList = [];
  else if (typeof terms === 'string') termList = [terms];
  if (termList.length === 0) return;

  // Lower-case the searched text once instead of once per term.
  const haystack = content.toLowerCase();
  for (const term of termList) {
    const found = haystack.includes(String(term).toLowerCase());
    checks.push({
      criterion: `${prefix}:${term}`,
      pass: found,
      detail: found ? `${label} contains "${term}"` : `${label} missing "${term}"`,
    });
  }
}
108
+
109
/**
 * Grade a case that expects its input to be rejected.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec from the suite.
 * @returns {boolean} true when a shouldReject check was recorded (no other
 *   checks should then run), false when the case does not expect rejection.
 */
function handleRejectExpectation(checks, result, expected) {
  if (!expected.shouldReject) return false;

  // Any one of these three markers counts as a rejection.
  let wasRejected = false;
  if (result.accepted === false) {
    wasRejected = true;
  } else if (result.status === 'rejected') {
    wasRejected = true;
  } else if (result.actionType === 'no-action') {
    wasRejected = true;
  }

  let detail = 'Should have rejected but accepted';
  if (wasRejected) detail = 'Correctly rejected vague input';

  checks.push({ criterion: 'shouldReject', pass: wasRejected, detail });
  return true;
}
122
+
123
/**
 * When the spec asks for a title, record a hasTitle check plus one
 * titleContains check per expected term.
 *
 * The title is taken from result.memoryRecord.title, falling back to
 * result.title.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec (hasTitle, titleContains).
 * @returns {void}
 */
function addTitleChecks(checks, result, expected) {
  if (expected.hasTitle) {
    const title = firstString(result.memoryRecord?.title, result.title);
    const present = title.length > 0;
    checks.push({
      criterion: 'hasTitle',
      pass: present,
      // Only the first 60 characters are echoed, to keep reports short.
      detail: present ? `Title: "${title.slice(0, 60)}"` : 'Missing title',
    });
    addContainsChecks(checks, 'titleContains', 'Title', title, expected.titleContains);
  }
}
134
+
135
/**
 * When the spec asks for content, record a hasContent check plus one
 * contentContains check per expected term.
 *
 * The content is taken from result.memoryRecord.content, falling back to
 * result.content.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec (hasContent, contentContains).
 * @returns {void}
 */
function addContentChecks(checks, result, expected) {
  if (expected.hasContent) {
    const content = firstString(result.memoryRecord?.content, result.content);
    const present = content.length > 0;
    checks.push({
      criterion: 'hasContent',
      pass: present,
      detail: present ? `Content length: ${content.length}` : 'Missing content',
    });
    addContainsChecks(checks, 'contentContains', 'Content', content, expected.contentContains);
  }
}
146
+
147
/**
 * Record exact-match checks for category and importance, each only when the
 * spec names an expected value for it.
 *
 * Each actual value is taken from result.memoryRecord, falling back to the
 * same field on result itself.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec (category, importance).
 * @returns {void}
 */
function addCategoryChecks(checks, result, expected) {
  for (const field of ['category', 'importance']) {
    const wanted = expected[field];
    if (!wanted) continue;

    const actual = firstString(result.memoryRecord?.[field], result[field]);
    checks.push({
      criterion: field,
      pass: actual === wanted,
      detail: `Expected "${wanted}", got "${actual}"`,
    });
  }
}
166
+
167
/**
 * Record domain and outcome checks for enriched-context output.
 *
 * The domain is read from result.richContext.domain (fallback result.domain);
 * it must equal expected.domain when one is given, otherwise merely be
 * non-empty. The outcome is read from result.richContext.outcomeCategory
 * (fallback result.outcome); it must be non-empty and contain every
 * expected.outcomeContains term.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec (hasDomain, domain,
 *   hasOutcome, outcomeContains).
 * @returns {void}
 */
function addContextChecks(checks, result, expected) {
  const context = result.richContext;

  if (expected.hasDomain) {
    const domain = firstString(context?.domain, result.domain);
    let domainOk;
    if (expected.domain) {
      domainOk = domain === expected.domain;
    } else {
      domainOk = domain.length > 0;
    }
    checks.push({ criterion: 'domain', pass: domainOk, detail: `Domain: "${domain}"` });
  }

  if (expected.hasOutcome) {
    const outcome = firstString(context?.outcomeCategory, result.outcome);
    checks.push({
      criterion: 'hasOutcome',
      pass: outcome.length > 0,
      detail: `Outcome: "${outcome}"`,
    });
    addContainsChecks(checks, 'outcomeContains', 'Outcome', outcome, expected.outcomeContains);
  }
}
187
+
188
/**
 * When the spec expects a rule, record whether the output carries one.
 *
 * A rule counts as present when result.generated is exactly true or
 * result.rule is truthy.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output.
 * @param {object} expected - Expected-output spec (hasRule).
 * @returns {void}
 */
function addRuleChecks(checks, result, expected) {
  if (!expected.hasRule) return;

  const hasRule = result.generated === true || Boolean(result.rule);
  checks.push({
    criterion: 'hasRule',
    pass: hasRule,
    // Derive the text from the verdict itself so the two can never disagree
    // (e.g. a rule present without `generated`, or a truthy non-true flag).
    detail: hasRule ? 'Rule generated' : 'No rule generated',
  });
}
197
+
198
/**
 * When the spec asks for a summary, record a hasSummary check plus one
 * summaryContains check per expected term.
 *
 * @param {Array<object>} checks - Check list to append to (mutated).
 * @param {object} result - Simulator output; result.summary is inspected.
 * @param {object} expected - Expected-output spec (hasSummary, summaryContains).
 * @returns {void}
 */
function addSummaryChecks(checks, result, expected) {
  if (expected.hasSummary) {
    const summary = firstString(result.summary);
    const { length } = summary;
    checks.push({
      criterion: 'hasSummary',
      pass: length > 0,
      detail: `Summary length: ${length}`,
    });
    addContainsChecks(checks, 'summaryContains', 'Summary', summary, expected.summaryContains);
  }
}
209
+
210
/**
 * Grade a simulator output against an expected-output spec.
 *
 * A spec with shouldReject yields only the rejection check. Otherwise title,
 * content, category/importance, context, rule and summary checks are
 * collected in that order, each only when the spec asks for it.
 *
 * @param {object|null|undefined} output - Simulator output; a falsy value is
 *   graded as an empty object.
 * @param {object|null|undefined} expected - Expected-output spec; a missing
 *   spec produces no checks instead of throwing.
 * @returns {Array<{criterion: string, pass: boolean, detail: string}>}
 */
function gradeOutput(output, expected) {
  const checks = [];
  const result = output || {};
  // A case without an expectedOutput has nothing to grade; treat it as an
  // empty spec rather than failing on the first property access.
  const spec = expected || {};

  if (handleRejectExpectation(checks, result, spec)) return checks;

  addTitleChecks(checks, result, spec);
  addContentChecks(checks, result, spec);
  addCategoryChecks(checks, result, spec);
  addContextChecks(checks, result, spec);
  addRuleChecks(checks, result, spec);
  addSummaryChecks(checks, result, spec);

  return checks;
}
225
+
226
+ // ---------------------------------------------------------------------------
227
+ // Runner
228
+ // ---------------------------------------------------------------------------
229
+
230
/**
 * Read and validate an evaluation suite file.
 *
 * @param {string} suitePath - Path to a JSON suite file.
 * @returns {object} The parsed suite; `evaluations` is a non-empty array.
 * @throws {Error} When the file cannot be read, is not valid JSON
 *   (SyntaxError), or does not define a non-empty `evaluations` array.
 */
function loadSuite(suitePath) {
  const raw = JSON.parse(fs.readFileSync(suitePath, 'utf8'));
  // JSON.parse can legally return null or a primitive; report those as a bad
  // suite instead of failing on the property access below.
  const evaluations = raw == null ? undefined : raw.evaluations;
  if (!Array.isArray(evaluations) || evaluations.length === 0) {
    throw new Error('Suite must define a non-empty evaluations array');
  }
  return raw;
}
237
+
238
/**
 * Run one evaluation case: simulate its prompt, grade the output, score it.
 *
 * @param {object} evalCase - Suite entry with id, prompt, input, expectedOutput.
 * @returns {object} One of:
 *   - { id, status: 'skip', reason, checks: [], score: 0 } when no simulator
 *     is registered for evalCase.prompt;
 *   - { id, status: 'error', error, checks: [], score: 0 } when simulating or
 *     grading throws;
 *   - { id, status: 'pass'|'fail', checks, score, passCount, totalChecks },
 *     where score is the rounded percentage of passing checks (0 when there
 *     are no checks) and 'pass' requires a score of 100.
 */
function runEvaluation(evalCase) {
  const { id, prompt } = evalCase;

  // Own-property lookup only: a prompt named "constructor" or "toString" must
  // not pick up a function inherited from Object.prototype.
  const simulator = Object.prototype.hasOwnProperty.call(PROMPT_SIMULATORS, prompt)
    ? PROMPT_SIMULATORS[prompt]
    : undefined;
  if (typeof simulator !== 'function') {
    return {
      id,
      status: 'skip',
      reason: `No simulator for prompt: ${prompt}`,
      checks: [],
      score: 0,
    };
  }

  let checks;
  try {
    const output = simulator(evalCase.input);
    // Grading is inside the try so one malformed case is reported as an
    // error instead of aborting the whole suite run.
    checks = gradeOutput(output, evalCase.expectedOutput);
  } catch (err) {
    // Any throw is an error, even a falsy value or an empty message.
    return {
      id,
      status: 'error',
      error: err?.message || String(err),
      checks: [],
      score: 0,
    };
  }

  const passCount = checks.filter((check) => check.pass).length;
  const score = checks.length > 0 ? Math.round((passCount / checks.length) * 100) : 0;

  return {
    id,
    status: score === 100 ? 'pass' : 'fail',
    checks,
    score,
    passCount,
    totalChecks: checks.length,
  };
}
281
+
282
/**
 * Run every evaluation in a suite and aggregate the results.
 *
 * The suite score is the rounded mean of all case scores; skipped and errored
 * cases contribute 0.
 *
 * @param {string} [suitePath=DEFAULT_SUITE] - Path to the JSON suite file.
 * @param {object} [options] - Run options.
 * @param {number} [options.minScore=80] - Minimum suite score that counts as
 *   a pass. 0 is honoured; non-numeric or non-finite values fall back to 80.
 * @returns {{suite: *, total: number, passed: number, failed: number,
 *   errors: number, skipped: number, score: number, minScore: number,
 *   pass: boolean, results: Array<object>}}
 * @throws {Error} When the suite cannot be loaded (see loadSuite).
 */
function runSuite(suitePath = DEFAULT_SUITE, options = {}) {
  const suite = loadSuite(suitePath);
  const results = suite.evaluations.map((evalCase) => runEvaluation(evalCase));

  const countByStatus = (status) => results.filter((r) => r.status === status).length;

  const totalScore = results.length > 0
    ? Math.round(results.reduce((sum, r) => sum + r.score, 0) / results.length)
    : 0;

  // `|| 80` would turn an explicit threshold of 0 into 80, so validate instead.
  // Numeric strings are still accepted; `options` may be null.
  const requested = options?.minScore;
  const parsed = typeof requested === 'string' && requested.trim() !== ''
    ? Number(requested)
    : requested;
  const minScore = typeof parsed === 'number' && Number.isFinite(parsed) ? parsed : 80;

  return {
    suite: suite.name,
    total: results.length,
    passed: countByStatus('pass'),
    failed: countByStatus('fail'),
    errors: countByStatus('error'),
    skipped: countByStatus('skip'),
    score: totalScore,
    minScore,
    pass: totalScore >= minScore,
    results,
  };
}
311
+
312
+ // ---------------------------------------------------------------------------
313
+ // CLI
314
+ // ---------------------------------------------------------------------------
315
+
316
/**
 * Pick the emoji shown next to a result in the text report.
 *
 * @param {string} status - Result status.
 * @returns {string} Check mark for 'pass', skip symbol for 'skip', cross mark
 *   for anything else.
 */
function statusIcon(status) {
  switch (status) {
    case 'pass':
      return '\u2705';
    case 'skip':
      return '\u23ED';
    default:
      return '\u274C';
  }
}
321
+
322
/**
 * Tell whether this file is the program's entry point rather than a module
 * loaded with require().
 *
 * Uses Node's own notion of the main module. Comparing process.argv[1] with
 * __filename breaks when the script is started through a symlink (for example
 * an npm bin link), because argv[1] keeps the link path while __filename is
 * the resolved real path.
 *
 * @returns {boolean} true when the script was run directly.
 */
function isCliInvocation() {
  return require.main === module;
}
325
+
326
// Command-line entry point:
//   --suite=<path>     suite file to run (default: DEFAULT_SUITE)
//   --json             print the full report as JSON instead of text
//   --min-score=<n>    minimum suite score required to pass (default: 80)
// Exits with 0 when the suite passes and 1 otherwise.
if (isCliInvocation()) {
  const SUITE_FLAG = '--suite=';
  const MIN_SCORE_FLAG = '--min-score=';

  let suitePath = DEFAULT_SUITE;
  let json = false;
  let minScore = 80;

  for (const arg of process.argv.slice(2)) {
    if (arg.startsWith(SUITE_FLAG)) {
      suitePath = path.resolve(arg.slice(SUITE_FLAG.length));
    } else if (arg === '--json') {
      json = true;
    } else if (arg.startsWith(MIN_SCORE_FLAG)) {
      const rawValue = arg.slice(MIN_SCORE_FLAG.length);
      const parsed = Number(rawValue);
      if (rawValue.trim() !== '' && Number.isFinite(parsed)) {
        minScore = parsed;
      } else {
        // Say so on stderr (keeps --json output clean) rather than silently
        // running with a different threshold than the one requested.
        console.error(`Ignoring invalid --min-score value "${rawValue}"; using ${minScore}.`);
      }
    }
  }

  const report = runSuite(suitePath, { minScore });

  if (json) {
    console.log(JSON.stringify(report, null, 2));
  } else {
    const rule = '='.repeat(50);
    console.log(`\n${report.suite}`);
    console.log(rule);
    for (const r of report.results) {
      const icon = statusIcon(r.status);
      console.log(`${icon} ${r.id} — ${r.score}% (${r.passCount || 0}/${r.totalChecks || 0})`);
      if (r.status === 'fail' || r.status === 'error') {
        for (const c of (r.checks || [])) {
          if (!c.pass) console.log(` \u274C ${c.criterion}: ${c.detail}`);
        }
        if (r.error) console.log(` Error: ${r.error}`);
      }
    }
    console.log(rule);
    console.log(`Score: ${report.score}% | Pass: ${report.passed} | Fail: ${report.failed} | Error: ${report.errors} | Skip: ${report.skipped}`);
    // Print the threshold the run actually enforced, not the raw CLI value.
    console.log(report.pass ? '\u2705 PASS' : `\u274C FAIL (min: ${report.minScore}%)`);
  }

  process.exit(report.pass ? 0 : 1);
}
362
+
363
+ module.exports = { runSuite, runEvaluation, gradeOutput, loadSuite };
@@ -11,23 +11,37 @@ const {
11
11
 
12
12
  const USAGE_FILE = path.join(process.env.HOME || '/tmp', '.thumbgate', 'usage-limits.json');
13
13
 
14
+ // ──────────────────────────────────────────────────────────
15
+ // NEW: Lifetime caps on free tier — users hit the wall fast
16
+ // and must upgrade to keep using core features.
17
+ // ──────────────────────────────────────────────────────────
14
18
  const FREE_TIER_LIMITS = {
15
- capture_feedback: { daily: 3, label: 'feedback captures' },
16
- search_lessons: { daily: 5, label: 'lesson searches' },
17
- search_thumbgate: { daily: 5, label: 'ThumbGate searches' },
18
- commerce_recall: { daily: 5, label: 'commerce recalls' },
19
- export_dpo: { daily: 0, label: 'DPO exports (Pro only)' },
20
- export_databricks: { daily: 0, label: 'Databricks exports (Pro only)' },
19
+ capture_feedback: { daily: Infinity, lifetime: 3, label: 'feedback captures' },
20
+ prevention_rules: { daily: Infinity, lifetime: 1, label: 'prevention rules generated' },
21
+ recall: { daily: 0, lifetime: 0, label: 'recall queries (Pro only)' },
22
+ search_lessons: { daily: 0, lifetime: 0, label: 'lesson searches (Pro only)' },
23
+ search_thumbgate: { daily: 0, lifetime: 0, label: 'ThumbGate searches (Pro only)' },
24
+ commerce_recall: { daily: 0, lifetime: 0, label: 'commerce recalls (Pro only)' },
25
+ export_dpo: { daily: 0, lifetime: 0, label: 'DPO exports (Pro only)' },
26
+ export_databricks: { daily: 0, lifetime: 0, label: 'Databricks exports (Pro only)' },
27
+ construct_context_pack: { daily: Infinity, lifetime: 3, label: 'context packs' },
21
28
  };
22
29
 
23
- const FREE_TIER_MAX_GATES = 5;
30
// Free tier allows a single auto-promoted gate (previously 5); beyond that
// the paywall applies.
const FREE_TIER_MAX_GATES = 1;
24
31
 
25
- const UPGRADE_MESSAGE = `Pro: ${PRO_PRICE_LABEL} — dashboard and DPO export: ${PRO_MONTHLY_PAYMENT_LINK}\n Team: ${TEAM_PRICE_LABEL} after workflow qualification.`;
32
// Two-line upgrade pitch (Pro line, then Team line) used in limit messages.
const UPGRADE_MESSAGE = [
  `Pro: ${PRO_PRICE_LABEL} — unlimited captures, recall, prevention rules, and dashboard: ${PRO_MONTHLY_PAYMENT_LINK}`,
  ` Team: ${TEAM_PRICE_LABEL} after workflow qualification.`,
].join('\n');
33
+
34
// Per-action text shown once a free-tier lifetime limit is reached; `default`
// covers any action without its own entry.
const PAYWALL_MESSAGES = {
  // Metered actions: the free allowance has been used up.
  capture_feedback: "You've used all 3 free feedback captures. Your agent is still making mistakes — upgrade to Pro to capture every one and build real prevention rules.",
  prevention_rules: 'Free tier includes 1 prevention rule. Your agents need more protection — upgrade to Pro for unlimited rules.',
  // Pro-only actions: never available on the free tier.
  recall: 'Recall is a Pro feature. Your past feedback is stored locally — upgrade to search and reuse it.',
  search_lessons: "Lesson search is a Pro feature. Upgrade to find patterns in your agent's mistakes.",
  // Fallback.
  default: 'This feature requires Pro. Start a 7-day free trial — no credit card required.',
};
26
41
 
27
42
  function isProTier(authContext) {
28
43
  if (authContext && authContext.tier === 'pro') return true;
29
44
  if (process.env.THUMBGATE_API_KEY || process.env.THUMBGATE_PRO_MODE === '1' || process.env.THUMBGATE_NO_RATE_LIMIT === '1') return true;
30
- // Also check license file for real customer Pro verification
31
45
  try {
32
46
  const { isProLicensed } = require('./license');
33
47
  if (isProLicensed()) return true;
@@ -62,38 +76,79 @@ function todayKey() {
62
76
 
63
77
  /**
64
78
  * Check and increment usage for a given action.
79
+ * Now enforces LIFETIME limits in addition to daily limits.
65
80
  * Returns { allowed: true } or { allowed: false, message: string }
66
81
  */
67
82
  function checkLimit(action, authContext) {
68
83
  if (isProTier(authContext)) return { allowed: true };
69
84
 
70
85
  const limitEntry = FREE_TIER_LIMITS[action];
71
- if (limitEntry == null) return { allowed: true }; // no limit for this action
86
+ if (limitEntry == null) return { allowed: true };
72
87
 
73
88
  const dailyLimit = typeof limitEntry === 'object' ? limitEntry.daily : limitEntry;
89
+ const lifetimeLimit = typeof limitEntry === 'object' ? limitEntry.lifetime : Infinity;
74
90
 
75
91
  const usage = loadUsage();
76
92
  const today = todayKey();
77
93
 
78
- // Reset if different day
94
+ // Reset daily counts if different day
79
95
  if (usage.date !== today) {
80
96
  usage.date = today;
81
97
  usage.counts = {};
82
98
  }
83
99
 
84
100
  usage.counts = usage.counts || {};
85
- const current = usage.counts[action] || 0;
101
+ usage.lifetime = usage.lifetime || {};
102
+
103
+ const dailyCurrent = usage.counts[action] || 0;
104
+ const lifetimeCurrent = usage.lifetime[action] || 0;
105
+
106
+ // Check lifetime limit first (the hard wall)
107
+ if (lifetimeLimit !== Infinity && lifetimeCurrent >= lifetimeLimit) {
108
+ const paywallMsg = PAYWALL_MESSAGES[action] || PAYWALL_MESSAGES.default;
109
+ return {
110
+ allowed: false,
111
+ message: `${paywallMsg}\n\n${UPGRADE_MESSAGE}`,
112
+ used: lifetimeCurrent,
113
+ limit: lifetimeLimit,
114
+ limitType: 'lifetime',
115
+ };
116
+ }
86
117
 
87
- if (current >= dailyLimit) {
88
- return { allowed: false, message: `Free tier limit reached. Upgrade to Pro for unlimited: https://thumbgate-production.up.railway.app/pro\n${UPGRADE_MESSAGE}`, used: current, limit: dailyLimit };
118
+ // Check daily limit
119
+ if (dailyLimit !== Infinity && dailyCurrent >= dailyLimit) {
120
+ return {
121
+ allowed: false,
122
+ message: `Daily limit reached. ${UPGRADE_MESSAGE}`,
123
+ used: dailyCurrent,
124
+ limit: dailyLimit,
125
+ limitType: 'daily',
126
+ };
89
127
  }
90
128
 
91
- // Increment
92
- usage.counts[action] = current + 1;
129
+ // Increment both counters
130
+ usage.counts[action] = dailyCurrent + 1;
131
+ usage.lifetime[action] = lifetimeCurrent + 1;
93
132
  saveUsage(usage);
94
133
 
95
- const used = current + 1;
96
- return { allowed: true, used, limit: dailyLimit, remaining: dailyLimit - used };
134
+ const remaining = lifetimeLimit === Infinity
135
+ ? Infinity
136
+ : lifetimeLimit - (lifetimeCurrent + 1);
137
+
138
+ // Warn when approaching limit
139
+ const warningThreshold = lifetimeLimit <= 3 ? 1 : Math.ceil(lifetimeLimit * 0.2);
140
+ const isNearLimit = remaining <= warningThreshold && remaining > 0;
141
+
142
+ return {
143
+ allowed: true,
144
+ used: lifetimeCurrent + 1,
145
+ limit: lifetimeLimit,
146
+ remaining,
147
+ limitType: 'lifetime',
148
+ warning: isNearLimit
149
+ ? `${remaining} free ${limitEntry.label} remaining. Upgrade to Pro for unlimited.`
150
+ : undefined,
151
+ };
97
152
  }
98
153
 
99
154
  /**
@@ -103,14 +158,11 @@ function getUsage(action, authContext) {
103
158
  if (isProTier(authContext)) return { count: 0, limit: Infinity, remaining: Infinity };
104
159
 
105
160
  const limitEntry = FREE_TIER_LIMITS[action];
106
- const dailyLimit = limitEntry == null ? Infinity : (typeof limitEntry === 'object' ? limitEntry.daily : limitEntry);
161
+ const lifetimeLimit = limitEntry == null ? Infinity : (typeof limitEntry === 'object' ? (limitEntry.lifetime ?? Infinity) : Infinity);
107
162
  const usage = loadUsage();
108
- const today = todayKey();
109
-
110
- if (usage.date !== today) return { count: 0, limit: dailyLimit, remaining: dailyLimit };
111
163
 
112
- const count = (usage.counts || {})[action] || 0;
113
- return { count, limit: dailyLimit, remaining: Math.max(0, dailyLimit - count) };
164
+ const lifetimeCount = (usage.lifetime || {})[action] || 0;
165
+ return { count: lifetimeCount, limit: lifetimeLimit, remaining: Math.max(0, lifetimeLimit - lifetimeCount) };
114
166
  }
115
167
 
116
168
  module.exports = {
@@ -123,5 +175,6 @@ module.exports = {
123
175
  FREE_TIER_LIMITS,
124
176
  FREE_TIER_MAX_GATES,
125
177
  UPGRADE_MESSAGE,
178
+ PAYWALL_MESSAGES,
126
179
  USAGE_FILE,
127
180
  };