thumbgate 1.27.4 → 1.27.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/dashboard.md +15 -0
- package/.claude/commands/thumbgate-blocked.md +27 -0
- package/.claude/commands/thumbgate-dashboard.md +15 -0
- package/.claude/commands/thumbgate-doctor.md +30 -0
- package/.claude/commands/thumbgate-guard.md +36 -0
- package/.claude/commands/thumbgate-protect.md +30 -0
- package/.claude/commands/thumbgate-rules.md +30 -0
- package/.claude-plugin/plugin.json +2 -1
- package/.well-known/llms.txt +6 -2
- package/.well-known/mcp/server-card.json +1 -1
- package/README.md +49 -5
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/letta/README.md +41 -0
- package/adapters/letta/thumbgate-letta-adapter.js +133 -0
- package/adapters/mcp/server-stdio.js +16 -1
- package/adapters/opencode/opencode.json +1 -1
- package/adapters/policy-engine/ethicore-guardian-client.js +68 -0
- package/adapters/policy-engine/thumbgate-policy-engine-adapter.js +260 -0
- package/bench/observability-eval-suite.json +26 -0
- package/bin/cli.js +230 -6
- package/bin/postinstall.js +1 -1
- package/commands/dashboard.md +15 -0
- package/commands/thumbgate-dashboard.md +15 -0
- package/config/gate-templates.json +84 -0
- package/config/gates/claim-verification.json +12 -0
- package/config/gates/default.json +20 -0
- package/config/github-about.json +1 -1
- package/config/model-candidates.json +50 -0
- package/config/post-deploy-marketing-pages.json +5 -0
- package/package.json +67 -25
- package/public/agent-manager.html +41 -1
- package/public/agents-cost-savings.html +1 -1
- package/public/ai-malpractice-prevention.html +2 -1
- package/public/assets/brand/github-social-preview.png +0 -0
- package/public/assets/brand/thumbgate-icon-512.png +0 -0
- package/public/assets/brand/thumbgate-icon-pro-512.png +0 -0
- package/public/assets/brand/thumbgate-icon-team-512.png +0 -0
- package/public/assets/brand/thumbgate-logo-1200x360.png +0 -0
- package/public/assets/brand/thumbgate-mark-inline.svg +15 -0
- package/public/assets/brand/thumbgate-mark-pro.svg +23 -0
- package/public/assets/brand/thumbgate-mark-team.svg +26 -0
- package/public/assets/brand/thumbgate-mark.svg +15 -0
- package/public/assets/brand/thumbgate-wordmark.svg +20 -0
- package/public/assets/claude-thumbgate-statusbar.svg +8 -0
- package/public/assets/codex-thumbgate-statusbar-test.svg +9 -0
- package/public/assets/legal-intake-control-flow.svg +66 -0
- package/public/blog.html +1 -1
- package/public/brand/thumbgate-mark.svg +15 -0
- package/public/brand/thumbgate-og.svg +16 -0
- package/public/codex-enterprise.html +1 -1
- package/public/codex-plugin.html +1 -1
- package/public/compare.html +23 -3
- package/public/dashboard.html +316 -30
- package/public/federal.html +1 -1
- package/public/guide.html +5 -4
- package/public/index.html +167 -49
- package/public/js/buyer-intent.js +672 -0
- package/public/learn.html +88 -7
- package/public/lessons.html +2 -1
- package/public/numbers.html +3 -3
- package/public/pricing.html +63 -15
- package/public/pro.html +7 -7
- package/scripts/activation-quickstart.js +187 -0
- package/scripts/agent-memory-lifecycle.js +211 -0
- package/scripts/async-eval-observability.js +236 -0
- package/scripts/auto-promote-gates.js +75 -4
- package/scripts/billing.js +12 -1
- package/scripts/build-metadata.js +24 -3
- package/scripts/cli-schema.js +42 -10
- package/scripts/dashboard-chat.js +53 -7
- package/scripts/dashboard.js +12 -17
- package/scripts/export-databricks-bundle.js +5 -1
- package/scripts/export-dpo-pairs.js +7 -2
- package/scripts/feedback-aggregate.js +281 -0
- package/scripts/feedback-loop.js +121 -0
- package/scripts/filesystem-search.js +35 -10
- package/scripts/gates-engine.js +234 -7
- package/scripts/gemini-embedding-policy.js +2 -1
- package/scripts/hook-stop-anti-claim.js +227 -0
- package/scripts/hook-thumbgate-cache-updater.js +18 -2
- package/scripts/hybrid-feedback-context.js +1 -0
- package/scripts/lesson-inference.js +8 -3
- package/scripts/lesson-search.js +17 -1
- package/scripts/operational-integrity.js +39 -5
- package/scripts/plausible-domain-config.js +15 -2
- package/scripts/plausible-server-events.js +4 -4
- package/scripts/rate-limiter.js +12 -6
- package/scripts/secret-redaction.js +166 -0
- package/scripts/security-scanner.js +100 -0
- package/scripts/self-distill-agent.js +3 -1
- package/scripts/self-harness-optimizer.js +141 -0
- package/scripts/seo-gsd.js +635 -0
- package/scripts/statusline-cache-path.js +17 -2
- package/scripts/statusline-cache-read.js +57 -0
- package/scripts/statusline-local-stats.js +9 -1
- package/scripts/statusline-meta.js +5 -2
- package/scripts/statusline.sh +13 -1
- package/scripts/sync-telemetry-from-prod.js +374 -0
- package/scripts/telemetry-analytics.js +9 -0
- package/scripts/thumbgate-search.js +85 -19
- package/scripts/tool-contract-validator.js +76 -0
- package/scripts/vector-store.js +44 -0
- package/scripts/workspace-evolver.js +62 -2
- package/src/api/server.js +862 -146
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
// Activation: guided first prevention rule + a live demonstrated block.
|
|
5
|
+
//
|
|
6
|
+
// ~98.5% of `init` users never promote their first rule, so they never see
|
|
7
|
+
// "ThumbGate just blocked a repeat mistake" — the aha moment that drives
|
|
8
|
+
// activation (and, downstream, conversion). `quickstart` walks a new user
|
|
9
|
+
// through capturing one real example, promoting it to a block rule, and then
|
|
10
|
+
// firing that rule against the exact action so they watch it get blocked.
|
|
11
|
+
//
|
|
12
|
+
// SAFETY: this is additive and lives in its own module. `init` is untouched.
|
|
13
|
+
// The interactive flow ONLY runs in a real TTY. Non-interactive / piped / CI
|
|
14
|
+
// invocations print a one-line hint and exit 0 without prompting or hanging.
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
const path = require('node:path');
|
|
18
|
+
|
|
19
|
+
const PKG_ROOT = path.join(__dirname, '..');
|
|
20
|
+
const PRO_URL = 'https://thumbgate.ai';
|
|
21
|
+
|
|
22
|
+
// Turn a free-text mistake description into a safe, literal gate pattern. We
|
|
23
|
+
// escape regex metacharacters so arbitrary user input can never produce an
|
|
24
|
+
// invalid or runaway regex inside the gates engine.
|
|
25
|
+
/**
 * Backslash-escape every regular-expression metacharacter in `text` so the
 * result matches the input literally when compiled as a pattern.
 *
 * @param {*} text - Value to escape; coerced with `String()` first.
 * @returns {string} The input with each of `. * + ? ^ $ { } ( ) | [ ] \` prefixed by a backslash.
 */
function escapeRegex(text) {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  const literal = String(text);
  return literal.replace(metaCharacters, (character) => `\\${character}`);
}
|
|
28
|
+
|
|
29
|
+
/**
 * Read the package version shown in the quickstart banner.
 *
 * @returns {string} The `version` field of the package's `package.json`, or
 *   `'0.0.0'` when that manifest cannot be loaded.
 */
function pkgVersion() {
  try {
    const manifestPath = path.join(PKG_ROOT, 'package.json');
    const { version } = require(manifestPath);
    return version;
  } catch {
    // The version is only used in the banner, so an unreadable manifest is not fatal.
    return '0.0.0';
  }
}
|
|
37
|
+
|
|
38
|
+
// Core, dependency-injected activation flow so it is testable without a real
|
|
39
|
+
// TTY. Callers provide ask() (returns a Promise<string>), out() (line writer),
|
|
40
|
+
// and isTTY. Returns a structured result describing what was promoted/blocked.
|
|
41
|
+
//
|
|
42
|
+
// Optional injectables (deps) keep the test hermetic and let the CLI pass the
|
|
43
|
+
// real modules:
|
|
44
|
+
// deps.forcePromote(pattern, action) -> { gateId, totalGates }
|
|
45
|
+
// deps.runGate(input) -> Promise<rawJsonString>
|
|
46
|
+
// deps.captureFeedback(params)
|
|
47
|
+
// deps.trackEvent(name, props)
|
|
48
|
+
/**
 * Guided activation flow: ask for one action to forbid, promote it to a block
 * rule, replay that exact action through the gate, and report the verdict.
 *
 * @param {object} params
 * @param {(question: string) => Promise<string>} params.ask - Prompts the user and resolves with the answer.
 * @param {(line: string) => void} params.out - Writes one line of output.
 * @param {boolean} params.isTTY - When falsy, only a two-line hint is written; nothing is prompted or promoted.
 * @param {object} [params.deps] - Optional overrides for `forcePromote(pattern, action)`,
 *   `runGate(input)`, `captureFeedback(params)` and `trackEvent(name, props)`. Missing entries
 *   are loaded from the package's `scripts/` directory (`trackEvent` defaults to a no-op).
 * @returns {Promise<object>} `{ interactive: false, promoted: false, blocked: false }` for a
 *   non-TTY run, otherwise `{ interactive, promoted, blocked, gateId, pattern, mistake, verdict }`.
 */
async function runActivationFlow({ ask, out, isTTY, deps = {} }) {
  // Non-interactive contexts get a hint and return immediately: no prompt, no hang.
  if (!isTTY) {
    out('thumbgate quickstart is an interactive walkthrough — run it in a terminal.');
    out('Non-interactive setup: npx thumbgate init (or: npx thumbgate quick-start)');
    return { interactive: false, promoted: false, blocked: false };
  }

  const scriptPath = (name) => path.join(PKG_ROOT, 'scripts', name);

  const forcePromote = deps.forcePromote || require(scriptPath('auto-promote-gates')).forcePromote;
  // The gates engine is only loaded when the demo actually runs.
  const runGate = deps.runGate || ((input) => require(scriptPath('gates-engine')).runAsync(input));

  let captureFeedback = deps.captureFeedback;
  if (captureFeedback === undefined) {
    try {
      captureFeedback = require(scriptPath('feedback-loop')).captureFeedback;
    } catch {
      // Feedback capture is optional here; the promoted rule and the block are what matter.
      captureFeedback = null;
    }
  }

  const trackEvent = deps.trackEvent || (() => {});

  const introLines = [
    '',
    `thumbgate quickstart v${pkgVersion()}`,
    '',
    "Let's set up your first prevention rule and watch it block a repeat mistake.",
    '',
    'Think of one thing an AI agent did that you never want it to do again.',
    'Examples: "git push --force to main", "rm -rf node_modules", "edit .env directly".',
    '',
  ];
  for (const line of introLines) {
    out(line);
  }

  const answer = await ask('The mistake to block: ');
  let mistake = String(answer || '').trim();
  if (!mistake) {
    mistake = 'git push --force to main';
    out(`(using a starter example: ${mistake})`);
  }

  // Step 1: record the mistake as a thumbs-down example (best-effort).
  if (typeof captureFeedback === 'function') {
    try {
      captureFeedback({
        signal: 'down',
        context: mistake,
        whatWentWrong: mistake,
        whatToChange: `Block this action: ${mistake}`,
        tags: 'quickstart,activation,first-rule',
        gateAction: 'block',
      });
    } catch {
      // A failed capture must not abort the walkthrough.
    }
  }

  // Step 2: promote the escaped, literal pattern into a block rule.
  const pattern = escapeRegex(mistake);
  const promotion = forcePromote(pattern, 'block');
  out('');
  out(` Rule promoted: ${promotion.gateId} [block]`);
  out(` Active rules now: ${promotion.totalGates}`);

  // Step 3: replay the action with strict enforcement switched on for the demo,
  // then put the environment variable back exactly as it was found.
  const strictFlag = 'THUMBGATE_STRICT_ENFORCEMENT';
  const priorStrict = process.env[strictFlag];
  process.env[strictFlag] = '1';
  let verdict;
  try {
    const rawVerdict = await runGate({ tool_name: 'Bash', tool_input: { command: mistake } });
    verdict = JSON.parse(rawVerdict);
  } finally {
    if (priorStrict === undefined) {
      delete process.env[strictFlag];
    } else {
      process.env[strictFlag] = priorStrict;
    }
  }

  const decision = verdict?.decision || verdict?.hookSpecificOutput?.permissionDecision;
  const blocked = ['block', 'deny'].includes(decision);

  out('');
  out(blocked ? ' | ThumbGate just blocked it.' : ' | ThumbGate flagged it.');
  out(` | Action attempted: ${mistake}`);
  out(
    blocked
      ? ' | Verdict: BLOCKED — the agent cannot repeat this mistake.'
      : ' | Verdict: flagged and logged. Set THUMBGATE_STRICT_ENFORCEMENT=1 to hard-block.',
  );

  // Step 4: connect what the user just saw to the Pro offering.
  out('');
  out(' That rule lives in this project. Pro keeps your rules — and the mistakes');
  out(' you teach it — synced across every machine and agent runtime you work in,');
  out(` so the block you just saw follows your whole team. ${PRO_URL}/pricing`);
  out('');

  try {
    trackEvent('cli_quickstart_activated', { command: 'quickstart', blocked });
  } catch {
    /* telemetry best-effort */
  }

  return {
    interactive: true,
    promoted: true,
    blocked,
    gateId: promotion.gateId,
    pattern,
    mistake,
    verdict,
  };
}
|
|
159
|
+
|
|
160
|
+
// CLI entrypoint: wires runActivationFlow to a real readline prompt + stdout.
|
|
161
|
+
/**
 * CLI entry point: connects `runActivationFlow` to stdout and, when both
 * stdin and stdout are TTYs, to a readline prompt.
 *
 * @returns {Promise<object|undefined>} The flow result. In the interactive
 *   path a failure is logged, `process.exitCode` is set to 1, and the promise
 *   resolves with `undefined`.
 */
function quickstart() {
  const out = (line = '') => console.log(line);
  const interactive = Boolean(process.stdout.isTTY && process.stdin.isTTY);

  // Telemetry is optional: fall back to a no-op when the module cannot be loaded.
  let trackEvent;
  try {
    trackEvent = require(path.join(PKG_ROOT, 'scripts', 'cli-telemetry')).trackEvent;
  } catch {
    trackEvent = () => {};
  }

  if (!interactive) {
    // Do not open readline here: it would keep the process alive waiting for
    // input that never arrives.
    return runActivationFlow({ ask: async () => '', out, isTTY: false, deps: { trackEvent } });
  }

  const readline = require('node:readline');
  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
  const ask = (question) => new Promise((resolve) => rl.question(question, resolve));

  const reportFailure = (err) => {
    console.error(`quickstart error: ${err?.message ?? err}`);
    process.exitCode = 1;
  };

  return runActivationFlow({ ask, out, isTTY: true, deps: { trackEvent } })
    .catch(reportFailure)
    .finally(() => rl.close());
}
|
|
186
|
+
|
|
187
|
+
module.exports = { runActivationFlow, quickstart, escapeRegex };
|
|
@@ -2,6 +2,40 @@
|
|
|
2
2
|
'use strict';
|
|
3
3
|
|
|
4
4
|
const MEMORY_TYPES = new Set(['episodic', 'semantic', 'procedural', 'preference', 'working']);
|
|
5
|
+
// Scope labels accepted as an explicit `scope` / `memoryScope` on a memory.
const MEMORY_SCOPES = new Set(['task', 'session', 'user', 'project', 'org']);

// A memory whose tags or text tokens contain one of these terms is treated as
// high-risk when its decay is scored.
const HIGH_RISK_TERMS = new Set([
  'billing', 'checkout', 'compliance', 'credential', 'data-loss',
  'deploy', 'deployment', 'git', 'payment', 'production',
  'release', 'secret', 'security', 'stripe', 'verification',
]);

// [display name, detection pattern, entity type] triples used to recognise
// well-known tools and services mentioned in memory text.
const KNOWN_ENTITY_PATTERNS = [
  // agents
  ['Claude Code', /\bclaude\s+code\b/i, 'agent'],
  ['Codex', /\bcodex\b/i, 'agent'],
  ['Cursor', /\bcursor\b/i, 'agent'],
  ['Gemini CLI', /\bgemini\s+cli\b/i, 'agent'],
  // protocols
  ['MCP', /\bmcp\b/i, 'protocol'],
  // services
  ['Stripe', /\bstripe\b/i, 'service'],
  ['GitHub', /\bgithub\b|\bgh\s+/i, 'service'],
  ['Railway', /\brailway\b/i, 'service'],
  ['Plausible', /\bplausible\b/i, 'service'],
  ['PostHog', /\bposthog\b/i, 'service'],
  // storage
  ['SQLite', /\bsqlite\b|\bfts5\b/i, 'storage'],
  ['LanceDB', /\blancedb\b/i, 'storage'],
  // runtimes
  ['Docker', /\bdocker\b/i, 'runtime'],
  ['npm', /\bnpm\b|\bnpx\b/i, 'runtime'],
];
|
|
5
39
|
|
|
6
40
|
function normalizeText(value) {
|
|
7
41
|
if (value === undefined || value === null) return '';
|
|
@@ -13,6 +47,178 @@ function normalizeMemoryType(value) {
|
|
|
13
47
|
return MEMORY_TYPES.has(normalized) ? normalized : 'episodic';
|
|
14
48
|
}
|
|
15
49
|
|
|
50
|
+
/**
 * Lower-case `value` and split it into tokens.
 *
 * Letters, digits and `_ . : / -` stay inside a token, so paths and
 * hyphenated terms survive intact; every other character is a separator.
 *
 * @param {*} value - Text to tokenize; passed through `normalizeText` first.
 * @returns {string[]} Non-empty lower-case tokens in order of appearance.
 */
function tokenize(value) {
  const lowered = normalizeText(value).toLowerCase();
  const pieces = lowered.split(/[^a-z0-9_.:/-]+/);
  return pieces.filter((piece) => piece.length > 0);
}
|
|
56
|
+
|
|
57
|
+
/**
 * Drop entities whose normalized, lower-cased `name` is empty or has already
 * been seen, keeping the first occurrence of each name.
 *
 * @param {Array<{name: *}>} entities - Candidate entities.
 * @returns {Array<object>} The retained entities in their original order.
 */
function uniqueByName(entities) {
  const seenKeys = new Set();
  return entities.reduce((kept, entity) => {
    const key = normalizeText(entity.name).toLowerCase();
    if (key && !seenKeys.has(key)) {
      seenKeys.add(key);
      kept.push(entity);
    }
    return kept;
  }, []);
}
|
|
66
|
+
|
|
67
|
+
/**
 * Join the searchable fields of a memory into one space-separated string.
 *
 * Fields are taken in this order: title, content, context, whatWentWrong,
 * whatToChange, whatWorked, domain, skill, tags. An array of tags is joined
 * with spaces; any other `tags` value is used as-is. Falsy fields are omitted.
 *
 * @param {object} [memory={}] - Memory record.
 * @returns {string} The concatenated text, or `''` when nothing is present.
 */
function collectMemoryText(memory = {}) {
  const {
    title,
    content,
    context,
    whatWentWrong,
    whatToChange,
    whatWorked,
    domain,
    skill,
    tags,
  } = memory;
  const tagText = Array.isArray(tags) ? tags.join(' ') : tags;
  const fields = [title, content, context, whatWentWrong, whatToChange, whatWorked, domain, skill, tagText];
  return fields.filter((field) => Boolean(field)).join(' ');
}
|
|
80
|
+
|
|
81
|
+
/**
 * Extract named entities from a memory's text.
 *
 * Three sources are combined, in this order:
 *   1. every `KNOWN_ENTITY_PATTERNS` entry whose pattern matches the text;
 *   2. backtick-quoted snippets, typed `command` when they start with a known
 *      CLI name and `path` when they contain `.`, `/` or `-`;
 *   3. up to eight slash-separated path-like matches, typed `path`.
 * The list is de-duplicated by name and capped at 16 entries.
 *
 * @param {object} [memory={}] - Memory record.
 * @returns {Array<{name: string, type: string}>} Extracted entities.
 */
function extractMemoryEntities(memory = {}) {
  const text = collectMemoryText(memory);

  const found = KNOWN_ENTITY_PATTERNS
    .filter(([, pattern]) => pattern.test(text))
    .map(([name, , type]) => ({ name, type }));

  const quotedSnippets = text.match(/`([^`]+)`/g) || [];
  for (const quoted of quotedSnippets) {
    // Drop the surrounding backticks before classifying the snippet.
    const snippet = quoted.slice(1, -1).trim();
    const looksLikeCommand = /^(git|npm|npx|node|gh|curl|docker|python|pytest|stripe)\b/i.test(snippet);
    if (looksLikeCommand) {
      found.push({ name: snippet, type: 'command' });
    } else if (/[./-]/.test(snippet)) {
      found.push({ name: snippet, type: 'path' });
    }
  }

  const pathLike = text.match(/\b(?:[a-z0-9_-]+\/)+[a-z0-9_.-]+\b/gi) || [];
  found.push(...pathLike.slice(0, 8).map((filePath) => ({ name: filePath, type: 'path' })));

  return uniqueByName(found).slice(0, 16);
}
|
|
106
|
+
|
|
107
|
+
/**
 * Decide which scope a memory belongs to.
 *
 * An explicit `scope` / `memoryScope` that is one of `MEMORY_SCOPES` wins.
 * Otherwise the first rule that matches a tag or a keyword in the memory text
 * decides, checked in the order user, org, project, session. The fallback is
 * `'task'`. Tags are only consulted when `memory.tags` is an array.
 *
 * @param {object} [memory={}] - Memory record.
 * @returns {string} One of `'user'`, `'org'`, `'project'`, `'session'`, `'task'`,
 *   or the explicit scope.
 */
function inferMemoryScope(memory = {}) {
  const declared = normalizeText(memory.scope || memory.memoryScope).toLowerCase();
  if (MEMORY_SCOPES.has(declared)) return declared;

  const text = collectMemoryText(memory).toLowerCase();
  const tagList = Array.isArray(memory.tags) ? memory.tags : [];
  const tags = new Set(tagList.map((tag) => normalizeText(tag).toLowerCase()));

  // [scope, tags that select it, keywords that select it], in priority order.
  const rules = [
    ['user', ['preference'], /\b(prefer|style|tone|my preference|user preference)\b/],
    ['org', ['org', 'team'], /\b(enterprise|seat|team|shared|org|compliance|policy|approval)\b/],
    [
      'project',
      ['repo', 'project', 'release', 'deployment'],
      /\b(repo|repository|branch|ci|pull request|github|deploy|production|release|publish)\b/,
    ],
    ['session', ['session'], /\b(this session|current session|today|right now)\b/],
  ];

  for (const [scope, selectingTags, keywords] of rules) {
    if (selectingTags.some((tag) => tags.has(tag)) || keywords.test(text)) {
      return scope;
    }
  }
  return 'task';
}
|
|
121
|
+
|
|
122
|
+
/**
 * Classify how "fresh" a memory is for recall purposes.
 *
 * Outcomes, checked in this order:
 *   - `sticky` (score 1): a tag or text token is in `HIGH_RISK_TERMS`, or
 *     `importance` is `critical` / `high`;
 *   - `review` (score 0.6): no usable timestamp;
 *   - `archive_candidate` (score 0.2): older than 180 days;
 *   - `review` (score 0.55): older than 60 days;
 *   - `active` (score 0.85): everything else.
 *
 * @param {object} [memory={}] - Memory record (`timestamp`, `tags`, `importance` and text fields are read).
 * @param {object} [options={}]
 * @param {string|number|Date} [options.now] - Reference time; the current time
 *   is used when it is missing or cannot be parsed as a date.
 * @returns {{state: string, ageDays: (number|null), score: number, reason: string}}
 */
function scoreMemoryDecay(memory = {}, options = {}) {
  const DAY_MS = 1000 * 60 * 60 * 24;

  // An unparseable `options.now` would make the age NaN, and NaN fails every
  // threshold comparison, so the memory would silently be reported as active.
  const requestedNowMs = options.now ? new Date(options.now).getTime() : NaN;
  const nowMs = Number.isFinite(requestedNowMs) ? requestedNowMs : Date.now();

  const timestampMs = memory.timestamp ? new Date(memory.timestamp).getTime() : NaN;
  const ageDays = Number.isFinite(timestampMs)
    ? Math.max(0, (nowMs - timestampMs) / DAY_MS)
    : null;

  // tokenize() keeps `_ . : / -` inside tokens, so a term at the end of a
  // sentence arrives as e.g. "billing." — compare the trimmed form as well.
  const stripEdgePunctuation = (token) => token.replace(/^[_.:/-]+|[_.:/-]+$/g, '');
  const isHighRiskToken = (token) => HIGH_RISK_TERMS.has(token)
    || HIGH_RISK_TERMS.has(stripEdgePunctuation(token));

  const textTokens = new Set(tokenize(collectMemoryText(memory)));
  const tags = Array.isArray(memory.tags)
    ? memory.tags.map((tag) => normalizeText(tag).toLowerCase())
    : [];
  const highRisk = tags.some((tag) => HIGH_RISK_TERMS.has(tag))
    || [...textTokens].some(isHighRiskToken)
    || ['critical', 'high'].includes(normalizeText(memory.importance).toLowerCase());

  const verdict = (state, score, reason) => ({ state, ageDays, score, reason });

  if (highRisk) {
    return verdict('sticky', 1, 'high-risk memories stay retrievable until explicitly retired');
  }
  if (ageDays === null) {
    return verdict('review', 0.6, 'memory has no timestamp, so it needs review before durable promotion');
  }
  if (ageDays > 180) {
    return verdict('archive_candidate', 0.2, 'old low-risk memory should be consolidated or archived');
  }
  if (ageDays > 60) {
    return verdict('review', 0.55, 'older low-risk memory should be refreshed before it dominates recall');
  }
  return verdict('active', 0.85, 'recent memory remains eligible for recall');
}
|
|
173
|
+
|
|
174
|
+
/**
 * Score how well a memory matches a query by combining four signals:
 *   - lexical: fraction of distinct query tokens present in the memory;
 *   - phrase: +0.35 when the whole query appears verbatim in the memory text;
 *   - entity: fraction of query entities also found in the memory, weighted 0.45;
 *   - lifecycle: -0.15 for `archive_candidate` memories, +0.12 for `sticky` ones.
 *
 * @param {*} query - Search text.
 * @param {object} [memory={}] - Memory record.
 * @param {object} [options={}] - Forwarded to `scoreMemoryDecay`.
 * @returns {{score: number, lexicalScore: number, entityScore: number,
 *   matchedEntities: Array<object>, decayState: string}} Scores rounded to four
 *   decimals; `score` is never negative.
 */
function scoreHybridMemoryMatch(query, memory = {}, options = {}) {
  const rawMemoryText = collectMemoryText(memory);
  const queryTokens = new Set(tokenize(query));
  const memoryTokens = new Set(tokenize(rawMemoryText));
  const queryText = normalizeText(query).toLowerCase();
  const memoryText = rawMemoryText.toLowerCase();

  const lexicalHits = [...queryTokens].filter((token) => memoryTokens.has(token)).length;
  const lexicalScore = queryTokens.size > 0 ? lexicalHits / queryTokens.size : 0;

  let phraseScore = 0;
  if (queryText && memoryText.includes(queryText)) {
    phraseScore = 0.35;
  }

  const memoryEntities = extractMemoryEntities(memory);
  const queryEntityNames = extractMemoryEntities({ content: query })
    .map((entity) => entity.name.toLowerCase());
  const wantedNames = new Set(queryEntityNames);
  const matchedEntities = memoryEntities.filter((entity) => wantedNames.has(entity.name.toLowerCase()));
  const entityScore = queryEntityNames.length > 0
    ? matchedEntities.length / queryEntityNames.length
    : 0;

  const { state: decayState } = scoreMemoryDecay(memory, options);
  let lifecycleScore = 0;
  if (decayState === 'archive_candidate') {
    lifecycleScore = -0.15;
  } else if (decayState === 'sticky') {
    lifecycleScore = 0.12;
  }

  const combined = lexicalScore + phraseScore + (entityScore * 0.45) + lifecycleScore;

  return {
    score: Number(Math.max(0, combined).toFixed(4)),
    lexicalScore: Number(lexicalScore.toFixed(4)),
    entityScore: Number(entityScore.toFixed(4)),
    matchedEntities,
    decayState,
  };
}
|
|
202
|
+
|
|
203
|
+
/**
 * Assemble the lifecycle summary for one memory: its inferred scope, its
 * extracted entities, its decay verdict, and retrieval hints for
 * `options.query` (an empty query when none is given).
 *
 * @param {object} [memory={}] - Memory record.
 * @param {object} [options={}] - `query` is scored against the memory; the
 *   whole object is forwarded to the decay and match scorers.
 * @returns {{scope: string, entities: Array<object>, decay: object, retrievalHints: object}}
 */
function buildMemoryLifecycleView(memory = {}, options = {}) {
  const scope = inferMemoryScope(memory);
  const entities = extractMemoryEntities(memory);
  const decay = scoreMemoryDecay(memory, options);
  const {
    score: hybridScore,
    lexicalScore,
    entityScore,
    matchedEntities,
  } = scoreHybridMemoryMatch(options.query || '', memory, options);

  const retrievalHints = { hybridScore, lexicalScore, entityScore, matchedEntities };
  return { scope, entities, decay, retrievalHints };
}
|
|
221
|
+
|
|
16
222
|
function buildMemoryLifecyclePolicy(input = {}) {
|
|
17
223
|
return {
|
|
18
224
|
generatedAt: normalizeText(input.generatedAt) || new Date().toISOString(),
|
|
@@ -91,6 +297,11 @@ function evaluateMemoryPromotion(memory = {}, policy = buildMemoryLifecyclePolic
|
|
|
91
297
|
|
|
92
298
|
module.exports = {
|
|
93
299
|
buildMemoryLifecyclePolicy,
|
|
300
|
+
buildMemoryLifecycleView,
|
|
94
301
|
evaluateMemoryPromotion,
|
|
302
|
+
extractMemoryEntities,
|
|
303
|
+
inferMemoryScope,
|
|
95
304
|
normalizeMemoryType,
|
|
305
|
+
scoreHybridMemoryMatch,
|
|
306
|
+
scoreMemoryDecay,
|
|
96
307
|
};
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
const fs = require('node:fs');
|
|
5
|
+
const path = require('node:path');
|
|
6
|
+
|
|
7
|
+
// Minimum score each metric must reach for a test case to pass.
// Callers can override individual entries through `options.thresholds`.
const DEFAULT_THRESHOLDS = {
  faithfulness: 0.72, // share of response claims supported by the retrieved contexts
  answerRelevance: 0.45, // share of question terms that reappear in the response
  contextPrecision: 0.5, // rank-weighted share of retrieved contexts judged relevant
};
|
|
12
|
+
|
|
13
|
+
/**
 * Lower-case `value` and return its alphanumeric words longer than two
 * characters. Falsy input yields no tokens.
 *
 * @param {*} value - Text to tokenize.
 * @returns {string[]} Tokens in order of appearance (duplicates kept).
 */
function tokenize(value) {
  const lowered = String(value || '').toLowerCase();
  const words = lowered.match(/[a-z0-9]+/g) || [];
  return words.filter(({ length }) => length > 2);
}
|
|
19
|
+
|
|
20
|
+
/**
 * Remove falsy entries and duplicates, keeping first-seen order.
 *
 * @param {Array} values - Input values.
 * @returns {Array} Distinct truthy values.
 */
function unique(values) {
  const distinct = new Set();
  values.forEach((value) => {
    if (value) distinct.add(value);
  });
  return Array.from(distinct);
}
|
|
23
|
+
|
|
24
|
+
/**
 * Fraction of the distinct tokens of `left` that also occur in `right`.
 *
 * @param {*} left - Text whose tokens are being looked for.
 * @param {*} right - Text searched for those tokens.
 * @returns {number} A value in [0, 1]; 0 when `left` has no tokens.
 */
function overlapScore(left, right) {
  const wanted = unique(tokenize(left));
  if (wanted.length === 0) return 0;

  const available = new Set(tokenize(right));
  let hits = 0;
  for (const token of wanted) {
    if (available.has(token)) hits += 1;
  }
  return hits / wanted.length;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Split a response into individual claims.
 *
 * Claims are separated by sentence punctuation followed by whitespace, or by
 * line breaks. Trailing `.`, `!` and `?` are removed from every claim so the
 * last sentence is normalized the same way as the ones before it: the
 * separator is consumed for inner claims, but the final claim would otherwise
 * keep its terminator and fail exact-substring comparisons.
 *
 * @param {*} response - Response text; falsy values are treated as empty.
 * @returns {string[]} Trimmed, non-empty claims without trailing sentence punctuation.
 */
function splitClaims(response) {
  return String(response || '')
    .split(/(?:[.!?]\s+|\n+)/)
    .map((claim) => claim.trim().replace(/[.!?]+$/, '').trim())
    .filter((claim) => claim.length > 0);
}
|
|
38
|
+
|
|
39
|
+
/**
 * Coerce the retrieved-contexts input into an array of non-empty strings.
 *
 * @param {*} contexts - An array of contexts, a single context, or nothing.
 * @returns {string[]} For an array: its entries as strings, with
 *   `null`/`undefined` entries and empty strings removed. For any other truthy
 *   value: a one-element array holding its string form. Otherwise `[]`.
 */
function normalizeContexts(contexts) {
  if (Array.isArray(contexts)) {
    return contexts
      // Drop missing entries before stringifying; String(null) is the
      // non-empty text "null" and would otherwise count as a real context.
      .filter((context) => context !== null && context !== undefined)
      .map((context) => String(context))
      .filter(Boolean);
  }
  if (contexts) return [String(contexts)];
  return [];
}
|
|
44
|
+
|
|
45
|
+
/**
 * Measure how much of a response is backed by the retrieved contexts.
 *
 * A claim counts as supported when it appears verbatim (case-insensitively) in
 * the joined context text, or when at least 58% of its tokens occur there.
 *
 * @param {*} response - Generated response text.
 * @param {*} contexts - Retrieved contexts (array, single value, or nothing).
 * @returns {{score: number, supportedClaims: number, totalClaims: number}}
 *   `score` is supported/total rounded to four decimals, or 0 when the
 *   response contains no claims.
 */
function scoreFaithfulness(response, contexts) {
  const claims = splitClaims(response);
  const contextText = normalizeContexts(contexts).join('\n');
  if (claims.length === 0) {
    return { score: 0, supportedClaims: 0, totalClaims: 0 };
  }

  const loweredContext = contextText.toLowerCase();
  let supportedClaims = 0;
  for (const claim of claims) {
    const quotedVerbatim = loweredContext.includes(claim.toLowerCase());
    if (quotedVerbatim || overlapScore(claim, contextText) >= 0.58) {
      supportedClaims += 1;
    }
  }

  const totalClaims = claims.length;
  return {
    score: Number((supportedClaims / totalClaims).toFixed(4)),
    supportedClaims,
    totalClaims,
  };
}
|
|
59
|
+
|
|
60
|
+
/**
 * Measure how much of the question's vocabulary reappears in the response.
 *
 * @param {*} question - The user's question.
 * @param {*} response - The generated response.
 * @returns {{score: number, matchedQuestionTerms: string[]}} `score` is the
 *   token overlap of question against response, rounded to four decimals;
 *   `matchedQuestionTerms` lists the distinct question tokens found in the response.
 */
function scoreAnswerRelevance(question, response) {
  // Tokenize the response once and look tokens up in a Set, rather than
  // re-tokenizing the whole response for every question token.
  const responseTokens = new Set(tokenize(response));
  const matchedQuestionTerms = unique(
    tokenize(question).filter((token) => responseTokens.has(token)),
  );
  const score = overlapScore(question, response);
  return {
    score: Number(score.toFixed(4)),
    matchedQuestionTerms,
  };
}
|
|
67
|
+
|
|
68
|
+
/**
 * Rank-aware precision of the retrieved contexts.
 *
 * A context is relevant when its token overlap with the question plus
 * reference reaches 0.22 in either direction. The score is the mean of
 * precision-at-rank taken over the relevant contexts.
 *
 * @param {*} question - The user's question.
 * @param {*} contexts - Retrieved contexts in ranked order.
 * @param {*} [reference=''] - Optional reference answer added to the comparison target.
 * @returns {{score: number, relevantContexts: number, totalContexts: number}}
 *   `score` is rounded to four decimals and is 0 when nothing is relevant.
 */
function scoreContextPrecision(question, contexts, reference = '') {
  const ranked = normalizeContexts(contexts);
  const target = [question, reference].filter(Boolean).join('\n');
  if (ranked.length === 0) {
    return { score: 0, relevantContexts: 0, totalContexts: 0 };
  }

  let relevantContexts = 0;
  let precisionSum = 0;
  for (let rank = 1; rank <= ranked.length; rank += 1) {
    const context = ranked[rank - 1];
    const isRelevant = overlapScore(target, context) >= 0.22
      || overlapScore(context, target) >= 0.22;
    if (!isRelevant) continue;
    relevantContexts += 1;
    // Precision at this rank: relevant contexts seen so far over contexts seen so far.
    precisionSum += relevantContexts / rank;
  }

  const score = relevantContexts === 0 ? 0 : precisionSum / relevantContexts;
  return {
    score: Number(score.toFixed(4)),
    relevantContexts,
    totalContexts: ranked.length,
  };
}
|
|
89
|
+
|
|
90
|
+
/**
 * Score one generated answer on faithfulness, answer relevance and context
 * precision, and decide whether it passes every threshold.
 *
 * Field aliases are accepted: `question`/`user_input`, `response`/`answer`,
 * `retrievedContexts`/`contexts`/`retrieved_contexts`, `reference`/`groundTruth`.
 *
 * @param {object} testCase - The case to evaluate.
 * @param {object} [options={}]
 * @param {object} [options.thresholds] - Per-metric overrides of `DEFAULT_THRESHOLDS`.
 * @returns {{id: string, traceId: string, passed: boolean, scores: object,
 *   thresholds: object, details: object}}
 */
function evaluateGeneration(testCase, options = {}) {
  const thresholds = Object.assign({}, DEFAULT_THRESHOLDS, options.thresholds || {});

  const question = testCase.question || testCase.user_input;
  const response = testCase.response || testCase.answer;
  const reference = testCase.reference || testCase.groundTruth || '';
  const contexts = normalizeContexts(
    testCase.retrievedContexts || testCase.contexts || testCase.retrieved_contexts,
  );

  const details = {
    faithfulness: scoreFaithfulness(response, contexts),
    answerRelevance: scoreAnswerRelevance(question, response),
    contextPrecision: scoreContextPrecision(question, contexts, reference),
  };
  const scores = {
    faithfulness: details.faithfulness.score,
    answerRelevance: details.answerRelevance.score,
    contextPrecision: details.contextPrecision.score,
  };
  const passed = Object.keys(scores).every((metric) => scores[metric] >= thresholds[metric]);

  return {
    id: String(testCase.id || testCase.traceId || 'case'),
    traceId: String(testCase.traceId || testCase.id || ''),
    passed,
    scores,
    thresholds,
    details,
  };
}
|
|
122
|
+
|
|
123
|
+
/**
 * Convert test cases into rows with the keys `user_input`, `response`,
 * `retrieved_contexts` and `reference`, filling missing text fields with `''`.
 *
 * @param {Array<object>} cases - Test cases (field aliases accepted).
 * @returns {Array<{user_input: string, response: string, retrieved_contexts: string[], reference: string}>}
 */
function buildRagasCompatibleRows(cases) {
  const toRow = (testCase) => {
    const rawContexts = testCase.retrievedContexts || testCase.contexts || testCase.retrieved_contexts;
    return {
      user_input: testCase.question || testCase.user_input || '',
      response: testCase.response || testCase.answer || '',
      retrieved_contexts: normalizeContexts(rawContexts),
      reference: testCase.reference || testCase.groundTruth || '',
    };
  };
  return cases.map((testCase) => toRow(testCase));
}
|
|
131
|
+
|
|
132
|
+
/**
 * Convert test cases and their evaluation results into run records, one per
 * case, each carrying its metric scores as a `feedback` list.
 *
 * @param {Array<object>} cases - Test cases (field aliases accepted).
 * @param {Array<{scores: object}>} results - Evaluation results, index-aligned with `cases`.
 * @returns {Array<object>} Records with `id`, `name`, `inputs`, `outputs`,
 *   `metadata` and `feedback` (`[{ key, score }, ...]`). The id falls back
 *   from `traceId` to `id` to `case-<position>`.
 */
function buildLangSmithCompatibleRuns(cases, results) {
  return cases.map((testCase, index) => {
    const feedback = [];
    for (const [key, score] of Object.entries(results[index].scores)) {
      feedback.push({ key, score });
    }

    return {
      id: testCase.traceId || testCase.id || `case-${index + 1}`,
      name: 'thumbgate_async_rag_eval',
      inputs: { question: testCase.question || testCase.user_input || '' },
      outputs: { response: testCase.response || testCase.answer || '' },
      metadata: {
        evaluator: 'thumbgate-async-eval-observability',
        caseId: testCase.id || null,
      },
      feedback,
    };
  });
}
|
|
148
|
+
|
|
149
|
+
/**
 * Evaluate every case and assemble the full report: per-case results,
 * pass/fail counts, per-metric averages, and the two export datasets.
 *
 * @param {*} cases - Test cases; anything that is not an array is treated as empty.
 * @param {object} [options={}] - Forwarded to `evaluateGeneration`.
 * @returns {object} Report with `generatedAt`, `mode`, `total`, `passed`,
 *   `failed`, `passRate` (percentage, two decimals), `aggregate`,
 *   `passedThreshold` (true when nothing failed), `metrics`, `sinks`,
 *   `results`, `ragasDataset` and `langsmithRuns`.
 */
function buildEvalReport(cases, options = {}) {
  const normalizedCases = Array.isArray(cases) ? cases : [];
  const results = normalizedCases.map((testCase) => evaluateGeneration(testCase, options));

  const total = results.length;
  let passed = 0;
  for (const result of results) {
    if (result.passed) passed += 1;
  }
  const failed = total - passed;

  const metrics = ['faithfulness', 'answerRelevance', 'contextPrecision'];
  const aggregate = {};
  for (const metric of metrics) {
    aggregate[metric] = average(results.map((result) => result.scores[metric]));
  }

  return {
    generatedAt: new Date().toISOString(),
    mode: 'async-post-generation',
    total,
    passed,
    failed,
    passRate: total === 0 ? 0 : Number(((passed / total) * 100).toFixed(2)),
    aggregate,
    passedThreshold: failed === 0,
    metrics,
    sinks: {
      ci: true,
      langsmithCompatible: true,
      ragasCompatible: true,
    },
    results,
    ragasDataset: buildRagasCompatibleRows(normalizedCases),
    langsmithRuns: buildLangSmithCompatibleRuns(normalizedCases, results),
  };
}
|
|
180
|
+
|
|
181
|
+
/**
 * Arithmetic mean of the finite numbers in `values`, rounded to four decimals.
 *
 * @param {Array} values - Candidate values; anything that is not a finite number is ignored.
 * @returns {number} The rounded mean, or 0 when there are no finite numbers.
 */
function average(values) {
  let sum = 0;
  let count = 0;
  for (const value of values) {
    if (Number.isFinite(value)) {
      sum += value;
      count += 1;
    }
  }
  return count === 0 ? 0 : Number((sum / count).toFixed(4));
}
|
|
186
|
+
|
|
187
|
+
/**
 * Build the evaluation report on a later turn of the event loop and
 * optionally write it to disk as pretty-printed JSON.
 *
 * @param {*} cases - Test cases, forwarded to `buildEvalReport`.
 * @param {object} [options={}] - Forwarded to `buildEvalReport`.
 * @param {string} [options.outputPath] - When set, the report is written there
 *   (parent directories are created as needed).
 * @returns {Promise<object>} The report. The promise rejects if building or
 *   writing the report throws.
 */
async function runAsyncEvaluation(cases, options = {}) {
  const report = await new Promise((resolve, reject) => {
    setImmediate(() => {
      // An exception thrown inside a timer callback is not routed to the
      // surrounding promise; without this catch it would surface as an
      // uncaught exception and the caller's error handling would never run.
      try {
        resolve(buildEvalReport(cases, options));
      } catch (err) {
        reject(err);
      }
    });
  });

  if (options.outputPath) {
    fs.mkdirSync(path.dirname(options.outputPath), { recursive: true });
    fs.writeFileSync(options.outputPath, `${JSON.stringify(report, null, 2)}\n`);
  }
  return report;
}
|
|
197
|
+
|
|
198
|
+
/**
 * Read a JSON suite file and return its test cases.
 *
 * @param {string} inputPath - Path to a JSON file holding either an array of
 *   cases or an object with a `cases` property.
 * @returns {*} The top-level array, or the object's `cases` value, or `[]`
 *   when `cases` is missing or falsy.
 * @throws If the file cannot be read or is not valid JSON.
 */
function loadCases(inputPath) {
  const rawJson = fs.readFileSync(inputPath, 'utf8');
  const payload = JSON.parse(rawJson);
  if (Array.isArray(payload)) {
    return payload;
  }
  return payload.cases || [];
}
|
|
202
|
+
|
|
203
|
+
/**
 * Command-line entry point: load a suite, evaluate it, write the report, print
 * a JSON summary to stdout, and set exit code 1 when any case failed.
 *
 * @param {string[]} [argv=process.argv.slice(2)] - Arguments. `--input <path>`
 *   and `--output <path>` override the default suite and report locations.
 * @returns {Promise<void>}
 */
async function main(argv = process.argv.slice(2)) {
  // Value following `flag`, or `fallback` when the flag is absent.
  const readFlag = (flag, fallback) => {
    const position = argv.indexOf(flag);
    return position >= 0 ? argv[position + 1] : fallback;
  };

  const inputPath = readFlag('--input', 'bench/observability-eval-suite.json');
  const outputPath = readFlag('--output', 'proof/async-eval-observability-report.json');

  const report = await runAsyncEvaluation(loadCases(inputPath), { outputPath });

  const { total, passed, failed, passRate } = report;
  const summary = { outputPath, total, passed, failed, passRate };
  process.stdout.write(`${JSON.stringify(summary, null, 2)}\n`);

  if (!report.passedThreshold) {
    process.exitCode = 1;
  }
}
|
|
218
|
+
|
|
219
|
+
module.exports = {
|
|
220
|
+
DEFAULT_THRESHOLDS,
|
|
221
|
+
buildEvalReport,
|
|
222
|
+
buildLangSmithCompatibleRuns,
|
|
223
|
+
buildRagasCompatibleRows,
|
|
224
|
+
evaluateGeneration,
|
|
225
|
+
runAsyncEvaluation,
|
|
226
|
+
scoreAnswerRelevance,
|
|
227
|
+
scoreContextPrecision,
|
|
228
|
+
scoreFaithfulness,
|
|
229
|
+
};
|
|
230
|
+
|
|
231
|
+
if (require.main === module) {
|
|
232
|
+
main().catch((err) => {
|
|
233
|
+
console.error(err.stack || err.message);
|
|
234
|
+
process.exitCode = 1;
|
|
235
|
+
});
|
|
236
|
+
}
|