thumbgate 1.27.6 → 1.27.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/thumbgate-blocked.md +27 -0
- package/.claude/commands/thumbgate-doctor.md +30 -0
- package/.claude/commands/thumbgate-guard.md +36 -0
- package/.claude/commands/thumbgate-protect.md +30 -0
- package/.claude/commands/thumbgate-rules.md +30 -0
- package/.claude-plugin/plugin.json +1 -1
- package/.well-known/llms.txt +6 -2
- package/.well-known/mcp/server-card.json +1 -1
- package/README.md +49 -5
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/letta/README.md +41 -0
- package/adapters/letta/thumbgate-letta-adapter.js +133 -0
- package/adapters/mcp/server-stdio.js +16 -1
- package/adapters/opencode/opencode.json +1 -1
- package/adapters/policy-engine/ethicore-guardian-client.js +68 -0
- package/adapters/policy-engine/thumbgate-policy-engine-adapter.js +260 -0
- package/bench/observability-eval-suite.json +26 -0
- package/bin/cli.js +180 -2
- package/bin/postinstall.js +1 -1
- package/config/gate-templates.json +84 -0
- package/config/gates/claim-verification.json +6 -0
- package/config/gates/default.json +20 -0
- package/config/github-about.json +1 -1
- package/config/model-candidates.json +50 -0
- package/package.json +65 -25
- package/public/agent-manager.html +41 -1
- package/public/agents-cost-savings.html +1 -1
- package/public/ai-malpractice-prevention.html +2 -1
- package/public/assets/brand/github-social-preview.png +0 -0
- package/public/assets/brand/thumbgate-icon-512.png +0 -0
- package/public/assets/brand/thumbgate-icon-pro-512.png +0 -0
- package/public/assets/brand/thumbgate-icon-team-512.png +0 -0
- package/public/assets/brand/thumbgate-logo-1200x360.png +0 -0
- package/public/assets/brand/thumbgate-mark-inline.svg +15 -0
- package/public/assets/brand/thumbgate-mark-pro.svg +23 -0
- package/public/assets/brand/thumbgate-mark-team.svg +26 -0
- package/public/assets/brand/thumbgate-mark.svg +15 -0
- package/public/assets/brand/thumbgate-wordmark.svg +20 -0
- package/public/assets/claude-thumbgate-statusbar.svg +8 -0
- package/public/assets/codex-thumbgate-statusbar-test.svg +9 -0
- package/public/assets/legal-intake-control-flow.svg +66 -0
- package/public/blog.html +1 -1
- package/public/brand/thumbgate-mark.svg +15 -0
- package/public/brand/thumbgate-og.svg +16 -0
- package/public/codex-enterprise.html +1 -1
- package/public/codex-plugin.html +1 -1
- package/public/compare.html +23 -3
- package/public/dashboard.html +312 -30
- package/public/federal.html +1 -1
- package/public/guide.html +5 -4
- package/public/index.html +167 -49
- package/public/js/buyer-intent.js +672 -0
- package/public/learn.html +74 -7
- package/public/lessons.html +2 -1
- package/public/numbers.html +3 -3
- package/public/pricing.html +63 -15
- package/public/pro.html +7 -7
- package/scripts/activation-quickstart.js +187 -0
- package/scripts/agent-memory-lifecycle.js +211 -0
- package/scripts/async-eval-observability.js +236 -0
- package/scripts/auto-promote-gates.js +75 -4
- package/scripts/build-metadata.js +24 -3
- package/scripts/cli-schema.js +22 -0
- package/scripts/dashboard-chat.js +2 -1
- package/scripts/dashboard.js +8 -0
- package/scripts/export-databricks-bundle.js +5 -1
- package/scripts/export-dpo-pairs.js +7 -2
- package/scripts/feedback-aggregate.js +281 -0
- package/scripts/feedback-loop.js +34 -0
- package/scripts/filesystem-search.js +35 -10
- package/scripts/gates-engine.js +198 -6
- package/scripts/gemini-embedding-policy.js +2 -1
- package/scripts/hook-stop-anti-claim.js +227 -0
- package/scripts/hook-thumbgate-cache-updater.js +18 -2
- package/scripts/lesson-inference.js +8 -3
- package/scripts/lesson-search.js +17 -1
- package/scripts/operational-integrity.js +39 -5
- package/scripts/plausible-domain-config.js +4 -2
- package/scripts/rate-limiter.js +12 -6
- package/scripts/secret-redaction.js +166 -0
- package/scripts/security-scanner.js +100 -0
- package/scripts/self-distill-agent.js +3 -1
- package/scripts/self-harness-optimizer.js +141 -0
- package/scripts/seo-gsd.js +635 -0
- package/scripts/statusline-cache-path.js +17 -2
- package/scripts/statusline-cache-read.js +57 -0
- package/scripts/statusline-local-stats.js +9 -1
- package/scripts/statusline-meta.js +5 -2
- package/scripts/statusline.sh +13 -1
- package/scripts/sync-telemetry-from-prod.js +374 -0
- package/scripts/telemetry-analytics.js +9 -0
- package/scripts/thumbgate-search.js +85 -19
- package/scripts/tool-contract-validator.js +76 -0
- package/scripts/vector-store.js +44 -0
- package/scripts/workspace-evolver.js +62 -2
- package/src/api/server.js +715 -86
|
@@ -2,6 +2,40 @@
|
|
|
2
2
|
'use strict';
|
|
3
3
|
|
|
4
4
|
// Memory kinds recognised by normalizeMemoryType (anything else becomes 'episodic').
const MEMORY_TYPES = new Set([
  'episodic',
  'semantic',
  'procedural',
  'preference',
  'working',
]);

// Explicit scopes accepted by inferMemoryScope.
const MEMORY_SCOPES = new Set([
  'task',
  'session',
  'user',
  'project',
  'org',
]);

// Tags/tokens that make scoreMemoryDecay treat a memory as "sticky".
const HIGH_RISK_TERMS = new Set([
  'billing', 'checkout', 'compliance', 'credential', 'data-loss',
  'deploy', 'deployment', 'git', 'payment', 'production',
  'release', 'secret', 'security', 'stripe', 'verification',
]);

// [display name, detection pattern, entity type] triples used by extractMemoryEntities.
const KNOWN_ENTITY_PATTERNS = [
  // Agents
  ['Claude Code', /\bclaude\s+code\b/i, 'agent'],
  ['Codex', /\bcodex\b/i, 'agent'],
  ['Cursor', /\bcursor\b/i, 'agent'],
  ['Gemini CLI', /\bgemini\s+cli\b/i, 'agent'],
  // Protocols
  ['MCP', /\bmcp\b/i, 'protocol'],
  // Services
  ['Stripe', /\bstripe\b/i, 'service'],
  ['GitHub', /\bgithub\b|\bgh\s+/i, 'service'],
  ['Railway', /\brailway\b/i, 'service'],
  ['Plausible', /\bplausible\b/i, 'service'],
  ['PostHog', /\bposthog\b/i, 'service'],
  // Storage
  ['SQLite', /\bsqlite\b|\bfts5\b/i, 'storage'],
  ['LanceDB', /\blancedb\b/i, 'storage'],
  // Runtimes
  ['Docker', /\bdocker\b/i, 'runtime'],
  ['npm', /\bnpm\b|\bnpx\b/i, 'runtime'],
];
|
|
5
39
|
|
|
6
40
|
function normalizeText(value) {
|
|
7
41
|
if (value === undefined || value === null) return '';
|
|
@@ -13,6 +47,178 @@ function normalizeMemoryType(value) {
|
|
|
13
47
|
return MEMORY_TYPES.has(normalized) ? normalized : 'episodic';
|
|
14
48
|
}
|
|
15
49
|
|
|
50
|
+
/**
 * Lowercase `value` and split it into word-like tokens.
 *
 * Characters `_ . : / -` are kept inside tokens so paths, flags and
 * identifiers such as `scripts/foo.js` or `data-loss` survive as one token.
 *
 * @param {*} value - Anything normalizeText accepts.
 * @returns {string[]} Non-empty lowercase tokens.
 */
function tokenize(value) {
  return normalizeText(value)
    .toLowerCase()
    .split(/[^a-z0-9_.:/-]+/)
    // `.` and `:` are not separators, so sentence punctuation used to stay
    // glued to the preceding word ("production." / "stripe:"), which never
    // equals the bare term. Strip it from the end of each token.
    .map((token) => token.replace(/[.:]+$/, ''))
    .filter(Boolean);
}
|
|
56
|
+
|
|
57
|
+
/**
 * Drop entities whose name (compared case-insensitively after normalizeText)
 * is empty or has already been seen; the first occurrence wins.
 *
 * @param {Array<{name: *}>} entities
 * @returns {Array} Entities in original order without name duplicates.
 */
function uniqueByName(entities) {
  const seenKeys = new Set();
  return entities.reduce((kept, entity) => {
    const key = normalizeText(entity.name).toLowerCase();
    if (key && !seenKeys.has(key)) {
      seenKeys.add(key);
      kept.push(entity);
    }
    return kept;
  }, []);
}
|
|
66
|
+
|
|
67
|
+
/**
 * Join every free-text field of a memory into one space-separated string.
 * Falsy fields are skipped; array tags are joined with spaces first.
 *
 * @param {object} [memory]
 * @returns {string}
 */
function collectMemoryText(memory = {}) {
  const tagText = Array.isArray(memory.tags) ? memory.tags.join(' ') : memory.tags;
  const parts = [
    memory.title,
    memory.content,
    memory.context,
    memory.whatWentWrong,
    memory.whatToChange,
    memory.whatWorked,
    memory.domain,
    memory.skill,
    tagText,
  ];
  return parts.filter((part) => Boolean(part)).join(' ');
}
|
|
80
|
+
|
|
81
|
+
/**
 * Extract named entities from a memory's text.
 *
 * Sources, in priority order:
 *   1. KNOWN_ENTITY_PATTERNS hits (agents, services, storage, ...).
 *   2. Backtick-quoted spans: CLI invocations become `command`, other spans
 *      containing `.`, `/` or `-` become `path`.
 *   3. Up to 8 bare slash-separated paths.
 *
 * @param {object} [memory]
 * @returns {Array<{name: string, type: string}>} At most 16 entities, unique by name.
 */
function extractMemoryEntities(memory = {}) {
  const text = collectMemoryText(memory);

  const knownEntities = KNOWN_ENTITY_PATTERNS
    .filter(([, pattern]) => pattern.test(text))
    .map(([name, , type]) => ({ name, type }));

  const quotedEntities = [];
  const quotedSpans = text.match(/`([^`]+)`/g) || [];
  quotedSpans.forEach((span) => {
    // Strip the surrounding backticks.
    const command = span.slice(1, -1).trim();
    const looksLikeCli = /^(git|npm|npx|node|gh|curl|docker|python|pytest|stripe)\b/i.test(command);
    if (looksLikeCli) {
      quotedEntities.push({ name: command, type: 'command' });
      return;
    }
    if (/[./-]/.test(command)) {
      quotedEntities.push({ name: command, type: 'path' });
    }
  });

  const barePaths = text.match(/\b(?:[a-z0-9_-]+\/)+[a-z0-9_.-]+\b/gi) || [];
  const pathEntities = barePaths
    .slice(0, 8)
    .map((filePath) => ({ name: filePath, type: 'path' }));

  return uniqueByName([...knownEntities, ...quotedEntities, ...pathEntities]).slice(0, 16);
}
|
|
106
|
+
|
|
107
|
+
/**
 * Decide which scope a memory belongs to.
 *
 * An explicit `scope` / `memoryScope` that is a known scope wins. Otherwise
 * the first matching heuristic (tag or keyword) in the order
 * user -> org -> project -> session applies, defaulting to 'task'.
 *
 * @param {object} [memory]
 * @returns {string} One of the MEMORY_SCOPES values.
 */
function inferMemoryScope(memory = {}) {
  const declared = normalizeText(memory.scope || memory.memoryScope).toLowerCase();
  if (MEMORY_SCOPES.has(declared)) return declared;

  const text = collectMemoryText(memory).toLowerCase();
  const tagList = Array.isArray(memory.tags)
    ? memory.tags.map((tag) => normalizeText(tag).toLowerCase())
    : [];
  const tagSet = new Set(tagList);

  // Ordered rules: [scope, tags that imply it, keywords that imply it].
  const rules = [
    ['user', ['preference'], /\b(prefer|style|tone|my preference|user preference)\b/],
    ['org', ['org', 'team'], /\b(enterprise|seat|team|shared|org|compliance|policy|approval)\b/],
    ['project', ['repo', 'project', 'release', 'deployment'], /\b(repo|repository|branch|ci|pull request|github|deploy|production|release|publish)\b/],
    ['session', ['session'], /\b(this session|current session|today|right now)\b/],
  ];

  for (const [scope, scopeTags, keywords] of rules) {
    const taggedForScope = scopeTags.some((tag) => tagSet.has(tag));
    if (taggedForScope || keywords.test(text)) return scope;
  }
  return 'task';
}
|
|
121
|
+
|
|
122
|
+
/**
 * Classify how "fresh" a memory is for recall purposes.
 *
 * - High-risk memories (high-risk tag or text token, or importance
 *   `critical`/`high`) are always `sticky`.
 * - Memories without a parsable timestamp need `review`.
 * - Otherwise age decides: >180 days `archive_candidate`, >60 days `review`,
 *   else `active`.
 *
 * @param {object} [memory]
 * @param {object} [options]
 * @param {string|number|Date} [options.now] - Reference "now"; defaults to the
 *   current time. An unparsable value also falls back to the current time.
 * @returns {{state: string, ageDays: (number|null), score: number, reason: string}}
 */
function scoreMemoryDecay(memory = {}, options = {}) {
  // An unparsable `options.now` used to yield NaN here, which made `ageDays`
  // NaN, failed every age comparison below and reported the memory as
  // `active`. Fall back to the wall clock instead.
  const requestedNowMs = options.now ? new Date(options.now).getTime() : NaN;
  const nowMs = Number.isFinite(requestedNowMs) ? requestedNowMs : Date.now();
  const timestampMs = memory.timestamp ? new Date(memory.timestamp).getTime() : NaN;
  const ageDays = Number.isFinite(timestampMs)
    ? Math.max(0, (nowMs - timestampMs) / (1000 * 60 * 60 * 24))
    : null;
  const textTokens = new Set(tokenize(collectMemoryText(memory)));
  const tags = Array.isArray(memory.tags) ? memory.tags.map((tag) => normalizeText(tag).toLowerCase()) : [];
  const highRisk = tags.some((tag) => HIGH_RISK_TERMS.has(tag))
    || [...textTokens].some((token) => HIGH_RISK_TERMS.has(token))
    || ['critical', 'high'].includes(normalizeText(memory.importance).toLowerCase());

  if (highRisk) {
    return {
      state: 'sticky',
      ageDays,
      score: 1,
      reason: 'high-risk memories stay retrievable until explicitly retired',
    };
  }
  if (ageDays === null) {
    return {
      state: 'review',
      ageDays,
      score: 0.6,
      reason: 'memory has no timestamp, so it needs review before durable promotion',
    };
  }
  if (ageDays > 180) {
    return {
      state: 'archive_candidate',
      ageDays,
      score: 0.2,
      reason: 'old low-risk memory should be consolidated or archived',
    };
  }
  if (ageDays > 60) {
    return {
      state: 'review',
      ageDays,
      score: 0.55,
      reason: 'older low-risk memory should be refreshed before it dominates recall',
    };
  }
  return {
    state: 'active',
    ageDays,
    score: 0.85,
    reason: 'recent memory remains eligible for recall',
  };
}
|
|
173
|
+
|
|
174
|
+
/**
 * Score how well a memory matches a query by combining four signals:
 * token overlap, a whole-phrase bonus, shared entities and a lifecycle
 * adjustment taken from scoreMemoryDecay.
 *
 * @param {*} query - Free-text query.
 * @param {object} [memory]
 * @param {object} [options] - Forwarded to scoreMemoryDecay.
 * @returns {{score: number, lexicalScore: number, entityScore: number,
 *   matchedEntities: Array, decayState: string}}
 */
function scoreHybridMemoryMatch(query, memory = {}, options = {}) {
  const rawMemoryText = collectMemoryText(memory);
  const queryTokens = new Set(tokenize(query));
  const memoryTokens = new Set(tokenize(rawMemoryText));
  const queryText = normalizeText(query).toLowerCase();
  const memoryText = rawMemoryText.toLowerCase();
  const memoryEntities = extractMemoryEntities(memory);
  const queryEntityNames = extractMemoryEntities({ content: query })
    .map((entity) => entity.name.toLowerCase());

  // Fraction of distinct query tokens that also occur in the memory.
  const lexicalMatches = [...queryTokens].filter((token) => memoryTokens.has(token)).length;
  const lexicalScore = queryTokens.size > 0 ? lexicalMatches / queryTokens.size : 0;

  // Flat bonus when the whole query appears verbatim in the memory text.
  const phraseScore = queryText && memoryText.includes(queryText) ? 0.35 : 0;

  // Fraction of query entities that the memory mentions too.
  const entityMatches = memoryEntities.filter(
    (entity) => queryEntityNames.includes(entity.name.toLowerCase())
  );
  const entityScore = queryEntityNames.length > 0
    ? entityMatches.length / queryEntityNames.length
    : 0;

  const decay = scoreMemoryDecay(memory, options);
  let lifecycleScore = 0;
  if (decay.state === 'archive_candidate') {
    lifecycleScore = -0.15;
  } else if (decay.state === 'sticky') {
    lifecycleScore = 0.12;
  }

  const score = lexicalScore + phraseScore + (entityScore * 0.45) + lifecycleScore;

  return {
    score: Number(Math.max(0, score).toFixed(4)),
    lexicalScore: Number(lexicalScore.toFixed(4)),
    entityScore: Number(entityScore.toFixed(4)),
    matchedEntities: entityMatches,
    decayState: decay.state,
  };
}
|
|
202
|
+
|
|
203
|
+
/**
 * Assemble the lifecycle summary for one memory: inferred scope, extracted
 * entities, decay classification and retrieval hints for `options.query`.
 *
 * @param {object} [memory]
 * @param {object} [options] - `query` plus anything scoreMemoryDecay reads.
 * @returns {{scope: string, entities: Array, decay: object, retrievalHints: object}}
 */
function buildMemoryLifecycleView(memory = {}, options = {}) {
  const scope = inferMemoryScope(memory);
  const entities = extractMemoryEntities(memory);
  const decay = scoreMemoryDecay(memory, options);
  const {
    score: hybridScore,
    lexicalScore,
    entityScore,
    matchedEntities,
  } = scoreHybridMemoryMatch(options.query || '', memory, options);

  const retrievalHints = { hybridScore, lexicalScore, entityScore, matchedEntities };
  return { scope, entities, decay, retrievalHints };
}
|
|
221
|
+
|
|
16
222
|
function buildMemoryLifecyclePolicy(input = {}) {
|
|
17
223
|
return {
|
|
18
224
|
generatedAt: normalizeText(input.generatedAt) || new Date().toISOString(),
|
|
@@ -91,6 +297,11 @@ function evaluateMemoryPromotion(memory = {}, policy = buildMemoryLifecyclePolic
|
|
|
91
297
|
|
|
92
298
|
module.exports = {
|
|
93
299
|
buildMemoryLifecyclePolicy,
|
|
300
|
+
buildMemoryLifecycleView,
|
|
94
301
|
evaluateMemoryPromotion,
|
|
302
|
+
extractMemoryEntities,
|
|
303
|
+
inferMemoryScope,
|
|
95
304
|
normalizeMemoryType,
|
|
305
|
+
scoreHybridMemoryMatch,
|
|
306
|
+
scoreMemoryDecay,
|
|
96
307
|
};
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
const fs = require('node:fs');
|
|
5
|
+
const path = require('node:path');
|
|
6
|
+
|
|
7
|
+
// Minimum per-metric scores a case must reach to pass; evaluateGeneration
// lets callers override any of them through `options.thresholds`.
const DEFAULT_THRESHOLDS = {
  faithfulness: 0.72,
  answerRelevance: 0.45,
  contextPrecision: 0.5,
};
|
|
12
|
+
|
|
13
|
+
/**
 * Split text into lowercase alphanumeric words, keeping only words of three
 * or more characters.
 *
 * @param {*} value - Falsy values are treated as an empty string.
 * @returns {string[]}
 */
function tokenize(value) {
  const lowered = String(value || '').toLowerCase();
  const words = lowered.split(/[^a-z0-9]+/);
  return words.filter((word) => word.length >= 3);
}
|
|
19
|
+
|
|
20
|
+
/**
 * Return the truthy entries of `values` without duplicates, in first-seen order.
 *
 * @param {Array} values
 * @returns {Array}
 */
function unique(values) {
  const seen = new Set();
  for (const value of values) {
    if (value) seen.add(value);
  }
  return Array.from(seen);
}
|
|
23
|
+
|
|
24
|
+
/**
 * Fraction of the distinct tokens in `left` that also occur in `right`.
 * The measure is asymmetric: only `left` tokens are counted.
 *
 * @param {*} left
 * @param {*} right
 * @returns {number} Value in [0, 1]; 0 when `left` has no tokens.
 */
function overlapScore(left, right) {
  const candidates = unique(tokenize(left));
  const vocabulary = new Set(tokenize(right));
  if (candidates.length === 0) return 0;

  let hits = 0;
  for (const token of candidates) {
    if (vocabulary.has(token)) hits += 1;
  }
  return hits / candidates.length;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Break a response into individual claims (roughly: sentences or lines).
 *
 * Sentence terminators are removed from every claim, including the last one.
 * The previous pattern only split on a terminator followed by whitespace, so
 * the final claim kept its trailing `.`/`!`/`?` while earlier claims lost
 * theirs, making verbatim substring checks against the context inconsistent.
 *
 * @param {*} response - Falsy values are treated as an empty string.
 * @returns {string[]} Trimmed, non-empty claims.
 */
function splitClaims(response) {
  return String(response || '')
    // A run of terminators counts as a boundary when followed by whitespace
    // or the end of the text; dots inside tokens such as "v1.2" do not.
    .split(/[.!?]+(?:\s+|$)|\n+/)
    .map((claim) => claim.trim())
    .filter((claim) => claim.length > 0);
}
|
|
38
|
+
|
|
39
|
+
/**
 * Coerce the retrieved-context input into an array of non-empty strings.
 *
 * @param {*} contexts - An array of contexts, a single context, or nothing.
 * @returns {string[]}
 */
function normalizeContexts(contexts) {
  if (Array.isArray(contexts)) {
    return contexts
      // Drop missing entries before stringifying: String(null) and
      // String(undefined) would otherwise become the bogus contexts
      // "null" / "undefined" and take part in scoring.
      .filter((context) => context !== null && context !== undefined)
      .map(String)
      .filter(Boolean);
  }
  if (contexts) return [String(contexts)];
  return [];
}
|
|
44
|
+
|
|
45
|
+
/**
 * Share of response claims that the retrieved contexts support.
 *
 * A claim counts as supported when it appears verbatim (case-insensitively)
 * in the joined contexts, or when at least 58% of its tokens do.
 *
 * @param {*} response
 * @param {*} contexts
 * @returns {{score: number, supportedClaims: number, totalClaims: number}}
 */
function scoreFaithfulness(response, contexts) {
  const claims = splitClaims(response);
  const contextText = normalizeContexts(contexts).join('\n');
  const totalClaims = claims.length;
  if (totalClaims === 0) return { score: 0, supportedClaims: 0, totalClaims: 0 };

  const haystack = contextText.toLowerCase();
  let supportedClaims = 0;
  for (const claim of claims) {
    const quotedVerbatim = haystack.includes(claim.toLowerCase());
    if (quotedVerbatim || overlapScore(claim, contextText) >= 0.58) {
      supportedClaims += 1;
    }
  }

  return {
    score: Number((supportedClaims / totalClaims).toFixed(4)),
    supportedClaims,
    totalClaims,
  };
}
|
|
59
|
+
|
|
60
|
+
/**
 * Measure how much of the question's vocabulary the response addresses.
 *
 * @param {*} question
 * @param {*} response
 * @returns {{score: number, matchedQuestionTerms: string[]}} `score` is the
 *   fraction of distinct question tokens found in the response;
 *   `matchedQuestionTerms` lists those tokens without duplicates.
 */
function scoreAnswerRelevance(question, response) {
  const score = overlapScore(question, response);
  // Tokenize the response once. The previous code re-tokenized it for every
  // question token and then scanned the array linearly.
  const responseTokens = new Set(tokenize(response));
  const matchedQuestionTerms = unique(
    tokenize(question).filter((token) => responseTokens.has(token))
  );
  return {
    score: Number(score.toFixed(4)),
    matchedQuestionTerms,
  };
}
|
|
67
|
+
|
|
68
|
+
/**
 * Rank-aware precision of the retrieved contexts.
 *
 * A context is relevant when it shares at least 22% token overlap with the
 * question plus reference, measured in either direction. The score is the
 * mean of precision@k taken at the rank of each relevant context.
 *
 * @param {*} question
 * @param {*} contexts
 * @param {string} [reference]
 * @returns {{score: number, relevantContexts: number, totalContexts: number}}
 */
function scoreContextPrecision(question, contexts, reference = '') {
  const normalizedContexts = normalizeContexts(contexts);
  const target = [question, reference].filter(Boolean).join('\n');
  const totalContexts = normalizedContexts.length;
  if (totalContexts === 0) return { score: 0, relevantContexts: 0, totalContexts: 0 };

  let precisionSum = 0;
  let relevantContexts = 0;
  for (let rank = 1; rank <= totalContexts; rank += 1) {
    const context = normalizedContexts[rank - 1];
    const relevant = overlapScore(target, context) >= 0.22
      || overlapScore(context, target) >= 0.22;
    if (!relevant) continue;
    relevantContexts += 1;
    precisionSum += relevantContexts / rank;
  }

  const score = relevantContexts === 0 ? 0 : precisionSum / relevantContexts;
  return {
    score: Number(score.toFixed(4)),
    relevantContexts,
    totalContexts,
  };
}
|
|
89
|
+
|
|
90
|
+
/**
 * Score one generation on faithfulness, answer relevance and context
 * precision, then compare each score with its threshold.
 *
 * Accepts both camelCase and RAGAS-style field names (`question` /
 * `user_input`, `response` / `answer`, `retrievedContexts` / `contexts` /
 * `retrieved_contexts`, `reference` / `groundTruth`).
 *
 * @param {object} testCase
 * @param {object} [options]
 * @param {object} [options.thresholds] - Overrides for DEFAULT_THRESHOLDS.
 * @returns {{id: string, traceId: string, passed: boolean, scores: object,
 *   thresholds: object, details: object}}
 */
function evaluateGeneration(testCase, options = {}) {
  const thresholds = { ...DEFAULT_THRESHOLDS, ...(options.thresholds || {}) };

  const question = testCase.question || testCase.user_input;
  const response = testCase.response || testCase.answer;
  const reference = testCase.reference || testCase.groundTruth || '';
  const contexts = normalizeContexts(
    testCase.retrievedContexts || testCase.contexts || testCase.retrieved_contexts
  );

  const faithfulness = scoreFaithfulness(response, contexts);
  const answerRelevance = scoreAnswerRelevance(question, response);
  const contextPrecision = scoreContextPrecision(question, contexts, reference);

  const scores = {
    faithfulness: faithfulness.score,
    answerRelevance: answerRelevance.score,
    contextPrecision: contextPrecision.score,
  };
  // Every metric must reach its own threshold.
  const passed = Object.keys(scores).every((metric) => scores[metric] >= thresholds[metric]);

  const details = { faithfulness, answerRelevance, contextPrecision };
  return {
    id: String(testCase.id || testCase.traceId || 'case'),
    traceId: String(testCase.traceId || testCase.id || ''),
    passed,
    scores,
    thresholds,
    details,
  };
}
|
|
122
|
+
|
|
123
|
+
/**
 * Project test cases onto the row layout this module labels "RAGAS
 * compatible": `user_input`, `response`, `retrieved_contexts`, `reference`.
 *
 * @param {object[]} cases
 * @returns {object[]} One row per case, in the same order.
 */
function buildRagasCompatibleRows(cases) {
  const toRow = (testCase) => {
    const rawContexts = testCase.retrievedContexts || testCase.contexts || testCase.retrieved_contexts;
    return {
      user_input: testCase.question || testCase.user_input || '',
      response: testCase.response || testCase.answer || '',
      retrieved_contexts: normalizeContexts(rawContexts),
      reference: testCase.reference || testCase.groundTruth || '',
    };
  };
  return cases.map(toRow);
}
|
|
131
|
+
|
|
132
|
+
/**
 * Pair each test case with its evaluation result in the run layout this
 * module labels "LangSmith compatible".
 *
 * @param {object[]} cases
 * @param {object[]} [results] - Evaluation results aligned with `cases` by
 *   index. A missing result, or one without `scores`, yields an empty
 *   `feedback` list instead of throwing.
 * @returns {object[]} One run per case, in the same order.
 */
function buildLangSmithCompatibleRuns(cases, results = []) {
  return cases.map((testCase, index) => {
    // The function is exported, so `results` may be shorter than `cases`
    // (or absent); guard the lookup rather than crash on `.scores`.
    const result = Array.isArray(results) ? results[index] : undefined;
    const scores = (result && result.scores) || {};
    return {
      id: testCase.traceId || testCase.id || `case-${index + 1}`,
      name: 'thumbgate_async_rag_eval',
      inputs: { question: testCase.question || testCase.user_input || '' },
      outputs: { response: testCase.response || testCase.answer || '' },
      metadata: {
        evaluator: 'thumbgate-async-eval-observability',
        caseId: testCase.id || null,
      },
      feedback: Object.entries(scores).map(([key, score]) => ({
        key,
        score,
      })),
    };
  });
}
|
|
148
|
+
|
|
149
|
+
/**
 * Evaluate every case and assemble the full report: totals, pass rate,
 * per-metric averages, per-case results and the two export datasets.
 *
 * @param {object[]} cases - Anything that is not an array is treated as empty.
 * @param {object} [options] - Forwarded to evaluateGeneration.
 * @returns {object} The report; `passedThreshold` is true when no case failed.
 */
function buildEvalReport(cases, options = {}) {
  const normalizedCases = Array.isArray(cases) ? cases : [];
  const results = normalizedCases.map((testCase) => evaluateGeneration(testCase, options));

  const total = results.length;
  const passed = results.reduce((count, result) => (result.passed ? count + 1 : count), 0);
  const failed = total - passed;

  const metricAverage = (metric) => average(results.map((result) => result.scores[metric]));
  const aggregate = {
    faithfulness: metricAverage('faithfulness'),
    answerRelevance: metricAverage('answerRelevance'),
    contextPrecision: metricAverage('contextPrecision'),
  };

  // Percentage with two decimals; an empty suite reports 0.
  const passRate = total === 0 ? 0 : Number(((passed / total) * 100).toFixed(2));

  return {
    generatedAt: new Date().toISOString(),
    mode: 'async-post-generation',
    total,
    passed,
    failed,
    passRate,
    aggregate,
    passedThreshold: failed === 0,
    metrics: ['faithfulness', 'answerRelevance', 'contextPrecision'],
    sinks: {
      ci: true,
      langsmithCompatible: true,
      ragasCompatible: true,
    },
    results,
    ragasDataset: buildRagasCompatibleRows(normalizedCases),
    langsmithRuns: buildLangSmithCompatibleRuns(normalizedCases, results),
  };
}
|
|
180
|
+
|
|
181
|
+
/**
 * Arithmetic mean of the finite numbers in `values`, rounded to 4 decimals.
 * Non-numbers, NaN and infinities are ignored.
 *
 * @param {Array} values
 * @returns {number} 0 when there is nothing to average.
 */
function average(values) {
  let total = 0;
  let count = 0;
  for (const value of values) {
    if (Number.isFinite(value)) {
      total += value;
      count += 1;
    }
  }
  if (count === 0) return 0;
  return Number((total / count).toFixed(4));
}
|
|
186
|
+
|
|
187
|
+
/**
 * Build the evaluation report on a later event-loop turn and optionally
 * persist it as pretty-printed JSON.
 *
 * @param {object[]} cases
 * @param {object} [options] - Forwarded to buildEvalReport.
 * @param {string} [options.outputPath] - When set, the report is written here
 *   (parent directories are created as needed).
 * @returns {Promise<object>} The report. Rejects if building the report throws.
 */
async function runAsyncEvaluation(cases, options = {}) {
  const report = await new Promise((resolve, reject) => {
    setImmediate(() => {
      // A throw inside this callback happens outside the promise executor,
      // so it would surface as an uncaught exception rather than a rejection.
      // Route it through `reject` so callers can handle it.
      try {
        resolve(buildEvalReport(cases, options));
      } catch (err) {
        reject(err);
      }
    });
  });
  if (options.outputPath) {
    fs.mkdirSync(path.dirname(options.outputPath), { recursive: true });
    fs.writeFileSync(options.outputPath, `${JSON.stringify(report, null, 2)}\n`);
  }
  return report;
}
|
|
197
|
+
|
|
198
|
+
/**
 * Read a JSON suite from disk. The file may be a bare array of cases or an
 * object with a `cases` property.
 *
 * @param {string} inputPath
 * @returns {*} The cases; `[]` when the object has no truthy `cases`.
 */
function loadCases(inputPath) {
  const raw = fs.readFileSync(inputPath, 'utf8');
  const payload = JSON.parse(raw);
  if (Array.isArray(payload)) return payload;
  return payload.cases || [];
}
|
|
202
|
+
|
|
203
|
+
/**
 * CLI entry point: evaluate the suite named by `--input`, write the report to
 * `--output`, print a short JSON summary to stdout, and set a non-zero exit
 * code when any case failed.
 *
 * @param {string[]} [argv] - Arguments after the script name.
 * @returns {Promise<void>}
 */
async function main(argv = process.argv.slice(2)) {
  // Value following `flag`, or `fallback` when the flag is absent.
  const flagValue = (flag, fallback) => {
    const position = argv.indexOf(flag);
    return position >= 0 ? argv[position + 1] : fallback;
  };

  const inputPath = flagValue('--input', 'bench/observability-eval-suite.json');
  const outputPath = flagValue('--output', 'proof/async-eval-observability-report.json');

  const report = await runAsyncEvaluation(loadCases(inputPath), { outputPath });

  const summary = {
    outputPath,
    total: report.total,
    passed: report.passed,
    failed: report.failed,
    passRate: report.passRate,
  };
  process.stdout.write(`${JSON.stringify(summary, null, 2)}\n`);

  if (!report.passedThreshold) process.exitCode = 1;
}
|
|
218
|
+
|
|
219
|
+
module.exports = {
|
|
220
|
+
DEFAULT_THRESHOLDS,
|
|
221
|
+
buildEvalReport,
|
|
222
|
+
buildLangSmithCompatibleRuns,
|
|
223
|
+
buildRagasCompatibleRows,
|
|
224
|
+
evaluateGeneration,
|
|
225
|
+
runAsyncEvaluation,
|
|
226
|
+
scoreAnswerRelevance,
|
|
227
|
+
scoreContextPrecision,
|
|
228
|
+
scoreFaithfulness,
|
|
229
|
+
};
|
|
230
|
+
|
|
231
|
+
// Run the CLI only when this file is executed directly, not when required.
if (require.main === module) {
  // Report the failure and mark the process as failed without a hard exit.
  const reportFailure = (err) => {
    console.error(err.stack || err.message);
    process.exitCode = 1;
  };
  main().catch(reportFailure);
}
|
|
@@ -58,6 +58,47 @@ function readJSONL(filePath) {
|
|
|
58
58
|
}).filter(Boolean);
|
|
59
59
|
}
|
|
60
60
|
|
|
61
|
+
// --- Self-Harness stage 3: regression-gated promotion -----------------------
|
|
62
|
+
// Inspired by "Self-Harness: Harnesses That Improve Themselves" (arXiv 2606.09498).
|
|
63
|
+
// Stages 1-2 (weakness mining -> rule extraction) already exist via lesson
|
|
64
|
+
// inference + this promoter. Stage 3 — accept a harness change only after
|
|
65
|
+
// regression-testing it does not degrade behavior — was missing: a noisy 3x
|
|
66
|
+
// capture could hard-block an over-broad pattern with no check that it wouldn't
|
|
67
|
+
// have wrongly blocked actions that were previously ALLOWED. This replays a
|
|
68
|
+
// candidate BLOCK rule against the audit trail's prior `allow` decisions; if it
|
|
69
|
+
// would have blocked safe actions, the caller quarantines it to `warn` instead.
|
|
70
|
+
const REGRESSION_FALSE_BLOCK_LIMIT = 0; // any prior safe action it would block => quarantine
|
|
71
|
+
|
|
72
|
+
/**
 * Path of the audit trail: `audit-trail.jsonl` in the same directory as the
 * feedback log.
 *
 * @returns {string}
 */
function getAuditTrailPath() {
  const feedbackDir = path.dirname(getFeedbackLogPath());
  return path.join(feedbackDir, 'audit-trail.jsonl');
}
|
|
75
|
+
|
|
76
|
+
// Returns { falseBlocks, allowSampleSize } or null when there is no history /
|
|
77
|
+
// matcher available — in which case the caller promotes as usual (fail-open to
|
|
78
|
+
// existing behavior, since regression gating is an enhancement, not a hard gate).
|
|
79
|
+
function regressionCheck(gate, options = {}) {
  const trailPath = options.auditTrailPath || getAuditTrailPath();
  const trail = readJSONL(trailPath);
  if (!trail.length) return null;

  // Lazy-require to avoid the gates-engine <-> auto-promote-gates require cycle.
  let matchesGate;
  try {
    matchesGate = require('./gates-engine').matchesGate;
  } catch {
    return null;
  }
  if (typeof matchesGate !== 'function') return null;

  // Only previously allowed tool calls can be "false blocks".
  const priorAllows = trail.filter(
    (entry) => entry && entry.decision === 'allow' && entry.toolName
  );
  if (!priorAllows.length) return null;

  const falseBlocks = priorAllows.reduce((count, entry) => {
    try {
      return matchesGate(gate, entry.toolName, entry.toolInput || {}) ? count + 1 : count;
    } catch {
      // A bad pattern/entry never counts as a false block.
      return count;
    }
  }, 0);

  return { falseBlocks, allowSampleSize: priorAllows.length };
}
|
|
97
|
+
|
|
98
|
+
/**
 * Non-throwing wrapper around regressionCheck: any error is reported as
 * `null`, the same value regressionCheck uses for "no verdict available".
 *
 * @param {object} gate
 * @param {object} [options]
 * @returns {({falseBlocks: number, allowSampleSize: number}|null)}
 */
function safeRegressionCheck(gate, options) {
  let verdict = null;
  try {
    verdict = regressionCheck(gate, options);
  } catch {
    verdict = null;
  }
  return verdict;
}
|
|
101
|
+
|
|
61
102
|
function loadAutoGates() {
|
|
62
103
|
const autoGatesPath = getAutoGatesPath();
|
|
63
104
|
if (!fs.existsSync(autoGatesPath)) {
|
|
@@ -358,9 +399,16 @@ function promote(feedbackLogPath, options) {
|
|
|
358
399
|
const existing = data.gates[existingIdx];
|
|
359
400
|
const newAction = group.count >= BLOCK_THRESHOLD ? 'block' : 'warn';
|
|
360
401
|
if (existing.action !== newAction && newAction === 'block') {
|
|
361
|
-
//
|
|
362
|
-
|
|
363
|
-
|
|
402
|
+
// Self-Harness stage 3: regression-test before upgrading warn -> block.
|
|
403
|
+
const regression = opts.skipRegression ? null : safeRegressionCheck(buildGateRule(group, 'block'), opts);
|
|
404
|
+
if (regression && regression.falseBlocks > REGRESSION_FALSE_BLOCK_LIMIT) {
|
|
405
|
+
// Would block prior safe actions — hold at warn instead of upgrading.
|
|
406
|
+
promotions.push({ type: 'upgrade-quarantined', gateId, from: existing.action, occurrences: group.count, falseBlocks: regression.falseBlocks });
|
|
407
|
+
} else {
|
|
408
|
+
// Upgrade from warn to block
|
|
409
|
+
data.gates[existingIdx] = { ...existing, action: 'block', severity: 'critical', occurrences: group.count, upgradedAt: new Date().toISOString() };
|
|
410
|
+
promotions.push({ type: 'upgrade', gateId, from: existing.action, to: 'block', occurrences: group.count });
|
|
411
|
+
}
|
|
364
412
|
}
|
|
365
413
|
// Update occurrence count even if no action change
|
|
366
414
|
data.gates[existingIdx].occurrences = group.count;
|
|
@@ -370,6 +418,20 @@ function promote(feedbackLogPath, options) {
|
|
|
370
418
|
// New gate — respect explicit gateAction override (e.g. 'approve' for human-approval rules)
|
|
371
419
|
const gate = buildGateRule(group, opts.gateAction);
|
|
372
420
|
|
|
421
|
+
// Self-Harness stage 3: before a feedback rule goes live as a hard block,
|
|
422
|
+
// regression-test it against prior allowed actions. If it would have blocked
|
|
423
|
+
// safe actions, quarantine it to `warn` instead of `block`.
|
|
424
|
+
let regression = null;
|
|
425
|
+
if (gate.action === 'block' && !opts.gateAction && !opts.skipRegression) {
|
|
426
|
+
regression = safeRegressionCheck(gate, opts);
|
|
427
|
+
if (regression && regression.falseBlocks > REGRESSION_FALSE_BLOCK_LIMIT) {
|
|
428
|
+
gate.action = 'warn';
|
|
429
|
+
gate.severity = 'medium';
|
|
430
|
+
gate.quarantined = true;
|
|
431
|
+
gate.regression = regression;
|
|
432
|
+
}
|
|
433
|
+
}
|
|
434
|
+
|
|
373
435
|
// Enforce max limit — rotate oldest
|
|
374
436
|
if (data.gates.length >= MAX_AUTO_GATES) {
|
|
375
437
|
const removed = data.gates.shift();
|
|
@@ -377,7 +439,13 @@ function promote(feedbackLogPath, options) {
|
|
|
377
439
|
}
|
|
378
440
|
|
|
379
441
|
data.gates.push(gate);
|
|
380
|
-
promotions.push({
|
|
442
|
+
promotions.push({
|
|
443
|
+
type: gate.quarantined ? 'new-quarantined' : 'new',
|
|
444
|
+
gateId: gate.id,
|
|
445
|
+
action: gate.action,
|
|
446
|
+
occurrences: group.count,
|
|
447
|
+
...(gate.quarantined ? { falseBlocks: regression.falseBlocks, allowSampleSize: regression.allowSampleSize } : {}),
|
|
448
|
+
});
|
|
381
449
|
}
|
|
382
450
|
|
|
383
451
|
// Log promotions
|
|
@@ -438,6 +506,9 @@ module.exports = {
|
|
|
438
506
|
groupNegativeFeedback,
|
|
439
507
|
patternToGateId,
|
|
440
508
|
buildGateRule,
|
|
509
|
+
regressionCheck,
|
|
510
|
+
getAuditTrailPath,
|
|
511
|
+
REGRESSION_FALSE_BLOCK_LIMIT,
|
|
441
512
|
extractPatternKey,
|
|
442
513
|
normalizeCommandSignature,
|
|
443
514
|
isNegative,
|
|
@@ -5,6 +5,11 @@ const PROJECT_ROOT = path.resolve(__dirname, '..');
|
|
|
5
5
|
const DEFAULT_BUILD_METADATA_PATH = path.join(PROJECT_ROOT, 'config', 'build-metadata.json');
|
|
6
6
|
const BUILD_SHA_ENV_KEY = 'THUMBGATE_BUILD_SHA';
|
|
7
7
|
const BUILD_GENERATED_AT_ENV_KEY = 'THUMBGATE_BUILD_GENERATED_AT';
|
|
8
|
+
// Railway injects this automatically for GitHub-connected deployments: the git
|
|
9
|
+
// SHA of the commit that triggered the deploy. It is the ground truth for what
|
|
10
|
+
// code is actually live, and unlike THUMBGATE_BUILD_SHA it cannot drift (Railway
|
|
11
|
+
// sets it per deploy). https://docs.railway.com/reference/variables
|
|
12
|
+
const RAILWAY_GIT_COMMIT_SHA_ENV_KEY = 'RAILWAY_GIT_COMMIT_SHA';
|
|
8
13
|
|
|
9
14
|
function normalizeNullableText(value) {
|
|
10
15
|
if (typeof value !== 'string') {
|
|
@@ -28,6 +33,7 @@ function resolveBuildMetadata({ env = process.env, filePath } = {}) {
|
|
|
28
33
|
normalizeNullableText(env.THUMBGATE_BUILD_METADATA_PATH) ||
|
|
29
34
|
DEFAULT_BUILD_METADATA_PATH;
|
|
30
35
|
const envBuildSha = normalizeNullableText(env[BUILD_SHA_ENV_KEY]);
|
|
36
|
+
const railwayGitSha = normalizeNullableText(env[RAILWAY_GIT_COMMIT_SHA_ENV_KEY]);
|
|
31
37
|
const envGeneratedAt = normalizeNullableText(env[BUILD_GENERATED_AT_ENV_KEY]);
|
|
32
38
|
|
|
33
39
|
let fileBuildSha = null;
|
|
@@ -48,9 +54,23 @@ function resolveBuildMetadata({ env = process.env, filePath } = {}) {
|
|
|
48
54
|
};
|
|
49
55
|
}
|
|
50
56
|
|
|
51
|
-
// No SHA
|
|
52
|
-
//
|
|
53
|
-
//
|
|
57
|
+
// No SHA baked into the image. Prefer Railway's own per-deploy commit SHA over
|
|
58
|
+
// THUMBGATE_BUILD_SHA: the latter is set out-of-band by the deploy workflow and
|
|
59
|
+
// has drifted in prod (stuck reporting an old commit while newer code was live,
|
|
60
|
+
// because RAILWAY_SYNC_VARIABLES is off and `railway up` stamping is unreliable).
|
|
61
|
+
// RAILWAY_GIT_COMMIT_SHA is injected by Railway per deploy, so it always matches
|
|
62
|
+
// the code actually serving traffic on a GitHub-connected service.
|
|
63
|
+
if (railwayGitSha) {
|
|
64
|
+
return {
|
|
65
|
+
path: resolvedPath,
|
|
66
|
+
buildSha: railwayGitSha,
|
|
67
|
+
generatedAt: envGeneratedAt,
|
|
68
|
+
};
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
// Last resort: the workflow-managed env var. Only trust it when an explicit SHA
|
|
72
|
+
// is set. (Previously a bare GENERATED_AT with no SHA could short-circuit and
|
|
73
|
+
// return { buildSha: null }, losing both signals; now we require the SHA.)
|
|
54
74
|
if (envBuildSha) {
|
|
55
75
|
return {
|
|
56
76
|
path: resolvedPath,
|
|
@@ -124,6 +144,7 @@ if (require.main === module) {
|
|
|
124
144
|
module.exports = {
|
|
125
145
|
BUILD_GENERATED_AT_ENV_KEY,
|
|
126
146
|
BUILD_SHA_ENV_KEY,
|
|
147
|
+
RAILWAY_GIT_COMMIT_SHA_ENV_KEY,
|
|
127
148
|
DEFAULT_BUILD_METADATA_PATH,
|
|
128
149
|
resolveBuildMetadata,
|
|
129
150
|
writeBuildMetadataFile,
|
package/scripts/cli-schema.js
CHANGED
|
@@ -505,6 +505,12 @@ const CLI_COMMANDS = [
|
|
|
505
505
|
group: 'gates',
|
|
506
506
|
flags: [],
|
|
507
507
|
},
|
|
508
|
+
{
|
|
509
|
+
name: 'hermes-gate',
|
|
510
|
+
description: 'Hermes Agent pre_tool_call hook: gate runtime tool calls (incl. skill_manage) before they run',
|
|
511
|
+
group: 'gates',
|
|
512
|
+
flags: [],
|
|
513
|
+
},
|
|
508
514
|
{
|
|
509
515
|
name: 'force-gate',
|
|
510
516
|
description: 'Immediately create a blocking gate from a pattern string',
|
|
@@ -650,6 +656,22 @@ const CLI_COMMANDS = [
|
|
|
650
656
|
{ name: 'json', type: 'boolean', description: 'Output results as JSON' },
|
|
651
657
|
],
|
|
652
658
|
},
|
|
659
|
+
{
|
|
660
|
+
name: 'check-update',
|
|
661
|
+
aliases: ['upgrade-check'],
|
|
662
|
+
description: 'Check for newer versions of ThumbGate from npm or GitHub',
|
|
663
|
+
group: 'ops',
|
|
664
|
+
flags: [
|
|
665
|
+
{ name: 'json', type: 'boolean', description: 'Output results as JSON' },
|
|
666
|
+
],
|
|
667
|
+
},
|
|
668
|
+
{
|
|
669
|
+
name: 'self-update',
|
|
670
|
+
aliases: ['upgrade-cli'],
|
|
671
|
+
description: 'Automatically install the latest version of ThumbGate globally',
|
|
672
|
+
group: 'ops',
|
|
673
|
+
flags: [],
|
|
674
|
+
},
|
|
653
675
|
];
|
|
654
676
|
|
|
655
677
|
/**
|
|
@@ -317,7 +317,8 @@ async function answerDataQuestion(question, opts = {}) {
|
|
|
317
317
|
if (isPerplexity) return await callPerplexityEndpoint({ apiKey, prompt, fetchImpl, sources });
|
|
318
318
|
return await callGeminiEndpoint({ apiKey, model, prompt, fetchImpl, sources });
|
|
319
319
|
} catch (err) {
|
|
320
|
-
|
|
320
|
+
const safeMessage = (err && err.message) ? String(err.message).split('\n')[0].slice(0, 100) : 'An unexpected error occurred.';
|
|
321
|
+
return { ok: false, error: 'network', message: safeMessage, sources };
|
|
321
322
|
}
|
|
322
323
|
}
|
|
323
324
|
|