thumbgate 1.27.6 → 1.27.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. package/.claude/commands/thumbgate-blocked.md +27 -0
  2. package/.claude/commands/thumbgate-doctor.md +30 -0
  3. package/.claude/commands/thumbgate-guard.md +36 -0
  4. package/.claude/commands/thumbgate-protect.md +30 -0
  5. package/.claude/commands/thumbgate-rules.md +30 -0
  6. package/.claude-plugin/plugin.json +1 -1
  7. package/.well-known/llms.txt +6 -2
  8. package/.well-known/mcp/server-card.json +1 -1
  9. package/README.md +49 -5
  10. package/adapters/claude/.mcp.json +2 -2
  11. package/adapters/letta/README.md +41 -0
  12. package/adapters/letta/thumbgate-letta-adapter.js +133 -0
  13. package/adapters/mcp/server-stdio.js +16 -1
  14. package/adapters/opencode/opencode.json +1 -1
  15. package/adapters/policy-engine/ethicore-guardian-client.js +68 -0
  16. package/adapters/policy-engine/thumbgate-policy-engine-adapter.js +260 -0
  17. package/bench/observability-eval-suite.json +26 -0
  18. package/bin/cli.js +180 -2
  19. package/bin/postinstall.js +1 -1
  20. package/config/gate-templates.json +84 -0
  21. package/config/gates/claim-verification.json +6 -0
  22. package/config/gates/default.json +20 -0
  23. package/config/github-about.json +1 -1
  24. package/config/model-candidates.json +50 -0
  25. package/package.json +66 -25
  26. package/public/agent-manager.html +41 -1
  27. package/public/agents-cost-savings.html +1 -1
  28. package/public/ai-malpractice-prevention.html +2 -1
  29. package/public/assets/brand/github-social-preview.png +0 -0
  30. package/public/assets/brand/thumbgate-icon-512.png +0 -0
  31. package/public/assets/brand/thumbgate-icon-pro-512.png +0 -0
  32. package/public/assets/brand/thumbgate-icon-team-512.png +0 -0
  33. package/public/assets/brand/thumbgate-logo-1200x360.png +0 -0
  34. package/public/assets/brand/thumbgate-mark-inline.svg +15 -0
  35. package/public/assets/brand/thumbgate-mark-pro.svg +23 -0
  36. package/public/assets/brand/thumbgate-mark-team.svg +26 -0
  37. package/public/assets/brand/thumbgate-mark.svg +15 -0
  38. package/public/assets/brand/thumbgate-wordmark.svg +20 -0
  39. package/public/assets/claude-thumbgate-statusbar.svg +8 -0
  40. package/public/assets/codex-thumbgate-statusbar-test.svg +9 -0
  41. package/public/assets/legal-intake-control-flow.svg +66 -0
  42. package/public/blog.html +1 -1
  43. package/public/brand/thumbgate-mark.svg +15 -0
  44. package/public/brand/thumbgate-og.svg +16 -0
  45. package/public/codex-enterprise.html +1 -1
  46. package/public/codex-plugin.html +1 -1
  47. package/public/compare.html +23 -3
  48. package/public/dashboard.html +312 -30
  49. package/public/federal.html +1 -1
  50. package/public/guide.html +5 -4
  51. package/public/index.html +167 -49
  52. package/public/js/buyer-intent.js +672 -0
  53. package/public/learn.html +74 -7
  54. package/public/lessons.html +2 -1
  55. package/public/numbers.html +3 -3
  56. package/public/pricing.html +63 -15
  57. package/public/pro.html +7 -7
  58. package/scripts/activation-quickstart.js +187 -0
  59. package/scripts/agent-memory-lifecycle.js +211 -0
  60. package/scripts/async-eval-observability.js +236 -0
  61. package/scripts/auto-promote-gates.js +75 -4
  62. package/scripts/build-metadata.js +24 -3
  63. package/scripts/cli-schema.js +22 -0
  64. package/scripts/dashboard-chat.js +2 -1
  65. package/scripts/dashboard.js +8 -0
  66. package/scripts/export-databricks-bundle.js +5 -1
  67. package/scripts/export-dpo-pairs.js +7 -2
  68. package/scripts/feedback-aggregate.js +281 -0
  69. package/scripts/feedback-loop.js +34 -0
  70. package/scripts/filesystem-search.js +35 -10
  71. package/scripts/gates-engine.js +198 -6
  72. package/scripts/gemini-embedding-policy.js +2 -1
  73. package/scripts/hook-stop-anti-claim.js +227 -0
  74. package/scripts/hook-thumbgate-cache-updater.js +18 -2
  75. package/scripts/lesson-inference.js +8 -3
  76. package/scripts/lesson-search.js +17 -1
  77. package/scripts/operational-integrity.js +39 -5
  78. package/scripts/plausible-domain-config.js +4 -2
  79. package/scripts/rate-limiter.js +12 -6
  80. package/scripts/secret-redaction.js +166 -0
  81. package/scripts/security-scanner.js +100 -0
  82. package/scripts/self-distill-agent.js +3 -1
  83. package/scripts/self-harness-optimizer.js +141 -0
  84. package/scripts/seo-gsd.js +635 -0
  85. package/scripts/statusline-cache-path.js +17 -2
  86. package/scripts/statusline-cache-read.js +57 -0
  87. package/scripts/statusline-local-stats.js +9 -1
  88. package/scripts/statusline-meta.js +5 -2
  89. package/scripts/statusline.sh +13 -1
  90. package/scripts/sync-telemetry-from-prod.js +374 -0
  91. package/scripts/telemetry-analytics.js +9 -0
  92. package/scripts/thumbgate-search.js +85 -19
  93. package/scripts/tool-contract-validator.js +76 -0
  94. package/scripts/vector-store.js +44 -0
  95. package/scripts/workspace-evolver.js +62 -2
  96. package/src/api/server.js +715 -86
@@ -2,6 +2,40 @@
2
2
  'use strict';
3
3
 
4
4
  const MEMORY_TYPES = new Set(['episodic', 'semantic', 'procedural', 'preference', 'working']);
5
+ const MEMORY_SCOPES = new Set(['task', 'session', 'user', 'project', 'org']);
6
+ const HIGH_RISK_TERMS = new Set([
7
+ 'billing',
8
+ 'checkout',
9
+ 'compliance',
10
+ 'credential',
11
+ 'data-loss',
12
+ 'deploy',
13
+ 'deployment',
14
+ 'git',
15
+ 'payment',
16
+ 'production',
17
+ 'release',
18
+ 'secret',
19
+ 'security',
20
+ 'stripe',
21
+ 'verification',
22
+ ]);
23
+ const KNOWN_ENTITY_PATTERNS = [
24
+ ['Claude Code', /\bclaude\s+code\b/i, 'agent'],
25
+ ['Codex', /\bcodex\b/i, 'agent'],
26
+ ['Cursor', /\bcursor\b/i, 'agent'],
27
+ ['Gemini CLI', /\bgemini\s+cli\b/i, 'agent'],
28
+ ['MCP', /\bmcp\b/i, 'protocol'],
29
+ ['Stripe', /\bstripe\b/i, 'service'],
30
+ ['GitHub', /\bgithub\b|\bgh\s+/i, 'service'],
31
+ ['Railway', /\brailway\b/i, 'service'],
32
+ ['Plausible', /\bplausible\b/i, 'service'],
33
+ ['PostHog', /\bposthog\b/i, 'service'],
34
+ ['SQLite', /\bsqlite\b|\bfts5\b/i, 'storage'],
35
+ ['LanceDB', /\blancedb\b/i, 'storage'],
36
+ ['Docker', /\bdocker\b/i, 'runtime'],
37
+ ['npm', /\bnpm\b|\bnpx\b/i, 'runtime'],
38
+ ];
5
39
 
6
40
  function normalizeText(value) {
7
41
  if (value === undefined || value === null) return '';
@@ -13,6 +47,178 @@ function normalizeMemoryType(value) {
13
47
  return MEMORY_TYPES.has(normalized) ? normalized : 'episodic';
14
48
  }
15
49
 
50
/**
 * Split free text into lowercase tokens for risk-term and lexical matching.
 *
 * Characters that commonly appear inside identifiers, paths and URLs
 * (`_ . : / -`) are kept inside a token so values such as `data-loss` or
 * `src/api/server.js` survive as a single token.
 *
 * @param {*} value - Any value; it is passed through normalizeText first.
 * @returns {string[]} Non-empty lowercase tokens.
 */
function tokenize(value) {
  return normalizeText(value)
    .toLowerCase()
    .split(/[^a-z0-9_.:/-]+/)
    // The split keeps `.`, `:`, `/` and `-` inside tokens, so sentence
    // punctuation sticks to the neighbouring word ("production." / "deploy:").
    // Strip it from the token edges only; otherwise exact Set lookups against
    // plain words (risk terms, query tokens) silently miss.
    .map((token) => token.replace(/^[.:/-]+|[.:/-]+$/g, ''))
    .filter(Boolean);
}
56
+
57
+ function uniqueByName(entities) {
58
+ const seen = new Set();
59
+ return entities.filter((entity) => {
60
+ const key = normalizeText(entity.name).toLowerCase();
61
+ if (!key || seen.has(key)) return false;
62
+ seen.add(key);
63
+ return true;
64
+ });
65
+ }
66
+
67
+ function collectMemoryText(memory = {}) {
68
+ return [
69
+ memory.title,
70
+ memory.content,
71
+ memory.context,
72
+ memory.whatWentWrong,
73
+ memory.whatToChange,
74
+ memory.whatWorked,
75
+ memory.domain,
76
+ memory.skill,
77
+ Array.isArray(memory.tags) ? memory.tags.join(' ') : memory.tags,
78
+ ].filter(Boolean).join(' ');
79
+ }
80
+
81
+ function extractMemoryEntities(memory = {}) {
82
+ const text = collectMemoryText(memory);
83
+ const entities = [];
84
+
85
+ for (const [name, pattern, type] of KNOWN_ENTITY_PATTERNS) {
86
+ if (pattern.test(text)) entities.push({ name, type });
87
+ }
88
+
89
+ const commandMatches = text.match(/`([^`]+)`/g) || [];
90
+ for (const match of commandMatches) {
91
+ const command = match.slice(1, -1).trim();
92
+ if (/^(git|npm|npx|node|gh|curl|docker|python|pytest|stripe)\b/i.test(command)) {
93
+ entities.push({ name: command, type: 'command' });
94
+ } else if (/[./-]/.test(command)) {
95
+ entities.push({ name: command, type: 'path' });
96
+ }
97
+ }
98
+
99
+ const pathMatches = text.match(/\b(?:[a-z0-9_-]+\/)+[a-z0-9_.-]+\b/gi) || [];
100
+ for (const filePath of pathMatches.slice(0, 8)) {
101
+ entities.push({ name: filePath, type: 'path' });
102
+ }
103
+
104
+ return uniqueByName(entities).slice(0, 16);
105
+ }
106
+
107
+ function inferMemoryScope(memory = {}) {
108
+ const explicit = normalizeText(memory.scope || memory.memoryScope).toLowerCase();
109
+ if (MEMORY_SCOPES.has(explicit)) return explicit;
110
+
111
+ const text = collectMemoryText(memory).toLowerCase();
112
+ const tags = new Set(Array.isArray(memory.tags) ? memory.tags.map((tag) => normalizeText(tag).toLowerCase()) : []);
113
+
114
+ if (tags.has('preference') || /\b(prefer|style|tone|my preference|user preference)\b/.test(text)) return 'user';
115
+ if (tags.has('org') || tags.has('team') || /\b(enterprise|seat|team|shared|org|compliance|policy|approval)\b/.test(text)) return 'org';
116
+ if (tags.has('repo') || tags.has('project') || tags.has('release') || tags.has('deployment')
117
+ || /\b(repo|repository|branch|ci|pull request|github|deploy|production|release|publish)\b/.test(text)) return 'project';
118
+ if (tags.has('session') || /\b(this session|current session|today|right now)\b/.test(text)) return 'session';
119
+ return 'task';
120
+ }
121
+
122
+ function scoreMemoryDecay(memory = {}, options = {}) {
123
+ const nowMs = options.now ? new Date(options.now).getTime() : Date.now();
124
+ const timestampMs = memory.timestamp ? new Date(memory.timestamp).getTime() : NaN;
125
+ const ageDays = Number.isFinite(timestampMs)
126
+ ? Math.max(0, (nowMs - timestampMs) / (1000 * 60 * 60 * 24))
127
+ : null;
128
+ const textTokens = new Set(tokenize(collectMemoryText(memory)));
129
+ const tags = Array.isArray(memory.tags) ? memory.tags.map((tag) => normalizeText(tag).toLowerCase()) : [];
130
+ const highRisk = tags.some((tag) => HIGH_RISK_TERMS.has(tag))
131
+ || [...textTokens].some((token) => HIGH_RISK_TERMS.has(token))
132
+ || ['critical', 'high'].includes(normalizeText(memory.importance).toLowerCase());
133
+
134
+ if (highRisk) {
135
+ return {
136
+ state: 'sticky',
137
+ ageDays,
138
+ score: 1,
139
+ reason: 'high-risk memories stay retrievable until explicitly retired',
140
+ };
141
+ }
142
+ if (ageDays === null) {
143
+ return {
144
+ state: 'review',
145
+ ageDays,
146
+ score: 0.6,
147
+ reason: 'memory has no timestamp, so it needs review before durable promotion',
148
+ };
149
+ }
150
+ if (ageDays > 180) {
151
+ return {
152
+ state: 'archive_candidate',
153
+ ageDays,
154
+ score: 0.2,
155
+ reason: 'old low-risk memory should be consolidated or archived',
156
+ };
157
+ }
158
+ if (ageDays > 60) {
159
+ return {
160
+ state: 'review',
161
+ ageDays,
162
+ score: 0.55,
163
+ reason: 'older low-risk memory should be refreshed before it dominates recall',
164
+ };
165
+ }
166
+ return {
167
+ state: 'active',
168
+ ageDays,
169
+ score: 0.85,
170
+ reason: 'recent memory remains eligible for recall',
171
+ };
172
+ }
173
+
174
+ function scoreHybridMemoryMatch(query, memory = {}, options = {}) {
175
+ const queryTokens = new Set(tokenize(query));
176
+ const memoryTokens = new Set(tokenize(collectMemoryText(memory)));
177
+ const queryText = normalizeText(query).toLowerCase();
178
+ const memoryText = collectMemoryText(memory).toLowerCase();
179
+ const memoryEntities = extractMemoryEntities(memory);
180
+ const queryEntityNames = extractMemoryEntities({ content: query }).map((entity) => entity.name.toLowerCase());
181
+
182
+ let lexicalMatches = 0;
183
+ for (const token of queryTokens) {
184
+ if (memoryTokens.has(token)) lexicalMatches++;
185
+ }
186
+ const lexicalScore = queryTokens.size > 0 ? lexicalMatches / queryTokens.size : 0;
187
+ const phraseScore = queryText && memoryText.includes(queryText) ? 0.35 : 0;
188
+ const entityMatches = memoryEntities.filter((entity) => queryEntityNames.includes(entity.name.toLowerCase()));
189
+ const entityScore = queryEntityNames.length > 0 ? entityMatches.length / queryEntityNames.length : 0;
190
+ const decay = scoreMemoryDecay(memory, options);
191
+ const lifecycleScore = decay.state === 'archive_candidate' ? -0.15 : decay.state === 'sticky' ? 0.12 : 0;
192
+ const score = lexicalScore + phraseScore + (entityScore * 0.45) + lifecycleScore;
193
+
194
+ return {
195
+ score: Number(Math.max(0, score).toFixed(4)),
196
+ lexicalScore: Number(lexicalScore.toFixed(4)),
197
+ entityScore: Number(entityScore.toFixed(4)),
198
+ matchedEntities: entityMatches,
199
+ decayState: decay.state,
200
+ };
201
+ }
202
+
203
+ function buildMemoryLifecycleView(memory = {}, options = {}) {
204
+ const scope = inferMemoryScope(memory);
205
+ const entities = extractMemoryEntities(memory);
206
+ const decay = scoreMemoryDecay(memory, options);
207
+ const retrieval = scoreHybridMemoryMatch(options.query || '', memory, options);
208
+
209
+ return {
210
+ scope,
211
+ entities,
212
+ decay,
213
+ retrievalHints: {
214
+ hybridScore: retrieval.score,
215
+ lexicalScore: retrieval.lexicalScore,
216
+ entityScore: retrieval.entityScore,
217
+ matchedEntities: retrieval.matchedEntities,
218
+ },
219
+ };
220
+ }
221
+
16
222
  function buildMemoryLifecyclePolicy(input = {}) {
17
223
  return {
18
224
  generatedAt: normalizeText(input.generatedAt) || new Date().toISOString(),
@@ -91,6 +297,11 @@ function evaluateMemoryPromotion(memory = {}, policy = buildMemoryLifecyclePolic
91
297
 
92
298
  module.exports = {
93
299
  buildMemoryLifecyclePolicy,
300
+ buildMemoryLifecycleView,
94
301
  evaluateMemoryPromotion,
302
+ extractMemoryEntities,
303
+ inferMemoryScope,
95
304
  normalizeMemoryType,
305
+ scoreHybridMemoryMatch,
306
+ scoreMemoryDecay,
96
307
  };
@@ -0,0 +1,236 @@
1
+ #!/usr/bin/env node
2
+ 'use strict';
3
+
4
+ const fs = require('node:fs');
5
+ const path = require('node:path');
6
+
7
+ const DEFAULT_THRESHOLDS = {
8
+ faithfulness: 0.72,
9
+ answerRelevance: 0.45,
10
+ contextPrecision: 0.5,
11
+ };
12
+
13
+ function tokenize(value) {
14
+ return String(value || '')
15
+ .toLowerCase()
16
+ .split(/[^a-z0-9]+/)
17
+ .filter((token) => token.length > 2);
18
+ }
19
+
20
+ function unique(values) {
21
+ return [...new Set(values.filter(Boolean))];
22
+ }
23
+
24
+ function overlapScore(left, right) {
25
+ const leftTokens = unique(tokenize(left));
26
+ const rightSet = new Set(tokenize(right));
27
+ if (leftTokens.length === 0) return 0;
28
+ const matches = leftTokens.filter((token) => rightSet.has(token)).length;
29
+ return matches / leftTokens.length;
30
+ }
31
+
32
+ function splitClaims(response) {
33
+ return String(response || '')
34
+ .split(/(?:[.!?]\s+|\n+)/)
35
+ .map((claim) => claim.trim())
36
+ .filter((claim) => claim.length > 0);
37
+ }
38
+
39
/**
 * Coerce a retrieved-context payload into a flat list of non-empty strings.
 *
 * @param {*} contexts - An array of context values, a single context value,
 *   or nothing at all.
 * @returns {string[]} Stringified contexts; an empty array when nothing
 *   usable was supplied.
 */
function normalizeContexts(contexts) {
  if (Array.isArray(contexts)) {
    // Drop null/undefined entries BEFORE stringifying: String(null) is the
    // truthy text "null", which would otherwise be scored as a real context.
    return contexts
      .filter((context) => context !== null && context !== undefined)
      .map(String)
      .filter(Boolean);
  }
  if (contexts) return [String(contexts)];
  return [];
}
44
+
45
+ function scoreFaithfulness(response, contexts) {
46
+ const claims = splitClaims(response);
47
+ const contextText = normalizeContexts(contexts).join('\n');
48
+ if (claims.length === 0) return { score: 0, supportedClaims: 0, totalClaims: 0 };
49
+ const supportedClaims = claims.filter((claim) => {
50
+ const normalized = claim.toLowerCase();
51
+ return contextText.toLowerCase().includes(normalized) || overlapScore(claim, contextText) >= 0.58;
52
+ }).length;
53
+ return {
54
+ score: Number((supportedClaims / claims.length).toFixed(4)),
55
+ supportedClaims,
56
+ totalClaims: claims.length,
57
+ };
58
+ }
59
+
60
/**
 * Score how much of the question's vocabulary is echoed by the response.
 *
 * @param {*} question - The user question / input text.
 * @param {*} response - The generated answer text.
 * @returns {{score: number, matchedQuestionTerms: string[]}} The overlap score
 *   rounded to four decimals, plus the distinct question tokens that also
 *   occur in the response.
 */
function scoreAnswerRelevance(question, response) {
  const score = overlapScore(question, response);
  // Tokenize the response once and look tokens up in a Set, instead of
  // re-tokenizing and linearly scanning the response for every question token.
  const responseTokens = new Set(tokenize(response));
  return {
    score: Number(score.toFixed(4)),
    matchedQuestionTerms: unique(tokenize(question).filter((token) => responseTokens.has(token))),
  };
}
67
+
68
+ function scoreContextPrecision(question, contexts, reference = '') {
69
+ const normalizedContexts = normalizeContexts(contexts);
70
+ const target = [question, reference].filter(Boolean).join('\n');
71
+ if (normalizedContexts.length === 0) return { score: 0, relevantContexts: 0, totalContexts: 0 };
72
+
73
+ let precisionSum = 0;
74
+ let relevantContexts = 0;
75
+ normalizedContexts.forEach((context, index) => {
76
+ const relevant = overlapScore(target, context) >= 0.22 || overlapScore(context, target) >= 0.22;
77
+ if (relevant) relevantContexts += 1;
78
+ const precisionAtK = relevantContexts / (index + 1);
79
+ if (relevant) precisionSum += precisionAtK;
80
+ });
81
+
82
+ const score = relevantContexts === 0 ? 0 : precisionSum / relevantContexts;
83
+ return {
84
+ score: Number(score.toFixed(4)),
85
+ relevantContexts,
86
+ totalContexts: normalizedContexts.length,
87
+ };
88
+ }
89
+
90
+ function evaluateGeneration(testCase, options = {}) {
91
+ const thresholds = { ...DEFAULT_THRESHOLDS, ...(options.thresholds || {}) };
92
+ const contexts = normalizeContexts(testCase.retrievedContexts || testCase.contexts || testCase.retrieved_contexts);
93
+ const faithfulness = scoreFaithfulness(testCase.response || testCase.answer, contexts);
94
+ const answerRelevance = scoreAnswerRelevance(testCase.question || testCase.user_input, testCase.response || testCase.answer);
95
+ const contextPrecision = scoreContextPrecision(
96
+ testCase.question || testCase.user_input,
97
+ contexts,
98
+ testCase.reference || testCase.groundTruth || ''
99
+ );
100
+ const scores = {
101
+ faithfulness: faithfulness.score,
102
+ answerRelevance: answerRelevance.score,
103
+ contextPrecision: contextPrecision.score,
104
+ };
105
+ const passed = scores.faithfulness >= thresholds.faithfulness
106
+ && scores.answerRelevance >= thresholds.answerRelevance
107
+ && scores.contextPrecision >= thresholds.contextPrecision;
108
+
109
+ return {
110
+ id: String(testCase.id || testCase.traceId || 'case'),
111
+ traceId: String(testCase.traceId || testCase.id || ''),
112
+ passed,
113
+ scores,
114
+ thresholds,
115
+ details: {
116
+ faithfulness,
117
+ answerRelevance,
118
+ contextPrecision,
119
+ },
120
+ };
121
+ }
122
+
123
+ function buildRagasCompatibleRows(cases) {
124
+ return cases.map((testCase) => ({
125
+ user_input: testCase.question || testCase.user_input || '',
126
+ response: testCase.response || testCase.answer || '',
127
+ retrieved_contexts: normalizeContexts(testCase.retrievedContexts || testCase.contexts || testCase.retrieved_contexts),
128
+ reference: testCase.reference || testCase.groundTruth || '',
129
+ }));
130
+ }
131
+
132
/**
 * Shape evaluated cases as run records carrying per-metric feedback entries.
 *
 * @param {Array<object>} cases - Test cases; `question`/`user_input` and
 *   `response`/`answer` are read as inputs and outputs.
 * @param {Array<{scores: object}>} results - Evaluation results aligned with
 *   `cases` by index.
 * @returns {Array<object>} One run per case. A case with no matching result
 *   (missing, short or non-array `results`) gets an empty `feedback` list.
 */
function buildLangSmithCompatibleRuns(cases, results) {
  // Tolerate a missing or short results list: previously results[index].scores
  // threw a TypeError for any case without an aligned result.
  const alignedResults = Array.isArray(results) ? results : [];
  return cases.map((testCase, index) => {
    const result = alignedResults[index];
    const scores = result && result.scores ? result.scores : {};
    return {
      id: testCase.traceId || testCase.id || `case-${index + 1}`,
      name: 'thumbgate_async_rag_eval',
      inputs: { question: testCase.question || testCase.user_input || '' },
      outputs: { response: testCase.response || testCase.answer || '' },
      metadata: {
        evaluator: 'thumbgate-async-eval-observability',
        caseId: testCase.id || null,
      },
      feedback: Object.entries(scores).map(([key, score]) => ({
        key,
        score,
      })),
    };
  });
}
148
+
149
+ function buildEvalReport(cases, options = {}) {
150
+ const normalizedCases = Array.isArray(cases) ? cases : [];
151
+ const results = normalizedCases.map((testCase) => evaluateGeneration(testCase, options));
152
+ const passed = results.filter((result) => result.passed).length;
153
+ const failed = results.length - passed;
154
+ const aggregate = {
155
+ faithfulness: average(results.map((result) => result.scores.faithfulness)),
156
+ answerRelevance: average(results.map((result) => result.scores.answerRelevance)),
157
+ contextPrecision: average(results.map((result) => result.scores.contextPrecision)),
158
+ };
159
+
160
+ return {
161
+ generatedAt: new Date().toISOString(),
162
+ mode: 'async-post-generation',
163
+ total: results.length,
164
+ passed,
165
+ failed,
166
+ passRate: results.length === 0 ? 0 : Number(((passed / results.length) * 100).toFixed(2)),
167
+ aggregate,
168
+ passedThreshold: failed === 0,
169
+ metrics: ['faithfulness', 'answerRelevance', 'contextPrecision'],
170
+ sinks: {
171
+ ci: true,
172
+ langsmithCompatible: true,
173
+ ragasCompatible: true,
174
+ },
175
+ results,
176
+ ragasDataset: buildRagasCompatibleRows(normalizedCases),
177
+ langsmithRuns: buildLangSmithCompatibleRuns(normalizedCases, results),
178
+ };
179
+ }
180
+
181
+ function average(values) {
182
+ const numeric = values.filter((value) => Number.isFinite(value));
183
+ if (numeric.length === 0) return 0;
184
+ return Number((numeric.reduce((sum, value) => sum + value, 0) / numeric.length).toFixed(4));
185
+ }
186
+
187
/**
 * Build the evaluation report on a later turn of the event loop and
 * optionally persist it as pretty-printed JSON.
 *
 * @param {Array<object>} cases - Test cases forwarded to buildEvalReport.
 * @param {object} [options] - Forwarded to buildEvalReport; when
 *   `options.outputPath` is set the report is also written to that file
 *   (parent directories are created as needed).
 * @returns {Promise<object>} The generated report.
 * @throws Rejects with whatever buildEvalReport or the file write throws.
 */
async function runAsyncEvaluation(cases, options = {}) {
  const report = await new Promise((resolve, reject) => {
    setImmediate(() => {
      // A throw inside a timer callback is NOT routed to the promise: without
      // this try/catch it surfaces as an uncaught exception and the awaited
      // promise never settles.
      try {
        resolve(buildEvalReport(cases, options));
      } catch (err) {
        reject(err);
      }
    });
  });
  if (options.outputPath) {
    fs.mkdirSync(path.dirname(options.outputPath), { recursive: true });
    fs.writeFileSync(options.outputPath, `${JSON.stringify(report, null, 2)}\n`);
  }
  return report;
}
197
+
198
/**
 * Read evaluation cases from a JSON file.
 *
 * Accepts either a bare array of cases or an object wrapping them under a
 * `cases` key.
 *
 * @param {string} inputPath - Path of the JSON file to read.
 * @returns {Array<object>} The cases, or an empty array when the payload holds
 *   no case list (for example `null`, `{}` or a non-array `cases` value).
 * @throws When the file cannot be read or is not valid JSON.
 */
function loadCases(inputPath) {
  const payload = JSON.parse(fs.readFileSync(inputPath, 'utf8'));
  if (Array.isArray(payload)) return payload;
  // JSON `null` parses fine but has no properties, and `cases` may be any JSON
  // value; only hand back a real array.
  const wrapped = payload !== null && typeof payload === 'object' ? payload.cases : undefined;
  return Array.isArray(wrapped) ? wrapped : [];
}
202
+
203
/**
 * Return the value that follows `flag` in argv, or `fallback` when the flag
 * is absent.
 *
 * @param {string[]} argv - Command-line arguments.
 * @param {string} flag - Flag name, e.g. `--input`.
 * @param {string} fallback - Value used when the flag is not present.
 * @returns {string} The flag's value or the fallback.
 * @throws {Error} When the flag is present but no value follows it.
 */
function readFlagValue(argv, flag, fallback) {
  const index = argv.indexOf(flag);
  if (index < 0) return fallback;
  const value = argv[index + 1];
  if (typeof value !== 'string' || value === '' || value.startsWith('--')) {
    throw new Error(`${flag} requires a path argument`);
  }
  return value;
}

/**
 * CLI entry point: evaluate the cases file, write the full report, print a
 * short JSON summary to stdout, and set a failing exit code when any case
 * misses its thresholds.
 *
 * @param {string[]} [argv] - Arguments; supports `--input <path>` and
 *   `--output <path>`.
 * @returns {Promise<void>}
 * @throws {Error} When `--input` or `--output` is given without a value.
 */
async function main(argv = process.argv.slice(2)) {
  // Resolve both flags up front so a dangling `--input` / `--output` fails with
  // a clear message instead of an opaque readFileSync TypeError (input) or a
  // silently skipped report file (output).
  const inputPath = readFlagValue(argv, '--input', 'bench/observability-eval-suite.json');
  const outputPath = readFlagValue(argv, '--output', 'proof/async-eval-observability-report.json');
  const report = await runAsyncEvaluation(loadCases(inputPath), { outputPath });
  process.stdout.write(`${JSON.stringify({
    outputPath,
    total: report.total,
    passed: report.passed,
    failed: report.failed,
    passRate: report.passRate,
  }, null, 2)}\n`);
  if (!report.passedThreshold) process.exitCode = 1;
}
218
+
219
+ module.exports = {
220
+ DEFAULT_THRESHOLDS,
221
+ buildEvalReport,
222
+ buildLangSmithCompatibleRuns,
223
+ buildRagasCompatibleRows,
224
+ evaluateGeneration,
225
+ runAsyncEvaluation,
226
+ scoreAnswerRelevance,
227
+ scoreContextPrecision,
228
+ scoreFaithfulness,
229
+ };
230
+
231
+ if (require.main === module) {
232
+ main().catch((err) => {
233
+ console.error(err.stack || err.message);
234
+ process.exitCode = 1;
235
+ });
236
+ }
@@ -58,6 +58,47 @@ function readJSONL(filePath) {
58
58
  }).filter(Boolean);
59
59
  }
60
60
 
61
+ // --- Self-Harness stage 3: regression-gated promotion -----------------------
62
+ // Inspired by "Self-Harness: Harnesses That Improve Themselves" (arXiv 2606.09498).
63
+ // Stages 1-2 (weakness mining -> rule extraction) already exist via lesson
64
+ // inference + this promoter. Stage 3 — accept a harness change only after
65
+ // regression-testing it does not degrade behavior — was missing: a noisy 3x
66
+ // capture could hard-block an over-broad pattern with no check that it wouldn't
67
+ // have wrongly blocked actions that were previously ALLOWED. This replays a
68
+ // candidate BLOCK rule against the audit trail's prior `allow` decisions; if it
69
+ // would have blocked safe actions, the caller quarantines it to `warn` instead.
70
+ const REGRESSION_FALSE_BLOCK_LIMIT = 0; // any prior safe action it would block => quarantine
71
+
72
+ function getAuditTrailPath() {
73
+ return path.join(path.dirname(getFeedbackLogPath()), 'audit-trail.jsonl');
74
+ }
75
+
76
+ // Returns { falseBlocks, allowSampleSize } or null when there is no history /
77
+ // matcher available — in which case the caller promotes as usual (fail-open to
78
+ // existing behavior, since regression gating is an enhancement, not a hard gate).
79
+ function regressionCheck(gate, options = {}) {
80
+ const auditPath = options.auditTrailPath || getAuditTrailPath();
81
+ const entries = readJSONL(auditPath);
82
+ if (!entries.length) return null;
83
+ // Lazy-require to avoid the gates-engine <-> auto-promote-gates require cycle.
84
+ let matchesGate;
85
+ try { ({ matchesGate } = require('./gates-engine')); } catch { return null; }
86
+ if (typeof matchesGate !== 'function') return null;
87
+ const allowed = entries.filter((e) => e && e.decision === 'allow' && e.toolName);
88
+ if (!allowed.length) return null;
89
+ let falseBlocks = 0;
90
+ for (const e of allowed) {
91
+ try {
92
+ if (matchesGate(gate, e.toolName, e.toolInput || {})) falseBlocks += 1;
93
+ } catch { /* a bad pattern/entry never counts as a false block */ }
94
+ }
95
+ return { falseBlocks, allowSampleSize: allowed.length };
96
+ }
97
+
98
+ function safeRegressionCheck(gate, options) {
99
+ try { return regressionCheck(gate, options); } catch { return null; }
100
+ }
101
+
61
102
  function loadAutoGates() {
62
103
  const autoGatesPath = getAutoGatesPath();
63
104
  if (!fs.existsSync(autoGatesPath)) {
@@ -358,9 +399,16 @@ function promote(feedbackLogPath, options) {
358
399
  const existing = data.gates[existingIdx];
359
400
  const newAction = group.count >= BLOCK_THRESHOLD ? 'block' : 'warn';
360
401
  if (existing.action !== newAction && newAction === 'block') {
361
- // Upgrade from warn to block
362
- data.gates[existingIdx] = { ...existing, action: 'block', severity: 'critical', occurrences: group.count, upgradedAt: new Date().toISOString() };
363
- promotions.push({ type: 'upgrade', gateId, from: existing.action, to: 'block', occurrences: group.count });
402
+ // Self-Harness stage 3: regression-test before upgrading warn -> block.
403
+ const regression = opts.skipRegression ? null : safeRegressionCheck(buildGateRule(group, 'block'), opts);
404
+ if (regression && regression.falseBlocks > REGRESSION_FALSE_BLOCK_LIMIT) {
405
+ // Would block prior safe actions — hold at warn instead of upgrading.
406
+ promotions.push({ type: 'upgrade-quarantined', gateId, from: existing.action, occurrences: group.count, falseBlocks: regression.falseBlocks });
407
+ } else {
408
+ // Upgrade from warn to block
409
+ data.gates[existingIdx] = { ...existing, action: 'block', severity: 'critical', occurrences: group.count, upgradedAt: new Date().toISOString() };
410
+ promotions.push({ type: 'upgrade', gateId, from: existing.action, to: 'block', occurrences: group.count });
411
+ }
364
412
  }
365
413
  // Update occurrence count even if no action change
366
414
  data.gates[existingIdx].occurrences = group.count;
@@ -370,6 +418,20 @@ function promote(feedbackLogPath, options) {
370
418
  // New gate — respect explicit gateAction override (e.g. 'approve' for human-approval rules)
371
419
  const gate = buildGateRule(group, opts.gateAction);
372
420
 
421
+ // Self-Harness stage 3: before a feedback rule goes live as a hard block,
422
+ // regression-test it against prior allowed actions. If it would have blocked
423
+ // safe actions, quarantine it to `warn` instead of `block`.
424
+ let regression = null;
425
+ if (gate.action === 'block' && !opts.gateAction && !opts.skipRegression) {
426
+ regression = safeRegressionCheck(gate, opts);
427
+ if (regression && regression.falseBlocks > REGRESSION_FALSE_BLOCK_LIMIT) {
428
+ gate.action = 'warn';
429
+ gate.severity = 'medium';
430
+ gate.quarantined = true;
431
+ gate.regression = regression;
432
+ }
433
+ }
434
+
373
435
  // Enforce max limit — rotate oldest
374
436
  if (data.gates.length >= MAX_AUTO_GATES) {
375
437
  const removed = data.gates.shift();
@@ -377,7 +439,13 @@ function promote(feedbackLogPath, options) {
377
439
  }
378
440
 
379
441
  data.gates.push(gate);
380
- promotions.push({ type: 'new', gateId: gate.id, action: gate.action, occurrences: group.count });
442
+ promotions.push({
443
+ type: gate.quarantined ? 'new-quarantined' : 'new',
444
+ gateId: gate.id,
445
+ action: gate.action,
446
+ occurrences: group.count,
447
+ ...(gate.quarantined ? { falseBlocks: regression.falseBlocks, allowSampleSize: regression.allowSampleSize } : {}),
448
+ });
381
449
  }
382
450
 
383
451
  // Log promotions
@@ -438,6 +506,9 @@ module.exports = {
438
506
  groupNegativeFeedback,
439
507
  patternToGateId,
440
508
  buildGateRule,
509
+ regressionCheck,
510
+ getAuditTrailPath,
511
+ REGRESSION_FALSE_BLOCK_LIMIT,
441
512
  extractPatternKey,
442
513
  normalizeCommandSignature,
443
514
  isNegative,
@@ -5,6 +5,11 @@ const PROJECT_ROOT = path.resolve(__dirname, '..');
5
5
  const DEFAULT_BUILD_METADATA_PATH = path.join(PROJECT_ROOT, 'config', 'build-metadata.json');
6
6
  const BUILD_SHA_ENV_KEY = 'THUMBGATE_BUILD_SHA';
7
7
  const BUILD_GENERATED_AT_ENV_KEY = 'THUMBGATE_BUILD_GENERATED_AT';
8
+ // Railway injects this automatically for GitHub-connected deployments: the git
9
+ // SHA of the commit that triggered the deploy. It is the ground truth for what
10
+ // code is actually live, and unlike THUMBGATE_BUILD_SHA it cannot drift (Railway
11
+ // sets it per deploy). https://docs.railway.com/reference/variables
12
+ const RAILWAY_GIT_COMMIT_SHA_ENV_KEY = 'RAILWAY_GIT_COMMIT_SHA';
8
13
 
9
14
  function normalizeNullableText(value) {
10
15
  if (typeof value !== 'string') {
@@ -28,6 +33,7 @@ function resolveBuildMetadata({ env = process.env, filePath } = {}) {
28
33
  normalizeNullableText(env.THUMBGATE_BUILD_METADATA_PATH) ||
29
34
  DEFAULT_BUILD_METADATA_PATH;
30
35
  const envBuildSha = normalizeNullableText(env[BUILD_SHA_ENV_KEY]);
36
+ const railwayGitSha = normalizeNullableText(env[RAILWAY_GIT_COMMIT_SHA_ENV_KEY]);
31
37
  const envGeneratedAt = normalizeNullableText(env[BUILD_GENERATED_AT_ENV_KEY]);
32
38
 
33
39
  let fileBuildSha = null;
@@ -48,9 +54,23 @@ function resolveBuildMetadata({ env = process.env, filePath } = {}) {
48
54
  };
49
55
  }
50
56
 
51
- // No SHA in the file fall back to env only if an explicit SHA is set.
52
- // (Previously a bare GENERATED_AT with no SHA could short-circuit and return
53
- // { buildSha: null }, losing both signals; now we require the SHA.)
57
+ // No SHA baked into the image. Prefer Railway's own per-deploy commit SHA over
58
+ // THUMBGATE_BUILD_SHA: the latter is set out-of-band by the deploy workflow and
59
+ // has drifted in prod (stuck reporting an old commit while newer code was live,
60
+ // because RAILWAY_SYNC_VARIABLES is off and `railway up` stamping is unreliable).
61
+ // RAILWAY_GIT_COMMIT_SHA is injected by Railway per deploy, so it always matches
62
+ // the code actually serving traffic on a GitHub-connected service.
63
+ if (railwayGitSha) {
64
+ return {
65
+ path: resolvedPath,
66
+ buildSha: railwayGitSha,
67
+ generatedAt: envGeneratedAt,
68
+ };
69
+ }
70
+
71
+ // Last resort: the workflow-managed env var. Only trust it when an explicit SHA
72
+ // is set. (Previously a bare GENERATED_AT with no SHA could short-circuit and
73
+ // return { buildSha: null }, losing both signals; now we require the SHA.)
54
74
  if (envBuildSha) {
55
75
  return {
56
76
  path: resolvedPath,
@@ -124,6 +144,7 @@ if (require.main === module) {
124
144
  module.exports = {
125
145
  BUILD_GENERATED_AT_ENV_KEY,
126
146
  BUILD_SHA_ENV_KEY,
147
+ RAILWAY_GIT_COMMIT_SHA_ENV_KEY,
127
148
  DEFAULT_BUILD_METADATA_PATH,
128
149
  resolveBuildMetadata,
129
150
  writeBuildMetadataFile,
@@ -505,6 +505,12 @@ const CLI_COMMANDS = [
505
505
  group: 'gates',
506
506
  flags: [],
507
507
  },
508
+ {
509
+ name: 'hermes-gate',
510
+ description: 'Hermes Agent pre_tool_call hook: gate runtime tool calls (incl. skill_manage) before they run',
511
+ group: 'gates',
512
+ flags: [],
513
+ },
508
514
  {
509
515
  name: 'force-gate',
510
516
  description: 'Immediately create a blocking gate from a pattern string',
@@ -650,6 +656,22 @@ const CLI_COMMANDS = [
650
656
  { name: 'json', type: 'boolean', description: 'Output results as JSON' },
651
657
  ],
652
658
  },
659
+ {
660
+ name: 'check-update',
661
+ aliases: ['upgrade-check'],
662
+ description: 'Check for newer versions of ThumbGate from npm or GitHub',
663
+ group: 'ops',
664
+ flags: [
665
+ { name: 'json', type: 'boolean', description: 'Output results as JSON' },
666
+ ],
667
+ },
668
+ {
669
+ name: 'self-update',
670
+ aliases: ['upgrade-cli'],
671
+ description: 'Automatically install the latest version of ThumbGate globally',
672
+ group: 'ops',
673
+ flags: [],
674
+ },
653
675
  ];
654
676
 
655
677
  /**
@@ -317,7 +317,8 @@ async function answerDataQuestion(question, opts = {}) {
317
317
  if (isPerplexity) return await callPerplexityEndpoint({ apiKey, prompt, fetchImpl, sources });
318
318
  return await callGeminiEndpoint({ apiKey, model, prompt, fetchImpl, sources });
319
319
  } catch (err) {
320
- return { ok: false, error: 'network', message: err?.message || String(err), sources };
320
+ const safeMessage = (err && err.message) ? String(err.message).split('\n')[0].slice(0, 100) : 'An unexpected error occurred.';
321
+ return { ok: false, error: 'network', message: safeMessage, sources };
321
322
  }
322
323
  }
323
324