thumbgate 1.25.2 → 1.26.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +1 -1
- package/.well-known/mcp/server-card.json +1 -1
- package/README.md +62 -31
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/mcp/server-stdio.js +84 -7
- package/adapters/opencode/opencode.json +1 -1
- package/bin/cli.js +390 -14
- package/config/mcp-allowlists.json +3 -0
- package/package.json +16 -3
- package/public/agents-cost-savings.html +2 -0
- package/public/index.html +10 -2
- package/public/numbers.html +2 -2
- package/scripts/action-receipts.js +324 -0
- package/scripts/cli-schema.js +24 -0
- package/scripts/context-manager.js +10 -0
- package/scripts/dashboard.js +6 -1
- package/scripts/gates-engine.js +68 -9
- package/scripts/install-shim.js +84 -0
- package/scripts/llm-client.js +90 -4
- package/scripts/local-model-profile.js +15 -8
- package/scripts/meta-agent-loop.js +9 -5
- package/scripts/noop-detect.js +285 -0
- package/scripts/operational-dashboard.js +160 -0
- package/scripts/plan-gate.js +243 -0
- package/scripts/repeat-metric.js +121 -0
- package/scripts/silent-failure-cluster.js +22 -3
- package/scripts/thompson-sampling.js +20 -5
- package/scripts/tool-registry.js +50 -0
- package/scripts/trajectory-scorer.js +63 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { resolveAnalyticsWindow } = require('./analytics-window');
|
|
4
|
+
const { getBillingSummaryLive } = require('./billing');
|
|
5
|
+
const { generateDashboard } = require('./dashboard');
|
|
6
|
+
const { getFeedbackPaths } = require('./feedback-loop');
|
|
7
|
+
const { resolveHostedBillingConfig } = require('./hosted-config');
|
|
8
|
+
const { loadOperatorConfig } = require('./operational-summary');
|
|
9
|
+
|
|
10
|
+
/**
 * Coerce a value to a trimmed string, mapping "nothing there" to null.
 *
 * @param {*} value - Any value; null/undefined short-circuit to null.
 * @returns {string|null} The trimmed string form of `value`, or null when the
 *   input is null/undefined or trims down to an empty string.
 */
function normalizeText(value) {
  if (value == null) {
    return null;
  }
  const trimmed = String(value).trim();
  return trimmed.length > 0 ? trimmed : null;
}
|
|
15
|
+
|
|
16
|
+
/**
 * Report whether the hosted dashboard should be tried first.
 *
 * Hosted is preferred unless THUMBGATE_METRICS_SOURCE is set to "local"
 * (compared case-insensitively, ignoring surrounding whitespace).
 *
 * @returns {boolean} false only when the env var normalizes to "local".
 */
function shouldPreferHostedDashboard() {
  const metricsSource = String(process.env.THUMBGATE_METRICS_SOURCE || '');
  const normalizedSource = metricsSource.trim().toLowerCase();
  return normalizedSource !== 'local';
}
|
|
19
|
+
|
|
20
|
+
/**
 * Resolve the base URL and API key used to reach the hosted dashboard.
 *
 * Each setting falls through a priority chain, first truthy value wins:
 *   apiKey:     THUMBGATE_OPERATOR_KEY env -> operator config `operatorKey`
 *               -> THUMBGATE_API_KEY env
 *   apiBaseUrl: THUMBGATE_BILLING_API_BASE_URL env -> operator config `baseUrl`
 *               -> hosted billing config `billingApiBaseUrl`
 *
 * The key chain intentionally matches operational-summary so that north-star
 * and cfo authenticate against the same hosted deployment. Reading only
 * THUMBGATE_API_KEY used to 401 silently on machines configured through
 * operator.json or THUMBGATE_OPERATOR_KEY.
 *
 * @returns {{ apiBaseUrl: *, apiKey: * }} Resolved settings; either may be
 *   falsy when nothing in its chain is configured.
 */
function resolveHostedDashboardConfig() {
  const runtimeConfig = resolveHostedBillingConfig();
  const operatorConfig = loadOperatorConfig();

  // Environment values are normalized; config-file values are used as loaded.
  let apiKey = normalizeText(process.env.THUMBGATE_OPERATOR_KEY);
  if (!apiKey) {
    apiKey = operatorConfig.operatorKey;
  }
  if (!apiKey) {
    apiKey = normalizeText(process.env.THUMBGATE_API_KEY);
  }

  let apiBaseUrl = normalizeText(process.env.THUMBGATE_BILLING_API_BASE_URL);
  if (!apiBaseUrl) {
    apiBaseUrl = operatorConfig.baseUrl;
  }
  if (!apiBaseUrl) {
    apiBaseUrl = runtimeConfig.billingApiBaseUrl;
  }

  return { apiBaseUrl, apiKey };
}
|
|
38
|
+
|
|
39
|
+
/**
 * Build the operational dashboard from local feedback data plus a live
 * billing summary.
 *
 * @param {object} [options] - Analytics-window options, forwarded to
 *   resolveAnalyticsWindow(). `options.feedbackDir` overrides the feedback
 *   directory; otherwise getFeedbackPaths().FEEDBACK_DIR is used.
 * @returns {Promise<*>} Whatever generateDashboard() returns for the resolved
 *   feedback directory, window, and billing summary.
 */
async function buildOperationalDashboard(options = {}) {
  const analyticsWindow = resolveAnalyticsWindow(options);

  let feedbackDir = options.feedbackDir;
  if (!feedbackDir) {
    feedbackDir = getFeedbackPaths().FEEDBACK_DIR;
  }

  const billingSummary = await getBillingSummaryLive(analyticsWindow);

  // This path always reports billing as live with no fallback reason.
  const dashboardOptions = {
    analyticsWindow,
    billingSummary,
    billingSource: 'live',
    billingFallbackReason: null,
  };
  return generateDashboard(feedbackDir, dashboardOptions);
}
|
|
51
|
+
|
|
52
|
+
/**
 * Fetch the operational dashboard from the hosted deployment.
 *
 * @param {object} [options] - Analytics-window options, forwarded to
 *   resolveAnalyticsWindow().
 * @param {{ apiBaseUrl: *, apiKey: * }} [config] - Hosted endpoint settings;
 *   defaults to resolveHostedDashboardConfig().
 * @returns {Promise<*>} The parsed JSON body of GET /v1/dashboard.
 * @throws {Error} code `hosted_dashboard_disabled` when hosted metrics are
 *   turned off, `hosted_dashboard_unconfigured` when the URL or key is
 *   missing, and `hosted_dashboard_http_error` (with a numeric `status`) on a
 *   non-2xx response.
 */
async function fetchHostedDashboard(options = {}, config = resolveHostedDashboardConfig()) {
  const analyticsWindow = resolveAnalyticsWindow(options);

  if (!shouldPreferHostedDashboard()) {
    throw Object.assign(new Error('Hosted operational dashboard is disabled.'), {
      code: 'hosted_dashboard_disabled',
    });
  }

  const isConfigured = Boolean(config.apiBaseUrl) && Boolean(config.apiKey);
  if (!isConfigured) {
    throw Object.assign(new Error('Hosted operational dashboard is not configured.'), {
      code: 'hosted_dashboard_unconfigured',
    });
  }

  // Forward the resolved window so the hosted side computes the same range.
  const requestUrl = new URL('/v1/dashboard', config.apiBaseUrl);
  const queryPairs = [
    ['window', analyticsWindow.window],
    ['timezone', analyticsWindow.timeZone],
    ['now', analyticsWindow.now],
  ];
  for (const [key, value] of queryPairs) {
    requestUrl.searchParams.set(key, value);
  }

  const response = await fetch(requestUrl, {
    method: 'GET',
    headers: {
      authorization: `Bearer ${config.apiKey}`,
      accept: 'application/json',
    },
  });

  if (response.ok) {
    return response.json();
  }

  // Best-effort read of the error body; a failed read becomes an empty detail.
  const detail = await response.text().catch(() => '');
  throw Object.assign(
    new Error(`Hosted operational dashboard request failed (${response.status}): ${detail || 'unknown error'}`),
    { code: 'hosted_dashboard_http_error', status: response.status },
  );
}
|
|
88
|
+
|
|
89
|
+
/**
 * Get the operational dashboard, preferring the hosted deployment and falling
 * back to a locally built dashboard when that is safe.
 *
 * Outcomes:
 *   - hosted fetch succeeds         -> source 'hosted'
 *   - hosted disabled/unconfigured  -> source 'local' (intentional, not degraded)
 *   - hosted returns 401/403        -> throws `hosted_dashboard_unauthorized`
 *   - any other hosted failure      -> source 'local-unverified' plus a warning
 *
 * @param {object} [options] - Analytics-window options. `options.feedbackDir`
 *   is also honoured by the local fallbacks.
 * @returns {Promise<{source: string, data: *, fallbackReason: (string|null), hostedStatus: (number|null)}>}
 * @throws {Error} code `hosted_dashboard_unauthorized` (with `status`) when the
 *   hosted deployment rejects the credentials.
 */
async function getOperationalDashboard(options = {}) {
  const analyticsWindow = resolveAnalyticsWindow(options);

  // The local fallbacks reuse the already-resolved window, but the caller's
  // feedbackDir must travel with it: buildOperationalDashboard() reads
  // options.feedbackDir, and passing the bare window would silently switch the
  // fallback to the default feedback directory.
  const localOptions = options.feedbackDir && !analyticsWindow.feedbackDir
    ? { ...analyticsWindow, feedbackDir: options.feedbackDir }
    : analyticsWindow;

  try {
    const data = await fetchHostedDashboard(analyticsWindow);
    return {
      source: 'hosted',
      data,
      fallbackReason: null,
      hostedStatus: 200,
    };
  } catch (err) {
    const reason = err && err.message ? err.message : 'hosted_dashboard_unavailable';
    const status = err && typeof err.status === 'number' ? err.status : null;
    const code = err && err.code ? err.code : null;

    // Hosted deliberately disabled or never configured — local fallback is
    // intentional, not a degraded state. Tag as plain 'local'.
    if (code === 'hosted_dashboard_disabled' || code === 'hosted_dashboard_unconfigured') {
      return {
        source: 'local',
        data: await buildOperationalDashboard(localOptions),
        fallbackReason: reason,
        hostedStatus: null,
      };
    }

    // Mirror operational-summary: auth failure is the dangerous case. A
    // dashboard that silently shows $0 revenue (from the local ledger) when
    // Stripe actually has paid customers is a lie the operator acts on.
    // Refuse to guess — surface an actionable error.
    if (status === 401 || status === 403) {
      const authErr = new Error(
        `Hosted operational dashboard rejected credentials (HTTP ${status}). ` +
        `The operator key on this machine does not match the one on the ` +
        `hosted deployment. Fix: set THUMBGATE_OPERATOR_KEY in this shell, ` +
        `or update the operatorKey field in ~/.config/thumbgate/operator.json, ` +
        `to match Railway's THUMBGATE_OPERATOR_KEY. ` +
        `Running north-star without hosted auth would report local-only ` +
        `data as ground truth, which may not reflect actual Stripe revenue. ` +
        `Original response: ${reason}`
      );
      authErr.code = 'hosted_dashboard_unauthorized';
      authErr.status = status;
      throw authErr;
    }

    // Non-auth failure — local fallback is still useful for dev workflows,
    // but tag the source so downstream renderers do not mistake it for
    // verified hosted truth.
    //
    // Log only the status code (trusted) — the full reason contains upstream
    // response text and is only returned structurally via fallbackReason.
    console.warn(
      `[operational-dashboard] Hosted dashboard unreachable (status=${status ?? 'network'}); ` +
      `falling back to LOCAL-UNVERIFIED state. Numbers below may not reflect actual Stripe revenue.`
    );
    return {
      source: 'local-unverified',
      data: await buildOperationalDashboard(localOptions),
      fallbackReason: reason,
      hostedStatus: status,
    };
  }
}
|
|
153
|
+
|
|
154
|
+
module.exports = {
|
|
155
|
+
buildOperationalDashboard,
|
|
156
|
+
fetchHostedDashboard,
|
|
157
|
+
getOperationalDashboard,
|
|
158
|
+
resolveHostedDashboardConfig,
|
|
159
|
+
shouldPreferHostedDashboard,
|
|
160
|
+
};
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Plan Gate — implementing the CodeRabbit "Planning-First" pattern.
|
|
6
|
+
*
|
|
7
|
+
* 1. (Static) Validates structured 'PLAN.md' / 'PRD' content (used in loop-closure).
|
|
8
|
+
* 2. (Dynamic) Intercepts high-risk tool calls during agent execution.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
const fs = require('fs');
|
|
12
|
+
const path = require('path');
|
|
13
|
+
|
|
14
|
+
const RISK_TOOLS = ['Bash', 'Write', 'Edit', 'Deploy'];
|
|
15
|
+
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
// Gate validators (Legacy / Loop Closure)
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
|
|
20
|
+
/**
 * Count the data rows of the first markdown table found under a heading.
 *
 * The section runs from the heading line to the next markdown heading (or the
 * end of the document). Lines starting with `|` are table rows; delimiter rows
 * are discarded and the first remaining row is treated as the header.
 *
 * @param {string} content - Full markdown document.
 * @param {string} sectionHeading - Heading text to look for, matched
 *   literally (regex metacharacters are escaped).
 * @returns {number} Number of data rows (never negative); 0 when the section
 *   or table is missing.
 */
function countTableRows(content, sectionHeading) {
  // Escape the heading so text such as "Q&A (v1.0)" is matched verbatim rather
  // than being interpreted as a regular-expression pattern.
  const escapedHeading = String(sectionHeading).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const sectionRegex = new RegExp(
    `#+\\s*${escapedHeading}[^\\n]*\\n([\\s\\S]*?)(?=\\n#+\\s|$)`,
  );
  const match = content.match(sectionRegex);
  if (!match) return 0;

  // A delimiter row is one where EVERY cell is dashes with optional alignment
  // colons (e.g. `| --- | :---: |`). Checking only the first cell would also
  // throw away real data rows such as `| -1 | ... |` or `| - none | ... |`.
  const isSeparatorRow = (row) => {
    const cells = row.replace(/^\|/, '').replace(/\|$/, '').split('|');
    return cells.every((cell) => /^:?-+:?$/.test(cell.trim()));
  };

  const tableRows = match[1]
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.startsWith('|'));
  const nonSeparatorRows = tableRows.filter((row) => !isSeparatorRow(row));

  // First remaining row is the header.
  return Math.max(0, nonSeparatorRows.length - 1);
}
|
|
35
|
+
|
|
36
|
+
/**
 * Count `interface X` / `type X` declarations inside the fenced code blocks of
 * the "Contracts" section.
 *
 * @param {string} content - Full markdown document.
 * @returns {number} Total declarations found; 0 when there is no Contracts
 *   section or it has no matching declarations.
 */
function countContracts(content) {
  const sectionMatch = content.match(/#+\s*Contracts[^\n]*\n([\s\S]*?)(?=\n#+\s|$)/);
  if (!sectionMatch) {
    return 0;
  }

  // Only text inside ``` fences counts; prose mentioning "interface" does not.
  const fencedBlocks = sectionMatch[1].matchAll(/```[\s\S]*?```/g);
  let total = 0;
  for (const [fencedBlock] of fencedBlocks) {
    const declarations = fencedBlock.match(/\b(interface|type)\s+\w+/g);
    total += declarations ? declarations.length : 0;
  }
  return total;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Count unchecked checklist items (`- [ ]`) in the "Validation Checklist"
 * section.
 *
 * @param {string} content - Full markdown document.
 * @returns {number} Number of lines that start (after optional indentation)
 *   with an empty `- [ ]` checkbox; 0 when the section is missing.
 */
function countValidationScenarios(content) {
  const sectionMatch = content.match(
    /#+\s*Validation\s+Checklist[^\n]*\n([\s\S]*?)(?=\n#+\s|$)/,
  );
  if (!sectionMatch) {
    return 0;
  }

  let uncheckedCount = 0;
  for (const line of sectionMatch[1].split('\n')) {
    if (/^\s*-\s*\[\s*\]/.test(line)) {
      uncheckedCount += 1;
    }
  }
  return uncheckedCount;
}
|
|
63
|
+
|
|
64
|
+
/**
 * Read the plan status: the first whitespace-delimited token on the text
 * following a "Status" heading line.
 *
 * @param {string} content - Full markdown document.
 * @returns {string|null} The status token, or null when no Status heading
 *   followed by content is found.
 */
function getStatus(content) {
  const statusMatch = content.match(/#+\s*Status[^\n]*\n\s*(\S+)/);
  if (!statusMatch) {
    return null;
  }
  const [, token] = statusMatch;
  return token.trim();
}
|
|
68
|
+
|
|
69
|
+
/**
 * Run the static plan gates against a PLAN/PRD markdown document.
 *
 * Gates:
 *   - Clarifying Questions: at least 3 resolved question rows
 *   - Contracts Defined:    at least 1 interface/type declaration
 *   - Validation Checklist: at least 2 unchecked scenarios
 *   - Status:               anything other than COMPLETE
 *
 * @param {string} content - Full markdown document.
 * @returns {{ gates: Array<{name: string, pass: boolean, detail: string}>, allPass: boolean }}
 */
function validatePlan(content) {
  const questionCount = countTableRows(content, 'Clarifying Questions Resolved');
  const contractCount = countContracts(content);
  const scenarioCount = countValidationScenarios(content);
  const status = getStatus(content);

  const isComplete = status === 'COMPLETE';
  const interfaceNoun = contractCount === 1 ? 'interface' : 'interfaces';
  // A finished plan cannot be approved again; everything else is acceptable.
  const statusDetail = isComplete
    ? 'COMPLETE (already finished — cannot re-approve)'
    : `${status || 'UNKNOWN'} (not COMPLETE)`;

  const gates = [
    {
      name: 'Clarifying Questions',
      pass: questionCount >= 3,
      detail: `${questionCount} questions resolved`,
    },
    {
      name: 'Contracts Defined',
      pass: contractCount >= 1,
      detail: `${contractCount} ${interfaceNoun} found`,
    },
    {
      name: 'Validation Checklist',
      pass: scenarioCount >= 2,
      detail: `${scenarioCount} scenarios defined`,
    },
    {
      name: 'Status',
      pass: !isComplete,
      detail: statusDetail,
    },
  ];

  const allPass = !gates.some((gate) => !gate.pass);
  return { gates, allPass };
}
|
|
104
|
+
|
|
105
|
+
/**
 * Render a validatePlan() result as a human-readable report.
 *
 * @param {{ gates: Array<{name: string, pass: boolean, detail: string}>, allPass: boolean }} result
 * @returns {string} One line per gate, a blank line, then the overall verdict.
 */
function formatReport(result) {
  const reportLines = [];
  result.gates.forEach((gate) => {
    const icon = gate.pass ? '✅' : '❌';
    reportLines.push(`${icon} ${gate.name}: ${gate.detail}`);
  });

  const verdict = result.allPass
    ? 'RESULT: PASS — all gates satisfied'
    : 'RESULT: BLOCKED — resolve issues above before spawning agents';

  return [...reportLines, '', verdict].join('\n');
}
|
|
117
|
+
|
|
118
|
+
// ---------------------------------------------------------------------------
|
|
119
|
+
// Dynamic Gating (CodeRabbit Orchestration Pattern)
|
|
120
|
+
// ---------------------------------------------------------------------------
|
|
121
|
+
|
|
122
|
+
/**
 * Evaluates the planning state for the current tool call.
 *
 * Only tools listed in RISK_TOOLS are gated. The checks run in order and the
 * first one that trips is returned:
 *   1. plan-gate-missing          — no PLAN.md in the project root
 *   2. plan-gate-drift            — the action's basename is not mentioned in PLAN.md
 *   3. plan-gate-assumptions      — PLAN.md lists assumptions to verify
 *   4. plan-gate-critique-missing — PLAN.md has no critique/risk wording
 *
 * @param {string} toolName - Name of the tool being invoked.
 * @param {object} [toolInput] - Tool input. `command` is read for Bash;
 *   `filePath` (or snake_case `file_path`) for every other risk tool. A
 *   missing or non-object input simply skips the drift check.
 * @param {{ projectRoot?: string }} [options] - `projectRoot` defaults to
 *   process.cwd().
 * @returns {{decision: string, gate: string, message: string, severity: string}|null}
 *   A warning descriptor, or null when the tool is not gated or every check
 *   passes.
 */
function evaluatePlanGate(toolName, toolInput, options = {}) {
  if (!RISK_TOOLS.includes(toolName)) return null;

  const projectRoot = options.projectRoot || process.cwd();
  const planPath = path.join(projectRoot, 'PLAN.md');

  // Tier 1: Existence Check
  if (!fs.existsSync(planPath)) {
    return {
      decision: 'warn',
      gate: 'plan-gate-missing',
      message: '⚠️ THUMBGATE: High-risk tool call without a PLAN.md. Please create a plan documenting your intent and assumptions.',
      severity: 'high'
    };
  }

  // Tier 2: Alignment Check (Simple)
  const planContent = fs.readFileSync(planPath, 'utf8');
  // Tolerate a missing/non-object input instead of throwing inside the gate.
  const input = toolInput && typeof toolInput === 'object' ? toolInput : {};
  // Accept both camelCase and snake_case path keys so the drift check is not
  // silently skipped for callers that send `file_path`.
  const rawAction = toolName === 'Bash' ? input.command : (input.filePath || input.file_path);
  // path.basename() throws on non-strings; treat those as "no action to check".
  const action = typeof rawAction === 'string' ? rawAction : null;

  if (action && !planContent.toLowerCase().includes(path.basename(action).toLowerCase())) {
    return {
      decision: 'warn',
      gate: 'plan-gate-drift',
      message: `⚠️ THUMBGATE: Strategic Drift detected. The action "${action}" is not mentioned in your PLAN.md.`,
      severity: 'medium'
    };
  }

  // Tier 3: Implicit Assumption Extraction
  const assumptions = extractAssumptions(planContent);
  if (assumptions.length > 0) {
    return {
      decision: 'warn',
      gate: 'plan-gate-assumptions',
      message: '🔍 THUMBGATE: Explicitly verify these implicit assumptions before proceeding:\n- ' + assumptions.join('\n- '),
      severity: 'medium'
    };
  }

  // Tier 4: Self-Critique / Risk Mitigation Check (Tip #8)
  const hasCritique = /(?:critique|self-critique|risk|mitigation|alternative|flaw|weakness|pitfall)/i.test(planContent);
  if (!hasCritique) {
    return {
      decision: 'warn',
      gate: 'plan-gate-critique-missing',
      message: '🧐 THUMBGATE: No Self-Critique/Risk Analysis detected in your PLAN.md. Please add a "Critique", "Risks", or "Mitigations" section to evaluate potential flaws in this plan before executing high-risk tools.',
      severity: 'medium'
    };
  }

  return null;
}
|
|
178
|
+
|
|
179
|
+
/**
 * Pull assumption statements out of plan text.
 *
 * A line contributes when it contains one of the trigger phrases (assume,
 * assumption, implicit, pre-requisite, depends on — case-insensitive, with an
 * optional trailing "s" and ":" or "-") followed by non-blank text; that
 * trailing text is the extracted assumption.
 *
 * @param {string} content - Plan text.
 * @returns {string[]} Up to the first five extracted assumptions, trimmed.
 */
function extractAssumptions(content) {
  const triggerPattern = /(?:assume|assumption|implicit|pre-requisite|depends on)s?[:\-]?\s*(.*)/i;

  return content
    .split('\n')
    .map((line) => line.match(triggerPattern))
    .filter((found) => found && found[1].trim())
    .map((found) => found[1].trim())
    .slice(0, 5);
}
|
|
195
|
+
|
|
196
|
+
// ---------------------------------------------------------------------------
|
|
197
|
+
// Main
|
|
198
|
+
// ---------------------------------------------------------------------------
|
|
199
|
+
|
|
200
|
+
/**
 * CLI entry point: validate a plan file and report the result.
 *
 * Usage: node scripts/plan-gate.js <plan-file.md> [--json]
 *
 * Prints either the JSON result or the formatted report, then exits with
 * status 0 when every gate passes and 1 otherwise (including a missing
 * argument or a file that does not exist).
 */
function run() {
  const cliArgs = process.argv.slice(2);
  const wantsJson = cliArgs.includes('--json');
  // The first argument that is not the --json flag is the plan file.
  const filePath = cliArgs.find((arg) => arg !== '--json');

  if (!filePath) {
    console.error('Usage: node scripts/plan-gate.js <plan-file.md> [--json]');
    process.exit(1);
  }

  const resolved = path.resolve(filePath);
  if (!fs.existsSync(resolved)) {
    console.error(`File not found: ${resolved}`);
    process.exit(1);
  }

  const result = validatePlan(fs.readFileSync(resolved, 'utf-8'));
  const output = wantsJson ? JSON.stringify(result, null, 2) : formatReport(result);
  console.log(output);

  process.exit(result.allPass ? 0 : 1);
}
|
|
227
|
+
|
|
228
|
+
// Export for testing
|
|
229
|
+
module.exports = {
|
|
230
|
+
validatePlan,
|
|
231
|
+
formatReport,
|
|
232
|
+
countTableRows,
|
|
233
|
+
countContracts,
|
|
234
|
+
countValidationScenarios,
|
|
235
|
+
getStatus,
|
|
236
|
+
evaluatePlanGate,
|
|
237
|
+
extractAssumptions,
|
|
238
|
+
};
|
|
239
|
+
|
|
240
|
+
// Run only when executed directly
|
|
241
|
+
if (require.main === module) {
|
|
242
|
+
run();
|
|
243
|
+
}
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
// repeat-metric — first-class "repeat-attempts blocked before execution" metric
|
|
5
|
+
//
|
|
6
|
+
// This module exposes data ThumbGate already collects in gate-stats state. It
|
|
7
|
+
// does NOT write to disk; it is a pure function over gates-engine.loadStats().
|
|
8
|
+
//
|
|
9
|
+
// The headline number is stats.recurringBlocks — incremented by recordStat()
|
|
10
|
+
// in gates-engine.js every time the SAME gateId fires twice within one session
|
|
11
|
+
// bucket. That is exactly "a pre-action gate fire that stopped a tool call the
|
|
12
|
+
// agent had already been blocked on", i.e. a repeat attempt prevented before it
|
|
13
|
+
// could round-trip and execute.
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
|
|
16
|
+
const gatesEngine = require('./gates-engine');
|
|
17
|
+
|
|
18
|
+
/**
 * Split each gate's fires into first-time and repeat fires.
 *
 * As described for recordStat() in gates-engine, stats.sessionFiredGates holds
 * one flag per (session bucket, gate) pair that is set on the gate's first
 * fire in that bucket, and stats.byGate[id] holds the running blocked/warned
 * totals. From those:
 *   firstBlocks  = number of session buckets whose flag for the gate is truthy
 *   repeatBlocks = (blocked + warned) - firstBlocks, clamped at 0 so that
 *                  partially-written or legacy state never goes negative
 *
 * @param {object} stats raw object returned by gates-engine.loadStats()
 * @returns {Object<string,{firstBlocks:number, repeatBlocks:number}>}
 */
function computeByGateSplit(stats) {
  const sessionFiredGates = (stats && stats.sessionFiredGates) || {};
  const rawByGate = (stats && stats.byGate) || {};

  // firstBlocks: one per session bucket in which the gate's flag is set.
  const firstBlocksByGate = {};
  for (const firedInSession of Object.values(sessionFiredGates)) {
    for (const [gateId, didFire] of Object.entries(firedInSession || {})) {
      if (!didFire) continue;
      firstBlocksByGate[gateId] = (firstBlocksByGate[gateId] || 0) + 1;
    }
  }

  // Report every gate known to either source, byGate entries first.
  const knownGateIds = new Set(Object.keys(rawByGate).concat(Object.keys(firstBlocksByGate)));

  const split = {};
  knownGateIds.forEach((gateId) => {
    const { blocked, warned } = rawByGate[gateId] || {};
    const totalFires = (blocked || 0) + (warned || 0);
    const firstBlocks = firstBlocksByGate[gateId] || 0;
    split[gateId] = {
      firstBlocks,
      // Everything beyond the first fire per session bucket is a repeat.
      repeatBlocks: Math.max(0, totalFires - firstBlocks),
    };
  });
  return split;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Compute the repeat-attempts-blocked-before-execution metric.
 *
 * Reads gates-engine.loadStats() and derives everything from that snapshot;
 * this function itself writes nothing. If loading throws or yields a falsy
 * value, an empty stats object is used, so every count comes back as 0.
 *
 * @returns {{
 *   repeatBlocksBeforeExecution: number,
 *   recurringBlocks: number,
 *   totalBlocked: number,
 *   byGate: Object<string,{firstBlocks:number, repeatBlocks:number}>
 * }}
 */
function computeRepeatMetric() {
  let stats = {};
  try {
    const loaded = gatesEngine.loadStats();
    if (loaded) {
      stats = loaded;
    }
  } catch (_) {
    // Best effort: unreadable stats are reported as "no data".
    stats = {};
  }

  const recurringBlocks = Number(stats.recurringBlocks || 0);
  const totalBlocked = Number(stats.blocked || 0);
  const byGate = computeByGateSplit(stats);

  return {
    // Headline: a pre-action block that stopped a tool call the agent had
    // already been blocked on this session.
    repeatBlocksBeforeExecution: recurringBlocks,
    recurringBlocks,
    totalBlocked,
    byGate,
  };
}
|
|
101
|
+
|
|
102
|
+
/**
 * Return a shallow copy of a gate-stats object with the repeat metric attached
 * under `repeat`. The input object is never mutated.
 *
 * Intended for the objects produced by gate-stats.calculateStats() or
 * dashboard.computeGateStats(), so callers need not import any internals.
 * Anything that is not an object is treated as an empty object.
 *
 * @param {object} gateStatsObject
 * @returns {object} copy of gateStatsObject with `.repeat`
 */
function mergeRepeatMetricIntoGateStats(gateStatsObject) {
  const isObjectLike = Boolean(gateStatsObject) && typeof gateStatsObject === 'object';
  const base = isObjectLike ? gateStatsObject : {};
  const repeat = computeRepeatMetric();
  return { ...base, repeat };
}
|
|
116
|
+
|
|
117
|
+
module.exports = {
|
|
118
|
+
computeRepeatMetric,
|
|
119
|
+
mergeRepeatMetricIntoGateStats,
|
|
120
|
+
computeByGateSplit,
|
|
121
|
+
};
|
|
@@ -4,7 +4,14 @@
|
|
|
4
4
|
/**
|
|
5
5
|
* Silent-Failure Clustering — Unsupervised candidate source for the meta-agent loop
|
|
6
6
|
*
|
|
7
|
-
*
|
|
7
|
+
* Default-ON since 2026-05-21. Opt-out with: THUMBGATE_SILENT_FAILURE_CLUSTERING=0
|
|
8
|
+
* (or set NODE_ENV=test to skip in test runs). Was opt-in for the initial
|
|
9
|
+
* landing of PR #2285; flipped to default-on because the entire point is to
|
|
10
|
+
* cover the case where users never give thumbs-down — keeping it opt-in
|
|
11
|
+
* means lazy users (the ones who need it most) never benefit. Bounded risk:
|
|
12
|
+
* candidates still flow through meta-agent-loop's existing fp-rate eval, so
|
|
13
|
+
* a noisy cluster can't auto-promote to a real gate without passing the
|
|
14
|
+
* same precision/recall thresholds as LLM-generated candidates.
|
|
8
15
|
*
|
|
9
16
|
* Problem: ThumbGate's HITL loop only learns from explicit thumbs-down. Tool calls
|
|
10
17
|
* that fail without user feedback (exit_code != 0, regex-matched error in output,
|
|
@@ -460,9 +467,20 @@ function generateSilentFailureCandidates(opts = {}) {
|
|
|
460
467
|
// CLI
|
|
461
468
|
// ---------------------------------------------------------------------------
|
|
462
469
|
|
|
470
|
+
/**
 * Resolve the enabled state. Default ON.
 *
 * Opt out by setting THUMBGATE_SILENT_FAILURE_CLUSTERING to "0", "false",
 * "off" or "no" (case-insensitive, surrounding whitespace ignored).
 * NODE_ENV=test also opts out to keep test runs deterministic.
 *
 * @param {Object} [env=process.env] - Environment map to read from. Values are
 *   coerced to strings, so an injected `0` or `false` opts out as well.
 * @returns {boolean} true unless one of the opt-outs above applies.
 */
function isSilentFailureClusteringEnabled(env = process.env) {
  if (env.NODE_ENV === 'test') return false;

  const value = env.THUMBGATE_SILENT_FAILURE_CLUSTERING;
  // Coerce before normalizing: a caller-supplied env may hold numbers or
  // booleans, and values sourced from files often carry stray whitespace.
  const raw = value === undefined || value === null
    ? ''
    : String(value).trim().toLowerCase();

  if (raw === '0' || raw === 'false' || raw === 'off' || raw === 'no') return false;
  return true;
}
|
|
480
|
+
|
|
463
481
|
async function main() {
|
|
464
|
-
if (
|
|
465
|
-
process.stdout.write('silent-failure-cluster: disabled (
|
|
482
|
+
if (!isSilentFailureClusteringEnabled()) {
|
|
483
|
+
process.stdout.write('silent-failure-cluster: disabled (THUMBGATE_SILENT_FAILURE_CLUSTERING=0 or NODE_ENV=test)\n');
|
|
466
484
|
return;
|
|
467
485
|
}
|
|
468
486
|
|
|
@@ -492,6 +510,7 @@ if (require.main === module) {
|
|
|
492
510
|
|
|
493
511
|
module.exports = {
|
|
494
512
|
generateSilentFailureCandidates,
|
|
513
|
+
isSilentFailureClusteringEnabled,
|
|
495
514
|
// exported for testing
|
|
496
515
|
redactSecrets,
|
|
497
516
|
normalizePaths,
|
|
@@ -301,6 +301,24 @@ function getCalibration(model) {
|
|
|
301
301
|
// Posterior Sampling
|
|
302
302
|
// ---------------------------------------------------------------------------
|
|
303
303
|
|
|
304
|
+
/**
 * Apply Thompson temperature scaling to a Beta posterior.
 *
 * Both parameters are multiplied by 1/T, so their ratio (and hence the
 * posterior mean) is kept while the precision changes: T < 1 sharpens the
 * posterior, T > 1 flattens it. Each scaled parameter is floored at 0.01; a
 * temperature that is not a non-zero number falls back to 1.0, and the
 * effective temperature is never below 0.01.
 *
 * @param {Object} params - Category posterior parameters (`alpha`, `beta`)
 * @param {number} temperature - Scaling factor (default 1.0)
 * @returns {{ alpha: number, beta: number }}
 */
function getTemperatureScaledPosteriorParams(params, temperature = 1.0) {
  const requestedTemperature = Number(temperature) || 1.0;
  const T = Math.max(0.01, requestedTemperature);
  const invT = 1.0 / T;

  const scaleParameter = (parameter) => Math.max(parameter * invT, 0.01);

  const alpha = scaleParameter(params.alpha);
  const beta = scaleParameter(params.beta);
  return { alpha, beta };
}
|
|
321
|
+
|
|
304
322
|
/**
|
|
305
323
|
* Draw one sample from the Beta posterior for each category.
|
|
306
324
|
* Supports temperature scaling to adjust exploitation vs exploration.
|
|
@@ -318,13 +336,9 @@ function getCalibration(model) {
|
|
|
318
336
|
*/
|
|
319
337
|
function samplePosteriors(model, temperature = 1.0) {
|
|
320
338
|
const samples = {};
|
|
321
|
-
const T = Math.max(0.01, Number(temperature) || 1.0);
|
|
322
|
-
const invT = 1.0 / T;
|
|
323
339
|
|
|
324
340
|
for (const [cat, params] of Object.entries(model.categories || {})) {
|
|
325
|
-
|
|
326
|
-
const alpha = Math.max(params.alpha * invT, 0.01);
|
|
327
|
-
const beta = Math.max(params.beta * invT, 0.01);
|
|
341
|
+
const { alpha, beta } = getTemperatureScaledPosteriorParams(params, temperature);
|
|
328
342
|
samples[cat] = betaSample(alpha, beta);
|
|
329
343
|
}
|
|
330
344
|
return samples;
|
|
@@ -457,6 +471,7 @@ module.exports = {
|
|
|
457
471
|
getReliability,
|
|
458
472
|
isCalibrated,
|
|
459
473
|
getCalibration,
|
|
474
|
+
getTemperatureScaledPosteriorParams,
|
|
460
475
|
samplePosteriors,
|
|
461
476
|
argmaxPosteriors,
|
|
462
477
|
pickBestCategory,
|
package/scripts/tool-registry.js
CHANGED
|
@@ -895,6 +895,56 @@ const TOOLS = [
|
|
|
895
895
|
},
|
|
896
896
|
},
|
|
897
897
|
}),
|
|
898
|
+
destructiveTool({
|
|
899
|
+
name: 'detect_noop',
|
|
900
|
+
title: 'Detect No-op Action',
|
|
901
|
+
description: 'Detect whether a tool call was a no-op (state unchanged) or identical to a prior attempt in the session — a cheap repeat-loop signal. Records the action attempt state for repeat detection.',
|
|
902
|
+
inputSchema: {
|
|
903
|
+
type: 'object',
|
|
904
|
+
required: ['actionId'],
|
|
905
|
+
properties: {
|
|
906
|
+
actionId: { type: 'string', description: 'Stable identifier for the action being checked (e.g. the file path or command being attempted)' },
|
|
907
|
+
kind: { type: 'string', enum: ['file', 'command'], description: 'Action kind: file edit/write or command execution' },
|
|
908
|
+
filePath: { type: 'string', description: 'Path of the file the action targets (file kind)' },
|
|
909
|
+
beforeContent: { type: 'string', description: 'File content before the action (file kind)' },
|
|
910
|
+
afterContent: { type: 'string', description: 'File content after the action (file kind)' },
|
|
911
|
+
exitCode: { type: 'number', description: 'Command exit code (command kind)' },
|
|
912
|
+
stdout: { type: 'string', description: 'Command stdout (command kind)' },
|
|
913
|
+
stderr: { type: 'string', description: 'Command stderr (command kind)' },
|
|
914
|
+
sessionId: { type: 'string', description: 'Optional session id used to scope repeat-attempt detection' },
|
|
915
|
+
},
|
|
916
|
+
},
|
|
917
|
+
}),
|
|
918
|
+
destructiveTool({
|
|
919
|
+
name: 'record_action_receipt',
|
|
920
|
+
title: 'Record Action Receipt',
|
|
921
|
+
description: 'Pair a tracked tool call with its outcome (diff, exit code, test result) so a promoted lesson encodes "this action -> this outcome", not just a thumbs signal. Appends to the action-receipts log.',
|
|
922
|
+
inputSchema: {
|
|
923
|
+
type: 'object',
|
|
924
|
+
required: ['actionId'],
|
|
925
|
+
properties: {
|
|
926
|
+
actionId: { type: 'string', description: 'Identifier of the tracked action this receipt pairs with' },
|
|
927
|
+
toolName: { type: 'string', description: 'Name of the tool that was invoked' },
|
|
928
|
+
toolInput: { type: 'object', description: 'Structured input the tool was called with' },
|
|
929
|
+
diff: { type: 'string', description: 'Optional unified diff or change summary produced by the action' },
|
|
930
|
+
exitCode: { type: 'number', description: 'Optional command exit code outcome' },
|
|
931
|
+
testOutcome: { type: 'string', description: 'Optional test outcome (e.g. passed, failed, 12/12)' },
|
|
932
|
+
stateHash: { type: 'string', description: 'Optional post-action state hash (from detect_noop)' },
|
|
933
|
+
},
|
|
934
|
+
},
|
|
935
|
+
}),
|
|
936
|
+
readOnlyTool({
|
|
937
|
+
name: 'get_action_receipts',
|
|
938
|
+
title: 'Get Action Receipts',
|
|
939
|
+
description: 'Read outcome-paired action receipts. Returns the receipt for a specific actionId, or the most recent receipts when no actionId is given.',
|
|
940
|
+
inputSchema: {
|
|
941
|
+
type: 'object',
|
|
942
|
+
properties: {
|
|
943
|
+
actionId: { type: 'string', description: 'Optional action id to fetch the matching receipt for' },
|
|
944
|
+
limit: { type: 'number', description: 'Max number of recent receipts to return when no actionId is given (default 20)' },
|
|
945
|
+
},
|
|
946
|
+
},
|
|
947
|
+
}),
|
|
898
948
|
readOnlyTool({
|
|
899
949
|
name: 'verify_claim',
|
|
900
950
|
description: 'Check whether a claim has enough tracked evidence before the agent asserts it.',
|