thumbgate 1.22.0 → 1.23.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +1 -1
- package/.well-known/mcp/server-card.json +1 -1
- package/README.md +1 -0
- package/adapters/chatgpt/openapi.yaml +10 -0
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/mcp/server-stdio.js +1 -1
- package/adapters/opencode/opencode.json +1 -1
- package/bin/cli.js +212 -30
- package/config/post-deploy-marketing-pages.json +5 -0
- package/openapi/openapi.yaml +10 -0
- package/package.json +13 -3
- package/public/agents-cost-savings.html +151 -0
- package/public/ai-malpractice-prevention.html +489 -0
- package/public/codex-plugin.html +1 -1
- package/public/index.html +34 -3
- package/public/numbers.html +2 -2
- package/public/pricing.html +1 -1
- package/public/pro.html +22 -0
- package/scripts/cli-telemetry.js +6 -1
- package/scripts/commercial-offer.js +72 -0
- package/scripts/gates-engine.js +119 -6
- package/scripts/meta-agent-loop.js +32 -0
- package/scripts/pro-local-dashboard.js +4 -4
- package/scripts/rate-limiter.js +7 -1
- package/scripts/self-healing-check.js +193 -0
- package/scripts/silent-failure-cluster.js +512 -0
- package/scripts/telemetry-analytics.js +38 -0
- package/src/api/server.js +252 -36
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
const fs = require('node:fs');
|
|
3
|
+
const os = require('node:os');
|
|
4
|
+
const path = require('node:path');
|
|
5
|
+
const { spawnSync } = require('node:child_process');
|
|
6
|
+
const { diagnoseFailure } = require('./failure-diagnostics');
|
|
7
|
+
const { appendDiagnosticRecord } = require('./feedback-loop');
|
|
8
|
+
|
|
9
|
+
const PROJECT_ROOT = path.join(__dirname, '..');
|
|
10
|
+
/** Default cap (64 MiB) on captured child output; runCommand passes it as `maxBuffer`. */
const DEFAULT_MAX_BUFFER_BYTES = 64 * 1024 * 1024;

/** Timeout used for the `tests` check when no usable override is configured (one hour). */
const FALLBACK_TESTS_TIMEOUT_MS = 60 * 60_000;

/**
 * Turn the raw THUMBGATE_SELF_HEAL_TEST_TIMEOUT_MS value into a timeout.
 *
 * Only positive integers are accepted. Missing, non-numeric, zero and
 * negative values all fall back to one hour; previously a negative value
 * slipped through because it is truthy, and a negative number is not a
 * usable timeout.
 *
 * @param {string|undefined} rawValue - Raw environment variable value.
 * @returns {number} Timeout in milliseconds.
 */
function resolveTestsTimeoutMs(rawValue) {
  const parsed = Number.parseInt(String(rawValue || ''), 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : FALLBACK_TESTS_TIMEOUT_MS;
}

const DEFAULT_TESTS_TIMEOUT_MS = resolveTestsTimeoutMs(
  process.env.THUMBGATE_SELF_HEAL_TEST_TIMEOUT_MS,
);

/**
 * Checks executed by default, in order. Entries flagged `useTempProofDir`
 * get a throwaway proof directory from createCheckEnvironment.
 */
const DEFAULT_CHECKS = [
  { name: 'budget_status', command: ['npm', 'run', 'budget:status'], timeoutMs: 60_000 },
  { name: 'tests', command: ['npm', 'test'], timeoutMs: DEFAULT_TESTS_TIMEOUT_MS },
  { name: 'prove_adapters', command: ['npm', 'run', 'prove:adapters'], timeoutMs: 10 * 60_000, useTempProofDir: true },
  { name: 'prove_automation', command: ['npm', 'run', 'prove:automation'], timeoutMs: 10 * 60_000, useTempProofDir: true },
  { name: 'prove_data_pipeline', command: ['npm', 'run', 'prove:data-pipeline'], timeoutMs: 10 * 60_000, useTempProofDir: true },
  { name: 'prove_tessl', command: ['npm', 'run', 'prove:tessl'], timeoutMs: 10 * 60_000, useTempProofDir: true },
];
|
|
24
|
+
|
|
25
|
+
/**
 * Run a command synchronously without a shell and summarise the outcome.
 *
 * @param {string[]} command - Executable followed by its arguments.
 * @param {object} [options]
 * @param {string} [options.cwd] - Working directory (defaults to PROJECT_ROOT).
 * @param {number} [options.timeoutMs] - Passed to spawnSync as `timeout` (default five minutes).
 * @param {object} [options.env] - Environment for the child (defaults to process.env).
 * @param {number} [options.maxBufferBytes] - Passed to spawnSync as `maxBuffer`.
 * @returns {{exitCode: number, durationMs: number, stdout: string, stderr: string, error: (string|null)}}
 *   `exitCode` is the child's integer status, or 1 when no integer status is
 *   available. `error` carries the spawn error message, or names the
 *   terminating signal when the child was killed without a spawn error.
 */
function runCommand(command, {
  cwd = PROJECT_ROOT,
  timeoutMs = 5 * 60_000,
  env = process.env,
  maxBufferBytes = DEFAULT_MAX_BUFFER_BYTES,
} = {}) {
  const [executable, ...argv] = command;
  const startedAtMs = Date.now();
  const result = spawnSync(executable, argv, {
    cwd,
    env,
    encoding: 'utf-8',
    timeout: timeoutMs,
    maxBuffer: maxBufferBytes,
    shell: false,
  });
  const durationMs = Date.now() - startedAtMs;

  let error = result.error ? result.error.message : null;
  // A signal-killed child has no status and may have no spawn error; without
  // this the caller would see a bare exit code 1 with no explanation.
  if (error === null && result.status === null && result.signal) {
    error = `terminated by signal ${result.signal}`;
  }

  return {
    exitCode: Number.isInteger(result.status) ? result.status : 1,
    durationMs,
    stdout: result.stdout || '',
    stderr: result.stderr || '',
    error,
  };
}
|
|
52
|
+
|
|
53
|
+
/**
 * Build the environment a check runs with.
 *
 * Every check gets a shallow copy of process.env. Checks flagged with
 * `useTempProofDir` also get a fresh temporary directory exported as
 * THUMBGATE_PROOF_DIR (and as THUMBGATE_AUTOMATION_PROOF_DIR for the
 * `prove_automation` check), plus a cleanup callback that deletes it.
 *
 * @param {{name: string, useTempProofDir?: boolean}} check
 * @returns {{env: object, cleanup: (Function|null)}} `cleanup` is null when
 *   no temporary directory was created.
 */
function createCheckEnvironment(check) {
  const env = { ...process.env };

  if (!check.useTempProofDir) {
    return { env, cleanup: null };
  }

  const scratchDir = fs.mkdtempSync(path.join(os.tmpdir(), `thumbgate-${check.name}-`));
  env.THUMBGATE_PROOF_DIR = scratchDir;
  if (check.name === 'prove_automation') {
    env.THUMBGATE_AUTOMATION_PROOF_DIR = scratchDir;
  }

  const cleanup = () => {
    fs.rmSync(scratchDir, { recursive: true, force: true });
  };
  return { env, cleanup };
}
|
|
70
|
+
|
|
71
|
+
/**
 * Run every check and assemble a health report.
 *
 * Each check runs through `runner` with an environment from
 * createCheckEnvironment; any temporary directory is removed as soon as the
 * runner returns or throws. A non-zero exit code marks the check unhealthy
 * and is passed to diagnoseFailure; the diagnosis is additionally handed to
 * appendDiagnosticRecord when `persistDiagnostics` is set.
 *
 * @param {object} [options]
 * @param {object[]} [options.checks] - Check descriptors (defaults to DEFAULT_CHECKS).
 * @param {Function} [options.runner] - `(command, { cwd, timeoutMs, env })` returning
 *   `{ exitCode, durationMs, stdout, stderr, error }` (defaults to runCommand).
 * @param {string} [options.cwd] - Working directory handed to the runner.
 * @param {boolean} [options.persistDiagnostics] - Persist diagnoses of failed checks.
 * @returns {{generatedAt: string, durationMs: number, overall_status: string,
 *   summary: {total: number, healthy: number, unhealthy: number}, checks: object[]}}
 */
function collectHealthReport({
  checks = DEFAULT_CHECKS,
  runner = runCommand,
  cwd = PROJECT_ROOT,
  persistDiagnostics = false,
} = {}) {
  const startedAt = new Date();

  const evaluateCheck = (check) => {
    const { env, cleanup } = createCheckEnvironment(check);
    let run;
    try {
      run = runner(check.command, { cwd, timeoutMs: check.timeoutMs, env });
    } finally {
      if (cleanup) {
        cleanup();
      }
    }

    const commandLine = check.command.join(' ');
    const combinedOutput = `${run.stdout}\n${run.stderr}`.trim();
    // Only the last 2000 characters are kept in the report.
    const outputTail = combinedOutput.slice(-2000);
    const healthy = run.exitCode === 0;

    let diagnosis = null;
    if (!healthy) {
      diagnosis = diagnoseFailure({
        step: check.name,
        context: commandLine,
        healthCheck: {
          name: check.name,
          exitCode: run.exitCode,
          status: 'unhealthy',
          outputTail,
        },
        exitCode: run.exitCode,
        error: run.error,
        output: combinedOutput,
      });
    }

    let persistedDiagnosis = null;
    if (persistDiagnostics && diagnosis) {
      persistedDiagnosis = appendDiagnosticRecord({
        source: 'self_heal_check',
        step: check.name,
        context: commandLine,
        diagnosis,
        metadata: {
          command: commandLine,
        },
      });
    }

    return {
      name: check.name,
      command: commandLine,
      status: healthy ? 'healthy' : 'unhealthy',
      exitCode: run.exitCode,
      durationMs: run.durationMs,
      error: run.error,
      outputTail,
      diagnosis,
      persistedDiagnosis,
    };
  };

  const results = checks.map((check) => evaluateCheck(check));
  const healthyCount = results.reduce(
    (count, result) => count + (result.status === 'healthy' ? 1 : 0),
    0,
  );
  const unhealthyCount = results.length - healthyCount;

  return {
    generatedAt: startedAt.toISOString(),
    durationMs: Date.now() - startedAt.getTime(),
    overall_status: unhealthyCount === 0 ? 'healthy' : 'unhealthy',
    summary: {
      total: results.length,
      healthy: healthyCount,
      unhealthy: unhealthyCount,
    },
    checks: results,
  };
}
|
|
142
|
+
|
|
143
|
+
/**
 * Render a health report as plain text.
 *
 * The output starts with a three-line header and a blank line, then one line
 * per check. Unhealthy checks are followed by their command and, when
 * present, the error message and the diagnosis root-cause category.
 *
 * @param {object} report - Report as produced by collectHealthReport.
 * @returns {string} Newline-terminated text.
 */
function reportToText(report) {
  const header = [
    `Self-Healing Health Check @ ${report.generatedAt}`,
    `Overall: ${report.overall_status.toUpperCase()}`,
    `Checks: ${report.summary.healthy}/${report.summary.total} healthy`,
    '',
  ];

  const body = report.checks.flatMap((check) => {
    const isHealthy = check.status === 'healthy';
    const rendered = [`${isHealthy ? '✅' : '❌'} ${check.name} (${check.durationMs}ms)`];
    if (isHealthy) {
      return rendered;
    }

    rendered.push(` command: ${check.command}`);
    if (check.error) {
      rendered.push(` error: ${check.error}`);
    }
    if (check.diagnosis && check.diagnosis.rootCauseCategory) {
      rendered.push(` diagnosis: ${check.diagnosis.rootCauseCategory}`);
    }
    return rendered;
  });

  return `${[...header, ...body].join('\n')}\n`;
}
|
|
164
|
+
|
|
165
|
+
/**
 * Command-line entry point.
 *
 * Flags (read from process.argv):
 *   --json     print the report as indented JSON instead of text
 *   --no-fail  keep a zero exit status even when the report is unhealthy
 *
 * Diagnostics for failed checks are always persisted from the CLI.
 *
 * An unhealthy report sets `process.exitCode` instead of calling
 * `process.exit()`, so output that has been written but not yet flushed is
 * not cut off when the process ends.
 */
function runCli() {
  const flags = new Set(process.argv.slice(2));
  const wantsJson = flags.has('--json');
  const suppressFailure = flags.has('--no-fail');
  const report = collectHealthReport({ persistDiagnostics: true });

  if (wantsJson) {
    console.log(JSON.stringify(report, null, 2));
  } else {
    process.stdout.write(reportToText(report));
  }

  if (!suppressFailure && report.overall_status !== 'healthy') {
    process.exitCode = 1;
  }
}
|
|
181
|
+
|
|
182
|
+
module.exports = {
|
|
183
|
+
DEFAULT_CHECKS,
|
|
184
|
+
DEFAULT_TESTS_TIMEOUT_MS,
|
|
185
|
+
DEFAULT_MAX_BUFFER_BYTES,
|
|
186
|
+
runCommand,
|
|
187
|
+
collectHealthReport,
|
|
188
|
+
reportToText,
|
|
189
|
+
};
|
|
190
|
+
|
|
191
|
+
if (require.main === module) {
|
|
192
|
+
runCli();
|
|
193
|
+
}
|
|
@@ -0,0 +1,512 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Silent-Failure Clustering — Unsupervised candidate source for the meta-agent loop
|
|
6
|
+
*
|
|
7
|
+
* Off by default. Enabled with: THUMBGATE_SILENT_FAILURE_CLUSTERING=1
|
|
8
|
+
*
|
|
9
|
+
* Problem: ThumbGate's HITL loop only learns from explicit thumbs-down. Tool calls
|
|
10
|
+
* that fail without user feedback (exit_code != 0, regex-matched error in output,
|
|
11
|
+
* agent silently recovers) are invisible to `auto-promote-gates.js`. This module
|
|
12
|
+
* mines those silent failures from the JSONL conversation logs, clusters them by
|
|
13
|
+
* (tool, normalized-arg-signature), and emits candidate prevention rules that
|
|
14
|
+
* flow through the EXISTING meta-agent-loop fp-rate eval — never bypassed.
|
|
15
|
+
*
|
|
16
|
+
* Pipeline:
|
|
17
|
+
* 1. Reuse `discoverConversationLogs` from `self-distill-agent.js` to find logs
|
|
18
|
+
* 2. Read each JSONL line; extract tool calls (Bash, Edit, Write, …) with their args
|
|
19
|
+
* and adjacent tool_result entries that carry exit_code / error text
|
|
20
|
+
* 3. Filter to "failed" calls (exit_code != 0 OR matches one of ERROR_PATTERNS,
|
|
21
|
+
* mirroring `self-distill-agent.js`)
|
|
22
|
+
* 4. Drop any call whose timestamp is within ±5min of a feedback-log entry —
|
|
23
|
+
* those are already in the HITL loop and would double-count
|
|
24
|
+
* 5. Normalize args: absolute paths → `<HOME>/…`, redact secrets per the
|
|
25
|
+
* canonical regex set in `~/.claude/hooks/daily-log-append.sh`
|
|
26
|
+
* 6. Cluster by exact tuple `(tool, normalized-arg-signature)`, min size 3
|
|
27
|
+
* 7. Emit each cluster as a candidate with `origin: 'silent-failure-cluster'`
|
|
28
|
+
* so meta-agent-loop tags it for downstream precision measurement
|
|
29
|
+
*
|
|
30
|
+
* Known limitations (locked in by the spec):
|
|
31
|
+
* - Only worthwhile on workspaces generating ≥ 50 tool calls/day. Surfaces
|
|
32
|
+
* "insufficient data, skipped" cleanly rather than emitting noise.
|
|
33
|
+
* - Cluster ≠ bad; we rely on the exit_code / ERROR_PATTERNS filter to make
|
|
34
|
+
* a cluster a *failure* cluster.
|
|
35
|
+
* - No drift detection. If tools change, old clusters pollute. Out of scope for v1.
|
|
36
|
+
*/
|
|
37
|
+
|
|
38
|
+
const fs = require('fs');
|
|
39
|
+
const os = require('os');
|
|
40
|
+
const path = require('path');
|
|
41
|
+
|
|
42
|
+
const {
|
|
43
|
+
discoverConversationLogs,
|
|
44
|
+
} = require('./self-distill-agent');
|
|
45
|
+
|
|
46
|
+
// Output patterns that mark a tool result as failed. Per the original note
// this list mirrors ERROR_PATTERNS in self-distill-agent.js, which does not
// export the constant; it is duplicated here so both modules stay
// independently testable. Switch to an import if self-distill ever exports it.
const ERROR_PATTERNS = [
  /\bError:/i, // "Error:" in any letter case
  /\bFAIL\b/, // upper-case FAIL as a whole word
  /\bnot ok\b/, // lower-case "not ok" as a whole phrase
  /exit code\s*(?:!=\s*0|[1-9]\d*)/i, // "exit code != 0" or a non-zero exit code
  /\bERROR\b/, // upper-case ERROR as a whole word
  /\bTypeError\b/,
  /\bReferenceError\b/,
  /\bSyntaxError\b/,
  /\bcommand failed\b/i,
  /\bexited with\s+[1-9]/i, // "exited with <non-zero>"
];
|
|
62
|
+
|
|
63
|
+
// Home directory used for path normalization: the first non-empty value of
// $HOME, $USERPROFILE and os.homedir(), or '' when none is available.
const HOME = [process.env.HOME, process.env.USERPROFILE].find(Boolean) || os.homedir() || '';

// Minimum number of identical failures needed to form a cluster.
const MIN_CLUSTER_SIZE = 3;
// Below this many paired tool calls the run is reported as insufficient data.
const MIN_DAILY_CALLS_FOR_USEFUL_CLUSTERING = 50;
// Failures within this distance of a feedback entry are skipped (±5 min).
const FEEDBACK_PROXIMITY_WINDOW_MS = 5 * 60 * 1000;
|
|
68
|
+
|
|
69
|
+
// ---------------------------------------------------------------------------
|
|
70
|
+
// Redaction — keep in sync with ~/.claude/hooks/daily-log-append.sh
|
|
71
|
+
// ---------------------------------------------------------------------------
|
|
72
|
+
|
|
73
|
+
// Each entry pairs a global regex with the placeholder that replaces every match.
const SECRET_PATTERNS = [
  // Stripe + GitHub + Slack + AWS + Google + npm + Anthropic keys
  {
    re: /(sk_live_|sk_test_|rk_live_|rk_test_|ghp_|gho_|ghu_|ghs_|ghr_|github_pat_|xoxb-|xoxp-|xapp-|AKIA|AIza|npm_|sk-ant-[A-Za-z0-9]*-?|sk-proj-|sk-svcacct-)[A-Za-z0-9_-]{8,}/g,
    replacement: '[REDACTED]',
  },
  // JWT (3 base64url segments)
  { re: /eyJ[A-Za-z0-9_-]{20,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}/g, replacement: '[REDACTED-JWT]' },
  // Slack webhook. The path class includes lower-case letters: the previous
  // upper-case-only class stopped at the first lower-case character and left
  // the remainder of the webhook token in the output.
  // NOTE(review): if daily-log-append.sh carries the same class, mirror this fix there.
  { re: /https:\/\/hooks\.slack\.com\/services\/[A-Za-z0-9/]+/g, replacement: '[REDACTED-SLACK-WEBHOOK]' },
  // Private key header (only the BEGIN line is replaced)
  { re: /-----BEGIN [A-Z ]*PRIVATE KEY-----/g, replacement: '[REDACTED-PRIVATE-KEY-HEADER]' },
];

/**
 * Replace every secret-looking substring with its placeholder.
 *
 * @param {*} text - Value to scrub; null/undefined become '', anything else
 *   is converted with String().
 * @returns {string} The scrubbed text.
 */
function redactSecrets(text) {
  let out = String(text == null ? '' : text);
  for (const { re, replacement } of SECRET_PATTERNS) {
    out = out.replace(re, replacement);
  }
  return out;
}
|
|
94
|
+
|
|
95
|
+
// ---------------------------------------------------------------------------
|
|
96
|
+
// Path normalization
|
|
97
|
+
// ---------------------------------------------------------------------------
|
|
98
|
+
|
|
99
|
+
/**
 * Replace machine-specific path prefixes with stable placeholders.
 *
 * - every occurrence of HOME becomes `<HOME>`
 * - `/Users/<name>` and `/home/<name>` become `<HOME>`
 * - the first segment under `/tmp/` or `/private/tmp/` becomes `/tmp/<X>`
 *
 * Both temp-directory spellings are handled by one expression. Previously
 * the `/tmp/` rule ran first and rewrote the tail of `/private/tmp/...`, so
 * the `/private/tmp/` rule never matched and the two spellings normalized to
 * different strings.
 *
 * @param {*} text - Value to normalize; null/undefined become ''.
 * @returns {string} The normalized text.
 */
function normalizePaths(text) {
  let out = String(text == null ? '' : text);
  if (HOME) {
    // Replace every literal occurrence of this machine's home directory.
    out = out.split(HOME).join('<HOME>');
  }
  // Generic /Users/<name> and /home/<name> prefixes that do not match HOME.
  out = out.replace(/\/Users\/[^/\s"']+/g, '<HOME>');
  out = out.replace(/\/home\/[^/\s"']+/g, '<HOME>');
  out = out.replace(/(?:\/private)?\/tmp\/[A-Za-z0-9._-]+/g, '/tmp/<X>'); // NOSONAR — regex on strings, not filesystem
  return out;
}
|
|
112
|
+
|
|
113
|
+
/**
 * Scrub secrets and then normalize paths in a value.
 *
 * Redaction runs first on purpose: some secrets contain path-like
 * characters, so they must be removed before path rewriting sees them.
 *
 * @param {*} value - Value to normalize.
 * @returns {string} The redacted, path-normalized text.
 */
function normalizeForSignature(value) {
  const withoutSecrets = redactSecrets(value);
  return normalizePaths(withoutSecrets);
}
|
|
117
|
+
|
|
118
|
+
// ---------------------------------------------------------------------------
|
|
119
|
+
// JSONL parsing
|
|
120
|
+
// ---------------------------------------------------------------------------
|
|
121
|
+
|
|
122
|
+
/**
 * Read a JSONL file without ever throwing.
 *
 * A missing path, an unreadable file or blank content yields []. Lines that
 * are empty, fail to parse, or parse to a falsy value are dropped.
 *
 * @param {string} filePath - Path of the JSONL file.
 * @returns {Array} Parsed, truthy line values in file order.
 */
function readJsonlSafe(filePath) {
  if (!filePath || !fs.existsSync(filePath)) return [];

  let raw;
  try {
    raw = fs.readFileSync(filePath, 'utf-8');
  } catch {
    return [];
  }
  if (!raw.trim()) return [];

  const records = [];
  for (const line of raw.split('\n')) {
    if (!line) continue;
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      continue;
    }
    if (parsed) records.push(parsed);
  }
  return records;
}
|
|
142
|
+
|
|
143
|
+
// ---------------------------------------------------------------------------
|
|
144
|
+
// Tool-call + failure extraction
|
|
145
|
+
// ---------------------------------------------------------------------------
|
|
146
|
+
|
|
147
|
+
/**
 * Turn one parsed transcript entry into normalized tool events.
 *
 * Two input shapes are recognized:
 *   (A) Claude Code transcript format:
 *       { type:"assistant", message:{ content:[ { type:"tool_use", name, input, id } ] } }
 *       { type:"user", message:{ content:[ { type:"tool_result", tool_use_id, content, is_error } ] }, toolUseResult: {...} }
 *   (B) Simplified test fixture format:
 *       { type:"tool_call", tool, args, timestamp }
 *       { type:"tool_result", tool_use_id, exit_code, output, timestamp }
 *
 * Output events:
 *   { kind:'call', tool, args, callId, timestamp }
 *   { kind:'result', callId, exitCode, output, isError, timestamp }
 *
 * @param {*} entry - One parsed JSONL line.
 * @returns {object[]} Zero or more events; [] for anything unrecognized.
 */
function extractToolEvents(entry) {
  if (!entry || typeof entry !== 'object') return [];

  const timestamp = entry.timestamp || entry.ts || null;
  // Snake-case field wins over camel-case; anything non-numeric becomes null.
  const numberOrNull = (primary, secondary) => {
    if (typeof primary === 'number') return primary;
    return typeof secondary === 'number' ? secondary : null;
  };

  // Shape (B) — test fixture / simplified
  if (entry.type === 'tool_call' && entry.tool) {
    return [{
      kind: 'call',
      tool: String(entry.tool),
      args: entry.args || entry.input || {},
      callId: entry.callId || entry.id || null,
      timestamp,
    }];
  }
  if (entry.type === 'tool_result' && (entry.tool_use_id || entry.callId)) {
    return [{
      kind: 'result',
      callId: entry.tool_use_id || entry.callId,
      exitCode: numberOrNull(entry.exit_code, entry.exitCode),
      output: String(entry.output || entry.content || ''),
      isError: Boolean(entry.is_error || entry.isError),
      timestamp,
    }];
  }

  // Shape (A) — Claude Code transcript
  const message = entry.message;
  if (!message || !Array.isArray(message.content)) return [];

  if (entry.type === 'assistant') {
    return message.content
      .filter((part) => part && part.type === 'tool_use' && part.name)
      .map((part) => ({
        kind: 'call',
        tool: String(part.name),
        args: part.input || {},
        callId: part.id || null,
        timestamp,
      }));
  }

  if (entry.type === 'user') {
    const toolUseResult = entry.toolUseResult || {};
    // A result with non-empty stderr and no `interrupted` field gets no exit code.
    const stderrWithoutInterruptFlag = typeof toolUseResult.stderr === 'string'
      && toolUseResult.stderr.length > 0
      && typeof toolUseResult.interrupted === 'undefined';
    const exitCode = stderrWithoutInterruptFlag
      ? null
      : numberOrNull(toolUseResult.exit_code, toolUseResult.exitCode);

    const contentToText = (content) => {
      if (typeof content === 'string') return content;
      if (!Array.isArray(content)) return '';
      return content
        .map((chunk) => (typeof chunk === 'string' ? chunk : (chunk && chunk.text) || ''))
        .join('\n');
    };

    return message.content
      .filter((part) => part && part.type === 'tool_result')
      .map((part) => ({
        kind: 'result',
        callId: part.tool_use_id || null,
        exitCode,
        output: contentToText(part.content),
        isError: Boolean(part.is_error),
        timestamp,
      }));
  }

  return [];
}
|
|
230
|
+
|
|
231
|
+
/**
 * Pair tool calls with their results by callId.
 *
 * Calls are indexed first, so a result pairs with its call regardless of the
 * order in which the two appear. Calls without an id, calls that never get a
 * result, and results that reference no known call produce no pair; they are
 * treated as carrying no failure signal. When several calls share an id, the
 * last one wins.
 *
 * @param {object[]} events - Events as produced by extractToolEvents.
 * @returns {Array<{call: object, result: object}>} Pairs in result order.
 */
function pairCallsWithResults(events) {
  const callsById = new Map();
  for (const event of events) {
    if (event.kind === 'call' && event.callId) {
      callsById.set(event.callId, event);
    }
  }

  const paired = [];
  for (const event of events) {
    if (event.kind !== 'result' || !event.callId) continue;
    const call = callsById.get(event.callId);
    if (call) {
      paired.push({ call, result: event });
    }
  }
  return paired;
}
|
|
253
|
+
|
|
254
|
+
/**
 * Decide whether a call/result pair represents a failed tool call.
 *
 * A pair counts as failed when the result is flagged as an error, carries a
 * non-zero numeric exit code, or its output matches any of ERROR_PATTERNS.
 *
 * @param {{result: object}} pair - Pair from pairCallsWithResults.
 * @returns {boolean} True when the call failed.
 */
function isFailedCall(pair) {
  const outcome = pair.result;
  if (!outcome) return false;
  if (outcome.isError === true) return true;

  const { exitCode } = outcome;
  if (typeof exitCode === 'number' && exitCode !== 0) return true;

  const text = String(outcome.output || '');
  return ERROR_PATTERNS.some((pattern) => pattern.test(text));
}
|
|
265
|
+
|
|
266
|
+
// ---------------------------------------------------------------------------
|
|
267
|
+
// Feedback-log proximity filter
|
|
268
|
+
// ---------------------------------------------------------------------------
|
|
269
|
+
|
|
270
|
+
/**
 * Load the timestamps of all feedback-log entries.
 *
 * Each entry's `timestamp` (or `ts`) is parsed with Date.parse; entries
 * without one, or whose value does not parse to a finite number, are skipped.
 *
 * @param {string} feedbackLogPath - Path of the feedback JSONL file.
 * @returns {number[]} Epoch milliseconds in ascending order.
 */
function loadFeedbackTimestamps(feedbackLogPath) {
  return readJsonlSafe(feedbackLogPath)
    .map((record) => record && (record.timestamp || record.ts))
    .filter(Boolean)
    .map((rawTimestamp) => Date.parse(rawTimestamp))
    .filter((epochMs) => Number.isFinite(epochMs))
    .sort((a, b) => a - b);
}
|
|
281
|
+
|
|
282
|
+
/**
 * Tell whether any feedback entry lies within `windowMs` of a timestamp.
 *
 * @param {string} timestampIso - Timestamp of the tool call, parsed with Date.parse.
 * @param {number[]} feedbackTimestamps - Feedback times in epoch milliseconds.
 * @param {number} [windowMs] - Half-width of the window (inclusive).
 * @returns {boolean} False when the timestamp is missing or unparseable, or
 *   when there are no feedback timestamps.
 */
function hasAdjacentFeedback(timestampIso, feedbackTimestamps, windowMs = FEEDBACK_PROXIMITY_WINDOW_MS) {
  const nothingToCompare = !timestampIso || !feedbackTimestamps || feedbackTimestamps.length === 0;
  if (nothingToCompare) return false;

  const callTime = Date.parse(timestampIso);
  if (!Number.isFinite(callTime)) return false;

  // Plain linear scan: the feedback log is expected to stay small. Move to a
  // binary search only if that assumption stops holding.
  let found = false;
  for (const feedbackTime of feedbackTimestamps) {
    if (Math.abs(feedbackTime - callTime) <= windowMs) {
      found = true;
      break;
    }
  }
  return found;
}
|
|
293
|
+
|
|
294
|
+
// ---------------------------------------------------------------------------
|
|
295
|
+
// Signature + clustering
|
|
296
|
+
// ---------------------------------------------------------------------------
|
|
297
|
+
|
|
298
|
+
/**
 * Build a stable string signature for a tool call.
 *
 * - Bash with a string `command`: the normalized command, capped at 160 chars.
 * - Read / Edit / Write with a string `file_path`: the normalized path.
 * - Anything else: `key=value` pairs joined by `|`, keys sorted, each
 *   normalized value capped at 80 chars (non-string values are JSON-encoded).
 *   If the arguments cannot be serialized the signature is
 *   `<tool>:<unserializable>`.
 *
 * @param {string} tool - Tool name.
 * @param {*} args - Tool arguments.
 * @returns {string} Signature of the form `<tool>:<body>`.
 */
function argsToSignature(tool, args) {
  const clean = (value) => normalizeForSignature(String(value == null ? '' : value));
  const isFileTool = tool === 'Read' || tool === 'Edit' || tool === 'Write';

  if (tool === 'Bash' && args && typeof args.command === 'string') {
    return `Bash:${clean(args.command).slice(0, 160)}`;
  }
  if (isFileTool && args && typeof args.file_path === 'string') {
    return `${tool}:${clean(args.file_path)}`;
  }

  // Generic fallback — sorted keys, normalized values
  try {
    const fields = [];
    for (const key of Object.keys(args || {}).sort()) {
      const raw = args[key];
      const text = typeof raw === 'string' ? raw : JSON.stringify(raw);
      fields.push(`${key}=${clean(text).slice(0, 80)}`);
    }
    return `${tool}:${fields.join('|')}`;
  } catch {
    return `${tool}:<unserializable>`;
  }
}
|
|
322
|
+
|
|
323
|
+
/**
 * Group failures by signature and keep the groups that are large enough.
 *
 * @param {Array<{tool: string, args: *, output: *}>} failures
 * @param {object} [options]
 * @param {number} [options.minClusterSize] - Smallest group that is reported.
 * @returns {Array<{signature: string, tool: string, size: number,
 *   sampleArgs: string, sampleOutput: string}>} Clusters, largest first. The
 *   samples come from the first member of each group, are normalized, and
 *   are capped at 200 characters.
 */
function clusterFailures(failures, { minClusterSize = MIN_CLUSTER_SIZE } = {}) {
  const membersBySignature = new Map();
  for (const failure of failures) {
    const signature = argsToSignature(failure.tool, failure.args);
    const members = membersBySignature.get(signature);
    if (members) {
      members.push(failure);
    } else {
      membersBySignature.set(signature, [failure]);
    }
  }

  const clusters = [...membersBySignature]
    .filter(([, members]) => members.length >= minClusterSize)
    .map(([signature, members]) => {
      const [sample] = members;
      return {
        signature,
        tool: sample.tool,
        size: members.length,
        // Only redacted + normalized excerpts leave this function.
        sampleArgs: normalizeForSignature(JSON.stringify(sample.args || {})).slice(0, 200),
        sampleOutput: normalizeForSignature(String(sample.output || '')).slice(0, 200),
      };
    });

  return clusters.sort((a, b) => b.size - a.size);
}
|
|
346
|
+
|
|
347
|
+
// ---------------------------------------------------------------------------
|
|
348
|
+
// Candidate emission — same shape as meta-agent-loop.js candidates
|
|
349
|
+
// ---------------------------------------------------------------------------
|
|
350
|
+
|
|
351
|
+
/**
 * Convert a failure cluster into a candidate prevention rule.
 *
 * The pattern is built from the signature body (the part after `<tool>:`).
 * Up to three keywords are taken: tokens split on whitespace, `/`, `|` and
 * `=`, with `<`/`>` stripped, at least four characters long and not purely
 * numeric. Two or more keywords are regex-escaped and joined with `.*`;
 * otherwise the first 60 characters of the body are escaped as-is.
 *
 * An empty or whitespace-only pattern is never emitted. Per the original
 * note the pattern is later compiled with `new RegExp(pattern, 'i')`, where
 * an empty pattern would match every entry; in that case the escaped tool
 * name (or, failing that, the signature) is used so the rule stays scoped.
 *
 * @param {{signature: string, tool: string, size: number,
 *   sampleArgs: string, sampleOutput: string}} cluster
 * @returns {object} Candidate with `pattern`, `action`, `message`,
 *   `severity`, `rationale`, `source`, `origin`, `clusterSize` and
 *   `clusterSignature`.
 */
function candidateFromCluster(cluster) {
  const escapeRegex = (s) => String(s).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  const sigBody = cluster.signature.replace(/^[^:]+:/, '');
  const keywords = sigBody
    .split(/[\s/|=]+/)
    .map((w) => w.replace(/[<>]/g, '').trim())
    .filter((w) => w.length >= 4 && !/^[0-9]+$/.test(w))
    .slice(0, 3);

  let pattern = keywords.length >= 2
    ? keywords.map(escapeRegex).join('.*')
    : escapeRegex(sigBody.slice(0, 60));
  if (pattern.trim() === '') {
    // Nothing usable in the signature body (e.g. a tool called without arguments).
    pattern = escapeRegex(cluster.tool || cluster.signature);
  }

  return {
    pattern,
    action: 'warn',
    message: `Silent-failure cluster (${cluster.size}× ${cluster.tool}): ${cluster.sampleOutput.slice(0, 100) || cluster.sampleArgs.slice(0, 100)}`,
    severity: 'medium',
    rationale: `Observed ${cluster.size} silent failures matching ${cluster.tool} signature; never thumbed-down by user.`,
    source: 'silent-failure-cluster',
    origin: 'silent-failure-cluster',
    clusterSize: cluster.size,
    clusterSignature: cluster.signature,
  };
}
|
|
379
|
+
|
|
380
|
+
// ---------------------------------------------------------------------------
|
|
381
|
+
// Main entry point
|
|
382
|
+
// ---------------------------------------------------------------------------
|
|
383
|
+
|
|
384
|
+
/**
 * Mine conversation logs for clustered silent failures and turn each
 * cluster into a candidate rule.
 *
 * For every log: parse it, extract tool events, pair calls with results,
 * keep the failed pairs, and drop those whose timestamp is close to a
 * feedback-log entry. If the total number of paired calls is below
 * `minDailyCalls`, no candidates are produced and `stats.skippedReason`
 * explains why.
 *
 * @param {object} opts
 * @param {string[]} [opts.logPaths] — override conversation-log discovery (tests)
 * @param {string} [opts.feedbackLogPath] — feedback-log.jsonl to exclude HITL'd calls
 * @param {number} [opts.minClusterSize]
 * @param {number} [opts.minDailyCalls]
 * @returns {{
 *   candidates: object[],
 *   stats: {
 *     totalToolCalls: number,
 *     failedCalls: number,
 *     filteredByFeedback: number,
 *     clusters: number,
 *     skippedReason: string|null
 *   }
 * }}
 */
function generateSilentFailureCandidates(opts = {}) {
  const {
    logPaths = discoverConversationLogs({ limit: 50 }),
    feedbackLogPath = null,
    minClusterSize = MIN_CLUSTER_SIZE,
    minDailyCalls = MIN_DAILY_CALLS_FOR_USEFUL_CLUSTERING,
  } = opts;

  const stats = {
    totalToolCalls: 0,
    failedCalls: 0,
    filteredByFeedback: 0,
    clusters: 0,
    skippedReason: null,
  };

  // Without a feedback log the proximity filter has nothing to match.
  const feedbackTimestamps = feedbackLogPath ? loadFeedbackTimestamps(feedbackLogPath) : [];
  const silentFailures = [];

  for (const logPath of logPaths) {
    const events = readJsonlSafe(logPath).flatMap(extractToolEvents);
    const pairs = pairCallsWithResults(events);
    stats.totalToolCalls += pairs.length;

    pairs.forEach((pair) => {
      if (!isFailedCall(pair)) return;
      stats.failedCalls += 1;

      const timestamp = pair.call.timestamp || pair.result.timestamp;
      if (hasAdjacentFeedback(timestamp, feedbackTimestamps)) {
        // Already covered by explicit feedback; counting it again would double-count.
        stats.filteredByFeedback += 1;
        return;
      }

      silentFailures.push({
        tool: pair.call.tool,
        args: pair.call.args,
        output: pair.result.output,
        timestamp,
      });
    });
  }

  // Insufficient-data path — report why and emit nothing.
  if (stats.totalToolCalls < minDailyCalls) {
    stats.skippedReason = `insufficient-data: ${stats.totalToolCalls} tool calls < ${minDailyCalls} threshold`;
    return { candidates: [], stats };
  }

  const clusters = clusterFailures(silentFailures, { minClusterSize });
  stats.clusters = clusters.length;

  return { candidates: clusters.map(candidateFromCluster), stats };
}
|
|
458
|
+
|
|
459
|
+
// ---------------------------------------------------------------------------
|
|
460
|
+
// CLI
|
|
461
|
+
// ---------------------------------------------------------------------------
|
|
462
|
+
|
|
463
|
+
/**
 * CLI entry point.
 *
 * Does nothing but print a notice unless
 * THUMBGATE_SILENT_FAILURE_CLUSTERING is exactly '1'. When enabled it
 * resolves the feedback log (if possible), generates candidates, and prints
 * them with their statistics as indented JSON on stdout.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const enabled = process.env.THUMBGATE_SILENT_FAILURE_CLUSTERING === '1';
  if (!enabled) {
    process.stdout.write('silent-failure-cluster: disabled (set THUMBGATE_SILENT_FAILURE_CLUSTERING=1 to enable)\n');
    return;
  }

  // Loaded lazily so the disabled path never touches feedback-paths.
  const { resolveFeedbackDir } = require('./feedback-paths');
  let feedbackLogPath = null;
  try {
    feedbackLogPath = path.join(resolveFeedbackDir(), 'feedback-log.jsonl');
  } catch {
    // Not running inside a configured feedback dir: proceed without the proximity filter.
  }

  const { candidates, stats } = generateSilentFailureCandidates({ feedbackLogPath });
  const payload = {
    enabled: true,
    candidateCount: candidates.length,
    stats,
    candidates,
  };
  process.stdout.write(JSON.stringify(payload, null, 2) + '\n');
}
|
|
485
|
+
|
|
486
|
+
if (require.main === module) {
|
|
487
|
+
main().catch((err) => {
|
|
488
|
+
process.stderr.write(`silent-failure-cluster failed: ${err.message}\n`);
|
|
489
|
+
process.exitCode = 1;
|
|
490
|
+
});
|
|
491
|
+
}
|
|
492
|
+
|
|
493
|
+
module.exports = {
|
|
494
|
+
generateSilentFailureCandidates,
|
|
495
|
+
// exported for testing
|
|
496
|
+
redactSecrets,
|
|
497
|
+
normalizePaths,
|
|
498
|
+
normalizeForSignature,
|
|
499
|
+
extractToolEvents,
|
|
500
|
+
pairCallsWithResults,
|
|
501
|
+
isFailedCall,
|
|
502
|
+
hasAdjacentFeedback,
|
|
503
|
+
loadFeedbackTimestamps,
|
|
504
|
+
argsToSignature,
|
|
505
|
+
clusterFailures,
|
|
506
|
+
candidateFromCluster,
|
|
507
|
+
readJsonlSafe,
|
|
508
|
+
ERROR_PATTERNS,
|
|
509
|
+
MIN_CLUSTER_SIZE,
|
|
510
|
+
MIN_DAILY_CALLS_FOR_USEFUL_CLUSTERING,
|
|
511
|
+
FEEDBACK_PROXIMITY_WINDOW_MS,
|
|
512
|
+
};
|