@kernel.chat/kbot 3.99.20 → 3.99.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -0
- package/dist/agent.js +23 -0
- package/dist/agents/producer.js +65 -23
- package/dist/auth.d.ts +2 -0
- package/dist/cli.js +7 -4
- package/dist/critic-gate.d.ts +29 -0
- package/dist/critic-gate.js +223 -0
- package/dist/critic-retrospect.d.ts +64 -0
- package/dist/critic-retrospect.js +279 -0
- package/dist/critic-taxonomy.d.ts +40 -0
- package/dist/critic-taxonomy.js +146 -0
- package/dist/growth.d.ts +37 -0
- package/dist/growth.js +272 -0
- package/dist/integrations/ableton.d.ts +30 -0
- package/dist/integrations/ableton.js +66 -0
- package/dist/integrations/kbot-control-client.d.ts +66 -0
- package/dist/integrations/kbot-control-client.js +224 -0
- package/dist/observer.d.ts +13 -0
- package/dist/observer.js +5 -1
- package/dist/planner/hierarchical/dag.d.ts +71 -0
- package/dist/planner/hierarchical/dag.js +97 -0
- package/dist/planner/hierarchical/persistence.d.ts +26 -0
- package/dist/planner/hierarchical/persistence.js +113 -0
- package/dist/planner/hierarchical/session-planner.d.ts +68 -0
- package/dist/planner/hierarchical/session-planner.js +141 -0
- package/dist/planner/hierarchical/types.d.ts +116 -0
- package/dist/planner/hierarchical/types.js +18 -0
- package/dist/tool-pipeline.d.ts +39 -1
- package/dist/tool-pipeline.js +109 -1
- package/dist/tools/ableton-listen.d.ts +2 -0
- package/dist/tools/ableton-listen.js +126 -0
- package/dist/tools/ableton.js +477 -12
- package/dist/tools/index.js +2 -0
- package/dist/tools/kbot-control.d.ts +2 -0
- package/dist/tools/kbot-control.js +63 -0
- package/package.json +1 -1
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Critic Retrospect — retroactive judgement of past session tool calls.
|
|
3
|
+
*
|
|
4
|
+
* Reads ~/.kbot/observer/session.jsonl, replays tool calls through
|
|
5
|
+
* gateToolResult (critic-gate.ts), and reports:
|
|
6
|
+
* - overall accept/reject ratio
|
|
7
|
+
* - tools with highest reject rate (args-validation candidates)
|
|
8
|
+
* - rejects that were later retried successfully (critic false positives)
|
|
9
|
+
* - sessions ranked by "retries saved" score
|
|
10
|
+
* - suggested strictness setting from precision/recall tradeoff
|
|
11
|
+
*
|
|
12
|
+
* NB: the observer only logs {ts, tool, args, session} — no results.
|
|
13
|
+
* We synthesize a *result proxy* from retry behaviour: a call whose exact
|
|
14
|
+
* (tool, args-hash) recurs inside the same session within RETRY_WINDOW_MS
|
|
15
|
+
* is treated as having implicitly failed the first time. The critic is
|
|
16
|
+
* passed this synthesized signal so it can judge on intent + shape.
|
|
17
|
+
*
|
|
18
|
+
* Cache: ~/.kbot/critic-cache.json — keyed by (tool, argsHash, resultHash, strictness).
|
|
19
|
+
*
|
|
20
|
+
* CLI wiring: cli.ts was modified in parallel; leaving subcommand wiring
|
|
21
|
+
* as a TODO. Invoke via `node -e "import('./dist/critic-retrospect.js').then(m => m.run())"`.
|
|
22
|
+
*/
|
|
23
|
+
import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'node:fs';
|
|
24
|
+
import { homedir } from 'node:os';
|
|
25
|
+
import { join, dirname } from 'node:path';
|
|
26
|
+
import { createHash } from 'node:crypto';
|
|
27
|
+
import { gateToolResult } from './critic-gate.js';
|
|
28
|
+
// All files this module touches live under ~/.kbot.
const KBOT_HOME = join(homedir(), '.kbot');
/** Observer log that readEvents() parses line by line. */
const OBSERVER_PATH = join(KBOT_HOME, 'observer', 'session.jsonl');
/** JSON file where critic verdicts are cached between runs. */
const CACHE_PATH = join(KBOT_HOME, 'critic-cache.json');
/** Two minutes: an identical call repeated within this span counts as a retry. */
const RETRY_WINDOW_MS = 120_000;
|
|
31
|
+
/**
 * Short fingerprint of a string.
 * @param {string} s - text to hash
 * @returns {string} first 16 hex characters of the SHA-256 digest of `s`
 */
function sha(s) {
    const hexDigest = createHash('sha256').update(s).digest('hex');
    return hexDigest.slice(0, 16);
}
|
|
32
|
+
/**
 * Fingerprint an arbitrary argument value.
 *
 * The value is serialized with JSON.stringify and hashed with sha(). If either
 * step throws, the hash of String(args) is returned instead.
 *
 * @param {*} args - value to fingerprint
 * @returns {string} 16-character hex fingerprint
 */
function hashArgs(args) {
    try {
        const serialized = JSON.stringify(args);
        return sha(serialized);
    }
    catch {
        const fallbackText = String(args);
        return sha(fallbackText);
    }
}
|
|
38
|
+
/**
 * Load the persisted verdict cache from CACHE_PATH.
 *
 * @returns {object} the parsed cache, or an empty object when the file is
 *   missing, unreadable, not valid JSON, or valid JSON that is not a plain
 *   object (e.g. `null`, an array, a number).
 */
function loadCache() {
    if (!existsSync(CACHE_PATH))
        return {};
    try {
        const parsed = JSON.parse(readFileSync(CACHE_PATH, 'utf8'));
        // The cache is used as a string-keyed dictionary (`cache[key]`), so anything
        // other than a plain object would crash or silently drop entries later.
        const isPlainObject = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
        return isPlainObject ? parsed : {};
    }
    catch {
        // Unreadable or corrupt cache: start fresh rather than fail the run.
        return {};
    }
}
|
|
48
|
+
/**
 * Write the verdict cache to CACHE_PATH as 2-space-indented JSON, creating the
 * parent directory first if needed.
 *
 * Best-effort: any error raised while creating the directory, serializing or
 * writing is swallowed.
 *
 * @param {object} c - cache object to persist
 */
function saveCache(c) {
    try {
        const parentDir = dirname(CACHE_PATH);
        mkdirSync(parentDir, { recursive: true });
        const payload = JSON.stringify(c, null, 2);
        writeFileSync(CACHE_PATH, payload);
    }
    catch {
        /* best-effort */
    }
}
|
|
55
|
+
/**
 * Read the observer log at OBSERVER_PATH and return its usable events.
 *
 * The file is treated as JSONL: blank lines and lines that fail to parse are
 * skipped, and a parsed value is kept only when its `tool`, `session` and `ts`
 * fields are all truthy.
 *
 * @returns {object[]} events in file order; empty when the log does not exist
 */
function readEvents() {
    if (!existsSync(OBSERVER_PATH))
        return [];
    const events = [];
    const lines = readFileSync(OBSERVER_PATH, 'utf8').split('\n');
    for (const text of lines) {
        if (!text.trim())
            continue;
        let parsed;
        try {
            parsed = JSON.parse(text);
        }
        catch {
            continue; // malformed line: skip
        }
        const usable = parsed && parsed.tool && parsed.session && parsed.ts;
        if (usable)
            events.push(parsed);
    }
    return events;
}
|
|
72
|
+
/**
 * Group events by session and keep the N most recently active sessions.
 *
 * A session's recency is the latest parsable `ts` among ALL of its events, so a
 * single unparsable or out-of-order trailing line cannot make an active session
 * look like the oldest one. Unparsable timestamps count as 0.
 *
 * @param {object[]} events - events carrying at least `session` and `ts`
 * @param {number} n - how many sessions to keep; negative values keep none
 * @returns {{picked: Map<*, object[]>, available: number}} `picked` maps session
 *   id to its events (file order), most recent session first; `available` is the
 *   total number of distinct sessions seen.
 */
function pickLastNSessions(events, n) {
    const bySession = new Map();
    for (const e of events) {
        const arr = bySession.get(e.session) ?? [];
        arr.push(e);
        bySession.set(e.session, arr);
    }
    const latestTs = (evs) => evs.reduce((max, ev) => Math.max(max, Date.parse(ev.ts) || 0), 0);
    const ordered = [...bySession.entries()]
        .map(([id, evs]) => ({ id, evs, lastTs: latestTs(evs) }))
        .sort((a, b) => b.lastTs - a.lastTs);
    // slice(0, negative) would mean "all but the last |n|", which is never what a
    // caller asking for "the last n sessions" wants.
    const limit = typeof n === 'number' && n < 0 ? 0 : n;
    const picked = new Map(ordered.slice(0, limit).map(({ id, evs }) => [id, evs]));
    return { picked, available: bySession.size };
}
|
|
89
|
+
/**
 * Annotate every call of one session with retry information.
 *
 * Each returned record is a copy of the event plus:
 *  - `argsHash`: fingerprint of `{ tool, args }`;
 *  - `retriedLater`: a later call with the same tool and argsHash exists before
 *    the scan reaches a call more than RETRY_WINDOW_MS after this one;
 *  - `retrySucceeded`: for retried calls only, true when the LAST matching call
 *    in the session is itself not marked `retriedLater`.
 *
 * NOTE(review): the last matching call has no later match, so it looks like it
 * can never be `retriedLater`; if so, `retrySucceeded` is always equal to
 * `retriedLater` — confirm whether that is intended.
 *
 * @param {object[]} sessionEvents - events of a single session, in log order
 * @returns {object[]} annotated copies, same order and length as the input
 */
function enrich(sessionEvents) {
    const calls = sessionEvents.map((ev) => {
        const argsHash = hashArgs({ tool: ev.tool, args: ev.args });
        return { ...ev, argsHash, retriedLater: false, retrySucceeded: false };
    });
    const sameCall = (x, y) => x.tool === y.tool && x.argsHash === y.argsHash;
    const stamps = calls.map((call) => Date.parse(call.ts) || 0);

    // Pass 1: look ahead for an identical call, stopping once outside the window.
    calls.forEach((head, i) => {
        for (let j = i + 1; j < calls.length; j += 1) {
            if (stamps[j] - stamps[i] > RETRY_WINDOW_MS)
                break;
            if (sameCall(calls[j], head)) {
                head.retriedLater = true;
                break;
            }
        }
    });

    // Pass 2: for retried calls, inspect the final identical call of the session.
    for (const [i, call] of calls.entries()) {
        if (!call.retriedLater)
            continue;
        let j = calls.length - 1;
        while (j > i && !sameCall(calls[j], call))
            j -= 1;
        if (j > i)
            call.retrySucceeded = !calls[j].retriedLater;
    }
    return calls;
}
|
|
122
|
+
/**
 * Produce the textual stand-in for a tool result that is handed to the critic.
 *
 * @param {{retriedLater: boolean}} c - enriched call
 * @returns {string} one of two fixed messages, chosen by `c.retriedLater`
 */
function synthResult(c) {
    const windowSeconds = RETRY_WINDOW_MS / 1000;
    return c.retriedLater
        ? `[observer-proxy] no result captured; same (tool,args) was retried within ${windowSeconds}s — likely failed or unsatisfactory.`
        : `[observer-proxy] no result captured; call was not retried in-session — presumed accepted by the agent downstream.`;
}
|
|
128
|
+
/**
 * Obtain the critic's verdict for one enriched call, consulting the cache first.
 *
 * The cache key combines tool, argsHash, a hash of the synthesized result proxy
 * and the strictness formatted to two decimals, so verdicts obtained at
 * different strictness settings never collide.
 *
 * @param {object} c - enriched call (`tool`, `args`, `argsHash`, `retriedLater`)
 * @param {number} strictness - critic strictness forwarded to gateToolResult
 * @param {object} cache - mutable verdict cache; a fresh verdict is stored in it
 * @param {*} llmClient - forwarded unchanged to gateToolResult
 * @returns {Promise<object>} the cached or freshly computed verdict
 */
async function judge(c, strictness, cache, llmClient) {
    const resultProxy = synthResult(c);
    const resultHash = sha(resultProxy);
    const key = `${c.tool}:${c.argsHash}:${resultHash}:${strictness.toFixed(2)}`;
    const hit = cache[key];
    // Only trust well-formed entries: a truthy entry without a verdict object
    // (hand-edited or truncated cache file) must be re-judged, not returned as
    // `undefined` to a caller that immediately reads `.accept` from it.
    if (hit?.verdict && typeof hit.verdict === 'object')
        return hit.verdict;
    const verdict = await gateToolResult(c.tool, c.args, resultProxy, { strictness, llmClient });
    cache[key] = { verdict, cachedAt: new Date().toISOString() };
    return verdict;
}
|
|
139
|
+
/**
 * Precision/recall sweep used to suggest a critic strictness.
 *
 * "retriedLater" (`row.bad`) is treated as ground truth for "call was bad";
 * critic rejects are the positives:
 *   TP = reject && bad      FP = reject && !bad
 *   FN = accept && bad      TN = accept && !bad
 *   Precision = TP / (TP + FP), Recall = TP / (TP + FN)
 *
 * For each candidate strictness s in 0.1 … 0.9 a reject only counts when its
 * confidence is at least 1 - s (stricter => more rejects pass through). The
 * candidate with the highest F1 wins; ties keep the lowest s.
 *
 * When no candidate achieves a positive F1 (no rows, or no reject ever lines up
 * with a bad call) there is no signal to tune on, so the neutral default 0.5 is
 * suggested with precision and recall of 0.
 *
 * @param {{reject: boolean, bad: boolean, confidence: number}[]} rows
 * @returns {{suggested: number, precision: number, recall: number}}
 */
function prCurve(rows) {
    const candidates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9];
    // Tolerance for binary floating point: 1 - 0.7 evaluates to 0.30000000000000004,
    // which would otherwise exclude a confidence of exactly 0.3.
    const EPSILON = 1e-9;
    let best = { s: 0.5, f1: 0, p: 0, r: 0 };
    for (const s of candidates) {
        const threshold = 1 - s - EPSILON;
        let tp = 0;
        let fp = 0;
        let fn = 0;
        for (const r of rows) {
            const gatedReject = r.reject && r.confidence >= threshold;
            if (gatedReject && r.bad)
                tp++;
            else if (gatedReject)
                fp++;
            else if (r.bad)
                fn++;
        }
        const p = tp + fp > 0 ? tp / (tp + fp) : 0;
        const rec = tp + fn > 0 ? tp / (tp + fn) : 0;
        const f1 = p + rec > 0 ? (2 * p * rec) / (p + rec) : 0;
        if (f1 > best.f1)
            best = { s, f1, p, r: rec };
    }
    return { suggested: best.s, precision: best.p, recall: best.r };
}
|
|
172
|
+
/**
 * Replay recent observer sessions through the critic and print a report.
 *
 * @param {object} [opts]
 * @param {number} [opts.sessions=10] - number of most recent sessions to scan
 * @param {number} [opts.strictness=0.5] - strictness passed to the critic
 * @param {number} [opts.maxCallsPerSession=50] - only the first N calls of each session are judged
 * @param {*} [opts.llmClient] - forwarded to the critic via judge()
 * @param {string} [opts.jsonOut] - when set, the report is also written to this path as JSON
 * @returns {Promise<object>} the report that was rendered
 */
export async function run(opts = {}) {
    const nSessions = opts.sessions ?? 10;
    const strictness = opts.strictness ?? 0.5;
    const perSessionCap = opts.maxCallsPerSession ?? 50;
    const events = readEvents();
    const { picked, available } = pickLastNSessions(events, nSessions);
    const cache = loadCache();
    // A Map rather than a plain object: tool names come straight from the log, and
    // a name such as "constructor" or "__proto__" looked up on a plain object would
    // resolve to an Object.prototype member instead of a fresh stats bucket.
    const toolStats = new Map();
    const falsePositives = [];
    const sessionStats = [];
    const prRows = [];
    let accepts = 0;
    let rejects = 0;
    let totalCalls = 0;
    for (const [sid, evs] of picked) {
        const enriched = enrich(evs).slice(0, perSessionCap);
        let sessionRetriesSaved = 0;
        for (const c of enriched) {
            // Sequential on purpose: judge() reads and writes the shared cache.
            const v = await judge(c, strictness, cache, opts.llmClient);
            totalCalls++;
            const toolKey = String(c.tool);
            let bucket = toolStats.get(toolKey);
            if (!bucket) {
                bucket = { total: 0, accepts: 0, rejects: 0 };
                toolStats.set(toolKey, bucket);
            }
            bucket.total++;
            if (v.accept) {
                accepts++;
                bucket.accepts++;
            }
            else {
                rejects++;
                bucket.rejects++;
            }
            prRows.push({ reject: !v.accept, bad: c.retriedLater, confidence: v.confidence });
            if (!v.accept && c.retriedLater)
                sessionRetriesSaved++;
            // Likely false positives: critic rejected, but the call was NOT retried (so downstream accepted it).
            if (!v.accept && !c.retriedLater) {
                falsePositives.push({ tool: c.tool, session: sid, retryGap: 0, reason: v.reason });
            }
            // Also: rejected AND the retry later succeeded — still a FP if the agent had listened and skipped, it would have worked anyway.
            // NOTE(review): as enrich() is written, retrySucceeded looks to be true for every
            // retried call, so each reject counted in retriesSaved is listed here too — confirm intended.
            if (!v.accept && c.retrySucceeded) {
                falsePositives.push({ tool: c.tool, session: sid, retryGap: RETRY_WINDOW_MS, reason: `retry later succeeded: ${v.reason ?? ''}` });
            }
        }
        sessionStats.push({ session: sid, calls: enriched.length, retriesSaved: sessionRetriesSaved, score: sessionRetriesSaved / Math.max(1, enriched.length) });
    }
    saveCache(cache);
    // Object.fromEntries defines own data properties, so even "__proto__" is safe here.
    const byTool = Object.fromEntries(toolStats);
    const topRejectRate = Object.entries(byTool)
        .filter(([, v]) => v.total >= 3)
        .map(([tool, v]) => ({ tool, total: v.total, rejectRate: v.rejects / v.total }))
        .sort((a, b) => b.rejectRate - a.rejectRate)
        .slice(0, 5);
    const sessionsRanked = sessionStats.sort((a, b) => b.score - a.score).slice(0, 10);
    const fpTop = falsePositives.slice(0, 5);
    const pr = prCurve(prRows);
    const report = {
        totalCalls, sessionsScanned: picked.size, sessionsAvailable: available,
        accepts, rejects, byTool, topRejectRate, likelyFalsePositives: fpTop,
        sessionsRanked, suggestedStrictness: pr.suggested, precision: pr.precision, recall: pr.recall,
    };
    renderReport(report);
    if (opts.jsonOut) {
        try {
            writeFileSync(opts.jsonOut, JSON.stringify(report, null, 2));
            console.log(`\nJSON written → ${opts.jsonOut}`);
        }
        catch (e) {
            // Export is optional; report the failure but still return the report.
            console.error(`JSON export failed: ${e.message}`);
        }
    }
    return report;
}
|
|
240
|
+
/**
 * Print the retrospective report to stdout, one console.log call per line.
 *
 * Sections, in order: totals, tools with the highest reject rate, likely critic
 * false positives, up to five sessions by retries-saved score, and the
 * precision/recall summary with the suggested strictness.
 *
 * @param {object} r - report object as assembled by run()
 */
function renderReport(r) {
    const emit = (text = '') => console.log(text);
    const asPercent = (fraction) => (fraction * 100).toFixed(1);

    emit('\n=== Critic Retrospective ===');
    emit(`sessions scanned: ${r.sessionsScanned} / ${r.sessionsAvailable} available`);
    emit(`tool calls judged: ${r.totalCalls}`);
    // Guard the division so an empty run prints 0.0% instead of NaN.
    const acceptRate = r.totalCalls ? (r.accepts / r.totalCalls) : 0;
    emit(`accept/reject: ${r.accepts} / ${r.rejects} (accept-rate ${asPercent(acceptRate)}%)`);

    emit('\n-- top 5 reject rate (candidates for args validation) --');
    if (!r.topRejectRate.length)
        emit(' (no tool has >=3 calls)');
    for (const entry of r.topRejectRate) {
        emit(` ${entry.tool.padEnd(28)} ${asPercent(entry.rejectRate)}% rejected (${entry.total} calls)`);
    }

    emit('\n-- likely critic false positives (rejected but agent did not retry OR retry worked) --');
    if (!r.likelyFalsePositives.length)
        emit(' (none)');
    for (const suspect of r.likelyFalsePositives) {
        emit(` ${suspect.tool.padEnd(28)} session=${suspect.session.slice(0, 8)} ${suspect.reason ?? ''}`);
    }

    emit('\n-- sessions ranked by retries-saved score --');
    const topSessions = r.sessionsRanked.slice(0, 5);
    for (const stat of topSessions) {
        emit(` ${stat.session.slice(0, 8)} calls=${stat.calls} saved=${stat.retriesSaved} score=${stat.score.toFixed(3)}`);
    }

    emit('\n-- precision / recall tradeoff --');
    emit(` precision = ${r.precision.toFixed(3)} recall = ${r.recall.toFixed(3)}`);
    emit(` suggested critic_strictness = ${r.suggestedStrictness.toFixed(2)}`);
    emit('');
}
|
|
265
|
+
// TODO(cli-wiring): register `kbot critic retrospect` subcommand in cli.ts once
|
|
266
|
+
// the parallel skills-subcommand edit lands. For now, invoke via:
|
|
267
|
+
// node -e "import('./dist/critic-retrospect.js').then(m => m.run({ sessions: 20 }))"
|
|
268
|
+
// Direct-execution entrypoint for `node dist/critic-retrospect.js`.
|
|
269
|
+
const argv1 = process.argv[1] || '';
if (argv1.endsWith('critic-retrospect.js') || argv1.endsWith('critic-retrospect.ts')) {
    // Value of the first `--<name>=<value>` argument, or undefined when absent.
    // Everything after the FIRST '=' is kept, so values (e.g. file paths) may
    // themselves contain '='.
    const flagValue = (name) => {
        const prefix = `--${name}=`;
        const arg = process.argv.find(a => a.startsWith(prefix));
        return arg === undefined ? undefined : arg.slice(prefix.length);
    };
    // Numeric flag: a missing flag yields the fallback; an empty or non-numeric
    // value is reported on stderr and also yields the fallback, so NaN never
    // reaches run().
    const numberFlag = (name, fallback) => {
        const raw = flagValue(name);
        if (raw === undefined)
            return fallback;
        const value = raw.trim() === '' ? Number.NaN : Number(raw);
        if (Number.isFinite(value))
            return value;
        console.error(`ignoring invalid --${name}=${raw}; using default`);
        return fallback;
    };
    const sessions = numberFlag('sessions', 10);
    const strictness = numberFlag('strictness', undefined);
    const jsonOut = flagValue('json');
    run({ sessions, strictness, jsonOut }).catch(e => { console.error(e); process.exit(1); });
}
|
|
279
|
+
//# sourceMappingURL=critic-retrospect.js.map
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Reasoning-failure taxonomy for the critic gate.
|
|
3
|
+
*
|
|
4
|
+
* Adopted verbatim from "Stalled, Biased, and Confused: Uncovering Reasoning
|
|
5
|
+
* Failures in LLMs for Cloud-Based Root Cause Analysis" (arXiv:2601.22208,
|
|
6
|
+
* 2026). 16 modes, evaluated across 48k simulated scenarios on ReAct and
|
|
7
|
+
* Plan-and-Execute workflows. The domain is cloud RCA but the modes generalize
|
|
8
|
+
* to tool-using agents.
|
|
9
|
+
*
|
|
10
|
+
* Purpose here: replace ad-hoc ERROR_KEYWORDS matching in critic-gate.ts with
|
|
11
|
+
* a typed classifier so that when the critic rejects a tool result we can
|
|
12
|
+
* attribute the rejection to a named class. Makes FP-rate measurement tractable
|
|
13
|
+
* per-class instead of in aggregate.
|
|
14
|
+
*
|
|
15
|
+
* This module is pure (no I/O, no LLM calls). Classification is rule-based and
|
|
16
|
+
* intentionally conservative — when nothing matches, returns null and the
|
|
17
|
+
* existing critic fallback handles it.
|
|
18
|
+
*/
|
|
19
|
+
/** Identifier of one of the 16 reasoning-failure modes (RF-01 … RF-16) of the taxonomy. */
export type RFClass =
    | 'RF-01-fabricated-evidence'
    | 'RF-02-metric-interpretation'
    | 'RF-03-confused-provenance'
    | 'RF-04-temporal-misordering'
    | 'RF-05-spurious-causal-attribution'
    | 'RF-06-unjustified-instance-specificity'
    | 'RF-07-arbitrary-evidence-selection'
    | 'RF-08-evidential-insufficiency'
    | 'RF-09-failure-to-update-belief'
    | 'RF-10-simulation-role-confusion'
    | 'RF-11-excessive-speculation'
    | 'RF-12-repetition-failure-to-resume'
    | 'RF-13-anchoring-bias'
    | 'RF-14-invalid-inference-pattern'
    | 'RF-15-internal-contradiction'
    | 'RF-16-arithmetic-error';
|
|
20
|
+
/** One taxonomy match, as returned by detectRepetition and classifyToolResult. */
export interface RFClassification {
    /** The reasoning-failure mode that matched. */
    class: RFClass;
    /** Short human-readable description of what triggered the match. */
    evidence: string;
    /** Strength of the match; classifyToolResult returns the match with the highest value. */
    confidence: number;
}
|
|
25
|
+
/** One executed tool call in an agent trajectory, as consumed by detectRepetition. */
export interface TrajectoryStep {
    /** Name of the tool that was called. */
    tool: string;
    /** Arguments the tool was called with. */
    args: Record<string, unknown>;
    /** Text the tool returned. */
    result: string;
    /** Time of the call in milliseconds (epoch presumed — TODO confirm with the producer). */
    timestampMs: number;
}
|
|
31
|
+
/**
 * RF-12 (trajectory-level): reports a match when the last `windowSize` steps all
 * use the same tool with the same arguments; otherwise returns null.
 */
export declare function detectRepetition(trajectory: TrajectoryStep[], windowSize?: number): RFClassification | null;
|
|
33
|
+
/**
 * Run a single tool result through the RF taxonomy detectors.
 *
 * Yields the match with the highest confidence, or null when no detector
 * fires. A null means "no taxonomy signal" and must not be read as "the result
 * is fine".
 */
export declare function classifyToolResult(result: string): RFClassification | null;
|
|
40
|
+
//# sourceMappingURL=critic-taxonomy.d.ts.map
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Reasoning-failure taxonomy for the critic gate.
|
|
3
|
+
*
|
|
4
|
+
* Adopted verbatim from "Stalled, Biased, and Confused: Uncovering Reasoning
|
|
5
|
+
* Failures in LLMs for Cloud-Based Root Cause Analysis" (arXiv:2601.22208,
|
|
6
|
+
* 2026). 16 modes, evaluated across 48k simulated scenarios on ReAct and
|
|
7
|
+
* Plan-and-Execute workflows. The domain is cloud RCA but the modes generalize
|
|
8
|
+
* to tool-using agents.
|
|
9
|
+
*
|
|
10
|
+
* Purpose here: replace ad-hoc ERROR_KEYWORDS matching in critic-gate.ts with
|
|
11
|
+
* a typed classifier so that when the critic rejects a tool result we can
|
|
12
|
+
* attribute the rejection to a named class. Makes FP-rate measurement tractable
|
|
13
|
+
* per-class instead of in aggregate.
|
|
14
|
+
*
|
|
15
|
+
* This module is pure (no I/O, no LLM calls). Classification is rule-based and
|
|
16
|
+
* intentionally conservative — when nothing matches, returns null and the
|
|
17
|
+
* existing critic fallback handles it.
|
|
18
|
+
*/
|
|
19
|
+
// Marker lists are lower-case because detectors lower-case the text before matching.

/** Hedging phrases; two or more distinct hits flag RF-11 (excessive speculation). */
const SPECULATION_MARKERS = [
    "i think",
    "probably",
    "might be",
    "could be",
    "perhaps",
    "it seems",
    "i believe",
    "likely",
    "i'm guessing",
    "my guess",
];
/** Phrases whose presence flags RF-01 (fabricated evidence). */
const FABRICATION_MARKERS = [
    "as an ai",
    "i cannot actually",
    "i don't have access",
    "hypothetically",
    "let's assume",
    "for the sake of",
    "imagine that",
];
/** Error fragments that, combined with a success claim, flag RF-10. */
const UNRESOLVED_ERROR_MARKERS = [
    "enoent",
    "permission denied",
    "eacces",
    "connection refused",
    "timeout",
    "econnrefused",
    "not found",
];
|
|
31
|
+
/**
 * Lower-case a string, treating any falsy input (null, undefined, '') as ''.
 * @param {string} [s]
 * @returns {string}
 */
function lower(s) {
    const text = s || '';
    return text.toLowerCase();
}
|
|
34
|
+
/**
 * RF-01: flags a tool result containing any FABRICATION_MARKERS phrase
 * (case-insensitive substring match); the first listed marker found is reported.
 *
 * @param {string} text - tool result
 * @returns {{class: string, evidence: string, confidence: number} | null}
 */
function detectFabrication(text) {
    const haystack = lower(text);
    const marker = FABRICATION_MARKERS.find((candidate) => haystack.includes(candidate));
    if (marker === undefined)
        return null;
    return {
        class: 'RF-01-fabricated-evidence',
        evidence: `hedging marker "${marker}" in tool result`,
        confidence: 0.7,
    };
}
|
|
48
|
+
/**
 * RF-08: flags results that carry too little content.
 *
 * - empty after trimming (or falsy input): confidence 0.95
 * - fewer than 16 characters after trimming and no digit: confidence 0.55
 *
 * @param {string} text - tool result
 * @returns {{class: string, evidence: string, confidence: number} | null}
 */
function detectEvidentialInsufficiency(text) {
    const body = (text || '').trim();
    const size = body.length;
    const flag = (evidence, confidence) => ({
        class: 'RF-08-evidential-insufficiency',
        evidence,
        confidence,
    });
    if (size === 0)
        return flag('empty tool result', 0.95);
    const hasDigit = /\d/.test(body);
    if (size < 16 && !hasDigit)
        return flag(`result is ${size} chars with no numeric content`, 0.55);
    return null;
}
|
|
67
|
+
/**
 * RF-11: flags a tool result containing at least two distinct
 * SPECULATION_MARKERS phrases (case-insensitive substring match). Up to three
 * of the matched markers are quoted in the evidence.
 *
 * @param {string} text - tool result
 * @returns {{class: string, evidence: string, confidence: number} | null}
 */
function detectExcessiveSpeculation(text) {
    const haystack = lower(text);
    const found = [];
    for (const marker of SPECULATION_MARKERS) {
        if (haystack.includes(marker))
            found.push(marker);
    }
    if (found.length < 2)
        return null;
    const quoted = found.slice(0, 3).join(', ');
    return {
        class: 'RF-11-excessive-speculation',
        evidence: `speculation markers: ${quoted}`,
        confidence: 0.6,
    };
}
|
|
80
|
+
/**
 * RF-10: flags a result that both claims success (contains the word
 * "successfully", "completed", "done" or "finished") and contains one of the
 * UNRESOLVED_ERROR_MARKERS fragments. Matching is case-insensitive.
 *
 * @param {string} text - tool result
 * @returns {{class: string, evidence: string, confidence: number} | null}
 */
function detectSimulationConfusion(text) {
    const haystack = lower(text);
    const mentionsError = UNRESOLVED_ERROR_MARKERS.some((marker) => haystack.includes(marker));
    const successClaim = /\b(successfully|completed|done|finished)\b/;
    const contradictory = mentionsError && successClaim.test(haystack);
    if (!contradictory)
        return null;
    return {
        class: 'RF-10-simulation-role-confusion',
        evidence: 'result claims success and contains an error marker',
        confidence: 0.85,
    };
}
|
|
94
|
+
/**
 * RF-15: low-confidence heuristic for opposing claims inside one result.
 *
 * Fires when the lower-cased text contains "is not" followed, later on the same
 * line, by another standalone "is" (and some "is/was/are <word>" phrase exists).
 *
 * @param {string} text - tool result
 * @returns {{class: string, evidence: string, confidence: number} | null}
 */
function detectInternalContradiction(text) {
    const haystack = lower(text);
    const assertion = /\b(is|was|are)\s+\w+/;
    const negationThenAssertion = /\bis\s+not\b.*\bis\b/;
    const fires = assertion.test(haystack) && negationThenAssertion.test(haystack);
    if (!fires)
        return null;
    return {
        class: 'RF-15-internal-contradiction',
        evidence: 'opposing "is"/"is not" claims within result',
        confidence: 0.5,
    };
}
|
|
106
|
+
/**
 * RF-12 (trajectory-level): flags a trajectory whose last `windowSize` steps
 * all call the same tool with the same JSON-serialized arguments.
 *
 * Argument comparison is by JSON.stringify output, so objects with the same
 * keys in a different order count as different calls.
 *
 * @param {{tool: string, args: object}[]} trajectory - executed steps, oldest first
 * @param {number} [windowSize=3] - number of trailing steps that must be identical;
 *   must be an integer of at least 2, since fewer calls cannot form a repetition
 * @returns {{class: string, evidence: string, confidence: number} | null}
 *   the RF-12 classification, or null when the window is invalid, the trajectory
 *   is shorter than the window, or the trailing steps differ
 */
export function detectRepetition(trajectory, windowSize = 3) {
    // A window of 0 made slice(-0) return the whole array (and crash on an empty
    // one); a window of 1 reported every single call as "repetition".
    if (!Number.isInteger(windowSize) || windowSize < 2)
        return null;
    if (!Array.isArray(trajectory) || trajectory.length < windowSize)
        return null;
    const window = trajectory.slice(-windowSize);
    const signature = (step) => `${step.tool}:${JSON.stringify(step.args)}`;
    const first = window[0];
    const key = signature(first);
    if (!window.every((step) => signature(step) === key))
        return null;
    return {
        class: 'RF-12-repetition-failure-to-resume',
        evidence: `${windowSize} consecutive identical calls to ${first.tool}`,
        confidence: 0.9,
    };
}
|
|
123
|
+
/**
 * Run one tool result through every single-result detector of the RF taxonomy.
 *
 * All detectors are evaluated, in the order listed below. The match with the
 * strictly highest confidence is returned; on a tie the earlier detector wins.
 * A null return means "no taxonomy signal", not "the result is fine".
 *
 * @param {string} result - tool result text
 * @returns {{class: string, evidence: string, confidence: number} | null}
 */
export function classifyToolResult(result) {
    const detectors = [
        detectSimulationConfusion,
        detectFabrication,
        detectEvidentialInsufficiency,
        detectExcessiveSpeculation,
        detectInternalContradiction,
    ];
    return detectors
        .map((detect) => detect(result))
        .reduce((best, hit) => {
            const improves = hit && (!best || hit.confidence > best.confidence);
            return improves ? hit : best;
        }, null);
}
|
|
146
|
+
//# sourceMappingURL=critic-taxonomy.js.map
|
package/dist/growth.d.ts
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
 * Headline figures of a growth report (the `summary` member of GrowthResult).
 * NOTE(review): units and exact definitions of the fields are not visible in
 * this declaration file — confirm against growth.js before relying on them.
 */
interface GrowthSummary {
    betterPct: number;
    days: number;
    sessions: number;
    toolCalls: number;
    successRate: number;
    routingAccuracy: number;
    newPatterns: number;
}
|
|
10
|
+
/**
 * Full result of runGrowth: the summary plus three per-item breakdowns.
 * `metrics` and `deltas` rows pair a `current` and a `prior` value with their
 * `delta`; `agents` rows carry an accuracy and a sample count per agent.
 */
interface GrowthResult {
    summary: GrowthSummary;
    /** Per-metric comparison rows, identified by `label`. */
    metrics: {
        label: string;
        current: number;
        prior: number;
        delta: number;
    }[];
    /** Per-tool comparison rows, identified by `tool`. */
    deltas: {
        tool: string;
        current: number;
        prior: number;
        delta: number;
    }[];
    /** Per-agent rows, identified by `agent`. */
    agents: {
        agent: string;
        accuracy: number;
        samples: number;
    }[];
}
|
|
30
|
+
/**
 * Compute a growth report.
 *
 * All options are optional. The function may return null; the conditions for
 * that are not visible in this declaration file — see growth.js.
 */
export declare function runGrowth(opts?: {
    json?: boolean;
    days?: number;
    dataDir?: string;
    now?: number;
}): GrowthResult | null;
|
|
36
|
+
export {};
|
|
37
|
+
//# sourceMappingURL=growth.d.ts.map
|