@kernel.chat/kbot 2.7.0 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-protocol.d.ts +97 -0
- package/dist/agent-protocol.d.ts.map +1 -0
- package/dist/agent-protocol.js +618 -0
- package/dist/agent-protocol.js.map +1 -0
- package/dist/agent.d.ts.map +1 -1
- package/dist/agent.js +25 -1
- package/dist/agent.js.map +1 -1
- package/dist/architect.d.ts +44 -0
- package/dist/architect.d.ts.map +1 -0
- package/dist/architect.js +403 -0
- package/dist/architect.js.map +1 -0
- package/dist/cli.js +210 -2
- package/dist/cli.js.map +1 -1
- package/dist/confidence.d.ts +102 -0
- package/dist/confidence.d.ts.map +1 -0
- package/dist/confidence.js +693 -0
- package/dist/confidence.js.map +1 -0
- package/dist/graph-memory.d.ts +98 -0
- package/dist/graph-memory.d.ts.map +1 -0
- package/dist/graph-memory.js +926 -0
- package/dist/graph-memory.js.map +1 -0
- package/dist/ide/acp-server.js +2 -2
- package/dist/ide/acp-server.js.map +1 -1
- package/dist/intentionality.d.ts +139 -0
- package/dist/intentionality.d.ts.map +1 -0
- package/dist/intentionality.js +1092 -0
- package/dist/intentionality.js.map +1 -0
- package/dist/lsp-client.d.ts +167 -0
- package/dist/lsp-client.d.ts.map +1 -0
- package/dist/lsp-client.js +679 -0
- package/dist/lsp-client.js.map +1 -0
- package/dist/mcp-plugins.d.ts +62 -0
- package/dist/mcp-plugins.d.ts.map +1 -0
- package/dist/mcp-plugins.js +551 -0
- package/dist/mcp-plugins.js.map +1 -0
- package/dist/reasoning.d.ts +100 -0
- package/dist/reasoning.d.ts.map +1 -0
- package/dist/reasoning.js +1292 -0
- package/dist/reasoning.js.map +1 -0
- package/dist/temporal.d.ts +133 -0
- package/dist/temporal.d.ts.map +1 -0
- package/dist/temporal.js +778 -0
- package/dist/temporal.js.map +1 -0
- package/dist/tools/e2b-sandbox.d.ts +2 -0
- package/dist/tools/e2b-sandbox.d.ts.map +1 -0
- package/dist/tools/e2b-sandbox.js +460 -0
- package/dist/tools/e2b-sandbox.js.map +1 -0
- package/dist/tools/index.d.ts.map +1 -1
- package/dist/tools/index.js +19 -1
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/lsp-tools.d.ts +2 -0
- package/dist/tools/lsp-tools.d.ts.map +1 -0
- package/dist/tools/lsp-tools.js +268 -0
- package/dist/tools/lsp-tools.js.map +1 -0
- package/dist/ui.js +1 -1
- package/dist/ui.js.map +1 -1
- package/package.json +2 -2
|
@@ -0,0 +1,693 @@
|
|
|
1
|
+
// K:BOT Confidence Engine — Self-Awareness for Agent Actions
|
|
2
|
+
//
|
|
3
|
+
// Three systems:
|
|
4
|
+
// 1. CONFIDENCE CALIBRATION — Express uncertainty about responses/actions
|
|
5
|
+
// 2. SKILL BOUNDARIES — Self-model of strengths, weaknesses, unknowns
|
|
6
|
+
// 3. EFFORT ESTIMATION — Predict tool calls, cost, and complexity
|
|
7
|
+
//
|
|
8
|
+
// Persists calibration and effort history to ~/.kbot/ as JSON files.
|
|
9
|
+
// Integrates with the learning engine for pattern/task data.
|
|
10
|
+
import { homedir } from 'node:os';
|
|
11
|
+
import { join } from 'node:path';
|
|
12
|
+
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'node:fs';
|
|
13
|
+
import { registerTool } from './tools/index.js';
|
|
14
|
+
import { classifyTask, findPattern } from './learning.js';
|
|
15
|
+
import { isLocalProvider, getByokProvider, estimateCost } from './auth.js';
|
|
16
|
+
// ── File Paths ──
|
|
17
|
+
const KBOT_DIR = join(homedir(), '.kbot');
|
|
18
|
+
const CONFIDENCE_FILE = join(KBOT_DIR, 'confidence.json');
|
|
19
|
+
const EFFORT_FILE = join(KBOT_DIR, 'effort-history.json');
|
|
20
|
+
const SKILL_FILE = join(KBOT_DIR, 'skill-profile.json');
|
|
21
|
+
// ── Helpers ──
|
|
22
|
+
/** Create the K:BOT data directory (and any missing parents) unless it already exists. */
function ensureDir() {
    const alreadyPresent = existsSync(KBOT_DIR);
    if (alreadyPresent) {
        return;
    }
    mkdirSync(KBOT_DIR, { recursive: true });
}
|
|
26
|
+
/**
 * Read and parse a JSON file, returning `fallback` whenever the data is unusable.
 *
 * The fallback is returned when the data directory cannot be prepared, the file
 * is missing, unreadable, or not valid JSON, or when the parsed value is not a
 * plain object while `fallback` is one. When both are plain objects the parsed
 * value is layered over the fallback so keys absent from an older or partially
 * written file still receive their defaults.
 *
 * @param path - Absolute path of the JSON file.
 * @param fallback - Value to use when the file cannot provide usable data.
 * @returns The parsed (and default-filled) data, or `fallback`.
 */
function loadJSON(path, fallback) {
    let parsed;
    try {
        // Directory creation can fail (e.g. read-only home); a read must not crash for that.
        ensureDir();
        if (!existsSync(path))
            return fallback;
        parsed = JSON.parse(readFileSync(path, 'utf-8'));
    }
    catch {
        return fallback;
    }
    const isRecord = (value) => value !== null && typeof value === 'object' && !Array.isArray(value);
    // Only enforce the shape when the caller supplied an object-shaped default.
    if (!isRecord(fallback))
        return parsed;
    // `null`, arrays and primitives are valid JSON but would break callers that read properties.
    return isRecord(parsed) ? { ...fallback, ...parsed } : fallback;
}
|
|
37
|
+
/**
 * Serialize `data` as pretty-printed JSON and write it to `path`.
 *
 * Persistence is best-effort: any failure — including failure to create the
 * data directory — is swallowed so the caller's main work is never interrupted.
 *
 * @param path - Absolute path of the JSON file to write.
 * @param data - JSON-serializable value to persist.
 */
function saveJSON(path, data) {
    try {
        // Kept inside the try so a mkdir error is treated like any other write failure.
        ensureDir();
        writeFileSync(path, JSON.stringify(data, null, 2), 'utf-8');
    }
    catch { /* silently fail — non-critical persistence */ }
}
|
|
44
|
+
/** Load the calibration store, defaulting to an empty entry list with zero average error. */
function loadCalibration() {
    const emptyStore = { entries: [], avgError: 0 };
    return loadJSON(CONFIDENCE_FILE, emptyStore);
}
|
|
47
|
+
/**
 * Persist the calibration store, trimming it in place to the newest 200 entries
 * so the file cannot grow without bound.
 */
function saveCalibration(data) {
    const maxEntries = 200;
    const surplus = data.entries.length - maxEntries;
    if (surplus > 0) {
        // Drop the oldest records; the tail of the array holds the most recent ones.
        data.entries = data.entries.slice(surplus);
    }
    saveJSON(CONFIDENCE_FILE, data);
}
|
|
54
|
+
/**
 * Mean signed prediction error (predicted − actual) for one domain.
 *
 * Positive means past predictions were overconfident, negative underconfident.
 * Returns 0 when fewer than three samples exist for the domain.
 */
function getCalibrationBias(domain) {
    const { entries } = loadCalibration();
    let signedErrorSum = 0;
    let samples = 0;
    for (const entry of entries) {
        if (entry.domain !== domain) {
            continue;
        }
        signedErrorSum += entry.predicted - entry.actual;
        samples += 1;
    }
    return samples < 3 ? 0 : signedErrorSum / samples;
}
|
|
64
|
+
/**
 * Average recorded `actual` score for a domain, with the number of samples behind it.
 * With no samples the rate defaults to a neutral 0.5.
 */
function getHistoricalSuccessRate(domain) {
    const matching = loadCalibration().entries.filter((entry) => entry.domain === domain);
    const count = matching.length;
    if (count === 0) {
        return { rate: 0.5, count: 0 };
    }
    let actualSum = 0;
    for (const { actual } of matching) {
        actualSum += actual;
    }
    return { rate: actualSum / count, count };
}
|
|
73
|
+
/**
 * Detect the primary domain of a task from its text.
 *
 * A recognised file extension wins; otherwise the first domain (in table order)
 * with a keyword contained in the lower-cased task is returned. Falls back to
 * 'general'.
 *
 * @param task - Free-form task description.
 * @returns Domain label such as 'typescript', 'devops' or 'general'.
 */
function detectDomain(task) {
    const lower = task.toLowerCase();
    // Check file extensions. The trailing \b forces a whole-extension match; without it the
    // first alternative that is a prefix wins (".json" → "js", ".css"/".cs" → "c").
    const extMatch = lower.match(/\.(ts|tsx|js|jsx|py|rs|go|java|rb|cpp|c|cs|swift|kt|lua|zig|sql|css|html|md|yaml|yml|json|toml|sh|bash|zsh)\b/);
    if (extMatch) {
        const extMap = {
            ts: 'typescript', tsx: 'typescript', js: 'javascript', jsx: 'javascript',
            py: 'python', rs: 'rust', go: 'go', java: 'java', rb: 'ruby',
            cpp: 'cpp', c: 'c', cs: 'csharp', swift: 'swift', kt: 'kotlin',
            lua: 'lua', zig: 'zig', sql: 'sql', css: 'css', html: 'html',
            md: 'writing', yaml: 'devops', yml: 'devops', json: 'config',
            toml: 'config', sh: 'devops', bash: 'devops', zsh: 'devops',
        };
        return extMap[extMatch[1]] || 'general';
    }
    // Check language/domain keywords (plain substring match; table order is the priority order)
    const domainKeywords = {
        typescript: ['typescript', 'ts ', 'tsx', 'type ', 'interface ', 'enum '],
        javascript: ['javascript', 'js ', 'jsx', 'node', 'npm', 'yarn', 'pnpm', 'bun'],
        python: ['python', 'pip', 'conda', 'django', 'flask', 'fastapi', 'pytorch'],
        rust: ['rust', 'cargo', 'crate', 'impl ', 'fn ', 'struct '],
        go: ['golang', 'go ', 'goroutine', 'go mod'],
        devops: ['docker', 'kubernetes', 'k8s', 'terraform', 'ansible', 'ci/cd', 'pipeline', 'deploy', 'nginx', 'aws', 'gcp', 'azure'],
        database: ['sql', 'postgres', 'mysql', 'mongo', 'redis', 'supabase', 'prisma', 'database', 'migration'],
        writing: ['write', 'blog', 'article', 'documentation', 'readme', 'explain', 'describe'],
        testing: ['test', 'spec', 'coverage', 'vitest', 'jest', 'pytest', 'assert'],
        security: ['security', 'vulnerability', 'auth', 'permission', 'encrypt', 'cve'],
        design: ['css', 'style', 'layout', 'responsive', 'animation', 'color', 'font'],
        react: ['react', 'component', 'hook', 'jsx', 'tsx', 'next.js', 'nextjs', 'remix'],
        git: ['git', 'commit', 'branch', 'merge', 'rebase', 'pr ', 'pull request'],
    };
    for (const [domain, keywords] of Object.entries(domainKeywords)) {
        if (keywords.some(kw => lower.includes(kw)))
            return domain;
    }
    return 'general';
}
|
|
111
|
+
/**
 * Count complexity signals in a task description.
 *
 * @param task - Free-form task description.
 * @returns `fileCount` (at least 1: file-like plus directory-like references),
 *          `ambiguity` (fraction of the vague-term list present, 0..1) and
 *          `multiStep` (whether a sequencing/quantity word appears).
 */
function assessComplexity(task) {
    const lower = task.toLowerCase();
    // Estimate number of files involved
    const fileRefs = (task.match(/\b\w+\.\w{1,5}\b/g) || []).length;
    const dirRefs = (task.match(/\b\w+\//g) || []).length;
    const fileCount = Math.max(1, fileRefs + dirRefs);
    // Ambiguity — vague words that signal unclear intent
    const vagueTerms = ['maybe', 'somehow', 'might', 'possibly', 'not sure', 'something like',
        'kind of', 'sort of', 'whatever', 'figure out', 'if possible'];
    const ambiguity = vagueTerms.filter(t => lower.includes(t)).length / vagueTerms.length;
    // Multi-step — signals that the task has multiple phases. Matched on word boundaries so
    // that e.g. "install"/"call" (contain "all") or "authentication" (contains "then") do not
    // count; common inflections ("steps", "firstly", "multiple", "multi-step") still do.
    const multiStepPattern = /\b(?:then|after that|also|next|finally|first(?:ly)?|second(?:ly)?|steps?|multi\w*|several|all|each)\b/;
    const multiStep = multiStepPattern.test(lower);
    return { fileCount, ambiguity, multiStep };
}
|
|
128
|
+
/**
 * Estimate confidence for a task before execution.
 *
 * Builds three sub-scores — factual, approach, completeness — from the amount of
 * context, cached patterns, per-domain history, task complexity and whether a
 * local provider is in use, then corrects them by the domain's calibration bias
 * and clamps each to [0.05, 0.99]. `overall` is the 0.3/0.4/0.3 weighted mean.
 *
 * @param task - Description of the task to assess.
 * @param context - Available context text; a missing value is treated as ''.
 * @returns `{ overall, factual, approach, completeness, reasoning }`, scores
 *          rounded to two decimals and `reasoning` a one-line explanation.
 */
export function estimateConfidence(task, context = '') {
    // Tolerate null / non-string context instead of throwing on `.length`.
    const ctx = typeof context === 'string' ? context : String(context ?? '');
    const domain = detectDomain(task);
    const taskType = classifyTask(task);
    const complexity = assessComplexity(task);
    const cachedPattern = findPattern(task);
    const historical = getHistoricalSuccessRate(domain);
    const calibrationBias = getCalibrationBias(domain);
    // Resolve the provider once; it drives both the score penalty and the reasoning text.
    let usingLocalModel = false;
    try {
        usingLocalModel = Boolean(isLocalProvider(getByokProvider()));
    }
    catch { /* no provider configured — skip adjustment */ }
    const clamp = (value) => Math.max(0.05, Math.min(0.99, value));
    const round2 = (value) => Math.round(value * 100) / 100;
    // ── Base scores ──
    // Factual confidence — higher if we have context, patterns, or history
    let factual = 0.5;
    if (ctx.length > 200)
        factual += 0.15; // good context available
    if (ctx.length > 1000)
        factual += 0.1; // rich context
    if (cachedPattern)
        factual += 0.15; // we've done this before
    if (historical.count > 5) {
        factual += (historical.rate - 0.5) * 0.2; // historical performance adjustment
    }
    // Approach confidence — higher for known task types, lower for ambiguity
    let approach = 0.6;
    if (taskType !== 'general')
        approach += 0.1; // recognized task type
    if (cachedPattern && cachedPattern.successRate > 0.8)
        approach += 0.2; // proven pattern
    approach -= complexity.ambiguity * 0.3; // ambiguity reduces confidence
    if (complexity.multiStep)
        approach -= 0.1; // multi-step = more room for error
    // Completeness — how sure we cover everything
    let completeness = 0.6;
    if (complexity.fileCount > 5)
        completeness -= 0.15; // many files = easy to miss things
    if (complexity.multiStep)
        completeness -= 0.1; // multi-step = might skip a step
    if (ctx.includes('repo-map') || ctx.includes('project structure'))
        completeness += 0.1;
    if (cachedPattern)
        completeness += 0.1;
    // Provider adjustment — local models get a penalty on complex tasks
    if (usingLocalModel) {
        const localPenalty = complexity.multiStep ? 0.2 : 0.1;
        factual -= localPenalty;
        approach -= localPenalty;
        completeness -= localPenalty * 0.5;
    }
    // Apply calibration correction — if we're historically overconfident, reduce scores
    factual -= calibrationBias * 0.5;
    approach -= calibrationBias * 0.5;
    completeness -= calibrationBias * 0.3;
    // Clamp all scores to [0.05, 0.99]
    factual = clamp(factual);
    approach = clamp(approach);
    completeness = clamp(completeness);
    // Overall — weighted average
    const overall = round2(factual * 0.3 + approach * 0.4 + completeness * 0.3);
    // Build reasoning
    const reasons = [];
    if (cachedPattern)
        reasons.push('have a proven pattern for this');
    if (historical.count > 5 && historical.rate > 0.7)
        reasons.push(`strong track record in ${domain}`);
    if (historical.count > 5 && historical.rate < 0.4)
        reasons.push(`historically weak in ${domain}`);
    if (complexity.ambiguity > 0.2)
        reasons.push('request is somewhat ambiguous');
    if (complexity.multiStep)
        reasons.push('multi-step task');
    if (ctx.length < 100)
        reasons.push('limited context available');
    if (usingLocalModel)
        reasons.push('using local model (lower capability)');
    const pct = Math.round(overall * 100);
    const reasoning = reasons.length > 0
        ? `~${pct}% confident — ${reasons.join(', ')}`
        : `~${pct}% confident — standard assessment for ${domain} ${taskType} task`;
    return {
        overall,
        factual: round2(factual),
        approach: round2(approach),
        completeness: round2(completeness),
        reasoning,
    };
}
|
|
224
|
+
/**
 * Render a confidence score as a multi-line, human-readable report.
 *
 * The headline carries the overall percentage and a level label
 * (high ≥ 80, moderate ≥ 50, otherwise low), followed by one line per
 * sub-score and the reasoning text.
 */
export function reportConfidence(score) {
    const toPercent = (fraction) => Math.round(fraction * 100);
    const pct = toPercent(score.overall);
    let level = 'low';
    if (pct >= 80) {
        level = 'high';
    }
    else if (pct >= 50) {
        level = 'moderate';
    }
    const headline = `Confidence: ${pct}% (${level})`;
    const factualLine = ` Factual: ${toPercent(score.factual)}%`;
    const approachLine = ` Approach: ${toPercent(score.approach)}%`;
    const completenessLine = ` Completeness: ${toPercent(score.completeness)}%`;
    const reasoningLine = ` ${score.reasoning}`;
    return [headline, factualLine, approachLine, completenessLine, reasoningLine].join('\n');
}
|
|
239
|
+
/**
 * Append a predicted-vs-actual calibration sample for a task and persist it.
 *
 * The task text is truncated to 200 characters for storage, the domain is
 * auto-detected, and the store's mean absolute error is recomputed over every
 * entry (including the new one) before saving.
 */
export function recordCalibration(task, predicted, actual) {
    const domain = detectDomain(task);
    const store = loadCalibration();
    const sample = {
        task: task.slice(0, 200), // truncate for storage
        predicted,
        actual,
        domain,
        timestamp: new Date().toISOString(),
    };
    store.entries.push(sample);
    // The list is never empty here, so the mean is always well defined.
    let absoluteErrorSum = 0;
    for (const entry of store.entries) {
        absoluteErrorSum += Math.abs(entry.predicted - entry.actual);
    }
    store.avgError = absoluteErrorSum / store.entries.length;
    saveCalibration(store);
}
|
|
259
|
+
/** Load the persisted per-domain skill counters, defaulting to an empty `skills` map. */
function loadSkillData() {
    const emptyProfile = { skills: {} };
    return loadJSON(SKILL_FILE, emptyProfile);
}
|
|
262
|
+
/** Persist the per-domain skill counters to the skill profile file. */
function saveSkillData(data) {
    const destination = SKILL_FILE;
    saveJSON(destination, data);
}
|
|
265
|
+
/**
 * Build a skill profile from stored skill data and calibration history.
 *
 * For each domain, stored skill counters take precedence; domains that only
 * appear in calibration history are summarised from those entries instead.
 *
 * @returns `strengths` (success rate ≥ 0.7 with ≥ 3 samples, best first),
 *          `weaknesses` (success rate < 0.5 with ≥ 3 samples, worst first) and
 *          `unknown` (known domain names with no data or fewer than 3 samples).
 */
export function getSkillProfile() {
    const skillData = loadSkillData();
    const calibration = loadCalibration();
    // Group calibration entries by domain in one pass instead of re-filtering per domain.
    const calibrationByDomain = new Map();
    for (const entry of calibration.entries) {
        const bucket = calibrationByDomain.get(entry.domain);
        if (bucket)
            bucket.push(entry);
        else
            calibrationByDomain.set(entry.domain, [entry]);
    }
    // Stored domains first, then calibration-only domains in order of first appearance.
    const allDomains = new Set([...Object.keys(skillData.skills), ...calibrationByDomain.keys()]);
    const round2 = (value) => Math.round(value * 100) / 100;
    const entries = [];
    const entryByDomain = new Map();
    for (const domain of allDomains) {
        const stored = skillData.skills[domain];
        const calEntries = calibrationByDomain.get(domain) || [];
        let successRate;
        let avgConfidence;
        let sampleSize;
        let lastAttempt;
        if (stored) {
            sampleSize = stored.sampleSize;
            // A stored record with no samples reads as neutral (0.5) rather than dividing by zero.
            successRate = sampleSize > 0 ? stored.successCount / sampleSize : 0.5;
            avgConfidence = sampleSize > 0 ? stored.totalConfidence / sampleSize : 0.5;
            lastAttempt = stored.lastAttempt;
        }
        else if (calEntries.length > 0) {
            sampleSize = calEntries.length;
            successRate = calEntries.reduce((s, e) => s + e.actual, 0) / sampleSize;
            avgConfidence = calEntries.reduce((s, e) => s + e.predicted, 0) / sampleSize;
            lastAttempt = calEntries[calEntries.length - 1].timestamp;
        }
        else {
            continue;
        }
        const profileEntry = {
            domain,
            successRate: round2(successRate),
            avgConfidence: round2(avgConfidence),
            sampleSize,
            lastAttempt,
        };
        entries.push(profileEntry);
        entryByDomain.set(domain, profileEntry);
    }
    // All known domains that could exist but have no data
    const allKnownDomains = [
        'typescript', 'javascript', 'python', 'rust', 'go', 'java', 'ruby',
        'cpp', 'c', 'csharp', 'swift', 'kotlin', 'devops', 'database', 'sql',
        'testing', 'security', 'design', 'writing', 'react', 'git', 'config',
        'html', 'css', 'lua', 'zig',
    ];
    const strengths = entries
        .filter(e => e.successRate >= 0.7 && e.sampleSize >= 3)
        .sort((a, b) => b.successRate - a.successRate);
    const weaknesses = entries
        .filter(e => e.successRate < 0.5 && e.sampleSize >= 3)
        .sort((a, b) => a.successRate - b.successRate);
    const unknown = allKnownDomains.filter(d => {
        const entry = entryByDomain.get(d);
        return !entry || entry.sampleSize < 3;
    });
    return { strengths, weaknesses, unknown };
}
|
|
328
|
+
/**
 * Assess whether the agent is suitable for a given task.
 *
 * Looks up the task's detected domain in the skill profile. The result always
 * has `canDo: true`; `confidence` comes from the matching strength or weakness
 * entry, is 0.5 for an under-sampled known domain and 0.6 otherwise. A
 * `suggestion` is added for weaknesses and under-sampled domains.
 */
export function assessSkillForTask(task) {
    const domain = detectDomain(task);
    const { strengths, weaknesses, unknown } = getSkillProfile();
    const inDomain = (entry) => entry.domain === domain;
    // Known strength: report its average confidence with no caveat.
    const strong = strengths.find(inDomain);
    if (strong) {
        return { canDo: true, confidence: strong.avgConfidence };
    }
    // Known weakness: still doable, but advise a more careful approach.
    const weak = weaknesses.find(inDomain);
    if (weak) {
        const ratePct = Math.round(weak.successRate * 100);
        return {
            canDo: true,
            confidence: weak.avgConfidence,
            suggestion: `Historical success rate in ${domain} is ${ratePct}% — consider breaking this into smaller steps or using a specialized tool.`,
        };
    }
    // Recognised domain without enough samples to judge.
    const untested = unknown.includes(domain);
    if (untested) {
        return {
            canDo: true,
            confidence: 0.5,
            suggestion: `Limited experience with ${domain} tasks — proceeding with caution.`,
        };
    }
    // General domain — no strong signal either way
    return { canDo: true, confidence: 0.6 };
}
|
|
365
|
+
/**
 * Update the skill profile after completing a task.
 *
 * Creates the domain's record on first use; otherwise bumps its success or
 * failure counter, adds to the accumulated confidence and sample count, and
 * refreshes the last-attempt timestamp. The profile is then persisted.
 *
 * @param domain - The task domain (auto-detected or overridden)
 * @param success - Whether the task completed successfully
 * @param confidence - The confidence score used for this task
 */
export function updateSkillProfile(domain, success, confidence) {
    const store = loadSkillData();
    const record = store.skills[domain];
    if (!record) {
        // First sample for this domain.
        store.skills[domain] = {
            successCount: success ? 1 : 0,
            failureCount: success ? 0 : 1,
            totalConfidence: confidence,
            sampleSize: 1,
            lastAttempt: new Date().toISOString(),
        };
    }
    else {
        const counter = success ? 'successCount' : 'failureCount';
        record[counter]++;
        record.totalConfidence += confidence;
        record.sampleSize++;
        record.lastAttempt = new Date().toISOString();
    }
    saveSkillData(store);
}
|
|
395
|
+
/** Load the effort prediction history, defaulting to an empty entry list. */
function loadEffortHistory() {
    const emptyHistory = { entries: [] };
    return loadJSON(EFFORT_FILE, emptyHistory);
}
|
|
398
|
+
/** Persist the effort history, trimming it in place to the newest 100 entries first. */
function saveEffortHistory(data) {
    const maxEntries = 100;
    const surplus = data.entries.length - maxEntries;
    if (surplus > 0) {
        // Oldest records sit at the front of the array.
        data.entries = data.entries.slice(surplus);
    }
    saveJSON(EFFORT_FILE, data);
}
|
|
405
|
+
/**
 * Average the recorded actual tool calls and cost over history entries that
 * have an `actual` result and match either the task type or the domain.
 * Returns null when fewer than two such entries exist.
 */
function getHistoricalEffort(taskType, domain) {
    const { entries } = loadEffortHistory();
    let toolCallSum = 0;
    let costSum = 0;
    let count = 0;
    for (const entry of entries) {
        if (!entry.actual) {
            continue;
        }
        if (entry.taskType !== taskType && entry.domain !== domain) {
            continue;
        }
        // Missing figures count as zero.
        toolCallSum += (entry.actual.toolCalls || 0);
        costSum += (entry.actual.costUsd || 0);
        count += 1;
    }
    if (count < 2) {
        return null;
    }
    return { avgToolCalls: toolCallSum / count, avgCost: costSum / count, count };
}
|
|
415
|
+
/**
 * Classify a task as 'trivial', 'simple', 'moderate', 'complex' or 'ambitious'
 * from its word count, multi-step wording and the number of file references.
 *
 * Checks run in order, so a short single-step task is classified before the
 * 'ambitious' and 'complex' rules are considered.
 *
 * @param task - Free-form task description.
 * @returns The complexity label.
 */
function classifyComplexity(task) {
    const complexity = assessComplexity(task);
    // Count real words only; leading/trailing whitespace must not inflate the total.
    const wordCount = task.trim().split(/\s+/).filter(Boolean).length;
    // Simple heuristics
    if (wordCount < 8 && !complexity.multiStep && complexity.fileCount <= 1)
        return 'trivial';
    if (wordCount < 20 && !complexity.multiStep && complexity.fileCount <= 2)
        return 'simple';
    if (complexity.fileCount > 10 || (complexity.multiStep && wordCount > 50))
        return 'ambitious';
    if (complexity.multiStep || complexity.fileCount > 5 || wordCount > 40)
        return 'complex';
    return 'moderate';
}
|
|
431
|
+
/**
 * Estimate min / expected / max tool calls for a task.
 *
 * Starts from a per-task-type baseline (unknown types use 'general'), scales it
 * by a complexity multiplier (unknown levels use 1.0) and adds half a call per
 * file beyond the second to `expected` (a full call to `max`). Floors: min ≥ 1,
 * expected ≥ 1, max ≥ 2.
 */
function estimateToolCounts(taskType, complexityLevel, fileCount) {
    // Base estimates by task type
    const baselineByType = {
        debug: { min: 3, expected: 6, max: 15 },
        build: { min: 5, expected: 10, max: 25 },
        refactor: { min: 3, expected: 8, max: 20 },
        test: { min: 2, expected: 5, max: 12 },
        deploy: { min: 2, expected: 5, max: 10 },
        explain: { min: 1, expected: 3, max: 6 },
        review: { min: 2, expected: 5, max: 12 },
        search: { min: 1, expected: 3, max: 8 },
        general: { min: 2, expected: 5, max: 12 },
    };
    // Complexity multiplier
    const scaleByLevel = {
        trivial: 0.3,
        simple: 0.6,
        moderate: 1.0,
        complex: 1.8,
        ambitious: 3.0,
    };
    const baseline = baselineByType[taskType] || baselineByType.general;
    const scale = scaleByLevel[complexityLevel] || 1.0;
    // File count adjustment
    const extraPerFile = Math.max(0, (fileCount - 2) * 0.5);
    const atLeast = (floor, value) => Math.max(floor, Math.round(value));
    const min = atLeast(1, baseline.min * scale);
    const expected = atLeast(1, baseline.expected * scale + extraPerFile);
    const max = atLeast(2, baseline.max * scale + extraPerFile * 2);
    return { min, expected, max };
}
|
|
463
|
+
/**
 * Build a comma-separated, human-readable list of the operations a task is
 * expected to involve: file reads always, then edits, searches, a test/build
 * run and a git operation depending on the task type, plus an iteration note
 * for 'ambitious' tasks.
 */
function buildBreakdown(taskType, fileCount, complexityLevel) {
    const isOneOf = (...types) => types.includes(taskType);
    const plural = (count) => (count > 1 ? 's' : '');
    // Reads
    const readCount = Math.max(1, Math.ceil(fileCount * 0.8));
    const segments = [`~${readCount} file read${plural(readCount)}`];
    // Edits
    if (isOneOf('build', 'debug', 'refactor')) {
        const editCount = Math.max(1, Math.ceil(fileCount * 0.5));
        segments.push(`~${editCount} edit${plural(editCount)}`);
    }
    // Search
    if (isOneOf('debug', 'search', 'review', 'refactor')) {
        segments.push('~1-2 searches');
    }
    // Test/build run
    if (isOneOf('test', 'build', 'debug', 'deploy')) {
        segments.push('~1 test/build run');
    }
    // Git
    if (isOneOf('deploy', 'build')) {
        segments.push('~1 git operation');
    }
    if (complexityLevel === 'ambitious') {
        segments.push('may require multiple iterations');
    }
    return segments.join(', ');
}
|
|
491
|
+
/**
 * Estimate the effort required for a task — tool calls, cost, and complexity.
 *
 * Uses historical averages when at least three matching completed tasks exist,
 * otherwise heuristic estimates. Cost is derived from an assumed 500 input /
 * 200 output tokens per tool call. As a side effect the prediction is appended
 * to the effort history so it can later be paired with the actual outcome.
 *
 * @param task - The task description
 * @param context - Optional context (repo state, file list, etc.); accepted for
 *                  interface compatibility and not read by this function.
 * @returns `{ toolCalls: {min, expected, max}, estimatedCostUsd: {min, expected, max}, complexity, breakdown }`
 */
export function estimateEffort(task, context) {
    const taskType = classifyTask(task);
    const domain = detectDomain(task);
    const complexityLevel = classifyComplexity(task);
    const complexity = assessComplexity(task);
    // Try historical data first
    const historical = getHistoricalEffort(taskType, domain);
    let toolCalls;
    if (historical && historical.count >= 3) {
        // Use historical averages with some spread. Floors keep min ≤ expected ≤ max even when
        // the recorded average rounds to zero tool calls.
        const expected = Math.max(1, Math.round(historical.avgToolCalls));
        toolCalls = {
            min: Math.max(1, Math.round(expected * 0.5)),
            expected,
            max: Math.max(2, expected * 2),
        };
    }
    else {
        toolCalls = estimateToolCounts(taskType, complexityLevel, complexity.fileCount);
    }
    // Cost estimation — rough approximation based on tool calls
    // Each tool call ~ 500 input tokens + 200 output tokens on average
    const tokensPerCall = { input: 500, output: 200 };
    let costPerCall;
    try {
        const provider = getByokProvider();
        // Decide "local = free" before pricing, so a pricing failure for a local provider
        // cannot fall through to the paid default below.
        costPerCall = isLocalProvider(provider)
            ? 0
            : estimateCost(provider, tokensPerCall.input, tokensPerCall.output);
    }
    catch {
        // Default to Anthropic pricing (~$3/$15 per MTok)
        costPerCall = (tokensPerCall.input * 3 / 1_000_000) + (tokensPerCall.output * 15 / 1_000_000);
    }
    const toUsd = (calls) => Math.round(calls * costPerCall * 10000) / 10000;
    const estimatedCostUsd = {
        min: toUsd(toolCalls.min),
        expected: toUsd(toolCalls.expected),
        max: toUsd(toolCalls.max),
    };
    const breakdown = buildBreakdown(taskType, complexity.fileCount, complexityLevel);
    // Store prediction for later calibration
    const history = loadEffortHistory();
    history.entries.push({
        task: task.slice(0, 200),
        taskType,
        domain,
        predicted: {
            toolCalls: toolCalls.expected,
            costUsd: estimatedCostUsd.expected,
            complexity: complexityLevel,
        },
        timestamp: new Date().toISOString(),
    });
    saveEffortHistory(history);
    return {
        toolCalls,
        estimatedCostUsd,
        complexity: complexityLevel,
        breakdown,
    };
}
|
|
558
|
+
/**
 * Record actual effort after a task completes, for future calibration.
 *
 * Attaches the actual tool-call count and cost to the most recent stored
 * prediction whose (200-character truncated) task text matches and that has no
 * actual result yet. The history is saved whether or not a match was found.
 */
export function recordActualEffort(task, actualToolCalls, actualCostUsd) {
    const history = loadEffortHistory();
    const key = task.slice(0, 200);
    // Search newest-first on a reversed copy; the entry objects themselves are shared.
    const pending = history.entries
        .slice()
        .reverse()
        .find((entry) => entry.task === key && !entry.actual);
    if (pending) {
        pending.actual = { toolCalls: actualToolCalls, costUsd: actualCostUsd };
    }
    saveEffortHistory(history);
}
|
|
573
|
+
// ══════════════════════════════════════════════════════════════════
|
|
574
|
+
// TOOL REGISTRATION
|
|
575
|
+
// ══════════════════════════════════════════════════════════════════
|
|
576
|
+
/**
 * Register confidence engine tools with the K:BOT tool registry.
 *
 * Three free-tier tools are registered, in this order:
 *  - `confidence_check` — scores a task and returns the formatted report
 *  - `skill_profile`    — lists strengths, weaknesses and untested domains, with
 *                         an optional per-task assessment
 *  - `effort_estimate`  — predicts tool calls, cost and complexity for a task
 */
export function registerConfidenceTools() {
    const MISSING_TASK = 'Error: task parameter is required';
    const asPercent = (fraction) => Math.round(fraction * 100);
    const usd = (amount) => `$${amount.toFixed(4)}`;
    const skillLine = (skill) => ` ${skill.domain}: ${asPercent(skill.successRate)}% success (${skill.sampleSize} tasks, avg confidence ${asPercent(skill.avgConfidence)}%)`;

    // confidence_check: score the task against the supplied context.
    const runConfidenceCheck = async (args) => {
        const task = String(args.task || '');
        const context = String(args.context || '');
        if (!task) {
            return MISSING_TASK;
        }
        return reportConfidence(estimateConfidence(task, context));
    };

    // skill_profile: render the profile, then the optional task assessment.
    const runSkillProfile = async (args) => {
        const task = args.task ? String(args.task) : null;
        const profile = getSkillProfile();
        const out = ['=== Skill Profile ===', ''];
        if (profile.strengths.length > 0) {
            out.push('Strengths:', ...profile.strengths.map(skillLine), '');
        }
        if (profile.weaknesses.length > 0) {
            out.push('Weaknesses:', ...profile.weaknesses.map(skillLine), '');
        }
        if (profile.unknown.length > 0) {
            out.push(`Untested domains: ${profile.unknown.join(', ')}`, '');
        }
        if (task) {
            out.push('--- Task Assessment ---');
            const assessment = assessSkillForTask(task);
            out.push(`Can do: ${assessment.canDo ? 'yes' : 'no'}`);
            out.push(`Confidence: ${asPercent(assessment.confidence)}%`);
            if (assessment.suggestion) {
                out.push(`Note: ${assessment.suggestion}`);
            }
        }
        return out.join('\n');
    };

    // effort_estimate: format the predicted tool calls and cost ranges.
    const runEffortEstimate = async (args) => {
        const task = String(args.task || '');
        const context = args.context ? String(args.context) : undefined;
        if (!task) {
            return MISSING_TASK;
        }
        const { complexity, toolCalls, estimatedCostUsd, breakdown } = estimateEffort(task, context);
        return [
            '=== Effort Estimate ===',
            '',
            `Complexity: ${complexity}`,
            '',
            `Tool calls:`,
            ` Min: ${toolCalls.min}`,
            ` Expected: ${toolCalls.expected}`,
            ` Max: ${toolCalls.max}`,
            '',
            `Estimated cost (USD):`,
            ` Min: ${usd(estimatedCostUsd.min)}`,
            ` Expected: ${usd(estimatedCostUsd.expected)}`,
            ` Max: ${usd(estimatedCostUsd.max)}`,
            '',
            `Breakdown: ${breakdown}`,
        ].join('\n');
    };

    registerTool({
        name: 'confidence_check',
        description: 'Get a confidence score for a proposed action or task. Returns factual, approach, and completeness scores with reasoning.',
        parameters: {
            task: {
                type: 'string',
                description: 'Description of the task or action to assess confidence for',
                required: true,
            },
            context: {
                type: 'string',
                description: 'Available context (repo state, file contents, memory, etc.)',
                required: false,
                default: '',
            },
        },
        tier: 'free',
        execute: runConfidenceCheck,
    });
    registerTool({
        name: 'skill_profile',
        description: 'Show the agent skill profile — strengths, weaknesses, and untested domains. Optionally assess suitability for a specific task.',
        parameters: {
            task: {
                type: 'string',
                description: 'Optional task to assess suitability for',
                required: false,
            },
        },
        tier: 'free',
        execute: runSkillProfile,
    });
    registerTool({
        name: 'effort_estimate',
        description: 'Predict how many tool calls, cost, and complexity a task will involve. Uses historical data when available.',
        parameters: {
            task: {
                type: 'string',
                description: 'Description of the task to estimate effort for',
                required: true,
            },
            context: {
                type: 'string',
                description: 'Optional context about the current state',
                required: false,
            },
        },
        tier: 'free',
        execute: runEffortEstimate,
    });
}
|
|
693
|
+
//# sourceMappingURL=confidence.js.map
|