@kernel.chat/kbot 2.8.0 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,693 @@
1
+ // K:BOT Confidence Engine — Self-Awareness for Agent Actions
2
+ //
3
+ // Three systems:
4
+ // 1. CONFIDENCE CALIBRATION — Express uncertainty about responses/actions
5
+ // 2. SKILL BOUNDARIES — Self-model of strengths, weaknesses, unknowns
6
+ // 3. EFFORT ESTIMATION — Predict tool calls, cost, and complexity
7
+ //
8
+ // Persists calibration and effort history to ~/.kbot/ as JSON files.
9
+ // Integrates with the learning engine for pattern/task data.
10
+ import { homedir } from 'node:os';
11
+ import { join } from 'node:path';
12
+ import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'node:fs';
13
+ import { registerTool } from './tools/index.js';
14
+ import { classifyTask, findPattern } from './learning.js';
15
+ import { isLocalProvider, getByokProvider, estimateCost } from './auth.js';
16
+ // ── File Paths ──
17
// Root directory for all persisted K:BOT state, inside the user's home directory.
const KBOT_DIR = join(homedir(), '.kbot');
// The three JSON stores kept in KBOT_DIR, resolved together and unpacked by name.
const [CONFIDENCE_FILE, EFFORT_FILE, SKILL_FILE] = [
    'confidence.json',
    'effort-history.json',
    'skill-profile.json',
].map((fileName) => join(KBOT_DIR, fileName));
21
+ // ── Helpers ──
22
/**
 * Make sure the K:BOT data directory exists, creating it (and any missing
 * parent directories) when it is absent.
 */
function ensureDir() {
    const alreadyThere = existsSync(KBOT_DIR);
    if (alreadyThere) {
        return;
    }
    mkdirSync(KBOT_DIR, { recursive: true });
}
26
/**
 * Read and parse a JSON file, falling back to a default value on any problem.
 *
 * The fallback is returned when the data directory cannot be prepared, when
 * the file does not exist, when it cannot be read or parsed, or when it parses
 * to `null`.
 *
 * @param {string} path - Absolute path of the JSON file to read.
 * @param {*} fallback - Value returned when no usable data is available.
 * @returns {*} The parsed file contents, or `fallback`.
 */
function loadJSON(path, fallback) {
    try {
        // Directory preparation lives inside the try so that a failure to
        // create it degrades to the fallback instead of throwing at callers.
        ensureDir();
        if (!existsSync(path)) {
            return fallback;
        }
        const parsed = JSON.parse(readFileSync(path, 'utf-8'));
        // A file holding the literal `null` parses cleanly but is unusable to
        // callers that immediately read properties off the result.
        return parsed ?? fallback;
    }
    catch {
        return fallback;
    }
}
37
/**
 * Serialize `data` as pretty-printed JSON (2-space indent) and write it to `path`.
 *
 * Persistence is best-effort: any failure, including failure to create the
 * data directory, is swallowed so callers never crash on a write.
 *
 * @param {string} path - Absolute path of the JSON file to write.
 * @param {*} data - JSON-serializable value to persist.
 */
function saveJSON(path, data) {
    try {
        // Kept inside the try: a directory that cannot be created must be as
        // silent as a file that cannot be written.
        ensureDir();
        writeFileSync(path, JSON.stringify(data, null, 2), 'utf-8');
    }
    catch { /* silently fail — non-critical persistence */ }
}
44
/**
 * Load the calibration store from disk, or an empty store
 * (no entries, zero average error) when none is available.
 */
function loadCalibration() {
    const emptyStore = { entries: [], avgError: 0 };
    return loadJSON(CONFIDENCE_FILE, emptyStore);
}
47
/**
 * Persist the calibration store, keeping only the 200 most recent entries.
 * Trimming replaces `data.entries` on the object that was passed in.
 */
function saveCalibration(data) {
    const MAX_ENTRIES = 200;
    const overflow = data.entries.length - MAX_ENTRIES;
    if (overflow > 0) {
        // Drop the oldest entries from the front to bound file growth.
        data.entries = data.entries.slice(overflow);
    }
    saveJSON(CONFIDENCE_FILE, data);
}
54
/**
 * Mean signed prediction error (predicted - actual) for one domain.
 * Positive means overconfident, negative means underconfident.
 * Returns 0 when fewer than 3 entries exist for the domain.
 */
function getCalibrationBias(domain) {
    const { entries } = loadCalibration();
    let matched = 0;
    let signedErrorSum = 0;
    for (const entry of entries) {
        if (entry.domain !== domain) {
            continue;
        }
        matched += 1;
        signedErrorSum += entry.predicted - entry.actual;
    }
    return matched < 3 ? 0 : signedErrorSum / matched;
}
64
/**
 * Average recorded `actual` score for a domain, with the number of samples.
 * With no samples the rate defaults to 0.5 and the count is 0.
 */
function getHistoricalSuccessRate(domain) {
    const { entries } = loadCalibration();
    const outcomes = [];
    for (const entry of entries) {
        if (entry.domain === domain) {
            outcomes.push(entry.actual);
        }
    }
    const count = outcomes.length;
    if (count === 0) {
        return { rate: 0.5, count: 0 };
    }
    let total = 0;
    for (const outcome of outcomes) {
        total += outcome;
    }
    return { rate: total / count, count };
}
73
/**
 * Detect the primary domain of a task from its text.
 *
 * A file extension in the text takes precedence. Otherwise the lower-cased
 * text is checked for domain keywords by substring, in declaration order, and
 * the first domain with a hit wins.
 *
 * @param {string} task - Free-form task description.
 * @returns {string} A domain name, or 'general' when nothing matches.
 */
function detectDomain(task) {
    const lower = task.toLowerCase();
    // Check file extensions. The trailing \b forces the whole extension to
    // match: without it ".css"/".cs" stop at "c" and ".json" stops at "js".
    const extMatch = lower.match(/\.(ts|tsx|js|jsx|py|rs|go|java|rb|cpp|c|cs|swift|kt|lua|zig|sql|css|html|md|yaml|yml|json|toml|sh|bash|zsh)\b/);
    if (extMatch) {
        const extMap = {
            ts: 'typescript', tsx: 'typescript', js: 'javascript', jsx: 'javascript',
            py: 'python', rs: 'rust', go: 'go', java: 'java', rb: 'ruby',
            cpp: 'cpp', c: 'c', cs: 'csharp', swift: 'swift', kt: 'kotlin',
            lua: 'lua', zig: 'zig', sql: 'sql', css: 'css', html: 'html',
            md: 'writing', yaml: 'devops', yml: 'devops', json: 'config',
            toml: 'config', sh: 'devops', bash: 'devops', zsh: 'devops',
        };
        return extMap[extMatch[1]] || 'general';
    }
    // Check language/domain keywords. Several keywords end in a space so they
    // are less likely to hit inside a longer word.
    const domainKeywords = {
        typescript: ['typescript', 'ts ', 'tsx', 'type ', 'interface ', 'enum '],
        javascript: ['javascript', 'js ', 'jsx', 'node', 'npm', 'yarn', 'pnpm', 'bun'],
        python: ['python', 'pip', 'conda', 'django', 'flask', 'fastapi', 'pytorch'],
        rust: ['rust', 'cargo', 'crate', 'impl ', 'fn ', 'struct '],
        go: ['golang', 'go ', 'goroutine', 'go mod'],
        devops: ['docker', 'kubernetes', 'k8s', 'terraform', 'ansible', 'ci/cd', 'pipeline', 'deploy', 'nginx', 'aws', 'gcp', 'azure'],
        database: ['sql', 'postgres', 'mysql', 'mongo', 'redis', 'supabase', 'prisma', 'database', 'migration'],
        writing: ['write', 'blog', 'article', 'documentation', 'readme', 'explain', 'describe'],
        testing: ['test', 'spec', 'coverage', 'vitest', 'jest', 'pytest', 'assert'],
        security: ['security', 'vulnerability', 'auth', 'permission', 'encrypt', 'cve'],
        design: ['css', 'style', 'layout', 'responsive', 'animation', 'color', 'font'],
        react: ['react', 'component', 'hook', 'jsx', 'tsx', 'next.js', 'nextjs', 'remix'],
        git: ['git', 'commit', 'branch', 'merge', 'rebase', 'pr ', 'pull request'],
    };
    for (const [domain, keywords] of Object.entries(domainKeywords)) {
        if (keywords.some(kw => lower.includes(kw))) {
            return domain;
        }
    }
    return 'general';
}
111
/**
 * Count complexity signals in a task description.
 *
 * @param {string} task - Free-form task description.
 * @returns {{fileCount: number, ambiguity: number, multiStep: boolean}}
 *   `fileCount` is the number of file-like and directory-like references
 *   (at least 1); `ambiguity` is the fraction of known vague phrases present
 *   (0 to 1); `multiStep` is true when a sequencing or quantity word appears.
 */
function assessComplexity(task) {
    const lower = task.toLowerCase();
    // Estimate number of files involved: "name.ext" tokens plus "dir/" prefixes.
    const fileRefs = (task.match(/\b\w+\.\w{1,5}\b/g) || []).length;
    const dirRefs = (task.match(/\b\w+\//g) || []).length;
    const fileCount = Math.max(1, fileRefs + dirRefs);
    // Ambiguity — vague words that signal unclear intent
    const vagueTerms = ['maybe', 'somehow', 'might', 'possibly', 'not sure', 'something like',
        'kind of', 'sort of', 'whatever', 'figure out', 'if possible'];
    const ambiguity = vagueTerms.filter(t => lower.includes(t)).length / vagueTerms.length;
    // Multi-step — signals that the task has multiple phases. Terms match as
    // whole words so "install"/"call" (contain "all") or "reach" (contains
    // "each") no longer flag a task. "multi" stays a prefix ("multiple",
    // "multi-step") and "step" also accepts the plural.
    const multiStepPattern = /\b(?:then|after that|also|and then|next|finally|first|second|steps?|multi\w*|several|all|each)\b/;
    const multiStep = multiStepPattern.test(lower);
    return { fileCount, ambiguity, multiStep };
}
128
/**
 * Estimate confidence for a task before execution.
 *
 * Combines three sub-scores: factual (context, cached pattern, history),
 * approach (recognized task type, proven pattern, ambiguity, multi-step) and
 * completeness (file count, multi-step, repo context, cached pattern). The
 * scores are adjusted for a local provider and for historical calibration
 * bias, then clamped to [0.05, 0.99].
 *
 * @param {string} task - Description of the task to assess.
 * @param {string} [context=''] - Available context text; a missing or null
 *   value is treated as empty.
 * @returns {{overall: number, factual: number, approach: number, completeness: number, reasoning: string}}
 *   Scores rounded to two decimals plus a one-line explanation.
 */
export function estimateConfidence(task, context = '') {
    // Normalize so .length/.includes below never throw on null or non-strings.
    const ctx = typeof context === 'string' ? context : String(context ?? '');
    const domain = detectDomain(task);
    const taskType = classifyTask(task);
    const complexity = assessComplexity(task);
    const cachedPattern = findPattern(task);
    const historical = getHistoricalSuccessRate(domain);
    const calibrationBias = getCalibrationBias(domain);
    // Resolve the provider once; both the score penalty and the reasoning use it.
    let usingLocalModel = false;
    try {
        usingLocalModel = Boolean(isLocalProvider(getByokProvider()));
    }
    catch { /* no provider configured — skip adjustment */ }
    // ── Base scores ──
    // Factual confidence — higher if we have context, patterns, or history
    let factual = 0.5;
    if (ctx.length > 200) {
        factual += 0.15; // good context available
    }
    if (ctx.length > 1000) {
        factual += 0.1; // rich context
    }
    if (cachedPattern) {
        factual += 0.15; // we've done this before
    }
    if (historical.count > 5) {
        factual += (historical.rate - 0.5) * 0.2; // historical performance adjustment
    }
    // Approach confidence — higher for known task types, lower for ambiguity
    let approach = 0.6;
    if (taskType !== 'general') {
        approach += 0.1; // recognized task type
    }
    if (cachedPattern && cachedPattern.successRate > 0.8) {
        approach += 0.2; // proven pattern
    }
    approach -= complexity.ambiguity * 0.3; // ambiguity reduces confidence
    if (complexity.multiStep) {
        approach -= 0.1; // multi-step = more room for error
    }
    // Completeness — how sure we cover everything
    let completeness = 0.6;
    if (complexity.fileCount > 5) {
        completeness -= 0.15; // many files = easy to miss things
    }
    if (complexity.multiStep) {
        completeness -= 0.1; // multi-step = might skip a step
    }
    if (ctx.includes('repo-map') || ctx.includes('project structure')) {
        completeness += 0.1;
    }
    if (cachedPattern) {
        completeness += 0.1;
    }
    // Provider adjustment — local models get a penalty on complex tasks
    if (usingLocalModel) {
        const localPenalty = complexity.multiStep ? 0.2 : 0.1;
        factual -= localPenalty;
        approach -= localPenalty;
        completeness -= localPenalty * 0.5;
    }
    // Apply calibration correction — if we're historically overconfident, reduce scores
    factual -= calibrationBias * 0.5;
    approach -= calibrationBias * 0.5;
    completeness -= calibrationBias * 0.3;
    // Clamp all scores to [0.05, 0.99]
    factual = Math.max(0.05, Math.min(0.99, factual));
    approach = Math.max(0.05, Math.min(0.99, approach));
    completeness = Math.max(0.05, Math.min(0.99, completeness));
    // Overall — weighted average
    const overall = Math.round((factual * 0.3 + approach * 0.4 + completeness * 0.3) * 100) / 100;
    // Build reasoning
    const reasons = [];
    if (cachedPattern) {
        reasons.push('have a proven pattern for this');
    }
    if (historical.count > 5 && historical.rate > 0.7) {
        reasons.push(`strong track record in ${domain}`);
    }
    if (historical.count > 5 && historical.rate < 0.4) {
        reasons.push(`historically weak in ${domain}`);
    }
    if (complexity.ambiguity > 0.2) {
        reasons.push('request is somewhat ambiguous');
    }
    if (complexity.multiStep) {
        reasons.push('multi-step task');
    }
    if (ctx.length < 100) {
        reasons.push('limited context available');
    }
    if (usingLocalModel) {
        reasons.push('using local model (lower capability)');
    }
    const pct = Math.round(overall * 100);
    const reasoning = reasons.length > 0
        ? `~${pct}% confident — ${reasons.join(', ')}`
        : `~${pct}% confident — standard assessment for ${domain} ${taskType} task`;
    return {
        overall,
        factual: Math.round(factual * 100) / 100,
        approach: Math.round(approach * 100) / 100,
        completeness: Math.round(completeness * 100) / 100,
        reasoning,
    };
}
224
/**
 * Render a confidence score as a multi-line, human-readable report.
 *
 * The overall percentage is labelled "high" at 80 or above, "moderate" at 50
 * or above, and "low" otherwise. The sub-scores and the reasoning line follow.
 */
export function reportConfidence(score) {
    const asPercent = (fraction) => Math.round(fraction * 100);
    const pct = asPercent(score.overall);
    let level = 'low';
    if (pct >= 80) {
        level = 'high';
    }
    else if (pct >= 50) {
        level = 'moderate';
    }
    const report = [];
    report.push(`Confidence: ${pct}% (${level})`);
    report.push(` Factual: ${asPercent(score.factual)}%`);
    report.push(` Approach: ${asPercent(score.approach)}%`);
    report.push(` Completeness: ${asPercent(score.completeness)}%`);
    report.push(` ${score.reasoning}`);
    return report.join('\n');
}
239
/**
 * Append a predicted-vs-actual calibration sample for a completed task and
 * refresh the stored mean absolute error.
 *
 * The task text is truncated to 200 characters and tagged with its detected
 * domain and the current ISO timestamp before being saved.
 */
export function recordCalibration(task, predicted, actual) {
    const sample = {
        task: task.slice(0, 200), // truncate for storage
        predicted,
        actual,
        domain: detectDomain(task),
        timestamp: new Date().toISOString(),
    };
    const data = loadCalibration();
    data.entries.push(sample);
    // Recompute running average error over every stored entry, new one included.
    let absErrorSum = 0;
    for (const entry of data.entries) {
        absErrorSum += Math.abs(entry.predicted - entry.actual);
    }
    data.avgError = absErrorSum / data.entries.length;
    saveCalibration(data);
}
259
/** Load the per-domain skill store from disk, or an empty one when none is available. */
function loadSkillData() {
    const emptyStore = { skills: {} };
    return loadJSON(SKILL_FILE, emptyStore);
}
262
/** Write the per-domain skill store to the skill profile file. */
function saveSkillData(data) {
    const destination = SKILL_FILE;
    saveJSON(destination, data);
}
265
/**
 * Build a skill profile from stored skill data and calibration history.
 *
 * For each domain, stored skill counters are used when present; otherwise the
 * domain's calibration entries supply success rate, average confidence, sample
 * size and last attempt.
 *
 * @returns {{strengths: object[], weaknesses: object[], unknown: string[]}}
 *   `strengths`: domains with success rate >= 0.7 and at least 3 samples, best first.
 *   `weaknesses`: domains with success rate < 0.5 and at least 3 samples, worst first.
 *   `unknown`: known domain names with no data or fewer than 3 samples.
 */
export function getSkillProfile() {
    const skillData = loadSkillData();
    const calibration = loadCalibration();
    // Group calibration entries by domain in a single pass instead of
    // re-filtering the whole list for every domain.
    const calByDomain = new Map();
    for (const entry of calibration.entries) {
        const bucket = calByDomain.get(entry.domain);
        if (bucket) {
            bucket.push(entry);
        }
        else {
            calByDomain.set(entry.domain, [entry]);
        }
    }
    // Stored skill domains first, then calibration-only domains in order of
    // first appearance.
    const allDomains = new Set([...Object.keys(skillData.skills), ...calByDomain.keys()]);
    const entries = [];
    for (const domain of allDomains) {
        const stored = skillData.skills[domain];
        const calEntries = calByDomain.get(domain) || [];
        let successRate;
        let avgConfidence;
        let sampleSize;
        let lastAttempt;
        if (stored) {
            sampleSize = stored.sampleSize;
            // 0.5 is the neutral default when a stored record has no samples.
            successRate = sampleSize > 0 ? stored.successCount / sampleSize : 0.5;
            avgConfidence = sampleSize > 0 ? stored.totalConfidence / sampleSize : 0.5;
            lastAttempt = stored.lastAttempt;
        }
        else if (calEntries.length > 0) {
            sampleSize = calEntries.length;
            successRate = calEntries.reduce((s, e) => s + e.actual, 0) / sampleSize;
            avgConfidence = calEntries.reduce((s, e) => s + e.predicted, 0) / sampleSize;
            lastAttempt = calEntries[calEntries.length - 1].timestamp;
        }
        else {
            continue;
        }
        entries.push({
            domain,
            successRate: Math.round(successRate * 100) / 100,
            avgConfidence: Math.round(avgConfidence * 100) / 100,
            sampleSize,
            lastAttempt,
        });
    }
    // All known domains that could exist but have no data
    const allKnownDomains = [
        'typescript', 'javascript', 'python', 'rust', 'go', 'java', 'ruby',
        'cpp', 'c', 'csharp', 'swift', 'kotlin', 'devops', 'database', 'sql',
        'testing', 'security', 'design', 'writing', 'react', 'git', 'config',
        'html', 'css', 'lua', 'zig',
    ];
    const strengths = entries
        .filter(e => e.successRate >= 0.7 && e.sampleSize >= 3)
        .sort((a, b) => b.successRate - a.successRate);
    const weaknesses = entries
        .filter(e => e.successRate < 0.5 && e.sampleSize >= 3)
        .sort((a, b) => a.successRate - b.successRate);
    // Constant-time lookup of each assessed domain's sample size.
    const sampleSizeByDomain = new Map(entries.map(e => [e.domain, e.sampleSize]));
    const unknown = allKnownDomains.filter(d => !sampleSizeByDomain.has(d) || sampleSizeByDomain.get(d) < 3);
    return { strengths, weaknesses, unknown };
}
328
/**
 * Judge how well suited the agent is to a task, based on the skill profile of
 * the task's detected domain.
 *
 * `canDo` is always true. `confidence` is the domain's average confidence for
 * a known strength or weakness, 0.5 for an untested domain and 0.6 otherwise.
 * A `suggestion` is included for weaknesses and untested domains.
 */
export function assessSkillForTask(task) {
    const domain = detectDomain(task);
    const { strengths, weaknesses, unknown } = getSkillProfile();
    const inThisDomain = (entry) => entry.domain === domain;
    // Known strength: report its track record without caveats.
    const strength = strengths.find(inThisDomain);
    if (strength) {
        return { canDo: true, confidence: strength.avgConfidence };
    }
    // Known weakness: still attempt, but advise a more careful approach.
    const weakness = weaknesses.find(inThisDomain);
    if (weakness) {
        const successPct = Math.round(weakness.successRate * 100);
        return {
            canDo: true,
            confidence: weakness.avgConfidence,
            suggestion: `Historical success rate in ${domain} is ${successPct}% — consider breaking this into smaller steps or using a specialized tool.`,
        };
    }
    // Untested domain gets a cautionary note; anything else has no strong signal.
    const isUntested = unknown.includes(domain);
    return isUntested
        ? {
            canDo: true,
            confidence: 0.5,
            suggestion: `Limited experience with ${domain} tasks — proceeding with caution.`,
        }
        : { canDo: true, confidence: 0.6 };
}
365
/**
 * Fold the outcome of a finished task into the stored skill profile.
 *
 * @param domain - The task domain whose record is updated or created
 * @param success - Whether the task completed successfully
 * @param confidence - The confidence score used for this task
 */
export function updateSkillProfile(domain, success, confidence) {
    const data = loadSkillData();
    const record = data.skills[domain];
    if (!record) {
        // First sample for this domain: start a fresh record.
        const wins = success ? 1 : 0;
        data.skills[domain] = {
            successCount: wins,
            failureCount: 1 - wins,
            totalConfidence: confidence,
            sampleSize: 1,
            lastAttempt: new Date().toISOString(),
        };
    }
    else {
        if (success) {
            record.successCount++;
        }
        else {
            record.failureCount++;
        }
        record.totalConfidence += confidence;
        record.sampleSize++;
        record.lastAttempt = new Date().toISOString();
    }
    saveSkillData(data);
}
395
/** Load the effort prediction history from disk, or an empty history when none is available. */
function loadEffortHistory() {
    const emptyHistory = { entries: [] };
    return loadJSON(EFFORT_FILE, emptyHistory);
}
398
/**
 * Persist the effort history, keeping only the 100 most recent entries.
 * Trimming replaces `data.entries` on the object that was passed in.
 */
function saveEffortHistory(data) {
    const MAX_ENTRIES = 100;
    const overflow = data.entries.length - MAX_ENTRIES;
    if (overflow > 0) {
        // Drop the oldest entries from the front.
        data.entries = data.entries.slice(overflow);
    }
    saveJSON(EFFORT_FILE, data);
}
405
/**
 * Average the recorded actual tool calls and cost over history entries that
 * have an outcome and match either the task type or the domain.
 * Returns null when fewer than 2 such entries exist.
 */
function getHistoricalEffort(taskType, domain) {
    const { entries } = loadEffortHistory();
    let count = 0;
    let toolCallSum = 0;
    let costSum = 0;
    for (const entry of entries) {
        if (!entry.actual) {
            continue; // prediction only, no recorded outcome yet
        }
        if (entry.taskType !== taskType && entry.domain !== domain) {
            continue;
        }
        count += 1;
        toolCallSum += entry.actual.toolCalls || 0;
        costSum += entry.actual.costUsd || 0;
    }
    if (count < 2) {
        return null;
    }
    return { avgToolCalls: toolCallSum / count, avgCost: costSum / count, count };
}
415
/**
 * Classify a task's complexity from its word count and complexity signals.
 *
 * @param {string} task - Free-form task description.
 * @returns {'trivial'|'simple'|'ambitious'|'complex'|'moderate'} The first
 *   rule that matches, checked in the order trivial, simple, ambitious,
 *   complex; 'moderate' when none apply.
 */
function classifyComplexity(task) {
    const { multiStep, fileCount } = assessComplexity(task);
    // Trim first so leading/trailing whitespace is not counted as extra words.
    const trimmed = task.trim();
    const wordCount = trimmed === '' ? 0 : trimmed.split(/\s+/).length;
    // Simple heuristics
    if (wordCount < 8 && !multiStep && fileCount <= 1) {
        return 'trivial';
    }
    if (wordCount < 20 && !multiStep && fileCount <= 2) {
        return 'simple';
    }
    if (fileCount > 10 || (multiStep && wordCount > 50)) {
        return 'ambitious';
    }
    if (multiStep || fileCount > 5 || wordCount > 40) {
        return 'complex';
    }
    return 'moderate';
}
431
/**
 * Estimate a min/expected/max number of tool calls for a task.
 *
 * Starts from a per-task-type baseline (unknown types use the 'general' row),
 * scales it by a complexity multiplier (unknown levels use 1.0) and adds half
 * a call to the expected value, and one call to the max, for every file
 * beyond the second.
 */
function estimateToolCounts(taskType, complexityLevel, fileCount) {
    const callsByTaskType = {
        debug: { min: 3, expected: 6, max: 15 },
        build: { min: 5, expected: 10, max: 25 },
        refactor: { min: 3, expected: 8, max: 20 },
        test: { min: 2, expected: 5, max: 12 },
        deploy: { min: 2, expected: 5, max: 10 },
        explain: { min: 1, expected: 3, max: 6 },
        review: { min: 2, expected: 5, max: 12 },
        search: { min: 1, expected: 3, max: 8 },
        general: { min: 2, expected: 5, max: 12 },
    };
    const scaleByComplexity = {
        trivial: 0.3,
        simple: 0.6,
        moderate: 1.0,
        complex: 1.8,
        ambitious: 3.0,
    };
    const baseline = callsByTaskType[taskType] || callsByTaskType.general;
    const scale = scaleByComplexity[complexityLevel] || 1.0;
    const extraForFiles = Math.max(0, (fileCount - 2) * 0.5);
    // Round to a whole number of calls, never below the given floor.
    const atLeast = (floor, value) => Math.max(floor, Math.round(value));
    return {
        min: atLeast(1, baseline.min * scale),
        expected: atLeast(1, baseline.expected * scale + extraForFiles),
        max: atLeast(2, baseline.max * scale + extraForFiles * 2),
    };
}
463
/**
 * Describe the operations a task is expected to involve as one
 * comma-separated line: file reads always, then edits, searches, a test/build
 * run and a git operation depending on the task type, plus an iteration
 * warning for 'ambitious' tasks.
 */
function buildBreakdown(taskType, fileCount, complexityLevel) {
    const plural = (count) => (count > 1 ? 's' : '');
    const isOneOf = (...types) => types.includes(taskType);
    // Reads
    const reads = Math.max(1, Math.ceil(fileCount * 0.8));
    const parts = [`~${reads} file read${plural(reads)}`];
    // Edits
    if (isOneOf('build', 'debug', 'refactor')) {
        const edits = Math.max(1, Math.ceil(fileCount * 0.5));
        parts.push(`~${edits} edit${plural(edits)}`);
    }
    // Search
    if (isOneOf('debug', 'search', 'review', 'refactor')) {
        parts.push('~1-2 searches');
    }
    // Test/build run
    if (isOneOf('test', 'build', 'debug', 'deploy')) {
        parts.push('~1 test/build run');
    }
    // Git
    if (isOneOf('deploy', 'build')) {
        parts.push('~1 git operation');
    }
    if (complexityLevel === 'ambitious') {
        parts.push('may require multiple iterations');
    }
    return parts.join(', ');
}
491
/**
 * Estimate the effort required for a task — tool calls, cost, and complexity.
 *
 * Uses historical averages when at least 3 matching outcomes exist, otherwise
 * falls back to per-task-type heuristics. Every call also appends the
 * prediction to the effort history so it can be calibrated later.
 *
 * @param task - The task description
 * @param context - Optional context (repo state, file list, etc.); accepted
 *   for interface compatibility and not used by the estimate.
 * @returns {{toolCalls: {min: number, expected: number, max: number},
 *   estimatedCostUsd: {min: number, expected: number, max: number},
 *   complexity: string, breakdown: string}}
 */
export function estimateEffort(task, context) {
    const taskType = classifyTask(task);
    const domain = detectDomain(task);
    const complexityLevel = classifyComplexity(task);
    const complexity = assessComplexity(task);
    // Try historical data first
    const historical = getHistoricalEffort(taskType, domain);
    let toolCalls;
    if (historical && historical.count >= 3) {
        // Use historical averages with some spread. The floors match the
        // heuristic path (min >= 1, expected >= 1, max >= 2) so a recorded
        // average of 0 cannot produce a range whose min exceeds its max.
        const avg = Math.round(historical.avgToolCalls);
        toolCalls = {
            min: Math.max(1, Math.round(avg * 0.5)),
            expected: Math.max(1, avg),
            max: Math.max(2, Math.round(avg * 2)),
        };
    }
    else {
        toolCalls = estimateToolCounts(taskType, complexityLevel, complexity.fileCount);
    }
    // Cost estimation — rough approximation based on tool calls
    // Each tool call ~ 500 input tokens + 200 output tokens on average
    const tokensPerCall = { input: 500, output: 200 };
    let costPerCall;
    try {
        const provider = getByokProvider();
        costPerCall = estimateCost(provider, tokensPerCall.input, tokensPerCall.output);
        if (isLocalProvider(provider)) {
            costPerCall = 0; // local providers are costed at zero
        }
    }
    catch {
        // Default to Anthropic pricing (~$3/$15 per MTok)
        costPerCall = (tokensPerCall.input * 3 / 1_000_000) + (tokensPerCall.output * 15 / 1_000_000);
    }
    // Round each cost to 4 decimal places.
    const estimatedCostUsd = {
        min: Math.round(toolCalls.min * costPerCall * 10000) / 10000,
        expected: Math.round(toolCalls.expected * costPerCall * 10000) / 10000,
        max: Math.round(toolCalls.max * costPerCall * 10000) / 10000,
    };
    const breakdown = buildBreakdown(taskType, complexity.fileCount, complexityLevel);
    // Store prediction for later calibration
    const history = loadEffortHistory();
    history.entries.push({
        task: task.slice(0, 200),
        taskType,
        domain,
        predicted: {
            toolCalls: toolCalls.expected,
            costUsd: estimatedCostUsd.expected,
            complexity: complexityLevel,
        },
        timestamp: new Date().toISOString(),
    });
    saveEffortHistory(history);
    return {
        toolCalls,
        estimatedCostUsd,
        complexity: complexityLevel,
        breakdown,
    };
}
558
/**
 * Attach the real tool-call count and cost to the newest stored prediction
 * for this task that has no outcome yet, then save the history.
 * The history is saved even when no matching prediction is found.
 */
export function recordActualEffort(task, actualToolCalls, actualCostUsd) {
    const history = loadEffortHistory();
    // Predictions are stored under the first 200 characters of the task text.
    const key = task.slice(0, 200);
    let idx = history.entries.length;
    // Walk backwards so the most recent open prediction wins.
    while (idx-- > 0) {
        const candidate = history.entries[idx];
        if (candidate.task !== key || candidate.actual) {
            continue;
        }
        candidate.actual = { toolCalls: actualToolCalls, costUsd: actualCostUsd };
        break;
    }
    saveEffortHistory(history);
}
573
+ // ══════════════════════════════════════════════════════════════════
574
+ // TOOL REGISTRATION
575
+ // ══════════════════════════════════════════════════════════════════
576
/**
 * Register the three confidence-engine tools with the K:BOT tool registry:
 * `confidence_check`, `skill_profile` and `effort_estimate`.
 * Each tool is in the 'free' tier and returns a plain-text report.
 */
export function registerConfidenceTools() {
    const MISSING_TASK = 'Error: task parameter is required';
    // One report line per skill entry; strengths and weaknesses share the format.
    const describeSkill = (entry) => ` ${entry.domain}: ${Math.round(entry.successRate * 100)}% success (${entry.sampleSize} tasks, avg confidence ${Math.round(entry.avgConfidence * 100)}%)`;
    // Append a titled section of skill entries followed by a blank line, if non-empty.
    const appendSkillSection = (lines, title, skillEntries) => {
        if (skillEntries.length === 0) {
            return;
        }
        lines.push(title);
        for (const entry of skillEntries) {
            lines.push(describeSkill(entry));
        }
        lines.push('');
    };
    const runConfidenceCheck = async (args) => {
        const task = String(args.task || '');
        const context = String(args.context || '');
        if (!task) {
            return MISSING_TASK;
        }
        return reportConfidence(estimateConfidence(task, context));
    };
    const runSkillProfile = async (args) => {
        const task = args.task ? String(args.task) : null;
        const profile = getSkillProfile();
        const lines = ['=== Skill Profile ===', ''];
        appendSkillSection(lines, 'Strengths:', profile.strengths);
        appendSkillSection(lines, 'Weaknesses:', profile.weaknesses);
        if (profile.unknown.length > 0) {
            lines.push(`Untested domains: ${profile.unknown.join(', ')}`);
            lines.push('');
        }
        if (task) {
            lines.push('--- Task Assessment ---');
            const assessment = assessSkillForTask(task);
            lines.push(`Can do: ${assessment.canDo ? 'yes' : 'no'}`);
            lines.push(`Confidence: ${Math.round(assessment.confidence * 100)}%`);
            if (assessment.suggestion) {
                lines.push(`Note: ${assessment.suggestion}`);
            }
        }
        return lines.join('\n');
    };
    const runEffortEstimate = async (args) => {
        const task = String(args.task || '');
        const context = args.context ? String(args.context) : undefined;
        if (!task) {
            return MISSING_TASK;
        }
        const { complexity, toolCalls, estimatedCostUsd, breakdown } = estimateEffort(task, context);
        return [
            '=== Effort Estimate ===',
            '',
            `Complexity: ${complexity}`,
            '',
            `Tool calls:`,
            ` Min: ${toolCalls.min}`,
            ` Expected: ${toolCalls.expected}`,
            ` Max: ${toolCalls.max}`,
            '',
            `Estimated cost (USD):`,
            ` Min: $${estimatedCostUsd.min.toFixed(4)}`,
            ` Expected: $${estimatedCostUsd.expected.toFixed(4)}`,
            ` Max: $${estimatedCostUsd.max.toFixed(4)}`,
            '',
            `Breakdown: ${breakdown}`,
        ].join('\n');
    };
    const toolDefinitions = [
        {
            name: 'confidence_check',
            description: 'Get a confidence score for a proposed action or task. Returns factual, approach, and completeness scores with reasoning.',
            parameters: {
                task: {
                    type: 'string',
                    description: 'Description of the task or action to assess confidence for',
                    required: true,
                },
                context: {
                    type: 'string',
                    description: 'Available context (repo state, file contents, memory, etc.)',
                    required: false,
                    default: '',
                },
            },
            tier: 'free',
            execute: runConfidenceCheck,
        },
        {
            name: 'skill_profile',
            description: 'Show the agent skill profile — strengths, weaknesses, and untested domains. Optionally assess suitability for a specific task.',
            parameters: {
                task: {
                    type: 'string',
                    description: 'Optional task to assess suitability for',
                    required: false,
                },
            },
            tier: 'free',
            execute: runSkillProfile,
        },
        {
            name: 'effort_estimate',
            description: 'Predict how many tool calls, cost, and complexity a task will involve. Uses historical data when available.',
            parameters: {
                task: {
                    type: 'string',
                    description: 'Description of the task to estimate effort for',
                    required: true,
                },
                context: {
                    type: 'string',
                    description: 'Optional context about the current state',
                    required: false,
                },
            },
            tier: 'free',
            execute: runEffortEstimate,
        },
    ];
    // Register in the declared order: confidence_check, skill_profile, effort_estimate.
    for (const definition of toolDefinitions) {
        registerTool(definition);
    }
}
693
+ //# sourceMappingURL=confidence.js.map