agent-tool-forge 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +209 -0
  3. package/lib/agent-registry.js +170 -0
  4. package/lib/api-client.js +792 -0
  5. package/lib/api-loader.js +260 -0
  6. package/lib/auth.d.ts +25 -0
  7. package/lib/auth.js +158 -0
  8. package/lib/checks/check-adapter.js +172 -0
  9. package/lib/checks/compose.js +42 -0
  10. package/lib/checks/content-match.js +14 -0
  11. package/lib/checks/cost-budget.js +11 -0
  12. package/lib/checks/index.js +18 -0
  13. package/lib/checks/json-valid.js +15 -0
  14. package/lib/checks/latency.js +11 -0
  15. package/lib/checks/length-bounds.js +17 -0
  16. package/lib/checks/negative-match.js +14 -0
  17. package/lib/checks/no-hallucinated-numbers.js +63 -0
  18. package/lib/checks/non-empty.js +34 -0
  19. package/lib/checks/regex-match.js +12 -0
  20. package/lib/checks/run-checks.js +84 -0
  21. package/lib/checks/schema-match.js +26 -0
  22. package/lib/checks/tool-call-count.js +16 -0
  23. package/lib/checks/tool-selection.js +34 -0
  24. package/lib/checks/types.js +45 -0
  25. package/lib/comparison/compare.js +86 -0
  26. package/lib/comparison/format.js +104 -0
  27. package/lib/comparison/index.js +6 -0
  28. package/lib/comparison/statistics.js +59 -0
  29. package/lib/comparison/types.js +41 -0
  30. package/lib/config-schema.js +200 -0
  31. package/lib/config.d.ts +66 -0
  32. package/lib/conversation-store.d.ts +77 -0
  33. package/lib/conversation-store.js +443 -0
  34. package/lib/db.d.ts +6 -0
  35. package/lib/db.js +1112 -0
  36. package/lib/dep-check.js +99 -0
  37. package/lib/drift-background.js +61 -0
  38. package/lib/drift-monitor.js +187 -0
  39. package/lib/eval-runner.js +566 -0
  40. package/lib/fixtures/fixture-store.js +161 -0
  41. package/lib/fixtures/index.js +11 -0
  42. package/lib/forge-engine.js +982 -0
  43. package/lib/forge-eval-generator.js +417 -0
  44. package/lib/forge-file-writer.js +386 -0
  45. package/lib/forge-service-client.js +190 -0
  46. package/lib/forge-service.d.ts +4 -0
  47. package/lib/forge-service.js +655 -0
  48. package/lib/forge-verifier-generator.js +271 -0
  49. package/lib/handlers/admin.js +151 -0
  50. package/lib/handlers/agents.js +229 -0
  51. package/lib/handlers/chat-resume.js +334 -0
  52. package/lib/handlers/chat-sync.js +320 -0
  53. package/lib/handlers/chat.js +320 -0
  54. package/lib/handlers/conversations.js +92 -0
  55. package/lib/handlers/preferences.js +88 -0
  56. package/lib/handlers/tools-list.js +58 -0
  57. package/lib/hitl-engine.d.ts +60 -0
  58. package/lib/hitl-engine.js +261 -0
  59. package/lib/http-utils.js +92 -0
  60. package/lib/index.d.ts +20 -0
  61. package/lib/index.js +141 -0
  62. package/lib/init.js +636 -0
  63. package/lib/manual-entry.js +59 -0
  64. package/lib/mcp-server.js +252 -0
  65. package/lib/output-groups.js +54 -0
  66. package/lib/postgres-store.d.ts +31 -0
  67. package/lib/postgres-store.js +465 -0
  68. package/lib/preference-store.d.ts +47 -0
  69. package/lib/preference-store.js +79 -0
  70. package/lib/prompt-store.d.ts +42 -0
  71. package/lib/prompt-store.js +60 -0
  72. package/lib/rate-limiter.d.ts +30 -0
  73. package/lib/rate-limiter.js +104 -0
  74. package/lib/react-engine.d.ts +110 -0
  75. package/lib/react-engine.js +337 -0
  76. package/lib/runner/cli.js +156 -0
  77. package/lib/runner/cost-estimator.js +71 -0
  78. package/lib/runner/gate.js +46 -0
  79. package/lib/runner/index.js +165 -0
  80. package/lib/sidecar.d.ts +83 -0
  81. package/lib/sidecar.js +161 -0
  82. package/lib/sse.d.ts +15 -0
  83. package/lib/sse.js +30 -0
  84. package/lib/tools-scanner.js +91 -0
  85. package/lib/tui.js +253 -0
  86. package/lib/verifier-report.js +78 -0
  87. package/lib/verifier-runner.js +338 -0
  88. package/lib/verifier-scanner.js +70 -0
  89. package/lib/verifier-worker-pool.js +196 -0
  90. package/lib/views/chat.js +340 -0
  91. package/lib/views/endpoints.js +203 -0
  92. package/lib/views/eval-run.js +206 -0
  93. package/lib/views/forge-agent.js +538 -0
  94. package/lib/views/forge.js +410 -0
  95. package/lib/views/main-menu.js +275 -0
  96. package/lib/views/mediation.js +381 -0
  97. package/lib/views/model-compare.js +430 -0
  98. package/lib/views/model-comparison.js +333 -0
  99. package/lib/views/onboarding.js +470 -0
  100. package/lib/views/performance.js +237 -0
  101. package/lib/views/run-evals.js +205 -0
  102. package/lib/views/settings.js +829 -0
  103. package/lib/views/tools-evals.js +514 -0
  104. package/lib/views/verifier-coverage.js +617 -0
  105. package/lib/workers/verifier-worker.js +52 -0
  106. package/package.json +123 -0
  107. package/widget/forge-chat.js +789 -0
@@ -0,0 +1,333 @@
1
+ /**
2
+ * Model Comparison View — side-by-side performance metrics across models for a single tool.
3
+ *
4
+ * Shows: pass rate, avg latency, token usage, estimated cost per 1k calls, value score.
5
+ * Sources data from eval_run_cases (live DB) merged with in-session results.
6
+ */
7
+
8
+ import blessed from 'blessed';
9
+ import { existsSync } from 'fs';
10
+ import { resolve } from 'path';
11
+
12
+ // ── Cost helpers ───────────────────────────────────────────────────────────
13
+
14
/**
 * Estimate the cost of N calls given token averages and per-million-token rates.
 *
 * @param {string} model - key looked up in `costsConfig`
 * @param {number} avgInputTokens - average input tokens per call
 * @param {number} avgOutputTokens - average output tokens per call
 * @param {object} costsConfig - { [model]: { input, output } } per million tokens
 * @param {number} [calls=1000] - number of calls to price
 * @returns {string|null} formatted dollar string e.g. "$0.042"; null when the
 *   model has no rates or the inputs do not produce a finite cost
 */
function estimateCost(model, avgInputTokens, avgOutputTokens, costsConfig, calls = 1000) {
  const rates = costsConfig?.[model];
  if (!rates) return null;

  const cost = ((avgInputTokens * rates.input) + (avgOutputTokens * rates.output)) / 1_000_000 * calls;

  // A partial rate entry (e.g. `output` missing) turns the arithmetic into NaN.
  // Treat that as "rates unknown" instead of rendering "$NaN" in the table.
  if (!Number.isFinite(cost)) return null;

  // Use more decimals for smaller amounts so tiny costs do not collapse to "$0.00".
  if (cost < 0.01) return `$${cost.toFixed(4)}`;
  if (cost < 1) return `$${cost.toFixed(3)}`;
  return `$${cost.toFixed(2)}`;
}
33
+
34
/**
 * Value score: pass rate divided by the estimated cost of 1000 calls.
 * Higher = better value.
 *
 * @param {number|null} passRate - fraction of passing cases
 * @param {number} avgInputTokens - average input tokens per call
 * @param {number} avgOutputTokens - average output tokens per call
 * @param {string} model - key looked up in `costsConfig`
 * @param {object} costsConfig - { [model]: { input, output } } per million tokens
 * @returns {number|null} score, or null when the pass rate or rates are missing,
 *   or when the cost is zero, negative, or not a finite number
 */
function valueScore(passRate, avgInputTokens, avgOutputTokens, model, costsConfig) {
  const rates = costsConfig?.[model];
  if (!rates || passRate == null) return null;

  const costPer1k = ((avgInputTokens * rates.input) + (avgOutputTokens * rates.output)) / 1_000_000 * 1000;

  // NaN (partial rate entry) slips past a plain `=== 0` check and would yield a
  // NaN score, which breaks both `.toFixed` display and numeric sorting.
  if (!Number.isFinite(costPer1k) || costPer1k <= 0) return null;

  return passRate / costPer1k;
}
45
+
46
+ // ── Bar helper ─────────────────────────────────────────────────────────────
47
+
48
/**
 * Render a pass rate as a fixed-width block bar wrapped in blessed color tags.
 *
 * Green at >= 0.9, yellow at >= 0.7, red otherwise.
 *
 * @param {number} rate - pass rate; values outside [0, 1] are clamped and
 *   non-finite values are treated as 0
 * @param {number} [width=10] - total number of bar cells
 * @returns {string} tagged bar string
 */
function passRateBar(rate, width = 10) {
  // Clamp first: an out-of-range rate would make one of the repeat() counts
  // negative, and String.prototype.repeat throws RangeError on negative counts.
  const clamped = Number.isFinite(rate) ? Math.min(1, Math.max(0, rate)) : 0;
  const filled = Math.round(clamped * width);
  const bar = '█'.repeat(filled) + '░'.repeat(width - filled);

  if (clamped >= 0.9) return `{green-fg}${bar}{/green-fg}`;
  if (clamped >= 0.7) return `{yellow-fg}${bar}{/yellow-fg}`;
  return `{red-fg}${bar}{/red-fg}`;
}
55
+
56
/**
 * Format a latency in milliseconds as a color-tagged string.
 *
 * Sub-second values are shown in milliseconds (green); values of one second or
 * more are shown in seconds with one decimal (yellow below 3s, red otherwise).
 * Missing, zero, negative, or non-finite input renders as a grey dash.
 *
 * @param {number|null|undefined} ms - latency in milliseconds
 * @returns {string} tagged display string
 */
function latencyColor(ms) {
  const value = Number(ms);
  if (!Number.isFinite(value) || value <= 0) return '{#888888-fg}—{/#888888-fg}';

  // Formatting sub-second latencies as seconds with one decimal collapses
  // anything under 50ms to "0.0s"; milliseconds keep fast models distinguishable.
  if (value < 1000) return `{green-fg}${Math.round(value)}ms{/green-fg}`;

  const seconds = `${(value / 1000).toFixed(1)}s`;
  if (value < 3000) return `{yellow-fg}${seconds}{/yellow-fg}`;
  return `{red-fg}${seconds}{/red-fg}`;
}
63
+
64
+ // ── Data loader ────────────────────────────────────────────────────────────
65
+
66
/**
 * Build one comparison row per model for a tool, merging in-session results
 * with aggregated history from the database (when the DB file exists).
 *
 * Session results take precedence for the pass rate; latency and token
 * averages come from the database row only.
 *
 * @param {string} toolName - tool whose eval history is queried
 * @param {object|null} perModelFromSession - { [model]: { passed, total, error? } }
 * @param {object} config - reads `costs` and `dbPath`
 * @returns {Promise<Array<object>>} rows of { model, passRate, avgLatency,
 *   avgInput, avgOutput, costPer1k, score, error, caseCount }, sorted with
 *   errored rows last, then by value score DESC, then by pass rate DESC
 */
async function loadComparisonData(toolName, perModelFromSession, config) {
  const costsConfig = config?.costs || {};
  const sessionResults = perModelFromSession || {};

  // DB history is best-effort: a missing or unreadable database just means the
  // view falls back to session results only.
  const dbRows = [];
  try {
    const dbPath = resolve(process.cwd(), config?.dbPath || 'forge.db');
    if (existsSync(dbPath)) {
      const { getDb, getModelComparisonData } = await import('../db.js');
      const db = getDb(dbPath);
      dbRows.push(...getModelComparisonData(db, toolName));
    }
  } catch (_) { /* db unavailable */ }

  // Index DB rows by model once instead of scanning the array per model.
  // First row wins, matching a left-to-right search.
  const dbRowByModel = new Map();
  for (const dbRow of dbRows) {
    if (!dbRowByModel.has(dbRow.model)) dbRowByModel.set(dbRow.model, dbRow);
  }

  // Combined model set — session models first, then any DB-only models.
  const modelSet = new Set([
    ...Object.keys(sessionResults),
    ...dbRows.map((r) => r.model)
  ]);

  const rows = [];
  for (const model of modelSet) {
    const sessionResult = sessionResults[model];
    const dbRow = dbRowByModel.get(model);

    let passRate = null;
    if (sessionResult?.error) {
      passRate = null;
    } else if (sessionResult) {
      passRate = sessionResult.total > 0 ? sessionResult.passed / sessionResult.total : null;
    } else if (dbRow) {
      passRate = dbRow.case_count > 0 ? dbRow.passed / dbRow.case_count : null;
    }

    const avgLatency = dbRow?.avg_latency_ms ?? null;
    const avgInput = dbRow ? (dbRow.total_input_tokens / Math.max(1, dbRow.case_count)) : 0;
    const avgOutput = dbRow ? (dbRow.total_output_tokens / Math.max(1, dbRow.case_count)) : 0;

    // Without any recorded token usage the cost is unknown, not free. Pricing
    // zero tokens would display "$0.0000" and let the model win "cheapest".
    const hasTokenData = avgInput > 0 || avgOutput > 0;
    const costPer1k = hasTokenData
      ? estimateCost(model, avgInput, avgOutput, costsConfig, 1000)
      : null;
    const score = passRate != null
      ? valueScore(passRate, avgInput, avgOutput, model, costsConfig)
      : null;

    rows.push({
      model,
      passRate,
      avgLatency,
      avgInput,
      avgOutput,
      costPer1k,
      score,
      error: sessionResult?.error ?? null,
      caseCount: dbRow?.case_count ?? sessionResult?.total ?? 0
    });
  }

  // Sort: errors last, then by value score DESC (null score after scored rows), then pass_rate
  rows.sort((a, b) => {
    if (a.error && !b.error) return 1;
    if (!a.error && b.error) return -1;
    if (a.score != null && b.score != null) return b.score - a.score;
    if (a.score != null) return -1;
    if (b.score != null) return 1;
    return (b.passRate ?? -1) - (a.passRate ?? -1);
  });

  return rows;
}
137
+
138
+ // ── View ───────────────────────────────────────────────────────────────────
139
+
140
/**
 * Create the Model Comparison view: a title bar, a per-model metrics table and
 * a summary bar with recommendations.
 *
 * The comparison target is read from `config._comparisonTarget` and then
 * cleared so a later re-entry does not pick up stale data.
 *
 * @param {object} deps
 * @param {object} deps.screen - blessed screen used for rendering
 * @param {object} deps.config - app config; reads `_comparisonTarget`, `costs`, `dbPath`
 * @param {Function} deps.setFooter - sets the footer hint text
 * @param {Function} deps.screenKey - registers a key handler for this view
 * @param {Function} [deps.openPopup] - called before the difficulty popup opens
 * @param {Function} [deps.closePopup] - called when the difficulty popup closes
 * @returns {object} blessed container with an async `refresh()` method
 */
export function createView({ screen, content, config, navigate, setFooter, screenKey, openPopup, closePopup, startService }) {
  const toolName = config._comparisonTarget?.toolName || null;
  const perModel = config._comparisonTarget?.perModel || {};
  config._comparisonTarget = null; // consume — prevent stale reads on re-entry

  // Single header definition so every table state has the same column count.
  const headers = ['Model', 'Pass%', 'Bar', 'Latency', 'Cost/1k', 'Value', 'Cases'];
  const messageRow = (text) => [text, '', '', '', '', '', ''];

  const container = blessed.box({
    top: 0, left: 0, width: '100%', height: '100%', tags: true
  });

  const titleBar = blessed.box({
    parent: container,
    top: 0, left: 0, width: '100%', height: 1,
    tags: true,
    content: toolName
      ? ` {bold}{white-fg}Model Comparison:{/white-fg}{/bold} {cyan-fg}${toolName}{/cyan-fg} {#888888-fg}— pass rate · latency · cost/1k calls · value score{/#888888-fg}`
      : ' {red-fg}No comparison target set{/red-fg}'
  });

  const table = blessed.listtable({
    parent: container,
    top: 1, left: 0,
    width: '100%', height: '100%-4',
    tags: true, keys: true, vi: true, mouse: true,
    border: { type: 'line' },
    align: 'left',
    style: {
      header: { bold: true, fg: 'cyan' },
      cell: { selected: { bg: '#1a3a5c', fg: 'white' } }
    },
    pad: 1
  });

  const summaryBar = blessed.box({
    parent: container,
    bottom: 1, left: 0, width: '100%', height: 2,
    tags: true, border: { type: 'line' },
    style: { border: { fg: '#555555' } }
  });

  setFooter(' {cyan-fg}↑↓{/cyan-fg} navigate {cyan-fg}d{/cyan-fg} difficulty breakdown {cyan-fg}r{/cyan-fg} refresh {cyan-fg}b{/cyan-fg} back');

  let rowData = [];

  screenKey('d', () => {
    // Index 0 of the listtable is the header row, so data rows start at 1.
    const idx = table.selected;
    if (idx >= 1 && rowData[idx - 1]) {
      showDifficultyBreakdown(screen, rowData[idx - 1], toolName, config, openPopup, closePopup);
    }
  });

  container.refresh = async () => {
    if (!toolName) {
      table.setData([headers, messageRow('No target')]);
      screen.render();
      return;
    }

    try {
      rowData = await loadComparisonData(toolName, perModel, config);

      if (rowData.length === 0) {
        table.setData([
          headers,
          messageRow('No data yet — run Compare Models from Tools & Evals')
        ]);
        summaryBar.setContent('');
        screen.render();
        return;
      }

      const tableRows = rowData.map((r) => {
        if (r.error) {
          return [r.model, '{red-fg}error{/red-fg}', '──────────', '—', '—', '—', '—'];
        }
        const pct = r.passRate != null ? `${Math.round(r.passRate * 100)}%` : '—';
        const bar = r.passRate != null ? passRateBar(r.passRate) : '{#888888-fg}──────────{/#888888-fg}';
        const lat = latencyColor(r.avgLatency);
        const cost = r.costPer1k ?? '{#888888-fg}—{/#888888-fg}';
        const score = r.score != null ? r.score.toFixed(1) : '{#888888-fg}—{/#888888-fg}';
        return [r.model, pct, bar, lat, cost, score, String(r.caseCount)];
      });

      table.setData([headers, ...tableRows]);

      // Build summary recommendation
      const usable = rowData.filter((r) => !r.error);

      // rowData is ordered by value score, so its first scored row is the best value.
      const bestValue = usable.find((r) => r.score != null);

      // Pass rate needs its own scan: taking the first row in value-score order
      // would always return the best-value row. Strict `>` keeps the earlier row
      // on ties, so a tie with the best-value model is not reported twice.
      const bestPass = usable
        .filter((r) => r.passRate != null)
        .reduce((best, r) => (best === null || r.passRate > best.passRate ? r : best), null);

      // costPer1k is a formatted "$x.xx" string; parse it back for comparison.
      const cheapest = usable
        .filter((r) => r.costPer1k != null)
        .sort((a, b) => {
          const ca = parseFloat(a.costPer1k?.replace('$', '') || 'Infinity');
          const cb = parseFloat(b.costPer1k?.replace('$', '') || 'Infinity');
          return ca - cb;
        })[0];

      const parts = [];
      if (bestValue) parts.push(`{green-fg}Best value:{/green-fg} ${bestValue.model} (score ${bestValue.score?.toFixed(1)})`);
      if (bestPass && bestPass.model !== bestValue?.model) {
        parts.push(`{cyan-fg}Highest pass rate:{/cyan-fg} ${bestPass.model} (${Math.round((bestPass.passRate ?? 0) * 100)}%)`);
      }
      if (cheapest && cheapest.model !== bestValue?.model) {
        parts.push(`{yellow-fg}Cheapest:{/yellow-fg} ${cheapest.model} (${cheapest.costPer1k}/1k)`);
      }
      summaryBar.setContent(' ' + (parts.join(' ') || '{#888888-fg}Add cost rates to forge.config.json for value scoring{/#888888-fg}'));

    } catch (err) {
      // Drop rows and summary from any earlier successful load so the 'd' key
      // and the summary bar do not act on data the table no longer shows.
      rowData = [];
      summaryBar.setContent('');
      table.setData([headers, messageRow(`Error: ${err.message}`)]);
    }

    screen.render();
  };

  container.refresh();
  table.focus();
  return container;
}
257
+
258
+ // ── Difficulty breakdown popup ────────────────────────────────────────────
259
+
260
/**
 * Show a popup with pass rates grouped by difficulty for one model/tool pair.
 *
 * Reads up to 200 of the most recent `eval_run_cases` rows for the pair. The
 * difficulty label is taken from a keyword embedded in `case_id`, falling back
 * to the run's `eval_type`, then to "unknown".
 *
 * @param {object} screen - blessed screen the popup is attached to
 * @param {object} row - comparison row; reads `model`, `avgInput`, `avgOutput`
 * @param {string} toolName - tool whose cases are queried
 * @param {object} config - reads `dbPath`
 * @param {Function} [openPopup] - called just before the popup is created
 * @param {Function} [closePopup] - called when the popup is dismissed
 * @returns {Promise<void>}
 */
async function showDifficultyBreakdown(screen, row, toolName, config, openPopup, closePopup) {
  let content = `\n {cyan-fg}${row.model}{/cyan-fg} — ${toolName}\n\n`;

  try {
    const dbPath = resolve(process.cwd(), config?.dbPath || 'forge.db');
    if (existsSync(dbPath)) {
      const { getDb } = await import('../db.js');
      const db = getDb(dbPath);

      const breakdown = db.prepare(`
        SELECT
          erc.case_id,
          er.eval_type,
          erc.status,
          erc.latency_ms
        FROM eval_run_cases erc
        JOIN eval_runs er ON erc.eval_run_id = er.id
        WHERE erc.tool_name = ? AND erc.model = ?
        ORDER BY erc.run_at DESC
        LIMIT 200
      `).all(toolName, row.model);

      // Group by difficulty from case_id patterns
      const diffMap = {};
      for (const c of breakdown) {
        // Infer difficulty from a keyword inside case_id (e.g. "tool_edge_001"),
        // otherwise fall back to the run's eval_type.
        const diff = c.case_id?.match(/(straightforward|ambiguous|edge|adversarial|easy|medium|hard)/i)?.[1]?.toLowerCase()
          || c.eval_type || 'unknown';
        if (!diffMap[diff]) diffMap[diff] = { passed: 0, total: 0 };
        diffMap[diff].total++;
        if (c.status === 'passed') diffMap[diff].passed++;
      }

      if (Object.keys(diffMap).length === 0) {
        content += ' {#888888-fg}No per-case data yet. Run evals to populate.{/#888888-fg}';
      } else {
        for (const [diff, stats] of Object.entries(diffMap)) {
          // total is at least 1 for every bucket, so the division is safe.
          const rate = stats.passed / stats.total;
          const bar = passRateBar(rate, 8);
          const pct = `${Math.round(rate * 100)}%`.padStart(4);
          content += ` ${diff.padEnd(16)} ${bar} ${pct} (${stats.passed}/${stats.total})\n`;
        }
      }

      // Token/cost summary
      if (row.avgInput > 0 || row.avgOutput > 0) {
        content += `\n {#888888-fg}Avg tokens: ${Math.round(row.avgInput)} in / ${Math.round(row.avgOutput)} out{/#888888-fg}`;
      }
    } else {
      content += ' {#888888-fg}No database found.{/#888888-fg}';
    }
  } catch (err) {
    // Surface the failure inside the popup rather than crashing the TUI.
    content += ` {red-fg}Error: ${err.message}{/red-fg}`;
  }

  openPopup?.();
  const popup = blessed.box({
    parent: screen,
    border: 'line',
    top: 'center', left: 'center',
    width: 62, height: 18,
    label: ` Difficulty Breakdown `,
    tags: true, scrollable: true,
    content
  });
  popup.key(['escape', 'q', 'enter', 'd'], () => {
    closePopup?.();
    popup.destroy();
    screen.render();
  });
  popup.focus();
  screen.render();
}