agent-tool-forge 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +209 -0
  3. package/lib/agent-registry.js +170 -0
  4. package/lib/api-client.js +792 -0
  5. package/lib/api-loader.js +260 -0
  6. package/lib/auth.d.ts +25 -0
  7. package/lib/auth.js +158 -0
  8. package/lib/checks/check-adapter.js +172 -0
  9. package/lib/checks/compose.js +42 -0
  10. package/lib/checks/content-match.js +14 -0
  11. package/lib/checks/cost-budget.js +11 -0
  12. package/lib/checks/index.js +18 -0
  13. package/lib/checks/json-valid.js +15 -0
  14. package/lib/checks/latency.js +11 -0
  15. package/lib/checks/length-bounds.js +17 -0
  16. package/lib/checks/negative-match.js +14 -0
  17. package/lib/checks/no-hallucinated-numbers.js +63 -0
  18. package/lib/checks/non-empty.js +34 -0
  19. package/lib/checks/regex-match.js +12 -0
  20. package/lib/checks/run-checks.js +84 -0
  21. package/lib/checks/schema-match.js +26 -0
  22. package/lib/checks/tool-call-count.js +16 -0
  23. package/lib/checks/tool-selection.js +34 -0
  24. package/lib/checks/types.js +45 -0
  25. package/lib/comparison/compare.js +86 -0
  26. package/lib/comparison/format.js +104 -0
  27. package/lib/comparison/index.js +6 -0
  28. package/lib/comparison/statistics.js +59 -0
  29. package/lib/comparison/types.js +41 -0
  30. package/lib/config-schema.js +200 -0
  31. package/lib/config.d.ts +66 -0
  32. package/lib/conversation-store.d.ts +77 -0
  33. package/lib/conversation-store.js +443 -0
  34. package/lib/db.d.ts +6 -0
  35. package/lib/db.js +1112 -0
  36. package/lib/dep-check.js +99 -0
  37. package/lib/drift-background.js +61 -0
  38. package/lib/drift-monitor.js +187 -0
  39. package/lib/eval-runner.js +566 -0
  40. package/lib/fixtures/fixture-store.js +161 -0
  41. package/lib/fixtures/index.js +11 -0
  42. package/lib/forge-engine.js +982 -0
  43. package/lib/forge-eval-generator.js +417 -0
  44. package/lib/forge-file-writer.js +386 -0
  45. package/lib/forge-service-client.js +190 -0
  46. package/lib/forge-service.d.ts +4 -0
  47. package/lib/forge-service.js +655 -0
  48. package/lib/forge-verifier-generator.js +271 -0
  49. package/lib/handlers/admin.js +151 -0
  50. package/lib/handlers/agents.js +229 -0
  51. package/lib/handlers/chat-resume.js +334 -0
  52. package/lib/handlers/chat-sync.js +320 -0
  53. package/lib/handlers/chat.js +320 -0
  54. package/lib/handlers/conversations.js +92 -0
  55. package/lib/handlers/preferences.js +88 -0
  56. package/lib/handlers/tools-list.js +58 -0
  57. package/lib/hitl-engine.d.ts +60 -0
  58. package/lib/hitl-engine.js +261 -0
  59. package/lib/http-utils.js +92 -0
  60. package/lib/index.d.ts +20 -0
  61. package/lib/index.js +141 -0
  62. package/lib/init.js +636 -0
  63. package/lib/manual-entry.js +59 -0
  64. package/lib/mcp-server.js +252 -0
  65. package/lib/output-groups.js +54 -0
  66. package/lib/postgres-store.d.ts +31 -0
  67. package/lib/postgres-store.js +465 -0
  68. package/lib/preference-store.d.ts +47 -0
  69. package/lib/preference-store.js +79 -0
  70. package/lib/prompt-store.d.ts +42 -0
  71. package/lib/prompt-store.js +60 -0
  72. package/lib/rate-limiter.d.ts +30 -0
  73. package/lib/rate-limiter.js +104 -0
  74. package/lib/react-engine.d.ts +110 -0
  75. package/lib/react-engine.js +337 -0
  76. package/lib/runner/cli.js +156 -0
  77. package/lib/runner/cost-estimator.js +71 -0
  78. package/lib/runner/gate.js +46 -0
  79. package/lib/runner/index.js +165 -0
  80. package/lib/sidecar.d.ts +83 -0
  81. package/lib/sidecar.js +161 -0
  82. package/lib/sse.d.ts +15 -0
  83. package/lib/sse.js +30 -0
  84. package/lib/tools-scanner.js +91 -0
  85. package/lib/tui.js +253 -0
  86. package/lib/verifier-report.js +78 -0
  87. package/lib/verifier-runner.js +338 -0
  88. package/lib/verifier-scanner.js +70 -0
  89. package/lib/verifier-worker-pool.js +196 -0
  90. package/lib/views/chat.js +340 -0
  91. package/lib/views/endpoints.js +203 -0
  92. package/lib/views/eval-run.js +206 -0
  93. package/lib/views/forge-agent.js +538 -0
  94. package/lib/views/forge.js +410 -0
  95. package/lib/views/main-menu.js +275 -0
  96. package/lib/views/mediation.js +381 -0
  97. package/lib/views/model-compare.js +430 -0
  98. package/lib/views/model-comparison.js +333 -0
  99. package/lib/views/onboarding.js +470 -0
  100. package/lib/views/performance.js +237 -0
  101. package/lib/views/run-evals.js +205 -0
  102. package/lib/views/settings.js +829 -0
  103. package/lib/views/tools-evals.js +514 -0
  104. package/lib/views/verifier-coverage.js +617 -0
  105. package/lib/workers/verifier-worker.js +52 -0
  106. package/package.json +123 -0
  107. package/widget/forge-chat.js +789 -0
@@ -0,0 +1,566 @@
1
+ /**
2
+ * Standalone Eval Runner — no forge service required.
3
+ *
4
+ * Reads eval JSON files for a tool, calls Anthropic or OpenAI directly,
5
+ * checks routing + content assertions, and stores results in SQLite.
6
+ *
7
+ * Modes:
8
+ * "routing-only" — single LLM turn; verifies which tools the model selects.
9
+ * "stub-based" — multi-turn loop with stubbed tool results; activated when
10
+ * the eval case includes a `stubs` field. Validates both
11
+ * routing and final response content.
12
+ */
13
+
14
+ import { readdirSync, readFileSync, existsSync } from 'fs';
15
+ import { resolve, join } from 'path';
16
+ import { llmTurn, normalizeUsage, modelConfigForName } from './api-client.js';
17
+ import { checkAdapter, checkResponseContainsAnyGroups, checkToolsAcceptable } from './checks/check-adapter.js';
18
+ import { runChecks } from './checks/run-checks.js';
19
+ import { computeActualCost } from './runner/cost-estimator.js';
20
+
21
+ // ── Stub-based multi-turn executor ─────────────────────────────────────────
22
+
23
/**
 * Run a multi-turn LLM conversation with stubbed tool results.
 *
 * When the model emits tool_use calls, each tool is answered with its
 * corresponding stub from the `stubs` map instead of running real tool code.
 * The loop continues until the model produces a text-only response or maxTurns
 * is reached.
 *
 * @param {object} opts
 * @param {string} opts.provider - 'anthropic' | 'openai' (or compat)
 * @param {string} opts.apiKey
 * @param {string} opts.model
 * @param {string} opts.systemPrompt
 * @param {object[]} opts.tools - tool definitions with inputSchema
 * @param {string} opts.input - user message
 * @param {object} opts.stubs - { [toolName]: stubReturnObject }
 * @param {number} [opts.maxTurns=5] - safety cap on loop iterations
 * @returns {{ toolsCalled: string[], responseText: string, usage: object|null, missingStubs: string[] }}
 *   toolsCalled lists every tool call in order (duplicates preserved);
 *   missingStubs lists each called tool that had no entry in `stubs`.
 */
export async function stubbedReactTurn({ provider, apiKey, model, systemPrompt, tools, input, stubs, maxTurns = 5 }) {
  const messages = [{ role: 'user', content: input }];
  const allToolsCalled = [];
  const missingStubs = [];
  // Last non-empty text from the model wins; earlier interim text is overwritten.
  let finalText = '';
  // M2: accumulate token counts across all turns instead of keeping only the last
  let accInputTokens = 0;
  let accOutputTokens = 0;

  for (let turn = 0; turn < maxTurns; turn++) {
    const response = await llmTurn({
      provider, apiKey, model,
      system: systemPrompt,
      messages, tools,
      maxTokens: 1024,
      timeoutMs: 30_000
    });

    if (response.usage) {
      // Normalize provider-specific usage fields into { inputTokens, outputTokens }.
      const n = normalizeUsage(response.usage, provider);
      accInputTokens += n.inputTokens;
      accOutputTokens += n.outputTokens;
    }
    if (response.text) finalText = response.text;
    // Text-only response (no tool calls) ends the conversation loop.
    if (!response.toolCalls?.length) break;

    // Answer every tool call from the stub map; a missing stub still produces
    // a 200 result (with an error body) so the conversation can continue,
    // but the tool name is recorded in missingStubs for the noToolErrors check.
    const toolResults = [];
    for (const tc of response.toolCalls) {
      allToolsCalled.push(tc.name);
      const stubData = stubs[tc.name];
      if (stubData === undefined) missingStubs.push(tc.name);
      toolResults.push({
        toolCall: tc,
        result: { status: 200, body: stubData ?? { error: `no stub for "${tc.name}"` } }
      });
    }

    // Append provider-specific history (mirrors react-engine.js format)
    if (provider === 'anthropic') {
      // Anthropic: assistant turn carries text + tool_use blocks; the stubbed
      // results come back as tool_result blocks inside a user turn.
      messages.push({
        role: 'assistant',
        content: [
          ...(response.text ? [{ type: 'text', text: response.text }] : []),
          ...toolResults.map(({ toolCall }) => ({
            type: 'tool_use', id: toolCall.id, name: toolCall.name, input: toolCall.input
          }))
        ]
      });
      messages.push({
        role: 'user',
        content: toolResults.map(({ toolCall, result }) => ({
          type: 'tool_result',
          tool_use_id: toolCall.id,
          content: JSON.stringify(result.body)
        }))
      });
    } else {
      // OpenAI-style: assistant message with tool_calls, then one role:'tool'
      // message per result, linked by tool_call_id.
      // L1: use '' instead of null — some OpenAI-compat providers reject content: null
      messages.push({
        role: 'assistant',
        content: response.text || '',
        tool_calls: toolResults.map(({ toolCall }) => ({
          id: toolCall.id, type: 'function',
          function: { name: toolCall.name, arguments: JSON.stringify(toolCall.input) }
        }))
      });
      for (const { toolCall, result } of toolResults) {
        messages.push({ role: 'tool', tool_call_id: toolCall.id, content: JSON.stringify(result.body) });
      }
    }
  }

  // Return accumulated normalized token counts so callers skip a second normalizeUsage call
  return {
    toolsCalled: allToolsCalled,
    responseText: finalText,
    usage: { inputTokens: accInputTokens, outputTokens: accOutputTokens },
    missingStubs
  };
}
122
+
123
+ // ── Assertion checker ──────────────────────────────────────────────────────
124
+
125
/**
 * Evaluate an eval case's assertions against the observed API response.
 *
 * Delegates the structured assertions to runChecks() (via checkAdapter), then
 * layers on the assertions that runChecks does not cover natively.
 *
 * @returns {string[]} failure reasons, in check order; empty array means pass.
 */
function checkAssertions(evalCase, { toolsCalled, responseText, latencyMs, cost, missingStubs = [] }) {
  const problems = [];

  // Structured checks first: adapt the eval case into runChecks() input shape.
  const adapted = checkAdapter(evalCase, { toolsCalled, responseText, latencyMs, cost });
  const outcome = runChecks(adapted);
  for (const [name, res] of Object.entries(outcome.checks)) {
    if (res.pass) continue;
    problems.push(res.reason ?? `${name} failed`);
  }

  const expect = evalCase.expect ?? {};

  // responseContainsAny — anyOf groups (not natively in runChecks)
  if (expect.responseContainsAny?.length) {
    const groupResult = checkResponseContainsAnyGroups(responseText, expect.responseContainsAny);
    if (!groupResult.pass) problems.push(groupResult.reason);
  }

  // toolsAcceptable — acceptable alternative tool sets
  if (expect.toolsAcceptable !== undefined) {
    const altResult = checkToolsAcceptable(toolsCalled, expect.toolsAcceptable);
    if (!altResult.pass) problems.push(altResult.reason);
  }

  // noToolErrors — in stub mode: fails if any tool was called without a stub entry.
  // In routing-only mode (no stubs): this assertion is a no-op because tools are
  // never executed, so success/failure cannot be determined. Set `stubs` on the
  // eval case to make this assertion meaningful.
  if (expect.noToolErrors && missingStubs.length > 0) {
    problems.push(`noToolErrors: tools called without stubs: [${missingStubs.join(', ')}]`);
  }

  return problems;
}
165
+
166
+ // ── Tool schema extraction ─────────────────────────────────────────────────
167
+
168
/**
 * Convert forge tool schema format to JSON Schema.
 * Input: { city: { type: 'string' }, units: { type: 'string', optional: true } }
 * Output: { type: 'object', properties: { city: {...}, units: {...} }, required: ['city'] }
 *
 * Robustness: a property whose value is not an object (e.g. null, a bare
 * string) is treated as an empty descriptor — it becomes a required string
 * property instead of throwing. Arrays are rejected like other non-schemas.
 *
 * @param {object|null|undefined} schema - forge-style schema map
 * @returns {object} JSON Schema object; `required` is omitted when empty
 */
function forgeSchemaToJsonSchema(schema) {
  // Non-object (or array) input yields an empty, but valid, object schema.
  if (!schema || typeof schema !== 'object' || Array.isArray(schema)) {
    return { type: 'object', properties: {} };
  }
  const properties = {};
  const required = [];
  for (const [key, val] of Object.entries(schema)) {
    // Guard: the original crashed on `val.type` when val was null.
    const field = (val && typeof val === 'object') ? val : {};
    properties[key] = { type: field.type || 'string' };
    if (field.description) properties[key].description = field.description;
    if (!field.optional) required.push(key);
  }
  return { type: 'object', properties, ...(required.length ? { required } : {}) };
}
186
+
187
/**
 * Extract the `schema: { ... }` object literal from tool file source using a
 * basic brace-matcher, then materialize it.
 *
 * Limitations: the brace matcher does not understand strings or comments, and
 * the literal is evaluated via `new Function` — deliberate, but only suitable
 * for trusted, simple object literals.
 *
 * @param {string} source - raw tool file contents
 * @returns {object|null} the parsed schema object, or null if not parseable
 */
function extractSchemaFromSource(source) {
  const m = /schema:\s*\{/.exec(source);
  if (!m) return null;

  // The match always ends with '{'; locate that opening brace precisely.
  const open = source.indexOf('{', m.index + m[0].length - 1);

  // Walk forward counting nesting depth until the matching close brace.
  let close = open;
  let nesting = 0;
  for (let i = open; i < source.length; i++) {
    const ch = source[i];
    if (ch === '{') nesting++;
    if (ch === '}') {
      nesting--;
      if (nesting === 0) { close = i; break; }
    }
  }

  const literal = source.slice(open, close + 1);
  // Convert JS object literal to evaluable form (very limited — handles simple cases)
  try {
    // eslint-disable-next-line no-new-func
    return new Function(`return ${literal}`)();
  } catch (_) {
    return null;
  }
}
213
+
214
/**
 * Load tools from the tools directory, returning objects with inputSchema.
 *
 * Scans `config.project.toolsDir` (default 'example/tools', resolved against
 * process.cwd()) for *.tool.ts / *.tool.js files, and extracts name,
 * description and schema from each file's source text with regexes and the
 * brace-matching schema extractor.
 *
 * @param {object} config - forge config; only `project.toolsDir` is read
 * @returns {{ name: string, description: string, inputSchema: object }[]}
 */
export function getToolsForEval(config) {
  const project = config?.project || {};
  const toolsDir = resolve(process.cwd(), project.toolsDir || 'example/tools');
  if (!existsSync(toolsDir)) return [];

  const toolFiles = readdirSync(toolsDir).filter(
    (f) => f.endsWith('.tool.ts') || f.endsWith('.tool.js')
  );

  const definitions = [];
  for (const file of toolFiles) {
    const source = readFileSync(join(toolsDir, file), 'utf-8');
    const nameM = source.match(/name:\s*['"]([^'"]+)['"]/);
    const descM = source.match(/description:\s*['"`]([^'"`]+)['"`]/);
    // Fall back to the file's base name when no `name:` field is found.
    const name = nameM?.[1] ?? file.replace(/\.tool\.(ts|js)$/, '');
    const description = descM?.[1] ?? '';
    const rawSchema = extractSchemaFromSource(source);
    definitions.push({ name, description, inputSchema: forgeSchemaToJsonSchema(rawSchema) });
  }
  return definitions;
}
236
+
237
+ // ── Eval file discovery ────────────────────────────────────────────────────
238
+
239
/**
 * Find eval files for a tool. Searches evalsDir (and a tool-named
 * subdirectory) for patterns: {toolName}.golden.json, {toolName}.labeled.json,
 * plus hyphenated variants of the tool name.
 *
 * @param {string} toolName
 * @param {object} config - forge config; only `project.evalsDir` is read
 * @returns {string[]} existing eval file paths, deduplicated, in search order
 */
export function findEvalFiles(toolName, config) {
  const project = config?.project || {};
  const evalsDir = resolve(process.cwd(), project.evalsDir || 'docs/examples');
  const hyphenated = toolName.replace(/_/g, '-');

  const candidates = [
    join(evalsDir, `${toolName}.golden.json`),
    join(evalsDir, `${toolName}.labeled.json`),
    // Also check one level up if toolName matches a subdirectory
    join(evalsDir, toolName, `${toolName}.golden.json`),
    join(evalsDir, toolName, `${toolName}.labeled.json`),
    // With hyphens
    join(evalsDir, `${hyphenated}.golden.json`),
    join(evalsDir, `${hyphenated}.labeled.json`)
  ];

  // Keep only files that exist, preserving order; drop duplicate paths
  // (an underscore-free tool name makes the hyphen variants identical).
  const seen = new Set();
  const found = [];
  for (const candidate of candidates) {
    if (!existsSync(candidate) || seen.has(candidate)) continue;
    seen.add(candidate);
    found.push(candidate);
  }
  return found;
}
262
+
263
+ // ── Env reader ─────────────────────────────────────────────────────────────
264
+
265
/**
 * Read key=value pairs from `{projectRoot}/.env`.
 *
 * Skips blank lines and `#` comments; splits on the FIRST '=' so values may
 * contain '='; trims keys and values and strips one leading/trailing quote
 * character (single or double) from the value.
 *
 * @param {string} projectRoot
 * @returns {Record<string, string>} empty object when no .env file exists
 */
function loadEnv(projectRoot) {
  const envPath = resolve(projectRoot, '.env');
  if (!existsSync(envPath)) return {};

  const vars = {};
  for (const rawLine of readFileSync(envPath, 'utf-8').split('\n')) {
    const entry = rawLine.trim();
    if (entry === '' || entry.startsWith('#')) continue;
    const sep = entry.indexOf('=');
    if (sep === -1) continue;
    const key = entry.slice(0, sep).trim();
    const value = entry.slice(sep + 1).trim().replace(/^["']|["']$/g, '');
    vars[key] = value;
  }
  return vars;
}
279
+
280
+ // ── Main runner ────────────────────────────────────────────────────────────
281
+
282
/**
 * Run evals for a tool.
 *
 * Pipeline: resolve provider/model/API key from config + .env → load system
 * prompt and tool definitions → discover and parse eval files → run each case
 * (stub-based multi-turn when the case has stubs, otherwise single routing
 * turn) → check assertions → persist a run summary + per-case rows to SQLite
 * (best-effort).
 *
 * @param {string} toolName
 * @param {object} config - full forge config
 * @param {string} projectRoot
 * @param {function} onProgress - called after each case: ({ done, total, caseId, passed, reason })
 * @returns {{ total, passed, failed, skipped, cases, provider, model }}
 * @throws when no API key is resolvable, no tool files exist, no eval files
 *   exist, or an eval file is not valid JSON
 */
export async function runEvals(toolName, config, projectRoot, onProgress) {
  const env = loadEnv(projectRoot);

  // Determine provider + key.
  // If config.model is explicitly set, detect its provider and resolve the matching key.
  // Otherwise fall back to whichever key is available (Anthropic preferred).
  let provider, model, apiKey;
  if (config?.model) {
    const mc = modelConfigForName(config.model, env);
    provider = mc.provider;
    model = mc.model;
    apiKey = mc.apiKey;
    if (!apiKey) {
      throw new Error(`No API key found for provider "${provider}" (model: ${model}). Add the key to .env in Settings.`);
    }
  } else {
    const anthropicKey = env['ANTHROPIC_API_KEY'];
    const openaiKey = env['OPENAI_API_KEY'];
    if (!anthropicKey && !openaiKey) {
      throw new Error('No API key found. Add ANTHROPIC_API_KEY or OPENAI_API_KEY to .env in Settings.');
    }
    const useAnthropic = !!anthropicKey;
    provider = useAnthropic ? 'anthropic' : 'openai';
    model = useAnthropic ? 'claude-sonnet-4-6' : 'gpt-4o-mini';
    apiKey = useAnthropic ? anthropicKey : openaiKey;
  }

  // Load system prompt (best-effort: an unreadable file leaves it empty).
  let systemPrompt = '';
  if (config?.systemPromptPath) {
    const spPath = resolve(projectRoot, config.systemPromptPath);
    if (existsSync(spPath)) {
      try { systemPrompt = readFileSync(spPath, 'utf-8'); } catch (_) { /* ignore */ }
    }
  }

  // Load tool definitions
  const tools = getToolsForEval(config);
  if (tools.length === 0) {
    throw new Error(`No tool files found in ${config?.project?.toolsDir || 'example/tools'}`);
  }

  // Find eval files
  const evalFiles = findEvalFiles(toolName, config);
  if (evalFiles.length === 0) {
    throw new Error(
      `No eval files found for "${toolName}". ` +
      `Expected files like ${toolName}.golden.json in ${config?.project?.evalsDir || 'docs/examples'}`
    );
  }

  // Load all eval cases, tagging each with its source file's eval type.
  const allCases = [];
  for (const file of evalFiles) {
    const type = file.includes('.golden.') ? 'golden' : 'labeled';
    let cases;
    try {
      cases = JSON.parse(readFileSync(file, 'utf-8'));
    } catch (err) {
      throw new Error(`Failed to parse eval file ${file}: ${err.message}`);
    }
    for (const c of cases) allCases.push({ ...c, _evalType: type });
  }

  const results = [];   // caller-facing per-case results
  const caseRows = [];  // snake_case rows destined for the SQLite eval_run_cases table
  let passed = 0;
  let failed = 0;
  let skipped = 0;

  for (let i = 0; i < allCases.length; i++) {
    const evalCase = allCases[i];
    const input = evalCase.input?.message ?? '';

    // A case without an input message can't be run — record it as skipped.
    // NOTE(review): skipped rows omit the input_tokens/output_tokens keys that
    // other rows carry — confirm insertEvalRunCases tolerates the difference.
    if (!input) {
      skipped++;
      onProgress?.({ done: i + 1, total: allCases.length, caseId: evalCase.id, passed: null, reason: 'no input message' });
      results.push({ id: evalCase.id, description: evalCase.description, status: 'skipped', reason: 'no input message' });
      caseRows.push({ case_id: evalCase.id, tool_name: toolName, status: 'skipped', reason: 'no input message', tools_called: null, latency_ms: null, model });
      continue;
    }

    let apiResult;
    let stubbedResult;
    let inputTokens = 0;
    let outputTokens = 0;
    const t0 = Date.now();
    // C1: require at least one stub key — empty object {} stays routing-only
    const hasStubs = evalCase.stubs && typeof evalCase.stubs === 'object' && Object.keys(evalCase.stubs).length > 0;
    try {
      if (hasStubs) {
        // Stub-based mode: full multi-turn loop with stubbed tool results.
        stubbedResult = await stubbedReactTurn({
          provider, apiKey, model, systemPrompt, tools,
          input,
          stubs: evalCase.stubs,
          maxTurns: evalCase.maxTurns ?? 5
        });
        apiResult = { toolsCalled: stubbedResult.toolsCalled, responseText: stubbedResult.responseText };
        // M2: stubbedReactTurn returns pre-accumulated normalized tokens; use directly
        inputTokens = stubbedResult.usage.inputTokens;
        outputTokens = stubbedResult.usage.outputTokens;
      } else {
        // Routing-only mode: a single turn; only tool selection + text are checked.
        const turnResult = await llmTurn({
          provider,
          apiKey,
          model,
          system: systemPrompt,
          messages: [{ role: 'user', content: input }],
          tools,
          maxTokens: 1024,
          timeoutMs: 30_000
        });
        apiResult = {
          toolsCalled: turnResult.toolCalls.map((tc) => tc.name),
          responseText: turnResult.text
        };
        ({ inputTokens, outputTokens } = normalizeUsage(turnResult.usage ?? null, provider));
      }
    } catch (err) {
      // An API/transport failure fails the case (counted in `failed`) but does
      // not abort the run — remaining cases still execute.
      failed++;
      const reason = `API error: ${err.message}`;
      onProgress?.({ done: i + 1, total: allCases.length, caseId: evalCase.id, passed: false, reason });
      results.push({ id: evalCase.id, description: evalCase.description, status: 'failed', reason });
      caseRows.push({ case_id: evalCase.id, tool_name: toolName, status: 'failed', reason, tools_called: null, latency_ms: Date.now() - t0, model, input_tokens: null, output_tokens: null });
      continue;
    }

    const latency_ms = Date.now() - t0;
    // Cost is computed from observed token counts; maxCost assertion activates when expect.maxCost is set in the eval case
    const cost = computeActualCost(inputTokens, outputTokens, model);
    const failures = checkAssertions(evalCase, {
      toolsCalled: apiResult.toolsCalled,
      responseText: apiResult.responseText,
      latencyMs: latency_ms,
      cost,
      missingStubs: stubbedResult?.missingStubs ?? []
    });
    const casePassed = failures.length === 0;

    if (casePassed) passed++;
    else failed++;

    const reason = failures.length > 0 ? failures.join('; ') : null;
    onProgress?.({ done: i + 1, total: allCases.length, caseId: evalCase.id, passed: casePassed, reason, toolsCalled: apiResult.toolsCalled });
    results.push({
      id: evalCase.id,
      description: evalCase.description,
      difficulty: evalCase.difficulty,
      status: casePassed ? 'passed' : 'failed',
      reason,
      toolsCalled: apiResult.toolsCalled
    });
    caseRows.push({
      case_id: evalCase.id,
      tool_name: toolName,
      status: casePassed ? 'passed' : 'failed',
      reason,
      tools_called: JSON.stringify(apiResult.toolsCalled),
      latency_ms,
      model,
      // `|| null` maps 0 tokens to null in the DB row.
      input_tokens: inputTokens || null,
      output_tokens: outputTokens || null
    });
  }

  // Persist to SQLite. The import is dynamic — presumably so the runner loads
  // without the DB module when persistence isn't needed; confirm against db.js.
  try {
    const dbPath = resolve(projectRoot, config?.dbPath || 'forge.db');
    const { getDb, insertEvalRun, insertEvalRunCases } = await import('./db.js');
    const db = getDb(dbPath);
    const evalType = allCases.every((c) => c._evalType === 'golden') ? 'golden'
      : allCases.every((c) => c._evalType === 'labeled') ? 'labeled'
      : 'mixed';
    // Pass rate counts only cases that actually ran (skipped excluded).
    const ran = passed + failed;
    const passRate = ran > 0 ? passed / ran : 0;
    const evalRunId = insertEvalRun(db, {
      tool_name: toolName,
      eval_type: evalType,
      total_cases: allCases.length,
      passed,
      failed,
      skipped,
      notes: `provider:${provider} model:${model}`,
      model,
      pass_rate: passRate,
      sample_type: 'targeted'
    });
    if (caseRows.length > 0) {
      insertEvalRunCases(db, caseRows.map((r) => ({ ...r, eval_run_id: evalRunId })));
    }
  } catch (_) { /* db write failure is non-fatal */ }

  return { total: allCases.length, passed, failed, skipped, cases: results, provider, model };
}
485
+
486
+ // ── Multi-pass eval runner ────────────────────────────────────────────────
487
+
488
/**
 * Run evals across a model matrix, collecting per-model results.
 * Model matrix is resolved from config.modelMatrix (list of model name strings).
 * Each model's provider + API key is resolved automatically via modelConfigForName.
 *
 * A model without a resolvable API key, or one whose run throws, is recorded
 * as `{ error }` in the result rather than aborting the other models.
 *
 * @param {string} toolName
 * @param {object} config - full forge config (must include dbPath for env resolution)
 * @param {string} projectRoot
 * @param {{ modelMatrix?: string[] }} [options] - override matrix; defaults to config.modelMatrix
 * @param {function} [onProgress] - called with { model, done, total, caseId, passed }
 * @returns {Promise<{ perModel: Record<string, { passed, failed, total, pass_rate }> }>}
 */
export async function runEvalsMultiPass(toolName, config, projectRoot, options = {}, onProgress) {
  // Resolve env for API key lookup
  const env = loadEnv(projectRoot);

  const matrixNames = options.modelMatrix || config?.modelMatrix || [];
  if (matrixNames.length === 0) {
    throw new Error('No model matrix configured. Add "modelMatrix" to forge.config.json.');
  }

  const perModel = {};

  for (const modelName of matrixNames) {
    const resolved = modelConfigForName(modelName, env);
    if (!resolved.apiKey) {
      perModel[modelName] = { error: `No API key found for provider "${resolved.provider}"` };
      continue;
    }

    // Build a config override pinning this model for the nested run.
    const modelConfig = { ...config, model: modelName, models: { ...config?.models, eval: modelName } };

    try {
      const outcome = await runEvals(
        toolName,
        modelConfig,
        projectRoot,
        (progress) => onProgress?.({ model: modelName, ...progress })
      );
      const ran = outcome.passed + outcome.failed;
      perModel[modelName] = {
        passed: outcome.passed,
        failed: outcome.failed,
        total: outcome.total,
        skipped: outcome.skipped,
        pass_rate: ran > 0 ? outcome.passed / ran : 0,
        provider: outcome.provider
      };
    } catch (err) {
      perModel[modelName] = { error: err.message };
    }
  }

  return { perModel };
}
543
+
544
+ // ── Random sample helper ──────────────────────────────────────────────────
545
+
546
/**
 * Pull n random eval run cases from OTHER tools for blind drift detection.
 *
 * Best-effort: any query failure (e.g. table missing) yields an empty array.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} toolName - The tool to EXCLUDE from sampling
 * @param {number} n
 * @returns {object[]} eval case rows with _sampleType: 'sampled'
 */
export function withRandomSample(db, toolName, n) {
  try {
    const stmt = db.prepare(`
      SELECT * FROM eval_run_cases
      WHERE tool_name != ?
      ORDER BY RANDOM()
      LIMIT ?
    `);
    const sampled = [];
    for (const row of stmt.all(toolName, n)) {
      sampled.push({ ...row, _sampleType: 'sampled' });
    }
    return sampled;
  } catch (_) {
    return [];
  }
}