agent-tool-forge 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +209 -0
- package/lib/agent-registry.js +170 -0
- package/lib/api-client.js +792 -0
- package/lib/api-loader.js +260 -0
- package/lib/auth.d.ts +25 -0
- package/lib/auth.js +158 -0
- package/lib/checks/check-adapter.js +172 -0
- package/lib/checks/compose.js +42 -0
- package/lib/checks/content-match.js +14 -0
- package/lib/checks/cost-budget.js +11 -0
- package/lib/checks/index.js +18 -0
- package/lib/checks/json-valid.js +15 -0
- package/lib/checks/latency.js +11 -0
- package/lib/checks/length-bounds.js +17 -0
- package/lib/checks/negative-match.js +14 -0
- package/lib/checks/no-hallucinated-numbers.js +63 -0
- package/lib/checks/non-empty.js +34 -0
- package/lib/checks/regex-match.js +12 -0
- package/lib/checks/run-checks.js +84 -0
- package/lib/checks/schema-match.js +26 -0
- package/lib/checks/tool-call-count.js +16 -0
- package/lib/checks/tool-selection.js +34 -0
- package/lib/checks/types.js +45 -0
- package/lib/comparison/compare.js +86 -0
- package/lib/comparison/format.js +104 -0
- package/lib/comparison/index.js +6 -0
- package/lib/comparison/statistics.js +59 -0
- package/lib/comparison/types.js +41 -0
- package/lib/config-schema.js +200 -0
- package/lib/config.d.ts +66 -0
- package/lib/conversation-store.d.ts +77 -0
- package/lib/conversation-store.js +443 -0
- package/lib/db.d.ts +6 -0
- package/lib/db.js +1112 -0
- package/lib/dep-check.js +99 -0
- package/lib/drift-background.js +61 -0
- package/lib/drift-monitor.js +187 -0
- package/lib/eval-runner.js +566 -0
- package/lib/fixtures/fixture-store.js +161 -0
- package/lib/fixtures/index.js +11 -0
- package/lib/forge-engine.js +982 -0
- package/lib/forge-eval-generator.js +417 -0
- package/lib/forge-file-writer.js +386 -0
- package/lib/forge-service-client.js +190 -0
- package/lib/forge-service.d.ts +4 -0
- package/lib/forge-service.js +655 -0
- package/lib/forge-verifier-generator.js +271 -0
- package/lib/handlers/admin.js +151 -0
- package/lib/handlers/agents.js +229 -0
- package/lib/handlers/chat-resume.js +334 -0
- package/lib/handlers/chat-sync.js +320 -0
- package/lib/handlers/chat.js +320 -0
- package/lib/handlers/conversations.js +92 -0
- package/lib/handlers/preferences.js +88 -0
- package/lib/handlers/tools-list.js +58 -0
- package/lib/hitl-engine.d.ts +60 -0
- package/lib/hitl-engine.js +261 -0
- package/lib/http-utils.js +92 -0
- package/lib/index.d.ts +20 -0
- package/lib/index.js +141 -0
- package/lib/init.js +636 -0
- package/lib/manual-entry.js +59 -0
- package/lib/mcp-server.js +252 -0
- package/lib/output-groups.js +54 -0
- package/lib/postgres-store.d.ts +31 -0
- package/lib/postgres-store.js +465 -0
- package/lib/preference-store.d.ts +47 -0
- package/lib/preference-store.js +79 -0
- package/lib/prompt-store.d.ts +42 -0
- package/lib/prompt-store.js +60 -0
- package/lib/rate-limiter.d.ts +30 -0
- package/lib/rate-limiter.js +104 -0
- package/lib/react-engine.d.ts +110 -0
- package/lib/react-engine.js +337 -0
- package/lib/runner/cli.js +156 -0
- package/lib/runner/cost-estimator.js +71 -0
- package/lib/runner/gate.js +46 -0
- package/lib/runner/index.js +165 -0
- package/lib/sidecar.d.ts +83 -0
- package/lib/sidecar.js +161 -0
- package/lib/sse.d.ts +15 -0
- package/lib/sse.js +30 -0
- package/lib/tools-scanner.js +91 -0
- package/lib/tui.js +253 -0
- package/lib/verifier-report.js +78 -0
- package/lib/verifier-runner.js +338 -0
- package/lib/verifier-scanner.js +70 -0
- package/lib/verifier-worker-pool.js +196 -0
- package/lib/views/chat.js +340 -0
- package/lib/views/endpoints.js +203 -0
- package/lib/views/eval-run.js +206 -0
- package/lib/views/forge-agent.js +538 -0
- package/lib/views/forge.js +410 -0
- package/lib/views/main-menu.js +275 -0
- package/lib/views/mediation.js +381 -0
- package/lib/views/model-compare.js +430 -0
- package/lib/views/model-comparison.js +333 -0
- package/lib/views/onboarding.js +470 -0
- package/lib/views/performance.js +237 -0
- package/lib/views/run-evals.js +205 -0
- package/lib/views/settings.js +829 -0
- package/lib/views/tools-evals.js +514 -0
- package/lib/views/verifier-coverage.js +617 -0
- package/lib/workers/verifier-worker.js +52 -0
- package/package.json +123 -0
- package/widget/forge-chat.js +789 -0
|
@@ -0,0 +1,566 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Standalone Eval Runner — no forge service required.
|
|
3
|
+
*
|
|
4
|
+
* Reads eval JSON files for a tool, calls Anthropic or OpenAI directly,
|
|
5
|
+
* checks routing + content assertions, and stores results in SQLite.
|
|
6
|
+
*
|
|
7
|
+
* Modes:
|
|
8
|
+
* "routing-only" — single LLM turn; verifies which tools the model selects.
|
|
9
|
+
* "stub-based" — multi-turn loop with stubbed tool results; activated when
|
|
10
|
+
* the eval case includes a `stubs` field. Validates both
|
|
11
|
+
* routing and final response content.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { readdirSync, readFileSync, existsSync } from 'fs';
|
|
15
|
+
import { resolve, join } from 'path';
|
|
16
|
+
import { llmTurn, normalizeUsage, modelConfigForName } from './api-client.js';
|
|
17
|
+
import { checkAdapter, checkResponseContainsAnyGroups, checkToolsAcceptable } from './checks/check-adapter.js';
|
|
18
|
+
import { runChecks } from './checks/run-checks.js';
|
|
19
|
+
import { computeActualCost } from './runner/cost-estimator.js';
|
|
20
|
+
|
|
21
|
+
// ── Stub-based multi-turn executor ─────────────────────────────────────────
|
|
22
|
+
|
|
23
|
+
/**
 * Run a multi-turn LLM conversation with stubbed tool results.
 *
 * When the model emits tool_use calls, each tool is answered with its
 * corresponding stub from the `stubs` map instead of running real tool code.
 * The loop continues until the model produces a text-only response or maxTurns
 * is reached.
 *
 * @param {object} opts
 * @param {string} opts.provider - 'anthropic' | 'openai' (or compat)
 * @param {string} opts.apiKey
 * @param {string} opts.model
 * @param {string} opts.systemPrompt
 * @param {object[]} opts.tools - tool definitions with inputSchema
 * @param {string} opts.input - user message
 * @param {object} opts.stubs - { [toolName]: stubReturnObject }
 * @param {number} [opts.maxTurns=5] - safety cap on loop iterations
 * @returns {{ toolsCalled: string[], responseText: string, usage: object|null, missingStubs: string[] }}
 */
export async function stubbedReactTurn({ provider, apiKey, model, systemPrompt, tools, input, stubs, maxTurns = 5 }) {
  const messages = [{ role: 'user', content: input }];
  const allToolsCalled = [];
  const missingStubs = [];
  let finalText = '';
  // M2: accumulate token counts across all turns instead of keeping only the last
  let accInputTokens = 0;
  let accOutputTokens = 0;

  for (let turn = 0; turn < maxTurns; turn++) {
    const response = await llmTurn({
      provider, apiKey, model,
      system: systemPrompt,
      messages, tools,
      maxTokens: 1024,
      timeoutMs: 30_000
    });

    if (response.usage) {
      const n = normalizeUsage(response.usage, provider);
      accInputTokens += n.inputTokens;
      accOutputTokens += n.outputTokens;
    }
    if (response.text) finalText = response.text;
    // Text-only response ends the ReAct loop.
    if (!response.toolCalls?.length) break;

    const toolResults = [];
    for (const tc of response.toolCalls) {
      allToolsCalled.push(tc.name);
      // Own-property lookup: a plain index (stubs[tc.name]) walks the prototype
      // chain, so a tool named "toString" or "constructor" would resolve to an
      // inherited function and silently skip the missing-stub bookkeeping.
      const stubData = Object.hasOwn(stubs, tc.name) ? stubs[tc.name] : undefined;
      if (stubData === undefined) missingStubs.push(tc.name);
      toolResults.push({
        toolCall: tc,
        result: { status: 200, body: stubData ?? { error: `no stub for "${tc.name}"` } }
      });
    }

    // Append provider-specific history (mirrors react-engine.js format)
    if (provider === 'anthropic') {
      messages.push({
        role: 'assistant',
        content: [
          ...(response.text ? [{ type: 'text', text: response.text }] : []),
          ...toolResults.map(({ toolCall }) => ({
            type: 'tool_use', id: toolCall.id, name: toolCall.name, input: toolCall.input
          }))
        ]
      });
      messages.push({
        role: 'user',
        content: toolResults.map(({ toolCall, result }) => ({
          type: 'tool_result',
          tool_use_id: toolCall.id,
          content: JSON.stringify(result.body)
        }))
      });
    } else {
      // L1: use '' instead of null — some OpenAI-compat providers reject content: null
      messages.push({
        role: 'assistant',
        content: response.text || '',
        tool_calls: toolResults.map(({ toolCall }) => ({
          id: toolCall.id, type: 'function',
          function: { name: toolCall.name, arguments: JSON.stringify(toolCall.input) }
        }))
      });
      for (const { toolCall, result } of toolResults) {
        messages.push({ role: 'tool', tool_call_id: toolCall.id, content: JSON.stringify(result.body) });
      }
    }
  }

  // Return accumulated normalized token counts so callers skip a second normalizeUsage call
  return {
    toolsCalled: allToolsCalled,
    responseText: finalText,
    usage: { inputTokens: accInputTokens, outputTokens: accOutputTokens },
    missingStubs
  };
}
|
|
122
|
+
|
|
123
|
+
// ── Assertion checker ──────────────────────────────────────────────────────
|
|
124
|
+
|
|
125
|
+
/**
 * Evaluate an eval case's assertions against the observed API behaviour.
 * Returns an array of human-readable failure strings; empty array means pass.
 * Structured assertions are delegated to runChecks() from lib/checks/.
 */
function checkAssertions(evalCase, { toolsCalled, responseText, latencyMs, cost, missingStubs = [] }) {
  const problems = [];

  // Structured checks flow through the adapter into runChecks.
  const adapted = checkAdapter(evalCase, { toolsCalled, responseText, latencyMs, cost });
  const outcome = runChecks(adapted);
  for (const [name, res] of Object.entries(outcome.checks)) {
    if (res.pass) continue;
    problems.push(res.reason ?? `${name} failed`);
  }

  const expect = evalCase.expect ?? {};

  // responseContainsAny — anyOf groups (not natively in runChecks)
  if (expect.responseContainsAny?.length) {
    const groupCheck = checkResponseContainsAnyGroups(responseText, expect.responseContainsAny);
    if (!groupCheck.pass) problems.push(groupCheck.reason);
  }

  // toolsAcceptable — acceptable alternative tool sets
  if (expect.toolsAcceptable !== undefined) {
    const altCheck = checkToolsAcceptable(toolsCalled, expect.toolsAcceptable);
    if (!altCheck.pass) problems.push(altCheck.reason);
  }

  // noToolErrors — in stub mode: fails if any tool was called without a stub entry.
  // In routing-only mode (no stubs): this assertion is a no-op because tools are
  // never executed, so success/failure cannot be determined. Set `stubs` on the
  // eval case to make this assertion meaningful.
  if (expect.noToolErrors && missingStubs.length > 0) {
    problems.push(`noToolErrors: tools called without stubs: [${missingStubs.join(', ')}]`);
  }

  return problems;
}
|
|
165
|
+
|
|
166
|
+
// ── Tool schema extraction ─────────────────────────────────────────────────
|
|
167
|
+
|
|
168
|
+
/**
 * Convert forge tool schema format to JSON Schema.
 * Input: { city: { type: 'string' }, units: { type: 'string', optional: true } }
 * Output: { type: 'object', properties: { city: {...}, units: {...} }, required: ['city'] }
 *
 * Robustness: non-object schemas (including arrays) yield an empty object
 * schema, and malformed property specs (e.g. null) fall back to a required
 * string property instead of throwing.
 */
function forgeSchemaToJsonSchema(schema) {
  // Arrays pass `typeof === 'object'` but are not valid forge schemas.
  if (!schema || typeof schema !== 'object' || Array.isArray(schema)) {
    return { type: 'object', properties: {} };
  }
  const properties = {};
  const required = [];
  for (const [key, val] of Object.entries(schema)) {
    // Tolerate malformed specs: `{ q: null }` previously threw TypeError here.
    const spec = val && typeof val === 'object' ? val : {};
    properties[key] = { type: spec.type || 'string' };
    if (spec.description) properties[key].description = spec.description;
    if (!spec.optional) required.push(key);
  }
  return { type: 'object', properties, ...(required.length ? { required } : {}) };
}
|
|
186
|
+
|
|
187
|
+
/**
 * Pull the `schema: { ... }` object literal out of a tool file's source text
 * with a naive brace counter, then materialize it. Returns null when the
 * literal is absent or cannot be evaluated.
 */
function extractSchemaFromSource(source) {
  const marker = source.match(/schema:\s*\{/);
  if (!marker) return null;

  // Locate the opening brace of the literal, then scan to its matching close.
  const open = source.indexOf('{', marker.index + marker[0].length - 1);
  let close = open;
  for (let i = open, depth = 0; i < source.length; i++) {
    const ch = source[i];
    if (ch === '{') {
      depth++;
    } else if (ch === '}') {
      depth--;
      if (depth === 0) { close = i; break; }
    }
  }

  const literal = source.slice(open, close + 1);
  // Convert JS object literal to evaluable form (very limited — handles simple cases)
  try {
    // eslint-disable-next-line no-new-func
    return new Function(`return ${literal}`)();
  } catch (_) {
    return null;
  }
}
|
|
213
|
+
|
|
214
|
+
/**
 * Discover tool files (*.tool.ts / *.tool.js) in the configured tools
 * directory and return them as { name, description, inputSchema } objects.
 * Name/description are scraped from the source with regexes; the schema is
 * extracted via extractSchemaFromSource + forgeSchemaToJsonSchema.
 */
export function getToolsForEval(config) {
  const project = config?.project || {};
  const toolsDir = resolve(process.cwd(), project.toolsDir || 'example/tools');
  if (!existsSync(toolsDir)) return [];

  return readdirSync(toolsDir)
    .filter((f) => f.endsWith('.tool.ts') || f.endsWith('.tool.js'))
    .map((file) => {
      const source = readFileSync(join(toolsDir, file), 'utf-8');
      const nameMatch = source.match(/name:\s*['"]([^'"]+)['"]/);
      const descMatch = source.match(/description:\s*['"`]([^'"`]+)['"`]/);
      return {
        // Fall back to the filename (sans extension) when no name: field exists.
        name: nameMatch?.[1] ?? file.replace(/\.tool\.(ts|js)$/, ''),
        description: descMatch?.[1] ?? '',
        inputSchema: forgeSchemaToJsonSchema(extractSchemaFromSource(source))
      };
    });
}
|
|
236
|
+
|
|
237
|
+
// ── Eval file discovery ────────────────────────────────────────────────────
|
|
238
|
+
|
|
239
|
+
/**
 * Find eval files for a tool. Searches evalsDir (and a tool-named
 * subdirectory) for patterns: {toolName}.golden.json, {toolName}.labeled.json,
 * plus hyphenated variants of an underscored tool name.
 * Returns only paths that exist, deduplicated, in candidate order.
 */
export function findEvalFiles(toolName, config) {
  const project = config?.project || {};
  const evalsDir = resolve(process.cwd(), project.evalsDir || 'docs/examples');
  const dashed = toolName.replace(/_/g, '-');

  const candidates = [
    join(evalsDir, `${toolName}.golden.json`),
    join(evalsDir, `${toolName}.labeled.json`),
    // Also check one level up if toolName matches a subdirectory
    join(evalsDir, toolName, `${toolName}.golden.json`),
    join(evalsDir, toolName, `${toolName}.labeled.json`),
    // With hyphens
    join(evalsDir, `${dashed}.golden.json`),
    join(evalsDir, `${dashed}.labeled.json`)
  ];

  const seen = new Set();
  const found = [];
  for (const path of candidates) {
    if (!existsSync(path) || seen.has(path)) continue;
    seen.add(path);
    found.push(path);
  }
  return found;
}
|
|
262
|
+
|
|
263
|
+
// ── Env reader ─────────────────────────────────────────────────────────────
|
|
264
|
+
|
|
265
|
+
/**
 * Minimal .env reader: one KEY=VALUE per line, '#' lines are comments,
 * surrounding single/double quotes on values are stripped.
 * Returns {} when the project has no .env file.
 */
function loadEnv(projectRoot) {
  const envPath = resolve(projectRoot, '.env');
  if (!existsSync(envPath)) return {};

  const vars = {};
  for (const rawLine of readFileSync(envPath, 'utf-8').split('\n')) {
    const entry = rawLine.trim();
    if (entry === '' || entry.startsWith('#')) continue;
    const sep = entry.indexOf('=');
    if (sep < 0) continue; // not a KEY=VALUE line
    const key = entry.slice(0, sep).trim();
    const value = entry.slice(sep + 1).trim().replace(/^["']|["']$/g, '');
    vars[key] = value;
  }
  return vars;
}
|
|
279
|
+
|
|
280
|
+
// ── Main runner ────────────────────────────────────────────────────────────
|
|
281
|
+
|
|
282
|
+
/**
 * Run evals for a tool.
 *
 * Resolves provider/model/API key, loads the system prompt and tool
 * definitions, discovers eval files, executes each case (stub-based when the
 * case has a non-empty `stubs` map, routing-only otherwise), checks
 * assertions, and persists a run summary plus per-case rows to SQLite
 * (best-effort; db failures are non-fatal).
 *
 * @param {string} toolName
 * @param {object} config - full forge config
 * @param {string} projectRoot
 * @param {function} onProgress - called after each case: ({ done, total, caseId, passed, reason })
 * @returns {{ total, passed, failed, skipped, cases, provider, model }}
 * @throws {Error} when no API key, no tool files, no eval files, or an eval file is unparseable
 */
export async function runEvals(toolName, config, projectRoot, onProgress) {
  const env = loadEnv(projectRoot);

  // Determine provider + key.
  // If config.model is explicitly set, detect its provider and resolve the matching key.
  // Otherwise fall back to whichever key is available (Anthropic preferred).
  let provider, model, apiKey;
  if (config?.model) {
    const mc = modelConfigForName(config.model, env);
    provider = mc.provider;
    model = mc.model;
    apiKey = mc.apiKey;
    if (!apiKey) {
      throw new Error(`No API key found for provider "${provider}" (model: ${model}). Add the key to .env in Settings.`);
    }
  } else {
    const anthropicKey = env['ANTHROPIC_API_KEY'];
    const openaiKey = env['OPENAI_API_KEY'];
    if (!anthropicKey && !openaiKey) {
      throw new Error('No API key found. Add ANTHROPIC_API_KEY or OPENAI_API_KEY to .env in Settings.');
    }
    const useAnthropic = !!anthropicKey;
    provider = useAnthropic ? 'anthropic' : 'openai';
    model = useAnthropic ? 'claude-sonnet-4-6' : 'gpt-4o-mini';
    apiKey = useAnthropic ? anthropicKey : openaiKey;
  }

  // Load system prompt (optional; unreadable file is silently ignored).
  let systemPrompt = '';
  if (config?.systemPromptPath) {
    const spPath = resolve(projectRoot, config.systemPromptPath);
    if (existsSync(spPath)) {
      try { systemPrompt = readFileSync(spPath, 'utf-8'); } catch (_) { /* ignore */ }
    }
  }

  // Load tool definitions
  const tools = getToolsForEval(config);
  if (tools.length === 0) {
    throw new Error(`No tool files found in ${config?.project?.toolsDir || 'example/tools'}`);
  }

  // Find eval files
  const evalFiles = findEvalFiles(toolName, config);
  if (evalFiles.length === 0) {
    throw new Error(
      `No eval files found for "${toolName}". ` +
      `Expected files like ${toolName}.golden.json in ${config?.project?.evalsDir || 'docs/examples'}`
    );
  }

  // Load all eval cases, tagging each with its file's eval type.
  const allCases = [];
  for (const file of evalFiles) {
    const type = file.includes('.golden.') ? 'golden' : 'labeled';
    let cases;
    try {
      cases = JSON.parse(readFileSync(file, 'utf-8'));
    } catch (err) {
      throw new Error(`Failed to parse eval file ${file}: ${err.message}`);
    }
    for (const c of cases) allCases.push({ ...c, _evalType: type });
  }

  const results = [];
  const caseRows = [];
  let passed = 0;
  let failed = 0;
  let skipped = 0;

  for (let i = 0; i < allCases.length; i++) {
    const evalCase = allCases[i];
    const input = evalCase.input?.message ?? '';

    if (!input) {
      skipped++;
      onProgress?.({ done: i + 1, total: allCases.length, caseId: evalCase.id, passed: null, reason: 'no input message' });
      results.push({ id: evalCase.id, description: evalCase.description, status: 'skipped', reason: 'no input message' });
      // Consistency fix: carry explicit null token columns like every other caseRow.
      caseRows.push({ case_id: evalCase.id, tool_name: toolName, status: 'skipped', reason: 'no input message', tools_called: null, latency_ms: null, model, input_tokens: null, output_tokens: null });
      continue;
    }

    let apiResult;
    let stubbedResult;
    let inputTokens = 0;
    let outputTokens = 0;
    const t0 = Date.now();
    // C1: require at least one stub key — empty object {} stays routing-only
    const hasStubs = evalCase.stubs && typeof evalCase.stubs === 'object' && Object.keys(evalCase.stubs).length > 0;
    try {
      if (hasStubs) {
        stubbedResult = await stubbedReactTurn({
          provider, apiKey, model, systemPrompt, tools,
          input,
          stubs: evalCase.stubs,
          maxTurns: evalCase.maxTurns ?? 5
        });
        apiResult = { toolsCalled: stubbedResult.toolsCalled, responseText: stubbedResult.responseText };
        // M2: stubbedReactTurn returns pre-accumulated normalized tokens; use directly
        inputTokens = stubbedResult.usage.inputTokens;
        outputTokens = stubbedResult.usage.outputTokens;
      } else {
        const turnResult = await llmTurn({
          provider,
          apiKey,
          model,
          system: systemPrompt,
          messages: [{ role: 'user', content: input }],
          tools,
          maxTokens: 1024,
          timeoutMs: 30_000
        });
        apiResult = {
          // Robustness: toolCalls may be absent on a text-only turn
          // (stubbedReactTurn already defends with ?. on the same field).
          toolsCalled: (turnResult.toolCalls ?? []).map((tc) => tc.name),
          responseText: turnResult.text
        };
        ({ inputTokens, outputTokens } = normalizeUsage(turnResult.usage ?? null, provider));
      }
    } catch (err) {
      failed++;
      const reason = `API error: ${err.message}`;
      onProgress?.({ done: i + 1, total: allCases.length, caseId: evalCase.id, passed: false, reason });
      results.push({ id: evalCase.id, description: evalCase.description, status: 'failed', reason });
      caseRows.push({ case_id: evalCase.id, tool_name: toolName, status: 'failed', reason, tools_called: null, latency_ms: Date.now() - t0, model, input_tokens: null, output_tokens: null });
      continue;
    }

    const latency_ms = Date.now() - t0;
    // Cost is computed from observed token counts; maxCost assertion activates when expect.maxCost is set in the eval case
    const cost = computeActualCost(inputTokens, outputTokens, model);
    const failures = checkAssertions(evalCase, {
      toolsCalled: apiResult.toolsCalled,
      responseText: apiResult.responseText,
      latencyMs: latency_ms,
      cost,
      missingStubs: stubbedResult?.missingStubs ?? []
    });
    const casePassed = failures.length === 0;

    if (casePassed) passed++;
    else failed++;

    const reason = failures.length > 0 ? failures.join('; ') : null;
    onProgress?.({ done: i + 1, total: allCases.length, caseId: evalCase.id, passed: casePassed, reason, toolsCalled: apiResult.toolsCalled });
    results.push({
      id: evalCase.id,
      description: evalCase.description,
      difficulty: evalCase.difficulty,
      status: casePassed ? 'passed' : 'failed',
      reason,
      toolsCalled: apiResult.toolsCalled
    });
    caseRows.push({
      case_id: evalCase.id,
      tool_name: toolName,
      status: casePassed ? 'passed' : 'failed',
      reason,
      tools_called: JSON.stringify(apiResult.toolsCalled),
      latency_ms,
      model,
      input_tokens: inputTokens || null,
      output_tokens: outputTokens || null
    });
  }

  // Persist to SQLite (dynamic import keeps db.js optional for pure runs).
  try {
    const dbPath = resolve(projectRoot, config?.dbPath || 'forge.db');
    const { getDb, insertEvalRun, insertEvalRunCases } = await import('./db.js');
    const db = getDb(dbPath);
    const evalType = allCases.every((c) => c._evalType === 'golden') ? 'golden'
      : allCases.every((c) => c._evalType === 'labeled') ? 'labeled'
      : 'mixed';
    const ran = passed + failed;
    const passRate = ran > 0 ? passed / ran : 0;
    const evalRunId = insertEvalRun(db, {
      tool_name: toolName,
      eval_type: evalType,
      total_cases: allCases.length,
      passed,
      failed,
      skipped,
      notes: `provider:${provider} model:${model}`,
      model,
      pass_rate: passRate,
      sample_type: 'targeted'
    });
    if (caseRows.length > 0) {
      insertEvalRunCases(db, caseRows.map((r) => ({ ...r, eval_run_id: evalRunId })));
    }
  } catch (_) { /* db write failure is non-fatal */ }

  return { total: allCases.length, passed, failed, skipped, cases: results, provider, model };
}
|
|
485
|
+
|
|
486
|
+
// ── Multi-pass eval runner ────────────────────────────────────────────────
|
|
487
|
+
|
|
488
|
+
/**
 * Run evals across a model matrix, collecting per-model results.
 * Model matrix is resolved from config.modelMatrix (list of model name strings).
 * Each model's provider + API key is resolved automatically via modelConfigForName.
 *
 * @param {string} toolName
 * @param {object} config - full forge config (must include dbPath for env resolution)
 * @param {string} projectRoot
 * @param {{ modelMatrix?: string[] }} [options] - override matrix; defaults to config.modelMatrix
 * @param {function} [onProgress] - called with { model, done, total, caseId, passed }
 * @returns {Promise<{ perModel: Record<string, { passed, failed, total, pass_rate }> }>}
 */
export async function runEvalsMultiPass(toolName, config, projectRoot, options = {}, onProgress) {
  // Resolve env once for API key lookups across the whole matrix.
  const env = loadEnv(projectRoot);

  const matrixNames = options.modelMatrix || config?.modelMatrix || [];
  if (matrixNames.length === 0) {
    throw new Error('No model matrix configured. Add "modelMatrix" to forge.config.json.');
  }

  const perModel = {};

  for (const modelName of matrixNames) {
    // Skip models whose provider key is missing, recording the reason.
    const mc = modelConfigForName(modelName, env);
    if (!mc.apiKey) {
      perModel[modelName] = { error: `No API key found for provider "${mc.provider}"` };
      continue;
    }

    // Build a config override for this model
    const modelConfig = { ...config, model: modelName, models: { ...config?.models, eval: modelName } };

    try {
      const outcome = await runEvals(
        toolName,
        modelConfig,
        projectRoot,
        (progress) => onProgress?.({ model: modelName, ...progress })
      );
      const ran = outcome.passed + outcome.failed;
      perModel[modelName] = {
        passed: outcome.passed,
        failed: outcome.failed,
        total: outcome.total,
        skipped: outcome.skipped,
        pass_rate: ran > 0 ? outcome.passed / ran : 0,
        provider: outcome.provider
      };
    } catch (err) {
      perModel[modelName] = { error: err.message };
    }
  }

  return { perModel };
}
|
|
543
|
+
|
|
544
|
+
// ── Random sample helper ──────────────────────────────────────────────────
|
|
545
|
+
|
|
546
|
+
/**
 * Pull n random eval run cases from OTHER tools for blind drift detection.
 * Best-effort: any database error yields an empty array.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} toolName - The tool to EXCLUDE from sampling
 * @param {number} n
 * @returns {object[]} eval case rows with _sampleType: 'sampled'
 */
export function withRandomSample(db, toolName, n) {
  try {
    const stmt = db.prepare(`
      SELECT * FROM eval_run_cases
      WHERE tool_name != ?
      ORDER BY RANDOM()
      LIMIT ?
    `);
    return stmt.all(toolName, n).map((row) => ({ ...row, _sampleType: 'sampled' }));
  } catch (_) {
    return [];
  }
}
|