agent-tool-forge 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +209 -0
- package/lib/agent-registry.js +170 -0
- package/lib/api-client.js +792 -0
- package/lib/api-loader.js +260 -0
- package/lib/auth.d.ts +25 -0
- package/lib/auth.js +158 -0
- package/lib/checks/check-adapter.js +172 -0
- package/lib/checks/compose.js +42 -0
- package/lib/checks/content-match.js +14 -0
- package/lib/checks/cost-budget.js +11 -0
- package/lib/checks/index.js +18 -0
- package/lib/checks/json-valid.js +15 -0
- package/lib/checks/latency.js +11 -0
- package/lib/checks/length-bounds.js +17 -0
- package/lib/checks/negative-match.js +14 -0
- package/lib/checks/no-hallucinated-numbers.js +63 -0
- package/lib/checks/non-empty.js +34 -0
- package/lib/checks/regex-match.js +12 -0
- package/lib/checks/run-checks.js +84 -0
- package/lib/checks/schema-match.js +26 -0
- package/lib/checks/tool-call-count.js +16 -0
- package/lib/checks/tool-selection.js +34 -0
- package/lib/checks/types.js +45 -0
- package/lib/comparison/compare.js +86 -0
- package/lib/comparison/format.js +104 -0
- package/lib/comparison/index.js +6 -0
- package/lib/comparison/statistics.js +59 -0
- package/lib/comparison/types.js +41 -0
- package/lib/config-schema.js +200 -0
- package/lib/config.d.ts +66 -0
- package/lib/conversation-store.d.ts +77 -0
- package/lib/conversation-store.js +443 -0
- package/lib/db.d.ts +6 -0
- package/lib/db.js +1112 -0
- package/lib/dep-check.js +99 -0
- package/lib/drift-background.js +61 -0
- package/lib/drift-monitor.js +187 -0
- package/lib/eval-runner.js +566 -0
- package/lib/fixtures/fixture-store.js +161 -0
- package/lib/fixtures/index.js +11 -0
- package/lib/forge-engine.js +982 -0
- package/lib/forge-eval-generator.js +417 -0
- package/lib/forge-file-writer.js +386 -0
- package/lib/forge-service-client.js +190 -0
- package/lib/forge-service.d.ts +4 -0
- package/lib/forge-service.js +655 -0
- package/lib/forge-verifier-generator.js +271 -0
- package/lib/handlers/admin.js +151 -0
- package/lib/handlers/agents.js +229 -0
- package/lib/handlers/chat-resume.js +334 -0
- package/lib/handlers/chat-sync.js +320 -0
- package/lib/handlers/chat.js +320 -0
- package/lib/handlers/conversations.js +92 -0
- package/lib/handlers/preferences.js +88 -0
- package/lib/handlers/tools-list.js +58 -0
- package/lib/hitl-engine.d.ts +60 -0
- package/lib/hitl-engine.js +261 -0
- package/lib/http-utils.js +92 -0
- package/lib/index.d.ts +20 -0
- package/lib/index.js +141 -0
- package/lib/init.js +636 -0
- package/lib/manual-entry.js +59 -0
- package/lib/mcp-server.js +252 -0
- package/lib/output-groups.js +54 -0
- package/lib/postgres-store.d.ts +31 -0
- package/lib/postgres-store.js +465 -0
- package/lib/preference-store.d.ts +47 -0
- package/lib/preference-store.js +79 -0
- package/lib/prompt-store.d.ts +42 -0
- package/lib/prompt-store.js +60 -0
- package/lib/rate-limiter.d.ts +30 -0
- package/lib/rate-limiter.js +104 -0
- package/lib/react-engine.d.ts +110 -0
- package/lib/react-engine.js +337 -0
- package/lib/runner/cli.js +156 -0
- package/lib/runner/cost-estimator.js +71 -0
- package/lib/runner/gate.js +46 -0
- package/lib/runner/index.js +165 -0
- package/lib/sidecar.d.ts +83 -0
- package/lib/sidecar.js +161 -0
- package/lib/sse.d.ts +15 -0
- package/lib/sse.js +30 -0
- package/lib/tools-scanner.js +91 -0
- package/lib/tui.js +253 -0
- package/lib/verifier-report.js +78 -0
- package/lib/verifier-runner.js +338 -0
- package/lib/verifier-scanner.js +70 -0
- package/lib/verifier-worker-pool.js +196 -0
- package/lib/views/chat.js +340 -0
- package/lib/views/endpoints.js +203 -0
- package/lib/views/eval-run.js +206 -0
- package/lib/views/forge-agent.js +538 -0
- package/lib/views/forge.js +410 -0
- package/lib/views/main-menu.js +275 -0
- package/lib/views/mediation.js +381 -0
- package/lib/views/model-compare.js +430 -0
- package/lib/views/model-comparison.js +333 -0
- package/lib/views/onboarding.js +470 -0
- package/lib/views/performance.js +237 -0
- package/lib/views/run-evals.js +205 -0
- package/lib/views/settings.js +829 -0
- package/lib/views/tools-evals.js +514 -0
- package/lib/views/verifier-coverage.js +617 -0
- package/lib/workers/verifier-worker.js +52 -0
- package/package.json +123 -0
- package/widget/forge-chat.js +789 -0
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Forge Eval Generator — generates golden and labeled eval JSON via LLM.
|
|
3
|
+
*
|
|
4
|
+
* Does NOT write files — returns content and computed paths so the caller
|
|
5
|
+
* (forge.js) can preview and confirm before writing.
|
|
6
|
+
*
|
|
7
|
+
* @module forge-eval-generator
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import path from 'node:path';

import { llmTurn } from './api-client.js';
|
|
11
|
+
|
|
12
|
+
// ── JSON array extraction ──────────────────────────────────────────────────
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* Extract a JSON array from raw LLM response text.
|
|
16
|
+
* Tries ```json...``` fenced block first, then falls back to first `[` to
|
|
17
|
+
* its matching closing `]`.
|
|
18
|
+
*
|
|
19
|
+
* @param {string} text - Raw LLM response text
|
|
20
|
+
* @returns {unknown[]} Parsed JSON array
|
|
21
|
+
* @throws {Error} If no valid JSON array can be found or parsed
|
|
22
|
+
*/
|
|
23
|
+
function extractJsonArray(text) {
|
|
24
|
+
// Strategy 1: ```json ... ``` fenced block
|
|
25
|
+
const fenceMatch = text.match(/```json\s*([\s\S]*?)\s*```/);
|
|
26
|
+
if (fenceMatch) {
|
|
27
|
+
try {
|
|
28
|
+
const parsed = JSON.parse(fenceMatch[1]);
|
|
29
|
+
if (Array.isArray(parsed)) return parsed;
|
|
30
|
+
} catch (_) {
|
|
31
|
+
// Fenced block was malformed JSON — fall through to strategy 2
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
// Strategy 2: first `[` to its matching `]`
|
|
36
|
+
const start = text.indexOf('[');
|
|
37
|
+
if (start === -1) {
|
|
38
|
+
throw new Error('No JSON array found in LLM response');
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
let depth = 0;
|
|
42
|
+
let inString = false;
|
|
43
|
+
let escape = false;
|
|
44
|
+
|
|
45
|
+
for (let i = start; i < text.length; i++) {
|
|
46
|
+
const ch = text[i];
|
|
47
|
+
|
|
48
|
+
if (escape) {
|
|
49
|
+
escape = false;
|
|
50
|
+
continue;
|
|
51
|
+
}
|
|
52
|
+
if (ch === '\\' && inString) {
|
|
53
|
+
escape = true;
|
|
54
|
+
continue;
|
|
55
|
+
}
|
|
56
|
+
if (ch === '"') {
|
|
57
|
+
inString = !inString;
|
|
58
|
+
continue;
|
|
59
|
+
}
|
|
60
|
+
if (inString) continue;
|
|
61
|
+
|
|
62
|
+
if (ch === '[') depth++;
|
|
63
|
+
else if (ch === ']') {
|
|
64
|
+
depth--;
|
|
65
|
+
if (depth === 0) {
|
|
66
|
+
return JSON.parse(text.slice(start, i + 1));
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
throw new Error('Unbalanced JSON array in LLM response');
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
// ── Case validation & normalisation ───────────────────────────────────────
|
|
75
|
+
|
|
76
|
+
/**
|
|
77
|
+
* Validate and normalise a raw array of eval cases returned by the LLM.
|
|
78
|
+
* Items missing required fields are filtered out with a warning.
|
|
79
|
+
* Missing ids are assigned sequential defaults.
|
|
80
|
+
*
|
|
81
|
+
* @param {unknown[]} items - Raw parsed array from LLM
|
|
82
|
+
* @param {string} toolName - Used for default id generation
|
|
83
|
+
* @param {'golden'|'labeled'} kind
|
|
84
|
+
* @returns {object[]} Validated EvalCase array
|
|
85
|
+
*/
|
|
86
|
+
function validateAndNormaliseCases(items, toolName, kind) {
|
|
87
|
+
if (!Array.isArray(items)) {
|
|
88
|
+
throw new Error('LLM response did not parse to an array');
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
const prefix = kind === 'golden'
|
|
92
|
+
? `${toolName}_golden`
|
|
93
|
+
: `${toolName}_labeled`;
|
|
94
|
+
|
|
95
|
+
const valid = [];
|
|
96
|
+
let counter = 1;
|
|
97
|
+
|
|
98
|
+
for (const item of items) {
|
|
99
|
+
if (!item || typeof item !== 'object') continue;
|
|
100
|
+
|
|
101
|
+
// Required: input.message
|
|
102
|
+
if (!item.input || typeof item.input.message !== 'string' || item.input.message.trim() === '') {
|
|
103
|
+
continue;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
// Assign default id if missing or blank
|
|
107
|
+
if (typeof item.id !== 'string' || item.id.trim() === '') {
|
|
108
|
+
item.id = `${prefix}_${String(counter).padStart(3, '0')}`;
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
// Assign default description if missing
|
|
112
|
+
if (typeof item.description !== 'string' || item.description.trim() === '') {
|
|
113
|
+
item.description = `${kind} case ${counter}`;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
// Assign default difficulty if missing
|
|
117
|
+
if (typeof item.difficulty !== 'string' || item.difficulty.trim() === '') {
|
|
118
|
+
item.difficulty = 'easy';
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
counter++;
|
|
122
|
+
valid.push(item);
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
return valid;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
// ── Prompt builders ────────────────────────────────────────────────────────
|
|
129
|
+
|
|
130
|
+
/**
|
|
131
|
+
* Build the LLM prompt for golden eval cases.
|
|
132
|
+
*
|
|
133
|
+
* @param {object} spec - Tool specification
|
|
134
|
+
* @param {{ total: number }} [mix] - Golden mix config
|
|
135
|
+
* @returns {string}
|
|
136
|
+
*/
|
|
137
|
+
function buildGoldenPrompt(spec, mix) {
|
|
138
|
+
const triggers = Array.isArray(spec.triggerPhrases) && spec.triggerPhrases.length
|
|
139
|
+
? spec.triggerPhrases.join(', ')
|
|
140
|
+
: '(none provided)';
|
|
141
|
+
|
|
142
|
+
const count = mix?.total || 8;
|
|
143
|
+
|
|
144
|
+
return `Generate exactly ${count} golden eval cases for the tool '${spec.name}'.
|
|
145
|
+
Description: ${spec.description}
|
|
146
|
+
Trigger phrases: ${triggers}
|
|
147
|
+
|
|
148
|
+
Each case should have a natural user message that would trigger this tool.
|
|
149
|
+
Golden cases must have toolsCalled: ["${spec.name}"].
|
|
150
|
+
|
|
151
|
+
Include a variety of phrasings — direct questions, rephrased requests, casual wording.
|
|
152
|
+
Also include one case testing that raw JSON/internals are not leaked in the response.
|
|
153
|
+
|
|
154
|
+
Return a JSON array of eval cases. Each case must have this shape:
|
|
155
|
+
{
|
|
156
|
+
"id": "${spec.name}_golden_001",
|
|
157
|
+
"description": "brief description of the case",
|
|
158
|
+
"difficulty": "easy" | "medium" | "hard",
|
|
159
|
+
"input": { "message": "<user message>" },
|
|
160
|
+
"expect": {
|
|
161
|
+
"toolsCalled": ["${spec.name}"],
|
|
162
|
+
"noToolErrors": true,
|
|
163
|
+
"responseNonEmpty": true
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
Respond ONLY with the JSON array — no prose, no markdown outside the JSON block.`;
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
/**
|
|
171
|
+
* Build the LLM prompt for labeled eval cases.
|
|
172
|
+
*
|
|
173
|
+
* @param {object} spec - Tool specification
|
|
174
|
+
* @param {Array<{name:string, description:string}>} allTools - All tools in registry
|
|
175
|
+
* @param {{ straightforward: number, ambiguous: number, edge: number, adversarial: number }} [mix]
|
|
176
|
+
* @returns {string}
|
|
177
|
+
*/
|
|
178
|
+
function buildLabeledPrompt(spec, allTools, mix) {
|
|
179
|
+
const toolsListing = allTools.length
|
|
180
|
+
? allTools.map((t) => `${t.name}: ${t.description}`).join('\n')
|
|
181
|
+
: `${spec.name}: ${spec.description}`;
|
|
182
|
+
|
|
183
|
+
const straight = mix?.straightforward ?? 3;
|
|
184
|
+
const ambiguous = mix?.ambiguous ?? 3;
|
|
185
|
+
const edge = mix?.edge ?? 2;
|
|
186
|
+
const adversarial = mix?.adversarial ?? 2;
|
|
187
|
+
const total = straight + ambiguous + edge + adversarial;
|
|
188
|
+
|
|
189
|
+
return `Generate exactly ${total} labeled eval cases for '${spec.name}' vs other tools.
|
|
190
|
+
All tools:
|
|
191
|
+
${toolsListing}
|
|
192
|
+
|
|
193
|
+
Labeled cases test disambiguation — when the user's intent might match multiple tools or no tool.
|
|
194
|
+
Required distribution:
|
|
195
|
+
- ${straight} straightforward cases (clear intent, single tool)
|
|
196
|
+
- ${ambiguous} ambiguous cases (multiple tools could apply)
|
|
197
|
+
- ${edge} edge cases (prompt injection, off-topic, or general knowledge — use ["__none__"])
|
|
198
|
+
- ${adversarial} adversarial cases (attempts to trick or misuse the tool)
|
|
199
|
+
|
|
200
|
+
Each case has expect.toolsAcceptable (array of acceptable tool-name arrays).
|
|
201
|
+
Use ["__none__"] for cases where no tool should be called.
|
|
202
|
+
|
|
203
|
+
Return a JSON array of eval cases. Each case must have this shape:
|
|
204
|
+
{
|
|
205
|
+
"id": "${spec.name}_labeled_001",
|
|
206
|
+
"description": "brief description of the case",
|
|
207
|
+
"difficulty": "straightforward" | "ambiguous" | "edge" | "adversarial",
|
|
208
|
+
"input": { "message": "<user message>" },
|
|
209
|
+
"expect": {
|
|
210
|
+
"toolsAcceptable": [["tool_a"], ["tool_a", "tool_b"]],
|
|
211
|
+
"noToolErrors": true,
|
|
212
|
+
"responseNonEmpty": true
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
Respond ONLY with the JSON array — no prose, no markdown outside the JSON block.`;
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
// ── LLM call with retry ────────────────────────────────────────────────────
|
|
220
|
+
|
|
221
|
+
/**
|
|
222
|
+
* Call the LLM and extract a valid JSON array of eval cases.
|
|
223
|
+
* Retries up to MAX_RETRIES times with corrective nudges.
|
|
224
|
+
*
|
|
225
|
+
* @param {object} opts
|
|
226
|
+
* @param {object} opts.modelConfig - { provider, apiKey, model }
|
|
227
|
+
* @param {string} opts.prompt - User-turn prompt
|
|
228
|
+
* @param {string} opts.toolName - For id assignment
|
|
229
|
+
* @param {'golden'|'labeled'} opts.kind
|
|
230
|
+
* @param {number} [opts.maxRetries]
|
|
231
|
+
* @returns {Promise<object[]>}
|
|
232
|
+
*/
|
|
233
|
+
async function callLlmForCases({ modelConfig, prompt, toolName, kind, maxRetries = 2 }) {
|
|
234
|
+
const messages = [{ role: 'user', content: prompt }];
|
|
235
|
+
let lastError;
|
|
236
|
+
|
|
237
|
+
for (let attempt = 1; attempt <= maxRetries; attempt++) {
|
|
238
|
+
let responseText;
|
|
239
|
+
|
|
240
|
+
try {
|
|
241
|
+
const turn = await llmTurn({
|
|
242
|
+
provider: modelConfig.provider,
|
|
243
|
+
apiKey: modelConfig.apiKey,
|
|
244
|
+
model: modelConfig.model,
|
|
245
|
+
messages,
|
|
246
|
+
maxTokens: 4096,
|
|
247
|
+
timeoutMs: 90_000
|
|
248
|
+
});
|
|
249
|
+
responseText = turn.text;
|
|
250
|
+
} catch (err) {
|
|
251
|
+
throw new Error(
|
|
252
|
+
`LLM API call failed while generating ${kind} evals for "${toolName}": ${err.message}`
|
|
253
|
+
);
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
if (!responseText || responseText.trim() === '') {
|
|
257
|
+
lastError = new Error(
|
|
258
|
+
`LLM returned an empty response on attempt ${attempt}/${maxRetries}`
|
|
259
|
+
);
|
|
260
|
+
messages.push({ role: 'assistant', content: responseText || '' });
|
|
261
|
+
messages.push({
|
|
262
|
+
role: 'user',
|
|
263
|
+
content:
|
|
264
|
+
'Your response was empty. Please respond with ONLY a JSON array of eval cases.'
|
|
265
|
+
});
|
|
266
|
+
continue;
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
let parsed;
|
|
270
|
+
try {
|
|
271
|
+
parsed = extractJsonArray(responseText);
|
|
272
|
+
} catch (parseErr) {
|
|
273
|
+
lastError = new Error(
|
|
274
|
+
`Attempt ${attempt}/${maxRetries}: Could not extract JSON array from LLM response — ` +
|
|
275
|
+
parseErr.message +
|
|
276
|
+
`\nRaw response (first 300 chars): ${responseText.slice(0, 300)}`
|
|
277
|
+
);
|
|
278
|
+
messages.push({ role: 'assistant', content: responseText });
|
|
279
|
+
messages.push({
|
|
280
|
+
role: 'user',
|
|
281
|
+
content:
|
|
282
|
+
'Your previous response did not contain a valid JSON array. ' +
|
|
283
|
+
'Respond ONLY with a JSON array of eval case objects. ' +
|
|
284
|
+
'Do not include any text outside the JSON.'
|
|
285
|
+
});
|
|
286
|
+
continue;
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
let validated;
|
|
290
|
+
try {
|
|
291
|
+
validated = validateAndNormaliseCases(parsed, toolName, kind);
|
|
292
|
+
} catch (validErr) {
|
|
293
|
+
lastError = new Error(
|
|
294
|
+
`Attempt ${attempt}/${maxRetries}: Eval case validation failed — ${validErr.message}`
|
|
295
|
+
);
|
|
296
|
+
messages.push({ role: 'assistant', content: responseText });
|
|
297
|
+
messages.push({
|
|
298
|
+
role: 'user',
|
|
299
|
+
content:
|
|
300
|
+
`The array you returned was invalid: ${validErr.message}. ` +
|
|
301
|
+
'Please provide a JSON array where each item has at minimum ' +
|
|
302
|
+
'"id", "description", "input" (with "message"), and "expect".'
|
|
303
|
+
});
|
|
304
|
+
continue;
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
if (validated.length === 0) {
|
|
308
|
+
lastError = new Error(
|
|
309
|
+
`Attempt ${attempt}/${maxRetries}: LLM returned an array but no valid eval cases were found`
|
|
310
|
+
);
|
|
311
|
+
messages.push({ role: 'assistant', content: responseText });
|
|
312
|
+
messages.push({
|
|
313
|
+
role: 'user',
|
|
314
|
+
content:
|
|
315
|
+
'None of the items in your array had the required shape. ' +
|
|
316
|
+
'Each item needs at minimum an "input" object with a "message" string field.'
|
|
317
|
+
});
|
|
318
|
+
continue;
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
return validated;
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
throw new Error(
|
|
325
|
+
`generateEvals: failed to obtain valid ${kind} eval cases for "${toolName}" ` +
|
|
326
|
+
`after ${maxRetries} attempts. Last error: ${lastError?.message}`
|
|
327
|
+
);
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
// ── Main export ────────────────────────────────────────────────────────────
|
|
331
|
+
|
|
332
|
+
/**
|
|
333
|
+
* Generate golden and labeled eval JSON via LLM.
|
|
334
|
+
*
|
|
335
|
+
* Does NOT write files. Returns the case arrays and their intended paths so
|
|
336
|
+
* the caller (forge.js) can preview and confirm before writing.
|
|
337
|
+
*
|
|
338
|
+
* @param {object} opts
|
|
339
|
+
* @param {object} opts.spec - Tool specification
|
|
340
|
+
* @param {string} opts.spec.name - Snake_case tool name
|
|
341
|
+
* @param {string} opts.spec.description - Human-readable description
|
|
342
|
+
* @param {string[]} [opts.spec.triggerPhrases]
|
|
343
|
+
* @param {Array<{name:string, description:string}>} [opts.allTools]
|
|
344
|
+
* All tools in the registry (used to generate disambiguation labeled cases).
|
|
345
|
+
* Defaults to a single-entry list containing the spec itself if omitted.
|
|
346
|
+
* @param {object} opts.projectConfig - forge.config.json contents
|
|
347
|
+
* @param {string} opts.projectRoot - Absolute path to project root
|
|
348
|
+
* @param {object} opts.modelConfig - { provider, apiKey, model }
|
|
349
|
+
* @param {string} opts.modelConfig.provider - 'anthropic' | 'openai'
|
|
350
|
+
* @param {string} opts.modelConfig.apiKey
|
|
351
|
+
* @param {string} opts.modelConfig.model
|
|
352
|
+
*
|
|
353
|
+
* @returns {Promise<{
|
|
354
|
+
* goldenCases: object[],
|
|
355
|
+
* labeledCases: object[],
|
|
356
|
+
* goldenPath: string,
|
|
357
|
+
* labeledPath: string
|
|
358
|
+
* }>}
|
|
359
|
+
*
|
|
360
|
+
* @throws {Error} If LLM returns invalid content after retries
|
|
361
|
+
*/
|
|
362
|
+
export async function generateEvals({
|
|
363
|
+
spec,
|
|
364
|
+
allTools = [],
|
|
365
|
+
projectConfig,
|
|
366
|
+
projectRoot,
|
|
367
|
+
modelConfig,
|
|
368
|
+
evalMix
|
|
369
|
+
}) {
|
|
370
|
+
const evalsDir = projectConfig?.project?.evalsDir || 'docs/examples';
|
|
371
|
+
|
|
372
|
+
// Resolve absolute evals directory for path construction
|
|
373
|
+
const absEvalsDir = evalsDir.startsWith('/')
|
|
374
|
+
? evalsDir
|
|
375
|
+
: `${projectRoot}/${evalsDir}`;
|
|
376
|
+
|
|
377
|
+
const goldenPath = `${absEvalsDir}/${spec.name}.golden.json`;
|
|
378
|
+
const labeledPath = `${absEvalsDir}/${spec.name}.labeled.json`;
|
|
379
|
+
|
|
380
|
+
// Ensure allTools includes at least the spec tool itself
|
|
381
|
+
const toolsForLabeled = allTools.length
|
|
382
|
+
? allTools
|
|
383
|
+
: [{ name: spec.name, description: spec.description }];
|
|
384
|
+
|
|
385
|
+
// Resolve mix — prefer explicit arg, then spec.evalMix, then config default, then hardcoded
|
|
386
|
+
const DEFAULT_MIX = {
|
|
387
|
+
golden: { total: 10 },
|
|
388
|
+
labeled: { straightforward: 3, ambiguous: 3, edge: 2, adversarial: 2 }
|
|
389
|
+
};
|
|
390
|
+
const configMix = projectConfig?.evals?.defaultMix;
|
|
391
|
+
const resolvedMix = evalMix || spec.evalMix || configMix || DEFAULT_MIX;
|
|
392
|
+
|
|
393
|
+
// ── Generate golden cases ──────────────────────────────────────────────
|
|
394
|
+
const goldenPrompt = buildGoldenPrompt(spec, resolvedMix.golden);
|
|
395
|
+
const goldenCases = await callLlmForCases({
|
|
396
|
+
modelConfig,
|
|
397
|
+
prompt: goldenPrompt,
|
|
398
|
+
toolName: spec.name,
|
|
399
|
+
kind: 'golden'
|
|
400
|
+
});
|
|
401
|
+
|
|
402
|
+
// ── Generate labeled cases ─────────────────────────────────────────────
|
|
403
|
+
const labeledPrompt = buildLabeledPrompt(spec, toolsForLabeled, resolvedMix.labeled);
|
|
404
|
+
const labeledCases = await callLlmForCases({
|
|
405
|
+
modelConfig,
|
|
406
|
+
prompt: labeledPrompt,
|
|
407
|
+
toolName: spec.name,
|
|
408
|
+
kind: 'labeled'
|
|
409
|
+
});
|
|
410
|
+
|
|
411
|
+
return {
|
|
412
|
+
goldenCases,
|
|
413
|
+
labeledCases,
|
|
414
|
+
goldenPath,
|
|
415
|
+
labeledPath
|
|
416
|
+
};
|
|
417
|
+
}
|