agent-tool-forge 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +209 -0
  3. package/lib/agent-registry.js +170 -0
  4. package/lib/api-client.js +792 -0
  5. package/lib/api-loader.js +260 -0
  6. package/lib/auth.d.ts +25 -0
  7. package/lib/auth.js +158 -0
  8. package/lib/checks/check-adapter.js +172 -0
  9. package/lib/checks/compose.js +42 -0
  10. package/lib/checks/content-match.js +14 -0
  11. package/lib/checks/cost-budget.js +11 -0
  12. package/lib/checks/index.js +18 -0
  13. package/lib/checks/json-valid.js +15 -0
  14. package/lib/checks/latency.js +11 -0
  15. package/lib/checks/length-bounds.js +17 -0
  16. package/lib/checks/negative-match.js +14 -0
  17. package/lib/checks/no-hallucinated-numbers.js +63 -0
  18. package/lib/checks/non-empty.js +34 -0
  19. package/lib/checks/regex-match.js +12 -0
  20. package/lib/checks/run-checks.js +84 -0
  21. package/lib/checks/schema-match.js +26 -0
  22. package/lib/checks/tool-call-count.js +16 -0
  23. package/lib/checks/tool-selection.js +34 -0
  24. package/lib/checks/types.js +45 -0
  25. package/lib/comparison/compare.js +86 -0
  26. package/lib/comparison/format.js +104 -0
  27. package/lib/comparison/index.js +6 -0
  28. package/lib/comparison/statistics.js +59 -0
  29. package/lib/comparison/types.js +41 -0
  30. package/lib/config-schema.js +200 -0
  31. package/lib/config.d.ts +66 -0
  32. package/lib/conversation-store.d.ts +77 -0
  33. package/lib/conversation-store.js +443 -0
  34. package/lib/db.d.ts +6 -0
  35. package/lib/db.js +1112 -0
  36. package/lib/dep-check.js +99 -0
  37. package/lib/drift-background.js +61 -0
  38. package/lib/drift-monitor.js +187 -0
  39. package/lib/eval-runner.js +566 -0
  40. package/lib/fixtures/fixture-store.js +161 -0
  41. package/lib/fixtures/index.js +11 -0
  42. package/lib/forge-engine.js +982 -0
  43. package/lib/forge-eval-generator.js +417 -0
  44. package/lib/forge-file-writer.js +386 -0
  45. package/lib/forge-service-client.js +190 -0
  46. package/lib/forge-service.d.ts +4 -0
  47. package/lib/forge-service.js +655 -0
  48. package/lib/forge-verifier-generator.js +271 -0
  49. package/lib/handlers/admin.js +151 -0
  50. package/lib/handlers/agents.js +229 -0
  51. package/lib/handlers/chat-resume.js +334 -0
  52. package/lib/handlers/chat-sync.js +320 -0
  53. package/lib/handlers/chat.js +320 -0
  54. package/lib/handlers/conversations.js +92 -0
  55. package/lib/handlers/preferences.js +88 -0
  56. package/lib/handlers/tools-list.js +58 -0
  57. package/lib/hitl-engine.d.ts +60 -0
  58. package/lib/hitl-engine.js +261 -0
  59. package/lib/http-utils.js +92 -0
  60. package/lib/index.d.ts +20 -0
  61. package/lib/index.js +141 -0
  62. package/lib/init.js +636 -0
  63. package/lib/manual-entry.js +59 -0
  64. package/lib/mcp-server.js +252 -0
  65. package/lib/output-groups.js +54 -0
  66. package/lib/postgres-store.d.ts +31 -0
  67. package/lib/postgres-store.js +465 -0
  68. package/lib/preference-store.d.ts +47 -0
  69. package/lib/preference-store.js +79 -0
  70. package/lib/prompt-store.d.ts +42 -0
  71. package/lib/prompt-store.js +60 -0
  72. package/lib/rate-limiter.d.ts +30 -0
  73. package/lib/rate-limiter.js +104 -0
  74. package/lib/react-engine.d.ts +110 -0
  75. package/lib/react-engine.js +337 -0
  76. package/lib/runner/cli.js +156 -0
  77. package/lib/runner/cost-estimator.js +71 -0
  78. package/lib/runner/gate.js +46 -0
  79. package/lib/runner/index.js +165 -0
  80. package/lib/sidecar.d.ts +83 -0
  81. package/lib/sidecar.js +161 -0
  82. package/lib/sse.d.ts +15 -0
  83. package/lib/sse.js +30 -0
  84. package/lib/tools-scanner.js +91 -0
  85. package/lib/tui.js +253 -0
  86. package/lib/verifier-report.js +78 -0
  87. package/lib/verifier-runner.js +338 -0
  88. package/lib/verifier-scanner.js +70 -0
  89. package/lib/verifier-worker-pool.js +196 -0
  90. package/lib/views/chat.js +340 -0
  91. package/lib/views/endpoints.js +203 -0
  92. package/lib/views/eval-run.js +206 -0
  93. package/lib/views/forge-agent.js +538 -0
  94. package/lib/views/forge.js +410 -0
  95. package/lib/views/main-menu.js +275 -0
  96. package/lib/views/mediation.js +381 -0
  97. package/lib/views/model-compare.js +430 -0
  98. package/lib/views/model-comparison.js +333 -0
  99. package/lib/views/onboarding.js +470 -0
  100. package/lib/views/performance.js +237 -0
  101. package/lib/views/run-evals.js +205 -0
  102. package/lib/views/settings.js +829 -0
  103. package/lib/views/tools-evals.js +514 -0
  104. package/lib/views/verifier-coverage.js +617 -0
  105. package/lib/workers/verifier-worker.js +52 -0
  106. package/package.json +123 -0
  107. package/widget/forge-chat.js +789 -0
@@ -0,0 +1,417 @@
1
+ /**
2
+ * Forge Eval Generator — generates golden and labeled eval JSON via LLM.
3
+ *
4
+ * Does NOT write files — returns content and computed paths so the caller
5
+ * (forge.js) can preview and confirm before writing.
6
+ *
7
+ * @module forge-eval-generator
8
+ */
9
+
10
+ import { llmTurn } from './api-client.js';
11
+
12
+ // ── JSON array extraction ──────────────────────────────────────────────────
13
+
14
/**
 * Extract a JSON array from raw LLM response text.
 * Tries a ```json...``` fenced block first, then falls back to scanning
 * from the first `[` to its matching closing `]`.
 *
 * @param {string} text - Raw LLM response text
 * @returns {unknown[]} Parsed JSON array
 * @throws {Error} If no valid JSON array can be found or parsed
 */
function extractJsonArray(text) {
  // Attempt 1: a ```json ... ``` fenced block.
  const fenced = text.match(/```json\s*([\s\S]*?)\s*```/);
  if (fenced) {
    try {
      const candidate = JSON.parse(fenced[1]);
      if (Array.isArray(candidate)) return candidate;
    } catch (_) {
      // Fence contents were malformed JSON — fall back to bracket scanning.
    }
  }

  // Attempt 2: scan from the first `[` to its balanced `]`, ignoring
  // bracket characters that appear inside JSON string literals.
  const open = text.indexOf('[');
  if (open === -1) {
    throw new Error('No JSON array found in LLM response');
  }

  let nesting = 0;
  let withinString = false;
  let pendingEscape = false;
  let pos = open;

  while (pos < text.length) {
    const c = text[pos];

    if (pendingEscape) {
      pendingEscape = false;
    } else if (c === '\\' && withinString) {
      pendingEscape = true;
    } else if (c === '"') {
      withinString = !withinString;
    } else if (!withinString) {
      if (c === '[') {
        nesting += 1;
      } else if (c === ']') {
        nesting -= 1;
        if (nesting === 0) {
          return JSON.parse(text.slice(open, pos + 1));
        }
      }
    }

    pos += 1;
  }

  throw new Error('Unbalanced JSON array in LLM response');
}
73
+
74
+ // ── Case validation & normalisation ───────────────────────────────────────
75
+
76
/**
 * Validate and normalise a raw array of eval cases returned by the LLM.
 *
 * Items that are not objects, or that lack a non-empty `input.message`
 * string, are silently dropped. Surviving items are shallow-copied — the
 * caller's objects are never mutated — and missing/blank ids, descriptions
 * and difficulties are filled with sequential defaults.
 *
 * @param {unknown[]} items - Raw parsed array from LLM
 * @param {string} toolName - Used for default id generation
 * @param {'golden'|'labeled'} kind
 * @returns {object[]} Validated EvalCase array
 * @throws {Error} If `items` is not an array
 */
function validateAndNormaliseCases(items, toolName, kind) {
  if (!Array.isArray(items)) {
    throw new Error('LLM response did not parse to an array');
  }

  const prefix = kind === 'golden'
    ? `${toolName}_golden`
    : `${toolName}_labeled`;

  const valid = [];
  let counter = 1;

  for (const item of items) {
    if (!item || typeof item !== 'object') continue;

    // Required: input.message must be a non-empty string.
    if (!item.input || typeof item.input.message !== 'string' || item.input.message.trim() === '') {
      continue;
    }

    // Work on a shallow copy so the caller's array is left untouched.
    const normalised = { ...item };

    // Assign default id if missing or blank
    if (typeof normalised.id !== 'string' || normalised.id.trim() === '') {
      normalised.id = `${prefix}_${String(counter).padStart(3, '0')}`;
    }

    // Assign default description if missing
    if (typeof normalised.description !== 'string' || normalised.description.trim() === '') {
      normalised.description = `${kind} case ${counter}`;
    }

    // Assign default difficulty if missing
    if (typeof normalised.difficulty !== 'string' || normalised.difficulty.trim() === '') {
      normalised.difficulty = 'easy';
    }

    counter++;
    valid.push(normalised);
  }

  return valid;
}
127
+
128
+ // ── Prompt builders ────────────────────────────────────────────────────────
129
+
130
/**
 * Build the LLM prompt for golden eval cases.
 *
 * @param {object} spec - Tool specification
 * @param {{ total: number }} [mix] - Golden mix config
 * @returns {string}
 */
function buildGoldenPrompt(spec, mix) {
  const caseCount = mix?.total || 8;

  let triggers = '(none provided)';
  if (Array.isArray(spec.triggerPhrases) && spec.triggerPhrases.length) {
    triggers = spec.triggerPhrases.join(', ');
  }

  return `Generate exactly ${caseCount} golden eval cases for the tool '${spec.name}'.
Description: ${spec.description}
Trigger phrases: ${triggers}

Each case should have a natural user message that would trigger this tool.
Golden cases must have toolsCalled: ["${spec.name}"].

Include a variety of phrasings — direct questions, rephrased requests, casual wording.
Also include one case testing that raw JSON/internals are not leaked in the response.

Return a JSON array of eval cases. Each case must have this shape:
{
  "id": "${spec.name}_golden_001",
  "description": "brief description of the case",
  "difficulty": "easy" | "medium" | "hard",
  "input": { "message": "<user message>" },
  "expect": {
    "toolsCalled": ["${spec.name}"],
    "noToolErrors": true,
    "responseNonEmpty": true
  }
}

Respond ONLY with the JSON array — no prose, no markdown outside the JSON block.`;
}
169
+
170
/**
 * Build the LLM prompt for labeled eval cases.
 *
 * @param {object} spec - Tool specification
 * @param {Array<{name:string, description:string}>} allTools - All tools in registry
 * @param {{ straightforward: number, ambiguous: number, edge: number, adversarial: number }} [mix]
 * @returns {string}
 */
function buildLabeledPrompt(spec, allTools, mix) {
  // One "name: description" line per registered tool; fall back to the
  // spec tool alone when the registry listing is empty.
  const listingLines = allTools.map((tool) => `${tool.name}: ${tool.description}`);
  const toolsListing = listingLines.length
    ? listingLines.join('\n')
    : `${spec.name}: ${spec.description}`;

  const straightCount = mix?.straightforward ?? 3;
  const ambiguousCount = mix?.ambiguous ?? 3;
  const edgeCount = mix?.edge ?? 2;
  const adversarialCount = mix?.adversarial ?? 2;
  const totalCount = straightCount + ambiguousCount + edgeCount + adversarialCount;

  return `Generate exactly ${totalCount} labeled eval cases for '${spec.name}' vs other tools.
All tools:
${toolsListing}

Labeled cases test disambiguation — when the user's intent might match multiple tools or no tool.
Required distribution:
- ${straightCount} straightforward cases (clear intent, single tool)
- ${ambiguousCount} ambiguous cases (multiple tools could apply)
- ${edgeCount} edge cases (prompt injection, off-topic, or general knowledge — use ["__none__"])
- ${adversarialCount} adversarial cases (attempts to trick or misuse the tool)

Each case has expect.toolsAcceptable (array of acceptable tool-name arrays).
Use ["__none__"] for cases where no tool should be called.

Return a JSON array of eval cases. Each case must have this shape:
{
  "id": "${spec.name}_labeled_001",
  "description": "brief description of the case",
  "difficulty": "straightforward" | "ambiguous" | "edge" | "adversarial",
  "input": { "message": "<user message>" },
  "expect": {
    "toolsAcceptable": [["tool_a"], ["tool_a", "tool_b"]],
    "noToolErrors": true,
    "responseNonEmpty": true
  }
}

Respond ONLY with the JSON array — no prose, no markdown outside the JSON block.`;
}
218
+
219
+ // ── LLM call with retry ────────────────────────────────────────────────────
220
+
221
/**
 * Call the LLM and extract a valid JSON array of eval cases.
 * Makes up to `maxRetries` attempts, appending a corrective nudge to the
 * conversation after each failed attempt.
 *
 * @param {object} opts
 * @param {object} opts.modelConfig - { provider, apiKey, model }
 * @param {string} opts.prompt - User-turn prompt
 * @param {string} opts.toolName - For id assignment
 * @param {'golden'|'labeled'} opts.kind
 * @param {number} [opts.maxRetries]
 * @returns {Promise<object[]>}
 * @throws {Error} If the API call fails, or no valid cases after all attempts
 */
async function callLlmForCases({ modelConfig, prompt, toolName, kind, maxRetries = 2 }) {
  const conversation = [{ role: 'user', content: prompt }];
  let lastError;

  // Record the failed assistant turn plus a corrective user message so the
  // next attempt can self-correct.
  const nudge = (assistantText, correction) => {
    conversation.push({ role: 'assistant', content: assistantText });
    conversation.push({ role: 'user', content: correction });
  };

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    let raw;

    try {
      const turn = await llmTurn({
        provider: modelConfig.provider,
        apiKey: modelConfig.apiKey,
        model: modelConfig.model,
        messages: conversation,
        maxTokens: 4096,
        timeoutMs: 90_000
      });
      raw = turn.text;
    } catch (err) {
      // Transport/API failures are not retried — surface them immediately.
      throw new Error(
        `LLM API call failed while generating ${kind} evals for "${toolName}": ${err.message}`
      );
    }

    if (!raw || raw.trim() === '') {
      lastError = new Error(
        `LLM returned an empty response on attempt ${attempt}/${maxRetries}`
      );
      nudge(
        raw || '',
        'Your response was empty. Please respond with ONLY a JSON array of eval cases.'
      );
      continue;
    }

    let parsed;
    try {
      parsed = extractJsonArray(raw);
    } catch (parseErr) {
      lastError = new Error(
        `Attempt ${attempt}/${maxRetries}: Could not extract JSON array from LLM response — ` +
          parseErr.message +
          `\nRaw response (first 300 chars): ${raw.slice(0, 300)}`
      );
      nudge(
        raw,
        'Your previous response did not contain a valid JSON array. ' +
          'Respond ONLY with a JSON array of eval case objects. ' +
          'Do not include any text outside the JSON.'
      );
      continue;
    }

    let cases;
    try {
      cases = validateAndNormaliseCases(parsed, toolName, kind);
    } catch (validErr) {
      lastError = new Error(
        `Attempt ${attempt}/${maxRetries}: Eval case validation failed — ${validErr.message}`
      );
      nudge(
        raw,
        `The array you returned was invalid: ${validErr.message}. ` +
          'Please provide a JSON array where each item has at minimum ' +
          '"id", "description", "input" (with "message"), and "expect".'
      );
      continue;
    }

    if (cases.length === 0) {
      lastError = new Error(
        `Attempt ${attempt}/${maxRetries}: LLM returned an array but no valid eval cases were found`
      );
      nudge(
        raw,
        'None of the items in your array had the required shape. ' +
          'Each item needs at minimum an "input" object with a "message" string field.'
      );
      continue;
    }

    return cases;
  }

  throw new Error(
    `generateEvals: failed to obtain valid ${kind} eval cases for "${toolName}" ` +
      `after ${maxRetries} attempts. Last error: ${lastError?.message}`
  );
}
329
+
330
+ // ── Main export ────────────────────────────────────────────────────────────
331
+
332
/**
 * Generate golden and labeled eval JSON via LLM.
 *
 * Does NOT write files. Returns the case arrays and their intended paths so
 * the caller (forge.js) can preview and confirm before writing.
 *
 * @param {object} opts
 * @param {object} opts.spec - Tool specification
 * @param {string} opts.spec.name - Snake_case tool name
 * @param {string} opts.spec.description - Human-readable description
 * @param {string[]} [opts.spec.triggerPhrases]
 * @param {Array<{name:string, description:string}>} [opts.allTools]
 *   All tools in the registry (used to generate disambiguation labeled cases).
 *   Defaults to a single-entry list containing the spec itself if omitted.
 * @param {object} opts.projectConfig - forge.config.json contents
 * @param {string} opts.projectRoot - Absolute path to project root
 * @param {object} opts.modelConfig - { provider, apiKey, model }
 * @param {string} opts.modelConfig.provider - 'anthropic' | 'openai'
 * @param {string} opts.modelConfig.apiKey
 * @param {string} opts.modelConfig.model
 * @param {object} [opts.evalMix] - Explicit mix override; takes precedence
 *   over spec.evalMix and the config default.
 *
 * @returns {Promise<{
 *   goldenCases: object[],
 *   labeledCases: object[],
 *   goldenPath: string,
 *   labeledPath: string
 * }>}
 *
 * @throws {Error} If LLM returns invalid content after retries
 */
export async function generateEvals({
  spec,
  allTools = [],
  projectConfig,
  projectRoot,
  modelConfig,
  evalMix
}) {
  const evalsDir = projectConfig?.project?.evalsDir || 'docs/examples';

  // Resolve absolute evals directory cross-platform (startsWith('/') misses
  // Windows absolute paths; path.join also normalises duplicate separators).
  const absEvalsDir = path.isAbsolute(evalsDir)
    ? evalsDir
    : path.join(projectRoot, evalsDir);

  const goldenPath = path.join(absEvalsDir, `${spec.name}.golden.json`);
  const labeledPath = path.join(absEvalsDir, `${spec.name}.labeled.json`);

  // Ensure allTools includes at least the spec tool itself
  const toolsForLabeled = allTools.length
    ? allTools
    : [{ name: spec.name, description: spec.description }];

  // Resolve mix — prefer explicit arg, then spec.evalMix, then config default, then hardcoded
  const DEFAULT_MIX = {
    golden: { total: 10 },
    labeled: { straightforward: 3, ambiguous: 3, edge: 2, adversarial: 2 }
  };
  const configMix = projectConfig?.evals?.defaultMix;
  const resolvedMix = evalMix || spec.evalMix || configMix || DEFAULT_MIX;

  // Golden and labeled generation are independent — run them in parallel.
  const [goldenCases, labeledCases] = await Promise.all([
    callLlmForCases({
      modelConfig,
      prompt: buildGoldenPrompt(spec, resolvedMix.golden),
      toolName: spec.name,
      kind: 'golden'
    }),
    callLlmForCases({
      modelConfig,
      prompt: buildLabeledPrompt(spec, toolsForLabeled, resolvedMix.labeled),
      toolName: spec.name,
      kind: 'labeled'
    })
  ]);

  return {
    goldenCases,
    labeledCases,
    goldenPath,
    labeledPath
  };
}