agent-tool-forge 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +209 -0
  3. package/lib/agent-registry.js +170 -0
  4. package/lib/api-client.js +792 -0
  5. package/lib/api-loader.js +260 -0
  6. package/lib/auth.d.ts +25 -0
  7. package/lib/auth.js +158 -0
  8. package/lib/checks/check-adapter.js +172 -0
  9. package/lib/checks/compose.js +42 -0
  10. package/lib/checks/content-match.js +14 -0
  11. package/lib/checks/cost-budget.js +11 -0
  12. package/lib/checks/index.js +18 -0
  13. package/lib/checks/json-valid.js +15 -0
  14. package/lib/checks/latency.js +11 -0
  15. package/lib/checks/length-bounds.js +17 -0
  16. package/lib/checks/negative-match.js +14 -0
  17. package/lib/checks/no-hallucinated-numbers.js +63 -0
  18. package/lib/checks/non-empty.js +34 -0
  19. package/lib/checks/regex-match.js +12 -0
  20. package/lib/checks/run-checks.js +84 -0
  21. package/lib/checks/schema-match.js +26 -0
  22. package/lib/checks/tool-call-count.js +16 -0
  23. package/lib/checks/tool-selection.js +34 -0
  24. package/lib/checks/types.js +45 -0
  25. package/lib/comparison/compare.js +86 -0
  26. package/lib/comparison/format.js +104 -0
  27. package/lib/comparison/index.js +6 -0
  28. package/lib/comparison/statistics.js +59 -0
  29. package/lib/comparison/types.js +41 -0
  30. package/lib/config-schema.js +200 -0
  31. package/lib/config.d.ts +66 -0
  32. package/lib/conversation-store.d.ts +77 -0
  33. package/lib/conversation-store.js +443 -0
  34. package/lib/db.d.ts +6 -0
  35. package/lib/db.js +1112 -0
  36. package/lib/dep-check.js +99 -0
  37. package/lib/drift-background.js +61 -0
  38. package/lib/drift-monitor.js +187 -0
  39. package/lib/eval-runner.js +566 -0
  40. package/lib/fixtures/fixture-store.js +161 -0
  41. package/lib/fixtures/index.js +11 -0
  42. package/lib/forge-engine.js +982 -0
  43. package/lib/forge-eval-generator.js +417 -0
  44. package/lib/forge-file-writer.js +386 -0
  45. package/lib/forge-service-client.js +190 -0
  46. package/lib/forge-service.d.ts +4 -0
  47. package/lib/forge-service.js +655 -0
  48. package/lib/forge-verifier-generator.js +271 -0
  49. package/lib/handlers/admin.js +151 -0
  50. package/lib/handlers/agents.js +229 -0
  51. package/lib/handlers/chat-resume.js +334 -0
  52. package/lib/handlers/chat-sync.js +320 -0
  53. package/lib/handlers/chat.js +320 -0
  54. package/lib/handlers/conversations.js +92 -0
  55. package/lib/handlers/preferences.js +88 -0
  56. package/lib/handlers/tools-list.js +58 -0
  57. package/lib/hitl-engine.d.ts +60 -0
  58. package/lib/hitl-engine.js +261 -0
  59. package/lib/http-utils.js +92 -0
  60. package/lib/index.d.ts +20 -0
  61. package/lib/index.js +141 -0
  62. package/lib/init.js +636 -0
  63. package/lib/manual-entry.js +59 -0
  64. package/lib/mcp-server.js +252 -0
  65. package/lib/output-groups.js +54 -0
  66. package/lib/postgres-store.d.ts +31 -0
  67. package/lib/postgres-store.js +465 -0
  68. package/lib/preference-store.d.ts +47 -0
  69. package/lib/preference-store.js +79 -0
  70. package/lib/prompt-store.d.ts +42 -0
  71. package/lib/prompt-store.js +60 -0
  72. package/lib/rate-limiter.d.ts +30 -0
  73. package/lib/rate-limiter.js +104 -0
  74. package/lib/react-engine.d.ts +110 -0
  75. package/lib/react-engine.js +337 -0
  76. package/lib/runner/cli.js +156 -0
  77. package/lib/runner/cost-estimator.js +71 -0
  78. package/lib/runner/gate.js +46 -0
  79. package/lib/runner/index.js +165 -0
  80. package/lib/sidecar.d.ts +83 -0
  81. package/lib/sidecar.js +161 -0
  82. package/lib/sse.d.ts +15 -0
  83. package/lib/sse.js +30 -0
  84. package/lib/tools-scanner.js +91 -0
  85. package/lib/tui.js +253 -0
  86. package/lib/verifier-report.js +78 -0
  87. package/lib/verifier-runner.js +338 -0
  88. package/lib/verifier-scanner.js +70 -0
  89. package/lib/verifier-worker-pool.js +196 -0
  90. package/lib/views/chat.js +340 -0
  91. package/lib/views/endpoints.js +203 -0
  92. package/lib/views/eval-run.js +206 -0
  93. package/lib/views/forge-agent.js +538 -0
  94. package/lib/views/forge.js +410 -0
  95. package/lib/views/main-menu.js +275 -0
  96. package/lib/views/mediation.js +381 -0
  97. package/lib/views/model-compare.js +430 -0
  98. package/lib/views/model-comparison.js +333 -0
  99. package/lib/views/onboarding.js +470 -0
  100. package/lib/views/performance.js +237 -0
  101. package/lib/views/run-evals.js +205 -0
  102. package/lib/views/settings.js +829 -0
  103. package/lib/views/tools-evals.js +514 -0
  104. package/lib/views/verifier-coverage.js +617 -0
  105. package/lib/workers/verifier-worker.js +52 -0
  106. package/package.json +123 -0
  107. package/widget/forge-chat.js +789 -0
@@ -0,0 +1,982 @@
1
+ /**
2
+ * forge-engine.js — Pure async state machine for the 12-phase tool forge dialogue (phases 0–11).
3
+ *
4
+ * No blessed dependencies. No UI. Pure logic.
5
+ * Uses ./api-client.js (llmTurn) for LLM calls.
6
+ */
7
+
8
+ import { llmTurn } from './api-client.js';
9
+
10
+ // ── Phase registry ─────────────────────────────────────────────────────────
11
+
12
/**
 * Ordered phase names of the forge dialogue. A phase's position in this
 * array is its zero-based index (see getPhaseIndex).
 */
export const PHASES = [
  // Conversational phases
  'explore', 'skeptic', 'description', 'fields', 'routing', 'deps', 'confirm',
  // Generation and follow-up phases
  'generate', 'test', 'evals', 'verifiers', 'done',
];
26
+
27
/**
 * Look up where a phase sits in the PHASES sequence.
 *
 * @param {string} phase - Phase name to locate.
 * @returns {number} Zero-based position, or -1 when the name is not in PHASES.
 */
export function getPhaseIndex(phase) {
  return PHASES.findIndex((candidate) => candidate === phase);
}
36
+
37
+ // ── Initial state factory ──────────────────────────────────────────────────
38
+
39
/**
 * Build a brand-new engine state positioned at the 'explore' phase.
 * Every call returns fresh objects and arrays, so states never share
 * mutable containers.
 *
 * @returns {object} State with an empty spec, no messages and zeroed retry bookkeeping.
 */
export function createInitialState() {
  // Nothing about the tool is known yet: scalars start as null, collections start empty.
  const blankSpec = {
    name: null,
    description: null,
    schema: null,
    category: null,
    consequenceLevel: null,
    requiresConfirmation: null,
    timeout: null,
    tags: [],
    dependsOn: [],
    triggerPhrases: [],
    endpointTarget: null,
    httpMethod: null,
    authType: null,
    paramMap: {},
    evalMix: null,
  };

  const freshState = {
    phase: 'explore',
    spec: blankSpec,
    messages: [],
    retryCount: 0,
    lastValidationError: null,
    generationId: null,
    phaseStartIdx: 0,
    skepticOverlaps: [],
    skepticOverlapSurfaced: false,
  };

  return freshState;
}
73
+
74
+ // ── Phase system prompts ───────────────────────────────────────────────────
75
+
76
/**
 * System prompts keyed by prompt name. Most keys match a phase name; the
 * skeptic and evals phases read `skepticV2` and `evalsInteractive`.
 */
const SYSTEM_PROMPTS = {
  // Open-ended discovery of what the user wants to build.
  explore:
    "You are a tool forge assistant helping design a new LLM agent tool. Ask the user what they want to build. Be curious and open. Try to understand the use case, the trigger phrase ('user says X'), and what the tool should do. Keep your response under 100 words.",

  // Overlap review; the {existingTools} placeholder is filled in by the skeptic handler.
  skepticV2:
    "You are reviewing whether a new tool is necessary. Here are the existing tools with their descriptions and trigger phrases:\n\n{existingTools}\n\nCheck if the proposed tool overlaps semantically with any existing tool. Look for overlapping descriptions (similar intent or scope) and overlapping trigger phrases (a user might say the same thing to trigger both tools).\n\nFor each overlap found, output EXACTLY this format on its own line:\nOVERLAP FOUND: [tool_name] — [reason]\n\nIf no overlaps are found, output EXACTLY:\nNO_OVERLAP\n\nThen challenge the user: does this need to be a separate tool, or can it be a parameter variation? Be pointed.",

  // JSON contract: { name, description, triggerPhrases }.
  description:
    "You are locking the description contract for a tool. The format MUST be: '<What the tool does>. Use when <trigger phrase>. <Disambiguation from similar tools if any>.' Extract: name (snake_case), description (this format), triggerPhrases (3+ variations a user might say to trigger this). Respond with JSON: { name, description, triggerPhrases }. Then ask the user to confirm.",

  // JSON contract: { schema, category, consequenceLevel, requiresConfirmation }.
  fields:
    "Extract the tool's schema fields, category, consequence level, and confirmation requirement. Respond with JSON: { schema: { <fieldName>: { type, description, optional? } }, category: 'read'|'write'|'delete'|'side_effect', consequenceLevel: 'low'|'medium'|'high', requiresConfirmation: boolean }. Then show a summary.",

  // JSON contract: { endpointTarget, httpMethod, authType, paramMap }.
  routing:
    "This tool generates an MCP routing layer pointing to a real API endpoint. Collect: endpointTarget (URL string), httpMethod (GET|POST|PUT|DELETE|PATCH), authType (bearer|apiKey|none), and paramMap (object mapping schema field names to API parameter names — can be empty if names match). Respond with JSON: { endpointTarget, httpMethod, authType, paramMap }. Ask the user for these values if unclear.",

  // JSON contract: { tags, dependsOn }.
  deps:
    "Optionally collect tags and dependencies. Ask if this tool depends on any other tools. Respond with JSON: { tags: [], dependsOn: [] }. This phase can be skipped.",

  // The confirm handler appends the current spec to this prompt.
  confirm:
    "Show the full spec and ask the user to type 'yes' to proceed to code generation, or describe any changes.",

  generate:
    "Auto-advance — no user input needed. Emit the write_file action.",

  test:
    "Auto-advance — no user input needed. Emit the run_tests action.",

  // JSON contract when customizing: { evalMix: { golden, labeled } }.
  evalsInteractive:
    "We're about to generate eval cases for this tool. The default eval mix is: 10 golden cases + 10 labeled cases (3 straightforward, 3 ambiguous, 2 edge, 2 adversarial). Would you like to use the default mix, or customize the counts? If customizing, respond with JSON: { evalMix: { golden: { total: N }, labeled: { straightforward: N, ambiguous: N, edge: N, adversarial: N } } }. If using the default, just say 'default'.",

  verifiers:
    "Auto-advance — no user input needed. Emit the write_verifiers action.",

  done:
    "The tool forge dialogue is complete."
};
113
+
114
+ // ── Phase validators ───────────────────────────────────────────────────────
115
+
116
/**
 * Validate JSON extracted from the description phase.
 *
 * Checks, in order: the payload is an object; `name` is snake_case;
 * `description` is a non-empty string containing "use when" (case-insensitive);
 * `triggerPhrases` is an array of at least 2 entries, each a non-empty string.
 *
 * @param {object} json
 * @returns {{ valid: boolean, error: string|null }}
 */
function validateDescription(json) {
  if (!json || typeof json !== 'object') {
    return { valid: false, error: 'Response must be a JSON object.' };
  }

  const { name, description, triggerPhrases } = json;

  if (typeof name !== 'string' || !/^[a-z][a-z0-9_]*$/.test(name)) {
    return {
      valid: false,
      error: 'name must be a non-empty snake_case string (e.g. "my_tool_name").'
    };
  }

  if (typeof description !== 'string' || description.trim().length === 0) {
    return { valid: false, error: 'description must be a non-empty string.' };
  }

  if (!description.toLowerCase().includes('use when')) {
    return {
      valid: false,
      error:
        'description must follow the format: "<What it does>. Use when <trigger>. <Disambiguation>."'
    };
  }

  if (!Array.isArray(triggerPhrases) || triggerPhrases.length < 2) {
    return {
      valid: false,
      error: 'triggerPhrases must be an array with at least 2 entries.'
    };
  }

  // The phrases are stored on the spec and later joined into prompts, so
  // reject entries that are not usable text (numbers, null, blank strings).
  const everyPhraseIsText = triggerPhrases.every(
    (phrase) => typeof phrase === 'string' && phrase.trim().length > 0
  );
  if (!everyPhraseIsText) {
    return {
      valid: false,
      error: 'triggerPhrases entries must all be non-empty strings.'
    };
  }

  return { valid: true, error: null };
}
157
+
158
/**
 * Check the payload produced by the fields phase.
 *
 * The first failing rule wins; rules run in this order: payload is an
 * object, `schema` is a non-array object, `category` and `consequenceLevel`
 * are among the allowed values, `requiresConfirmation` is a boolean.
 *
 * @param {object} json
 * @returns {{ valid: boolean, error: string|null }}
 */
function validateFields(json) {
  const reject = (error) => ({ valid: false, error });

  if (!json || typeof json !== 'object') {
    return reject('Response must be a JSON object.');
  }

  const { schema, category, consequenceLevel, requiresConfirmation } = json;

  const schemaIsMap = Boolean(schema) && typeof schema === 'object' && !Array.isArray(schema);
  if (!schemaIsMap) {
    return reject('schema must be a non-null object mapping field names to definitions.');
  }

  // Enumerated fields share one rule shape: the value must be in the allowed list.
  const enumRules = [
    { label: 'category', value: category, allowed: ['read', 'write', 'delete', 'side_effect'] },
    { label: 'consequenceLevel', value: consequenceLevel, allowed: ['low', 'medium', 'high'] },
  ];
  for (const { label, value, allowed } of enumRules) {
    if (!allowed.includes(value)) {
      return reject(`${label} must be one of: ${allowed.join(', ')}.`);
    }
  }

  if (typeof requiresConfirmation !== 'boolean') {
    return reject('requiresConfirmation must be a boolean.');
  }

  return { valid: true, error: null };
}
197
+
198
/**
 * Check the payload produced by the routing phase.
 *
 * The first failing rule wins; rules run in this order: payload is an
 * object, `endpointTarget` is a non-blank string, `httpMethod` and
 * `authType` are among the allowed values, `paramMap` is a non-array object.
 *
 * @param {object} json
 * @returns {{ valid: boolean, error: string|null }}
 */
function validateRouting(json) {
  const reject = (error) => ({ valid: false, error });

  if (!json || typeof json !== 'object') {
    return reject('Response must be a JSON object.');
  }

  const { endpointTarget, httpMethod, authType, paramMap } = json;

  const hasEndpoint = typeof endpointTarget === 'string' && endpointTarget.trim() !== '';
  if (!hasEndpoint) {
    return reject('endpointTarget must be a non-empty string URL.');
  }

  const enumRules = [
    { label: 'httpMethod', value: httpMethod, allowed: ['GET', 'POST', 'PUT', 'DELETE', 'PATCH'] },
    { label: 'authType', value: authType, allowed: ['bearer', 'apiKey', 'none'] },
  ];
  for (const { label, value, allowed } of enumRules) {
    if (!allowed.includes(value)) {
      return reject(`${label} must be one of: ${allowed.join(', ')}.`);
    }
  }

  const paramMapIsObject = Boolean(paramMap) && typeof paramMap === 'object' && !Array.isArray(paramMap);
  if (!paramMapIsObject) {
    return reject('paramMap must be an object (can be empty {}).');
  }

  return { valid: true, error: null };
}
231
+
232
/**
 * Validate JSON extracted from the evals interactive phase.
 *
 * Every count is a number of eval cases, so it must be a non-negative
 * integer; a bare `typeof === 'number'` test would also let through
 * negative, fractional and NaN values. `evalMix.golden.total` is required.
 * Each labeled category (straightforward, ambiguous, edge, adversarial) is
 * optional, but is checked when present.
 *
 * @param {object} json
 * @returns {{ valid: boolean, error: string|null }}
 */
function validateEvalMix(json) {
  const isCaseCount = (value) => Number.isInteger(value) && value >= 0;

  if (!json || typeof json !== 'object') {
    return { valid: false, error: 'Response must be a JSON object.' };
  }

  const { evalMix } = json;
  if (!evalMix || typeof evalMix !== 'object') {
    return { valid: false, error: 'evalMix must be an object.' };
  }

  if (!evalMix.golden || !isCaseCount(evalMix.golden.total)) {
    return { valid: false, error: 'evalMix.golden.total must be a non-negative integer.' };
  }

  if (!evalMix.labeled || typeof evalMix.labeled !== 'object') {
    return { valid: false, error: 'evalMix.labeled must be an object.' };
  }

  const labeledFields = ['straightforward', 'ambiguous', 'edge', 'adversarial'];
  for (const field of labeledFields) {
    const count = evalMix.labeled[field];
    // Omitted categories are allowed; anything supplied must be a valid count.
    if (count !== undefined && !isCaseCount(count)) {
      return { valid: false, error: `evalMix.labeled.${field} must be a non-negative integer.` };
    }
  }

  return { valid: true, error: null };
}
265
+
266
/**
 * Read the skeptic phase's LLM reply and report any overlap findings.
 *
 * Lines of the form `OVERLAP FOUND: <detail>` are collected first and take
 * precedence: if any exist the result is not clear, even when the
 * `NO_OVERLAP` sentinel also appears in the text. With no overlap lines,
 * the sentinel means clear; otherwise the reply did not follow the format
 * and `clear` is null.
 *
 * @param {string} text
 * @returns {{ overlaps: string[], clear: boolean|null }}
 */
function parseSkepticResult(text) {
  if (!text) {
    return { overlaps: [], clear: null };
  }

  const findings = Array.from(
    String(text).matchAll(/^OVERLAP FOUND:\s*(.+)$/gm),
    (hit) => hit[1].trim()
  );
  if (findings.length > 0) {
    return { overlaps: findings, clear: false };
  }

  const sawSentinel = text.includes('NO_OVERLAP');
  return { overlaps: [], clear: sawSentinel ? true : null };
}
295
+
296
/**
 * Check the payload produced by the deps phase: it must be an object whose
 * `tags` and `dependsOn` properties are both arrays (`tags` is checked first).
 *
 * @param {object} json
 * @returns {{ valid: boolean, error: string|null }}
 */
function validateDeps(json) {
  if (!json || typeof json !== 'object') {
    return { valid: false, error: 'Response must be a JSON object.' };
  }

  const requiredArrays = ['tags', 'dependsOn'];
  const offender = requiredArrays.find((key) => !Array.isArray(json[key]));
  if (offender !== undefined) {
    return { valid: false, error: `${offender} must be an array.` };
  }

  return { valid: true, error: null };
}
319
+
320
+ // ── JSON extraction ────────────────────────────────────────────────────────
321
+
322
/**
 * Extract the first JSON object from an LLM response string.
 *
 * Strategies, tried in order:
 *   1. a ```json ... ``` fenced block;
 *   2. the span from the first `{` to the last `}`;
 *   3. a brace-balanced scan starting at each `{`, which recovers the object
 *      when strategy 2 fails because later prose contains braces or the
 *      reply holds more than one object.
 *
 * Only plain objects are returned. A fenced array or primitive does not
 * satisfy the "JSON object" contract, so it falls through to the later
 * strategies.
 *
 * @param {string} text
 * @returns {object|null} Parsed object, or null if extraction failed
 *   (including when `text` is not a string).
 */
function extractJson(text) {
  if (typeof text !== 'string' || text.length === 0) {
    return null;
  }

  // Parse a candidate and keep it only if it is a plain (non-array) object.
  const parseObject = (candidate) => {
    try {
      const parsed = JSON.parse(candidate);
      const isPlainObject = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
      return isPlainObject ? parsed : null;
    } catch {
      return null;
    }
  };

  // Index of the `}` that closes the `{` at `open`, ignoring braces inside
  // JSON string literals; -1 when the braces never balance.
  const findClosingBrace = (open) => {
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = open; i < text.length; i += 1) {
      const ch = text[i];
      if (inString) {
        if (escaped) {
          escaped = false;
        } else if (ch === '\\') {
          escaped = true;
        } else if (ch === '"') {
          inString = false;
        }
      } else if (ch === '"') {
        inString = true;
      } else if (ch === '{') {
        depth += 1;
      } else if (ch === '}') {
        depth -= 1;
        if (depth === 0) {
          return i;
        }
      }
    }
    return -1;
  };

  // Strategy 1: fenced ```json ... ``` block
  const fenced = text.match(/```json\s*([\s\S]*?)\s*```/);
  if (fenced) {
    const fromFence = parseObject(fenced[1]);
    if (fromFence) {
      return fromFence;
    }
  }

  // Strategy 2: first { to last }
  const start = text.indexOf('{');
  const end = text.lastIndexOf('}');
  if (start === -1 || end <= start) {
    return null;
  }
  const fromSpan = parseObject(text.slice(start, end + 1));
  if (fromSpan) {
    return fromSpan;
  }

  // Strategy 3: first brace-balanced span that parses
  for (let open = start; open !== -1; open = text.indexOf('{', open + 1)) {
    const close = findClosingBrace(open);
    if (close !== -1) {
      const fromBalanced = parseObject(text.slice(open, close + 1));
      if (fromBalanced) {
        return fromBalanced;
      }
    }
  }

  return null;
}
353
+
354
+ // ── LLM call helper ────────────────────────────────────────────────────────
355
+
356
+ /**
357
+ * Perform a single LLM turn, appending user input (if any) and returning the
358
+ * assistant text plus the updated messages array.
359
+ *
360
+ * @param {object[]} messages - Current conversation history (immutable)
361
+ * @param {string|null} userInput - New user message, or null to skip
362
+ * @param {string} systemPrompt - Phase system prompt (may be overridden via state._systemPromptOverride)
363
+ * @param {object} modelConfig - { provider, apiKey, model }
364
+ * @param {string|null} [retryHint] - If set, appended to system prompt to guide correction
365
+ * @param {string|null} [overridePrompt] - When non-empty, fully replaces systemPrompt for this turn
366
+ * @returns {Promise<{ assistantText: string, newMessages: object[] }>}
367
+ */
368
+ async function callLlm(messages, userInput, systemPrompt, modelConfig, retryHint = null, overridePrompt = null) {
369
+ const newMessages = [...messages];
370
+
371
+ if (userInput !== null && userInput !== undefined) {
372
+ newMessages.push({ role: 'user', content: userInput });
373
+ }
374
+
375
+ // Build the API payload separately — add a synthetic [continue] turn only for
376
+ // the API call when the last stored message is from the assistant and there is
377
+ // no new user input. The synthetic turn is NOT stored back into state.
378
+ const apiMessages = (
379
+ newMessages.length > 0 &&
380
+ newMessages[newMessages.length - 1].role === 'assistant' &&
381
+ (userInput === null || userInput === undefined)
382
+ ) ? [...newMessages, { role: 'user', content: '[continue]' }] : newMessages;
383
+
384
+ const basePrompt = (overridePrompt && overridePrompt.trim()) ? overridePrompt : systemPrompt;
385
+ const fullSystem = retryHint
386
+ ? basePrompt + '\n\nPrevious attempt failed: ' + retryHint + '\nPlease correct the JSON.'
387
+ : basePrompt;
388
+
389
+ let result;
390
+ try {
391
+ result = await llmTurn({
392
+ provider: modelConfig.provider,
393
+ apiKey: modelConfig.apiKey,
394
+ model: modelConfig.model,
395
+ system: fullSystem,
396
+ messages: apiMessages,
397
+ maxTokens: 4096
398
+ });
399
+ } catch (err) {
400
+ throw new Error(`LLM call failed (${modelConfig.provider}/${modelConfig.model}): ${err.message}`);
401
+ }
402
+
403
+ const assistantText = result.text || '';
404
+ if (assistantText) {
405
+ newMessages.push({ role: 'assistant', content: assistantText });
406
+ }
407
+
408
+ return { assistantText, newMessages };
409
+ }
410
+
411
+ // ── Phase handlers ─────────────────────────────────────────────────────────
412
+
413
/**
 * Explore phase: let the LLM ask about the tool idea, then move on to the
 * skeptic phase once the user has answered.
 *
 * The phase advances only when this call carries user input AND the history
 * already held at least one earlier user message. A null or undefined
 * `userInput` never advances, matching how callLlm decides whether a user
 * message exists.
 *
 * @param {object} args
 * @param {object} args.state - Current engine state
 * @param {string|null} [args.userInput] - New user message, if any
 * @param {object} args.modelConfig - { provider, apiKey, model }
 * @returns {Promise<{ nextState: object, assistantText: string, specUpdate: null, actions: object[], phaseChanged: boolean }>}
 */
async function handleExplore({ state, userInput, modelConfig }) {
  const { assistantText, newMessages } = await callLlm(
    state.messages,
    userInput,
    SYSTEM_PROMPTS.explore,
    modelConfig,
    null,
    state._systemPromptOverride || null
  );

  const hasUserInput = userInput !== null && userInput !== undefined;
  // History from BEFORE this call: an earlier user turn means the user has
  // now replied to the assistant's opening question.
  const hadEarlierUserTurn = state.messages.some((m) => m.role === 'user');
  const advance = hadEarlierUserTurn && hasUserInput;

  return {
    nextState: {
      ...state,
      phase: advance ? 'skeptic' : 'explore',
      messages: newMessages,
      // When advancing, the next phase's messages start after everything stored so far.
      phaseStartIdx: advance ? newMessages.length : (state.phaseStartIdx || 0)
    },
    assistantText,
    specUpdate: null,
    actions: [],
    phaseChanged: advance
  };
}
445
+
446
/**
 * Skeptic phase: ask the LLM whether the proposed tool overlaps with
 * existing tools, and hold the dialogue here until the user has replied.
 *
 * Outcomes:
 *   - overlaps reported for the first time: stay in 'skeptic', record them
 *     and mark them as surfaced;
 *   - overlaps already surfaced and the user replied: advance to 'description';
 *   - otherwise: advance only when this call carries user input and an
 *     earlier user message exists within this phase.
 *
 * A null or undefined `userInput` never counts as a reply, matching how
 * callLlm decides whether a user message exists.
 *
 * @param {object} args
 * @param {object} args.state - Current engine state
 * @param {string|null} [args.userInput] - New user message, if any
 * @param {object} args.modelConfig - { provider, apiKey, model }
 * @param {Array<string|{name: string, description?: string, triggerPhrases?: string[]}>} [args.existingTools]
 *   Known tools, either as objects or as legacy name strings (the first
 *   element decides which form is assumed).
 * @returns {Promise<{ nextState: object, assistantText: string, specUpdate: null, actions: object[], phaseChanged: boolean }>}
 */
async function handleSkeptic({ state, userInput, modelConfig, existingTools }) {
  // Build the tool listing — accept both { name, description, triggerPhrases }[] and legacy string[]
  let toolListing = '(none)';
  if (Array.isArray(existingTools) && existingTools.length > 0) {
    if (typeof existingTools[0] === 'string') {
      toolListing = existingTools.join(', ');
    } else {
      toolListing = existingTools.map((t) => {
        const triggers = Array.isArray(t.triggerPhrases) && t.triggerPhrases.length
          ? ` Triggers: ${t.triggerPhrases.join(', ')}`
          : '';
        return `${t.name}: ${t.description || '(no description)'}${triggers ? '\n' + triggers : ''}`;
      }).join('\n\n');
    }
  }

  // Use a replacer function: with a plain string replacement, `$&`, `$$`,
  // `$1`… inside tool descriptions would be expanded as replacement patterns.
  const systemPrompt = SYSTEM_PROMPTS.skepticV2.replace('{existingTools}', () => toolListing);

  const { assistantText, newMessages } = await callLlm(
    state.messages,
    userInput,
    systemPrompt,
    modelConfig,
    null,
    state._systemPromptOverride || null
  );

  const { overlaps } = parseSkepticResult(assistantText);
  const hasUserInput = userInput !== null && userInput !== undefined;

  // If overlaps found and not yet surfaced — block and mark surfaced
  if (overlaps.length > 0 && !state.skepticOverlapSurfaced) {
    return {
      nextState: {
        ...state,
        phase: 'skeptic',
        messages: newMessages,
        skepticOverlaps: overlaps,
        skepticOverlapSurfaced: true
      },
      assistantText,
      specUpdate: null,
      actions: [],
      phaseChanged: false
    };
  }

  // If overlaps were surfaced, advance after user has replied
  if (state.skepticOverlapSurfaced && hasUserInput) {
    return {
      nextState: {
        ...state,
        phase: 'description',
        messages: newMessages,
        skepticOverlaps: state.skepticOverlaps,
        skepticOverlapSurfaced: state.skepticOverlapSurfaced
      },
      assistantText,
      specUpdate: null,
      actions: [],
      phaseChanged: true
    };
  }

  // No overlaps — always require the user to reply after seeing the skeptic's response.
  // Never auto-advance on the first skeptic call, so the LLM's response is
  // shown before the phase transitions.
  const phaseStart = state.phaseStartIdx || 0;
  const repliedInPhase = state.messages.slice(phaseStart).some((m) => m.role === 'user');
  const advance = repliedInPhase && hasUserInput;

  return {
    nextState: {
      ...state,
      phase: advance ? 'description' : 'skeptic',
      messages: newMessages
    },
    assistantText,
    specUpdate: null,
    actions: [],
    phaseChanged: advance
  };
}
530
+
531
/**
 * Shared driver for phases whose LLM reply must contain a JSON object.
 *
 * Flow: call the LLM, extract JSON from the reply, run `validator`, then
 *   - on success: merge `applySpec(json)` into the spec and move to `nextPhase`;
 *   - on failure with `retryCount < 3`: stay in the phase, bump `retryCount`
 *     and store a hint in `lastValidationError` for the next turn;
 *   - on failure otherwise: append an explanatory note to the assistant
 *     reply, stay in the phase and reset the retry bookkeeping.
 *
 * New user input (anything other than null/undefined) starts a fresh attempt:
 * retry bookkeeping is reset and no hint is sent. An omitted `userInput` is
 * treated exactly like null, so retries keep their hint and their count.
 *
 * @param {object} args
 * @param {object} args.state - Current engine state
 * @param {string|null} [args.userInput] - New user message, if any
 * @param {object} args.modelConfig - { provider, apiKey, model }
 * @param {string} args.systemPrompt - Phase system prompt
 * @param {(json: object) => { valid: boolean, error: string|null }} args.validator
 * @param {(json: object) => object} args.applySpec - Maps validated JSON to spec fields
 * @param {string} args.nextPhase - Phase to enter on success
 * @returns {Promise<{ nextState: object, assistantText: string, specUpdate: object|null, actions: object[], phaseChanged: boolean }>}
 */
async function handleJsonPhase({
  state,
  userInput,
  modelConfig,
  systemPrompt,
  validator,
  applySpec,
  nextPhase
}) {
  const hasUserInput = userInput !== null && userInput !== undefined;
  const effectiveState = hasUserInput
    ? { ...state, retryCount: 0, lastValidationError: null }
    : state;

  const retryHint = hasUserInput ? null : (effectiveState.lastValidationError || null);
  const { assistantText, newMessages } = await callLlm(
    effectiveState.messages,
    userInput,
    systemPrompt,
    modelConfig,
    retryHint,
    state._systemPromptOverride || null
  );

  // Work out whether this turn failed, and with which hint/note. Both failure
  // kinds (no JSON found, JSON failed validation) share the retry handling below.
  const extracted = extractJson(assistantText);
  let failure = null;
  if (!extracted) {
    failure = {
      hint: 'I could not find a JSON block in your response. Please include a JSON object with the required fields, wrapped in ```json ... ``` fences.',
      exhaustedNote: '\n\n(Could not extract JSON after 3 attempts — please rephrase or simplify your request.)'
    };
  } else {
    const { valid, error } = validator(extracted);
    if (!valid) {
      failure = {
        hint: `The JSON was found but failed validation: ${error}`,
        exhaustedNote: `\n\n(Validation failed after 3 attempts: ${error} — please rephrase or simplify your request.)`
      };
    }
  }

  if (failure === null) {
    // Valid — apply spec update and advance.
    const specUpdate = applySpec(extracted);
    return {
      nextState: {
        ...effectiveState,
        phase: nextPhase,
        spec: { ...effectiveState.spec, ...specUpdate },
        messages: newMessages,
        retryCount: 0,
        lastValidationError: null
      },
      assistantText,
      specUpdate,
      actions: [],
      phaseChanged: true
    };
  }

  if (effectiveState.retryCount < 3) {
    // Retries remain — stay in the phase and remember the hint for the next turn.
    return {
      nextState: {
        ...effectiveState,
        phase: effectiveState.phase,
        messages: newMessages,
        retryCount: effectiveState.retryCount + 1,
        lastValidationError: failure.hint
      },
      assistantText,
      specUpdate: null,
      actions: [],
      phaseChanged: false
    };
  }

  // Too many retries — surface to user, reset retry counter. The note is
  // folded into the stored assistant message, or stored as a new message
  // when the last stored message is not from the assistant.
  const exhaustedText = assistantText + failure.exhaustedNote;
  const updatedMessages = [...newMessages];
  const lastMessage = updatedMessages[updatedMessages.length - 1];
  if (lastMessage && lastMessage.role === 'assistant') {
    updatedMessages[updatedMessages.length - 1] = { ...lastMessage, content: exhaustedText };
  } else {
    updatedMessages.push({ role: 'assistant', content: exhaustedText });
  }

  return {
    nextState: {
      ...effectiveState,
      phase: effectiveState.phase,
      messages: updatedMessages,
      retryCount: 0,
      lastValidationError: null
    },
    assistantText: exhaustedText,
    specUpdate: null,
    actions: [],
    phaseChanged: false
  };
}
659
+
660
/**
 * Description phase: delegate to handleJsonPhase to collect and validate
 * { name, description, triggerPhrases }, then continue with 'fields'.
 *
 * @param {object} args
 * @param {object} args.state - Current engine state
 * @param {string|null} args.userInput - New user message, if any
 * @param {object} args.modelConfig - { provider, apiKey, model }
 * @returns {Promise<object>} Whatever handleJsonPhase resolves to.
 */
async function handleDescription({ state, userInput, modelConfig }) {
  // Copy only the description-contract fields onto the spec.
  const pickDescriptionFields = ({ name, description, triggerPhrases }) => ({
    name,
    description,
    triggerPhrases,
  });

  const phaseOptions = {
    state,
    userInput,
    modelConfig,
    systemPrompt: SYSTEM_PROMPTS.description,
    validator: validateDescription,
    applySpec: pickDescriptionFields,
    nextPhase: 'fields',
  };
  return handleJsonPhase(phaseOptions);
}
675
+
676
/**
 * Fields phase: delegate to handleJsonPhase to collect and validate
 * { schema, category, consequenceLevel, requiresConfirmation }, then
 * continue with 'routing'.
 *
 * @param {object} args
 * @param {object} args.state - Current engine state
 * @param {string|null} args.userInput - New user message, if any
 * @param {object} args.modelConfig - { provider, apiKey, model }
 * @returns {Promise<object>} Whatever handleJsonPhase resolves to.
 */
async function handleFields({ state, userInput, modelConfig }) {
  // Copy only the schema-related fields onto the spec.
  const pickFieldSettings = ({ schema, category, consequenceLevel, requiresConfirmation }) => ({
    schema,
    category,
    consequenceLevel,
    requiresConfirmation,
  });

  const phaseOptions = {
    state,
    userInput,
    modelConfig,
    systemPrompt: SYSTEM_PROMPTS.fields,
    validator: validateFields,
    applySpec: pickFieldSettings,
    nextPhase: 'routing',
  };
  return handleJsonPhase(phaseOptions);
}
692
+
693
/**
 * Routing phase: delegate to handleJsonPhase to collect and validate
 * { endpointTarget, httpMethod, authType, paramMap }, then continue with
 * 'deps'. A falsy paramMap is stored as an empty object.
 *
 * @param {object} args
 * @param {object} args.state - Current engine state
 * @param {string|null} args.userInput - New user message, if any
 * @param {object} args.modelConfig - { provider, apiKey, model }
 * @returns {Promise<object>} Whatever handleJsonPhase resolves to.
 */
async function handleRouting({ state, userInput, modelConfig }) {
  // Copy only the routing fields onto the spec.
  const pickRoutingFields = ({ endpointTarget, httpMethod, authType, paramMap }) => ({
    endpointTarget,
    httpMethod,
    authType,
    paramMap: paramMap || {},
  });

  const phaseOptions = {
    state,
    userInput,
    modelConfig,
    systemPrompt: SYSTEM_PROMPTS.routing,
    validator: validateRouting,
    applySpec: pickRoutingFields,
    nextPhase: 'deps',
  };
  return handleJsonPhase(phaseOptions);
}
709
+
710
/**
 * Deps phase: delegate to handleJsonPhase to collect and validate
 * { tags, dependsOn }, then continue with 'confirm'.
 *
 * @param {object} args
 * @param {object} args.state - Current engine state
 * @param {string|null} args.userInput - New user message, if any
 * @param {object} args.modelConfig - { provider, apiKey, model }
 * @returns {Promise<object>} Whatever handleJsonPhase resolves to.
 */
async function handleDeps({ state, userInput, modelConfig }) {
  // Copy only the tagging/dependency fields onto the spec.
  const pickDependencyFields = ({ tags, dependsOn }) => ({ tags, dependsOn });

  const phaseOptions = {
    state,
    userInput,
    modelConfig,
    systemPrompt: SYSTEM_PROMPTS.deps,
    validator: validateDeps,
    applySpec: pickDependencyFields,
    nextPhase: 'confirm',
  };
  return handleJsonPhase(phaseOptions);
}
724
+
725
/**
 * Confirm phase: show the LLM the current spec (pretty-printed JSON appended
 * to the confirm prompt) and advance to 'generate' only when the user's
 * message is exactly "yes" — case-insensitive, surrounding whitespace ignored.
 *
 * @param {object} args
 * @param {object} args.state - Current engine state
 * @param {string|null} args.userInput - New user message, if any
 * @param {object} args.modelConfig - { provider, apiKey, model }
 * @returns {Promise<{ nextState: object, assistantText: string, specUpdate: null, actions: object[], phaseChanged: boolean }>}
 */
async function handleConfirm({ state, userInput, modelConfig }) {
  // Prompt layout: confirm text, blank line, heading, fenced JSON spec.
  const promptLines = [
    SYSTEM_PROMPTS.confirm,
    '',
    'Current spec:',
    '```json',
    JSON.stringify(state.spec, null, 2),
    '```',
  ];
  const systemPrompt = promptLines.join('\n');

  const turn = await callLlm(
    state.messages,
    userInput,
    systemPrompt,
    modelConfig,
    null,
    state._systemPromptOverride || null
  );

  let confirmed = false;
  if (typeof userInput === 'string') {
    confirmed = /^yes$/i.test(userInput.trim());
  }

  const nextState = {
    ...state,
    phase: confirmed ? 'generate' : 'confirm',
    messages: turn.newMessages,
  };

  return {
    nextState,
    assistantText: turn.assistantText,
    specUpdate: null,
    actions: [],
    phaseChanged: confirmed,
  };
}
755
+
756
/**
 * Build the result for a phase that needs no LLM call or user input: move
 * straight to `nextPhase`, clear retry bookkeeping and hand back the given
 * actions for the caller to perform.
 *
 * @param {object} args
 * @param {object} args.state - Current engine state (not mutated)
 * @param {string} args.assistantMessage - Text to surface as the assistant's message
 * @param {object[]} args.actions - Actions to emit with this transition
 * @param {string} args.nextPhase - Phase to enter
 * @returns {{ nextState: object, assistantText: string, specUpdate: null, actions: object[], phaseChanged: boolean }}
 */
function handleAutoAdvance({ state, assistantMessage, actions, nextPhase }) {
  const nextState = Object.assign({}, state, {
    phase: nextPhase,
    retryCount: 0,
    lastValidationError: null,
  });

  return {
    nextState,
    assistantText: assistantMessage,
    specUpdate: null,
    actions,
    phaseChanged: true,
  };
}
770
+
771
/**
 * Generate phase: emit a single `write_file` action naming the tool file and
 * its test file, then auto-advance to 'test'. Paths live under
 * `<projectRoot>/tools/` when a project root is given, else under `tools/`.
 *
 * @param {object} args
 * @param {object} args.state - Current engine state; `spec.name` names the files
 * @param {string} [args.projectRoot] - Optional root directory prefix
 * @returns {{ nextState: object, assistantText: string, specUpdate: null, actions: object[], phaseChanged: boolean }}
 */
function handleGenerate({ state, projectRoot }) {
  // Fall back to a placeholder name when the spec has none yet.
  const toolName = state.spec.name || 'unnamed_tool';
  const toolsDir = projectRoot ? `${projectRoot}/tools` : 'tools';

  const writeFileAction = {
    type: 'write_file',
    payload: {
      toolPath: `${toolsDir}/${toolName}.js`,
      testPath: `${toolsDir}/${toolName}.test.js`,
      barrelDiff: null,
    },
  };

  return handleAutoAdvance({
    state,
    assistantMessage: `Generating tool files for ${toolName}…`,
    actions: [writeFileAction],
    nextPhase: 'test',
  });
}
795
+
796
/**
 * Test phase handler.
 *
 * Emits a run_tests action whose command targets the tool named in the spec
 * (or 'unnamed_tool' when the spec has no name) and auto-advances to 'evals'.
 *
 * @param {object} args
 * @param {object} args.state - Current forge state; reads state.spec.name.
 * @returns {object} Step result produced by handleAutoAdvance.
 */
function handleTest({ state }) {
  const { name } = state.spec;
  const toolName = name || 'unnamed_tool';

  // NOTE(review): the name is interpolated into the command string verbatim.
  // Presumably an earlier phase restricts it to safe characters — confirm
  // before whatever consumes this action hands the command to a shell.
  const runTestsAction = {
    type: 'run_tests',
    payload: { command: `npm test -- ${toolName}` }
  };

  return handleAutoAdvance({
    state,
    assistantMessage: 'Running tests…',
    actions: [runTestsAction],
    nextPhase: 'evals'
  });
}
812
+
813
/**
 * Evals phase handler.
 *
 * Two paths:
 *  - Shortcut: when the trimmed input starts with "default" (case-insensitive),
 *    the project's configured default mix — or the built-in one — is applied
 *    without calling the model, and the phase moves straight to 'verifiers'.
 *  - Interactive: otherwise the input goes through handleJsonPhase with the
 *    interactive evals prompt so the user can customise the mix.
 *
 * In both paths a write_evals action carrying the chosen mix is attached once
 * the phase has advanced.
 *
 * @param {object} args
 * @param {object} args.state - Current forge state; reads spec and messages.
 * @param {string|null} args.userInput - Latest user message.
 * @param {object} args.modelConfig - Model configuration forwarded to handleJsonPhase.
 * @param {object} [args.projectConfig] - Project config; `evals.defaultMix` overrides the built-in mix.
 * @returns {Promise<object>} Step result with nextState, assistantText,
 *   specUpdate, actions and phaseChanged.
 */
async function handleEvals({ state, userInput, modelConfig, projectConfig }) {
  // If the user just typed 'default' or similar, use the default mix.
  const wantsDefault = typeof userInput === 'string' && /^default/i.test(userInput.trim());

  if (wantsDefault) {
    const builtInMix = {
      golden: { total: 10 },
      labeled: { straightforward: 3, ambiguous: 3, edge: 2, adversarial: 2 }
    };
    const evalMix = projectConfig?.evals?.defaultMix || builtInMix;

    return {
      nextState: {
        ...state,
        phase: 'verifiers',
        spec: { ...state.spec, evalMix },
        messages: [...state.messages, { role: 'user', content: userInput }],
        // The phase is advancing, so clear the retry bookkeeping the same way
        // handleAutoAdvance does. Otherwise a validation error left over from
        // an earlier failed attempt would travel along with the new phase.
        retryCount: 0,
        lastValidationError: null
      },
      assistantText: 'Using default eval mix. Generating eval cases…',
      specUpdate: { evalMix },
      actions: [{ type: 'write_evals', payload: { evalMix } }],
      phaseChanged: true
    };
  }

  // Otherwise use the JSON phase to let the user customise the mix.
  const result = await handleJsonPhase({
    state,
    userInput,
    modelConfig,
    systemPrompt: SYSTEM_PROMPTS.evalsInteractive,
    validator: validateEvalMix,
    applySpec: (json) => ({ evalMix: json.evalMix }),
    nextPhase: 'verifiers'
  });

  // Nothing to write until the phase has actually advanced.
  if (!result.phaseChanged) {
    return result;
  }

  // Attach the write_evals action, using the mix that ended up in the spec.
  const { evalMix } = result.nextState.spec;
  return {
    ...result,
    actions: [{ type: 'write_evals', payload: { evalMix } }]
  };
}
860
+
861
/**
 * Verifiers phase handler.
 *
 * Emits a single write_verifiers action (no payload) and auto-advances to
 * the terminal 'done' phase.
 *
 * @param {object} args
 * @param {object} args.state - Current forge state.
 * @returns {object} Step result produced by handleAutoAdvance.
 */
function handleVerifiers({ state }) {
  const writeVerifiersAction = { type: 'write_verifiers' };

  return handleAutoAdvance({
    state,
    assistantMessage: 'Generating verifier stubs…',
    actions: [writeVerifiersAction],
    nextPhase: 'done'
  });
}
869
+
870
/**
 * Terminal phase handler.
 *
 * Keeps the state in 'done', reports that the dialogue is complete and
 * requests no further actions.
 *
 * @param {object} args
 * @param {object} args.state - Current forge state (not mutated).
 * @returns {object} Step result with phaseChanged set to false.
 */
function handleDone({ state }) {
  const terminalState = { ...state, phase: 'done' };

  return {
    nextState: terminalState,
    assistantText: 'The tool forge dialogue is complete. Your tool has been generated.',
    specUpdate: null,
    actions: [],
    phaseChanged: false
  };
}
879
+
880
+ // ── Core export ────────────────────────────────────────────────────────────
881
+
882
/**
 * Run exactly one step of the forge state machine.
 *
 * The handler for `state.phase` is looked up and invoked; its result is
 * returned after any per-turn prompt override has been removed from
 * `nextState`.
 *
 * @param {object} opts
 * @param {object} opts.state - Current forge state (from createInitialState or a prior forgeStep).
 * @param {string|null} opts.userInput - User message, or null for auto-advance phases.
 * @param {object} opts.modelConfig - { provider, apiKey, model }
 * @param {string[]} [opts.existingTools] - Names of tools already in the registry (given to the skeptic phase).
 * @param {object} [opts.projectConfig] - Project-level config; forwarded to the evals phase handler.
 * @param {string} [opts.projectRoot] - Project root path; forwarded to the generate phase handler.
 * @param {string} [opts.systemPromptOverride] - When provided and non-empty, made available to the
 *   phase handler for this turn only via a temporary `_systemPromptOverride` state field.
 *   The caller's state object is not mutated.
 * @returns {Promise<{
 *   nextState: object,
 *   assistantText: string,
 *   specUpdate: object|null,
 *   actions: Array<object>,
 *   phaseChanged: boolean
 * }>}
 * @throws {Error} When `state.phase` is not one of the known phases.
 */
export async function forgeStep({
  state,
  userInput,
  modelConfig,
  existingTools = [],
  projectConfig,
  projectRoot,
  systemPromptOverride
}) {
  const phase = state.phase;

  // Work on a copy stamped with the override so handlers can read it without
  // the caller's state object ever carrying the field.
  const s = systemPromptOverride
    ? { ...state, _systemPromptOverride: systemPromptOverride }
    : state;

  // Arguments shared by every model-backed phase.
  const llmArgs = { state: s, userInput, modelConfig };

  // One thunk per phase. A Map is used so that inherited object keys such as
  // "constructor" are still reported as unknown phases.
  const phaseRunners = new Map([
    ['explore', () => handleExplore(llmArgs)],
    ['skeptic', () => handleSkeptic({ ...llmArgs, existingTools })],
    ['description', () => handleDescription(llmArgs)],
    ['fields', () => handleFields(llmArgs)],
    ['routing', () => handleRouting(llmArgs)],
    ['deps', () => handleDeps(llmArgs)],
    ['confirm', () => handleConfirm(llmArgs)],
    ['generate', () => handleGenerate({ state: s, projectRoot })],
    ['test', () => handleTest({ state: s })],
    ['evals', () => handleEvals({ ...llmArgs, projectConfig })],
    ['verifiers', () => handleVerifiers({ state: s })],
    ['done', () => handleDone({ state: s })]
  ]);

  const runPhase = phaseRunners.get(phase);
  if (runPhase === undefined) {
    throw new Error(`forgeStep: unknown phase "${phase}".`);
  }

  const result = await runPhase();

  // Strip the override flag from nextState so it doesn't persist across turns.
  const producedState = result?.nextState;
  if (!producedState || !('_systemPromptOverride' in producedState)) {
    return result;
  }

  const { _systemPromptOverride: _discarded, ...persistableState } = producedState;
  return { ...result, nextState: persistableState };
}