npm - agent-tool-forge - Versions diffs - 0.3.0 - Mend

agent-tool-forge 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (107) hide show

package/LICENSE +21 -0
package/README.md +209 -0
package/lib/agent-registry.js +170 -0
package/lib/api-client.js +792 -0
package/lib/api-loader.js +260 -0
package/lib/auth.d.ts +25 -0
package/lib/auth.js +158 -0
package/lib/checks/check-adapter.js +172 -0
package/lib/checks/compose.js +42 -0
package/lib/checks/content-match.js +14 -0
package/lib/checks/cost-budget.js +11 -0
package/lib/checks/index.js +18 -0
package/lib/checks/json-valid.js +15 -0
package/lib/checks/latency.js +11 -0
package/lib/checks/length-bounds.js +17 -0
package/lib/checks/negative-match.js +14 -0
package/lib/checks/no-hallucinated-numbers.js +63 -0
package/lib/checks/non-empty.js +34 -0
package/lib/checks/regex-match.js +12 -0
package/lib/checks/run-checks.js +84 -0
package/lib/checks/schema-match.js +26 -0
package/lib/checks/tool-call-count.js +16 -0
package/lib/checks/tool-selection.js +34 -0
package/lib/checks/types.js +45 -0
package/lib/comparison/compare.js +86 -0
package/lib/comparison/format.js +104 -0
package/lib/comparison/index.js +6 -0
package/lib/comparison/statistics.js +59 -0
package/lib/comparison/types.js +41 -0
package/lib/config-schema.js +200 -0
package/lib/config.d.ts +66 -0
package/lib/conversation-store.d.ts +77 -0
package/lib/conversation-store.js +443 -0
package/lib/db.d.ts +6 -0
package/lib/db.js +1112 -0
package/lib/dep-check.js +99 -0
package/lib/drift-background.js +61 -0
package/lib/drift-monitor.js +187 -0
package/lib/eval-runner.js +566 -0
package/lib/fixtures/fixture-store.js +161 -0
package/lib/fixtures/index.js +11 -0
package/lib/forge-engine.js +982 -0
package/lib/forge-eval-generator.js +417 -0
package/lib/forge-file-writer.js +386 -0
package/lib/forge-service-client.js +190 -0
package/lib/forge-service.d.ts +4 -0
package/lib/forge-service.js +655 -0
package/lib/forge-verifier-generator.js +271 -0
package/lib/handlers/admin.js +151 -0
package/lib/handlers/agents.js +229 -0
package/lib/handlers/chat-resume.js +334 -0
package/lib/handlers/chat-sync.js +320 -0
package/lib/handlers/chat.js +320 -0
package/lib/handlers/conversations.js +92 -0
package/lib/handlers/preferences.js +88 -0
package/lib/handlers/tools-list.js +58 -0
package/lib/hitl-engine.d.ts +60 -0
package/lib/hitl-engine.js +261 -0
package/lib/http-utils.js +92 -0
package/lib/index.d.ts +20 -0
package/lib/index.js +141 -0
package/lib/init.js +636 -0
package/lib/manual-entry.js +59 -0
package/lib/mcp-server.js +252 -0
package/lib/output-groups.js +54 -0
package/lib/postgres-store.d.ts +31 -0
package/lib/postgres-store.js +465 -0
package/lib/preference-store.d.ts +47 -0
package/lib/preference-store.js +79 -0
package/lib/prompt-store.d.ts +42 -0
package/lib/prompt-store.js +60 -0
package/lib/rate-limiter.d.ts +30 -0
package/lib/rate-limiter.js +104 -0
package/lib/react-engine.d.ts +110 -0
package/lib/react-engine.js +337 -0
package/lib/runner/cli.js +156 -0
package/lib/runner/cost-estimator.js +71 -0
package/lib/runner/gate.js +46 -0
package/lib/runner/index.js +165 -0
package/lib/sidecar.d.ts +83 -0
package/lib/sidecar.js +161 -0
package/lib/sse.d.ts +15 -0
package/lib/sse.js +30 -0
package/lib/tools-scanner.js +91 -0
package/lib/tui.js +253 -0
package/lib/verifier-report.js +78 -0
package/lib/verifier-runner.js +338 -0
package/lib/verifier-scanner.js +70 -0
package/lib/verifier-worker-pool.js +196 -0
package/lib/views/chat.js +340 -0
package/lib/views/endpoints.js +203 -0
package/lib/views/eval-run.js +206 -0
package/lib/views/forge-agent.js +538 -0
package/lib/views/forge.js +410 -0
package/lib/views/main-menu.js +275 -0
package/lib/views/mediation.js +381 -0
package/lib/views/model-compare.js +430 -0
package/lib/views/model-comparison.js +333 -0
package/lib/views/onboarding.js +470 -0
package/lib/views/performance.js +237 -0
package/lib/views/run-evals.js +205 -0
package/lib/views/settings.js +829 -0
package/lib/views/tools-evals.js +514 -0
package/lib/views/verifier-coverage.js +617 -0
package/lib/workers/verifier-worker.js +52 -0
package/package.json +123 -0
package/widget/forge-chat.js +789 -0

package/lib/forge-engine.js ADDED Viewed

@@ -0,0 +1,982 @@
+/**
+ * forge-engine.js — Pure async state machine for the 12-phase tool forge dialogue (phases 0–11).
+ *
+ * No blessed dependencies. No UI. Pure logic.
+ * Uses ./api-client.js for LLM calls.
+ */
+import { llmTurn } from './api-client.js';
+// ── Phase registry ─────────────────────────────────────────────────────────
+/**
+ * Ordered list of forge dialogue phase names (12 entries, indices 0–11).
+ * A phase's position in this array is the value returned by getPhaseIndex().
+ */
+export const PHASES = [
+  'explore',
+  'skeptic',
+  'description',
+  'fields',
+  'routing',
+  'deps',
+  'confirm',
+  'generate',
+  'test',
+  'evals',
+  'verifiers',
+  'done'
+];
+/**
+ * Return the zero-based index of a phase name.
+ *
+ * @param {string} phase
+ * @returns {number} -1 if not found
+ */
+export function getPhaseIndex(phase) {
+  return PHASES.indexOf(phase);
+}
+// ── Initial state factory ──────────────────────────────────────────────────
+/**
+ * Return a fresh initial state with the explore phase active.
+ *
+ * @returns {object}
+ */
+export function createInitialState() {
+  return {
+    phase: 'explore',
+    spec: {
+      name: null,
+      description: null,
+      schema: null,
+      category: null,
+      consequenceLevel: null,
+      requiresConfirmation: null,
+      timeout: null,
+      tags: [],
+      dependsOn: [],
+      triggerPhrases: [],
+      endpointTarget: null,
+      httpMethod: null,
+      authType: null,
+      paramMap: {},
+      evalMix: null
+    },
+    messages: [],
+    retryCount: 0,
+    lastValidationError: null,
+    generationId: null,
+    phaseStartIdx: 0,
+    skepticOverlaps: [],
+    skepticOverlapSurfaced: false
+  };
+}
+// ── Phase system prompts ───────────────────────────────────────────────────
+/**
+ * System prompt text, keyed by prompt id.
+ *
+ * Ids mostly match phase names, with two exceptions: the skeptic phase reads
+ * `skepticV2` and the evals phase reads `evalsInteractive`.
+ *
+ * NOTE(review): the `generate`, `test`, `verifiers` and `done` entries are not
+ * read by any handler visible in this part of the file — confirm whether they
+ * are used elsewhere or are descriptive only.
+ */
+const SYSTEM_PROMPTS = {
+  explore:
+    "You are a tool forge assistant helping design a new LLM agent tool. Ask the user what they want to build. Be curious and open. Try to understand the use case, the trigger phrase ('user says X'), and what the tool should do. Keep your response under 100 words.",
+  // `{existingTools}` is a placeholder that handleSkeptic substitutes with the tool listing.
+  // The OVERLAP FOUND / NO_OVERLAP markers requested here are what parseSkepticResult looks for.
+  skepticV2:
+    "You are reviewing whether a new tool is necessary. Here are the existing tools with their descriptions and trigger phrases:\n\n{existingTools}\n\nCheck if the proposed tool overlaps semantically with any existing tool. Look for overlapping descriptions (similar intent or scope) and overlapping trigger phrases (a user might say the same thing to trigger both tools).\n\nFor each overlap found, output EXACTLY this format on its own line:\nOVERLAP FOUND: [tool_name] — [reason]\n\nIf no overlaps are found, output EXACTLY:\nNO_OVERLAP\n\nThen challenge the user: does this need to be a separate tool, or can it be a parameter variation? Be pointed.",
+  description:
+    "You are locking the description contract for a tool. The format MUST be: '<What the tool does>. Use when <trigger phrase>. <Disambiguation from similar tools if any>.' Extract: name (snake_case), description (this format), triggerPhrases (3+ variations a user might say to trigger this). Respond with JSON: { name, description, triggerPhrases }. Then ask the user to confirm.",
+  fields:
+    "Extract the tool's schema fields, category, consequence level, and confirmation requirement. Respond with JSON: { schema: { <fieldName>: { type, description, optional? } }, category: 'read'|'write'|'delete'|'side_effect', consequenceLevel: 'low'|'medium'|'high', requiresConfirmation: boolean }. Then show a summary.",
+  routing:
+    "This tool generates an MCP routing layer pointing to a real API endpoint. Collect: endpointTarget (URL string), httpMethod (GET|POST|PUT|DELETE|PATCH), authType (bearer|apiKey|none), and paramMap (object mapping schema field names to API parameter names — can be empty if names match). Respond with JSON: { endpointTarget, httpMethod, authType, paramMap }. Ask the user for these values if unclear.",
+  deps:
+    "Optionally collect tags and dependencies. Ask if this tool depends on any other tools. Respond with JSON: { tags: [], dependsOn: [] }. This phase can be skipped.",
+  // handleConfirm appends the current spec (as fenced JSON) to this prompt before sending it.
+  confirm:
+    "Show the full spec and ask the user to type 'yes' to proceed to code generation, or describe any changes.",
+  generate:
+    "Auto-advance — no user input needed. Emit the write_file action.",
+  test:
+    "Auto-advance — no user input needed. Emit the run_tests action.",
+  evalsInteractive:
+    "We're about to generate eval cases for this tool. The default eval mix is: 10 golden cases + 10 labeled cases (3 straightforward, 3 ambiguous, 2 edge, 2 adversarial). Would you like to use the default mix, or customize the counts? If customizing, respond with JSON: { evalMix: { golden: { total: N }, labeled: { straightforward: N, ambiguous: N, edge: N, adversarial: N } } }. If using the default, just say 'default'.",
+  verifiers:
+    "Auto-advance — no user input needed. Emit the write_verifiers action.",
+  done:
+    "The tool forge dialogue is complete."
+};
+// ── Phase validators ───────────────────────────────────────────────────────
+/**
+ * Validate JSON extracted from the description phase.
+ *
+ * @param {object} json
+ * @returns {{ valid: boolean, error: string|null }}
+ */
+function validateDescription(json) {
+  if (!json || typeof json !== 'object') {
+    return { valid: false, error: 'Response must be a JSON object.' };
+  }
+  const { name, description, triggerPhrases } = json;
+  if (typeof name !== 'string' || !/^[a-z][a-z0-9_]*$/.test(name)) {
+    return {
+      valid: false,
+      error: 'name must be a non-empty snake_case string (e.g. "my_tool_name").'
+    };
+  }
+  if (typeof description !== 'string' || description.trim().length === 0) {
+    return { valid: false, error: 'description must be a non-empty string.' };
+  }
+  if (!description.toLowerCase().includes('use when')) {
+    return {
+      valid: false,
+      error:
+        'description must follow the format: "<What it does>. Use when <trigger>. <Disambiguation>."'
+    };
+  }
+  if (!Array.isArray(triggerPhrases) || triggerPhrases.length < 2) {
+    return {
+      valid: false,
+      error: 'triggerPhrases must be an array with at least 2 entries.'
+    };
+  }
+  return { valid: true, error: null };
+}
+/**
+ * Validate JSON extracted from the fields phase.
+ *
+ * @param {object} json
+ * @returns {{ valid: boolean, error: string|null }}
+ */
+function validateFields(json) {
+  if (!json || typeof json !== 'object') {
+    return { valid: false, error: 'Response must be a JSON object.' };
+  }
+  const { schema, category, consequenceLevel, requiresConfirmation } = json;
+  if (!schema || typeof schema !== 'object' || Array.isArray(schema)) {
+    return { valid: false, error: 'schema must be a non-null object mapping field names to definitions.' };
+  }
+  const validCategories = ['read', 'write', 'delete', 'side_effect'];
+  if (!validCategories.includes(category)) {
+    return {
+      valid: false,
+      error: `category must be one of: ${validCategories.join(', ')}.`
+    };
+  }
+  const validLevels = ['low', 'medium', 'high'];
+  if (!validLevels.includes(consequenceLevel)) {
+    return {
+      valid: false,
+      error: `consequenceLevel must be one of: ${validLevels.join(', ')}.`
+    };
+  }
+  if (typeof requiresConfirmation !== 'boolean') {
+    return { valid: false, error: 'requiresConfirmation must be a boolean.' };
+  }
+  return { valid: true, error: null };
+}
+/**
+ * Validate JSON extracted from the routing phase.
+ *
+ * @param {object} json
+ * @returns {{ valid: boolean, error: string|null }}
+ */
+function validateRouting(json) {
+  if (!json || typeof json !== 'object') {
+    return { valid: false, error: 'Response must be a JSON object.' };
+  }
+  const { endpointTarget, httpMethod, authType, paramMap } = json;
+  if (typeof endpointTarget !== 'string' || endpointTarget.trim() === '') {
+    return { valid: false, error: 'endpointTarget must be a non-empty string URL.' };
+  }
+  const validMethods = ['GET', 'POST', 'PUT', 'DELETE', 'PATCH'];
+  if (!validMethods.includes(httpMethod)) {
+    return { valid: false, error: `httpMethod must be one of: ${validMethods.join(', ')}.` };
+  }
+  const validAuthTypes = ['bearer', 'apiKey', 'none'];
+  if (!validAuthTypes.includes(authType)) {
+    return { valid: false, error: `authType must be one of: ${validAuthTypes.join(', ')}.` };
+  }
+  if (!paramMap || typeof paramMap !== 'object' || Array.isArray(paramMap)) {
+    return { valid: false, error: 'paramMap must be an object (can be empty {}).' };
+  }
+  return { valid: true, error: null };
+}
+/**
+ * Validate JSON extracted from the evals interactive phase.
+ *
+ * @param {object} json
+ * @returns {{ valid: boolean, error: string|null }}
+ */
+function validateEvalMix(json) {
+  if (!json || typeof json !== 'object') {
+    return { valid: false, error: 'Response must be a JSON object.' };
+  }
+  const { evalMix } = json;
+  if (!evalMix || typeof evalMix !== 'object') {
+    return { valid: false, error: 'evalMix must be an object.' };
+  }
+  if (!evalMix.golden || typeof evalMix.golden.total !== 'number') {
+    return { valid: false, error: 'evalMix.golden.total must be a number.' };
+  }
+  if (!evalMix.labeled || typeof evalMix.labeled !== 'object') {
+    return { valid: false, error: 'evalMix.labeled must be an object.' };
+  }
+  const labeledFields = ['straightforward', 'ambiguous', 'edge', 'adversarial'];
+  for (const f of labeledFields) {
+    if (evalMix.labeled[f] !== undefined && typeof evalMix.labeled[f] !== 'number') {
+      return { valid: false, error: `evalMix.labeled.${f} must be a number.` };
+    }
+  }
+  return { valid: true, error: null };
+}
+/**
+ * Parse the skeptic phase LLM response to find overlap findings.
+ *
+ * @param {string} text
+ * @returns {{ overlaps: string[], clear: boolean|null }}
+ */
+function parseSkepticResult(text) {
+  if (!text) return { overlaps: [], clear: null };
+  // Check OVERLAP FOUND lines first — they take priority over the NO_OVERLAP sentinel.
+  // If both appear (malformed LLM output), the overlap wins.
+  const overlapLines = [];
+  const overlapRegex = /^OVERLAP FOUND:\s*(.+)$/gm;
+  let match;
+  while ((match = overlapRegex.exec(text)) !== null) {
+    overlapLines.push(match[1].trim());
+  }
+  if (overlapLines.length > 0) {
+    return { overlaps: overlapLines, clear: false };
+  }
+  if (text.includes('NO_OVERLAP')) {
+    return { overlaps: [], clear: true };
+  }
+  // LLM didn't follow the format — treat as unclear
+  return { overlaps: [], clear: null };
+}
+/**
+ * Validate JSON extracted from the deps phase.
+ *
+ * @param {object} json
+ * @returns {{ valid: boolean, error: string|null }}
+ */
+function validateDeps(json) {
+  if (!json || typeof json !== 'object') {
+    return { valid: false, error: 'Response must be a JSON object.' };
+  }
+  const { tags, dependsOn } = json;
+  if (!Array.isArray(tags)) {
+    return { valid: false, error: 'tags must be an array.' };
+  }
+  if (!Array.isArray(dependsOn)) {
+    return { valid: false, error: 'dependsOn must be an array.' };
+  }
+  return { valid: true, error: null };
+}
+// ── JSON extraction ────────────────────────────────────────────────────────
+/**
+ * Extract the first JSON object from an LLM response string.
+ * Tries ```json ... ``` fenced block first; falls back to first { to last }.
+ *
+ * @param {string} text
+ * @returns {object|null} Parsed object, or null if extraction failed.
+ */
+function extractJson(text) {
+  // Strategy 1: fenced ```json ... ``` block
+  const fenced = text.match(/```json\s*([\s\S]*?)\s*```/);
+  if (fenced) {
+    try {
+      return JSON.parse(fenced[1]);
+    } catch (_) {
+      // fall through to strategy 2
+    }
+  }
+  // Strategy 2: first { to last }
+  const start = text.indexOf('{');
+  const end   = text.lastIndexOf('}');
+  if (start !== -1 && end > start) {
+    try {
+      return JSON.parse(text.slice(start, end + 1));
+    } catch (_) {
+      // extraction failed
+    }
+  }
+  return null;
+}
+// ── LLM call helper ────────────────────────────────────────────────────────
+/**
+ * Perform a single LLM turn, appending user input (if any) and returning the
+ * assistant text plus the updated messages array.
+ *
+ * @param {object[]} messages         - Current conversation history (immutable)
+ * @param {string|null} userInput     - New user message, or null to skip
+ * @param {string} systemPrompt       - Phase system prompt (may be overridden via state._systemPromptOverride)
+ * @param {object} modelConfig        - { provider, apiKey, model }
+ * @param {string|null} [retryHint]   - If set, appended to system prompt to guide correction
+ * @param {string|null} [overridePrompt] - When non-empty, fully replaces systemPrompt for this turn
+ * @returns {Promise<{ assistantText: string, newMessages: object[] }>}
+ */
+async function callLlm(messages, userInput, systemPrompt, modelConfig, retryHint = null, overridePrompt = null) {
+  const newMessages = [...messages];
+  if (userInput !== null && userInput !== undefined) {
+    newMessages.push({ role: 'user', content: userInput });
+  }
+  // Build the API payload separately — add a synthetic [continue] turn only for
+  // the API call when the last stored message is from the assistant and there is
+  // no new user input.  The synthetic turn is NOT stored back into state.
+  const apiMessages = (
+    newMessages.length > 0 &&
+    newMessages[newMessages.length - 1].role === 'assistant' &&
+    (userInput === null || userInput === undefined)
+  ) ? [...newMessages, { role: 'user', content: '[continue]' }] : newMessages;
+  const basePrompt = (overridePrompt && overridePrompt.trim()) ? overridePrompt : systemPrompt;
+  const fullSystem = retryHint
+    ? basePrompt + '\n\nPrevious attempt failed: ' + retryHint + '\nPlease correct the JSON.'
+    : basePrompt;
+  let result;
+  try {
+    result = await llmTurn({
+      provider:  modelConfig.provider,
+      apiKey:    modelConfig.apiKey,
+      model:     modelConfig.model,
+      system:    fullSystem,
+      messages:  apiMessages,
+      maxTokens: 4096
+    });
+  } catch (err) {
+    throw new Error(`LLM call failed (${modelConfig.provider}/${modelConfig.model}): ${err.message}`);
+  }
+  const assistantText = result.text || '';
+  if (assistantText) {
+    newMessages.push({ role: 'assistant', content: assistantText });
+  }
+  return { assistantText, newMessages };
+}
+// ── Phase handlers ─────────────────────────────────────────────────────────
+async function handleExplore({ state, userInput, modelConfig }) {
+  const systemPrompt = SYSTEM_PROMPTS.explore;
+  const { assistantText, newMessages } = await callLlm(
+    state.messages,
+    userInput,
+    systemPrompt,
+    modelConfig,
+    null,
+    state._systemPromptOverride || null
+  );
+  // Advance after the AI has asked its opening question AND the user has replied.
+  // Heuristic: if there is already at least one user message in history before
+  // this call, the user has replied to the initial question — advance to skeptic.
+  const prevUserMessages = state.messages.filter((m) => m.role === 'user');
+  const advance = prevUserMessages.length >= 1 && userInput !== null;
+  const nextPhase = advance ? 'skeptic' : 'explore';
+  return {
+    nextState: {
+      ...state,
+      phase: nextPhase,
+      messages: newMessages,
+      phaseStartIdx: advance ? newMessages.length : (state.phaseStartIdx || 0)
+    },
+    assistantText,
+    specUpdate: null,
+    actions: [],
+    phaseChanged: advance
+  };
+}
+async function handleSkeptic({ state, userInput, modelConfig, existingTools }) {
+  // Build the tool listing — accept both { name, description, triggerPhrases }[] and legacy string[]
+  let toolListing = '(none)';
+  if (Array.isArray(existingTools) && existingTools.length > 0) {
+    if (typeof existingTools[0] === 'string') {
+      toolListing = existingTools.join(', ');
+    } else {
+      toolListing = existingTools.map((t) => {
+        const triggers = Array.isArray(t.triggerPhrases) && t.triggerPhrases.length
+          ? `  Triggers: ${t.triggerPhrases.join(', ')}`
+          : '';
+        return `${t.name}: ${t.description || '(no description)'}${triggers ? '\n' + triggers : ''}`;
+      }).join('\n\n');
+    }
+  }
+  const systemPrompt = SYSTEM_PROMPTS.skepticV2.replace('{existingTools}', toolListing);
+  const { assistantText, newMessages } = await callLlm(
+    state.messages,
+    userInput,
+    systemPrompt,
+    modelConfig,
+    null,
+    state._systemPromptOverride || null
+  );
+  const { overlaps, clear } = parseSkepticResult(assistantText);
+  // If overlaps found and not yet surfaced — block and mark surfaced
+  if (overlaps.length > 0 && !state.skepticOverlapSurfaced) {
+    return {
+      nextState: {
+        ...state,
+        phase: 'skeptic',
+        messages: newMessages,
+        skepticOverlaps: overlaps,
+        skepticOverlapSurfaced: true
+      },
+      assistantText,
+      specUpdate: null,
+      actions: [],
+      phaseChanged: false
+    };
+  }
+  // If overlaps were surfaced, advance after user has replied
+  if (state.skepticOverlapSurfaced && userInput !== null) {
+    return {
+      nextState: {
+        ...state,
+        phase: 'description',
+        messages: newMessages,
+        skepticOverlaps: state.skepticOverlaps,
+        skepticOverlapSurfaced: state.skepticOverlapSurfaced
+      },
+      assistantText,
+      specUpdate: null,
+      actions: [],
+      phaseChanged: true
+    };
+  }
+  // No overlaps — always require the user to reply after seeing the skeptic's response.
+  // Never auto-advance on the first skeptic call, even when clear === true, so the LLM's
+  // response is shown before the phase transitions.
+  const phaseStart = state.phaseStartIdx || 0;
+  const userMsgsInPhase = state.messages.slice(phaseStart).filter((m) => m.role === 'user');
+  const advance = userMsgsInPhase.length >= 1 && userInput !== null;
+  const nextPhase = advance ? 'description' : 'skeptic';
+  return {
+    nextState: {
+      ...state,
+      phase: nextPhase,
+      messages: newMessages
+    },
+    assistantText,
+    specUpdate: null,
+    actions: [],
+    phaseChanged: advance
+  };
+}
+async function handleJsonPhase({
+  state,
+  userInput,
+  modelConfig,
+  systemPrompt,
+  validator,
+  applySpec,
+  nextPhase
+}) {
+  const effectiveState = userInput !== null
+    ? { ...state, retryCount: 0, lastValidationError: null }
+    : state;
+  const retryHint = userInput !== null ? null : (effectiveState.lastValidationError || null);
+  const { assistantText, newMessages } = await callLlm(
+    effectiveState.messages,
+    userInput,
+    systemPrompt,
+    modelConfig,
+    retryHint,
+    state._systemPromptOverride || null
+  );
+  const extracted = extractJson(assistantText);
+  if (!extracted) {
+    // No JSON found — ask again if retries remain.
+    if (effectiveState.retryCount < 3) {
+      const newRetryHint = 'I could not find a JSON block in your response. Please include a JSON object with the required fields, wrapped in ```json ... ``` fences.';
+      return {
+        nextState: {
+          ...effectiveState,
+          phase: effectiveState.phase,
+          messages: newMessages,
+          retryCount: effectiveState.retryCount + 1,
+          lastValidationError: newRetryHint
+        },
+        assistantText,
+        specUpdate: null,
+        actions: [],
+        phaseChanged: false
+      };
+    }
+    // Too many retries — surface to user, reset retry counter.
+    const exhaustedText = assistantText + '\n\n(Could not extract JSON after 3 attempts — please rephrase or simplify your request.)';
+    const updatedMessages = [...newMessages];
+    const lastMsgMissing = updatedMessages[updatedMessages.length - 1];
+    if (lastMsgMissing && lastMsgMissing.role === 'assistant') {
+      updatedMessages[updatedMessages.length - 1] = { ...lastMsgMissing, content: exhaustedText };
+    } else {
+      updatedMessages.push({ role: 'assistant', content: exhaustedText });
+    }
+    return {
+      nextState: {
+        ...effectiveState,
+        phase: effectiveState.phase,
+        messages: updatedMessages,
+        retryCount: 0,
+        lastValidationError: null
+      },
+      assistantText: exhaustedText,
+      specUpdate: null,
+      actions: [],
+      phaseChanged: false
+    };
+  }
+  const { valid, error } = validator(extracted);
+  if (!valid) {
+    if (effectiveState.retryCount < 3) {
+      const newRetryHint = `The JSON was found but failed validation: ${error}`;
+      return {
+        nextState: {
+          ...effectiveState,
+          phase: effectiveState.phase,
+          messages: newMessages,
+          retryCount: effectiveState.retryCount + 1,
+          lastValidationError: newRetryHint
+        },
+        assistantText,
+        specUpdate: null,
+        actions: [],
+        phaseChanged: false
+      };
+    }
+    const validationExhaustedText = assistantText + `\n\n(Validation failed after 3 attempts: ${error} — please rephrase or simplify your request.)`;
+    const updatedValidationMessages = [...newMessages];
+    const lastMsgValidation = updatedValidationMessages[updatedValidationMessages.length - 1];
+    if (lastMsgValidation && lastMsgValidation.role === 'assistant') {
+      updatedValidationMessages[updatedValidationMessages.length - 1] = { ...lastMsgValidation, content: validationExhaustedText };
+    } else {
+      updatedValidationMessages.push({ role: 'assistant', content: validationExhaustedText });
+    }
+    return {
+      nextState: {
+        ...effectiveState,
+        phase: effectiveState.phase,
+        messages: updatedValidationMessages,
+        retryCount: 0,
+        lastValidationError: null
+      },
+      assistantText: validationExhaustedText,
+      specUpdate: null,
+      actions: [],
+      phaseChanged: false
+    };
+  }
+  // Valid — apply spec update and advance.
+  const specUpdate = applySpec(extracted);
+  return {
+    nextState: {
+      ...effectiveState,
+      phase: nextPhase,
+      spec: { ...effectiveState.spec, ...specUpdate },
+      messages: newMessages,
+      retryCount: 0,
+      lastValidationError: null
+    },
+    assistantText,
+    specUpdate,
+    actions: [],
+    phaseChanged: true
+  };
+}
+async function handleDescription({ state, userInput, modelConfig }) {
+  return handleJsonPhase({
+    state,
+    userInput,
+    modelConfig,
+    systemPrompt: SYSTEM_PROMPTS.description,
+    validator: validateDescription,
+    applySpec: (json) => ({
+      name: json.name,
+      description: json.description,
+      triggerPhrases: json.triggerPhrases
+    }),
+    nextPhase: 'fields'
+  });
+}
+async function handleFields({ state, userInput, modelConfig }) {
+  return handleJsonPhase({
+    state,
+    userInput,
+    modelConfig,
+    systemPrompt: SYSTEM_PROMPTS.fields,
+    validator: validateFields,
+    applySpec: (json) => ({
+      schema: json.schema,
+      category: json.category,
+      consequenceLevel: json.consequenceLevel,
+      requiresConfirmation: json.requiresConfirmation
+    }),
+    nextPhase: 'routing'
+  });
+}
+async function handleRouting({ state, userInput, modelConfig }) {
+  return handleJsonPhase({
+    state,
+    userInput,
+    modelConfig,
+    systemPrompt: SYSTEM_PROMPTS.routing,
+    validator: validateRouting,
+    applySpec: (json) => ({
+      endpointTarget: json.endpointTarget,
+      httpMethod: json.httpMethod,
+      authType: json.authType,
+      paramMap: json.paramMap || {}
+    }),
+    nextPhase: 'deps'
+  });
+}
+async function handleDeps({ state, userInput, modelConfig }) {
+  return handleJsonPhase({
+    state,
+    userInput,
+    modelConfig,
+    systemPrompt: SYSTEM_PROMPTS.deps,
+    validator: validateDeps,
+    applySpec: (json) => ({
+      tags: json.tags,
+      dependsOn: json.dependsOn
+    }),
+    nextPhase: 'confirm'
+  });
+}
+async function handleConfirm({ state, userInput, modelConfig }) {
+  // Build a readable spec summary for the system prompt.
+  const specSummary = JSON.stringify(state.spec, null, 2);
+  const systemPrompt =
+    SYSTEM_PROMPTS.confirm +
+    '\n\nCurrent spec:\n```json\n' + specSummary + '\n```';
+  const { assistantText, newMessages } = await callLlm(
+    state.messages,
+    userInput,
+    systemPrompt,
+    modelConfig,
+    null,
+    state._systemPromptOverride || null
+  );
+  const confirmed = typeof userInput === 'string' && /^yes$/i.test(userInput.trim());
+  return {
+    nextState: {
+      ...state,
+      phase: confirmed ? 'generate' : 'confirm',
+      messages: newMessages
+    },
+    assistantText,
+    specUpdate: null,
+    actions: [],
+    phaseChanged: confirmed
+  };
+}
+function handleAutoAdvance({ state, assistantMessage, actions, nextPhase }) {
+  return {
+    nextState: {
+      ...state,
+      phase: nextPhase,
+      retryCount: 0,
+      lastValidationError: null
+    },
+    assistantText: assistantMessage,
+    specUpdate: null,
+    actions,
+    phaseChanged: true
+  };
+}
+function handleGenerate({ state, projectRoot }) {
+  // Derive expected file paths from the spec name.
+  const toolName = state.spec.name || 'unnamed_tool';
+  const toolPath = projectRoot
+    ? `${projectRoot}/tools/${toolName}.js`
+    : `tools/${toolName}.js`;
+  const testPath = projectRoot
+    ? `${projectRoot}/tools/${toolName}.test.js`
+    : `tools/${toolName}.test.js`;
+  const actions = [
+    {
+      type: 'write_file',
+      payload: { toolPath, testPath, barrelDiff: null }
+    }
+  ];
+  return handleAutoAdvance({
+    state,
+    assistantMessage: `Generating tool files for ${toolName}…`,
+    actions,
+    nextPhase: 'test'
+  });
+}
+function handleTest({ state }) {
+  const toolName = state.spec.name || 'unnamed_tool';
+  const actions = [
+    {
+      type: 'run_tests',
+      payload: { command: `npm test -- ${toolName}` }
+    }
+  ];
+  return handleAutoAdvance({
+    state,
+    assistantMessage: 'Running tests…',
+    actions,
+    nextPhase: 'evals'
+  });
+}
+async function handleEvals({ state, userInput, modelConfig, projectConfig }) {
+  // If the user just typed 'default' or similar, use the default mix
+  const isDefault = typeof userInput === 'string' && /^default/i.test(userInput.trim());
+  if (isDefault) {
+    const DEFAULT_MIX = {
+      golden: { total: 10 },
+      labeled: { straightforward: 3, ambiguous: 3, edge: 2, adversarial: 2 }
+    };
+    const evalMix = projectConfig?.evals?.defaultMix || DEFAULT_MIX;
+    const newMessages = [...state.messages, { role: 'user', content: userInput }];
+    return {
+      nextState: {
+        ...state,
+        phase: 'verifiers',
+        spec: { ...state.spec, evalMix },
+        messages: newMessages
+      },
+      assistantText: 'Using default eval mix. Generating eval cases…',
+      specUpdate: { evalMix },
+      actions: [{ type: 'write_evals', payload: { evalMix } }],
+      phaseChanged: true
+    };
+  }
+  // Otherwise use JSON phase to let user customize
+  const result = await handleJsonPhase({
+    state,
+    userInput,
+    modelConfig,
+    systemPrompt: SYSTEM_PROMPTS.evalsInteractive,
+    validator: validateEvalMix,
+    applySpec: (json) => ({ evalMix: json.evalMix }),
+    nextPhase: 'verifiers'
+  });
+  // Attach the write_evals action if phase advanced
+  if (result.phaseChanged) {
+    const evalMix = result.nextState.spec.evalMix;
+    return {
+      ...result,
+      actions: [{ type: 'write_evals', payload: { evalMix } }]
+    };
+  }
+  return result;
+}
+function handleVerifiers({ state }) {
+  return handleAutoAdvance({
+    state,
+    assistantMessage: 'Generating verifier stubs…',
+    actions: [{ type: 'write_verifiers' }],
+    nextPhase: 'done'
+  });
+}
+function handleDone({ state }) {
+  return {
+    nextState: { ...state, phase: 'done' },
+    assistantText: 'The tool forge dialogue is complete. Your tool has been generated.',
+    specUpdate: null,
+    actions: [],
+    phaseChanged: false
+  };
+}
+// ── Core export ────────────────────────────────────────────────────────────
+/**
+ * Advance the forge state machine by one step.
+ *
+ * @param {object} opts
+ * @param {object}        opts.state                 - Current forge state (from createInitialState or prior forgeStep)
+ * @param {string|null}   opts.userInput             - User message, or null for auto-advance phases
+ * @param {object}        opts.modelConfig           - { provider, apiKey, model }
+ * @param {string[]}      [opts.existingTools]       - Names of tools already in the registry
+ * @param {object}        [opts.projectConfig]       - Project-level config (passed through, not consumed here)
+ * @param {string}        [opts.projectRoot]         - Absolute path to project root (used for file path construction)
+ * @param {string}        [opts.systemPromptOverride] - When provided and non-empty, replaces the phase's
+ *                                                      default system prompt for this turn only.
+ *                                                      Does not mutate forgeState.
+ * @returns {Promise<{
+ *   nextState: object,
+ *   assistantText: string,
+ *   specUpdate: object|null,
+ *   actions: Array<object>,
+ *   phaseChanged: boolean
+ * }>}
+ */
+export async function forgeStep({
+  state,
+  userInput,
+  modelConfig,
+  existingTools = [],
+  projectConfig,
+  projectRoot,
+  systemPromptOverride
+}) {
+  const phase = state.phase;
+  // When a systemPromptOverride is provided, temporarily stamp it onto the
+  // state so callLlm can read it.  It is stripped from nextState before return
+  // so it does not persist across turns.
+  const s = systemPromptOverride
+    ? { ...state, _systemPromptOverride: systemPromptOverride }
+    : state;
+  let result;
+  switch (phase) {
+    case 'explore':
+      result = await handleExplore({ state: s, userInput, modelConfig });
+      break;
+    case 'skeptic':
+      result = await handleSkeptic({ state: s, userInput, modelConfig, existingTools });
+      break;
+    case 'description':
+      result = await handleDescription({ state: s, userInput, modelConfig });
+      break;
+    case 'fields':
+      result = await handleFields({ state: s, userInput, modelConfig });
+      break;
+    case 'routing':
+      result = await handleRouting({ state: s, userInput, modelConfig });
+      break;
+    case 'deps':
+      result = await handleDeps({ state: s, userInput, modelConfig });
+      break;
+    case 'confirm':
+      result = await handleConfirm({ state: s, userInput, modelConfig });
+      break;
+    case 'generate':
+      result = handleGenerate({ state: s, projectRoot });
+      break;
+    case 'test':
+      result = handleTest({ state: s });
+      break;
+    case 'evals':
+      result = await handleEvals({ state: s, userInput, modelConfig, projectConfig });
+      break;
+    case 'verifiers':
+      result = handleVerifiers({ state: s });
+      break;
+    case 'done':
+      result = handleDone({ state: s });
+      break;
+    default:
+      throw new Error(`forgeStep: unknown phase "${phase}".`);
+  }
+  // Strip the override flag from nextState so it doesn't persist across turns.
+  if (result && result.nextState && '_systemPromptOverride' in result.nextState) {
+    const { _systemPromptOverride: _stripped, ...cleanState } = result.nextState;
+    result = { ...result, nextState: cleanState };
+  }
+  return result;
+}