npm - onbuzz - Versions diffs - 4.8.0 → 4.8.2 - Mend

onbuzz 4.8.0 → 4.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/package.json +1 -1
package/src/core/__tests__/agentPool.test.js +185 -0
package/src/core/__tests__/agentScheduler.nativePromptPick.test.js +319 -0
package/src/core/__tests__/agentScheduler.taskListInjection.test.js +94 -0
package/src/core/agentPool.js +319 -0
package/src/core/agentScheduler.js +216 -2
package/src/services/__tests__/conversationCompactionService.test.js +141 -0
package/src/services/__tests__/modelRouterNaming.test.js +41 -23
package/src/services/conversationCompactionService.js +120 -46
package/src/tools/__tests__/baseTool.test.js +171 -0
package/src/tools/__tests__/codeMapTool.test.js +179 -0
package/src/tools/__tests__/taskManagerTool.test.js +141 -0
package/src/tools/baseTool.js +89 -1
package/src/tools/openaiFunctionSchemas.js +14 -0
package/src/tools/skillsTool.js +282 -277
package/src/tools/taskManagerTool.js +72 -2
package/src/utilities/constants.js +19 -1

package/src/core/agentPool.js CHANGED Viewed

@@ -23,6 +23,17 @@ import DirectoryAccessManager from '../utilities/directoryAccessManager.js';
 import { getVisualEditorBridge } from '../services/visualEditorBridge.js';
 class AgentPool {
+  // Stopwords ignored by _tokenize before the _jaccard / overlap checks
+  // used for auto-save-as-plan dedup. Tight list: only words common to
+  // nearly every English sentence, so sharing them never inflates
+  // similarity. Words under 3 chars are absent: _tokenize drops those.
+  static _STOPWORDS = new Set([
+    'the', 'and', 'for', 'but', 'are', 'was', 'were',
+    'has', 'have', 'had', 'this', 'that', 'with', 'will',
+    'you', 'your', 'our', 'their', 'them', 'they',
+    'can', 'could', 'should', 'would',
+  ]);
   constructor(config, logger, stateManager, contextManager, toolsRegistry = null) {
     this.config = config;
     this.logger = logger;
@@ -380,6 +391,18 @@ class AgentPool {
           originalLength: baseSystemPrompt?.length || 0,
           enhancedLength: enhancedSystemPrompt?.length || 0
         });
+        // The scheduler caches per-(agent, model) Responses-API prompts
+        // built from this agent's `originalSystemPrompt` + capabilities.
+        // Both inputs just changed, so any cached rebuilds are stale.
+        // No-op when the scheduler isn't attached (tests / very-early
+        // boot) or when it predates this method (old binaries during
+        // a rolling upgrade).
+        try {
+          this.scheduler?._invalidateNativePromptCache?.(agentId);
+        } catch (e) {
+          this.logger.debug?.('Failed to invalidate native prompt cache', { agentId, error: e.message });
+        }
       } catch (error) {
         this.logger.error(`Failed to regenerate system prompt with updated capabilities`, {
           agentId,
@@ -1425,6 +1448,23 @@ class AgentPool {
       this._autoCreateTaskForMessage(agent, queuedMessage, 'user', 'high');
     }
+    // ── Auto-save substantive user messages as plan/* memories ───────
+    // Observed in production: across 670-message agent sessions the
+    // agent NEVER wrote a memory voluntarily. Compaction then summarized
+    // away the user's literal asks, the agent paraphrased what was left,
+    // and ended up doing work the user never requested. Belt-and-
+    // suspenders alongside the OPERATING POSTURE prompt nudge: when a
+    // user message looks substantive (long, or contains a numbered/
+    // bulleted multi-part ask), the SYSTEM saves it as `plan/<auto>` so
+    // the system-prompt auto-injection makes the user's words visible
+    // every turn — even if the agent itself never thought to save.
+    // Best-effort: never block the message-enqueue path.
+    this._autoSaveUserMessageAsPlan(agentId, queuedMessage).catch(err => {
+      this.logger.debug?.('Auto-save of user message as plan/* failed (continuing)', {
+        agentId, error: err?.message,
+      });
+    });
     await this.persistAgentState(agentId);
     // If we cleared a delay, surface it on the WS so the delay chip in the
@@ -1544,6 +1584,285 @@ class AgentPool {
    * @param {string} priority - Task priority ('high', 'medium', 'low')
    * @private
    */
+  /**
+   * Save a substantive user message as a `plan/*` memory automatically.
+   *
+   * Rationale (Talisman case study, May 2026): agents observed in
+   * production never wrote a single memory across hundreds of
+   * messages, even when the OPERATING POSTURE prompt explicitly told
+   * them to. The user's literal ask then got lost in compaction and
+   * the agent went off-course. This system-level safety net puts the
+   * user's message into the durable plan/* store — which the system
+   * prompt auto-injects every turn — without depending on the model
+   * making the call.
+   *
+   * What counts as "substantive":
+   *   - Content length ≥ 60 chars (~12 words) — short acks/yes-no don't qualify
+   *   - AND any of:
+   *       • contains a numbered list ("1.", "2.", "3." …)
+   *       • contains a bullet list (-, *, • at line start)
+   *       • OR is ≥ 120 chars (longer than a one-line ack)
+   *
+   * What gets saved:
+   *   - title: `plan/user-<short-slug>-<timestamp>`
+   *   - description: "auto-saved from user message at <iso>"
+   *   - content: the verbatim user message
+   *
+   * The agent can rename, consolidate, or delete these later. They
+   * exist as a fail-safe — if the agent does its job and saves its
+   * own better-named plan, these auto-saves can be cleaned up. If
+   * the agent doesn't, at least the user's words survive compaction.
+   *
+   * @param {string} agentId
+   * @param {Object} message - The queued user message
+   * @private
+   */
+  async _autoSaveUserMessageAsPlan(agentId, message) {
+    const content = typeof message?.content === 'string' ? message.content : '';
+    if (!content) return;
+    if (!this._looksSubstantive(content)) return;
+    // Lazy-load to keep agentPool's load order light. The same import
+    // pattern as agentScheduler's plan injection.
+    let memoryService;
+    try {
+      const mod = await import('../services/memoryService.js');
+      memoryService = mod.getMemoryService(this.logger);
+      await memoryService.initialize();
+    } catch (e) {
+      this.logger.debug?.('Auto-save plan: memory service unavailable', { error: e.message });
+      return;
+    }
+    // ── Deduplication ────────────────────────────────────────────────
+    // Users repeat themselves ("I repeat my old message", "did you do
+    // it all?" + paste the same thing). Without dedup the auto-saver
+    // would create N copies of essentially the same plan. Load
+    // existing plan/user-* memories and skip when the new content is
+    // ≥70% similar to any of them (Jaccard over normalized word sets).
+    let existingPlans = [];
+    try {
+      const all = await memoryService.loadMemories(agentId);
+      existingPlans = (all || []).filter(m =>
+        typeof m?.title === 'string' && m.title.startsWith('plan/user-')
+      );
+    } catch (e) {
+      // Treat unreadable store as empty — we may still write a fresh entry.
+      this.logger.debug?.('Auto-save plan: existing memories unreadable', { agentId, error: e.message });
+    }
+    const newTokens = this._tokenize(content);
+    for (const existing of existingPlans) {
+      const existingTokens = this._tokenize(existing.content || '');
+      const sim = this._jaccard(newTokens, existingTokens);
+      const containment = this._overlapCoefficient(newTokens, existingTokens);
+      // Jaccard catches near-identical reformulations. Containment
+      // catches the "I repeat my old message — <same content>" case
+      // where the user re-pastes the original plus a preamble. Either
+      // signal is enough to suppress the duplicate.
+      if (sim >= 0.7 || containment >= 0.85) {
+        this.logger.info?.('Auto-save plan: skipping near-duplicate of existing plan', {
+          agentId, existingTitle: existing.title,
+          jaccard: sim.toFixed(2), containment: containment.toFixed(2),
+        });
+        return;
+      }
+    }
+    // ── Per-agent cap ────────────────────────────────────────────────
+    // Bound the total auto-saved plans so an active session doesn't
+    // bloat the agent's plan/* namespace indefinitely. Keep the K most
+    // recent; delete the oldest auto-saves beyond that.
+    const AUTO_PLAN_CAP = 8;
+    const existingAutoSaves = existingPlans
+      .filter(m => /^plan\/user-/.test(m.title))
+      .sort((a, b) => String(a.createdAt || '').localeCompare(String(b.createdAt || '')));
+    while (existingAutoSaves.length >= AUTO_PLAN_CAP) {
+      const oldest = existingAutoSaves.shift();
+      try {
+        await memoryService.deleteMemory(agentId, oldest.id);
+        this.logger.info?.('Auto-save plan: retired oldest auto-save to keep cap', {
+          agentId, retiredTitle: oldest.title, cap: AUTO_PLAN_CAP,
+        });
+      } catch (e) {
+        // Non-fatal — if we can't delete the oldest, just skip this entry
+        // and proceed with the write. Worst case the plan list grows
+        // by one beyond the cap — still bounded over time.
+        this.logger.debug?.('Auto-save plan: retire-oldest failed', { agentId, error: e.message });
+        break;
+      }
+    }
+    // ── Write the new memory ─────────────────────────────────────────
+    const firstLine = (content.match(/[^\n]+/) || [''])[0].trim();
+    const slug = firstLine
+      .toLowerCase()
+      .replace(/[^a-z0-9]+/g, '-')
+      .replace(/^-+|-+$/g, '')
+      .slice(0, 40) || 'request';
+    const ts = new Date().toISOString().slice(0, 19).replace(/[:T]/g, '-');
+    const title = `plan/user-${slug}-${ts}`;
+    try {
+      await memoryService.addMemory(agentId, {
+        title,
+        description: `Auto-saved from user message at ${message.timestamp || new Date().toISOString()}`,
+        content,
+      });
+      this.logger.info?.('Auto-saved user message as plan/* memory', {
+        agentId, title, contentLength: content.length,
+      });
+    } catch (e) {
+      this.logger.debug?.('Auto-save plan: write failed', { agentId, title, error: e.message });
+    }
+  }
+  /**
+   * Tokenize a string into a lowercased word set for similarity checks.
+   * Strips punctuation, drops short words (<3 chars), and drops a
+   * small stopword set so that common words like "the" / "and" don't
+   * inflate similarity scores between otherwise different messages.
+   * @private
+   */
+  _tokenize(s) {
+    if (typeof s !== 'string') return new Set();
+    return new Set(
+      s.toLowerCase()
+       .replace(/[^a-z0-9\s]+/g, ' ')
+       .split(/\s+/)
+       .filter(w => w.length >= 3 && !AgentPool._STOPWORDS.has(w))
+    );
+  }
+  /**
+   * Jaccard similarity over two word sets.
+   * @private
+   */
+  _jaccard(a, b) {
+    if (a.size === 0 && b.size === 0) return 1;
+    if (a.size === 0 || b.size === 0) return 0;
+    let intersection = 0;
+    for (const w of a) if (b.has(w)) intersection += 1;
+    return intersection / (a.size + b.size - intersection);
+  }
+  /**
+   * Overlap coefficient — intersection / size-of-smaller-set.
+   * Returns 1.0 when one set is fully contained in the other,
+   * regardless of how much the other set adds. Catches the "user
+   * re-pastes their request with a preamble" duplicate case where
+   * Jaccard would mark the messages as merely similar.
+   * @private
+   */
+  _overlapCoefficient(a, b) {
+    if (a.size === 0 || b.size === 0) return 0;
+    let intersection = 0;
+    for (const w of a) if (b.has(w)) intersection += 1;
+    return intersection / Math.min(a.size, b.size);
+  }
+  /**
+   * Heuristic — does this user message look like a real request worth
+   * preserving as a plan/*? Errs on the side of saving more (recall
+   * over precision) — a stray auto-save is cheap; a lost user request
+   * is catastrophic.
+   * @private
+   */
+  _looksSubstantive(text) {
+    if (typeof text !== 'string') return false;
+    const t = text.trim();
+    if (t.length < 30) return false;
+    // Tool-result wrappers and previous-task boundaries are not user voice.
+    if (t.startsWith('[Tool Results') || t.startsWith('[Previous Task')) return false;
+    // ── Pollution filter 1: dominated by questions ────────────────────
+    // A message that's mostly questions wants an ANSWER, not a plan.
+    // If the majority of non-empty lines end in '?' (or are
+    // question-shaped), this is a query, not a request.
+    if (this._dominatedByQuestions(t)) return false;
+    // ── Pollution filter 2: list items are just refs (paths, urls) ───
+    // A list of file paths / URLs / commit hashes is the user pointing
+    // the agent at things, not a multi-part plan. Save it only if the
+    // surrounding prose carries imperative intent — and even then the
+    // length gate handles that path.
+    const hasList = /^\s*(?:\d+[.)]|[-*•])\s/m.test(t);
+    if (hasList && this._listItemsAreJustReferences(t)) return false;
+    // ── Now apply the structural triggers ────────────────────────────
+    // Numbered list — "1." / "1)" at a line start. Multi-part intent.
+    // Require a minimum total length to avoid "1. yes 2. no" nonsense.
+    if (/^\s*\d+[.)]\s/m.test(t) && t.length >= 60) return true;
+    // Bullet list at line start. Same — strong intent signal + length.
+    if (/^\s*[-*•]\s/m.test(t) && t.length >= 60) return true;
+    // Free-form prose with no list markers must be substantial AND
+    // contain an imperative-like signal (a verb you'd give as an
+    // order). Raised from 120 → 150 to skip more pleasantries.
+    if (t.length >= 150 && this._hasImperativeSignal(t)) return true;
+    return false;
+  }
+  /**
+   * Heuristic: is this message mostly questions?
+   * @private
+   */
+  _dominatedByQuestions(t) {
+    // Split into non-empty lines.
+    const lines = t.split(/\r?\n/).map(l => l.trim()).filter(Boolean);
+    if (lines.length === 0) return false;
+    // Strip leading list markers so we can look at the line's intent.
+    const stripMarker = (l) => l.replace(/^(?:\d+[.)]|[-*•])\s+/, '');
+    let questionLines = 0;
+    for (const raw of lines) {
+      const line = stripMarker(raw);
+      // Ends in '?', OR starts with a question word at the line head.
+      if (/\?\s*$/.test(line) || /^(?:what|why|how|when|where|who|which|is\b|are\b|do\b|does\b|can\b|could\b|should\b|would\b)\b/i.test(line)) {
+        questionLines += 1;
+      }
+    }
+    // Strict-majority rule: more than half of lines are questions.
+    return questionLines * 2 > lines.length;
+  }
+  /**
+   * Heuristic: are the list items in this message just references
+   * (file paths, URLs, commit hashes) with no imperative verb of their own?
+   * @private
+   */
+  _listItemsAreJustReferences(t) {
+    const lines = t.split(/\r?\n/).map(l => l.trim()).filter(Boolean);
+    const listItems = lines.filter(l => /^(?:\d+[.)]|[-*•])\s/.test(l));
+    if (listItems.length === 0) return false;
+    let refLikeCount = 0;
+    for (const li of listItems) {
+      const body = li.replace(/^(?:\d+[.)]|[-*•])\s+/, '').trim();
+      // Only treat as a "reference" if the line IS the reference —
+      // i.e. a path/URL/hash with no surrounding English. A short bug
+      // description like "login button does nothing on Safari" still
+      // counts as content, not a reference.
+      //   Path: contains '/' or '\' OR starts with '.' AND has NO spaces
+      //   URL:  starts with http(s)://
+      //   Hash: 7-40 hex chars only, no spaces
+      const isPath = (/[/\\]/.test(body) || /^\./.test(body)) && !/\s/.test(body);
+      const isUrl  = /^https?:\/\//.test(body) && !/\s/.test(body);
+      const isHash = /^[0-9a-f]{7,40}$/i.test(body);
+      if (isPath || isUrl || isHash) refLikeCount += 1;
+    }
+    // Strict-majority of list items are reference-like → ignore.
+    return refLikeCount * 2 > listItems.length;
+  }
+  /**
+   * Heuristic: does the message contain a verb that signals "do this"?
+   * Conservative — favors recall over precision.
+   * @private
+   */
+  _hasImperativeSignal(t) {
+    // Word-boundary match against a set of common imperative verbs.
+    // Order matters only for readability — we check membership.
+    return /\b(?:fix|add|build|implement|create|change|remove|delete|update|refactor|rewrite|migrate|integrate|configure|setup|set\s+up|design|generate|make|write|test|verify|ensure|review|optimize|improve|replace|move|rename|extract|split|merge|deploy|publish|ship|release|debug|investigate|analyze|reproduce|escalate|prioritize|schedule)\b/i.test(t);
+  }
   _autoCreateTaskForMessage(agent, message, source, priority) {
     if (!agent.taskList) {
       agent.taskList = { tasks: [], lastUpdated: new Date().toISOString() };

package/src/core/agentScheduler.js CHANGED Viewed

@@ -60,6 +60,19 @@ class AgentScheduler {
     // Initialize ContextInjectionService for file attachments
     this.contextInjectionService = new ContextInjectionService({}, logger);
+    // Per-turn system-prompt rebuild cache for native-API models.
+    // Agents persist a `systemPrompt` baked at create-time for the
+    // chat-completion shape (text descriptions of every tool). When a
+    // turn targets a Responses-API model (Codex / o-series / gpt-5-pro),
+    // we want a TRIMMED prompt that omits text docs for tools whose
+    // structured schemas are sent in `tools:`. Rebuilding fresh each
+    // turn would be wasteful — agents typically stay on the same model
+    // for many turns — so we memoize per (agentId, modelName).
+    //
+    // Cleared on process restart and on agent updates that change the
+    // base prompt or capabilities (see `_invalidateNativePromptCache`).
+    this._nativePromptCache = new Map();   // `${agentId}|${modelName}` → string
     // Initialize FlowContextService for flow execution context
     this.flowContextService = new FlowContextService({}, logger);
@@ -1919,8 +1932,17 @@ class AgentScheduler {
       // After compaction, retrieve messages from AgentPool (will use compacted if available)
       const messagesToSend = await this.agentPool.getMessagesForAI(agentId, targetModel);
-      // Inject TaskManager instructions for AGENT mode
-      let enhancedSystemPrompt = agent.systemPrompt;
+      // ── Pick the right system-prompt shape for the target model ──
+      // Default: use the agent's persisted `systemPrompt` (baked at
+      // create-time with full text descriptions for every tool — the
+      // chat-completion shape). For models that use the Responses API
+      // (native function-calling), rebuild a trimmed version that
+      // omits text docs for tools whose structured schemas we send in
+      // `tools:`. Falls back to the persisted prompt whenever the
+      // model's apiType is unknown OR the agent has no stored original
+      // prompt — preserves existing behaviour for old agents and
+      // unknown models. See `_pickSystemPromptForModel`.
+      let enhancedSystemPrompt = await this._pickSystemPromptForModel(agent, targetModel);
       if (agent.mode === AGENT_MODES.AGENT) {
         const taskManagerInstruction = "\n\nIMPORTANT: You are in AGENT mode. The use of TaskManager tool is mandatory.\n\n" +
           "TASK LIFECYCLE (follow this, don't improvise):\n" +
@@ -2083,6 +2105,48 @@ class AgentScheduler {
         });
       }
+      // ── Auto-inject CURRENT TASK LIST every turn ───────────────────
+      // The task list lives in `agent.taskList.tasks` — durable, never
+      // affected by compaction. But the conversation messages that
+      // CREATED those tasks ARE compacted, so an agent that lost its
+      // recent history may forget the task list exists. That's how
+      // the Talisman bug happened: the agent called sync with a fresh
+      // 4-task plan, silently wiping 9 in-flight tasks the user had
+      // implicitly requested. Surface the current task list to the
+      // agent every turn so it can never "forget" what's already on
+      // the plan. Cheap (a few hundred chars), invariant to
+      // compaction, and a natural deterrent against destructive sync.
+      try {
+        const tasks = agent.taskList?.tasks || [];
+        if (Array.isArray(tasks) && tasks.length > 0) {
+          const lines = ['\n\n## CURRENT TASK LIST (live from agent state — survives compaction)\n'];
+          lines.push('These tasks exist in your durable state RIGHT NOW. If the conversation history doesn\'t mention them, that\'s because compaction summarized that section away — the tasks are still there.\n');
+          lines.push('Before issuing `taskmanager sync`, READ this list. If you sync with a different plan, you will be dropping these.\n');
+          // Compact, scannable. Title + status + priority is enough.
+          const byStatus = { in_progress: [], pending: [], completed: [], cancelled: [] };
+          for (const t of tasks) {
+            const status = t.status || 'pending';
+            (byStatus[status] || (byStatus[status] = [])).push(t);
+          }
+          const order = ['in_progress', 'pending', 'completed', 'cancelled'];
+          for (const status of order) {
+            const group = byStatus[status] || [];
+            if (group.length === 0) continue;
+            lines.push(`\n**${status}** (${group.length}):`);
+            for (const t of group) {
+              const pri = t.priority ? ` [${t.priority}]` : '';
+              lines.push(`- ${t.title}${pri}`);
+            }
+          }
+          enhancedSystemPrompt = (enhancedSystemPrompt || '') + lines.join('\n');
+        }
+      } catch (taskInjectErr) {
+        // Best-effort — never block the turn on this.
+        this.logger.warn(`Task list injection failed for agent ${agentId} (continuing without)`, {
+          error: taskInjectErr?.message,
+        });
+      }
       // Check if streaming is enabled - consider both agent config and user message preference
       // Get the last user message to check for streaming preference
       const lastUserMsg = [...conversationHistory].reverse().find(m => m.role === 'user');
@@ -2169,6 +2233,156 @@ class AgentScheduler {
     }
   }
+  /**
+   * Choose the right base system prompt for the target model.
+   *
+   *   • If the model's catalog entry says it uses the Responses API
+   *     ('responses' in its api_type / capabilities) AND the agent has
+   *     an `originalSystemPrompt` we can rebuild from, return a
+   *     freshly-built prompt that omits text descriptions for tools
+   *     with native function schemas (see baseTool.js — those tools'
+   *     structured schemas in `tools:` are the canonical source for
+   *     these models, so the text docs are pure duplication).
+   *
+   *   • Otherwise return the agent's persisted `systemPrompt` exactly
+   *     as it is today. This covers:
+   *       – chat-completion models (no native function calling)
+   *       – models we can't classify (modelsService offline / catalog
+   *         field missing) — fail safe to old behaviour
+   *       – very old agents persisted before `originalSystemPrompt`
+   *         was stored — fail safe to old behaviour
+   *
+   * Result is memoized per `(agentId, targetModel)` to avoid rebuilding
+   * on every turn. The cache is invalidated whenever the agent's base
+   * prompt or capabilities change (see `_invalidateNativePromptCache`).
+   *
+   * @private
+   * @param {Object} agent - Agent record
+   * @param {string} targetModel - Model name about to be called
+   * @returns {Promise<string>} The prompt to use as the base
+   */
+  async _pickSystemPromptForModel(agent, targetModel) {
+    // 1. Resolve the model's API type. Unknown → use persisted prompt.
+    const apiType = this._resolveModelApiType(targetModel);
+    if (apiType !== 'responses') return agent.systemPrompt;
+    // 2. Need the original (un-enhanced) prompt to rebuild from. Without
+    //    it we can't safely re-add the trimmed tool docs — fall back
+    //    to the persisted shape (which works for chat-completion and
+    //    is also accepted by Responses API, just with the duplication
+    //    cost). This is the back-compat path for legacy agents.
+    if (!agent.originalSystemPrompt) return agent.systemPrompt;
+    // 3. Cache lookup.
+    const cacheKey = `${agent.id}|${targetModel}`;
+    const cached = this._nativePromptCache.get(cacheKey);
+    if (cached) return cached;
+    // 4. Rebuild. The agentPool stores the toolsRegistry — reuse it so
+    //    we go through the exact same code path that built the original
+    //    prompt, just with apiType set. Skills index + the rest of the
+    //    augmentation must be reapplied; mirror what createAgent does.
+    try {
+      const registry = this.agentPool?.toolsRegistry;
+      if (!registry) return agent.systemPrompt;
+      let rebuilt = registry.enhanceSystemPrompt(
+        agent.originalSystemPrompt,
+        agent.capabilities || [],
+        { apiType: 'responses' },
+      );
+      // Re-inject ASSIGNED SKILLS block if present (createAgent appends
+      // this after enhanceSystemPrompt — see agentPool.js:108).
+      if (Array.isArray(agent.skills) && agent.skills.length > 0) {
+        try {
+          const { getSkillsService } = await import('../services/skillsService.js');
+          const skillsService = getSkillsService(this.logger);
+          await skillsService.initialize();
+          const summaries = await skillsService.getSkillSummaries(agent.skills);
+          if (summaries.length > 0) {
+            rebuilt += '\n\n## ASSIGNED SKILLS\n\n';
+            rebuilt += 'Use the skills tool to browse and load skill content. Use "describe" to see sections, "read-section" to load specific parts.\n\n';
+            for (const s of summaries) {
+              const sections = s.sections?.length ? `\n    Sections: ${s.sections.map(h => h.replace(/^#+\s*/, '')).join(', ')}` : '';
+              rebuilt += `- **${s.name}** (${s.lineCount} lines): ${s.description}${sections}\n`;
+            }
+          }
+        } catch (e) {
+          this.logger?.debug?.('Failed to re-inject skills index for native prompt', { error: e.message });
+        }
+      }
+      this._nativePromptCache.set(cacheKey, rebuilt);
+      this.logger?.debug?.('Built native-API system prompt', {
+        agentId: agent.id,
+        targetModel,
+        originalLength: agent.systemPrompt?.length || 0,
+        rebuiltLength: rebuilt.length,
+        savedTokensApprox: Math.round(((agent.systemPrompt?.length || 0) - rebuilt.length) / 4),
+      });
+      return rebuilt;
+    } catch (err) {
+      // Anything goes wrong → fall back to old behaviour. Failing
+      // closed (no prompt) would break the agent's turn; failing open
+      // (use chat-completion shape) just keeps the duplication.
+      this.logger?.warn?.('Native system-prompt rebuild failed — using persisted prompt', {
+        agentId: agent.id,
+        targetModel,
+        error: err.message,
+      });
+      return agent.systemPrompt;
+    }
+  }
+  /**
+   * Look up a model's API type from the catalog. Returns 'responses',
+   * 'chat_completion', or undefined when unknown. The catalog exposes
+   * `api_type` as an array and/or `capabilities.responses`/`capabilities.chatCompletion`
+   * — mirror the backend's _inferRouting precedence so the CLI's
+   * classification matches the backend's routing decision exactly.
+   * @private
+   */
+  _resolveModelApiType(modelName) {
+    try {
+      if (!this.modelsService || typeof this.modelsService.getModels !== 'function') return undefined;
+      const models = this.modelsService.getModels();
+      const m = models.find(x => x.name === modelName);
+      if (!m) return undefined;
+      const apiType = Array.isArray(m.api_type) ? m.api_type : (m.api_type ? [m.api_type] : []);
+      const caps = m.capabilities || {};
+      // Mirrors backend services/llmServiceFactory.js _inferRouting:
+      //   responses if api_type contains 'responses' AND not 'chat_completion'
+      //   OR capabilities.responses === 'true' / chatCompletion === 'false'
+      //   OR explicit useResponsesApi flag
+      if (apiType.includes('responses') && !apiType.includes('chat_completion')) return 'responses';
+      if (caps.chatCompletion === 'false' && (caps.responses === 'true' || apiType.includes('responses'))) return 'responses';
+      if (m.useResponsesApi) return 'responses';
+      // Name-based fallback (last resort — only when catalog has no routing data)
+      if (/codex/i.test(modelName) || /gpt.*-pro$/i.test(modelName)) return 'responses';
+      return 'chat_completion';
+    } catch (err) {
+      // Defensive — never block the turn on a classification failure.
+      this.logger?.debug?.('Model apiType resolution failed', { modelName, error: err.message });
+      return undefined;
+    }
+  }
+  /**
+   * Drop cached native prompts for an agent. Called by agentPool when
+   * the base prompt or capabilities change so the next turn rebuilds.
+   * Exposed so agentPool can call it without poking internal state.
+   */
+  _invalidateNativePromptCache(agentId) {
+    for (const key of this._nativePromptCache.keys()) {
+      if (key.startsWith(`${agentId}|`)) {
+        this._nativePromptCache.delete(key);
+      }
+    }
+  }
   /**
    * Get AI response using streaming with WebSocket broadcast
    * @param {string} agentId - Agent ID