onbuzz 4.8.0 → 4.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,6 +23,17 @@ import DirectoryAccessManager from '../utilities/directoryAccessManager.js';
23
23
  import { getVisualEditorBridge } from '../services/visualEditorBridge.js';
24
24
 
25
25
  class AgentPool {
26
+ // Stopwords for the _tokenize / _jaccard similarity check used by
27
+ // auto-save-as-plan dedup. Tight list — only words that appear in
28
+ // virtually every English sentence regardless of content, so that
29
+ // their presence in both messages doesn't inflate similarity.
30
+ static _STOPWORDS = new Set([
31
+ 'the', 'and', 'for', 'but', 'are', 'was', 'were',
32
+ 'has', 'have', 'had', 'this', 'that', 'with', 'will',
33
+ 'you', 'your', 'our', 'their', 'them', 'they',
34
+ 'can', 'could', 'should', 'would',
35
+ ]);
36
+
26
37
  constructor(config, logger, stateManager, contextManager, toolsRegistry = null) {
27
38
  this.config = config;
28
39
  this.logger = logger;
@@ -380,6 +391,18 @@ class AgentPool {
380
391
  originalLength: baseSystemPrompt?.length || 0,
381
392
  enhancedLength: enhancedSystemPrompt?.length || 0
382
393
  });
394
+
395
+ // The scheduler caches per-(agent, model) Responses-API prompts
396
+ // built from this agent's `originalSystemPrompt` + capabilities.
397
+ // Both inputs just changed, so any cached rebuilds are stale.
398
+ // No-op when the scheduler isn't attached (tests / very-early
399
+ // boot) or when it predates this method (old binaries during
400
+ // a rolling upgrade).
401
+ try {
402
+ this.scheduler?._invalidateNativePromptCache?.(agentId);
403
+ } catch (e) {
404
+ this.logger.debug?.('Failed to invalidate native prompt cache', { agentId, error: e.message });
405
+ }
383
406
  } catch (error) {
384
407
  this.logger.error(`Failed to regenerate system prompt with updated capabilities`, {
385
408
  agentId,
@@ -1425,6 +1448,23 @@ class AgentPool {
1425
1448
  this._autoCreateTaskForMessage(agent, queuedMessage, 'user', 'high');
1426
1449
  }
1427
1450
 
1451
+ // ── Auto-save substantive user messages as plan/* memories ───────
1452
+ // Observed in production: across 670-message agent sessions the
1453
+ // agent NEVER wrote a memory voluntarily. Compaction then summarized
1454
+ // away the user's literal asks, the agent paraphrased what was left,
1455
+ // and ended up doing work the user never requested. Belt-and-
1456
+ // suspenders alongside the OPERATING POSTURE prompt nudge: when a
1457
+ // user message looks substantive (long, or contains a numbered/
1458
+ // bulleted multi-part ask), the SYSTEM saves it as `plan/<auto>` so
1459
+ // the system-prompt auto-injection makes the user's words visible
1460
+ // every turn — even if the agent itself never thought to save.
1461
+ // Best-effort: never block the message-enqueue path.
1462
+ this._autoSaveUserMessageAsPlan(agentId, queuedMessage).catch(err => {
1463
+ this.logger.debug?.('Auto-save of user message as plan/* failed (continuing)', {
1464
+ agentId, error: err?.message,
1465
+ });
1466
+ });
1467
+
1428
1468
  await this.persistAgentState(agentId);
1429
1469
 
1430
1470
  // If we cleared a delay, surface it on the WS so the delay chip in the
@@ -1544,6 +1584,285 @@ class AgentPool {
1544
1584
  * @param {string} priority - Task priority ('high', 'medium', 'low')
1545
1585
  * @private
1546
1586
  */
1587
+ /**
1588
+ * Save a substantive user message as a `plan/*` memory automatically.
1589
+ *
1590
+ * Rationale (Talisman case study, May 2026): agents observed in
1591
+ * production never wrote a single memory across hundreds of
1592
+ * messages, even when the OPERATING POSTURE prompt explicitly told
1593
+ * them to. The user's literal ask then got lost in compaction and
1594
+ * the agent went off-course. This system-level safety net puts the
1595
+ * user's message into the durable plan/* store — which the system
1596
+ * prompt auto-injects every turn — without depending on the model
1597
+ * making the call.
1598
+ *
1599
+ * What counts as "substantive":
1600
+ * - Content length ≥ 60 chars (~12 words) — short acks/yes-no don't qualify
1601
+ * - AND any of:
1602
+ * • contains a numbered list ("1.", "2.", "3." …)
1603
+ * • contains a bullet list (-, *, • at line start)
1604
+ * • OR is ≥ 120 chars (longer than a one-line ack)
1605
+ *
1606
+ * What gets saved:
1607
+ * - title: `plan/user-<short-slug>-<timestamp>`
1608
+ * - description: "auto-saved from user message at <iso>"
1609
+ * - content: the verbatim user message
1610
+ *
1611
+ * The agent can rename, consolidate, or delete these later. They
1612
+ * exist as a fail-safe — if the agent does its job and saves its
1613
+ * own better-named plan, these auto-saves can be cleaned up. If
1614
+ * the agent doesn't, at least the user's words survive compaction.
1615
+ *
1616
+ * @param {string} agentId
1617
+ * @param {Object} message - The queued user message
1618
+ * @private
1619
+ */
1620
+ async _autoSaveUserMessageAsPlan(agentId, message) {
1621
+ const content = typeof message?.content === 'string' ? message.content : '';
1622
+ if (!content) return;
1623
+ if (!this._looksSubstantive(content)) return;
1624
+
1625
+ // Lazy-load to keep agentPool's load order light. The same import
1626
+ // pattern as agentScheduler's plan injection.
1627
+ let memoryService;
1628
+ try {
1629
+ const mod = await import('../services/memoryService.js');
1630
+ memoryService = mod.getMemoryService(this.logger);
1631
+ await memoryService.initialize();
1632
+ } catch (e) {
1633
+ this.logger.debug?.('Auto-save plan: memory service unavailable', { error: e.message });
1634
+ return;
1635
+ }
1636
+
1637
+ // ── Deduplication ────────────────────────────────────────────────
1638
+ // Users repeat themselves ("I repeat my old message", "did you do
1639
+ // it all?" + paste the same thing). Without dedup the auto-saver
1640
+ // would create N copies of essentially the same plan. Load
1641
+ // existing plan/user-* memories and skip when the new content is
1642
+ // ≥70% similar to any of them (Jaccard over normalized word sets).
1643
+ let existingPlans = [];
1644
+ try {
1645
+ const all = await memoryService.loadMemories(agentId);
1646
+ existingPlans = (all || []).filter(m =>
1647
+ typeof m?.title === 'string' && m.title.startsWith('plan/user-')
1648
+ );
1649
+ } catch (e) {
1650
+ // Treat unreadable store as empty — we may still write a fresh entry.
1651
+ this.logger.debug?.('Auto-save plan: existing memories unreadable', { agentId, error: e.message });
1652
+ }
1653
+
1654
+ const newTokens = this._tokenize(content);
1655
+ for (const existing of existingPlans) {
1656
+ const existingTokens = this._tokenize(existing.content || '');
1657
+ const sim = this._jaccard(newTokens, existingTokens);
1658
+ const containment = this._overlapCoefficient(newTokens, existingTokens);
1659
+ // Jaccard catches near-identical reformulations. Containment
1660
+ // catches the "I repeat my old message — <same content>" case
1661
+ // where the user re-pastes the original plus a preamble. Either
1662
+ // signal is enough to suppress the duplicate.
1663
+ if (sim >= 0.7 || containment >= 0.85) {
1664
+ this.logger.info?.('Auto-save plan: skipping near-duplicate of existing plan', {
1665
+ agentId, existingTitle: existing.title,
1666
+ jaccard: sim.toFixed(2), containment: containment.toFixed(2),
1667
+ });
1668
+ return;
1669
+ }
1670
+ }
1671
+
1672
+ // ── Per-agent cap ────────────────────────────────────────────────
1673
+ // Bound the total auto-saved plans so an active session doesn't
1674
+ // bloat the agent's plan/* namespace indefinitely. Keep the K most
1675
+ // recent; delete the oldest auto-saves beyond that.
1676
+ const AUTO_PLAN_CAP = 8;
1677
+ const existingAutoSaves = existingPlans
1678
+ .filter(m => /^plan\/user-/.test(m.title))
1679
+ .sort((a, b) => String(a.createdAt || '').localeCompare(String(b.createdAt || '')));
1680
+ while (existingAutoSaves.length >= AUTO_PLAN_CAP) {
1681
+ const oldest = existingAutoSaves.shift();
1682
+ try {
1683
+ await memoryService.deleteMemory(agentId, oldest.id);
1684
+ this.logger.info?.('Auto-save plan: retired oldest auto-save to keep cap', {
1685
+ agentId, retiredTitle: oldest.title, cap: AUTO_PLAN_CAP,
1686
+ });
1687
+ } catch (e) {
1688
+ // Non-fatal — if we can't delete the oldest, just skip this entry
1689
+ // and proceed with the write. Worst case the plan list grows
1690
+ // by one beyond the cap — still bounded over time.
1691
+ this.logger.debug?.('Auto-save plan: retire-oldest failed', { agentId, error: e.message });
1692
+ break;
1693
+ }
1694
+ }
1695
+
1696
+ // ── Write the new memory ─────────────────────────────────────────
1697
+ const firstLine = (content.match(/[^\n]+/) || [''])[0].trim();
1698
+ const slug = firstLine
1699
+ .toLowerCase()
1700
+ .replace(/[^a-z0-9]+/g, '-')
1701
+ .replace(/^-+|-+$/g, '')
1702
+ .slice(0, 40) || 'request';
1703
+ const ts = new Date().toISOString().slice(0, 19).replace(/[:T]/g, '-');
1704
+ const title = `plan/user-${slug}-${ts}`;
1705
+
1706
+ try {
1707
+ await memoryService.addMemory(agentId, {
1708
+ title,
1709
+ description: `Auto-saved from user message at ${message.timestamp || new Date().toISOString()}`,
1710
+ content,
1711
+ });
1712
+ this.logger.info?.('Auto-saved user message as plan/* memory', {
1713
+ agentId, title, contentLength: content.length,
1714
+ });
1715
+ } catch (e) {
1716
+ this.logger.debug?.('Auto-save plan: write failed', { agentId, title, error: e.message });
1717
+ }
1718
+ }
1719
+
1720
+ /**
1721
+ * Tokenize a string into a lowercased word set for similarity checks.
1722
+ * Strips punctuation, drops short words (<3 chars), and drops a
1723
+ * small stopword set so that common words like "the" / "and" don't
1724
+ * inflate similarity scores between otherwise different messages.
1725
+ * @private
1726
+ */
1727
+ _tokenize(s) {
1728
+ if (typeof s !== 'string') return new Set();
1729
+ return new Set(
1730
+ s.toLowerCase()
1731
+ .replace(/[^a-z0-9\s]+/g, ' ')
1732
+ .split(/\s+/)
1733
+ .filter(w => w.length >= 3 && !AgentPool._STOPWORDS.has(w))
1734
+ );
1735
+ }
1736
+
1737
+ /**
1738
+ * Jaccard similarity over two word sets.
1739
+ * @private
1740
+ */
1741
+ _jaccard(a, b) {
1742
+ if (a.size === 0 && b.size === 0) return 1;
1743
+ if (a.size === 0 || b.size === 0) return 0;
1744
+ let intersection = 0;
1745
+ for (const w of a) if (b.has(w)) intersection += 1;
1746
+ return intersection / (a.size + b.size - intersection);
1747
+ }
1748
+
1749
+ /**
1750
+ * Overlap coefficient — intersection / size-of-smaller-set.
1751
+ * Returns 1.0 when one set is fully contained in the other,
1752
+ * regardless of how much the other set adds. Catches the "user
1753
+ * re-pastes their request with a preamble" duplicate case where
1754
+ * Jaccard would mark the messages as merely similar.
1755
+ * @private
1756
+ */
1757
+ _overlapCoefficient(a, b) {
1758
+ if (a.size === 0 || b.size === 0) return 0;
1759
+ let intersection = 0;
1760
+ for (const w of a) if (b.has(w)) intersection += 1;
1761
+ return intersection / Math.min(a.size, b.size);
1762
+ }
1763
+
1764
+ /**
1765
+ * Heuristic — does this user message look like a real request worth
1766
+ * preserving as a plan/*? Errs on the side of saving more (recall
1767
+ * over precision) — a stray auto-save is cheap; a lost user request
1768
+ * is catastrophic.
1769
+ * @private
1770
+ */
1771
+ _looksSubstantive(text) {
1772
+ if (typeof text !== 'string') return false;
1773
+ const t = text.trim();
1774
+ if (t.length < 30) return false;
1775
+ // Tool-result wrappers and previous-task boundaries are not user voice.
1776
+ if (t.startsWith('[Tool Results') || t.startsWith('[Previous Task')) return false;
1777
+
1778
+ // ── Pollution filter 1: dominated by questions ────────────────────
1779
+ // A message that's mostly questions wants an ANSWER, not a plan.
1780
+ // If the majority of non-empty lines end in '?' (or are
1781
+ // question-shaped), this is a query, not a request.
1782
+ if (this._dominatedByQuestions(t)) return false;
1783
+
1784
+ // ── Pollution filter 2: list items are just refs (paths, urls) ───
1785
+ // A list of file paths / URLs / commit hashes is the user pointing
1786
+ // the agent at things, not a multi-part plan. Save it only if the
1787
+ // surrounding prose carries imperative intent — and even then the
1788
+ // length gate handles that path.
1789
+ const hasList = /^\s*(?:\d+[.)]|[-*•])\s/m.test(t);
1790
+ if (hasList && this._listItemsAreJustReferences(t)) return false;
1791
+
1792
+ // ── Now apply the structural triggers ────────────────────────────
1793
+ // Numbered list — "1." / "1)" at a line start. Multi-part intent.
1794
+ // Require a minimum total length to avoid "1. yes 2. no" nonsense.
1795
+ if (/^\s*\d+[.)]\s/m.test(t) && t.length >= 60) return true;
1796
+ // Bullet list at line start. Same — strong intent signal + length.
1797
+ if (/^\s*[-*•]\s/m.test(t) && t.length >= 60) return true;
1798
+ // Free-form prose with no list markers must be substantial AND
1799
+ // contain an imperative-like signal (a verb you'd give as an
1800
+ // order). Raised from 120 → 150 to skip more pleasantries.
1801
+ if (t.length >= 150 && this._hasImperativeSignal(t)) return true;
1802
+ return false;
1803
+ }
1804
+
1805
+ /**
1806
+ * Heuristic: is this message mostly questions?
1807
+ * @private
1808
+ */
1809
+ _dominatedByQuestions(t) {
1810
+ // Split into non-empty lines.
1811
+ const lines = t.split(/\r?\n/).map(l => l.trim()).filter(Boolean);
1812
+ if (lines.length === 0) return false;
1813
+ // Strip leading list markers so we can look at the line's intent.
1814
+ const stripMarker = (l) => l.replace(/^(?:\d+[.)]|[-*•])\s+/, '');
1815
+ let questionLines = 0;
1816
+ for (const raw of lines) {
1817
+ const line = stripMarker(raw);
1818
+ // Ends in '?', OR starts with a question word at the line head.
1819
+ if (/\?\s*$/.test(line) || /^(?:what|why|how|when|where|who|which|is\b|are\b|do\b|does\b|can\b|could\b|should\b|would\b)\b/i.test(line)) {
1820
+ questionLines += 1;
1821
+ }
1822
+ }
1823
+ // Strict-majority rule: more than half of lines are questions.
1824
+ return questionLines * 2 > lines.length;
1825
+ }
1826
+
1827
+ /**
1828
+ * Heuristic: are the list items in this message just references
1829
+ * (file paths, URLs, commit hashes) with no imperative verb of their own?
1830
+ * @private
1831
+ */
1832
+ _listItemsAreJustReferences(t) {
1833
+ const lines = t.split(/\r?\n/).map(l => l.trim()).filter(Boolean);
1834
+ const listItems = lines.filter(l => /^(?:\d+[.)]|[-*•])\s/.test(l));
1835
+ if (listItems.length === 0) return false;
1836
+ let refLikeCount = 0;
1837
+ for (const li of listItems) {
1838
+ const body = li.replace(/^(?:\d+[.)]|[-*•])\s+/, '').trim();
1839
+ // Only treat as a "reference" if the line IS the reference —
1840
+ // i.e. a path/URL/hash with no surrounding English. A short bug
1841
+ // description like "login button does nothing on Safari" still
1842
+ // counts as content, not a reference.
1843
+ // Path: contains '/' or '\' OR starts with '.' AND has NO spaces
1844
+ // URL: starts with http(s)://
1845
+ // Hash: 7-40 hex chars only, no spaces
1846
+ const isPath = (/[/\\]/.test(body) || /^\./.test(body)) && !/\s/.test(body);
1847
+ const isUrl = /^https?:\/\//.test(body) && !/\s/.test(body);
1848
+ const isHash = /^[0-9a-f]{7,40}$/i.test(body);
1849
+ if (isPath || isUrl || isHash) refLikeCount += 1;
1850
+ }
1851
+ // Strict-majority of list items are reference-like → ignore.
1852
+ return refLikeCount * 2 > listItems.length;
1853
+ }
1854
+
1855
+ /**
1856
+ * Heuristic: does the message contain a verb that signals "do this"?
1857
+ * Conservative — favors recall over precision.
1858
+ * @private
1859
+ */
1860
+ _hasImperativeSignal(t) {
1861
+ // Word-boundary match against a set of common imperative verbs.
1862
+ // Order matters only for readability — we check membership.
1863
+ return /\b(?:fix|add|build|implement|create|change|remove|delete|update|refactor|rewrite|migrate|integrate|configure|setup|set\s+up|design|generate|make|write|test|verify|ensure|review|optimize|improve|replace|move|rename|extract|split|merge|deploy|publish|ship|release|debug|investigate|analyze|reproduce|escalate|prioritize|schedule)\b/i.test(t);
1864
+ }
1865
+
1547
1866
  _autoCreateTaskForMessage(agent, message, source, priority) {
1548
1867
  if (!agent.taskList) {
1549
1868
  agent.taskList = { tasks: [], lastUpdated: new Date().toISOString() };
@@ -60,6 +60,19 @@ class AgentScheduler {
60
60
  // Initialize ContextInjectionService for file attachments
61
61
  this.contextInjectionService = new ContextInjectionService({}, logger);
62
62
 
63
+ // Per-turn system-prompt rebuild cache for native-API models.
64
+ // Agents persist a `systemPrompt` baked at create-time for the
65
+ // chat-completion shape (text descriptions of every tool). When a
66
+ // turn targets a Responses-API model (Codex / o-series / gpt-5-pro),
67
+ // we want a TRIMMED prompt that omits text docs for tools whose
68
+ // structured schemas are sent in `tools:`. Rebuilding fresh each
69
+ // turn would be wasteful — agents typically stay on the same model
70
+ // for many turns — so we memoize per (agentId, modelName).
71
+ //
72
+ // Cleared on process restart and on agent updates that change the
73
+ // base prompt or capabilities (see `_invalidateNativePromptCache`).
74
+ this._nativePromptCache = new Map(); // `${agentId}|${modelName}` → string
75
+
63
76
  // Initialize FlowContextService for flow execution context
64
77
  this.flowContextService = new FlowContextService({}, logger);
65
78
 
@@ -1919,8 +1932,17 @@ class AgentScheduler {
1919
1932
  // After compaction, retrieve messages from AgentPool (will use compacted if available)
1920
1933
  const messagesToSend = await this.agentPool.getMessagesForAI(agentId, targetModel);
1921
1934
 
1922
- // Inject TaskManager instructions for AGENT mode
1923
- let enhancedSystemPrompt = agent.systemPrompt;
1935
+ // ── Pick the right system-prompt shape for the target model ──
1936
+ // Default: use the agent's persisted `systemPrompt` (baked at
1937
+ // create-time with full text descriptions for every tool — the
1938
+ // chat-completion shape). For models that use the Responses API
1939
+ // (native function-calling), rebuild a trimmed version that
1940
+ // omits text docs for tools whose structured schemas we send in
1941
+ // `tools:`. Falls back to the persisted prompt whenever the
1942
+ // model's apiType is unknown OR the agent has no stored original
1943
+ // prompt — preserves existing behaviour for old agents and
1944
+ // unknown models. See `_pickSystemPromptForModel`.
1945
+ let enhancedSystemPrompt = await this._pickSystemPromptForModel(agent, targetModel);
1924
1946
  if (agent.mode === AGENT_MODES.AGENT) {
1925
1947
  const taskManagerInstruction = "\n\nIMPORTANT: You are in AGENT mode. The use of TaskManager tool is mandatory.\n\n" +
1926
1948
  "TASK LIFECYCLE (follow this, don't improvise):\n" +
@@ -2083,6 +2105,48 @@ class AgentScheduler {
2083
2105
  });
2084
2106
  }
2085
2107
 
2108
+ // ── Auto-inject CURRENT TASK LIST every turn ───────────────────
2109
+ // The task list lives in `agent.taskList.tasks` — durable, never
2110
+ // affected by compaction. But the conversation messages that
2111
+ // CREATED those tasks ARE compacted, so an agent that lost its
2112
+ // recent history may forget the task list exists. That's how
2113
+ // the Talisman bug happened: the agent called sync with a fresh
2114
+ // 4-task plan, silently wiping 9 in-flight tasks the user had
2115
+ // implicitly requested. Surface the current task list to the
2116
+ // agent every turn so it can never "forget" what's already on
2117
+ // the plan. Cheap (a few hundred chars), invariant to
2118
+ // compaction, and a natural deterrent against destructive sync.
2119
+ try {
2120
+ const tasks = agent.taskList?.tasks || [];
2121
+ if (Array.isArray(tasks) && tasks.length > 0) {
2122
+ const lines = ['\n\n## CURRENT TASK LIST (live from agent state — survives compaction)\n'];
2123
+ lines.push('These tasks exist in your durable state RIGHT NOW. If the conversation history doesn\'t mention them, that\'s because compaction summarized that section away — the tasks are still there.\n');
2124
+ lines.push('Before issuing `taskmanager sync`, READ this list. If you sync with a different plan, you will be dropping these.\n');
2125
+ // Compact, scannable. Title + status + priority is enough.
2126
+ const byStatus = { in_progress: [], pending: [], completed: [], cancelled: [] };
2127
+ for (const t of tasks) {
2128
+ const status = t.status || 'pending';
2129
+ (byStatus[status] || (byStatus[status] = [])).push(t);
2130
+ }
2131
+ const order = ['in_progress', 'pending', 'completed', 'cancelled'];
2132
+ for (const status of order) {
2133
+ const group = byStatus[status] || [];
2134
+ if (group.length === 0) continue;
2135
+ lines.push(`\n**${status}** (${group.length}):`);
2136
+ for (const t of group) {
2137
+ const pri = t.priority ? ` [${t.priority}]` : '';
2138
+ lines.push(`- ${t.title}${pri}`);
2139
+ }
2140
+ }
2141
+ enhancedSystemPrompt = (enhancedSystemPrompt || '') + lines.join('\n');
2142
+ }
2143
+ } catch (taskInjectErr) {
2144
+ // Best-effort — never block the turn on this.
2145
+ this.logger.warn(`Task list injection failed for agent ${agentId} (continuing without)`, {
2146
+ error: taskInjectErr?.message,
2147
+ });
2148
+ }
2149
+
2086
2150
  // Check if streaming is enabled - consider both agent config and user message preference
2087
2151
  // Get the last user message to check for streaming preference
2088
2152
  const lastUserMsg = [...conversationHistory].reverse().find(m => m.role === 'user');
@@ -2169,6 +2233,156 @@ class AgentScheduler {
2169
2233
  }
2170
2234
  }
2171
2235
 
2236
+ /**
2237
+ * Choose the right base system prompt for the target model.
2238
+ *
2239
+ * • If the model's catalog entry says it uses the Responses API
2240
+ * ('responses' in its api_type / capabilities) AND the agent has
2241
+ * an `originalSystemPrompt` we can rebuild from, return a
2242
+ * freshly-built prompt that omits text descriptions for tools
2243
+ * with native function schemas (see baseTool.js — those tools'
2244
+ * structured schemas in `tools:` are the canonical source for
2245
+ * these models, so the text docs are pure duplication).
2246
+ *
2247
+ * • Otherwise return the agent's persisted `systemPrompt` exactly
2248
+ * as it is today. This covers:
2249
+ * – chat-completion models (no native function calling)
2250
+ * – models we can't classify (modelsService offline / catalog
2251
+ * field missing) — fail safe to old behaviour
2252
+ * – very old agents persisted before `originalSystemPrompt`
2253
+ * was stored — fail safe to old behaviour
2254
+ *
2255
+ * Result is memoized per `(agentId, targetModel)` to avoid rebuilding
2256
+ * on every turn. The cache is invalidated whenever the agent's base
2257
+ * prompt or capabilities change (see `_invalidateNativePromptCache`).
2258
+ *
2259
+ * @private
2260
+ * @param {Object} agent - Agent record
2261
+ * @param {string} targetModel - Model name about to be called
2262
+ * @returns {Promise<string>} The prompt to use as the base
2263
+ */
2264
+ async _pickSystemPromptForModel(agent, targetModel) {
2265
+ // 1. Resolve the model's API type. Unknown → use persisted prompt.
2266
+ const apiType = this._resolveModelApiType(targetModel);
2267
+ if (apiType !== 'responses') return agent.systemPrompt;
2268
+
2269
+ // 2. Need the original (un-enhanced) prompt to rebuild from. Without
2270
+ // it we can't safely re-add the trimmed tool docs — fall back
2271
+ // to the persisted shape (which works for chat-completion and
2272
+ // is also accepted by Responses API, just with the duplication
2273
+ // cost). This is the back-compat path for legacy agents.
2274
+ if (!agent.originalSystemPrompt) return agent.systemPrompt;
2275
+
2276
+ // 3. Cache lookup.
2277
+ const cacheKey = `${agent.id}|${targetModel}`;
2278
+ const cached = this._nativePromptCache.get(cacheKey);
2279
+ if (cached) return cached;
2280
+
2281
+ // 4. Rebuild. The agentPool stores the toolsRegistry — reuse it so
2282
+ // we go through the exact same code path that built the original
2283
+ // prompt, just with apiType set. Skills index + the rest of the
2284
+ // augmentation must be reapplied; mirror what createAgent does.
2285
+ try {
2286
+ const registry = this.agentPool?.toolsRegistry;
2287
+ if (!registry) return agent.systemPrompt;
2288
+
2289
+ let rebuilt = registry.enhanceSystemPrompt(
2290
+ agent.originalSystemPrompt,
2291
+ agent.capabilities || [],
2292
+ { apiType: 'responses' },
2293
+ );
2294
+
2295
+ // Re-inject ASSIGNED SKILLS block if present (createAgent appends
2296
+ // this after enhanceSystemPrompt — see agentPool.js:108).
2297
+ if (Array.isArray(agent.skills) && agent.skills.length > 0) {
2298
+ try {
2299
+ const { getSkillsService } = await import('../services/skillsService.js');
2300
+ const skillsService = getSkillsService(this.logger);
2301
+ await skillsService.initialize();
2302
+ const summaries = await skillsService.getSkillSummaries(agent.skills);
2303
+ if (summaries.length > 0) {
2304
+ rebuilt += '\n\n## ASSIGNED SKILLS\n\n';
2305
+ rebuilt += 'Use the skills tool to browse and load skill content. Use "describe" to see sections, "read-section" to load specific parts.\n\n';
2306
+ for (const s of summaries) {
2307
+ const sections = s.sections?.length ? `\n Sections: ${s.sections.map(h => h.replace(/^#+\s*/, '')).join(', ')}` : '';
2308
+ rebuilt += `- **${s.name}** (${s.lineCount} lines): ${s.description}${sections}\n`;
2309
+ }
2310
+ }
2311
+ } catch (e) {
2312
+ this.logger?.debug?.('Failed to re-inject skills index for native prompt', { error: e.message });
2313
+ }
2314
+ }
2315
+
2316
+ this._nativePromptCache.set(cacheKey, rebuilt);
2317
+ this.logger?.debug?.('Built native-API system prompt', {
2318
+ agentId: agent.id,
2319
+ targetModel,
2320
+ originalLength: agent.systemPrompt?.length || 0,
2321
+ rebuiltLength: rebuilt.length,
2322
+ savedTokensApprox: Math.round(((agent.systemPrompt?.length || 0) - rebuilt.length) / 4),
2323
+ });
2324
+ return rebuilt;
2325
+ } catch (err) {
2326
+ // Anything goes wrong → fall back to old behaviour. Failing
2327
+ // closed (no prompt) would break the agent's turn; failing open
2328
+ // (use chat-completion shape) just keeps the duplication.
2329
+ this.logger?.warn?.('Native system-prompt rebuild failed — using persisted prompt', {
2330
+ agentId: agent.id,
2331
+ targetModel,
2332
+ error: err.message,
2333
+ });
2334
+ return agent.systemPrompt;
2335
+ }
2336
+ }
2337
+
2338
+ /**
2339
+ * Look up a model's API type from the catalog. Returns 'responses',
2340
+ * 'chat_completion', or undefined when unknown. The catalog exposes
2341
+ * `api_type` as an array and/or `capabilities.responses`/`capabilities.chatCompletion`
2342
+ * — mirror the backend's _inferRouting precedence so the CLI's
2343
+ * classification matches the backend's routing decision exactly.
2344
+ * @private
2345
+ */
2346
+ _resolveModelApiType(modelName) {
2347
+ try {
2348
+ if (!this.modelsService || typeof this.modelsService.getModels !== 'function') return undefined;
2349
+ const models = this.modelsService.getModels();
2350
+ const m = models.find(x => x.name === modelName);
2351
+ if (!m) return undefined;
2352
+
2353
+ const apiType = Array.isArray(m.api_type) ? m.api_type : (m.api_type ? [m.api_type] : []);
2354
+ const caps = m.capabilities || {};
2355
+
2356
+ // Mirrors backend services/llmServiceFactory.js _inferRouting:
2357
+ // responses if api_type contains 'responses' AND not 'chat_completion'
2358
+ // OR capabilities.responses === 'true' / chatCompletion === 'false'
2359
+ // OR explicit useResponsesApi flag
2360
+ if (apiType.includes('responses') && !apiType.includes('chat_completion')) return 'responses';
2361
+ if (caps.chatCompletion === 'false' && (caps.responses === 'true' || apiType.includes('responses'))) return 'responses';
2362
+ if (m.useResponsesApi) return 'responses';
2363
+ // Name-based fallback (last resort — only when catalog has no routing data)
2364
+ if (/codex/i.test(modelName) || /gpt.*-pro$/i.test(modelName)) return 'responses';
2365
+ return 'chat_completion';
2366
+ } catch (err) {
2367
+ // Defensive — never block the turn on a classification failure.
2368
+ this.logger?.debug?.('Model apiType resolution failed', { modelName, error: err.message });
2369
+ return undefined;
2370
+ }
2371
+ }
2372
+
2373
+ /**
2374
+ * Drop cached native prompts for an agent. Called by agentPool when
2375
+ * the base prompt or capabilities change so the next turn rebuilds.
2376
+ * Exposed so agentPool can call it without poking internal state.
2377
+ */
2378
+ _invalidateNativePromptCache(agentId) {
2379
+ for (const key of this._nativePromptCache.keys()) {
2380
+ if (key.startsWith(`${agentId}|`)) {
2381
+ this._nativePromptCache.delete(key);
2382
+ }
2383
+ }
2384
+ }
2385
+
2172
2386
  /**
2173
2387
  * Get AI response using streaming with WebSocket broadcast
2174
2388
  * @param {string} agentId - Agent ID