onbuzz 4.8.0 → 4.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/core/__tests__/agentPool.test.js +185 -0
- package/src/core/__tests__/agentScheduler.nativePromptPick.test.js +319 -0
- package/src/core/__tests__/agentScheduler.taskListInjection.test.js +94 -0
- package/src/core/agentPool.js +319 -0
- package/src/core/agentScheduler.js +216 -2
- package/src/services/__tests__/conversationCompactionService.test.js +141 -0
- package/src/services/__tests__/modelRouterNaming.test.js +41 -23
- package/src/services/conversationCompactionService.js +120 -46
- package/src/tools/__tests__/baseTool.test.js +171 -0
- package/src/tools/__tests__/codeMapTool.test.js +179 -0
- package/src/tools/__tests__/taskManagerTool.test.js +141 -0
- package/src/tools/baseTool.js +89 -1
- package/src/tools/openaiFunctionSchemas.js +14 -0
- package/src/tools/skillsTool.js +282 -277
- package/src/tools/taskManagerTool.js +72 -2
- package/src/utilities/constants.js +19 -1
package/src/core/agentPool.js
CHANGED
|
@@ -23,6 +23,17 @@ import DirectoryAccessManager from '../utilities/directoryAccessManager.js';
|
|
|
23
23
|
import { getVisualEditorBridge } from '../services/visualEditorBridge.js';
|
|
24
24
|
|
|
25
25
|
class AgentPool {
|
|
26
|
+
// Stopwords for the _tokenize / _jaccard similarity check used by
|
|
27
|
+
// auto-save-as-plan dedup. Tight list — only words that appear in
|
|
28
|
+
// virtually every English sentence regardless of content, so that
|
|
29
|
+
// their presence in both messages doesn't inflate similarity.
|
|
30
|
+
static _STOPWORDS = new Set([
|
|
31
|
+
'the', 'and', 'for', 'but', 'are', 'was', 'were',
|
|
32
|
+
'has', 'have', 'had', 'this', 'that', 'with', 'will',
|
|
33
|
+
'you', 'your', 'our', 'their', 'them', 'they',
|
|
34
|
+
'can', 'could', 'should', 'would',
|
|
35
|
+
]);
|
|
36
|
+
|
|
26
37
|
constructor(config, logger, stateManager, contextManager, toolsRegistry = null) {
|
|
27
38
|
this.config = config;
|
|
28
39
|
this.logger = logger;
|
|
@@ -380,6 +391,18 @@ class AgentPool {
|
|
|
380
391
|
originalLength: baseSystemPrompt?.length || 0,
|
|
381
392
|
enhancedLength: enhancedSystemPrompt?.length || 0
|
|
382
393
|
});
|
|
394
|
+
|
|
395
|
+
// The scheduler caches per-(agent, model) Responses-API prompts
|
|
396
|
+
// built from this agent's `originalSystemPrompt` + capabilities.
|
|
397
|
+
// Both inputs just changed, so any cached rebuilds are stale.
|
|
398
|
+
// No-op when the scheduler isn't attached (tests / very-early
|
|
399
|
+
// boot) or when it predates this method (old binaries during
|
|
400
|
+
// a rolling upgrade).
|
|
401
|
+
try {
|
|
402
|
+
this.scheduler?._invalidateNativePromptCache?.(agentId);
|
|
403
|
+
} catch (e) {
|
|
404
|
+
this.logger.debug?.('Failed to invalidate native prompt cache', { agentId, error: e.message });
|
|
405
|
+
}
|
|
383
406
|
} catch (error) {
|
|
384
407
|
this.logger.error(`Failed to regenerate system prompt with updated capabilities`, {
|
|
385
408
|
agentId,
|
|
@@ -1425,6 +1448,23 @@ class AgentPool {
|
|
|
1425
1448
|
this._autoCreateTaskForMessage(agent, queuedMessage, 'user', 'high');
|
|
1426
1449
|
}
|
|
1427
1450
|
|
|
1451
|
+
// ── Auto-save substantive user messages as plan/* memories ───────
|
|
1452
|
+
// Observed in production: across 670-message agent sessions the
|
|
1453
|
+
// agent NEVER wrote a memory voluntarily. Compaction then summarized
|
|
1454
|
+
// away the user's literal asks, the agent paraphrased what was left,
|
|
1455
|
+
// and ended up doing work the user never requested. Belt-and-
|
|
1456
|
+
// suspenders alongside the OPERATING POSTURE prompt nudge: when a
|
|
1457
|
+
// user message looks substantive (long, or contains a numbered/
|
|
1458
|
+
// bulleted multi-part ask), the SYSTEM saves it as `plan/<auto>` so
|
|
1459
|
+
// the system-prompt auto-injection makes the user's words visible
|
|
1460
|
+
// every turn — even if the agent itself never thought to save.
|
|
1461
|
+
// Best-effort: never block the message-enqueue path.
|
|
1462
|
+
this._autoSaveUserMessageAsPlan(agentId, queuedMessage).catch(err => {
|
|
1463
|
+
this.logger.debug?.('Auto-save of user message as plan/* failed (continuing)', {
|
|
1464
|
+
agentId, error: err?.message,
|
|
1465
|
+
});
|
|
1466
|
+
});
|
|
1467
|
+
|
|
1428
1468
|
await this.persistAgentState(agentId);
|
|
1429
1469
|
|
|
1430
1470
|
// If we cleared a delay, surface it on the WS so the delay chip in the
|
|
@@ -1544,6 +1584,285 @@ class AgentPool {
|
|
|
1544
1584
|
* @param {string} priority - Task priority ('high', 'medium', 'low')
|
|
1545
1585
|
* @private
|
|
1546
1586
|
*/
|
|
1587
|
+
/**
|
|
1588
|
+
* Save a substantive user message as a `plan/*` memory automatically.
|
|
1589
|
+
*
|
|
1590
|
+
* Rationale (Talisman case study, May 2026): agents observed in
|
|
1591
|
+
* production never wrote a single memory across hundreds of
|
|
1592
|
+
* messages, even when the OPERATING POSTURE prompt explicitly told
|
|
1593
|
+
* them to. The user's literal ask then got lost in compaction and
|
|
1594
|
+
* the agent went off-course. This system-level safety net puts the
|
|
1595
|
+
* user's message into the durable plan/* store — which the system
|
|
1596
|
+
* prompt auto-injects every turn — without depending on the model
|
|
1597
|
+
* making the call.
|
|
1598
|
+
*
|
|
1599
|
+
* What counts as "substantive":
|
|
1600
|
+
* - Content length ≥ 60 chars (~12 words) — short acks/yes-no don't qualify
|
|
1601
|
+
* - AND any of:
|
|
1602
|
+
* • contains a numbered list ("1.", "2.", "3." …)
|
|
1603
|
+
* • contains a bullet list (-, *, • at line start)
|
|
1604
|
+
* • OR is ≥ 120 chars (longer than a one-line ack)
|
|
1605
|
+
*
|
|
1606
|
+
* What gets saved:
|
|
1607
|
+
* - title: `plan/user-<short-slug>-<timestamp>`
|
|
1608
|
+
* - description: "auto-saved from user message at <iso>"
|
|
1609
|
+
* - content: the verbatim user message
|
|
1610
|
+
*
|
|
1611
|
+
* The agent can rename, consolidate, or delete these later. They
|
|
1612
|
+
* exist as a fail-safe — if the agent does its job and saves its
|
|
1613
|
+
* own better-named plan, these auto-saves can be cleaned up. If
|
|
1614
|
+
* the agent doesn't, at least the user's words survive compaction.
|
|
1615
|
+
*
|
|
1616
|
+
* @param {string} agentId
|
|
1617
|
+
* @param {Object} message - The queued user message
|
|
1618
|
+
* @private
|
|
1619
|
+
*/
|
|
1620
|
+
async _autoSaveUserMessageAsPlan(agentId, message) {
|
|
1621
|
+
const content = typeof message?.content === 'string' ? message.content : '';
|
|
1622
|
+
if (!content) return;
|
|
1623
|
+
if (!this._looksSubstantive(content)) return;
|
|
1624
|
+
|
|
1625
|
+
// Lazy-load to keep agentPool's load order light. The same import
|
|
1626
|
+
// pattern as agentScheduler's plan injection.
|
|
1627
|
+
let memoryService;
|
|
1628
|
+
try {
|
|
1629
|
+
const mod = await import('../services/memoryService.js');
|
|
1630
|
+
memoryService = mod.getMemoryService(this.logger);
|
|
1631
|
+
await memoryService.initialize();
|
|
1632
|
+
} catch (e) {
|
|
1633
|
+
this.logger.debug?.('Auto-save plan: memory service unavailable', { error: e.message });
|
|
1634
|
+
return;
|
|
1635
|
+
}
|
|
1636
|
+
|
|
1637
|
+
// ── Deduplication ────────────────────────────────────────────────
|
|
1638
|
+
// Users repeat themselves ("I repeat my old message", "did you do
|
|
1639
|
+
// it all?" + paste the same thing). Without dedup the auto-saver
|
|
1640
|
+
// would create N copies of essentially the same plan. Load
|
|
1641
|
+
// existing plan/user-* memories and skip when the new content is
|
|
1642
|
+
// ≥70% similar to any of them (Jaccard over normalized word sets).
|
|
1643
|
+
let existingPlans = [];
|
|
1644
|
+
try {
|
|
1645
|
+
const all = await memoryService.loadMemories(agentId);
|
|
1646
|
+
existingPlans = (all || []).filter(m =>
|
|
1647
|
+
typeof m?.title === 'string' && m.title.startsWith('plan/user-')
|
|
1648
|
+
);
|
|
1649
|
+
} catch (e) {
|
|
1650
|
+
// Treat unreadable store as empty — we may still write a fresh entry.
|
|
1651
|
+
this.logger.debug?.('Auto-save plan: existing memories unreadable', { agentId, error: e.message });
|
|
1652
|
+
}
|
|
1653
|
+
|
|
1654
|
+
const newTokens = this._tokenize(content);
|
|
1655
|
+
for (const existing of existingPlans) {
|
|
1656
|
+
const existingTokens = this._tokenize(existing.content || '');
|
|
1657
|
+
const sim = this._jaccard(newTokens, existingTokens);
|
|
1658
|
+
const containment = this._overlapCoefficient(newTokens, existingTokens);
|
|
1659
|
+
// Jaccard catches near-identical reformulations. Containment
|
|
1660
|
+
// catches the "I repeat my old message — <same content>" case
|
|
1661
|
+
// where the user re-pastes the original plus a preamble. Either
|
|
1662
|
+
// signal is enough to suppress the duplicate.
|
|
1663
|
+
if (sim >= 0.7 || containment >= 0.85) {
|
|
1664
|
+
this.logger.info?.('Auto-save plan: skipping near-duplicate of existing plan', {
|
|
1665
|
+
agentId, existingTitle: existing.title,
|
|
1666
|
+
jaccard: sim.toFixed(2), containment: containment.toFixed(2),
|
|
1667
|
+
});
|
|
1668
|
+
return;
|
|
1669
|
+
}
|
|
1670
|
+
}
|
|
1671
|
+
|
|
1672
|
+
// ── Per-agent cap ────────────────────────────────────────────────
|
|
1673
|
+
// Bound the total auto-saved plans so an active session doesn't
|
|
1674
|
+
// bloat the agent's plan/* namespace indefinitely. Keep the K most
|
|
1675
|
+
// recent; delete the oldest auto-saves beyond that.
|
|
1676
|
+
const AUTO_PLAN_CAP = 8;
|
|
1677
|
+
const existingAutoSaves = existingPlans
|
|
1678
|
+
.filter(m => /^plan\/user-/.test(m.title))
|
|
1679
|
+
.sort((a, b) => String(a.createdAt || '').localeCompare(String(b.createdAt || '')));
|
|
1680
|
+
while (existingAutoSaves.length >= AUTO_PLAN_CAP) {
|
|
1681
|
+
const oldest = existingAutoSaves.shift();
|
|
1682
|
+
try {
|
|
1683
|
+
await memoryService.deleteMemory(agentId, oldest.id);
|
|
1684
|
+
this.logger.info?.('Auto-save plan: retired oldest auto-save to keep cap', {
|
|
1685
|
+
agentId, retiredTitle: oldest.title, cap: AUTO_PLAN_CAP,
|
|
1686
|
+
});
|
|
1687
|
+
} catch (e) {
|
|
1688
|
+
// Non-fatal — if we can't delete the oldest, just skip this entry
|
|
1689
|
+
// and proceed with the write. Worst case the plan list grows
|
|
1690
|
+
// by one beyond the cap — still bounded over time.
|
|
1691
|
+
this.logger.debug?.('Auto-save plan: retire-oldest failed', { agentId, error: e.message });
|
|
1692
|
+
break;
|
|
1693
|
+
}
|
|
1694
|
+
}
|
|
1695
|
+
|
|
1696
|
+
// ── Write the new memory ─────────────────────────────────────────
|
|
1697
|
+
const firstLine = (content.match(/[^\n]+/) || [''])[0].trim();
|
|
1698
|
+
const slug = firstLine
|
|
1699
|
+
.toLowerCase()
|
|
1700
|
+
.replace(/[^a-z0-9]+/g, '-')
|
|
1701
|
+
.replace(/^-+|-+$/g, '')
|
|
1702
|
+
.slice(0, 40) || 'request';
|
|
1703
|
+
const ts = new Date().toISOString().slice(0, 19).replace(/[:T]/g, '-');
|
|
1704
|
+
const title = `plan/user-${slug}-${ts}`;
|
|
1705
|
+
|
|
1706
|
+
try {
|
|
1707
|
+
await memoryService.addMemory(agentId, {
|
|
1708
|
+
title,
|
|
1709
|
+
description: `Auto-saved from user message at ${message.timestamp || new Date().toISOString()}`,
|
|
1710
|
+
content,
|
|
1711
|
+
});
|
|
1712
|
+
this.logger.info?.('Auto-saved user message as plan/* memory', {
|
|
1713
|
+
agentId, title, contentLength: content.length,
|
|
1714
|
+
});
|
|
1715
|
+
} catch (e) {
|
|
1716
|
+
this.logger.debug?.('Auto-save plan: write failed', { agentId, title, error: e.message });
|
|
1717
|
+
}
|
|
1718
|
+
}
|
|
1719
|
+
|
|
1720
|
+
/**
|
|
1721
|
+
* Tokenize a string into a lowercased word set for similarity checks.
|
|
1722
|
+
* Strips punctuation, drops short words (<3 chars), and drops a
|
|
1723
|
+
* small stopword set so that common words like "the" / "and" don't
|
|
1724
|
+
* inflate similarity scores between otherwise different messages.
|
|
1725
|
+
* @private
|
|
1726
|
+
*/
|
|
1727
|
+
_tokenize(s) {
|
|
1728
|
+
if (typeof s !== 'string') return new Set();
|
|
1729
|
+
return new Set(
|
|
1730
|
+
s.toLowerCase()
|
|
1731
|
+
.replace(/[^a-z0-9\s]+/g, ' ')
|
|
1732
|
+
.split(/\s+/)
|
|
1733
|
+
.filter(w => w.length >= 3 && !AgentPool._STOPWORDS.has(w))
|
|
1734
|
+
);
|
|
1735
|
+
}
|
|
1736
|
+
|
|
1737
|
+
/**
|
|
1738
|
+
* Jaccard similarity over two word sets.
|
|
1739
|
+
* @private
|
|
1740
|
+
*/
|
|
1741
|
+
_jaccard(a, b) {
|
|
1742
|
+
if (a.size === 0 && b.size === 0) return 1;
|
|
1743
|
+
if (a.size === 0 || b.size === 0) return 0;
|
|
1744
|
+
let intersection = 0;
|
|
1745
|
+
for (const w of a) if (b.has(w)) intersection += 1;
|
|
1746
|
+
return intersection / (a.size + b.size - intersection);
|
|
1747
|
+
}
|
|
1748
|
+
|
|
1749
|
+
/**
|
|
1750
|
+
* Overlap coefficient — intersection / size-of-smaller-set.
|
|
1751
|
+
* Returns 1.0 when one set is fully contained in the other,
|
|
1752
|
+
* regardless of how much the other set adds. Catches the "user
|
|
1753
|
+
* re-pastes their request with a preamble" duplicate case where
|
|
1754
|
+
* Jaccard would mark the messages as merely similar.
|
|
1755
|
+
* @private
|
|
1756
|
+
*/
|
|
1757
|
+
_overlapCoefficient(a, b) {
|
|
1758
|
+
if (a.size === 0 || b.size === 0) return 0;
|
|
1759
|
+
let intersection = 0;
|
|
1760
|
+
for (const w of a) if (b.has(w)) intersection += 1;
|
|
1761
|
+
return intersection / Math.min(a.size, b.size);
|
|
1762
|
+
}
|
|
1763
|
+
|
|
1764
|
+
/**
|
|
1765
|
+
* Heuristic — does this user message look like a real request worth
|
|
1766
|
+
* preserving as a plan/*? Errs on the side of saving more (recall
|
|
1767
|
+
* over precision) — a stray auto-save is cheap; a lost user request
|
|
1768
|
+
* is catastrophic.
|
|
1769
|
+
* @private
|
|
1770
|
+
*/
|
|
1771
|
+
_looksSubstantive(text) {
|
|
1772
|
+
if (typeof text !== 'string') return false;
|
|
1773
|
+
const t = text.trim();
|
|
1774
|
+
if (t.length < 30) return false;
|
|
1775
|
+
// Tool-result wrappers and previous-task boundaries are not user voice.
|
|
1776
|
+
if (t.startsWith('[Tool Results') || t.startsWith('[Previous Task')) return false;
|
|
1777
|
+
|
|
1778
|
+
// ── Pollution filter 1: dominated by questions ────────────────────
|
|
1779
|
+
// A message that's mostly questions wants an ANSWER, not a plan.
|
|
1780
|
+
// If the majority of non-empty lines end in '?' (or are
|
|
1781
|
+
// question-shaped), this is a query, not a request.
|
|
1782
|
+
if (this._dominatedByQuestions(t)) return false;
|
|
1783
|
+
|
|
1784
|
+
// ── Pollution filter 2: list items are just refs (paths, urls) ───
|
|
1785
|
+
// A list of file paths / URLs / commit hashes is the user pointing
|
|
1786
|
+
// the agent at things, not a multi-part plan. Save it only if the
|
|
1787
|
+
// surrounding prose carries imperative intent — and even then the
|
|
1788
|
+
// length gate handles that path.
|
|
1789
|
+
const hasList = /^\s*(?:\d+[.)]|[-*•])\s/m.test(t);
|
|
1790
|
+
if (hasList && this._listItemsAreJustReferences(t)) return false;
|
|
1791
|
+
|
|
1792
|
+
// ── Now apply the structural triggers ────────────────────────────
|
|
1793
|
+
// Numbered list — "1." / "1)" at a line start. Multi-part intent.
|
|
1794
|
+
// Require a minimum total length to avoid "1. yes 2. no" nonsense.
|
|
1795
|
+
if (/^\s*\d+[.)]\s/m.test(t) && t.length >= 60) return true;
|
|
1796
|
+
// Bullet list at line start. Same — strong intent signal + length.
|
|
1797
|
+
if (/^\s*[-*•]\s/m.test(t) && t.length >= 60) return true;
|
|
1798
|
+
// Free-form prose with no list markers must be substantial AND
|
|
1799
|
+
// contain an imperative-like signal (a verb you'd give as an
|
|
1800
|
+
// order). Raised from 120 → 150 to skip more pleasantries.
|
|
1801
|
+
if (t.length >= 150 && this._hasImperativeSignal(t)) return true;
|
|
1802
|
+
return false;
|
|
1803
|
+
}
|
|
1804
|
+
|
|
1805
|
+
/**
|
|
1806
|
+
* Heuristic: is this message mostly questions?
|
|
1807
|
+
* @private
|
|
1808
|
+
*/
|
|
1809
|
+
_dominatedByQuestions(t) {
|
|
1810
|
+
// Split into non-empty lines.
|
|
1811
|
+
const lines = t.split(/\r?\n/).map(l => l.trim()).filter(Boolean);
|
|
1812
|
+
if (lines.length === 0) return false;
|
|
1813
|
+
// Strip leading list markers so we can look at the line's intent.
|
|
1814
|
+
const stripMarker = (l) => l.replace(/^(?:\d+[.)]|[-*•])\s+/, '');
|
|
1815
|
+
let questionLines = 0;
|
|
1816
|
+
for (const raw of lines) {
|
|
1817
|
+
const line = stripMarker(raw);
|
|
1818
|
+
// Ends in '?', OR starts with a question word at the line head.
|
|
1819
|
+
if (/\?\s*$/.test(line) || /^(?:what|why|how|when|where|who|which|is\b|are\b|do\b|does\b|can\b|could\b|should\b|would\b)\b/i.test(line)) {
|
|
1820
|
+
questionLines += 1;
|
|
1821
|
+
}
|
|
1822
|
+
}
|
|
1823
|
+
// Strict-majority rule: more than half of lines are questions.
|
|
1824
|
+
return questionLines * 2 > lines.length;
|
|
1825
|
+
}
|
|
1826
|
+
|
|
1827
|
+
/**
|
|
1828
|
+
* Heuristic: are the list items in this message just references
|
|
1829
|
+
* (file paths, URLs, commit hashes) with no imperative verb of their own?
|
|
1830
|
+
* @private
|
|
1831
|
+
*/
|
|
1832
|
+
_listItemsAreJustReferences(t) {
|
|
1833
|
+
const lines = t.split(/\r?\n/).map(l => l.trim()).filter(Boolean);
|
|
1834
|
+
const listItems = lines.filter(l => /^(?:\d+[.)]|[-*•])\s/.test(l));
|
|
1835
|
+
if (listItems.length === 0) return false;
|
|
1836
|
+
let refLikeCount = 0;
|
|
1837
|
+
for (const li of listItems) {
|
|
1838
|
+
const body = li.replace(/^(?:\d+[.)]|[-*•])\s+/, '').trim();
|
|
1839
|
+
// Only treat as a "reference" if the line IS the reference —
|
|
1840
|
+
// i.e. a path/URL/hash with no surrounding English. A short bug
|
|
1841
|
+
// description like "login button does nothing on Safari" still
|
|
1842
|
+
// counts as content, not a reference.
|
|
1843
|
+
// Path: contains '/' or '\' OR starts with '.' AND has NO spaces
|
|
1844
|
+
// URL: starts with http(s)://
|
|
1845
|
+
// Hash: 7-40 hex chars only, no spaces
|
|
1846
|
+
const isPath = (/[/\\]/.test(body) || /^\./.test(body)) && !/\s/.test(body);
|
|
1847
|
+
const isUrl = /^https?:\/\//.test(body) && !/\s/.test(body);
|
|
1848
|
+
const isHash = /^[0-9a-f]{7,40}$/i.test(body);
|
|
1849
|
+
if (isPath || isUrl || isHash) refLikeCount += 1;
|
|
1850
|
+
}
|
|
1851
|
+
// Strict-majority of list items are reference-like → ignore.
|
|
1852
|
+
return refLikeCount * 2 > listItems.length;
|
|
1853
|
+
}
|
|
1854
|
+
|
|
1855
|
+
/**
|
|
1856
|
+
* Heuristic: does the message contain a verb that signals "do this"?
|
|
1857
|
+
* Conservative — favors recall over precision.
|
|
1858
|
+
* @private
|
|
1859
|
+
*/
|
|
1860
|
+
_hasImperativeSignal(t) {
|
|
1861
|
+
// Word-boundary match against a set of common imperative verbs.
|
|
1862
|
+
// Order matters only for readability — we check membership.
|
|
1863
|
+
return /\b(?:fix|add|build|implement|create|change|remove|delete|update|refactor|rewrite|migrate|integrate|configure|setup|set\s+up|design|generate|make|write|test|verify|ensure|review|optimize|improve|replace|move|rename|extract|split|merge|deploy|publish|ship|release|debug|investigate|analyze|reproduce|escalate|prioritize|schedule)\b/i.test(t);
|
|
1864
|
+
}
|
|
1865
|
+
|
|
1547
1866
|
_autoCreateTaskForMessage(agent, message, source, priority) {
|
|
1548
1867
|
if (!agent.taskList) {
|
|
1549
1868
|
agent.taskList = { tasks: [], lastUpdated: new Date().toISOString() };
|
|
@@ -60,6 +60,19 @@ class AgentScheduler {
|
|
|
60
60
|
// Initialize ContextInjectionService for file attachments
|
|
61
61
|
this.contextInjectionService = new ContextInjectionService({}, logger);
|
|
62
62
|
|
|
63
|
+
// Per-turn system-prompt rebuild cache for native-API models.
|
|
64
|
+
// Agents persist a `systemPrompt` baked at create-time for the
|
|
65
|
+
// chat-completion shape (text descriptions of every tool). When a
|
|
66
|
+
// turn targets a Responses-API model (Codex / o-series / gpt-5-pro),
|
|
67
|
+
// we want a TRIMMED prompt that omits text docs for tools whose
|
|
68
|
+
// structured schemas are sent in `tools:`. Rebuilding fresh each
|
|
69
|
+
// turn would be wasteful — agents typically stay on the same model
|
|
70
|
+
// for many turns — so we memoize per (agentId, modelName).
|
|
71
|
+
//
|
|
72
|
+
// Cleared on process restart and on agent updates that change the
|
|
73
|
+
// base prompt or capabilities (see `_invalidateNativePromptCache`).
|
|
74
|
+
this._nativePromptCache = new Map(); // `${agentId}|${modelName}` → string
|
|
75
|
+
|
|
63
76
|
// Initialize FlowContextService for flow execution context
|
|
64
77
|
this.flowContextService = new FlowContextService({}, logger);
|
|
65
78
|
|
|
@@ -1919,8 +1932,17 @@ class AgentScheduler {
|
|
|
1919
1932
|
// After compaction, retrieve messages from AgentPool (will use compacted if available)
|
|
1920
1933
|
const messagesToSend = await this.agentPool.getMessagesForAI(agentId, targetModel);
|
|
1921
1934
|
|
|
1922
|
-
//
|
|
1923
|
-
|
|
1935
|
+
// ── Pick the right system-prompt shape for the target model ──
|
|
1936
|
+
// Default: use the agent's persisted `systemPrompt` (baked at
|
|
1937
|
+
// create-time with full text descriptions for every tool — the
|
|
1938
|
+
// chat-completion shape). For models that use the Responses API
|
|
1939
|
+
// (native function-calling), rebuild a trimmed version that
|
|
1940
|
+
// omits text docs for tools whose structured schemas we send in
|
|
1941
|
+
// `tools:`. Falls back to the persisted prompt whenever the
|
|
1942
|
+
// model's apiType is unknown OR the agent has no stored original
|
|
1943
|
+
// prompt — preserves existing behaviour for old agents and
|
|
1944
|
+
// unknown models. See `_pickSystemPromptForModel`.
|
|
1945
|
+
let enhancedSystemPrompt = await this._pickSystemPromptForModel(agent, targetModel);
|
|
1924
1946
|
if (agent.mode === AGENT_MODES.AGENT) {
|
|
1925
1947
|
const taskManagerInstruction = "\n\nIMPORTANT: You are in AGENT mode. The use of TaskManager tool is mandatory.\n\n" +
|
|
1926
1948
|
"TASK LIFECYCLE (follow this, don't improvise):\n" +
|
|
@@ -2083,6 +2105,48 @@ class AgentScheduler {
|
|
|
2083
2105
|
});
|
|
2084
2106
|
}
|
|
2085
2107
|
|
|
2108
|
+
// ── Auto-inject CURRENT TASK LIST every turn ───────────────────
|
|
2109
|
+
// The task list lives in `agent.taskList.tasks` — durable, never
|
|
2110
|
+
// affected by compaction. But the conversation messages that
|
|
2111
|
+
// CREATED those tasks ARE compacted, so an agent that lost its
|
|
2112
|
+
// recent history may forget the task list exists. That's how
|
|
2113
|
+
// the Talisman bug happened: the agent called sync with a fresh
|
|
2114
|
+
// 4-task plan, silently wiping 9 in-flight tasks the user had
|
|
2115
|
+
// implicitly requested. Surface the current task list to the
|
|
2116
|
+
// agent every turn so it can never "forget" what's already on
|
|
2117
|
+
// the plan. Cheap (a few hundred chars), invariant to
|
|
2118
|
+
// compaction, and a natural deterrent against destructive sync.
|
|
2119
|
+
try {
|
|
2120
|
+
const tasks = agent.taskList?.tasks || [];
|
|
2121
|
+
if (Array.isArray(tasks) && tasks.length > 0) {
|
|
2122
|
+
const lines = ['\n\n## CURRENT TASK LIST (live from agent state — survives compaction)\n'];
|
|
2123
|
+
lines.push('These tasks exist in your durable state RIGHT NOW. If the conversation history doesn\'t mention them, that\'s because compaction summarized that section away — the tasks are still there.\n');
|
|
2124
|
+
lines.push('Before issuing `taskmanager sync`, READ this list. If you sync with a different plan, you will be dropping these.\n');
|
|
2125
|
+
// Compact, scannable. Title + status + priority is enough.
|
|
2126
|
+
const byStatus = { in_progress: [], pending: [], completed: [], cancelled: [] };
|
|
2127
|
+
for (const t of tasks) {
|
|
2128
|
+
const status = t.status || 'pending';
|
|
2129
|
+
(byStatus[status] || (byStatus[status] = [])).push(t);
|
|
2130
|
+
}
|
|
2131
|
+
const order = ['in_progress', 'pending', 'completed', 'cancelled'];
|
|
2132
|
+
for (const status of order) {
|
|
2133
|
+
const group = byStatus[status] || [];
|
|
2134
|
+
if (group.length === 0) continue;
|
|
2135
|
+
lines.push(`\n**${status}** (${group.length}):`);
|
|
2136
|
+
for (const t of group) {
|
|
2137
|
+
const pri = t.priority ? ` [${t.priority}]` : '';
|
|
2138
|
+
lines.push(`- ${t.title}${pri}`);
|
|
2139
|
+
}
|
|
2140
|
+
}
|
|
2141
|
+
enhancedSystemPrompt = (enhancedSystemPrompt || '') + lines.join('\n');
|
|
2142
|
+
}
|
|
2143
|
+
} catch (taskInjectErr) {
|
|
2144
|
+
// Best-effort — never block the turn on this.
|
|
2145
|
+
this.logger.warn(`Task list injection failed for agent ${agentId} (continuing without)`, {
|
|
2146
|
+
error: taskInjectErr?.message,
|
|
2147
|
+
});
|
|
2148
|
+
}
|
|
2149
|
+
|
|
2086
2150
|
// Check if streaming is enabled - consider both agent config and user message preference
|
|
2087
2151
|
// Get the last user message to check for streaming preference
|
|
2088
2152
|
const lastUserMsg = [...conversationHistory].reverse().find(m => m.role === 'user');
|
|
@@ -2169,6 +2233,156 @@ class AgentScheduler {
|
|
|
2169
2233
|
}
|
|
2170
2234
|
}
|
|
2171
2235
|
|
|
2236
|
+
/**
|
|
2237
|
+
* Choose the right base system prompt for the target model.
|
|
2238
|
+
*
|
|
2239
|
+
* • If the model's catalog entry says it uses the Responses API
|
|
2240
|
+
* ('responses' in its api_type / capabilities) AND the agent has
|
|
2241
|
+
* an `originalSystemPrompt` we can rebuild from, return a
|
|
2242
|
+
* freshly-built prompt that omits text descriptions for tools
|
|
2243
|
+
* with native function schemas (see baseTool.js — those tools'
|
|
2244
|
+
* structured schemas in `tools:` are the canonical source for
|
|
2245
|
+
* these models, so the text docs are pure duplication).
|
|
2246
|
+
*
|
|
2247
|
+
* • Otherwise return the agent's persisted `systemPrompt` exactly
|
|
2248
|
+
* as it is today. This covers:
|
|
2249
|
+
* – chat-completion models (no native function calling)
|
|
2250
|
+
* – models we can't classify (modelsService offline / catalog
|
|
2251
|
+
* field missing) — fail safe to old behaviour
|
|
2252
|
+
* – very old agents persisted before `originalSystemPrompt`
|
|
2253
|
+
* was stored — fail safe to old behaviour
|
|
2254
|
+
*
|
|
2255
|
+
* Result is memoized per `(agentId, targetModel)` to avoid rebuilding
|
|
2256
|
+
* on every turn. The cache is invalidated whenever the agent's base
|
|
2257
|
+
* prompt or capabilities change (see `_invalidateNativePromptCache`).
|
|
2258
|
+
*
|
|
2259
|
+
* @private
|
|
2260
|
+
* @param {Object} agent - Agent record
|
|
2261
|
+
* @param {string} targetModel - Model name about to be called
|
|
2262
|
+
* @returns {Promise<string>} The prompt to use as the base
|
|
2263
|
+
*/
|
|
2264
|
+
async _pickSystemPromptForModel(agent, targetModel) {
|
|
2265
|
+
// 1. Resolve the model's API type. Unknown → use persisted prompt.
|
|
2266
|
+
const apiType = this._resolveModelApiType(targetModel);
|
|
2267
|
+
if (apiType !== 'responses') return agent.systemPrompt;
|
|
2268
|
+
|
|
2269
|
+
// 2. Need the original (un-enhanced) prompt to rebuild from. Without
|
|
2270
|
+
// it we can't safely re-add the trimmed tool docs — fall back
|
|
2271
|
+
// to the persisted shape (which works for chat-completion and
|
|
2272
|
+
// is also accepted by Responses API, just with the duplication
|
|
2273
|
+
// cost). This is the back-compat path for legacy agents.
|
|
2274
|
+
if (!agent.originalSystemPrompt) return agent.systemPrompt;
|
|
2275
|
+
|
|
2276
|
+
// 3. Cache lookup.
|
|
2277
|
+
const cacheKey = `${agent.id}|${targetModel}`;
|
|
2278
|
+
const cached = this._nativePromptCache.get(cacheKey);
|
|
2279
|
+
if (cached) return cached;
|
|
2280
|
+
|
|
2281
|
+
// 4. Rebuild. The agentPool stores the toolsRegistry — reuse it so
|
|
2282
|
+
// we go through the exact same code path that built the original
|
|
2283
|
+
// prompt, just with apiType set. Skills index + the rest of the
|
|
2284
|
+
// augmentation must be reapplied; mirror what createAgent does.
|
|
2285
|
+
try {
|
|
2286
|
+
const registry = this.agentPool?.toolsRegistry;
|
|
2287
|
+
if (!registry) return agent.systemPrompt;
|
|
2288
|
+
|
|
2289
|
+
let rebuilt = registry.enhanceSystemPrompt(
|
|
2290
|
+
agent.originalSystemPrompt,
|
|
2291
|
+
agent.capabilities || [],
|
|
2292
|
+
{ apiType: 'responses' },
|
|
2293
|
+
);
|
|
2294
|
+
|
|
2295
|
+
// Re-inject ASSIGNED SKILLS block if present (createAgent appends
|
|
2296
|
+
// this after enhanceSystemPrompt — see agentPool.js:108).
|
|
2297
|
+
if (Array.isArray(agent.skills) && agent.skills.length > 0) {
|
|
2298
|
+
try {
|
|
2299
|
+
const { getSkillsService } = await import('../services/skillsService.js');
|
|
2300
|
+
const skillsService = getSkillsService(this.logger);
|
|
2301
|
+
await skillsService.initialize();
|
|
2302
|
+
const summaries = await skillsService.getSkillSummaries(agent.skills);
|
|
2303
|
+
if (summaries.length > 0) {
|
|
2304
|
+
rebuilt += '\n\n## ASSIGNED SKILLS\n\n';
|
|
2305
|
+
rebuilt += 'Use the skills tool to browse and load skill content. Use "describe" to see sections, "read-section" to load specific parts.\n\n';
|
|
2306
|
+
for (const s of summaries) {
|
|
2307
|
+
const sections = s.sections?.length ? `\n Sections: ${s.sections.map(h => h.replace(/^#+\s*/, '')).join(', ')}` : '';
|
|
2308
|
+
rebuilt += `- **${s.name}** (${s.lineCount} lines): ${s.description}${sections}\n`;
|
|
2309
|
+
}
|
|
2310
|
+
}
|
|
2311
|
+
} catch (e) {
|
|
2312
|
+
this.logger?.debug?.('Failed to re-inject skills index for native prompt', { error: e.message });
|
|
2313
|
+
}
|
|
2314
|
+
}
|
|
2315
|
+
|
|
2316
|
+
this._nativePromptCache.set(cacheKey, rebuilt);
|
|
2317
|
+
this.logger?.debug?.('Built native-API system prompt', {
|
|
2318
|
+
agentId: agent.id,
|
|
2319
|
+
targetModel,
|
|
2320
|
+
originalLength: agent.systemPrompt?.length || 0,
|
|
2321
|
+
rebuiltLength: rebuilt.length,
|
|
2322
|
+
savedTokensApprox: Math.round(((agent.systemPrompt?.length || 0) - rebuilt.length) / 4),
|
|
2323
|
+
});
|
|
2324
|
+
return rebuilt;
|
|
2325
|
+
} catch (err) {
|
|
2326
|
+
// Anything goes wrong → fall back to old behaviour. Failing
|
|
2327
|
+
// closed (no prompt) would break the agent's turn; failing open
|
|
2328
|
+
// (use chat-completion shape) just keeps the duplication.
|
|
2329
|
+
this.logger?.warn?.('Native system-prompt rebuild failed — using persisted prompt', {
|
|
2330
|
+
agentId: agent.id,
|
|
2331
|
+
targetModel,
|
|
2332
|
+
error: err.message,
|
|
2333
|
+
});
|
|
2334
|
+
return agent.systemPrompt;
|
|
2335
|
+
}
|
|
2336
|
+
}
|
|
2337
|
+
|
|
2338
|
+
/**
|
|
2339
|
+
* Look up a model's API type from the catalog. Returns 'responses',
|
|
2340
|
+
* 'chat_completion', or undefined when unknown. The catalog exposes
|
|
2341
|
+
* `api_type` as an array and/or `capabilities.responses`/`capabilities.chatCompletion`
|
|
2342
|
+
* — mirror the backend's _inferRouting precedence so the CLI's
|
|
2343
|
+
* classification matches the backend's routing decision exactly.
|
|
2344
|
+
* @private
|
|
2345
|
+
*/
|
|
2346
|
+
_resolveModelApiType(modelName) {
|
|
2347
|
+
try {
|
|
2348
|
+
if (!this.modelsService || typeof this.modelsService.getModels !== 'function') return undefined;
|
|
2349
|
+
const models = this.modelsService.getModels();
|
|
2350
|
+
const m = models.find(x => x.name === modelName);
|
|
2351
|
+
if (!m) return undefined;
|
|
2352
|
+
|
|
2353
|
+
const apiType = Array.isArray(m.api_type) ? m.api_type : (m.api_type ? [m.api_type] : []);
|
|
2354
|
+
const caps = m.capabilities || {};
|
|
2355
|
+
|
|
2356
|
+
// Mirrors backend services/llmServiceFactory.js _inferRouting:
|
|
2357
|
+
// responses if api_type contains 'responses' AND not 'chat_completion'
|
|
2358
|
+
// OR capabilities.responses === 'true' / chatCompletion === 'false'
|
|
2359
|
+
// OR explicit useResponsesApi flag
|
|
2360
|
+
if (apiType.includes('responses') && !apiType.includes('chat_completion')) return 'responses';
|
|
2361
|
+
if (caps.chatCompletion === 'false' && (caps.responses === 'true' || apiType.includes('responses'))) return 'responses';
|
|
2362
|
+
if (m.useResponsesApi) return 'responses';
|
|
2363
|
+
// Name-based fallback (last resort — only when catalog has no routing data)
|
|
2364
|
+
if (/codex/i.test(modelName) || /gpt.*-pro$/i.test(modelName)) return 'responses';
|
|
2365
|
+
return 'chat_completion';
|
|
2366
|
+
} catch (err) {
|
|
2367
|
+
// Defensive — never block the turn on a classification failure.
|
|
2368
|
+
this.logger?.debug?.('Model apiType resolution failed', { modelName, error: err.message });
|
|
2369
|
+
return undefined;
|
|
2370
|
+
}
|
|
2371
|
+
}
|
|
2372
|
+
|
|
2373
|
+
/**
|
|
2374
|
+
* Drop cached native prompts for an agent. Called by agentPool when
|
|
2375
|
+
* the base prompt or capabilities change so the next turn rebuilds.
|
|
2376
|
+
* Exposed so agentPool can call it without poking internal state.
|
|
2377
|
+
*/
|
|
2378
|
+
_invalidateNativePromptCache(agentId) {
|
|
2379
|
+
for (const key of this._nativePromptCache.keys()) {
|
|
2380
|
+
if (key.startsWith(`${agentId}|`)) {
|
|
2381
|
+
this._nativePromptCache.delete(key);
|
|
2382
|
+
}
|
|
2383
|
+
}
|
|
2384
|
+
}
|
|
2385
|
+
|
|
2172
2386
|
/**
|
|
2173
2387
|
* Get AI response using streaming with WebSocket broadcast
|
|
2174
2388
|
* @param {string} agentId - Agent ID
|