onbuzz 4.8.0 → 4.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -34,6 +34,147 @@ describe('ConversationCompactionService', () => {
34
34
  expect(service.compactionModelIndex).toBe(0);
35
35
  });
36
36
 
37
+ // ─── Compaction-input pre-tagging ─────────────────────────────────────
38
+ //
39
+ // The summarizer is fed PRE-TAGGED messages so it doesn't have to
40
+ // guess whether a `role: user` message is a real user typing or a
41
+ // tool-result wrapper. We pin the tagging rules here.
42
+
43
+ describe('_categorizeMessage — input pre-tagging', () => {
44
+ test('assistant role → AGENT', () => {
45
+ expect(service._categorizeMessage({ role: 'assistant', content: 'hi' })).toBe('AGENT');
46
+ });
47
+
48
+ test('system role → SYSTEM', () => {
49
+ expect(service._categorizeMessage({ role: 'system', content: 'note' })).toBe('SYSTEM');
50
+ });
51
+
52
+ test('user role with plain content → REAL_USER', () => {
53
+ expect(service._categorizeMessage({ role: 'user', content: 'please fix the board' })).toBe('REAL_USER');
54
+ });
55
+
56
+ test('user role with [Tool Results …] prefix → TOOL_RESULT', () => {
57
+ expect(service._categorizeMessage({
58
+ role: 'user',
59
+ content: '[Tool Results — 1 result] [filesystem] {...}',
60
+ })).toBe('TOOL_RESULT');
61
+ });
62
+
63
+ test('user role with [Previous Task …] prefix → PREVIOUS_TASK', () => {
64
+ expect(service._categorizeMessage({
65
+ role: 'user',
66
+ content: '[Previous Task — Final Tool Results] [jobdone] {...}',
67
+ })).toBe('PREVIOUS_TASK');
68
+ });
69
+
70
+ test('REGRESSION: leading whitespace before [Tool Results doesn\'t fool the categorizer', () => {
71
+ expect(service._categorizeMessage({
72
+ role: 'user',
73
+ content: '\n [Tool Results — 1] {}',
74
+ })).toBe('TOOL_RESULT');
75
+ });
76
+
77
+ test('non-string content (rare) is treated as REAL_USER (defensive)', () => {
78
+ // If content somehow isn't a string, we can't sniff the prefix —
79
+ // safest fallback is to treat it as a real user message so it
80
+ // surfaces in PASS 1 rather than being silently dropped.
81
+ expect(service._categorizeMessage({ role: 'user', content: { foo: 1 } })).toBe('REAL_USER');
82
+ });
83
+ });
84
+
85
+ // ─── Summary prompt contract — anti-lossy-compaction guard ────────────
86
+ //
87
+ // History: an earlier prompt told the summarizer to paraphrase user
88
+ // requests "under HIGH PRIORITY". The Talisman case study (May 2026)
89
+ // showed that paraphrasing alone caused the agent to lose track of
90
+ // literal user asks → off-track work (building a Settings screen
91
+ // nobody asked for). Then a verbatim-everything prompt fixed that
92
+ // half but ate the agent-side narrative — PASS 2 never reached.
93
+ // The final prompt (3-pass, pre-tagged input, EVENT LOG + STATE)
94
+ // was validated across 3 models × 7 variants. These tests pin it.
95
+
96
+ describe('_createSummaryPromptTemplate — three-pass contract', () => {
97
+ test('includes the {middle_segment} placeholder for interpolation', () => {
98
+ const tpl = service._createSummaryPromptTemplate();
99
+ expect(tpl).toContain('{middle_segment}');
100
+ });
101
+
102
+ test('declares the pre-tagging categories the summarizer can trust', () => {
103
+ // The prompt must list every category the categorizer emits, so
104
+ // the summarizer doesn't fall back to guessing from content.
105
+ const tpl = service._createSummaryPromptTemplate();
106
+ expect(tpl).toContain('[REAL_USER]');
107
+ expect(tpl).toContain('[TOOL_RESULT]');
108
+ expect(tpl).toContain('[PREVIOUS_TASK]');
109
+ expect(tpl).toContain('[AGENT]');
110
+ expect(tpl).toContain('[SYSTEM]');
111
+ });
112
+
113
+ test('mandates VERBATIM user-message transcription, NOT paraphrase', () => {
114
+ const tpl = service._createSummaryPromptTemplate();
115
+ expect(tpl).toMatch(/word[\s,-]?for[\s,-]?word/i);
116
+ expect(tpl).toMatch(/transcription/i);
117
+ expect(tpl).toMatch(/(do not|don't|never).*(paraphras|condens|omit)/i);
118
+ });
119
+
120
+ test('uses a 3-pass shape: USER VOICE → EVENT LOG → STATE NARRATIVE', () => {
121
+ const tpl = service._createSummaryPromptTemplate();
122
+ const userIdx = tpl.indexOf('USER VOICE');
123
+ const eventIdx = tpl.indexOf('EVENT LOG');
124
+ const stateIdx = tpl.indexOf('STATE NARRATIVE');
125
+ expect(userIdx).toBeGreaterThan(-1);
126
+ expect(eventIdx).toBeGreaterThan(-1);
127
+ expect(stateIdx).toBeGreaterThan(-1);
128
+ // User voice first, then event log, then narrative.
129
+ expect(userIdx).toBeLessThan(eventIdx);
130
+ expect(eventIdx).toBeLessThan(stateIdx);
131
+ });
132
+
133
+ test('PASS 1 forbids quoting non-REAL_USER tagged messages', () => {
134
+ // The most common failure mode in early experiments was the
135
+ // summarizer quoting tool-result wrappers as if they were user
136
+ // messages. The prompt explicitly forbids this.
137
+ const tpl = service._createSummaryPromptTemplate();
138
+ expect(tpl).toMatch(/only \[REAL_USER\] messages/i);
139
+ // And lists the categories it must NOT quote.
140
+ expect(tpl).toMatch(/do not quote.*\[TOOL_RESULT\]/i);
141
+ });
142
+
143
+ test('PASS 2 demands concrete details (file paths, tool names, line numbers)', () => {
144
+ const tpl = service._createSummaryPromptTemplate();
145
+ expect(tpl).toMatch(/file path/i);
146
+ expect(tpl).toMatch(/tool name/i);
147
+ expect(tpl).toMatch(/line number/i);
148
+ });
149
+
150
+ test('PASS 3 must explicitly map back to [REAL_USER] requests', () => {
151
+ // The Talisman bug was the agent doing work UNRELATED to the
152
+ // user's requests. PASS 3 must surface that misalignment when
153
+ // it exists — the prompt requires "name the gaps clearly".
154
+ const tpl = service._createSummaryPromptTemplate();
155
+ expect(tpl).toMatch(/map back/i);
156
+ expect(tpl).toMatch(/name the gaps/i);
157
+ });
158
+
159
+ test('forbids skipping a [REAL_USER] message on "already addressed" reasoning', () => {
160
+ const tpl = service._createSummaryPromptTemplate();
161
+ expect(tpl).toMatch(/(do not|don't|never).*skip.*\[?REAL_USER\]?/i);
162
+ expect(tpl).toMatch(/already addressed/i);
163
+ });
164
+
165
+ test('output contract: no preamble, exact headers', () => {
166
+ const tpl = service._createSummaryPromptTemplate();
167
+ expect(tpl).toMatch(/no preamble/i);
168
+ expect(tpl).toMatch(/exactly the section headers above/i);
169
+ });
170
+
171
+ test('REGRESSION: does NOT carry forward the lossy "HIGH PRIORITY paraphrase" guidance', () => {
172
+ const tpl = service._createSummaryPromptTemplate();
173
+ expect(tpl).not.toMatch(/HIGH PRIORITY \(Always Preserve\)/);
174
+ expect(tpl).not.toMatch(/PRESERVATION GUIDELINES/);
175
+ });
176
+ });
177
+
37
178
  // ─── compactConversation ─────────────────────────────────────────────
38
179
 
39
180
  test('compactConversation throws on empty messages array', async () => {
@@ -1,47 +1,65 @@
1
1
  /**
2
2
  * Regression tests for the router model name convention.
3
3
  *
4
- * Background: Dynamic Routing silently no-op'd for an entire release
5
- * because the CLI's `ROUTER_MODEL` constant was `'autopilot-model-router'`
6
- * while the catalog (the source of truth that the backend's /llm/chat
7
- * looks up) keys the entry as `'model-router'`. The catalog's regex
8
- * fallback is wrapped with ^…$ at index-build time in the backend
9
- * (services/modelCatalogService.js:109), so the `autopilot-` prefix
10
- * caused every routing-decision call to return 400, the
11
- * ModelRouterService caught + fell back to the current model, and no
12
- * routing ever happened.
4
+ * Background — two rounds of this same shape of bug:
13
5
  *
14
- * These tests pin the post-fix invariant and catch any reintroduction
15
- * of a product-name prefix in the future.
6
+ * Round 1: Dynamic Routing silently no-op'd because ROUTER_MODEL
7
+ * was `'autopilot-model-router'` while the catalog keyed the entry
8
+ * as `'model-router'`. Fixed by changing the constant to the bare
9
+ * form.
10
+ *
11
+ * Round 2: Even the bare `'model-router'` key didn't exist in the
12
+ * live catalog, because the catalog discovers deployments by
13
+ * underlying-model-name (not by Azure deployment-name). The team's
14
+ * actual deployment was named `autopilot-model-router` in Azure but
15
+ * its underlying model is `gpt-4.1-nano` — so the live catalog
16
+ * keys the entry under `gpt-4.1-nano`. The CLI now defaults to
17
+ * that name.
18
+ *
19
+ * These tests pin the new invariant + the override mechanism + catch
20
+ * any reintroduction of a product-name prefix.
16
21
  *
17
22
  * Why a separate test file: the existing modelRouterService.test.js
18
23
  * mocks the constants module at the top of the file, so it cannot
19
24
  * assert anything about the REAL value of MODEL_ROUTER_CONFIG. This
20
25
  * file imports the real constants instead.
21
26
  */
22
- import { describe, test, expect } from '@jest/globals';
27
+ import { describe, test, expect, beforeEach, afterEach } from '@jest/globals';
23
28
  import fs from 'node:fs';
24
29
  import path from 'node:path';
25
30
  import { fileURLToPath } from 'node:url';
26
31
 
27
- import { MODEL_ROUTER_CONFIG } from '../../utilities/constants.js';
28
-
29
32
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
30
33
  const SRC_ROOT = path.resolve(__dirname, '../..');
31
34
 
32
35
  describe('Router model naming — matches catalog convention', () => {
33
- test('ROUTER_MODEL is exactly "model-router"', () => {
34
- // This is the modelKey in autopilot-model-catalog's models_registry.json.
35
- // Any other value would force the backend through the regex fallback,
36
- // which is anchored ^…$ and will reject product-prefixed forms.
37
- expect(MODEL_ROUTER_CONFIG.ROUTER_MODEL).toBe('model-router');
36
+ test('default ROUTER_MODEL is "gpt-4.1-nano" (current live catalog key)', async () => {
37
+ // The autopilot-model-router deployment's underlying model is
38
+ // gpt-4.1-nano. The catalog keys entries by underlying model, so
39
+ // this is the canonical name the CLI must request.
40
+ delete process.env.LOXIA_ROUTER_MODEL;
41
+ // Re-import to pick up the (re-)evaluated default.
42
+ const fresh = await import(`../../utilities/constants.js?nocache=${Date.now()}`);
43
+ expect(fresh.MODEL_ROUTER_CONFIG.ROUTER_MODEL).toBe('gpt-4.1-nano');
44
+ });
45
+
46
+ test('LOXIA_ROUTER_MODEL env var overrides the default (no rebuild needed)', async () => {
47
+ process.env.LOXIA_ROUTER_MODEL = 'gpt-4o-mini';
48
+ try {
49
+ const fresh = await import(`../../utilities/constants.js?nocache=${Date.now()}`);
50
+ expect(fresh.MODEL_ROUTER_CONFIG.ROUTER_MODEL).toBe('gpt-4o-mini');
51
+ } finally {
52
+ delete process.env.LOXIA_ROUTER_MODEL;
53
+ }
38
54
  });
39
55
 
40
- test('ROUTER_MODEL does NOT carry a product/brand prefix', () => {
41
- // Defense-in-depth: even if the catalog modelKey ever changes,
42
- // the name must not start with a product prefix like "autopilot-"
56
+ test('ROUTER_MODEL does NOT carry a product/brand prefix', async () => {
57
+ // Defense-in-depth: even if the canonical name ever changes,
58
+ // it must not start with a product prefix like "autopilot-"
43
59
  // or "onbuzz-". The catalog's canonical names are product-agnostic.
44
- const v = MODEL_ROUTER_CONFIG.ROUTER_MODEL;
60
+ delete process.env.LOXIA_ROUTER_MODEL;
61
+ const fresh = await import(`../../utilities/constants.js?nocache=${Date.now()}`);
62
+ const v = fresh.MODEL_ROUTER_CONFIG.ROUTER_MODEL;
45
63
  expect(v).not.toMatch(/^autopilot[-_]/i);
46
64
  expect(v).not.toMatch(/^onbuzz[-_]/i);
47
65
  expect(v).not.toMatch(/^loxia[-_]/i);
@@ -632,10 +632,27 @@ class ConversationCompactionService {
632
632
  };
633
633
  }
634
634
 
635
- // Format middle messages for summarization
635
+ // Format middle messages for summarization — PRE-TAG each message
636
+ // with a category the summarizer can trust without inference.
637
+ //
638
+ // Why pre-tag instead of letting the summarizer figure it out:
639
+ // tool-result wrappers carry `role: user` (they come back as
640
+ // user-role messages by convention in this codebase). A summarizer
641
+ // staring at raw `user:` prefixes can't reliably tell a literal
642
+ // user typing from a tool-result blob — and in our experiments
643
+ // both gpt-4.1-mini and gpt-4.1-nano routinely quoted tool blobs
644
+ // as if they were user messages, wasting budget and corrupting
645
+ // the user-voice section. Categorizing here eliminates that whole
646
+ // failure class. See _categorizeMessage for the rules.
636
647
  let middleContent = middleMessages
637
- .map(msg => `${msg.role}: ${typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content)}`)
638
- .join('\n\n');
648
+ .map(msg => {
649
+ const cat = this._categorizeMessage(msg);
650
+ const body = typeof msg.content === 'string'
651
+ ? msg.content
652
+ : JSON.stringify(msg.content);
653
+ return `[${cat}] ${body}`;
654
+ })
655
+ .join('\n\n────────\n\n');
639
656
 
640
657
  // Estimate input tokens
641
658
  const estimatedInputTokens = Math.ceil(middleContent.length / COMPACTION_CONFIG.CHARS_PER_TOKEN_ESTIMATE);
@@ -1128,55 +1145,112 @@ class ConversationCompactionService {
1128
1145
  }
1129
1146
 
1130
1147
  /**
1131
- * Create summary prompt template with preservation guidelines
1148
+ * Categorize one conversation message for compaction tagging.
1149
+ *
1150
+ * Returns one of:
1151
+ * REAL_USER — a literal user typing turn
1152
+ * TOOL_RESULT — a `[Tool Results …]` wrapper (carries role:user)
1153
+ * PREVIOUS_TASK — a `[Previous Task — Final Tool Results]` boundary
1154
+ * AGENT — assistant turn
1155
+ * SYSTEM — system message
1156
+ *
1157
+ * The categorization is deterministic — text-prefix sniffing on the
1158
+ * content, not heuristic. Matches the convention used everywhere
1159
+ * else in the CLI for marking tool-result envelopes.
1160
+ *
1161
+ * @param {object} msg - { role, content }
1162
+ * @returns {string} one of the categories above
1163
+ * @private
1164
+ */
1165
+ _categorizeMessage(msg) {
1166
+ if (msg.role === 'assistant') return 'AGENT';
1167
+ if (msg.role === 'system') return 'SYSTEM';
1168
+ // role === 'user' — could be a real user message OR a tool-result wrapper.
1169
+ const c = typeof msg.content === 'string' ? msg.content.trimStart() : '';
1170
+ if (c.startsWith('[Tool Results')) return 'TOOL_RESULT';
1171
+ if (c.startsWith('[Previous Task')) return 'PREVIOUS_TASK';
1172
+ return 'REAL_USER';
1173
+ }
1174
+
1175
+ /**
1176
+ * Create the compaction-summary prompt template.
1177
+ *
1178
+ * Why this prompt is shaped this way:
1179
+ * The previous "paraphrase-everything" template was found to drop
1180
+ * the user's literal asks during compaction (see the Talisman
1181
+ * case study: the agent paraphrased the user's 3-point UI request
1182
+ * into "redesign UI" and then went off and built a Settings
1183
+ * screen). Re-tested across 3 models × 5 prompt variants, this
1184
+ * three-pass shape was the highest-fidelity option that worked
1185
+ * uniformly well across gpt-4.1-mini, gpt-4.1-nano, and
1186
+ * FW-Kimi-K2.5. See tmp-compaction-experiment/ for the harness.
1187
+ *
1188
+ * PASS 1 is transcription. The summarizer is NOT allowed to filter
1189
+ * user messages by "I think the agent already handled this." That
1190
+ * determination belongs to the consumer agent reading the summary,
1191
+ * not to the summarizer itself — making the summarizer choose was
1192
+ * how completed-vs-open misjudgments crept in. The blockquote
1193
+ * format gives the consumer agent a strong visual signal to
1194
+ * anchor on those literal asks.
1195
+ *
1196
+ * PASS 2 (event log) and PASS 3 (state narrative) summarize the
1197
+ * agent's work — files, tools, decisions, state. Heavy compression
1198
+ * is OK there; only the user-voice section is sacred.
1199
+ *
1132
1200
  * @private
1133
1201
  */
1134
1202
  _createSummaryPromptTemplate() {
1135
- return `You are compacting a conversation to preserve critical information while reducing token count.
1136
-
1137
- CONTEXT: You are summarizing the EARLIEST portion of a conversation. The most recent messages are preserved separately. Your summary should capture what was accomplished, key decisions, and any information still relevant for continuation. Focus on outcomes over process.
1138
-
1139
- PRESERVATION GUIDELINES:
1140
-
1141
- HIGH PRIORITY (Always Preserve):
1142
- - User requests and goals: What the user asked for, their stated preferences, and desired outcomes — these drive all ongoing work
1143
- - Current task and next steps: What the agent is actively working on and what remains to be done
1144
- - Recent achievements and current status: What was accomplished, what state the work is in now
1145
- - Files created or modified successfully: Full file paths that were written, created, or changed
1146
- - Meaningful tool invocations and their outcomes: Tool calls that produced important results or side effects
1147
- - Future reference value: Information likely to be referenced again
1148
- - Decisions and reasoning: WHY things were decided, not just what
1149
- - API signatures and interfaces: Function definitions, method calls
1150
- - Active dependencies: Information that ongoing work relies on
1151
- - Error patterns and solutions: What failed and how it was fixed
1152
- - Key facts and data: Specific numbers, names, configurations
1153
-
1154
- MEDIUM PRIORITY (Compress Intelligently):
1155
- - Code blocks: Keep function signatures + brief description, compress implementation details
1156
- - Working solutions: Essence and outcome, not every implementation step
1157
- - Failed attempts: Brief mention of what didn't work and why, skip detailed troubleshooting
1158
- - Repetitive content: Consolidate similar examples or explanations
1159
-
1160
- LOW PRIORITY (Heavily Compress/Remove):
1161
- - Completed calculations: Keep results, skip intermediate steps
1162
- - Verbose explanations: Summarize well-known concepts
1163
- - Debug output: Skip terminal logs and error messages that served their purpose
1164
- - Trial-and-error sequences: Skip multiple failed attempts with no lasting value
1165
- - Acknowledgments and pleasantries: Skip "thank you", "sure", "okay" type exchanges
1166
-
1167
- CONVERSATION SEGMENT TO SUMMARIZE:
1168
- {middle_segment}
1203
+ return `You are compacting an earlier portion of an agent-user conversation. The input has been PRE-TAGGED every message starts with one of:
1204
+
1205
+ [REAL_USER] — a literal user message; TRANSCRIBE VERBATIM in PASS 1
1206
+ [AGENT] — assistant turn (tool calls + reasoning)
1207
+ [TOOL_RESULT] — a tool's output; the consumer agent does NOT need these verbatim
1208
+ [PREVIOUS_TASK] — final tool-result block from a previous task boundary
1209
+ [SYSTEM] — system note
1169
1210
 
1170
- TASK: Create a concise summary that preserves logical flow and critical information. Focus on:
1171
- 1. Key decisions and their reasoning
1172
- 2. Important facts, data, and configurations
1173
- 3. Active context needed for continuation
1174
- 4. Problem-solving outcomes (skip the debugging process)
1175
- 5. Dependencies and interfaces that code/work relies on
1211
+ You DO NOT need to detect categories yourself. Trust the tags. The pre-tagging is deterministic.
1176
1212
 
1177
- Someone reading this should understand the conversation progression and have all information needed for effective continuation.
1213
+ Write the summary in THREE passes, in this exact order.
1214
+
1215
+ ──────────────────────────────────────────────
1216
+ PASS 1 — USER VOICE (transcription only, no judgment)
1217
+ ──────────────────────────────────────────────
1218
+
1219
+ For EVERY [REAL_USER] message — and ONLY [REAL_USER] messages — emit a blockquote:
1220
+
1221
+ > **User said (orig idx N):** "<exact text, word for word, all of it>"
1222
+
1223
+ Absolute rules:
1224
+ - Do NOT quote any [TOOL_RESULT], [AGENT], [PREVIOUS_TASK], or [SYSTEM] message here.
1225
+ - Do NOT condense, paraphrase, or omit any [REAL_USER] message.
1226
+ - Do NOT skip a [REAL_USER] message on the assumption "the agent already addressed it." That determination belongs to the consumer agent, not to you. Your job here is transcription.
1227
+ - Reproduce every [REAL_USER] message, in original order, including punctuation and typos.
1228
+ - If the input has no [REAL_USER] messages, write "(no user messages in this segment)" and proceed.
1229
+
1230
+ ──────────────────────────────────────────────
1231
+ PASS 2 — EVENT LOG (chronological bullets, concrete details)
1232
+ ──────────────────────────────────────────────
1233
+
1234
+ A bulleted list of every notable event between/after the user messages. ONE bullet per event:
1235
+
1236
+ - [orig idx N] <one-line description — include full file paths, tool names, line numbers, status, and outcome>
1237
+
1238
+ Cover: file writes, successful tool calls that changed state, decisions made by the agent, errors that affected outcome, task-list changes (especially destructive ones like 'removed: N tasks'). Skip: pure-read tool calls that didn't change state, repeated reads, pleasantries, verbose tool output dumps.
1239
+
1240
+ A consumer agent should be able to read this log and reconstruct the cause-and-effect chain — what happened to each [REAL_USER] request.
1241
+
1242
+ ──────────────────────────────────────────────
1243
+ PASS 3 — STATE NARRATIVE (2–4 sentences)
1244
+ ──────────────────────────────────────────────
1245
+
1246
+ Plain prose describing the situation at the end of this segment: what is done, what is mid-flight, what is open — and where possible, map back to which [REAL_USER] request each piece corresponds to. If [REAL_USER] requests are still open with no work toward them, say so explicitly. This is the place where lossy paraphrase is most dangerous — name the gaps clearly.
1247
+
1248
+ ──────────────────────────────────────────────
1249
+
1250
+ CONVERSATION SEGMENT TO COMPACT:
1251
+ {middle_segment}
1178
1252
 
1179
- OUTPUT: Provide ONLY the summary text without preamble, explanation, or meta-commentary.`;
1253
+ OUTPUT: PASS 1, PASS 2, PASS 3 in that order. Use exactly the section headers above. No preamble, no meta-commentary.`;
1180
1254
  }
1181
1255
  }
1182
1256
 
@@ -364,6 +364,177 @@ describe('ToolsRegistry', () => {
364
364
  expect(desc).toContain('HOW TO GET TOOL DOCUMENTATION');
365
365
  });
366
366
 
367
+ describe('OPERATING POSTURE section', () => {
368
+ // Minimal fakes so the registry will accept these as memory/skills/taskmanager.
369
+ // BaseTool derives `this.id` from the class name (lowercased, with
370
+ // "Tool" stripped) — so the class must be named exactly *Tool* and
371
+ // the id is derived. We override `this.id` after super() to pin it
372
+ // independently of the class name, which keeps the test classes
373
+ // readable. validateTool() also requires parseParameters.
374
+ class FakeMemoryTool extends BaseTool {
375
+ constructor() { super(); this.id = 'memory'; }
376
+ getDescription() { return 'Memory tool stub'; }
377
+ parseParameters() { return {}; }
378
+ async execute() { return { ok: true }; }
379
+ }
380
+ class FakeSkillsTool extends BaseTool {
381
+ constructor() { super(); this.id = 'skills'; }
382
+ getDescription() { return 'Skills tool stub'; }
383
+ parseParameters() { return {}; }
384
+ async execute() { return { ok: true }; }
385
+ }
386
+ class FakeTaskManagerTool extends BaseTool {
387
+ constructor() { super(); this.id = 'taskmanager'; }
388
+ getDescription() { return 'TaskManager tool stub'; }
389
+ parseParameters() { return {}; }
390
+ async execute() { return { ok: true }; }
391
+ }
392
+
393
+ test('appears when memory tool is in capabilities (proactive memory nudge)', async () => {
394
+ await registry.registerTool(FakeMemoryTool);
395
+ const desc = registry.generateToolDescriptionsForPrompt(['memory']);
396
+ expect(desc).toContain('OPERATING POSTURE');
397
+ expect(desc).toMatch(/memory.*list/i);
398
+ // The plan/* convention should be cross-referenced here so that writing a plan
399
+ // memory isn't an isolated tip buried in the memory tool's own desc.
400
+ expect(desc).toContain('plan/');
401
+ });
402
+
403
+ test('appears when skills tool is in capabilities (proactive skills nudge)', async () => {
404
+ await registry.registerTool(FakeSkillsTool);
405
+ const desc = registry.generateToolDescriptionsForPrompt(['skills']);
406
+ expect(desc).toContain('OPERATING POSTURE');
407
+ expect(desc).toMatch(/skills.*list/i);
408
+ });
409
+
410
+ test('does NOT appear when neither memory nor skills is in capabilities', async () => {
411
+ await registry.registerTool(TestTool);
412
+ const desc = registry.generateToolDescriptionsForPrompt(['test']);
413
+ expect(desc).not.toContain('OPERATING POSTURE');
414
+ });
415
+
416
+ test('distinguishes memory vs taskmanager when both are present (so agents know which to use)', async () => {
417
+ await registry.registerTool(FakeMemoryTool);
418
+ await registry.registerTool(FakeTaskManagerTool);
419
+ const desc = registry.generateToolDescriptionsForPrompt(['memory', 'taskmanager']);
420
+ expect(desc).toContain('OPERATING POSTURE');
421
+ expect(desc).toContain('persistent knowledge'); // memory
422
+ expect(desc).toContain('step-by-step'); // taskmanager
423
+ });
424
+
425
+ // REGRESSION: production observation — agents had 0 memory writes
426
+ // across 670-message sessions despite the previous wording asking
427
+ // them to "save when you recognize multi-turn work". Vague
428
+ // judgment-based triggers don't produce action. Tests pin that
429
+ // the new triggers are concrete and event-based.
430
+ test('REGRESSION: write-triggers are concrete events, not vague judgment', async () => {
431
+ await registry.registerTool(FakeMemoryTool);
432
+ const desc = registry.generateToolDescriptionsForPrompt(['memory']);
433
+ // The new wording should reference specific observable triggers,
434
+ // not "when you recognize" / "when you think".
435
+ expect(desc).toMatch(/numbered list|multi-bullet|substantive request/i);
436
+ expect(desc).toMatch(/before.*taskmanager.*sync|`taskmanager`.*sync/i);
437
+ expect(desc).toMatch(/non-obvious decision|tricky bug|unexpected error/i);
438
+ expect(desc).toMatch(/user gave you a preference/i);
439
+ // Should explicitly label the triggers as mandatory.
440
+ expect(desc).toMatch(/mandatory/i);
441
+ // And should NOT contain the old vague language.
442
+ expect(desc).not.toMatch(/when you recognize the work is multi-turn/i);
443
+ });
444
+
445
+ test('REGRESSION: write trigger mentions saving the user message VERBATIM', async () => {
446
+ // The Talisman bug was about losing the user's literal words.
447
+ // The trigger must instruct the agent to save the entire user
448
+ // message word-for-word, not a paraphrase of it.
449
+ await registry.registerTool(FakeMemoryTool);
450
+ const desc = registry.generateToolDescriptionsForPrompt(['memory']);
451
+ expect(desc).toMatch(/user'?s entire message verbatim/i);
452
+ });
453
+ });
454
+
455
+ // ── Per-model prompt shape: skip text docs for tools with native schemas
456
+ // when the target uses the Responses API (codex / o-series / gpt-5-pro).
457
+ describe('apiType="responses" — trims duplication with native function schemas', () => {
458
+ class FakeMemoryTool extends BaseTool {
459
+ constructor() { super(); this.id = 'memory'; }
460
+ getDescription() { return 'Memory tool stub with LONG description that would normally take many tokens'; }
461
+ getSummary() { return 'Persistent memory'; }
462
+ parseParameters() { return {}; }
463
+ async execute() { return { ok: true }; }
464
+ }
465
+ class FakeTerminalTool extends BaseTool {
466
+ constructor() { super(); this.id = 'terminal'; }
467
+ getDescription() { return 'Terminal tool LONG description that would normally take many tokens'; }
468
+ getSummary() { return 'Shell access'; }
469
+ parseParameters() { return {}; }
470
+ async execute() { return { ok: true }; }
471
+ }
472
+ class FakeWebTool extends BaseTool {
473
+ // 'web' is NOT in OPENAI_FUNCTION_SCHEMAS — its text doc must always appear.
474
+ constructor() { super(); this.id = 'web'; }
475
+ getDescription() { return 'Web tool LONG description that would normally take many tokens'; }
476
+ getSummary() { return 'Browser automation'; }
477
+ parseParameters() { return {}; }
478
+ async execute() { return { ok: true }; }
479
+ }
480
+
481
+ test('omits text description for tools that have native function schemas (memory, terminal)', async () => {
482
+ await registry.registerTool(FakeMemoryTool);
483
+ await registry.registerTool(FakeTerminalTool);
484
+
485
+ const responsesDesc = registry.generateToolDescriptionsForPrompt(
486
+ ['memory', 'terminal'],
487
+ { apiType: 'responses' },
488
+ );
489
+ // Header still present + one-line pointer to structured schema.
490
+ expect(responsesDesc).toContain('AVAILABLE TOOLS');
491
+ expect(responsesDesc).toContain('see structured schema');
492
+ // The big multi-line text doc must NOT be repeated.
493
+ expect(responsesDesc).not.toContain('### MEMORY TOOL');
494
+ expect(responsesDesc).not.toContain('### TERMINAL TOOL');
495
+ expect(responsesDesc).not.toContain('LONG description that would normally take many tokens');
496
+ });
497
+
498
+ test('keeps text description for tools that do NOT have native function schemas (e.g. web)', async () => {
499
+ await registry.registerTool(FakeWebTool);
500
+ const responsesDesc = registry.generateToolDescriptionsForPrompt(
501
+ ['web'],
502
+ { apiType: 'responses' },
503
+ );
504
+ // 'web' has no native schema → text doc MUST be present.
505
+ expect(responsesDesc).toContain('### WEB TOOL');
506
+ expect(responsesDesc).toContain('LONG description that would normally take many tokens');
507
+ });
508
+
509
+ test('BACKWARD COMPAT: without apiType option, behaves exactly as before (full text for everything)', async () => {
510
+ await registry.registerTool(FakeMemoryTool);
511
+ await registry.registerTool(FakeTerminalTool);
512
+
513
+ const defaultDesc = registry.generateToolDescriptionsForPrompt(['memory', 'terminal']);
514
+ // No apiType → keep the heavy text docs as today.
515
+ expect(defaultDesc).toContain('### MEMORY TOOL');
516
+ expect(defaultDesc).toContain('### TERMINAL TOOL');
517
+ });
518
+
519
+ test('BACKWARD COMPAT: apiType="chat_completion" is equivalent to no apiType', async () => {
520
+ await registry.registerTool(FakeMemoryTool);
521
+ const a = registry.generateToolDescriptionsForPrompt(['memory'], { apiType: 'chat_completion' });
522
+ const b = registry.generateToolDescriptionsForPrompt(['memory']);
523
+ expect(a).toBe(b);
524
+ });
525
+
526
+ test('enhanceSystemPrompt forwards apiType option to the description builder', async () => {
527
+ await registry.registerTool(FakeMemoryTool);
528
+ const native = registry.enhanceSystemPrompt('Base.', ['memory'], { apiType: 'responses' });
529
+ const inline = registry.enhanceSystemPrompt('Base.', ['memory']);
530
+ // Native form is meaningfully shorter (we dropped the per-tool block).
531
+ expect(native.length).toBeLessThan(inline.length);
532
+ // Both still contain the section headers and the original base prompt.
533
+ expect(native).toContain('Base.');
534
+ expect(native).toContain('AVAILABLE TOOLS');
535
+ });
536
+ });
537
+
367
538
  test('enhanceSystemPrompt appends tool docs', async () => {
368
539
  await registry.registerTool(TestTool);
369
540
  const enhanced = registry.enhanceSystemPrompt('Base prompt.', []);