onbuzz 4.8.1 → 4.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/core/__tests__/agentPool.test.js +185 -0
- package/src/core/__tests__/agentScheduler.taskListInjection.test.js +94 -0
- package/src/core/agentPool.js +307 -0
- package/src/core/agentScheduler.js +42 -0
- package/src/services/__tests__/compactionRetry.test.js +42 -42
- package/src/services/__tests__/conversationCompactionService.test.js +141 -0
- package/src/services/conversationCompactionService.js +120 -46
- package/src/tools/__tests__/baseTool.test.js +29 -0
- package/src/tools/__tests__/codeMapTool.test.js +179 -0
- package/src/tools/__tests__/taskManagerTool.test.js +141 -0
- package/src/tools/baseTool.js +14 -8
- package/src/tools/taskManagerTool.js +72 -2
- package/src/utilities/constants.js +19 -11
|
@@ -19,11 +19,11 @@ describe('ConversationCompactionService - _generateSummary retry behavior', () =
|
|
|
19
19
|
};
|
|
20
20
|
|
|
21
21
|
const mockModelsService = {
|
|
22
|
-
getAvailableModelNames: jest.fn().mockReturnValue(['gpt-
|
|
22
|
+
getAvailableModelNames: jest.fn().mockReturnValue(['gpt-4.1-nano', 'gpt-4.1-mini']),
|
|
23
23
|
getModels: jest.fn().mockReturnValue([
|
|
24
|
-
{ name: 'gpt-
|
|
25
|
-
{ name: 'gpt-
|
|
26
|
-
{ name: 'gpt-
|
|
24
|
+
{ name: 'gpt-4.1-nano', type: 'chat', contextWindow: 400000 },
|
|
25
|
+
{ name: 'gpt-4.1-mini', type: 'chat', contextWindow: 400000 },
|
|
26
|
+
{ name: 'gpt-4.1-nano', type: 'chat', contextWindow: 400000 },
|
|
27
27
|
{ name: 'random-model-xyz', type: 'chat', contextWindow: 200000 }
|
|
28
28
|
])
|
|
29
29
|
};
|
|
@@ -53,12 +53,12 @@ describe('ConversationCompactionService - _generateSummary retry behavior', () =
|
|
|
53
53
|
const onRetryAttempt = jest.fn();
|
|
54
54
|
|
|
55
55
|
// Only 2 validated models available; first fails, second succeeds
|
|
56
|
-
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-
|
|
56
|
+
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-4.1-nano', 'gpt-4.1-mini']);
|
|
57
57
|
mockAiService.sendMessage
|
|
58
58
|
.mockRejectedValueOnce(new Error('Service unavailable'))
|
|
59
59
|
.mockResolvedValueOnce({ content: 'Summary of conversation' });
|
|
60
60
|
|
|
61
|
-
await service._generateSummary(testMessages, 'gpt-
|
|
61
|
+
await service._generateSummary(testMessages, 'gpt-4.1-nano', { onRetryAttempt });
|
|
62
62
|
|
|
63
63
|
expect(onRetryAttempt).toHaveBeenCalledTimes(1);
|
|
64
64
|
});
|
|
@@ -66,18 +66,18 @@ describe('ConversationCompactionService - _generateSummary retry behavior', () =
|
|
|
66
66
|
test('onRetryAttempt receives correct message, failedModel, nextModel, attempt', async () => {
|
|
67
67
|
const onRetryAttempt = jest.fn();
|
|
68
68
|
|
|
69
|
-
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-
|
|
69
|
+
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-4.1-nano', 'gpt-4.1-mini']);
|
|
70
70
|
mockAiService.sendMessage
|
|
71
71
|
.mockRejectedValueOnce(new Error('Service unavailable'))
|
|
72
72
|
.mockResolvedValueOnce({ content: 'Summary of conversation' });
|
|
73
73
|
|
|
74
|
-
await service._generateSummary(testMessages, 'gpt-
|
|
74
|
+
await service._generateSummary(testMessages, 'gpt-4.1-nano', { onRetryAttempt });
|
|
75
75
|
|
|
76
76
|
expect(onRetryAttempt).toHaveBeenCalledWith(
|
|
77
77
|
expect.objectContaining({
|
|
78
78
|
type: 'compaction_retry',
|
|
79
|
-
failedModel: 'gpt-
|
|
80
|
-
nextModel: 'gpt-
|
|
79
|
+
failedModel: 'gpt-4.1-nano',
|
|
80
|
+
nextModel: 'gpt-4.1-mini',
|
|
81
81
|
attempt: 1
|
|
82
82
|
})
|
|
83
83
|
);
|
|
@@ -87,14 +87,14 @@ describe('ConversationCompactionService - _generateSummary retry behavior', () =
|
|
|
87
87
|
const onRetryAttempt = jest.fn();
|
|
88
88
|
|
|
89
89
|
// Only 1 validated model, and no suitable random models
|
|
90
|
-
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-
|
|
90
|
+
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-4.1-nano']);
|
|
91
91
|
mockModelsService.getModels.mockReturnValue([
|
|
92
|
-
{ name: 'gpt-
|
|
92
|
+
{ name: 'gpt-4.1-nano', type: 'chat', contextWindow: 400000 }
|
|
93
93
|
]);
|
|
94
94
|
mockAiService.sendMessage.mockRejectedValue(new Error('Service unavailable'));
|
|
95
95
|
|
|
96
96
|
await expect(
|
|
97
|
-
service._generateSummary(testMessages, 'gpt-
|
|
97
|
+
service._generateSummary(testMessages, 'gpt-4.1-nano', { onRetryAttempt })
|
|
98
98
|
).rejects.toThrow();
|
|
99
99
|
|
|
100
100
|
// The only call to onRetryAttempt would be from the last-resort block, but
|
|
@@ -108,16 +108,16 @@ describe('ConversationCompactionService - _generateSummary retry behavior', () =
|
|
|
108
108
|
|
|
109
109
|
describe('Random model fallback', () => {
|
|
110
110
|
test('after all recommended models fail, tries a random model from modelsService', async () => {
|
|
111
|
-
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-
|
|
111
|
+
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-4.1-nano']);
|
|
112
112
|
mockModelsService.getModels.mockReturnValue([
|
|
113
|
-
{ name: 'gpt-
|
|
113
|
+
{ name: 'gpt-4.1-nano', type: 'chat', contextWindow: 400000 },
|
|
114
114
|
{ name: 'random-model-xyz', type: 'chat', contextWindow: 200000 }
|
|
115
115
|
]);
|
|
116
116
|
mockAiService.sendMessage
|
|
117
117
|
.mockRejectedValueOnce(new Error('Service unavailable')) // recommended model fails
|
|
118
118
|
.mockResolvedValueOnce({ content: 'Last-resort summary' }); // random model succeeds
|
|
119
119
|
|
|
120
|
-
const result = await service._generateSummary(testMessages, 'gpt-
|
|
120
|
+
const result = await service._generateSummary(testMessages, 'gpt-4.1-nano', {});
|
|
121
121
|
|
|
122
122
|
// Should have been called twice: once for recommended, once for random
|
|
123
123
|
expect(mockAiService.sendMessage).toHaveBeenCalledTimes(2);
|
|
@@ -125,16 +125,16 @@ describe('ConversationCompactionService - _generateSummary retry behavior', () =
|
|
|
125
125
|
});
|
|
126
126
|
|
|
127
127
|
test('random model success returns valid summary and does not throw', async () => {
|
|
128
|
-
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-
|
|
128
|
+
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-4.1-nano']);
|
|
129
129
|
mockModelsService.getModels.mockReturnValue([
|
|
130
|
-
{ name: 'gpt-
|
|
130
|
+
{ name: 'gpt-4.1-nano', type: 'chat', contextWindow: 400000 },
|
|
131
131
|
{ name: 'fallback-model', type: 'chat', contextWindow: 200000 }
|
|
132
132
|
]);
|
|
133
133
|
mockAiService.sendMessage
|
|
134
134
|
.mockRejectedValueOnce(new Error('Service unavailable'))
|
|
135
135
|
.mockResolvedValueOnce({ content: 'Fallback summary content' });
|
|
136
136
|
|
|
137
|
-
const result = await service._generateSummary(testMessages, 'gpt-
|
|
137
|
+
const result = await service._generateSummary(testMessages, 'gpt-4.1-nano', {});
|
|
138
138
|
|
|
139
139
|
expect(result.role).toBe('system');
|
|
140
140
|
expect(result.type).toBe('summary');
|
|
@@ -142,44 +142,44 @@ describe('ConversationCompactionService - _generateSummary retry behavior', () =
|
|
|
142
142
|
});
|
|
143
143
|
|
|
144
144
|
test('random model failure still throws ALL_MODELS_EXHAUSTED', async () => {
|
|
145
|
-
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-
|
|
145
|
+
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-4.1-nano']);
|
|
146
146
|
mockAiService.sendMessage.mockRejectedValue(new Error('Everything is broken'));
|
|
147
147
|
|
|
148
148
|
await expect(
|
|
149
|
-
service._generateSummary(testMessages, 'gpt-
|
|
149
|
+
service._generateSummary(testMessages, 'gpt-4.1-nano', {})
|
|
150
150
|
).rejects.toThrow('ALL_MODELS_EXHAUSTED');
|
|
151
151
|
});
|
|
152
152
|
|
|
153
153
|
test('random model is NOT one already attempted (filtered out)', async () => {
|
|
154
|
-
// Only gpt-
|
|
155
|
-
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-
|
|
154
|
+
// Only gpt-4.1-nano is validated; random pool has others
|
|
155
|
+
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-4.1-nano']);
|
|
156
156
|
mockModelsService.getModels.mockReturnValue([
|
|
157
|
-
{ name: 'gpt-
|
|
157
|
+
{ name: 'gpt-4.1-nano', type: 'chat', contextWindow: 400000 },
|
|
158
158
|
{ name: 'random-model-xyz', type: 'chat', contextWindow: 200000 }
|
|
159
159
|
]);
|
|
160
160
|
mockAiService.sendMessage
|
|
161
|
-
.mockRejectedValueOnce(new Error('fail')) // gpt-
|
|
161
|
+
.mockRejectedValueOnce(new Error('fail')) // gpt-4.1-nano fails
|
|
162
162
|
.mockResolvedValueOnce({ content: 'Random success' }); // random-model-xyz succeeds
|
|
163
163
|
|
|
164
|
-
const result = await service._generateSummary(testMessages, 'gpt-
|
|
164
|
+
const result = await service._generateSummary(testMessages, 'gpt-4.1-nano', {});
|
|
165
165
|
|
|
166
166
|
// Second call should be the random model, not the already-attempted one
|
|
167
167
|
const secondCallModel = mockAiService.sendMessage.mock.calls[1][0];
|
|
168
|
-
expect(secondCallModel).not.toBe('gpt-
|
|
169
|
-
expect(result.metadata.compactionModel).not.toBe('gpt-
|
|
168
|
+
expect(secondCallModel).not.toBe('gpt-4.1-nano');
|
|
169
|
+
expect(result.metadata.compactionModel).not.toBe('gpt-4.1-nano');
|
|
170
170
|
});
|
|
171
171
|
|
|
172
172
|
test('random model must have sufficient context window', async () => {
|
|
173
173
|
// All models except the recommended one have tiny context windows
|
|
174
|
-
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-
|
|
174
|
+
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-4.1-nano']);
|
|
175
175
|
mockModelsService.getModels.mockReturnValue([
|
|
176
|
-
{ name: 'gpt-
|
|
176
|
+
{ name: 'gpt-4.1-nano', type: 'chat', contextWindow: 400000 },
|
|
177
177
|
{ name: 'tiny-model', type: 'chat', contextWindow: 100 } // too small
|
|
178
178
|
]);
|
|
179
179
|
mockAiService.sendMessage.mockRejectedValue(new Error('fail'));
|
|
180
180
|
|
|
181
181
|
await expect(
|
|
182
|
-
service._generateSummary(testMessages, 'gpt-
|
|
182
|
+
service._generateSummary(testMessages, 'gpt-4.1-nano', {})
|
|
183
183
|
).rejects.toThrow('ALL_MODELS_EXHAUSTED');
|
|
184
184
|
|
|
185
185
|
// Should only have tried the recommended model, not the tiny one
|
|
@@ -193,11 +193,11 @@ describe('ConversationCompactionService - _generateSummary retry behavior', () =
|
|
|
193
193
|
test('onAllModelsExhausted is called only after ALL models (including random) fail', async () => {
|
|
194
194
|
const onAllModelsExhausted = jest.fn();
|
|
195
195
|
|
|
196
|
-
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-
|
|
196
|
+
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-4.1-nano']);
|
|
197
197
|
mockAiService.sendMessage.mockRejectedValue(new Error('fail'));
|
|
198
198
|
|
|
199
199
|
await expect(
|
|
200
|
-
service._generateSummary(testMessages, 'gpt-
|
|
200
|
+
service._generateSummary(testMessages, 'gpt-4.1-nano', { onAllModelsExhausted })
|
|
201
201
|
).rejects.toThrow('ALL_MODELS_EXHAUSTED');
|
|
202
202
|
|
|
203
203
|
expect(onAllModelsExhausted).toHaveBeenCalledTimes(1);
|
|
@@ -211,16 +211,16 @@ describe('ConversationCompactionService - _generateSummary retry behavior', () =
|
|
|
211
211
|
test('onAllModelsExhausted includes all attempted model names', async () => {
|
|
212
212
|
const onAllModelsExhausted = jest.fn();
|
|
213
213
|
|
|
214
|
-
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-
|
|
214
|
+
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-4.1-nano', 'gpt-4.1-mini']);
|
|
215
215
|
mockAiService.sendMessage.mockRejectedValue(new Error('fail'));
|
|
216
216
|
|
|
217
217
|
await expect(
|
|
218
|
-
service._generateSummary(testMessages, 'gpt-
|
|
218
|
+
service._generateSummary(testMessages, 'gpt-4.1-nano', { onAllModelsExhausted })
|
|
219
219
|
).rejects.toThrow('ALL_MODELS_EXHAUSTED');
|
|
220
220
|
|
|
221
221
|
const callArg = onAllModelsExhausted.mock.calls[0][0];
|
|
222
|
-
expect(callArg.models).toContain('gpt-
|
|
223
|
-
expect(callArg.models).toContain('gpt-
|
|
222
|
+
expect(callArg.models).toContain('gpt-4.1-nano');
|
|
223
|
+
expect(callArg.models).toContain('gpt-4.1-mini');
|
|
224
224
|
// Should also include at least one random model that was attempted
|
|
225
225
|
expect(callArg.models.length).toBeGreaterThanOrEqual(2);
|
|
226
226
|
});
|
|
@@ -233,10 +233,10 @@ describe('ConversationCompactionService - _generateSummary retry behavior', () =
|
|
|
233
233
|
const onRetryAttempt = jest.fn();
|
|
234
234
|
const onAllModelsExhausted = jest.fn();
|
|
235
235
|
|
|
236
|
-
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-
|
|
236
|
+
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-4.1-nano', 'gpt-4.1-mini']);
|
|
237
237
|
mockAiService.sendMessage.mockResolvedValueOnce({ content: 'Great summary here' });
|
|
238
238
|
|
|
239
|
-
const result = await service._generateSummary(testMessages, 'gpt-
|
|
239
|
+
const result = await service._generateSummary(testMessages, 'gpt-4.1-nano', {
|
|
240
240
|
onRetryAttempt,
|
|
241
241
|
onAllModelsExhausted
|
|
242
242
|
});
|
|
@@ -253,12 +253,12 @@ describe('ConversationCompactionService - _generateSummary retry behavior', () =
|
|
|
253
253
|
const onRetryAttempt = jest.fn();
|
|
254
254
|
const onAllModelsExhausted = jest.fn();
|
|
255
255
|
|
|
256
|
-
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-
|
|
256
|
+
mockModelsService.getAvailableModelNames.mockReturnValue(['gpt-4.1-nano', 'gpt-4.1-mini']);
|
|
257
257
|
mockAiService.sendMessage
|
|
258
258
|
.mockRejectedValueOnce(new Error('429 rate limit'))
|
|
259
259
|
.mockResolvedValueOnce({ content: 'Second model summary' });
|
|
260
260
|
|
|
261
|
-
const result = await service._generateSummary(testMessages, 'gpt-
|
|
261
|
+
const result = await service._generateSummary(testMessages, 'gpt-4.1-nano', {
|
|
262
262
|
onRetryAttempt,
|
|
263
263
|
onAllModelsExhausted
|
|
264
264
|
});
|
|
@@ -266,7 +266,7 @@ describe('ConversationCompactionService - _generateSummary retry behavior', () =
|
|
|
266
266
|
expect(onRetryAttempt).toHaveBeenCalledTimes(1);
|
|
267
267
|
expect(onAllModelsExhausted).not.toHaveBeenCalled();
|
|
268
268
|
expect(result.content).toContain('Second model summary');
|
|
269
|
-
expect(result.metadata.compactionModel).toBe('gpt-
|
|
269
|
+
expect(result.metadata.compactionModel).toBe('gpt-4.1-mini');
|
|
270
270
|
});
|
|
271
271
|
});
|
|
272
272
|
});
|
|
@@ -34,6 +34,147 @@ describe('ConversationCompactionService', () => {
|
|
|
34
34
|
expect(service.compactionModelIndex).toBe(0);
|
|
35
35
|
});
|
|
36
36
|
|
|
37
|
+
// ─── Compaction-input pre-tagging ─────────────────────────────────────
|
|
38
|
+
//
|
|
39
|
+
// The summarizer is fed PRE-TAGGED messages so it doesn't have to
|
|
40
|
+
// guess whether a `role: user` message is a real user typing or a
|
|
41
|
+
// tool-result wrapper. We pin the tagging rules here.
|
|
42
|
+
|
|
43
|
+
describe('_categorizeMessage — input pre-tagging', () => {
|
|
44
|
+
test('assistant role → AGENT', () => {
|
|
45
|
+
expect(service._categorizeMessage({ role: 'assistant', content: 'hi' })).toBe('AGENT');
|
|
46
|
+
});
|
|
47
|
+
|
|
48
|
+
test('system role → SYSTEM', () => {
|
|
49
|
+
expect(service._categorizeMessage({ role: 'system', content: 'note' })).toBe('SYSTEM');
|
|
50
|
+
});
|
|
51
|
+
|
|
52
|
+
test('user role with plain content → REAL_USER', () => {
|
|
53
|
+
expect(service._categorizeMessage({ role: 'user', content: 'please fix the board' })).toBe('REAL_USER');
|
|
54
|
+
});
|
|
55
|
+
|
|
56
|
+
test('user role with [Tool Results …] prefix → TOOL_RESULT', () => {
|
|
57
|
+
expect(service._categorizeMessage({
|
|
58
|
+
role: 'user',
|
|
59
|
+
content: '[Tool Results — 1 result] [filesystem] {...}',
|
|
60
|
+
})).toBe('TOOL_RESULT');
|
|
61
|
+
});
|
|
62
|
+
|
|
63
|
+
test('user role with [Previous Task …] prefix → PREVIOUS_TASK', () => {
|
|
64
|
+
expect(service._categorizeMessage({
|
|
65
|
+
role: 'user',
|
|
66
|
+
content: '[Previous Task — Final Tool Results] [jobdone] {...}',
|
|
67
|
+
})).toBe('PREVIOUS_TASK');
|
|
68
|
+
});
|
|
69
|
+
|
|
70
|
+
test('REGRESSION: leading whitespace before [Tool Results doesn\'t fool the categorizer', () => {
|
|
71
|
+
expect(service._categorizeMessage({
|
|
72
|
+
role: 'user',
|
|
73
|
+
content: '\n [Tool Results — 1] {}',
|
|
74
|
+
})).toBe('TOOL_RESULT');
|
|
75
|
+
});
|
|
76
|
+
|
|
77
|
+
test('non-string content (rare) is treated as REAL_USER (defensive)', () => {
|
|
78
|
+
// If content somehow isn't a string, we can't sniff the prefix —
|
|
79
|
+
// safest fallback is to treat it as a real user message so it
|
|
80
|
+
// surfaces in PASS 1 rather than being silently dropped.
|
|
81
|
+
expect(service._categorizeMessage({ role: 'user', content: { foo: 1 } })).toBe('REAL_USER');
|
|
82
|
+
});
|
|
83
|
+
});
|
|
84
|
+
|
|
85
|
+
// ─── Summary prompt contract — anti-lossy-compaction guard ────────────
|
|
86
|
+
//
|
|
87
|
+
// History: an earlier prompt told the summarizer to paraphrase user
|
|
88
|
+
// requests "under HIGH PRIORITY". The Talisman case study (May 2026)
|
|
89
|
+
// showed that paraphrasing alone caused the agent to lose track of
|
|
90
|
+
// literal user asks → off-track work (building a Settings screen
|
|
91
|
+
// nobody asked for). Then a verbatim-everything prompt fixed that
|
|
92
|
+
// half but ate the agent-side narrative — PASS 2 never reached.
|
|
93
|
+
// The final prompt (3-pass, pre-tagged input, EVENT LOG + STATE)
|
|
94
|
+
// was validated across 3 models × 7 variants. These tests pin it.
|
|
95
|
+
|
|
96
|
+
describe('_createSummaryPromptTemplate — three-pass contract', () => {
|
|
97
|
+
test('includes the {middle_segment} placeholder for interpolation', () => {
|
|
98
|
+
const tpl = service._createSummaryPromptTemplate();
|
|
99
|
+
expect(tpl).toContain('{middle_segment}');
|
|
100
|
+
});
|
|
101
|
+
|
|
102
|
+
test('declares the pre-tagging categories the summarizer can trust', () => {
|
|
103
|
+
// The prompt must list every category the categorizer emits, so
|
|
104
|
+
// the summarizer doesn't fall back to guessing from content.
|
|
105
|
+
const tpl = service._createSummaryPromptTemplate();
|
|
106
|
+
expect(tpl).toContain('[REAL_USER]');
|
|
107
|
+
expect(tpl).toContain('[TOOL_RESULT]');
|
|
108
|
+
expect(tpl).toContain('[PREVIOUS_TASK]');
|
|
109
|
+
expect(tpl).toContain('[AGENT]');
|
|
110
|
+
expect(tpl).toContain('[SYSTEM]');
|
|
111
|
+
});
|
|
112
|
+
|
|
113
|
+
test('mandates VERBATIM user-message transcription, NOT paraphrase', () => {
|
|
114
|
+
const tpl = service._createSummaryPromptTemplate();
|
|
115
|
+
expect(tpl).toMatch(/word[\s,-]?for[\s,-]?word/i);
|
|
116
|
+
expect(tpl).toMatch(/transcription/i);
|
|
117
|
+
expect(tpl).toMatch(/(do not|don't|never).*(paraphras|condens|omit)/i);
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
test('uses a 3-pass shape: USER VOICE → EVENT LOG → STATE NARRATIVE', () => {
|
|
121
|
+
const tpl = service._createSummaryPromptTemplate();
|
|
122
|
+
const userIdx = tpl.indexOf('USER VOICE');
|
|
123
|
+
const eventIdx = tpl.indexOf('EVENT LOG');
|
|
124
|
+
const stateIdx = tpl.indexOf('STATE NARRATIVE');
|
|
125
|
+
expect(userIdx).toBeGreaterThan(-1);
|
|
126
|
+
expect(eventIdx).toBeGreaterThan(-1);
|
|
127
|
+
expect(stateIdx).toBeGreaterThan(-1);
|
|
128
|
+
// User voice first, then event log, then narrative.
|
|
129
|
+
expect(userIdx).toBeLessThan(eventIdx);
|
|
130
|
+
expect(eventIdx).toBeLessThan(stateIdx);
|
|
131
|
+
});
|
|
132
|
+
|
|
133
|
+
test('PASS 1 forbids quoting non-REAL_USER tagged messages', () => {
|
|
134
|
+
// The most common failure mode in early experiments was the
|
|
135
|
+
// summarizer quoting tool-result wrappers as if they were user
|
|
136
|
+
// messages. The prompt explicitly forbids this.
|
|
137
|
+
const tpl = service._createSummaryPromptTemplate();
|
|
138
|
+
expect(tpl).toMatch(/only \[REAL_USER\] messages/i);
|
|
139
|
+
// And lists the categories it must NOT quote.
|
|
140
|
+
expect(tpl).toMatch(/do not quote.*\[TOOL_RESULT\]/i);
|
|
141
|
+
});
|
|
142
|
+
|
|
143
|
+
test('PASS 2 demands concrete details (file paths, tool names, line numbers)', () => {
|
|
144
|
+
const tpl = service._createSummaryPromptTemplate();
|
|
145
|
+
expect(tpl).toMatch(/file path/i);
|
|
146
|
+
expect(tpl).toMatch(/tool name/i);
|
|
147
|
+
expect(tpl).toMatch(/line number/i);
|
|
148
|
+
});
|
|
149
|
+
|
|
150
|
+
test('PASS 3 must explicitly map back to [REAL_USER] requests', () => {
|
|
151
|
+
// The Talisman bug was the agent doing work UNRELATED to the
|
|
152
|
+
// user's requests. PASS 3 must surface that misalignment when
|
|
153
|
+
// it exists — the prompt requires "name the gaps clearly".
|
|
154
|
+
const tpl = service._createSummaryPromptTemplate();
|
|
155
|
+
expect(tpl).toMatch(/map back/i);
|
|
156
|
+
expect(tpl).toMatch(/name the gaps/i);
|
|
157
|
+
});
|
|
158
|
+
|
|
159
|
+
test('forbids skipping a [REAL_USER] message on "already addressed" reasoning', () => {
|
|
160
|
+
const tpl = service._createSummaryPromptTemplate();
|
|
161
|
+
expect(tpl).toMatch(/(do not|don't|never).*skip.*\[?REAL_USER\]?/i);
|
|
162
|
+
expect(tpl).toMatch(/already addressed/i);
|
|
163
|
+
});
|
|
164
|
+
|
|
165
|
+
test('output contract: no preamble, exact headers', () => {
|
|
166
|
+
const tpl = service._createSummaryPromptTemplate();
|
|
167
|
+
expect(tpl).toMatch(/no preamble/i);
|
|
168
|
+
expect(tpl).toMatch(/exactly the section headers above/i);
|
|
169
|
+
});
|
|
170
|
+
|
|
171
|
+
test('REGRESSION: does NOT carry forward the lossy "HIGH PRIORITY paraphrase" guidance', () => {
|
|
172
|
+
const tpl = service._createSummaryPromptTemplate();
|
|
173
|
+
expect(tpl).not.toMatch(/HIGH PRIORITY \(Always Preserve\)/);
|
|
174
|
+
expect(tpl).not.toMatch(/PRESERVATION GUIDELINES/);
|
|
175
|
+
});
|
|
176
|
+
});
|
|
177
|
+
|
|
37
178
|
// ─── compactConversation ─────────────────────────────────────────────
|
|
38
179
|
|
|
39
180
|
test('compactConversation throws on empty messages array', async () => {
|
|
@@ -632,10 +632,27 @@ class ConversationCompactionService {
|
|
|
632
632
|
};
|
|
633
633
|
}
|
|
634
634
|
|
|
635
|
-
// Format middle messages for summarization
|
|
635
|
+
// Format middle messages for summarization — PRE-TAG each message
|
|
636
|
+
// with a category the summarizer can trust without inference.
|
|
637
|
+
//
|
|
638
|
+
// Why pre-tag instead of letting the summarizer figure it out:
|
|
639
|
+
// tool-result wrappers carry `role: user` (they come back as
|
|
640
|
+
// user-role messages by convention in this codebase). A summarizer
|
|
641
|
+
// staring at raw `user:` prefixes can't reliably tell a literal
|
|
642
|
+
// user typing from a tool-result blob — and in our experiments
|
|
643
|
+
// both gpt-4.1-mini and gpt-4.1-nano routinely quoted tool blobs
|
|
644
|
+
// as if they were user messages, wasting budget and corrupting
|
|
645
|
+
// the user-voice section. Categorizing here eliminates that whole
|
|
646
|
+
// failure class. See _categorizeMessage for the rules.
|
|
636
647
|
let middleContent = middleMessages
|
|
637
|
-
.map(msg =>
|
|
638
|
-
|
|
648
|
+
.map(msg => {
|
|
649
|
+
const cat = this._categorizeMessage(msg);
|
|
650
|
+
const body = typeof msg.content === 'string'
|
|
651
|
+
? msg.content
|
|
652
|
+
: JSON.stringify(msg.content);
|
|
653
|
+
return `[${cat}] ${body}`;
|
|
654
|
+
})
|
|
655
|
+
.join('\n\n────────\n\n');
|
|
639
656
|
|
|
640
657
|
// Estimate input tokens
|
|
641
658
|
const estimatedInputTokens = Math.ceil(middleContent.length / COMPACTION_CONFIG.CHARS_PER_TOKEN_ESTIMATE);
|
|
@@ -1128,55 +1145,112 @@ class ConversationCompactionService {
|
|
|
1128
1145
|
}
|
|
1129
1146
|
|
|
1130
1147
|
/**
|
|
1131
|
-
*
|
|
1148
|
+
* Categorize one conversation message for compaction tagging.
|
|
1149
|
+
*
|
|
1150
|
+
* Returns one of:
|
|
1151
|
+
* REAL_USER — a literal user typing turn
|
|
1152
|
+
* TOOL_RESULT — a `[Tool Results …]` wrapper (carries role:user)
|
|
1153
|
+
* PREVIOUS_TASK — a `[Previous Task — Final Tool Results]` boundary
|
|
1154
|
+
* AGENT — assistant turn
|
|
1155
|
+
* SYSTEM — system message
|
|
1156
|
+
*
|
|
1157
|
+
* The categorization is deterministic — text-prefix sniffing on the
|
|
1158
|
+
* content, not heuristic. Matches the convention used everywhere
|
|
1159
|
+
* else in the CLI for marking tool-result envelopes.
|
|
1160
|
+
*
|
|
1161
|
+
* @param {object} msg - { role, content }
|
|
1162
|
+
* @returns {string} one of the categories above; any role other than assistant/system, and any non-string content, falls through to REAL_USER
|
|
1163
|
+
* @private
|
|
1164
|
+
*/
|
|
1165
|
+
_categorizeMessage(msg) {
|
|
1166
|
+
if (msg.role === 'assistant') return 'AGENT';
|
|
1167
|
+
if (msg.role === 'system') return 'SYSTEM';
|
|
1168
|
+
// Any remaining role is handled as a user turn — it could be a real user message OR a tool-result wrapper. Non-string content becomes '' and falls through to REAL_USER.
|
|
1169
|
+
const c = typeof msg.content === 'string' ? msg.content.trimStart() : '';
|
|
1170
|
+
if (c.startsWith('[Tool Results')) return 'TOOL_RESULT';
|
|
1171
|
+
if (c.startsWith('[Previous Task')) return 'PREVIOUS_TASK';
|
|
1172
|
+
return 'REAL_USER';
|
|
1173
|
+
}
|
|
1174
|
+
|
|
1175
|
+
/**
|
|
1176
|
+
* Create the compaction-summary prompt template.
|
|
1177
|
+
*
|
|
1178
|
+
* Why this prompt is shaped this way:
|
|
1179
|
+
* The previous "paraphrase-everything" template was found to drop
|
|
1180
|
+
* the user's literal asks during compaction (see the Talisman
|
|
1181
|
+
* case study: the agent paraphrased the user's 3-point UI request
|
|
1182
|
+
* into "redesign UI" and then went off and built a Settings
|
|
1183
|
+
* screen). Re-tested across 3 models × 5 prompt variants, this
|
|
1184
|
+
* three-pass shape was the highest-fidelity option that worked
|
|
1185
|
+
* uniformly well across gpt-4.1-mini, gpt-4.1-nano, and
|
|
1186
|
+
* FW-Kimi-K2.5. See tmp-compaction-experiment/ for the harness.
|
|
1187
|
+
*
|
|
1188
|
+
* PASS 1 is transcription. The summarizer is NOT allowed to filter
|
|
1189
|
+
* user messages by "I think the agent already handled this." That
|
|
1190
|
+
* determination belongs to the consumer agent reading the summary,
|
|
1191
|
+
* not to the summarizer itself — making the summarizer choose was
|
|
1192
|
+
* how completed-vs-open misjudgments crept in. The blockquote
|
|
1193
|
+
* format gives the consumer agent a strong visual signal to
|
|
1194
|
+
* anchor on those literal asks.
|
|
1195
|
+
*
|
|
1196
|
+
* PASS 2 is the chronological event log of the agent's work — files,
|
|
1197
|
+
* tools, decisions, outcomes; PASS 3 is a short narrative of the end
|
|
1198
|
+
* state. Heavy compression OK in both; only the user-voice section is sacred.
|
|
1199
|
+
*
|
|
1132
1200
|
* @private
|
|
1133
1201
|
*/
|
|
1134
1202
|
_createSummaryPromptTemplate() {
|
|
1135
|
-
return `You are compacting
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
- User requests and goals: What the user asked for, their stated preferences, and desired outcomes — these drive all ongoing work
|
|
1143
|
-
- Current task and next steps: What the agent is actively working on and what remains to be done
|
|
1144
|
-
- Recent achievements and current status: What was accomplished, what state the work is in now
|
|
1145
|
-
- Files created or modified successfully: Full file paths that were written, created, or changed
|
|
1146
|
-
- Meaningful tool invocations and their outcomes: Tool calls that produced important results or side effects
|
|
1147
|
-
- Future reference value: Information likely to be referenced again
|
|
1148
|
-
- Decisions and reasoning: WHY things were decided, not just what
|
|
1149
|
-
- API signatures and interfaces: Function definitions, method calls
|
|
1150
|
-
- Active dependencies: Information that ongoing work relies on
|
|
1151
|
-
- Error patterns and solutions: What failed and how it was fixed
|
|
1152
|
-
- Key facts and data: Specific numbers, names, configurations
|
|
1153
|
-
|
|
1154
|
-
MEDIUM PRIORITY (Compress Intelligently):
|
|
1155
|
-
- Code blocks: Keep function signatures + brief description, compress implementation details
|
|
1156
|
-
- Working solutions: Essence and outcome, not every implementation step
|
|
1157
|
-
- Failed attempts: Brief mention of what didn't work and why, skip detailed troubleshooting
|
|
1158
|
-
- Repetitive content: Consolidate similar examples or explanations
|
|
1159
|
-
|
|
1160
|
-
LOW PRIORITY (Heavily Compress/Remove):
|
|
1161
|
-
- Completed calculations: Keep results, skip intermediate steps
|
|
1162
|
-
- Verbose explanations: Summarize well-known concepts
|
|
1163
|
-
- Debug output: Skip terminal logs and error messages that served their purpose
|
|
1164
|
-
- Trial-and-error sequences: Skip multiple failed attempts with no lasting value
|
|
1165
|
-
- Acknowledgments and pleasantries: Skip "thank you", "sure", "okay" type exchanges
|
|
1166
|
-
|
|
1167
|
-
CONVERSATION SEGMENT TO SUMMARIZE:
|
|
1168
|
-
{middle_segment}
|
|
1203
|
+
return `You are compacting an earlier portion of an agent-user conversation. The input has been PRE-TAGGED — every message starts with one of:
|
|
1204
|
+
|
|
1205
|
+
[REAL_USER] — a literal user message; TRANSCRIBE VERBATIM in PASS 1
|
|
1206
|
+
[AGENT] — assistant turn (tool calls + reasoning)
|
|
1207
|
+
[TOOL_RESULT] — a tool's output; the consumer agent does NOT need these verbatim
|
|
1208
|
+
[PREVIOUS_TASK] — final tool-result block from a previous task boundary
|
|
1209
|
+
[SYSTEM] — system note
|
|
1169
1210
|
|
|
1170
|
-
|
|
1171
|
-
1. Key decisions and their reasoning
|
|
1172
|
-
2. Important facts, data, and configurations
|
|
1173
|
-
3. Active context needed for continuation
|
|
1174
|
-
4. Problem-solving outcomes (skip the debugging process)
|
|
1175
|
-
5. Dependencies and interfaces that code/work relies on
|
|
1211
|
+
You DO NOT need to detect categories yourself. Trust the tags. The pre-tagging is deterministic.
|
|
1176
1212
|
|
|
1177
|
-
|
|
1213
|
+
Write the summary in THREE passes, in this exact order.
|
|
1214
|
+
|
|
1215
|
+
──────────────────────────────────────────────
|
|
1216
|
+
PASS 1 — USER VOICE (transcription only, no judgment)
|
|
1217
|
+
──────────────────────────────────────────────
|
|
1218
|
+
|
|
1219
|
+
For EVERY [REAL_USER] message — and ONLY [REAL_USER] messages — emit a blockquote:
|
|
1220
|
+
|
|
1221
|
+
> **User said (orig idx N):** "<exact text, word for word, all of it>"
|
|
1222
|
+
|
|
1223
|
+
Absolute rules:
|
|
1224
|
+
- Do NOT quote any [TOOL_RESULT], [AGENT], [PREVIOUS_TASK], or [SYSTEM] message here.
|
|
1225
|
+
- Do NOT condense, paraphrase, or omit any [REAL_USER] message.
|
|
1226
|
+
- Do NOT skip a [REAL_USER] message on the assumption "the agent already addressed it." That determination belongs to the consumer agent, not to you. Your job here is transcription.
|
|
1227
|
+
- Reproduce every [REAL_USER] message, in original order, including punctuation and typos.
|
|
1228
|
+
- If the input has no [REAL_USER] messages, write "(no user messages in this segment)" and proceed.
|
|
1229
|
+
|
|
1230
|
+
──────────────────────────────────────────────
|
|
1231
|
+
PASS 2 — EVENT LOG (chronological bullets, concrete details)
|
|
1232
|
+
──────────────────────────────────────────────
|
|
1233
|
+
|
|
1234
|
+
A bulleted list of every notable event between/after the user messages. ONE bullet per event:
|
|
1235
|
+
|
|
1236
|
+
- [orig idx N] <one-line description — include full file paths, tool names, line numbers, status, and outcome>
|
|
1237
|
+
|
|
1238
|
+
Cover: file writes, successful tool calls that changed state, decisions made by the agent, errors that affected outcome, task-list changes (especially destructive ones like 'removed: N tasks'). Skip: pure-read tool calls that didn't change state, repeated reads, pleasantries, verbose tool output dumps.
|
|
1239
|
+
|
|
1240
|
+
A consumer agent should be able to read this log and reconstruct the cause-and-effect chain — what happened to each [REAL_USER] request.
|
|
1241
|
+
|
|
1242
|
+
──────────────────────────────────────────────
|
|
1243
|
+
PASS 3 — STATE NARRATIVE (2–4 sentences)
|
|
1244
|
+
──────────────────────────────────────────────
|
|
1245
|
+
|
|
1246
|
+
Plain prose describing the situation at the end of this segment: what is done, what is mid-flight, what is open — and where possible, map back to which [REAL_USER] request each piece corresponds to. If [REAL_USER] requests are still open with no work toward them, say so explicitly. This is the place where lossy paraphrase is most dangerous — name the gaps clearly.
|
|
1247
|
+
|
|
1248
|
+
──────────────────────────────────────────────
|
|
1249
|
+
|
|
1250
|
+
CONVERSATION SEGMENT TO COMPACT:
|
|
1251
|
+
{middle_segment}
|
|
1178
1252
|
|
|
1179
|
-
OUTPUT:
|
|
1253
|
+
OUTPUT: PASS 1, PASS 2, PASS 3 in that order. Use exactly the section headers above. No preamble, no meta-commentary.`;
|
|
1180
1254
|
}
|
|
1181
1255
|
}
|
|
1182
1256
|
|
|
@@ -421,6 +421,35 @@ describe('ToolsRegistry', () => {
|
|
|
421
421
|
expect(desc).toContain('persistent knowledge'); // memory
|
|
422
422
|
expect(desc).toContain('step-by-step'); // taskmanager
|
|
423
423
|
});
|
|
424
|
+
|
|
425
|
+
// REGRESSION: production observation — agents had 0 memory writes
|
|
426
|
+
// across 670-message sessions despite the previous wording asking
|
|
427
|
+
// them to "save when you recognize multi-turn work". Vague
|
|
428
|
+
// judgment-based triggers don't produce action. Tests pin that
|
|
429
|
+
// the new triggers are concrete and event-based.
|
|
430
|
+
test('REGRESSION: write-triggers are concrete events, not vague judgment', async () => {
|
|
431
|
+
await registry.registerTool(FakeMemoryTool);
|
|
432
|
+
const desc = registry.generateToolDescriptionsForPrompt(['memory']);
|
|
433
|
+
// The new wording should reference specific observable triggers,
|
|
434
|
+
// not "when you recognize" / "when you think".
|
|
435
|
+
expect(desc).toMatch(/numbered list|multi-bullet|substantive request/i);
|
|
436
|
+
expect(desc).toMatch(/before.*taskmanager.*sync|`taskmanager`.*sync/i);
|
|
437
|
+
expect(desc).toMatch(/non-obvious decision|tricky bug|unexpected error/i);
|
|
438
|
+
expect(desc).toMatch(/user gave you a preference/i);
|
|
439
|
+
// Should explicitly label the triggers as mandatory.
|
|
440
|
+
expect(desc).toMatch(/mandatory/i);
|
|
441
|
+
// And should NOT contain the old vague language.
|
|
442
|
+
expect(desc).not.toMatch(/when you recognize the work is multi-turn/i);
|
|
443
|
+
});
|
|
444
|
+
|
|
445
|
+
test('REGRESSION: write trigger mentions saving the user message VERBATIM', async () => {
|
|
446
|
+
// The Talisman bug was about losing the user's literal words.
|
|
447
|
+
// The trigger must instruct the agent to save the entire user
|
|
448
|
+
// message word-for-word, not a paraphrase of it.
|
|
449
|
+
await registry.registerTool(FakeMemoryTool);
|
|
450
|
+
const desc = registry.generateToolDescriptionsForPrompt(['memory']);
|
|
451
|
+
expect(desc).toMatch(/user'?s entire message verbatim/i);
|
|
452
|
+
});
|
|
424
453
|
});
|
|
425
454
|
|
|
426
455
|
// ── Per-model prompt shape: skip text docs for tools with native schemas
|