@mariozechner/pi-coding-agent 0.23.2 → 0.23.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +40 -0
- package/README.md +89 -148
- package/dist/core/agent-session.d.ts +3 -1
- package/dist/core/agent-session.d.ts.map +1 -1
- package/dist/core/agent-session.js +27 -6
- package/dist/core/agent-session.js.map +1 -1
- package/dist/core/custom-tools/loader.d.ts.map +1 -1
- package/dist/core/custom-tools/loader.js +4 -2
- package/dist/core/custom-tools/loader.js.map +1 -1
- package/dist/core/hooks/loader.d.ts.map +1 -1
- package/dist/core/hooks/loader.js +29 -3
- package/dist/core/hooks/loader.js.map +1 -1
- package/dist/core/hooks/types.d.ts +2 -1
- package/dist/core/hooks/types.d.ts.map +1 -1
- package/dist/core/hooks/types.js.map +1 -1
- package/dist/core/system-prompt.d.ts.map +1 -1
- package/dist/core/system-prompt.js +3 -3
- package/dist/core/system-prompt.js.map +1 -1
- package/dist/modes/interactive/components/diff.d.ts +12 -0
- package/dist/modes/interactive/components/diff.d.ts.map +1 -0
- package/dist/modes/interactive/components/diff.js +133 -0
- package/dist/modes/interactive/components/diff.js.map +1 -0
- package/dist/modes/interactive/components/tool-execution.d.ts.map +1 -1
- package/dist/modes/interactive/components/tool-execution.js +26 -20
- package/dist/modes/interactive/components/tool-execution.js.map +1 -1
- package/dist/modes/interactive/theme/dark.json +9 -9
- package/dist/modes/interactive/theme/light.json +9 -9
- package/dist/modes/interactive/theme/theme.d.ts +10 -0
- package/dist/modes/interactive/theme/theme.d.ts.map +1 -1
- package/dist/modes/interactive/theme/theme.js +131 -3
- package/dist/modes/interactive/theme/theme.js.map +1 -1
- package/docs/custom-tools.md +19 -1
- package/docs/hooks.md +39 -19
- package/docs/rpc.md +14 -0
- package/docs/skills.md +148 -52
- package/docs/theme.md +23 -21
- package/package.json +6 -6
- package/docs/compaction-new.md +0 -387
- package/docs/compaction-strategies.ts +0 -502
- package/docs/compaction.md +0 -519
- package/docs/gemini.md +0 -255
- package/docs/truncation.md +0 -235
- package/docs/undercompaction.md +0 -313
package/docs/undercompaction.md
DELETED
@@ -1,313 +0,0 @@
# Under-Compaction Analysis

## Problem Statement

Auto-compaction triggers too late, causing context window overflows that result in failed LLM calls with `stopReason == "length"`.

## Architecture Overview

### Event Flow

```
User prompt
  │
  ▼
agent.prompt()
  │
  ▼
agentLoop() in packages/ai/src/agent/agent-loop.ts
  │
  ├─► streamAssistantResponse()
  │      │
  │      ▼
  │   LLM provider (Anthropic, OpenAI, etc.)
  │      │
  │      ▼
  │   Events: message_start → message_update* → message_end
  │      │
  │      ▼
  │   AssistantMessage with usage stats (input, output, cacheRead, cacheWrite)
  │
  ├─► If assistant has tool calls:
  │      │
  │      ▼
  │   executeToolCalls()
  │      │
  │      ├─► tool_execution_start (toolCallId, toolName, args)
  │      │
  │      ├─► tool.execute() runs (read, bash, write, edit, etc.)
  │      │
  │      ├─► tool_execution_end (toolCallId, toolName, result, isError)
  │      │
  │      └─► message_start + message_end for ToolResultMessage
  │
  └─► Loop continues until no more tool calls
  │
  ▼
agent_end
```

### Token Usage Reporting

Token usage is ONLY available in `AssistantMessage.usage` after the LLM responds:

```typescript
// From packages/ai/src/types.ts
export interface Usage {
	input: number; // Tokens in the request
	output: number; // Tokens generated
	cacheRead: number; // Cached tokens read
	cacheWrite: number; // Cached tokens written
	cost: Cost;
}
```

The `input` field represents the total context size sent to the LLM, which includes:
- System prompt
- All conversation messages
- All tool results from previous calls
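The compaction check derives the current context size from this usage object. A minimal sketch of what such a helper could look like, assuming context size is approximated by summing fresh input and cache traffic (the real `calculateContextTokens()` may weigh the fields differently):

```typescript
// Sketch only: deriving a context-size estimate from Usage.
// Anthropic-style accounting splits the prompt into fresh input, cache
// reads, and cache writes; summing them approximates the total context
// that was sent with the request.
function calculateContextTokens(usage: Usage): number {
	return usage.input + usage.cacheRead + usage.cacheWrite;
}
```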
### Current Compaction Check

Both TUI (`tui-renderer.ts`) and RPC (`main.ts`) modes check compaction identically:

```typescript
// In agent.subscribe() callback:
if (event.type === "message_end") {
	// ...
	if (event.message.role === "assistant") {
		await checkAutoCompaction();
	}
}

async function checkAutoCompaction() {
	// Get last non-aborted assistant message
	const messages = agent.state.messages;
	let lastAssistant = findLastNonAbortedAssistant(messages);
	if (!lastAssistant) return;

	const contextTokens = calculateContextTokens(lastAssistant.usage);
	const contextWindow = agent.state.model.contextWindow;

	if (!shouldCompact(contextTokens, contextWindow, settings)) return;

	// Trigger compaction...
}
```

**The check happens on `message_end` for assistant messages only.**
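`shouldCompact` itself is not quoted in this document. Based on the threshold arithmetic in the failure scenario below (context window minus reserved tokens), a plausible sketch is the following; the `reserveTokens` setting name is taken from the Phase 2 snippet later in this doc:

```typescript
// Sketch of the threshold test implied by the numbers below: compaction
// triggers once the last reported context size exceeds
// contextWindow - reserveTokens (e.g. 200,000 - 16,384 = 183,616).
function shouldCompact(
	contextTokens: number,
	contextWindow: number,
	settings: { reserveTokens: number },
): boolean {
	return contextTokens > contextWindow - settings.reserveTokens;
}
```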
## The Under-Compaction Problem

### Failure Scenario

```
Context window: 200,000 tokens
Reserve tokens: 16,384 (default)
Threshold: 200,000 - 16,384 = 183,616

Turn N:
1. Assistant message received, usage shows 180,000 tokens
2. shouldCompact(180000, 200000, settings) → 180000 > 183616 → FALSE
3. Tool executes: `cat large-file.txt` → outputs 100KB (~25,000 tokens)
4. Context now effectively 205,000 tokens, but we don't know this
5. Next LLM call fails: context exceeds 200,000 window
```

The problem occurs when:
1. Context is below threshold (so compaction doesn't trigger)
2. A tool adds enough content to push it over the window limit
3. We only discover this when the next LLM call fails

### Root Cause

1. **Token counts are retrospective**: We only learn the context size AFTER the LLM processes it
2. **Tool results are blind spots**: When a tool executes and returns a large result, we don't know how many tokens it adds until the next LLM call
3. **No estimation before submission**: We submit the context and hope it fits

## Current Tool Output Limits

| Tool | Our Limit | Worst Case |
|------|-----------|------------|
| bash | 10MB per stream | 20MB (~5M tokens) |
| read | 2000 lines × 2000 chars | 4MB (~1M tokens) |
| write | Byte count only | Minimal |
| edit | Diff output | Variable |

## How Other Tools Handle This

### SST/OpenCode

**Tool Output Limits (during execution):**

| Tool | Limit | Details |
|------|-------|---------|
| bash | 30KB chars | `MAX_OUTPUT_LENGTH = 30_000`, truncates with notice |
| read | 2000 lines × 2000 chars/line | No total cap, theoretically 4MB |
| grep | 100 matches, 2000 chars/line | Truncates with notice |
| ls | 100 files | Truncates with notice |
| glob | 100 results | Truncates with notice |
| webfetch | 5MB | `MAX_RESPONSE_SIZE` |

**Overflow Detection:**
- `isOverflow()` runs BEFORE each turn (not during)
- Uses last LLM-reported token count: `tokens.input + tokens.cache.read + tokens.output`
- Triggers if `count > context - maxOutput`
- Does NOT detect overflow from tool results in the current turn

**Recovery - Pruning:**
- `prune()` runs AFTER each turn completes
- Walks backwards through completed tool results
- Keeps last 40k tokens of tool outputs (`PRUNE_PROTECT`)
- Removes content from older tool results (marks `time.compacted`)
- Only prunes if savings > 20k tokens (`PRUNE_MINIMUM`)
- Token estimation: `chars / 4` (see the sketch below)
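To make that strategy concrete, here is an illustrative sketch of the described pruning walk; it is not OpenCode's actual code, and the message/part shapes are invented for the example:

```typescript
// Illustrative sketch of post-turn pruning as described above.
// Shapes are hypothetical; only the strategy matches the bullets.
const PRUNE_PROTECT = 40_000; // keep the most recent ~40k tokens of tool output
const PRUNE_MINIMUM = 20_000; // skip pruning unless it saves at least this much

interface ToolResultPart {
	text: string;
	compacted?: boolean;
}

function prune(toolResults: ToolResultPart[]): void {
	const estimate = (s: string) => Math.ceil(s.length / 4); // chars / 4
	let kept = 0;
	let savings = 0;
	const candidates: ToolResultPart[] = [];

	// Walk backwards: the newest results stay, older ones become candidates.
	for (let i = toolResults.length - 1; i >= 0; i--) {
		const part = toolResults[i];
		const tokens = estimate(part.text);
		if (kept < PRUNE_PROTECT) {
			kept += tokens;
		} else {
			candidates.push(part);
			savings += tokens;
		}
	}

	// Only prune when the savings are worth it.
	if (savings < PRUNE_MINIMUM) return;
	for (const part of candidates) {
		part.text = "[pruned]";
		part.compacted = true;
	}
}
```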
**Recovery - Compaction:**
- Triggered when `isOverflow()` returns true before a turn
- LLM generates a summary of the conversation
- Replaces old messages with the summary

**Gap:** No mid-turn protection. A single read returning 4MB would overflow. The 30KB bash limit is their primary practical protection.

### OpenAI/Codex

**Tool Output Limits (during execution):**

| Tool | Limit | Details |
|------|-------|---------|
| shell/exec | 10k tokens or 10k bytes | Per-model `TruncationPolicy`, user-configurable |
| read_file | 2000 lines, 500 chars/line | `MAX_LINE_LENGTH = 500`, ~1MB max |
| grep_files | 100 matches | Default limit |
| list_dir | Configurable | BFS with depth limits |

**Truncation Policy:**
- Per-model family setting: `TruncationPolicy::Bytes(10_000)` or `TruncationPolicy::Tokens(10_000)`
- User can override via `tool_output_token_limit` config
- Applied to ALL tool outputs uniformly via `truncate_function_output_items_with_policy()`
- Preserves beginning and end, removes the middle with an `"…N tokens truncated…"` marker (see the sketch below)
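A minimal sketch of that middle-out truncation idea in TypeScript (Codex implements this in Rust; the function below is hypothetical and uses a character budget rather than a token budget for simplicity):

```typescript
// Hypothetical middle-out truncation: keep the head and tail of the output,
// drop the middle, and report how much was removed. The marker adds a few
// characters beyond the budget, which is acceptable for this purpose.
function truncateMiddle(text: string, maxChars: number): string {
	if (text.length <= maxChars) return text;
	const keep = Math.floor(maxChars / 2);
	const removed = text.length - 2 * keep;
	const marker = `\n…${removed} characters truncated…\n`;
	return text.slice(0, keep) + marker + text.slice(text.length - keep);
}
```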
**Overflow Detection:**
- After each successful turn: `if total_usage_tokens >= auto_compact_token_limit { compact() }`
- Per-model thresholds (e.g., 180k for a 200k context window)
- `ContextWindowExceeded` error caught and handled

**Recovery - Compaction:**
- If tokens exceed the threshold after a turn, triggers `run_inline_auto_compact_task()`
- During compaction, if `ContextWindowExceeded`: removes the oldest history item and retries
- Loop: `history.remove_first_item()` until it fits
- Notifies the user: "Trimmed N older conversation item(s)"

**Recovery - Turn Error:**
- On `ContextWindowExceeded` during a normal turn: marks tokens as full, returns the error to the user
- Does NOT auto-retry the failed turn
- User must manually continue

**Gap:** Still no mid-turn protection, but aggressive 10k-token truncation on all tool outputs prevents most issues in practice.

### Comparison

| Feature | pi-coding-agent | OpenCode | Codex |
|---------|-----------------|----------|-------|
| Bash limit | 10MB | 30KB | ~40KB (10k tokens) |
| Read limit | 2000×2000 (4MB) | 2000×2000 (4MB) | 2000×500 (1MB) |
| Truncation policy | None | Per-tool | Per-model, uniform |
| Token estimation | None | chars/4 | chars/4 |
| Pre-turn check | No | Yes (last tokens) | Yes (threshold) |
| Mid-turn check | No | No | No |
| Post-turn pruning | No | Yes (removes old tool output) | No |
| Overflow recovery | No | Compaction | Trim oldest + compact |

**Key insight:** None of these tools protect against mid-turn overflow. Their practical protection is aggressive static limits on tool output, especially bash. OpenCode's 30KB bash limit vs our 10MB is the critical difference.

## Recommended Solution

### Phase 1: Static Limits (immediate)

Add hard limits to tool outputs matching industry practice:

```typescript
// packages/coding-agent/src/tools/limits.ts
export const MAX_TOOL_OUTPUT_CHARS = 30_000; // ~7.5k tokens, matches OpenCode bash
export const MAX_TOOL_OUTPUT_NOTICE = "\n\n...(truncated, output exceeded limit)...";
```

Apply to all tools (see the helper sketch after this list):
- bash: 10MB → 30KB
- read: Add 100KB total output cap
- edit: Cap diff output
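The helper that enforces the cap could be as simple as the sketch below; `truncateToolOutput` is a hypothetical name, and the import path assumes the `limits.ts` module proposed above:

```typescript
// Hypothetical wrapper applied to every tool's text output before it is
// appended to the conversation as a tool result.
import { MAX_TOOL_OUTPUT_CHARS, MAX_TOOL_OUTPUT_NOTICE } from "./limits.js";

export function truncateToolOutput(output: string): string {
	if (output.length <= MAX_TOOL_OUTPUT_CHARS) return output;
	return output.slice(0, MAX_TOOL_OUTPUT_CHARS) + MAX_TOOL_OUTPUT_NOTICE;
}
```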
### Phase 2: Post-Tool Estimation

After `tool_execution_end`, estimate and flag:

```typescript
let needsCompactionAfterTurn = false;

agent.subscribe(async (event) => {
	if (event.type === "tool_execution_end") {
		const resultChars = extractTextLength(event.result);
		const estimatedTokens = Math.ceil(resultChars / 4);

		const lastUsage = getLastAssistantUsage(agent.state.messages);
		if (lastUsage) {
			const current = calculateContextTokens(lastUsage);
			const projected = current + estimatedTokens;
			const threshold = agent.state.model.contextWindow - settings.reserveTokens;
			if (projected > threshold) {
				needsCompactionAfterTurn = true;
			}
		}
	}

	if (event.type === "turn_end" && needsCompactionAfterTurn) {
		needsCompactionAfterTurn = false;
		await triggerCompaction();
	}
});
```

### Phase 3: Overflow Recovery (like Codex)

Handle `stopReason === "length"` gracefully:

```typescript
if (event.type === "message_end" && event.message.role === "assistant") {
	if (event.message.stopReason === "length") {
		// Context overflow occurred
		await triggerCompaction();
		// Optionally: retry the turn
	}
}
```

During compaction, if it also overflows, trim the oldest messages:

```typescript
async function compactWithRetry() {
	while (true) {
		try {
			await compact();
			break;
		} catch (e) {
			if (isContextOverflow(e) && messages.length > 1) {
				messages.shift(); // Remove oldest
				continue;
			}
			throw e;
		}
	}
}
```

## Summary

The under-compaction problem occurs because:
1. We only check context size after assistant messages
2. Tool results can add arbitrary amounts of content
3. We discover overflows only when the next LLM call fails

The fix requires:
1. Aggressive static limits on tool output (immediate safety net)
2. Token estimation after tool execution (proactive detection)
3. Graceful handling of overflow errors (fallback recovery)