zwarm 0.1.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zwarm/adapters/claude_code.py +55 -3
- zwarm/adapters/codex_mcp.py +433 -122
- zwarm/adapters/test_codex_mcp.py +26 -26
- zwarm/cli/main.py +464 -3
- zwarm/core/compact.py +329 -0
- zwarm/core/config.py +51 -9
- zwarm/core/environment.py +104 -33
- zwarm/core/models.py +16 -0
- zwarm/core/test_compact.py +312 -0
- zwarm/orchestrator.py +222 -39
- zwarm/prompts/orchestrator.py +128 -146
- zwarm/test_orchestrator_watchers.py +23 -0
- zwarm/tools/delegation.py +23 -4
- zwarm/watchers/builtin.py +90 -4
- zwarm/watchers/manager.py +46 -8
- zwarm/watchers/test_watchers.py +42 -0
- {zwarm-0.1.0.dist-info → zwarm-1.0.1.dist-info}/METADATA +162 -36
- zwarm-1.0.1.dist-info/RECORD +33 -0
- zwarm-0.1.0.dist-info/RECORD +0 -30
- {zwarm-0.1.0.dist-info → zwarm-1.0.1.dist-info}/WHEEL +0 -0
- {zwarm-0.1.0.dist-info → zwarm-1.0.1.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
"""Tests for the compact module."""
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from zwarm.core.compact import (
|
|
6
|
+
compact_messages,
|
|
7
|
+
estimate_tokens,
|
|
8
|
+
find_tool_groups,
|
|
9
|
+
should_compact,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TestEstimateTokens:
    """Tests for ``estimate_tokens`` on plain-text and tool-call messages."""

    def test_simple_messages(self):
        """Estimate tokens for simple text messages."""
        messages = [
            {"role": "user", "content": "Hello world"},  # 11 chars
            {"role": "assistant", "content": "Hi there!"},  # 9 chars
        ]
        # 20 chars of content at ~4 chars per token -> 5 tokens
        tokens = estimate_tokens(messages)
        assert tokens == 5

    def test_empty_messages(self):
        """Empty messages return 0 tokens."""
        assert estimate_tokens([]) == 0

    def test_messages_with_tool_calls(self):
        """Tool calls add to token count."""
        text_only = {"role": "assistant", "content": "Let me check"}
        with_tool_call = {
            **text_only,
            "tool_calls": [
                {"function": {"name": "read", "arguments": '{"path": "/foo/bar"}'}}
            ],
        }

        baseline = estimate_tokens([text_only])
        tokens = estimate_tokens([with_tool_call])

        assert tokens > 0
        # ``tokens > 0`` alone is satisfied by the text content, so compare
        # against the identical message without tool calls to prove that the
        # tool call itself contributes to the estimate.
        assert tokens > baseline
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class TestFindToolGroups:
    """Tests for ``find_tool_groups`` index-range detection.

    Each expected group is a ``(start, end)`` tuple of inclusive message
    indices, as pinned by the assertions below.
    """

    def test_no_tool_calls(self):
        """A conversation without tool usage yields no groups."""
        conversation = [
            {"role": "system", "content": "You are helpful"},
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi!"},
        ]
        assert find_tool_groups(conversation) == []

    def test_openai_format_tool_call(self):
        """An OpenAI-style ``tool_calls`` message is grouped with its reply."""
        tool_request = {
            "role": "assistant",
            "content": "Reading...",
            "tool_calls": [{"id": "tc1", "function": {"name": "read"}}],
        }
        tool_reply = {"role": "tool", "tool_call_id": "tc1", "content": "file contents"}
        conversation = [
            {"role": "system", "content": "System"},
            {"role": "user", "content": "Read file"},
            tool_request,
            tool_reply,
            {"role": "assistant", "content": "Here's the file"},
        ]
        # Index 2 is the assistant tool call, index 3 its tool response.
        assert find_tool_groups(conversation) == [(2, 3)]

    def test_multiple_tool_responses(self):
        """All consecutive tool responses belong to the same group."""
        tool_request = {
            "role": "assistant",
            "tool_calls": [
                {"id": "tc1", "function": {"name": "a"}},
                {"id": "tc2", "function": {"name": "b"}},
            ],
        }
        conversation = [
            {"role": "user", "content": "Do things"},
            tool_request,
            {"role": "tool", "tool_call_id": "tc1", "content": "result1"},
            {"role": "tool", "tool_call_id": "tc2", "content": "result2"},
            {"role": "assistant", "content": "Done"},
        ]
        # Indices 1, 2 and 3 collapse into a single group.
        assert find_tool_groups(conversation) == [(1, 3)]

    def test_anthropic_format_tool_use(self):
        """Anthropic-style ``tool_use`` / ``tool_result`` blocks are grouped."""
        tool_request = {
            "role": "assistant",
            "content": [
                {"type": "text", "text": "Reading..."},
                {"type": "tool_use", "id": "tu1", "name": "read", "input": {}},
            ],
        }
        tool_reply = {
            "role": "user",
            "content": [
                {"type": "tool_result", "tool_use_id": "tu1", "content": "data"},
            ],
        }
        conversation = [
            {"role": "user", "content": "Read file"},
            tool_request,
            tool_reply,
            {"role": "assistant", "content": "Got it"},
        ]
        # Assistant message with tool_use plus the user message with tool_result.
        assert find_tool_groups(conversation) == [(1, 2)]
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class TestCompactMessages:
    """Tests for ``compact_messages`` in count-based and token-based modes."""

    def test_no_compaction_needed_few_messages(self):
        """Don't compact if we have fewer messages than keep thresholds."""
        messages = [
            {"role": "system", "content": "System"},
            {"role": "user", "content": "Task"},
            {"role": "assistant", "content": "Response"},
        ]
        result = compact_messages(messages, keep_first_n=2, keep_last_n=2)
        assert not result.was_compacted
        assert result.messages == messages
        assert "Too few" in result.preserved_reason

    def test_compacts_middle_messages(self):
        """Remove messages from the middle, keeping first and last."""
        messages = [
            {"role": "system", "content": "System"},
            {"role": "user", "content": "Task"},
            {"role": "assistant", "content": "Step 1"},
            {"role": "user", "content": "Continue"},
            {"role": "assistant", "content": "Step 2"},
            {"role": "user", "content": "More"},
            {"role": "assistant", "content": "Step 3"},
            {"role": "user", "content": "Final"},
            {"role": "assistant", "content": "Done"},
        ]
        result = compact_messages(messages, keep_first_n=2, keep_last_n=2)

        assert result.was_compacted
        assert result.removed_count > 0
        # First 2 and last 2 should be preserved
        assert result.messages[0]["content"] == "System"
        assert result.messages[1]["content"] == "Task"
        assert result.messages[-1]["content"] == "Done"
        assert result.messages[-2]["content"] == "Final"

    def test_preserves_tool_call_pairs(self):
        """Never split tool call from its response."""
        messages = [
            {"role": "system", "content": "System"},
            {"role": "user", "content": "Task"},
            {"role": "assistant", "content": "Old message 1"},
            {"role": "assistant", "content": "Old message 2"},
            {
                "role": "assistant",
                "content": "Calling tool",
                "tool_calls": [{"id": "tc1", "function": {"name": "test"}}],
            },
            {"role": "tool", "tool_call_id": "tc1", "content": "Tool result"},
            {"role": "assistant", "content": "Recent 1"},
            {"role": "user", "content": "Recent 2"},
        ]
        result = compact_messages(messages, keep_first_n=2, keep_last_n=2)

        # The tool call pair should either both be kept or both removed
        has_tool_call = any(m.get("tool_calls") for m in result.messages)
        has_tool_response = any(m.get("role") == "tool" for m in result.messages)

        # They should match - either both present or both absent
        assert has_tool_call == has_tool_response

    def test_adds_compaction_marker(self):
        """Add a marker message when compaction occurs."""
        messages = [
            {"role": "system", "content": "System"},
            {"role": "user", "content": "Task"},
        ] + [{"role": "assistant", "content": f"Msg {i}"} for i in range(20)]

        result = compact_messages(messages, keep_first_n=2, keep_last_n=3)

        # Assert compaction up front: guarding the marker check behind
        # ``if result.was_compacted`` would let this test pass vacuously.
        # 22 messages with only 5 kept leaves plenty of middle to remove.
        assert result.was_compacted

        # Should have a system message about compaction
        marker_msgs = [
            m for m in result.messages
            if m.get("role") == "system" and "compacted" in m.get("content", "").lower()
        ]
        assert len(marker_msgs) == 1

    def test_token_based_compaction(self):
        """Compact based on token threshold."""
        # Create messages that exceed token limit
        messages = [
            {"role": "system", "content": "System prompt " * 100},
            {"role": "user", "content": "Task " * 100},
        ] + [
            {"role": "assistant", "content": f"Response {i} " * 50}
            for i in range(10)
        ]

        # Generous limit: whether compaction happens depends on the estimate,
        # so this call only verifies that the under-limit path does not raise.
        compact_messages(messages, max_tokens=100000)

        # Should compact if over limit
        result_over = compact_messages(messages, max_tokens=100, target_token_pct=0.5)
        # With such a low limit, should definitely try to compact
        assert result_over.original_count == len(messages)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
class TestShouldCompact:
    """Tests for the ``should_compact`` threshold predicate."""

    def test_under_threshold(self):
        """A tiny conversation stays below the compaction threshold."""
        short_conversation = [{"role": "user", "content": "Hello"}]
        decision = should_compact(
            short_conversation, max_tokens=1000, threshold_pct=0.85
        )
        assert not decision

    def test_over_threshold(self):
        """A conversation well past the limit triggers compaction."""
        # 4000 characters is roughly 1000 tokens, double the 500-token limit.
        long_conversation = [{"role": "user", "content": "x" * 4000}]
        decision = should_compact(
            long_conversation, max_tokens=500, threshold_pct=0.85
        )
        assert decision
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
class TestEdgeCases:
    """Edge-case tests for ``compact_messages``: tool-heavy, empty and minimal input."""

    def test_all_tool_calls(self):
        """Handle conversation that's mostly tool calls."""
        messages = [
            {"role": "system", "content": "System"},
            {"role": "user", "content": "Task"},
        ]
        # Add many tool call pairs
        for i in range(5):
            messages.append({
                "role": "assistant",
                "tool_calls": [{"id": f"tc{i}", "function": {"name": "test"}}],
            })
            messages.append({"role": "tool", "tool_call_id": f"tc{i}", "content": f"result{i}"})

        messages.append({"role": "assistant", "content": "Final"})

        result = compact_messages(messages, keep_first_n=2, keep_last_n=1)

        # Should still produce valid output
        assert len(result.messages) > 0

        # Check no orphaned tool calls: every message carrying tool_calls must
        # be immediately followed by a tool response. Pairing each message with
        # its successor skips a trailing tool call, which has no successor.
        for current, following in zip(result.messages, result.messages[1:]):
            if current.get("tool_calls"):
                assert following.get("role") == "tool"

    def test_empty_messages(self):
        """Handle empty message list."""
        result = compact_messages([])
        assert result.messages == []
        assert not result.was_compacted

    def test_only_system_and_user(self):
        """Handle minimal conversation."""
        messages = [
            {"role": "system", "content": "System"},
            {"role": "user", "content": "Hello"},
        ]
        result = compact_messages(messages, keep_first_n=2, keep_last_n=2)
        assert not result.was_compacted
        assert result.messages == messages
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
class TestPydanticModelMessages:
    """Test handling of attribute-style message objects (not just dicts)."""

    def test_estimate_tokens_with_objects(self):
        """``estimate_tokens`` accepts objects exposing ``role`` / ``content``."""

        class MockMessage:
            # Minimal stand-in for a model instance: attributes only, no dict API.
            def __init__(self, role, content):
                self.role, self.content = role, content

        conversation = [
            MockMessage("user", "Hello world"),
            MockMessage("assistant", "Hi there!"),
        ]
        assert estimate_tokens(conversation) > 0

    def test_should_compact_with_objects(self):
        """``should_compact`` accepts objects exposing ``role`` / ``content``."""

        class MockMessage:
            # Minimal stand-in for a model instance: attributes only, no dict API.
            def __init__(self, role, content):
                self.role, self.content = role, content

        conversation = [MockMessage("user", "x" * 4000)]
        # The call must not crash and must report that compaction is needed.
        decision = should_compact(conversation, max_tokens=500, threshold_pct=0.85)
        assert decision is True

    def test_find_tool_groups_with_objects(self):
        """``find_tool_groups`` accepts objects whose ``tool_calls`` is ``None``."""

        class MockMessage:
            # Unlike the mocks in the sibling tests, this one also carries a
            # ``tool_calls`` attribute, left at ``None`` by default.
            def __init__(self, role, content=None, tool_calls=None):
                self.role = role
                self.content = content
                self.tool_calls = tool_calls

        conversation = [
            MockMessage("user", "Task"),
            MockMessage("assistant", "Done"),
        ]
        # The call must not crash, and no groups exist without tool calls.
        assert find_tool_groups(conversation) == []
|