zwarm 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zwarm/adapters/claude_code.py +55 -3
- zwarm/adapters/codex_mcp.py +433 -122
- zwarm/adapters/test_codex_mcp.py +26 -26
- zwarm/cli/main.py +464 -3
- zwarm/core/compact.py +312 -0
- zwarm/core/config.py +51 -9
- zwarm/core/environment.py +104 -33
- zwarm/core/models.py +16 -0
- zwarm/core/test_compact.py +266 -0
- zwarm/orchestrator.py +222 -39
- zwarm/prompts/orchestrator.py +128 -146
- zwarm/test_orchestrator_watchers.py +23 -0
- zwarm/tools/delegation.py +23 -4
- zwarm/watchers/builtin.py +90 -4
- zwarm/watchers/manager.py +46 -8
- zwarm/watchers/test_watchers.py +42 -0
- {zwarm-0.1.0.dist-info → zwarm-1.0.0.dist-info}/METADATA +162 -36
- zwarm-1.0.0.dist-info/RECORD +33 -0
- zwarm-0.1.0.dist-info/RECORD +0 -30
- {zwarm-0.1.0.dist-info → zwarm-1.0.0.dist-info}/WHEEL +0 -0
- {zwarm-0.1.0.dist-info → zwarm-1.0.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
"""Tests for the compact module."""
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from zwarm.core.compact import (
|
|
6
|
+
compact_messages,
|
|
7
|
+
estimate_tokens,
|
|
8
|
+
find_tool_groups,
|
|
9
|
+
should_compact,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TestEstimateTokens:
    """Checks for ``estimate_tokens``."""

    def test_simple_messages(self):
        """Two short text messages produce the expected estimate."""
        greeting = {"role": "user", "content": "Hello world"}  # 11 chars
        reply = {"role": "assistant", "content": "Hi there!"}  # 9 chars
        # 20 chars in total; at ~4 chars per token that is 5 tokens.
        estimate = estimate_tokens([greeting, reply])
        assert estimate == 5

    def test_empty_messages(self):
        """An empty message list is estimated at zero tokens."""
        nothing = []
        assert estimate_tokens(nothing) == 0

    def test_messages_with_tool_calls(self):
        """A message carrying tool calls yields a positive estimate."""
        read_call = {"function": {"name": "read", "arguments": '{"path": "/foo/bar"}'}}
        assistant_turn = {
            "role": "assistant",
            "content": "Let me check",
            "tool_calls": [read_call],
        }
        estimate = estimate_tokens([assistant_turn])
        assert estimate > 0
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class TestFindToolGroups:
    """Checks that ``find_tool_groups`` pairs tool calls with their responses."""

    def test_no_tool_calls(self):
        """A conversation without tool calls yields no groups."""
        plain_chat = [
            {"role": "system", "content": "You are helpful"},
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi!"},
        ]
        assert find_tool_groups(plain_chat) == []

    def test_openai_format_tool_call(self):
        """An OpenAI-style call is grouped with the tool message answering it."""
        call_turn = {
            "role": "assistant",
            "content": "Reading...",
            "tool_calls": [{"id": "tc1", "function": {"name": "read"}}],
        }
        tool_turn = {"role": "tool", "tool_call_id": "tc1", "content": "file contents"}
        history = [
            {"role": "system", "content": "System"},
            {"role": "user", "content": "Read file"},
            call_turn,
            tool_turn,
            {"role": "assistant", "content": "Here's the file"},
        ]
        # The call is at index 2 and its response at index 3.
        expected = [(2, 3)]
        assert find_tool_groups(history) == expected

    def test_multiple_tool_responses(self):
        """All consecutive tool responses belong to the same group as the call."""
        call_turn = {
            "role": "assistant",
            "tool_calls": [
                {"id": "tc1", "function": {"name": "a"}},
                {"id": "tc2", "function": {"name": "b"}},
            ],
        }
        history = [
            {"role": "user", "content": "Do things"},
            call_turn,
            {"role": "tool", "tool_call_id": "tc1", "content": "result1"},
            {"role": "tool", "tool_call_id": "tc2", "content": "result2"},
            {"role": "assistant", "content": "Done"},
        ]
        # Indices 1 through 3 (call plus both responses) form a single group.
        expected = [(1, 3)]
        assert find_tool_groups(history) == expected

    def test_anthropic_format_tool_use(self):
        """Anthropic-style ``tool_use`` / ``tool_result`` content blocks are grouped."""
        use_turn = {
            "role": "assistant",
            "content": [
                {"type": "text", "text": "Reading..."},
                {"type": "tool_use", "id": "tu1", "name": "read", "input": {}},
            ],
        }
        result_turn = {
            "role": "user",
            "content": [
                {"type": "tool_result", "tool_use_id": "tu1", "content": "data"},
            ],
        }
        history = [
            {"role": "user", "content": "Read file"},
            use_turn,
            result_turn,
            {"role": "assistant", "content": "Got it"},
        ]
        # Assistant message with tool_use (1) plus user message with tool_result (2).
        expected = [(1, 2)]
        assert find_tool_groups(history) == expected
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class TestCompactMessages:
    """Checks for ``compact_messages``."""

    def test_no_compaction_needed_few_messages(self):
        """Don't compact if we have fewer messages than keep thresholds."""
        messages = [
            {"role": "system", "content": "System"},
            {"role": "user", "content": "Task"},
            {"role": "assistant", "content": "Response"},
        ]
        result = compact_messages(messages, keep_first_n=2, keep_last_n=2)
        assert not result.was_compacted
        assert result.messages == messages
        assert "Too few" in result.preserved_reason

    def test_compacts_middle_messages(self):
        """Remove messages from the middle, keeping first and last."""
        messages = [
            {"role": "system", "content": "System"},
            {"role": "user", "content": "Task"},
            {"role": "assistant", "content": "Step 1"},
            {"role": "user", "content": "Continue"},
            {"role": "assistant", "content": "Step 2"},
            {"role": "user", "content": "More"},
            {"role": "assistant", "content": "Step 3"},
            {"role": "user", "content": "Final"},
            {"role": "assistant", "content": "Done"},
        ]
        result = compact_messages(messages, keep_first_n=2, keep_last_n=2)

        assert result.was_compacted
        assert result.removed_count > 0
        # First 2 and last 2 should be preserved
        assert result.messages[0]["content"] == "System"
        assert result.messages[1]["content"] == "Task"
        assert result.messages[-1]["content"] == "Done"
        assert result.messages[-2]["content"] == "Final"

    def test_preserves_tool_call_pairs(self):
        """Never split tool call from its response."""
        messages = [
            {"role": "system", "content": "System"},
            {"role": "user", "content": "Task"},
            {"role": "assistant", "content": "Old message 1"},
            {"role": "assistant", "content": "Old message 2"},
            {
                "role": "assistant",
                "content": "Calling tool",
                "tool_calls": [{"id": "tc1", "function": {"name": "test"}}],
            },
            {"role": "tool", "tool_call_id": "tc1", "content": "Tool result"},
            {"role": "assistant", "content": "Recent 1"},
            {"role": "user", "content": "Recent 2"},
        ]
        result = compact_messages(messages, keep_first_n=2, keep_last_n=2)

        # The tool call pair should either both be kept or both removed
        has_tool_call = any(m.get("tool_calls") for m in result.messages)
        has_tool_response = any(m.get("role") == "tool" for m in result.messages)

        # They should match - either both present or both absent
        assert has_tool_call == has_tool_response

    def test_adds_compaction_marker(self):
        """Add a marker message when compaction occurs."""
        messages = [
            {"role": "system", "content": "System"},
            {"role": "user", "content": "Task"},
        ] + [{"role": "assistant", "content": f"Msg {i}"} for i in range(20)]

        result = compact_messages(messages, keep_first_n=2, keep_last_n=3)

        # Assert compaction explicitly: guarding the marker check with
        # ``if result.was_compacted`` would let this test pass without ever
        # looking for the marker. 22 messages with 2 + 3 kept leaves a middle
        # section to drop, as in test_compacts_middle_messages.
        assert result.was_compacted

        # Should have exactly one system message about compaction
        marker_msgs = [
            m for m in result.messages
            if m.get("role") == "system" and "compacted" in m.get("content", "").lower()
        ]
        assert len(marker_msgs) == 1

    def test_token_based_compaction(self):
        """Compact based on token threshold."""
        # Create messages that exceed token limit
        messages = [
            {"role": "system", "content": "System prompt " * 100},
            {"role": "user", "content": "Task " * 100},
        ] + [
            {"role": "assistant", "content": f"Response {i} " * 50}
            for i in range(10)
        ]

        # Generous limit: might or might not compact depending on the
        # estimate, so this call is only a smoke check that it does not raise.
        compact_messages(messages, max_tokens=100000)

        # Should compact if over limit
        result_over = compact_messages(messages, max_tokens=100, target_token_pct=0.5)
        # With such a low limit, should definitely try to compact
        assert result_over.original_count == len(messages)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
class TestShouldCompact:
    """Checks for the ``should_compact`` threshold decision."""

    def test_under_threshold(self):
        """A tiny conversation does not trigger compaction."""
        tiny_chat = [{"role": "user", "content": "Hello"}]
        verdict = should_compact(tiny_chat, max_tokens=1000, threshold_pct=0.85)
        assert not verdict

    def test_over_threshold(self):
        """A conversation well past the limit triggers compaction."""
        big_text = "x" * 4000  # ~1000 tokens
        big_chat = [{"role": "user", "content": big_text}]
        verdict = should_compact(big_chat, max_tokens=500, threshold_pct=0.85)
        assert verdict
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
class TestEdgeCases:
    """Edge-case checks for ``compact_messages``."""

    def test_all_tool_calls(self):
        """Handle conversation that's mostly tool calls."""
        messages = [
            {"role": "system", "content": "System"},
            {"role": "user", "content": "Task"},
        ]
        # Add many tool call pairs
        for i in range(5):
            messages.append({
                "role": "assistant",
                "tool_calls": [{"id": f"tc{i}", "function": {"name": "test"}}],
            })
            messages.append({"role": "tool", "tool_call_id": f"tc{i}", "content": f"result{i}"})

        messages.append({"role": "assistant", "content": "Final"})

        result = compact_messages(messages, keep_first_n=2, keep_last_n=1)

        # Should still produce valid output
        assert len(result.messages) > 0

        # Check no orphaned tool calls: every surviving call must be followed
        # directly by the tool response that answers it.
        surviving_call_ids = set()
        for i, msg in enumerate(result.messages):
            if not msg.get("tool_calls"):
                continue
            call_id = msg["tool_calls"][0]["id"]
            surviving_call_ids.add(call_id)
            assert i + 1 < len(result.messages), f"tool call {call_id} has no response"
            follower = result.messages[i + 1]
            assert follower.get("role") == "tool"
            assert follower.get("tool_call_id") == call_id

        # Check no orphaned tool responses: every surviving response must
        # answer a call that also survived.
        for msg in result.messages:
            if msg.get("role") == "tool":
                assert msg.get("tool_call_id") in surviving_call_ids

    def test_empty_messages(self):
        """Handle empty message list."""
        result = compact_messages([])
        assert result.messages == []
        assert not result.was_compacted

    def test_only_system_and_user(self):
        """Handle minimal conversation."""
        messages = [
            {"role": "system", "content": "System"},
            {"role": "user", "content": "Hello"},
        ]
        result = compact_messages(messages, keep_first_n=2, keep_last_n=2)
        assert not result.was_compacted
        assert result.messages == messages
|
zwarm/orchestrator.py
CHANGED
|
@@ -24,12 +24,18 @@ from wbal.helper import TOOL_CALL_TYPE, format_openai_tool_response
|
|
|
24
24
|
from zwarm.adapters.base import ExecutorAdapter
|
|
25
25
|
from zwarm.adapters.claude_code import ClaudeCodeAdapter
|
|
26
26
|
from zwarm.adapters.codex_mcp import CodexMCPAdapter
|
|
27
|
+
from zwarm.core.compact import compact_messages, should_compact
|
|
27
28
|
from zwarm.core.config import ZwarmConfig, load_config
|
|
28
29
|
from zwarm.core.environment import OrchestratorEnv
|
|
29
30
|
from zwarm.core.models import ConversationSession
|
|
30
31
|
from zwarm.core.state import StateManager
|
|
31
32
|
from zwarm.prompts import get_orchestrator_prompt
|
|
32
|
-
from zwarm.watchers import
|
|
33
|
+
from zwarm.watchers import (
|
|
34
|
+
WatcherManager,
|
|
35
|
+
WatcherContext,
|
|
36
|
+
WatcherAction,
|
|
37
|
+
build_watcher_manager,
|
|
38
|
+
)
|
|
33
39
|
|
|
34
40
|
|
|
35
41
|
class Orchestrator(YamlAgent):
|
|
@@ -49,10 +55,12 @@ class Orchestrator(YamlAgent):
|
|
|
49
55
|
working_dir: Path = Field(default_factory=Path.cwd)
|
|
50
56
|
|
|
51
57
|
# Load tools from modules (delegation + bash for verification)
|
|
52
|
-
agent_tool_modules: list[str] = Field(
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
58
|
+
agent_tool_modules: list[str] = Field(
|
|
59
|
+
default=[
|
|
60
|
+
"zwarm.tools.delegation",
|
|
61
|
+
"wbal.tools.bash",
|
|
62
|
+
]
|
|
63
|
+
)
|
|
56
64
|
|
|
57
65
|
# State management
|
|
58
66
|
_state: StateManager = PrivateAttr()
|
|
@@ -60,6 +68,12 @@ class Orchestrator(YamlAgent):
|
|
|
60
68
|
_adapters: dict[str, ExecutorAdapter] = PrivateAttr(default_factory=dict)
|
|
61
69
|
_watcher_manager: WatcherManager | None = PrivateAttr(default=None)
|
|
62
70
|
_resumed: bool = PrivateAttr(default=False)
|
|
71
|
+
_total_tokens: int = PrivateAttr(default=0) # Cumulative orchestrator tokens
|
|
72
|
+
_executor_usage: dict[str, int] = PrivateAttr(default_factory=lambda: {
|
|
73
|
+
"input_tokens": 0,
|
|
74
|
+
"output_tokens": 0,
|
|
75
|
+
"total_tokens": 0,
|
|
76
|
+
})
|
|
63
77
|
|
|
64
78
|
def model_post_init(self, __context: Any) -> None:
|
|
65
79
|
"""Initialize state and adapters after model creation."""
|
|
@@ -80,17 +94,29 @@ class Orchestrator(YamlAgent):
|
|
|
80
94
|
|
|
81
95
|
# Initialize watchers if configured
|
|
82
96
|
if self.config.watchers.enabled:
|
|
83
|
-
self._watcher_manager = build_watcher_manager(
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
97
|
+
self._watcher_manager = build_watcher_manager(
|
|
98
|
+
{
|
|
99
|
+
"watchers": [
|
|
100
|
+
{"name": w.name, "enabled": w.enabled, "config": w.config}
|
|
101
|
+
for w in self.config.watchers.watchers
|
|
102
|
+
]
|
|
103
|
+
}
|
|
104
|
+
)
|
|
89
105
|
|
|
90
106
|
# Link sessions to environment for observe()
|
|
91
|
-
if hasattr(self.env,
|
|
107
|
+
if hasattr(self.env, "set_sessions"):
|
|
92
108
|
self.env.set_sessions(self._sessions)
|
|
93
109
|
|
|
110
|
+
# Set budget limits in environment
|
|
111
|
+
if hasattr(self.env, "set_budget"):
|
|
112
|
+
# Extract budget from watcher config if available
|
|
113
|
+
max_sessions = None
|
|
114
|
+
for w in self.config.watchers.watchers:
|
|
115
|
+
if w.name == "budget" and w.config:
|
|
116
|
+
max_sessions = w.config.get("max_sessions")
|
|
117
|
+
break
|
|
118
|
+
self.env.set_budget(max_sessions=max_sessions)
|
|
119
|
+
|
|
94
120
|
@property
|
|
95
121
|
def state(self) -> StateManager:
|
|
96
122
|
"""Access state manager."""
|
|
@@ -99,22 +125,97 @@ class Orchestrator(YamlAgent):
|
|
|
99
125
|
def _get_adapter(self, name: str) -> ExecutorAdapter:
    """Return the adapter registered under ``name``, creating it on first use.

    Raises:
        ValueError: If ``name`` is neither "codex_mcp" nor "claude_code".
    """
    registry = self._adapters
    if name in registry:
        return registry[name]

    # Model comes from config; the original comment notes adapters apply
    # their own defaults when it is None.
    configured_model = self.config.executor.model
    if name == "codex_mcp":
        created = CodexMCPAdapter(model=configured_model)
    elif name == "claude_code":
        created = ClaudeCodeAdapter(model=configured_model)
    else:
        raise ValueError(f"Unknown adapter: {name}")

    registry[name] = created
    return created
|
|
109
137
|
|
|
138
|
+
def get_executor_usage(self) -> dict[str, int]:
    """Sum the token usage reported by every cached adapter.

    Adapters that do not expose a ``total_usage`` attribute are ignored,
    and counters missing from an adapter's usage mapping count as zero.

    Returns:
        A dict with "input_tokens", "output_tokens" and "total_tokens".
    """
    counters = dict.fromkeys(("input_tokens", "output_tokens", "total_tokens"), 0)
    reporting = (
        executor
        for executor in self._adapters.values()
        if hasattr(executor, "total_usage")
    )
    for executor in reporting:
        reported = executor.total_usage
        for field in counters:
            counters[field] = counters[field] + reported.get(field, 0)
    return counters
|
|
151
|
+
|
|
152
|
+
@property
def executor_usage(self) -> dict[str, int]:
    """Property view of ``get_executor_usage()`` (for Weave tracking)."""
    aggregated = self.get_executor_usage()
    return aggregated
|
|
156
|
+
|
|
110
157
|
def save_state(self) -> None:
    """Hand the current message history to the state manager for a later resume."""
    history = self.messages
    self._state.save_orchestrator_messages(history)
|
|
113
160
|
|
|
114
161
|
def load_state(self) -> None:
    """Restore saved orchestrator messages, if there are any.

    The run is flagged as resumed only when the state manager returns a
    non-empty history. With nothing saved, ``messages`` and ``_resumed``
    are left untouched so the orchestrator starts fresh and no resume
    message is injected ahead of the system prompt.
    """
    restored = self._state.load_orchestrator_messages()
    if not restored:
        # Nothing was saved - leave _resumed unset and start fresh.
        return
    self.messages = restored
    self._resumed = True
|
|
173
|
+
|
|
174
|
+
def _maybe_compact(self) -> bool:
    """Compact the message history when the configured threshold is exceeded.

    Returns:
        True if the history was replaced by a compacted one, False when
        compaction is disabled, not yet needed, or left the messages as-is.
    """
    settings = self.config.orchestrator.compaction
    if not settings.enabled:
        return False

    over_threshold = should_compact(
        self.messages,
        max_tokens=settings.max_tokens,
        threshold_pct=settings.threshold_pct,
    )
    if not over_threshold:
        return False

    outcome = compact_messages(
        self.messages,
        keep_first_n=settings.keep_first_n,
        keep_last_n=settings.keep_last_n,
        max_tokens=settings.max_tokens,
        target_token_pct=settings.target_pct,
    )
    if not outcome.was_compacted:
        return False

    self.messages = outcome.messages

    # Record the compaction in the event log.
    from zwarm.core.models import Event

    details = {
        "step": self._step_count,
        "original_count": outcome.original_count,
        "new_count": len(outcome.messages),
        "removed_count": outcome.removed_count,
    }
    self._state.log_event(Event(kind="context_compacted", payload=details))
    return True
|
|
118
219
|
|
|
119
220
|
def _inject_resume_message(self) -> None:
|
|
120
221
|
"""Inject a system message about resumed state."""
|
|
@@ -124,7 +225,9 @@ class Orchestrator(YamlAgent):
|
|
|
124
225
|
# Build list of old sessions
|
|
125
226
|
old_sessions = []
|
|
126
227
|
for sid, session in self._sessions.items():
|
|
127
|
-
old_sessions.append(
|
|
228
|
+
old_sessions.append(
|
|
229
|
+
f" - {sid[:8]}... ({session.adapter}, {session.status.value})"
|
|
230
|
+
)
|
|
128
231
|
|
|
129
232
|
session_info = "\n".join(old_sessions) if old_sessions else " (none)"
|
|
130
233
|
|
|
@@ -139,31 +242,67 @@ Previous sessions (now stale):
|
|
|
139
242
|
|
|
140
243
|
You must start NEW sessions with delegate() if you need to continue work. Do NOT try to use converse() or check_session() with the old session IDs - they will fail.
|
|
141
244
|
|
|
142
|
-
Continue with your task from where you left off."""
|
|
245
|
+
Continue with your task from where you left off.""",
|
|
143
246
|
}
|
|
144
247
|
|
|
145
248
|
self.messages.append(resume_msg)
|
|
146
249
|
self._resumed = False # Only inject once
|
|
147
250
|
|
|
251
|
+
def perceive(self) -> None:
    """Refresh the environment observation on every step.

    Per the original author's note, the base YamlAgent only adds
    ``env.observe()`` on step 0; this override keeps that observation
    current so progress, sessions, etc. stay up to date.
    """
    # Base class performs its usual setup first.
    super().perceive()

    observation = (self.env.observe() or "").strip()
    if not observation:
        return

    # The env observation is recognised by this heading inside a system message.
    marker = "## Progress"
    position = next(
        (
            index
            for index, entry in enumerate(self.messages)
            if entry.get("role") == "system" and marker in entry.get("content", "")
        ),
        None,
    )

    if position is None:
        # No earlier observation found (shouldn't happen after step 0) - add one.
        self.messages.append({"role": "system", "content": observation})
    else:
        # Overwrite the first matching observation in place.
        self.messages[position]["content"] = observation
|
|
278
|
+
|
|
279
|
+
@weave.op()
|
|
148
280
|
def _run_watchers(self) -> WatcherAction:
|
|
149
281
|
"""Run watchers and return the action to take."""
|
|
150
282
|
if not self._watcher_manager:
|
|
151
283
|
return WatcherAction.CONTINUE
|
|
152
284
|
|
|
153
285
|
# Build watcher context
|
|
286
|
+
task = getattr(self.env, "task", "") if self.env else ""
|
|
287
|
+
events = [e.to_dict() for e in self.state.get_events(limit=200)]
|
|
154
288
|
ctx = WatcherContext(
|
|
289
|
+
task=task,
|
|
155
290
|
step=self._step_count,
|
|
291
|
+
max_steps=self.maxSteps,
|
|
156
292
|
messages=self.messages,
|
|
157
|
-
sessions=
|
|
158
|
-
|
|
293
|
+
sessions=[s.to_dict() for s in self._sessions.values()],
|
|
294
|
+
events=events,
|
|
295
|
+
working_dir=str(self.working_dir.absolute()) if self.working_dir else None,
|
|
159
296
|
metadata={
|
|
160
|
-
"
|
|
161
|
-
|
|
297
|
+
"config": self.config.to_dict()
|
|
298
|
+
if hasattr(self.config, "to_dict")
|
|
299
|
+
else {},
|
|
162
300
|
},
|
|
163
301
|
)
|
|
164
302
|
|
|
165
303
|
# Run watchers synchronously (they're async internally)
|
|
166
304
|
import asyncio
|
|
305
|
+
|
|
167
306
|
try:
|
|
168
307
|
loop = asyncio.get_running_loop()
|
|
169
308
|
except RuntimeError:
|
|
@@ -172,18 +311,37 @@ Continue with your task from where you left off."""
|
|
|
172
311
|
if loop and loop.is_running():
|
|
173
312
|
# We're in an async context, create a task
|
|
174
313
|
import concurrent.futures
|
|
314
|
+
|
|
175
315
|
with concurrent.futures.ThreadPoolExecutor() as pool:
|
|
176
|
-
result = pool.submit(
|
|
316
|
+
result = pool.submit(
|
|
317
|
+
asyncio.run, self._watcher_manager.observe(ctx)
|
|
318
|
+
).result()
|
|
177
319
|
else:
|
|
178
320
|
result = asyncio.run(self._watcher_manager.observe(ctx))
|
|
179
321
|
|
|
322
|
+
# Log watcher execution to events
|
|
323
|
+
from zwarm.core.models import Event
|
|
324
|
+
watcher_names = [w.name for w in self.config.watchers.watchers if w.enabled]
|
|
325
|
+
self.state.log_event(Event(
|
|
326
|
+
kind="watchers_run",
|
|
327
|
+
payload={
|
|
328
|
+
"step": self._step_count,
|
|
329
|
+
"watchers": watcher_names,
|
|
330
|
+
"action": result.action.value,
|
|
331
|
+
"triggered_by": result.metadata.get("triggered_by"),
|
|
332
|
+
"reason": result.metadata.get("reason"),
|
|
333
|
+
},
|
|
334
|
+
))
|
|
335
|
+
|
|
180
336
|
# Handle watcher result
|
|
181
337
|
if result.action == WatcherAction.NUDGE and result.guidance:
|
|
182
338
|
# Inject guidance as a system message
|
|
183
|
-
self.messages.append(
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
339
|
+
self.messages.append(
|
|
340
|
+
{
|
|
341
|
+
"role": "user",
|
|
342
|
+
"content": f"[WATCHER: {result.metadata.get('triggered_by', 'unknown')}] {result.guidance}",
|
|
343
|
+
}
|
|
344
|
+
)
|
|
187
345
|
|
|
188
346
|
return result.action
|
|
189
347
|
|
|
@@ -200,20 +358,19 @@ Continue with your task from where you left off."""
|
|
|
200
358
|
if self._last_response is None:
|
|
201
359
|
return []
|
|
202
360
|
|
|
203
|
-
output = getattr(self._last_response,
|
|
361
|
+
output = getattr(self._last_response, "output", None)
|
|
204
362
|
if output is None:
|
|
205
363
|
return []
|
|
206
364
|
|
|
207
365
|
# Extract tool calls
|
|
208
366
|
tool_calls = [
|
|
209
|
-
item for item in output
|
|
210
|
-
if getattr(item, 'type', None) == TOOL_CALL_TYPE
|
|
367
|
+
item for item in output if getattr(item, "type", None) == TOOL_CALL_TYPE
|
|
211
368
|
]
|
|
212
369
|
|
|
213
370
|
# If no tool calls, handle text output
|
|
214
371
|
if not tool_calls:
|
|
215
|
-
output_text = getattr(self._last_response,
|
|
216
|
-
if output_text and hasattr(self.env,
|
|
372
|
+
output_text = getattr(self._last_response, "output_text", "")
|
|
373
|
+
if output_text and hasattr(self.env, "output_handler"):
|
|
217
374
|
self.env.output_handler(output_text)
|
|
218
375
|
return []
|
|
219
376
|
|
|
@@ -221,9 +378,9 @@ Continue with your task from where you left off."""
|
|
|
221
378
|
tool_results: list[tuple[dict[str, Any], Any]] = []
|
|
222
379
|
|
|
223
380
|
for tc in tool_calls:
|
|
224
|
-
tc_name = getattr(tc,
|
|
225
|
-
tc_args_raw = getattr(tc,
|
|
226
|
-
tc_id = getattr(tc,
|
|
381
|
+
tc_name = getattr(tc, "name", "")
|
|
382
|
+
tc_args_raw = getattr(tc, "arguments", "{}")
|
|
383
|
+
tc_id = getattr(tc, "call_id", "")
|
|
227
384
|
|
|
228
385
|
# Parse arguments
|
|
229
386
|
if isinstance(tc_args_raw, str):
|
|
@@ -271,14 +428,37 @@ Continue with your task from where you left off."""
|
|
|
271
428
|
- tool_call_info: {"name": str, "args": dict, "call_id": str}
|
|
272
429
|
- result: The tool output (any type)
|
|
273
430
|
"""
|
|
431
|
+
# Check for context compaction before perceive
|
|
432
|
+
# This prevents context overflow on long-running tasks
|
|
433
|
+
self._maybe_compact()
|
|
434
|
+
|
|
435
|
+
# Update environment with current progress before perceive
|
|
436
|
+
if hasattr(self.env, "update_progress"):
|
|
437
|
+
executor_usage = self.get_executor_usage()
|
|
438
|
+
self.env.update_progress(
|
|
439
|
+
step_count=self._step_count,
|
|
440
|
+
max_steps=self.maxSteps,
|
|
441
|
+
total_tokens=self._total_tokens,
|
|
442
|
+
executor_tokens=executor_usage.get("total_tokens", 0),
|
|
443
|
+
)
|
|
444
|
+
|
|
274
445
|
self.perceive()
|
|
275
446
|
self.invoke()
|
|
447
|
+
|
|
448
|
+
# Track cumulative token usage from the API response
|
|
449
|
+
if self._last_response and hasattr(self._last_response, "usage"):
|
|
450
|
+
usage = self._last_response.usage
|
|
451
|
+
if usage:
|
|
452
|
+
self._total_tokens += getattr(usage, "total_tokens", 0)
|
|
453
|
+
|
|
276
454
|
tool_results = self.do()
|
|
277
455
|
self._step_count += 1
|
|
278
456
|
return tool_results
|
|
279
457
|
|
|
280
458
|
@weave.op()
|
|
281
|
-
def run(
|
|
459
|
+
def run(
|
|
460
|
+
self, task: str | None = None, max_steps: int | None = None
|
|
461
|
+
) -> dict[str, Any]:
|
|
282
462
|
"""
|
|
283
463
|
Run the orchestrator until stop condition is met.
|
|
284
464
|
|
|
@@ -299,8 +479,9 @@ Continue with your task from where you left off."""
|
|
|
299
479
|
if max_steps is not None:
|
|
300
480
|
self.maxSteps = max_steps
|
|
301
481
|
|
|
302
|
-
# Reset
|
|
482
|
+
# Reset counters
|
|
303
483
|
self._step_count = 0
|
|
484
|
+
self._total_tokens = 0
|
|
304
485
|
|
|
305
486
|
# Inject resume message if we were resumed
|
|
306
487
|
self._inject_resume_message()
|
|
@@ -402,4 +583,6 @@ def build_orchestrator(
|
|
|
402
583
|
|
|
403
584
|
def _build_system_prompt(config: ZwarmConfig, working_dir: Path | None = None) -> str:
    """Build the orchestrator system prompt.

    ``working_dir`` is passed to ``get_orchestrator_prompt`` as a string,
    or as None when it is falsy. ``config`` is accepted but not read here.
    """
    prompt_dir = None
    if working_dir:
        prompt_dir = str(working_dir)
    return get_orchestrator_prompt(working_dir=prompt_dir)
|