zwarm-0.1.0-py3-none-any.whl → zwarm-1.0.0-py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.
@@ -0,0 +1,266 @@
+ """Tests for the compact module."""
+
+ import pytest
+
+ from zwarm.core.compact import (
+     compact_messages,
+     estimate_tokens,
+     find_tool_groups,
+     should_compact,
+ )
+
+
+ class TestEstimateTokens:
+     def test_simple_messages(self):
+         """Estimate tokens for simple text messages."""
+         messages = [
+             {"role": "user", "content": "Hello world"},  # 11 chars
+             {"role": "assistant", "content": "Hi there!"},  # 9 chars
+         ]
+         # ~20 chars / 4 = ~5 tokens
+         tokens = estimate_tokens(messages)
+         assert tokens == 5
+
+     def test_empty_messages(self):
+         """Empty messages return 0 tokens."""
+         assert estimate_tokens([]) == 0
+
+     def test_messages_with_tool_calls(self):
+         """Tool calls add to token count."""
+         messages = [
+             {
+                 "role": "assistant",
+                 "content": "Let me check",
+                 "tool_calls": [
+                     {"function": {"name": "read", "arguments": '{"path": "/foo/bar"}'}}
+                 ],
+             }
+         ]
+         tokens = estimate_tokens(messages)
+         assert tokens > 0
+
+
+ class TestFindToolGroups:
+     def test_no_tool_calls(self):
+         """No tool groups in simple conversation."""
+         messages = [
+             {"role": "system", "content": "You are helpful"},
+             {"role": "user", "content": "Hello"},
+             {"role": "assistant", "content": "Hi!"},
+         ]
+         groups = find_tool_groups(messages)
+         assert groups == []
+
+     def test_openai_format_tool_call(self):
+         """Detect OpenAI-style tool call groups."""
+         messages = [
+             {"role": "system", "content": "System"},
+             {"role": "user", "content": "Read file"},
+             {
+                 "role": "assistant",
+                 "content": "Reading...",
+                 "tool_calls": [{"id": "tc1", "function": {"name": "read"}}],
+             },
+             {"role": "tool", "tool_call_id": "tc1", "content": "file contents"},
+             {"role": "assistant", "content": "Here's the file"},
+         ]
+         groups = find_tool_groups(messages)
+         assert groups == [(2, 3)]  # Assistant with tool_calls + tool response
+
+     def test_multiple_tool_responses(self):
+         """Group includes all consecutive tool responses."""
+         messages = [
+             {"role": "user", "content": "Do things"},
+             {
+                 "role": "assistant",
+                 "tool_calls": [
+                     {"id": "tc1", "function": {"name": "a"}},
+                     {"id": "tc2", "function": {"name": "b"}},
+                 ],
+             },
+             {"role": "tool", "tool_call_id": "tc1", "content": "result1"},
+             {"role": "tool", "tool_call_id": "tc2", "content": "result2"},
+             {"role": "assistant", "content": "Done"},
+         ]
+         groups = find_tool_groups(messages)
+         assert groups == [(1, 3)]  # Indices 1, 2, 3 form one group
+
+     def test_anthropic_format_tool_use(self):
+         """Detect Anthropic-style tool_use content blocks."""
+         messages = [
+             {"role": "user", "content": "Read file"},
+             {
+                 "role": "assistant",
+                 "content": [
+                     {"type": "text", "text": "Reading..."},
+                     {"type": "tool_use", "id": "tu1", "name": "read", "input": {}},
+                 ],
+             },
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "tool_result", "tool_use_id": "tu1", "content": "data"},
+                 ],
+             },
+             {"role": "assistant", "content": "Got it"},
+         ]
+         groups = find_tool_groups(messages)
+         assert groups == [(1, 2)]  # Assistant with tool_use + user with tool_result
+
+
+ class TestCompactMessages:
+     def test_no_compaction_needed_few_messages(self):
+         """Don't compact if we have fewer messages than keep thresholds."""
+         messages = [
+             {"role": "system", "content": "System"},
+             {"role": "user", "content": "Task"},
+             {"role": "assistant", "content": "Response"},
+         ]
+         result = compact_messages(messages, keep_first_n=2, keep_last_n=2)
+         assert not result.was_compacted
+         assert result.messages == messages
+         assert "Too few" in result.preserved_reason
+
+     def test_compacts_middle_messages(self):
+         """Remove messages from the middle, keeping first and last."""
+         messages = [
+             {"role": "system", "content": "System"},
+             {"role": "user", "content": "Task"},
+             {"role": "assistant", "content": "Step 1"},
+             {"role": "user", "content": "Continue"},
+             {"role": "assistant", "content": "Step 2"},
+             {"role": "user", "content": "More"},
+             {"role": "assistant", "content": "Step 3"},
+             {"role": "user", "content": "Final"},
+             {"role": "assistant", "content": "Done"},
+         ]
+         result = compact_messages(messages, keep_first_n=2, keep_last_n=2)
+
+         assert result.was_compacted
+         assert result.removed_count > 0
+         # First 2 and last 2 should be preserved
+         assert result.messages[0]["content"] == "System"
+         assert result.messages[1]["content"] == "Task"
+         assert result.messages[-1]["content"] == "Done"
+         assert result.messages[-2]["content"] == "Final"
+
+     def test_preserves_tool_call_pairs(self):
+         """Never split tool call from its response."""
+         messages = [
+             {"role": "system", "content": "System"},
+             {"role": "user", "content": "Task"},
+             {"role": "assistant", "content": "Old message 1"},
+             {"role": "assistant", "content": "Old message 2"},
+             {
+                 "role": "assistant",
+                 "content": "Calling tool",
+                 "tool_calls": [{"id": "tc1", "function": {"name": "test"}}],
+             },
+             {"role": "tool", "tool_call_id": "tc1", "content": "Tool result"},
+             {"role": "assistant", "content": "Recent 1"},
+             {"role": "user", "content": "Recent 2"},
+         ]
+         result = compact_messages(messages, keep_first_n=2, keep_last_n=2)
+
+         # The tool call pair should either both be kept or both removed
+         has_tool_call = any(m.get("tool_calls") for m in result.messages)
+         has_tool_response = any(m.get("role") == "tool" for m in result.messages)
+
+         # They should match - either both present or both absent
+         assert has_tool_call == has_tool_response
+
+     def test_adds_compaction_marker(self):
+         """Add a marker message when compaction occurs."""
+         messages = [
+             {"role": "system", "content": "System"},
+             {"role": "user", "content": "Task"},
+         ] + [{"role": "assistant", "content": f"Msg {i}"} for i in range(20)]
+
+         result = compact_messages(messages, keep_first_n=2, keep_last_n=3)
+
+         if result.was_compacted:
+             # Should have a system message about compaction
+             marker_msgs = [
+                 m for m in result.messages
+                 if m.get("role") == "system" and "compacted" in m.get("content", "").lower()
+             ]
+             assert len(marker_msgs) == 1
+
+     def test_token_based_compaction(self):
+         """Compact based on token threshold."""
+         # Create messages that exceed token limit
+         messages = [
+             {"role": "system", "content": "System prompt " * 100},
+             {"role": "user", "content": "Task " * 100},
+         ] + [
+             {"role": "assistant", "content": f"Response {i} " * 50}
+             for i in range(10)
+         ]
+
+         # Should not compact if under limit
+         result_under = compact_messages(messages, max_tokens=100000)
+         # Might or might not compact depending on estimate
+
+         # Should compact if over limit
+         result_over = compact_messages(messages, max_tokens=100, target_token_pct=0.5)
+         # With such a low limit, should definitely try to compact
+         assert result_over.original_count == len(messages)
+
+
+ class TestShouldCompact:
+     def test_under_threshold(self):
+         """Don't compact when under threshold."""
+         messages = [{"role": "user", "content": "Hello"}]
+         assert not should_compact(messages, max_tokens=1000, threshold_pct=0.85)
+
+     def test_over_threshold(self):
+         """Compact when over threshold."""
+         messages = [{"role": "user", "content": "x" * 4000}]  # ~1000 tokens
+         assert should_compact(messages, max_tokens=500, threshold_pct=0.85)
+
+
+ class TestEdgeCases:
+     def test_all_tool_calls(self):
+         """Handle conversation that's mostly tool calls."""
+         messages = [
+             {"role": "system", "content": "System"},
+             {"role": "user", "content": "Task"},
+         ]
+         # Add many tool call pairs
+         for i in range(5):
+             messages.append({
+                 "role": "assistant",
+                 "tool_calls": [{"id": f"tc{i}", "function": {"name": "test"}}],
+             })
+             messages.append({"role": "tool", "tool_call_id": f"tc{i}", "content": f"result{i}"})
+
+         messages.append({"role": "assistant", "content": "Final"})
+
+         result = compact_messages(messages, keep_first_n=2, keep_last_n=1)
+
+         # Should still produce valid output
+         assert len(result.messages) > 0
+
+         # Check no orphaned tool calls
+         for i, msg in enumerate(result.messages):
+             if msg.get("tool_calls"):
+                 # Next message should be a tool response
+                 if i + 1 < len(result.messages):
+                     # Either next is tool response, or this is at the end
+                     pass  # Structural validity checked by not raising
+
+     def test_empty_messages(self):
+         """Handle empty message list."""
+         result = compact_messages([])
+         assert result.messages == []
+         assert not result.was_compacted
+
+     def test_only_system_and_user(self):
+         """Handle minimal conversation."""
+         messages = [
+             {"role": "system", "content": "System"},
+             {"role": "user", "content": "Hello"},
+         ]
+         result = compact_messages(messages, keep_first_n=2, keep_last_n=2)
+         assert not result.was_compacted
+         assert result.messages == messages
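
Note: the compact module itself is not included in this diff, but the assertions above pin its surface down fairly tightly. estimate_tokens uses a rough four-characters-per-token heuristic (20 characters across two messages yield exactly 5 tokens), should_compact fires once that estimate exceeds max_tokens * threshold_pct, and compact_messages returns a result carrying messages, was_compacted, original_count, removed_count, and preserved_reason. A minimal sketch consistent with those tests, not the shipped implementation, might look like:

# Hypothetical sketch inferred from the tests above, not the zwarm.core.compact source.
from dataclasses import dataclass


@dataclass
class CompactResult:
    messages: list[dict]
    was_compacted: bool = False
    original_count: int = 0
    removed_count: int = 0
    preserved_reason: str = ""


def estimate_tokens(messages: list[dict]) -> int:
    """Rough estimate: ~4 characters per token, counting text plus tool-call arguments."""
    chars = 0
    for msg in messages:
        content = msg.get("content") or ""
        if isinstance(content, str):
            chars += len(content)
        for tc in msg.get("tool_calls") or []:
            fn = tc.get("function", {})
            chars += len(fn.get("name", "")) + len(fn.get("arguments", ""))
    return chars // 4


def should_compact(messages: list[dict], max_tokens: int, threshold_pct: float = 0.85) -> bool:
    """Compact once the estimate crosses threshold_pct of the token budget."""
    return estimate_tokens(messages) > max_tokens * threshold_pct

The grouping tests additionally imply that find_tool_groups returns (start, end) index pairs spanning an assistant tool call and all of its consecutive tool responses, which is what lets compact_messages drop the middle of a conversation without orphaning a tool result.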
zwarm/orchestrator.py CHANGED
@@ -24,12 +24,18 @@ from wbal.helper import TOOL_CALL_TYPE, format_openai_tool_response
  from zwarm.adapters.base import ExecutorAdapter
  from zwarm.adapters.claude_code import ClaudeCodeAdapter
  from zwarm.adapters.codex_mcp import CodexMCPAdapter
+ from zwarm.core.compact import compact_messages, should_compact
  from zwarm.core.config import ZwarmConfig, load_config
  from zwarm.core.environment import OrchestratorEnv
  from zwarm.core.models import ConversationSession
  from zwarm.core.state import StateManager
  from zwarm.prompts import get_orchestrator_prompt
- from zwarm.watchers import WatcherManager, WatcherContext, WatcherAction, build_watcher_manager
+ from zwarm.watchers import (
+     WatcherManager,
+     WatcherContext,
+     WatcherAction,
+     build_watcher_manager,
+ )


  class Orchestrator(YamlAgent):
@@ -49,10 +55,12 @@ class Orchestrator(YamlAgent):
      working_dir: Path = Field(default_factory=Path.cwd)

      # Load tools from modules (delegation + bash for verification)
-     agent_tool_modules: list[str] = Field(default=[
-         "zwarm.tools.delegation",
-         "wbal.tools.bash",
-     ])
+     agent_tool_modules: list[str] = Field(
+         default=[
+             "zwarm.tools.delegation",
+             "wbal.tools.bash",
+         ]
+     )

      # State management
      _state: StateManager = PrivateAttr()
@@ -60,6 +68,12 @@ class Orchestrator(YamlAgent):
      _adapters: dict[str, ExecutorAdapter] = PrivateAttr(default_factory=dict)
      _watcher_manager: WatcherManager | None = PrivateAttr(default=None)
      _resumed: bool = PrivateAttr(default=False)
+     _total_tokens: int = PrivateAttr(default=0)  # Cumulative orchestrator tokens
+     _executor_usage: dict[str, int] = PrivateAttr(default_factory=lambda: {
+         "input_tokens": 0,
+         "output_tokens": 0,
+         "total_tokens": 0,
+     })

      def model_post_init(self, __context: Any) -> None:
          """Initialize state and adapters after model creation."""
@@ -80,17 +94,29 @@ class Orchestrator(YamlAgent):

          # Initialize watchers if configured
          if self.config.watchers.enabled:
-             self._watcher_manager = build_watcher_manager({
-                 "watchers": [
-                     {"name": w.name, "enabled": w.enabled, "config": w.config}
-                     for w in self.config.watchers.watchers
-                 ]
-             })
+             self._watcher_manager = build_watcher_manager(
+                 {
+                     "watchers": [
+                         {"name": w.name, "enabled": w.enabled, "config": w.config}
+                         for w in self.config.watchers.watchers
+                     ]
+                 }
+             )

          # Link sessions to environment for observe()
-         if hasattr(self.env, 'set_sessions'):
+         if hasattr(self.env, "set_sessions"):
              self.env.set_sessions(self._sessions)

+         # Set budget limits in environment
+         if hasattr(self.env, "set_budget"):
+             # Extract budget from watcher config if available
+             max_sessions = None
+             for w in self.config.watchers.watchers:
+                 if w.name == "budget" and w.config:
+                     max_sessions = w.config.get("max_sessions")
+                     break
+             self.env.set_budget(max_sessions=max_sessions)
+
      @property
      def state(self) -> StateManager:
          """Access state manager."""
@@ -99,22 +125,97 @@ class Orchestrator(YamlAgent):
      def _get_adapter(self, name: str) -> ExecutorAdapter:
          """Get or create an adapter by name."""
          if name not in self._adapters:
+             # Get model from config (adapters have their own defaults if None)
+             model = self.config.executor.model
              if name == "codex_mcp":
-                 self._adapters[name] = CodexMCPAdapter()
+                 self._adapters[name] = CodexMCPAdapter(model=model)
              elif name == "claude_code":
-                 self._adapters[name] = ClaudeCodeAdapter()
+                 self._adapters[name] = ClaudeCodeAdapter(model=model)
              else:
                  raise ValueError(f"Unknown adapter: {name}")
          return self._adapters[name]

+     def get_executor_usage(self) -> dict[str, int]:
+         """Get aggregated token usage across all executors."""
+         total = {
+             "input_tokens": 0,
+             "output_tokens": 0,
+             "total_tokens": 0,
+         }
+         for adapter in self._adapters.values():
+             if hasattr(adapter, "total_usage"):
+                 usage = adapter.total_usage
+                 for key in total:
+                     total[key] += usage.get(key, 0)
+         return total
+
+     @property
+     def executor_usage(self) -> dict[str, int]:
+         """Aggregated executor token usage (for Weave tracking)."""
+         return self.get_executor_usage()
+
      def save_state(self) -> None:
          """Save orchestrator state for resume."""
          self._state.save_orchestrator_messages(self.messages)

      def load_state(self) -> None:
-         """Load orchestrator state for resume."""
-         self.messages = self._state.load_orchestrator_messages()
-         self._resumed = True
+         """Load orchestrator state for resume.
+
+         Only marks as resumed if we actually loaded non-empty messages.
+         This prevents the resume message from being injected before the
+         system prompt when there's no saved state to resume from.
+         """
+         loaded_messages = self._state.load_orchestrator_messages()
+         if loaded_messages:
+             self.messages = loaded_messages
+             self._resumed = True
+         # If no messages were saved, don't set _resumed - start fresh
+
+     def _maybe_compact(self) -> bool:
+         """
+         Check if compaction is needed and compact if so.
+
+         Returns True if compaction was performed.
+         """
+         compact_config = self.config.orchestrator.compaction
+         if not compact_config.enabled:
+             return False
+
+         # Check if we should compact
+         if not should_compact(
+             self.messages,
+             max_tokens=compact_config.max_tokens,
+             threshold_pct=compact_config.threshold_pct,
+         ):
+             return False
+
+         # Perform compaction
+         result = compact_messages(
+             self.messages,
+             keep_first_n=compact_config.keep_first_n,
+             keep_last_n=compact_config.keep_last_n,
+             max_tokens=compact_config.max_tokens,
+             target_token_pct=compact_config.target_pct,
+         )
+
+         if result.was_compacted:
+             self.messages = result.messages
+
+             # Log compaction event
+             from zwarm.core.models import Event
+             self._state.log_event(Event(
+                 kind="context_compacted",
+                 payload={
+                     "step": self._step_count,
+                     "original_count": result.original_count,
+                     "new_count": len(result.messages),
+                     "removed_count": result.removed_count,
+                 },
+             ))
+
+             return True
+
+         return False

      def _inject_resume_message(self) -> None:
          """Inject a system message about resumed state."""
@@ -124,7 +225,9 @@ class Orchestrator(YamlAgent):
          # Build list of old sessions
          old_sessions = []
          for sid, session in self._sessions.items():
-             old_sessions.append(f" - {sid[:8]}... ({session.adapter}, {session.status.value})")
+             old_sessions.append(
+                 f" - {sid[:8]}... ({session.adapter}, {session.status.value})"
+             )
          session_info = "\n".join(old_sessions) if old_sessions else " (none)"

@@ -139,31 +242,67 @@ Previous sessions (now stale):

  You must start NEW sessions with delegate() if you need to continue work. Do NOT try to use converse() or check_session() with the old session IDs - they will fail.

- Continue with your task from where you left off."""
+ Continue with your task from where you left off.""",
          }

          self.messages.append(resume_msg)
          self._resumed = False  # Only inject once

+     def perceive(self) -> None:
+         """
+         Override perceive to refresh environment observation each step.
+
+         The base YamlAgent only adds env.observe() on step 0. We need to
+         update it each step to show current progress, sessions, etc.
+         """
+         # Let base class do initial setup
+         super().perceive()
+
+         # Update environment observation
+         env_obs = (self.env.observe() or "").strip()
+         if not env_obs:
+             return
+
+         # Find and update existing env observation, or append new one
+         # Look for a system message containing our markers
+         env_marker = "## Progress"  # Our env observation has this
+
+         for i, msg in enumerate(self.messages):
+             if msg.get("role") == "system" and env_marker in msg.get("content", ""):
+                 # Update in place
+                 self.messages[i]["content"] = env_obs
+                 return
+
+         # Not found - append as new system message (shouldn't happen after step 0)
+         self.messages.append({"role": "system", "content": env_obs})
+
+     @weave.op()
      def _run_watchers(self) -> WatcherAction:
          """Run watchers and return the action to take."""
          if not self._watcher_manager:
              return WatcherAction.CONTINUE

          # Build watcher context
+         task = getattr(self.env, "task", "") if self.env else ""
+         events = [e.to_dict() for e in self.state.get_events(limit=200)]
          ctx = WatcherContext(
+             task=task,
              step=self._step_count,
+             max_steps=self.maxSteps,
              messages=self.messages,
-             sessions={sid: s.to_dict() for sid, s in self._sessions.items()},
-             task=self.env.task if hasattr(self.env, 'task') else "",
+             sessions=[s.to_dict() for s in self._sessions.values()],
+             events=events,
+             working_dir=str(self.working_dir.absolute()) if self.working_dir else None,
              metadata={
-                 "max_steps": self.maxSteps,
-                 "config": self.config.to_dict() if hasattr(self.config, 'to_dict') else {},
+                 "config": self.config.to_dict()
+                 if hasattr(self.config, "to_dict")
+                 else {},
              },
          )

          # Run watchers synchronously (they're async internally)
          import asyncio
+
          try:
              loop = asyncio.get_running_loop()
          except RuntimeError:
@@ -172,18 +311,37 @@ Continue with your task from where you left off."""
          if loop and loop.is_running():
              # We're in an async context, create a task
              import concurrent.futures
+
              with concurrent.futures.ThreadPoolExecutor() as pool:
-                 result = pool.submit(asyncio.run, self._watcher_manager.observe(ctx)).result()
+                 result = pool.submit(
+                     asyncio.run, self._watcher_manager.observe(ctx)
+                 ).result()
          else:
              result = asyncio.run(self._watcher_manager.observe(ctx))

+         # Log watcher execution to events
+         from zwarm.core.models import Event
+         watcher_names = [w.name for w in self.config.watchers.watchers if w.enabled]
+         self.state.log_event(Event(
+             kind="watchers_run",
+             payload={
+                 "step": self._step_count,
+                 "watchers": watcher_names,
+                 "action": result.action.value,
+                 "triggered_by": result.metadata.get("triggered_by"),
+                 "reason": result.metadata.get("reason"),
+             },
+         ))
+
          # Handle watcher result
          if result.action == WatcherAction.NUDGE and result.guidance:
              # Inject guidance as a system message
-             self.messages.append({
-                 "role": "user",
-                 "content": f"[WATCHER: {result.metadata.get('triggered_by', 'unknown')}] {result.guidance}"
-             })
+             self.messages.append(
+                 {
+                     "role": "user",
+                     "content": f"[WATCHER: {result.metadata.get('triggered_by', 'unknown')}] {result.guidance}",
+                 }
+             )

          return result.action

@@ -200,20 +358,19 @@ Continue with your task from where you left off."""
          if self._last_response is None:
              return []

-         output = getattr(self._last_response, 'output', None)
+         output = getattr(self._last_response, "output", None)
          if output is None:
              return []

          # Extract tool calls
          tool_calls = [
-             item for item in output
-             if getattr(item, 'type', None) == TOOL_CALL_TYPE
+             item for item in output if getattr(item, "type", None) == TOOL_CALL_TYPE
          ]

          # If no tool calls, handle text output
          if not tool_calls:
-             output_text = getattr(self._last_response, 'output_text', '')
-             if output_text and hasattr(self.env, 'output_handler'):
+             output_text = getattr(self._last_response, "output_text", "")
+             if output_text and hasattr(self.env, "output_handler"):
                  self.env.output_handler(output_text)
              return []

@@ -221,9 +378,9 @@ Continue with your task from where you left off."""
          tool_results: list[tuple[dict[str, Any], Any]] = []

          for tc in tool_calls:
-             tc_name = getattr(tc, 'name', '')
-             tc_args_raw = getattr(tc, 'arguments', '{}')
-             tc_id = getattr(tc, 'call_id', '')
+             tc_name = getattr(tc, "name", "")
+             tc_args_raw = getattr(tc, "arguments", "{}")
+             tc_id = getattr(tc, "call_id", "")

              # Parse arguments
              if isinstance(tc_args_raw, str):
@@ -271,14 +428,37 @@ Continue with your task from where you left off."""
          - tool_call_info: {"name": str, "args": dict, "call_id": str}
          - result: The tool output (any type)
          """
+         # Check for context compaction before perceive
+         # This prevents context overflow on long-running tasks
+         self._maybe_compact()
+
+         # Update environment with current progress before perceive
+         if hasattr(self.env, "update_progress"):
+             executor_usage = self.get_executor_usage()
+             self.env.update_progress(
+                 step_count=self._step_count,
+                 max_steps=self.maxSteps,
+                 total_tokens=self._total_tokens,
+                 executor_tokens=executor_usage.get("total_tokens", 0),
+             )
+
          self.perceive()
          self.invoke()
+
+         # Track cumulative token usage from the API response
+         if self._last_response and hasattr(self._last_response, "usage"):
+             usage = self._last_response.usage
+             if usage:
+                 self._total_tokens += getattr(usage, "total_tokens", 0)
+
          tool_results = self.do()
          self._step_count += 1
          return tool_results

      @weave.op()
-     def run(self, task: str | None = None, max_steps: int | None = None) -> dict[str, Any]:
+     def run(
+         self, task: str | None = None, max_steps: int | None = None
+     ) -> dict[str, Any]:
          """
          Run the orchestrator until stop condition is met.

@@ -299,8 +479,9 @@ Continue with your task from where you left off."""
          if max_steps is not None:
              self.maxSteps = max_steps

-         # Reset step counter
+         # Reset counters
          self._step_count = 0
+         self._total_tokens = 0

          # Inject resume message if we were resumed
          self._inject_resume_message()
@@ -402,4 +583,6 @@ def build_orchestrator(

  def _build_system_prompt(config: ZwarmConfig, working_dir: Path | None = None) -> str:
      """Build the orchestrator system prompt."""
-     return get_orchestrator_prompt(working_dir=str(working_dir) if working_dir else None)
+     return get_orchestrator_prompt(
+         working_dir=str(working_dir) if working_dir else None
+     )
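
Note: several of the new call sites guard on hasattr(self.env, ...) rather than a shared interface, so the environment contract stays implicit. Collected from the call sites in this diff, the surface the orchestrator now duck-types against looks roughly like the protocol below; the signatures are inferred from usage, not taken from zwarm.core.environment.

# Hypothetical Protocol summarizing the environment attributes this diff touches,
# inferred from the hasattr()/getattr() guards and call sites above.
from typing import Any, Protocol


class OrchestratorEnvLike(Protocol):
    task: str

    def observe(self) -> str | None: ...
    def set_sessions(self, sessions: dict[str, Any]) -> None: ...
    def set_budget(self, max_sessions: int | None = None) -> None: ...
    def update_progress(
        self,
        step_count: int,
        max_steps: int,
        total_tokens: int,
        executor_tokens: int,
    ) -> None: ...
    def output_handler(self, text: str) -> None: ...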