zwarm 2.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,319 @@
1
+ """
2
+ LLM-based watcher for nuanced trajectory analysis.
3
+
4
+ Unlike rule-based watchers, this watcher uses a language model to assess
5
+ the orchestrator's trajectory and provide context-aware guidance.
6
+
7
+ The watcher compresses the full message history into a compact trajectory
8
+ representation (similar to what Codex shows in its UI) to minimize token
9
+ usage while preserving the "shape" of the agent's behavior.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import logging
16
+ from typing import Any
17
+
18
+ from zwarm.watchers.base import Watcher, WatcherContext, WatcherResult
19
+ from zwarm.watchers.registry import register_watcher
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ def _get_field(item: Any, name: str, default: Any = None) -> Any:
25
+ """Get field from dict or object."""
26
+ if isinstance(item, dict):
27
+ return item.get(name, default)
28
+ return getattr(item, name, default)
29
+
30
+
31
+ def _extract_tool_call_summary(tc: Any) -> str:
32
+ """Extract a compact summary of a tool call."""
33
+ if isinstance(tc, dict):
34
+ func = tc.get("function", tc)
35
+ name = func.get("name", tc.get("name", "?"))
36
+ args = func.get("arguments", tc.get("arguments", ""))
37
+ else:
38
+ name = getattr(tc, "name", "?")
39
+ args = getattr(tc, "arguments", "")
40
+
41
+ # Parse args if JSON string
42
+ if isinstance(args, str):
43
+ try:
44
+ args = json.loads(args)
45
+ except (json.JSONDecodeError, TypeError):
46
+ pass
47
+
48
+ # Create compact arg summary
49
+ if isinstance(args, dict):
50
+ # Show key args based on tool type
51
+ if name == "delegate":
52
+ task = args.get("task", "")[:50]
53
+ mode = args.get("mode", "sync")
54
+ return f"delegate({mode}): {task}..."
55
+ elif name == "converse":
56
+ msg = args.get("message", "")[:40]
57
+ return f"converse: {msg}..."
58
+ elif name == "bash":
59
+ cmd = args.get("command", "")[:60]
60
+ return f"$ {cmd}"
61
+ elif name in ("check_session", "peek_session", "end_session"):
62
+ sid = args.get("session_id", "")[:8]
63
+ return f"{name}({sid})"
64
+ elif name == "list_sessions":
65
+ return "list_sessions()"
66
+ else:
67
+ # Generic: show first arg
68
+ first_val = next(iter(args.values()), "") if args else ""
69
+ if isinstance(first_val, str) and len(first_val) > 30:
70
+ first_val = first_val[:30] + "..."
71
+ return f"{name}({first_val})"
72
+ else:
73
+ return f"{name}({str(args)[:30]})"
74
+
75
+
76
def compress_trajectory(messages: list[dict[str, Any]], max_steps: int = 50) -> str:
    """
    Compress full message history into a compact trajectory representation.

    Each assistant turn becomes one numbered step showing a one-line
    "thinking" preview plus up to three tool-call summaries; tool errors,
    watcher nudges, and mid-run user messages are annotated inline.

    Output format (similar to Codex UI):
    ```
    [1] thinking: "preparing to inspect the codebase"
        → delegate(sync): Add authentication to...
    [2] thinking: "checking session status"
        → check_session(abc123)
    ```

    Args:
        messages: Full message history from orchestrator. Entries may be
            dicts or objects; fields are read via _get_field.
            NOTE(review): assumes "content" is a plain string (not a list of
            content blocks) — confirm against the orchestrator's message shape.
        max_steps: Maximum steps to include (most recent).

    Returns:
        Compact trajectory string (newline-joined steps).
    """
    steps = []
    step_num = 0  # counts assistant turns only

    for msg in messages:
        role = _get_field(msg, "role", "")

        if role == "system":
            continue  # Skip system messages

        if role == "assistant":
            step_num += 1
            content = _get_field(msg, "content", "")
            tool_calls = _get_field(msg, "tool_calls", [])

            # Extract thinking/reasoning summary: first line, capped at 80 chars.
            thinking = ""
            if content:
                first_line = content.split("\n")[0].strip()
                if len(first_line) > 80:
                    thinking = first_line[:80] + "..."
                else:
                    thinking = first_line

            # Summarize at most 3 tool calls; note how many were omitted.
            actions = []
            if tool_calls:
                for tc in tool_calls[:3]:  # Max 3 tool calls per step
                    actions.append(_extract_tool_call_summary(tc))
                if len(tool_calls) > 3:
                    actions.append(f"... +{len(tool_calls) - 3} more")

            # Format step: "[N] thinking: ..." header, then one line per action.
            step_lines = [f"[{step_num}]"]
            if thinking:
                step_lines[0] += f' thinking: "{thinking}"'
            for action in actions:
                step_lines.append(f"    → {action}")

            steps.append("\n".join(step_lines))

        elif role == "tool":
            # Tool results - only flag apparent errors (heuristic substring match).
            content = str(_get_field(msg, "content", ""))
            if "error" in content.lower() or "failed" in content.lower():
                steps.append(f"  ⚠ tool returned error")

        elif role == "user" and step_num > 0:
            # User message mid-conversation (watcher nudge, etc.).
            content = _get_field(msg, "content", "")
            if content and "[WATCHER" in content:
                steps.append(f"  📍 watcher nudge")
            elif content:
                preview = content[:50].replace("\n", " ")
                steps.append(f"  💬 user: {preview}...")

    # Keep only the most recent steps to bound prompt size.
    if len(steps) > max_steps:
        steps = ["... (earlier steps omitted)"] + steps[-max_steps:]

    return "\n".join(steps)
158
+
159
+
160
def _build_watcher_prompt(
    trajectory: str,
    task: str,
    step: int,
    max_steps: int,
    session_summary: str,
) -> str:
    """Build the prompt for the LLM watcher.

    Args:
        trajectory: Compressed trajectory string from compress_trajectory().
        task: The orchestrator's original task description.
        step: Current step number.
        max_steps: Step budget for the run.
        session_summary: One-line summary of delegated session states.

    Returns:
        A single prompt string instructing the model to reply with a JSON
        object containing "status", "assessment", and "guidance" fields
        (parsed downstream by LLMWatcher._parse_response).
    """
    return f"""You are a trajectory watcher observing an orchestrator agent. Your job is to assess whether the agent is on track and provide guidance if needed.

## Original Task
{task}

## Progress
Step {step}/{max_steps}

## Active Sessions
{session_summary}

## Trajectory (recent steps)
{trajectory}

---

Analyze this trajectory and respond with a JSON object:
{{
    "status": "ok" | "concern" | "problem",
    "assessment": "Brief 1-2 sentence assessment of trajectory health",
    "guidance": "If status is concern/problem, specific actionable guidance. Otherwise null."
}}

Things to watch for:
- Is the agent making progress toward the task?
- Is it spinning or repeating actions?
- Is it going off on tangents unrelated to the task?
- Is it delegating appropriately or trying to do everything directly?
- Are sessions being completed or just started and abandoned?

Be concise. Only flag real issues, not minor inefficiencies."""
199
+
200
+
201
+ @register_watcher("llm")
202
+ class LLMWatcher(Watcher):
203
+ """
204
+ LLM-based watcher for nuanced trajectory analysis.
205
+
206
+ Uses a language model to assess the orchestrator's trajectory
207
+ and provide context-aware guidance that rule-based watchers can't.
208
+
209
+ Config options:
210
+ model: Model to use (default: gpt-4o-mini)
211
+ threshold: How often to run (every N steps, default: 5)
212
+ temperature: LLM temperature (default: 0.3)
213
+ """
214
+
215
+ name = "llm"
216
+ description = "LLM-based trajectory analysis for nuanced guidance"
217
+
218
+ async def observe(self, ctx: WatcherContext) -> WatcherResult:
219
+ config = self.config
220
+ threshold = config.get("threshold", 5)
221
+ model = config.get("model", "gpt-4o-mini")
222
+ temperature = config.get("temperature", 0.3)
223
+
224
+ # Only run every N steps to save costs
225
+ if ctx.step % threshold != 0 or ctx.step == 0:
226
+ return WatcherResult.ok()
227
+
228
+ try:
229
+ # Compress trajectory
230
+ trajectory = compress_trajectory(ctx.messages)
231
+
232
+ # Build session summary
233
+ active = [s for s in ctx.sessions if s.get("status") == "running"]
234
+ completed = [s for s in ctx.sessions if s.get("status") == "completed"]
235
+ failed = [s for s in ctx.sessions if s.get("status") == "failed"]
236
+ session_summary = f"{len(active)} running, {len(completed)} completed, {len(failed)} failed"
237
+
238
+ # Build prompt
239
+ prompt = _build_watcher_prompt(
240
+ trajectory=trajectory,
241
+ task=ctx.task,
242
+ step=ctx.step,
243
+ max_steps=ctx.max_steps,
244
+ session_summary=session_summary,
245
+ )
246
+
247
+ # Call LLM
248
+ response = await self._call_llm(prompt, model, temperature)
249
+
250
+ # Parse response
251
+ result = self._parse_response(response)
252
+
253
+ if result["status"] == "ok":
254
+ return WatcherResult.ok()
255
+ elif result["status"] == "concern":
256
+ return WatcherResult.nudge(
257
+ guidance=result.get("guidance", result["assessment"]),
258
+ reason=f"LLM assessment: {result['assessment']}",
259
+ metadata={"llm_response": result},
260
+ )
261
+ else: # problem
262
+ return WatcherResult.nudge(
263
+ guidance=result.get("guidance", result["assessment"]),
264
+ reason=f"LLM detected problem: {result['assessment']}",
265
+ priority=10, # Higher priority for problems
266
+ metadata={"llm_response": result},
267
+ )
268
+
269
+ except Exception as e:
270
+ logger.warning(f"LLM watcher failed: {e}")
271
+ return WatcherResult.ok() # Don't block on watcher failure
272
+
273
+ async def _call_llm(self, prompt: str, model: str, temperature: float) -> str:
274
+ """Call the LLM using OpenAI Responses API."""
275
+ import openai
276
+
277
+ client = openai.AsyncOpenAI()
278
+
279
+ # Use Responses API (consistent with wbal)
280
+ response = await client.responses.create(
281
+ model=model,
282
+ input=[{"role": "user", "content": prompt}],
283
+ temperature=temperature,
284
+ text={"format": {"type": "json_object"}},
285
+ )
286
+
287
+ # Extract text from response
288
+ output_text = getattr(response, "output_text", None)
289
+ if output_text:
290
+ return output_text
291
+
292
+ # Fallback: look through output items
293
+ for item in getattr(response, "output", []):
294
+ if getattr(item, "type", None) == "message":
295
+ for content in getattr(item, "content", []):
296
+ if getattr(content, "type", None) == "output_text":
297
+ return getattr(content, "text", "{}")
298
+ # Also check for direct text attribute
299
+ text = getattr(item, "text", None)
300
+ if text:
301
+ return text
302
+
303
+ return "{}"
304
+
305
+ def _parse_response(self, response: str) -> dict[str, Any]:
306
+ """Parse LLM response JSON."""
307
+ try:
308
+ result = json.loads(response)
309
+ # Validate required fields
310
+ if "status" not in result:
311
+ result["status"] = "ok"
312
+ if "assessment" not in result:
313
+ result["assessment"] = "No assessment provided"
314
+ return result
315
+ except json.JSONDecodeError:
316
+ return {
317
+ "status": "ok",
318
+ "assessment": "Failed to parse LLM response",
319
+ }
@@ -0,0 +1,181 @@
1
+ """
2
+ Watcher manager for running multiple watchers.
3
+
4
+ Handles:
5
+ - Running watchers in parallel
6
+ - Combining results by priority
7
+ - Injecting guidance into orchestrator
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import asyncio
13
+ from dataclasses import dataclass, field
14
+ from typing import Any
15
+
16
+ import weave
17
+
18
+ from zwarm.watchers.base import Watcher, WatcherContext, WatcherResult, WatcherAction
19
+ from zwarm.watchers.registry import get_watcher
20
+
21
+
22
@dataclass
class WatcherConfig:
    """Configuration for a watcher instance."""

    # Registered watcher name (looked up via the watcher registry).
    name: str
    # When False, the watcher is skipped entirely at manager construction.
    enabled: bool = True
    # Free-form options forwarded to the watcher's constructor.
    config: dict[str, Any] = field(default_factory=dict)
29
+
30
+
31
class WatcherManager:
    """
    Manages and runs multiple watchers.

    Watchers run in parallel and results are combined by severity
    (ABORT > PAUSE > NUDGE > CONTINUE), then by descending priority.
    """

    # Severity ranking used when combining results (lower sorts first).
    # Hoisted to a class constant so the dict isn't rebuilt on every
    # key-function invocation during sorting.
    _ACTION_ORDER = {
        WatcherAction.ABORT: 0,
        WatcherAction.PAUSE: 1,
        WatcherAction.NUDGE: 2,
        WatcherAction.CONTINUE: 3,
    }

    def __init__(self, watcher_configs: list[WatcherConfig | dict] | None = None):
        """
        Initialize manager with watcher configurations.

        Args:
            watcher_configs: List of WatcherConfig or dicts with watcher configs.
                Dicts are coerced to WatcherConfig. Disabled entries and
                unknown watcher names are skipped (best-effort loading).
        """
        self._watchers: list[Watcher] = []
        self._results_history: list[tuple[str, WatcherResult]] = []

        # Load watchers from configs
        for cfg in watcher_configs or []:
            if isinstance(cfg, dict):
                cfg = WatcherConfig(**cfg)

            if cfg.enabled:
                try:
                    watcher = get_watcher(cfg.name, cfg.config)
                    self._watchers.append(watcher)
                except ValueError:
                    # Unknown watcher name: skip rather than fail startup.
                    pass

    def add_watcher(self, watcher: Watcher) -> None:
        """Add a watcher instance."""
        self._watchers.append(watcher)

    @weave.op()
    async def _run_single_watcher(
        self,
        watcher_name: str,
        watcher: Watcher,
        ctx: WatcherContext,
    ) -> dict[str, Any]:
        """Run a single watcher - traced by Weave.

        Returns a plain dict (not a WatcherResult) so the trace payload is
        serializable; exceptions are captured in the dict, not propagated.
        """
        try:
            result = await watcher.observe(ctx)
            return {
                "watcher": watcher_name,
                "action": result.action.value,
                "priority": result.priority,
                "reason": result.reason,
                "guidance": result.guidance,
                "metadata": result.metadata,
                "success": True,
            }
        except Exception as e:
            return {
                "watcher": watcher_name,
                "success": False,
                "error": str(e),
            }

    @weave.op()
    async def observe(self, ctx: WatcherContext) -> WatcherResult:
        """
        Run all watchers and return combined result.

        Results are combined by priority:
        - ABORT takes precedence over everything
        - PAUSE takes precedence over NUDGE
        - NUDGE takes precedence over CONTINUE
        - Within same action, higher priority wins

        Args:
            ctx: Context for watchers

        Returns:
            Combined WatcherResult
        """
        if not self._watchers:
            return WatcherResult.ok()

        # Run all watchers in parallel - each traced individually
        tasks = [
            self._run_single_watcher(watcher.name, watcher, ctx)
            for watcher in self._watchers
        ]
        watcher_outputs = await asyncio.gather(*tasks)

        # Rehydrate successful outputs into WatcherResults. gather() preserves
        # input order, so zip pairs each watcher with its own output.
        valid_results: list[tuple[str, WatcherResult]] = []
        for watcher, output in zip(self._watchers, watcher_outputs):
            if not output.get("success"):
                # Failed watcher: skip (its error is recorded in the trace).
                continue
            result = WatcherResult(
                action=WatcherAction(output["action"]),
                priority=output["priority"],
                reason=output.get("reason"),
                guidance=output.get("guidance"),
                metadata=output.get("metadata", {}),
            )
            valid_results.append((watcher.name, result))
            self._results_history.append((watcher.name, result))

        if not valid_results:
            return WatcherResult.ok()

        # Sort by action severity (abort > pause > nudge > continue),
        # then by descending priority within the same action.
        def sort_key(item: tuple[str, WatcherResult]) -> tuple[int, int]:
            _, result = item
            return (self._ACTION_ORDER[result.action], -result.priority)

        valid_results.sort(key=sort_key)

        # Return highest priority non-continue result
        for name, result in valid_results:
            if result.action != WatcherAction.CONTINUE:
                # Add which watcher triggered this
                result.metadata["triggered_by"] = name
                return result

        return WatcherResult.ok()

    def get_history(self) -> list[tuple[str, WatcherResult]]:
        """Get history of all watcher results (shallow copy; safe to mutate)."""
        return list(self._results_history)

    def clear_history(self) -> None:
        """Clear results history."""
        self._results_history.clear()
166
+
167
+
168
def build_watcher_manager(
    config: dict[str, Any] | None = None
) -> WatcherManager:
    """
    Build a WatcherManager from configuration.

    Args:
        config: Dict with "watchers" key containing list of watcher configs

    Returns:
        Configured WatcherManager
    """
    # Treat a missing config the same as an empty one.
    cfg = config or {}
    return WatcherManager(cfg.get("watchers", []))
@@ -0,0 +1,57 @@
1
+ """
2
+ Watcher registry for discovering and instantiating watchers.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any, Type
8
+
9
+ from zwarm.watchers.base import Watcher
10
+
11
+
12
# Global watcher registry: maps registered name -> watcher class.
# Populated as a side effect of the @register_watcher decorator.
_WATCHERS: dict[str, Type[Watcher]] = {}
14
+
15
+
16
def register_watcher(name: str):
    """
    Class decorator that records a watcher class in the global registry.

    The decorated class has its ``name`` attribute set to *name* and is
    stored so ``get_watcher(name)`` can instantiate it later.

    Example:
        @register_watcher("progress")
        class ProgressWatcher(Watcher):
            ...
    """

    def _record(watcher_cls: Type[Watcher]) -> Type[Watcher]:
        watcher_cls.name = name
        _WATCHERS[name] = watcher_cls
        return watcher_cls

    return _record
32
+
33
+
34
def get_watcher(name: str, config: dict[str, Any] | None = None) -> Watcher:
    """
    Get a watcher instance by name.

    Args:
        name: Registered watcher name
        config: Optional config to pass to watcher

    Returns:
        Instantiated watcher

    Raises:
        ValueError: If watcher not found
    """
    try:
        watcher_cls = _WATCHERS[name]
    except KeyError:
        raise ValueError(
            f"Unknown watcher: {name}. Available: {list(_WATCHERS.keys())}"
        ) from None
    return watcher_cls(config)
53
+
54
+
55
def list_watchers() -> list[str]:
    """Return the names of all registered watchers."""
    # Iterating the registry yields its keys; unpack into a fresh list.
    return [*_WATCHERS]