zwarm 2.3__py3-none-any.whl → 3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zwarm/cli/main.py +210 -1
- zwarm/cli/pilot.py +1000 -0
- zwarm/core/environment.py +51 -32
- zwarm/orchestrator.py +8 -3
- zwarm/prompts/__init__.py +3 -0
- zwarm/prompts/orchestrator.py +36 -29
- zwarm/prompts/pilot.py +147 -0
- zwarm/sessions/manager.py +112 -0
- zwarm/tools/delegation.py +151 -28
- zwarm/watchers/__init__.py +5 -0
- zwarm/watchers/llm_watcher.py +319 -0
- {zwarm-2.3.dist-info → zwarm-3.0.dist-info}/METADATA +1 -1
- {zwarm-2.3.dist-info → zwarm-3.0.dist-info}/RECORD +15 -12
- {zwarm-2.3.dist-info → zwarm-3.0.dist-info}/WHEEL +0 -0
- {zwarm-2.3.dist-info → zwarm-3.0.dist-info}/entry_points.txt +0 -0
zwarm/tools/delegation.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_session_manager(orchestrator: "Orchestrator"):
|
|
|
33
33
|
|
|
34
34
|
Both `zwarm interactive` and `zwarm orchestrate` use the same session manager.
|
|
35
35
|
The orchestrator is just another user that happens to be an LLM.
|
|
36
|
+
|
|
37
|
+
The session manager is created eagerly in Orchestrator.model_post_init()
|
|
38
|
+
and shared with the environment for observe() visibility.
|
|
36
39
|
"""
|
|
37
|
-
|
|
40
|
+
# Should already exist from model_post_init, but create if not
|
|
41
|
+
if not hasattr(orchestrator, "_session_manager") or orchestrator._session_manager is None:
|
|
38
42
|
from zwarm.sessions import CodexSessionManager
|
|
39
43
|
orchestrator._session_manager = CodexSessionManager(orchestrator.working_dir / ".zwarm")
|
|
40
44
|
return orchestrator._session_manager
|
|
@@ -154,7 +158,7 @@ def _validate_working_dir(
|
|
|
154
158
|
def delegate(
|
|
155
159
|
self: "Orchestrator",
|
|
156
160
|
task: str,
|
|
157
|
-
mode: Literal["sync", "async"] = "
|
|
161
|
+
mode: Literal["sync", "async"] = "async",
|
|
158
162
|
model: str | None = None,
|
|
159
163
|
working_dir: str | None = None,
|
|
160
164
|
) -> dict[str, Any]:
|
|
@@ -162,28 +166,34 @@ def delegate(
|
|
|
162
166
|
Delegate work to a Codex agent.
|
|
163
167
|
|
|
164
168
|
This spawns a codex session - the exact same way `zwarm interactive` does.
|
|
165
|
-
Two modes available:
|
|
166
169
|
|
|
167
|
-
**
|
|
168
|
-
|
|
170
|
+
**NOTE: All sessions run async.** The mode parameter is ignored - sessions
|
|
171
|
+
always return immediately. Use sleep() + peek_session() to poll for completion.
|
|
169
172
|
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
+
Async workflow pattern:
|
|
174
|
+
1. delegate(task="Add logout button") -> session_id
|
|
175
|
+
2. sleep(30) -> give it time
|
|
176
|
+
3. peek_session(session_id) -> check if done
|
|
177
|
+
4. Repeat 2-3 if still running
|
|
178
|
+
5. check_session(session_id) -> get full results
|
|
173
179
|
|
|
174
180
|
Args:
|
|
175
181
|
task: Clear description of what to do. Be specific about requirements.
|
|
176
|
-
mode:
|
|
182
|
+
mode: IGNORED - always async. (Legacy parameter, will be removed.)
|
|
177
183
|
model: Model override (default: gpt-5.1-codex-mini).
|
|
178
184
|
working_dir: Directory for codex to work in (default: orchestrator's dir).
|
|
179
185
|
|
|
180
186
|
Returns:
|
|
181
|
-
{session_id, status,
|
|
187
|
+
{session_id, status: "running", task, hint}
|
|
182
188
|
|
|
183
189
|
Example:
|
|
184
|
-
delegate(task="Add a logout button to the navbar"
|
|
185
|
-
|
|
190
|
+
delegate(task="Add a logout button to the navbar")
|
|
191
|
+
sleep(30)
|
|
192
|
+
peek_session(session_id) # Check progress
|
|
186
193
|
"""
|
|
194
|
+
# Force async mode - sync is deprecated
|
|
195
|
+
# TODO: Remove sync codepath entirely (see STATE.md)
|
|
196
|
+
mode = "async"
|
|
187
197
|
# Validate working directory
|
|
188
198
|
effective_dir, dir_error = _validate_working_dir(
|
|
189
199
|
working_dir,
|
|
@@ -293,7 +303,7 @@ def converse(
|
|
|
293
303
|
self: "Orchestrator",
|
|
294
304
|
session_id: str,
|
|
295
305
|
message: str,
|
|
296
|
-
wait: bool =
|
|
306
|
+
wait: bool = False,
|
|
297
307
|
) -> dict[str, Any]:
|
|
298
308
|
"""
|
|
299
309
|
Continue a conversation with a codex session.
|
|
@@ -301,29 +311,26 @@ def converse(
|
|
|
301
311
|
This injects a follow-up message into the session, providing the
|
|
302
312
|
conversation history as context. Like chatting with a developer.
|
|
303
313
|
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
- **wait=False**: Fire-and-forget. Message sent, codex runs in background.
|
|
307
|
-
Use check_session() later to see the response.
|
|
314
|
+
**NOTE: Always runs async.** The wait parameter is ignored - messages
|
|
315
|
+
are sent and return immediately. Use sleep() + check_session() to poll.
|
|
308
316
|
|
|
309
317
|
Args:
|
|
310
318
|
session_id: The session to continue (from delegate() result).
|
|
311
319
|
message: Your next message to codex.
|
|
312
|
-
wait:
|
|
320
|
+
wait: IGNORED - always async. (Legacy parameter, will be removed.)
|
|
313
321
|
|
|
314
322
|
Returns:
|
|
315
|
-
{session_id,
|
|
316
|
-
|
|
317
|
-
Example (sync):
|
|
318
|
-
result = delegate(task="Add user authentication")
|
|
319
|
-
converse(session_id=result["session_id"], message="Use JWT")
|
|
320
|
-
# Returns with response
|
|
323
|
+
{session_id, turn, status: "running"}
|
|
321
324
|
|
|
322
|
-
Example
|
|
323
|
-
converse(session_id="abc123", message="Add tests"
|
|
324
|
-
|
|
325
|
-
#
|
|
325
|
+
Example:
|
|
326
|
+
converse(session_id="abc123", message="Add tests")
|
|
327
|
+
sleep(30)
|
|
328
|
+
check_session(session_id) # Get response
|
|
326
329
|
"""
|
|
330
|
+
# Force async mode - sync is deprecated
|
|
331
|
+
# TODO: Remove sync codepath entirely (see STATE.md)
|
|
332
|
+
wait = False
|
|
333
|
+
|
|
327
334
|
manager = _get_session_manager(self)
|
|
328
335
|
|
|
329
336
|
# Get current session
|
|
@@ -517,6 +524,81 @@ def peek_session(
|
|
|
517
524
|
}
|
|
518
525
|
|
|
519
526
|
|
|
527
|
+
def _format_trajectory_step(step: dict[str, Any], full: bool) -> str | None:
    """Render one trajectory step as a display string, or None for unknown step types."""
    import json

    step_type = step.get("type", "unknown")

    if step_type in ("reasoning", "message"):
        label = "thinking" if step_type == "reasoning" else "response"
        # Prefer the untruncated text in full mode, but fall back to the summary
        # so a missing/None "full_text" never renders as the literal "None".
        text = (step.get("full_text") if full else None) or step.get("summary") or ""
        return f"[{label}] {text}"

    if step_type == "command":
        line = f"[command] $ {step.get('command', '')}"
        output = step.get("output")
        if output:
            output = str(output)
            if not full and len(output) > 100:
                output = output[:100] + "..."
            line += f"\n → {output}"
        exit_code = step.get("exit_code")
        if exit_code:  # only surface non-zero exit codes
            line += f" (exit: {exit_code})"
        return line

    if step_type == "tool_call":
        tool = step.get("tool", "unknown")
        if full and step.get("full_args"):
            args_str = json.dumps(step["full_args"], indent=2)
            return f"[tool] {tool}\n {args_str}"
        return f"[tool] {tool}({step.get('args_preview', '')})"

    if step_type == "tool_output":
        # "output" may be present but None; coerce so slicing cannot fail.
        output = str(step.get("output") or "")
        if not full and len(output) > 100:
            # Mark truncation the same way the command branch does.
            output = output[:100] + "..."
        return f"[result] {output}"

    return None


@weaveTool
def get_trajectory(
    self: "Orchestrator",
    session_id: str,
    full: bool = False,
) -> dict[str, Any]:
    """
    Get the full trajectory of a session - all steps the agent took.

    Shows reasoning, commands, tool calls, and responses in order.
    Useful for understanding HOW the agent completed a task, not just
    the final result.

    Args:
        session_id: The session to get trajectory for.
        full: If True, include full untruncated content (default: False for summary view).

    Returns:
        On success:
            {success: True, session_id, task, step_count, steps: [...], mode}
        where mode is "full" or "summary". step_count is the number of raw
        trajectory entries; entries of an unrecognized type are not rendered,
        so len(steps) can be smaller than step_count.

        If the session is not found:
            {success: False, error}
    """
    manager = _get_session_manager(self)

    session = manager.get_session(session_id)
    if not session:
        return {"success": False, "error": f"Unknown session: {session_id}"}

    trajectory = manager.get_trajectory(session_id, full=full)

    # Format steps for easy reading, skipping step types we don't know how to render
    formatted_steps = []
    for step in trajectory:
        rendered = _format_trajectory_step(step, full)
        if rendered is not None:
            formatted_steps.append(rendered)

    return {
        "success": True,
        "session_id": session.short_id,
        "task": _truncate(session.task, 80),
        "step_count": len(trajectory),
        "steps": formatted_steps,
        "mode": "full" if full else "summary",
    }
|
|
600
|
+
|
|
601
|
+
|
|
520
602
|
@weaveTool
|
|
521
603
|
def end_session(
|
|
522
604
|
self: "Orchestrator",
|
|
@@ -703,3 +785,44 @@ def list_sessions(
|
|
|
703
785
|
"filter": status or "all",
|
|
704
786
|
"hint": "Sessions with needs_attention=True have new responses to review" if needs_attention_count else None,
|
|
705
787
|
}
|
|
788
|
+
|
|
789
|
+
|
|
790
|
+
@weaveTool
def sleep(self, seconds: float) -> dict[str, Any]:
    """
    Sleep for a specified number of seconds.

    Use this after starting sessions with delegate() or converse() to give
    them time to complete before checking their status. This lets you
    manage your own polling loop:

    1. delegate(task) -> start background work
    2. sleep(10) -> wait a bit
    3. peek_session(id) -> check if done
    4. Repeat 2-3 if still running

    Args:
        seconds: Number of seconds to sleep (max 300 = 5 minutes)

    Returns:
        On success: {success: True, slept_seconds, capped, max_allowed}
        where max_allowed is only set when the request was capped.
        On invalid input (non-numeric, NaN, zero or negative):
        {success: False, error, requested} - no sleep is performed.
    """
    # Cap at 5 minutes to prevent accidental long hangs
    max_sleep = 300.0

    # Tool arguments may arrive as strings; convert once and use the converted
    # value for every comparison so a numeric string cannot fail after sleeping.
    try:
        requested = float(seconds)
    except (TypeError, ValueError):
        return {
            "success": False,
            "error": "Sleep duration must be a number",
            "requested": seconds,
        }

    # "not > 0" (rather than "<= 0") also rejects NaN, which time.sleep() refuses.
    if not requested > 0:
        return {
            "success": False,
            "error": "Sleep duration must be positive",
            "requested": seconds,
        }

    actual_seconds = min(requested, max_sleep)
    time.sleep(actual_seconds)

    capped = actual_seconds < requested
    return {
        "success": True,
        "slept_seconds": actual_seconds,
        "capped": capped,
        "max_allowed": max_sleep if capped else None,
    }
|
zwarm/watchers/__init__.py
CHANGED
|
@@ -11,6 +11,10 @@ from zwarm.watchers.manager import WatcherManager, WatcherConfig, build_watcher_
|
|
|
11
11
|
|
|
12
12
|
# Import built-in watchers to register them
|
|
13
13
|
from zwarm.watchers import builtin as _builtin # noqa: F401
|
|
14
|
+
from zwarm.watchers import llm_watcher as _llm_watcher # noqa: F401
|
|
15
|
+
|
|
16
|
+
# Export trajectory compression utility
|
|
17
|
+
from zwarm.watchers.llm_watcher import compress_trajectory
|
|
14
18
|
|
|
15
19
|
__all__ = [
|
|
16
20
|
"Watcher",
|
|
@@ -23,4 +27,5 @@ __all__ = [
|
|
|
23
27
|
"get_watcher",
|
|
24
28
|
"list_watchers",
|
|
25
29
|
"build_watcher_manager",
|
|
30
|
+
"compress_trajectory",
|
|
26
31
|
]
|
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LLM-based watcher for nuanced trajectory analysis.
|
|
3
|
+
|
|
4
|
+
Unlike rule-based watchers, this watcher uses a language model to assess
|
|
5
|
+
the orchestrator's trajectory and provide context-aware guidance.
|
|
6
|
+
|
|
7
|
+
The watcher compresses the full message history into a compact trajectory
|
|
8
|
+
representation (similar to what Codex shows in its UI) to minimize token
|
|
9
|
+
usage while preserving the "shape" of the agent's behavior.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import logging
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from zwarm.watchers.base import Watcher, WatcherContext, WatcherResult
|
|
19
|
+
from zwarm.watchers.registry import register_watcher
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _get_field(item: Any, name: str, default: Any = None) -> Any:
|
|
25
|
+
"""Get field from dict or object."""
|
|
26
|
+
if isinstance(item, dict):
|
|
27
|
+
return item.get(name, default)
|
|
28
|
+
return getattr(item, name, default)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _extract_tool_call_summary(tc: Any) -> str:
|
|
32
|
+
"""Extract a compact summary of a tool call."""
|
|
33
|
+
if isinstance(tc, dict):
|
|
34
|
+
func = tc.get("function", tc)
|
|
35
|
+
name = func.get("name", tc.get("name", "?"))
|
|
36
|
+
args = func.get("arguments", tc.get("arguments", ""))
|
|
37
|
+
else:
|
|
38
|
+
name = getattr(tc, "name", "?")
|
|
39
|
+
args = getattr(tc, "arguments", "")
|
|
40
|
+
|
|
41
|
+
# Parse args if JSON string
|
|
42
|
+
if isinstance(args, str):
|
|
43
|
+
try:
|
|
44
|
+
args = json.loads(args)
|
|
45
|
+
except (json.JSONDecodeError, TypeError):
|
|
46
|
+
pass
|
|
47
|
+
|
|
48
|
+
# Create compact arg summary
|
|
49
|
+
if isinstance(args, dict):
|
|
50
|
+
# Show key args based on tool type
|
|
51
|
+
if name == "delegate":
|
|
52
|
+
task = args.get("task", "")[:50]
|
|
53
|
+
mode = args.get("mode", "sync")
|
|
54
|
+
return f"delegate({mode}): {task}..."
|
|
55
|
+
elif name == "converse":
|
|
56
|
+
msg = args.get("message", "")[:40]
|
|
57
|
+
return f"converse: {msg}..."
|
|
58
|
+
elif name == "bash":
|
|
59
|
+
cmd = args.get("command", "")[:60]
|
|
60
|
+
return f"$ {cmd}"
|
|
61
|
+
elif name in ("check_session", "peek_session", "end_session"):
|
|
62
|
+
sid = args.get("session_id", "")[:8]
|
|
63
|
+
return f"{name}({sid})"
|
|
64
|
+
elif name == "list_sessions":
|
|
65
|
+
return "list_sessions()"
|
|
66
|
+
else:
|
|
67
|
+
# Generic: show first arg
|
|
68
|
+
first_val = next(iter(args.values()), "") if args else ""
|
|
69
|
+
if isinstance(first_val, str) and len(first_val) > 30:
|
|
70
|
+
first_val = first_val[:30] + "..."
|
|
71
|
+
return f"{name}({first_val})"
|
|
72
|
+
else:
|
|
73
|
+
return f"{name}({str(args)[:30]})"
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _message_text(content: Any) -> str:
    """Flatten message content (None, a string, or a list of content parts) to plain text."""
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        # presumably parts carry their text under "text"; parts without it are skipped
        texts = []
        for part in content:
            text = part if isinstance(part, str) else _get_field(part, "text", "")
            if text:
                texts.append(str(text))
        return "\n".join(texts)
    return str(content)


def compress_trajectory(messages: list[dict[str, Any]], max_steps: int = 50) -> str:
    """
    Compress full message history into a compact trajectory representation.

    Output format (similar to Codex UI):
    ```
    [1] thinking: "preparing to inspect the codebase"
     → delegate(async): Add authentication to...
    [2] thinking: "checking session status"
     → check_session(abc123)
    [3] thinking: "session completed, verifying"
     → $ pytest tests/
    ```

    System messages are skipped. Each assistant message becomes one numbered
    step (first line of its content plus up to 3 tool calls). Tool results are
    only noted when their content mentions "error" or "failed". User messages
    that arrive after the first assistant step are noted as a watcher nudge
    (content contains "[WATCHER") or as a short user preview.

    Args:
        messages: Full message history from orchestrator (dicts or objects
            exposing role / content / tool_calls).
        max_steps: Maximum number of output entries to keep (most recent).
            Annotation lines (tool errors, nudges, user previews) count as
            entries too. Zero or negative keeps none.

    Returns:
        Compact trajectory string, prefixed with an "earlier steps omitted"
        marker when entries were dropped.
    """
    steps = []
    step_num = 0

    for msg in messages:
        role = _get_field(msg, "role", "")

        if role == "system":
            continue  # Skip system messages

        if role == "assistant":
            step_num += 1
            # Content is not guaranteed to be a string (None, or a list of
            # parts), so flatten it before splitting.
            content = _message_text(_get_field(msg, "content", ""))
            tool_calls = _get_field(msg, "tool_calls", [])

            # Take first line or first 80 chars as "thinking"
            thinking = ""
            if content:
                first_line = content.split("\n")[0].strip()
                thinking = first_line[:80] + "..." if len(first_line) > 80 else first_line

            # Max 3 tool calls per step, then a "+N more" marker
            actions = []
            if tool_calls:
                actions = [_extract_tool_call_summary(tc) for tc in tool_calls[:3]]
                if len(tool_calls) > 3:
                    actions.append(f"... +{len(tool_calls) - 3} more")

            header = f"[{step_num}]"
            if thinking:
                header += f' thinking: "{thinking}"'
            step_lines = [header] + [f" → {action}" for action in actions]
            steps.append("\n".join(step_lines))

        elif role == "tool":
            # Tool results - just note if error
            content = str(_get_field(msg, "content", "")).lower()
            if "error" in content or "failed" in content:
                steps.append(" ⚠ tool returned error")

        elif role == "user" and step_num > 0:
            # User message mid-conversation (watcher nudge, etc.)
            content = _message_text(_get_field(msg, "content", ""))
            if content and "[WATCHER" in content:
                steps.append(" 📍 watcher nudge")
            elif content:
                preview = content[:50].replace("\n", " ")
                steps.append(f" 💬 user: {preview}...")

    # Take most recent entries. steps[-0:] would be the WHOLE list, so a
    # zero (or negative) limit has to be handled explicitly.
    limit = max(max_steps, 0)
    if len(steps) > limit:
        recent = steps[-limit:] if limit else []
        steps = ["... (earlier steps omitted)"] + recent

    return "\n".join(steps)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _build_watcher_prompt(
|
|
161
|
+
trajectory: str,
|
|
162
|
+
task: str,
|
|
163
|
+
step: int,
|
|
164
|
+
max_steps: int,
|
|
165
|
+
session_summary: str,
|
|
166
|
+
) -> str:
|
|
167
|
+
"""Build the prompt for the LLM watcher."""
|
|
168
|
+
return f"""You are a trajectory watcher observing an orchestrator agent. Your job is to assess whether the agent is on track and provide guidance if needed.
|
|
169
|
+
|
|
170
|
+
## Original Task
|
|
171
|
+
{task}
|
|
172
|
+
|
|
173
|
+
## Progress
|
|
174
|
+
Step {step}/{max_steps}
|
|
175
|
+
|
|
176
|
+
## Active Sessions
|
|
177
|
+
{session_summary}
|
|
178
|
+
|
|
179
|
+
## Trajectory (recent steps)
|
|
180
|
+
{trajectory}
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
Analyze this trajectory and respond with a JSON object:
|
|
185
|
+
{{
|
|
186
|
+
"status": "ok" | "concern" | "problem",
|
|
187
|
+
"assessment": "Brief 1-2 sentence assessment of trajectory health",
|
|
188
|
+
"guidance": "If status is concern/problem, specific actionable guidance. Otherwise null."
|
|
189
|
+
}}
|
|
190
|
+
|
|
191
|
+
Things to watch for:
|
|
192
|
+
- Is the agent making progress toward the task?
|
|
193
|
+
- Is it spinning or repeating actions?
|
|
194
|
+
- Is it going off on tangents unrelated to the task?
|
|
195
|
+
- Is it delegating appropriately or trying to do everything directly?
|
|
196
|
+
- Are sessions being completed or just started and abandoned?
|
|
197
|
+
|
|
198
|
+
Be concise. Only flag real issues, not minor inefficiencies."""
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
@register_watcher("llm")
class LLMWatcher(Watcher):
    """
    LLM-based watcher for nuanced trajectory analysis.

    Uses a language model to assess the orchestrator's trajectory
    and provide context-aware guidance that rule-based watchers can't.

    Config options:
        model: Model to use (default: gpt-4o-mini)
        threshold: How often to run (every N steps, default: 5). Values that
            are not a positive integer fall back to the default.
        temperature: LLM temperature (default: 0.3)

    Any failure while analysing (LLM call, parsing, ...) is logged and
    reported as "ok" so the watcher never blocks the orchestrator.
    """

    name = "llm"
    description = "LLM-based trajectory analysis for nuanced guidance"

    async def observe(self, ctx: WatcherContext) -> WatcherResult:
        """
        Assess the trajectory every ``threshold`` steps.

        Returns ``WatcherResult.ok()`` on off-steps, on an "ok" verdict and on
        any failure; a nudge for "concern"; a priority-10 nudge for "problem".
        """
        config = self.config
        model = config.get("model", "gpt-4o-mini")
        temperature = config.get("temperature", 0.3)

        # A zero threshold would raise ZeroDivisionError in the modulo below
        # (outside the try block), so sanitize it up front.
        default_threshold = 5
        try:
            threshold = int(config.get("threshold", default_threshold))
        except (TypeError, ValueError):
            threshold = 0
        if threshold < 1:
            logger.debug(
                "LLM watcher: invalid threshold %r, using %d",
                config.get("threshold"),
                default_threshold,
            )
            threshold = default_threshold

        # Only run every N steps to save costs
        if ctx.step == 0 or ctx.step % threshold != 0:
            return WatcherResult.ok()

        try:
            # Compress trajectory
            trajectory = compress_trajectory(ctx.messages)

            # Build session summary
            counts = {"running": 0, "completed": 0, "failed": 0}
            for session in ctx.sessions:
                status = session.get("status")
                if status in counts:
                    counts[status] += 1
            session_summary = (
                f"{counts['running']} running, "
                f"{counts['completed']} completed, "
                f"{counts['failed']} failed"
            )

            # Build prompt
            prompt = _build_watcher_prompt(
                trajectory=trajectory,
                task=ctx.task,
                step=ctx.step,
                max_steps=ctx.max_steps,
                session_summary=session_summary,
            )

            # Call LLM
            response = await self._call_llm(prompt, model, temperature)

            # Parse response (status is normalized to ok/concern/problem)
            result = self._parse_response(response)

            if result["status"] == "ok":
                return WatcherResult.ok()

            # The prompt allows "guidance": null, so a present-but-empty value
            # must fall back to the assessment as well as a missing key.
            guidance = result.get("guidance") or result["assessment"]

            if result["status"] == "concern":
                return WatcherResult.nudge(
                    guidance=guidance,
                    reason=f"LLM assessment: {result['assessment']}",
                    metadata={"llm_response": result},
                )

            # problem
            return WatcherResult.nudge(
                guidance=guidance,
                reason=f"LLM detected problem: {result['assessment']}",
                priority=10,  # Higher priority for problems
                metadata={"llm_response": result},
            )

        except Exception as e:
            # Deliberate best-effort boundary: don't block on watcher failure
            logger.warning("LLM watcher failed: %s", e)
            return WatcherResult.ok()

    async def _call_llm(self, prompt: str, model: str, temperature: float) -> str:
        """
        Call the LLM using OpenAI Responses API and return its text output.

        Requests JSON-object output. Returns "{}" when no text can be found
        in the response.
        """
        import openai

        client = openai.AsyncOpenAI()

        # Use Responses API (consistent with wbal)
        response = await client.responses.create(
            model=model,
            input=[{"role": "user", "content": prompt}],
            temperature=temperature,
            text={"format": {"type": "json_object"}},
        )

        # Extract text from response
        output_text = getattr(response, "output_text", None)
        if output_text:
            return output_text

        # Fallback: look through output items
        for item in getattr(response, "output", []):
            if getattr(item, "type", None) == "message":
                for content in getattr(item, "content", []):
                    if getattr(content, "type", None) == "output_text":
                        return getattr(content, "text", None) or "{}"
            # Also check for direct text attribute
            text = getattr(item, "text", None)
            if text:
                return text

        return "{}"

    def _parse_response(self, response: str) -> dict[str, Any]:
        """
        Parse LLM response JSON into a dict with a usable verdict.

        The returned dict always has ``status`` (one of "ok", "concern",
        "problem") and a non-empty ``assessment``. Anything that cannot be
        interpreted - invalid JSON, a non-object JSON value, or an
        unrecognized status - is treated as "ok" rather than escalated.
        """
        unparseable = {
            "status": "ok",
            "assessment": "Failed to parse LLM response",
        }

        try:
            result = json.loads(response)
        except (json.JSONDecodeError, TypeError):
            return unparseable
        if not isinstance(result, dict):
            return unparseable

        # Normalize case/whitespace so "OK" or " Problem " are still recognized.
        status = str(result.get("status", "ok")).strip().lower()
        result["status"] = status if status in ("ok", "concern", "problem") else "ok"

        if not result.get("assessment"):
            result["assessment"] = "No assessment provided"
        return result
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
zwarm/__init__.py,sha256=3i3LMjHwIzE-LFIS2aUrwv3EZmpkvVMe-xj1h97rcSM,837
|
|
2
|
-
zwarm/orchestrator.py,sha256=
|
|
2
|
+
zwarm/orchestrator.py,sha256=JGRGuJP05Nf5QibuWytjQAC_NuGGaGUR3G-tLq4SVxY,23624
|
|
3
3
|
zwarm/test_orchestrator_watchers.py,sha256=QpoaehPU7ekT4XshbTOWnJ2H0wRveV3QOZjxbgyJJLY,807
|
|
4
4
|
zwarm/adapters/__init__.py,sha256=O0b-SfZpb6txeNqFkXZ2aaf34yLFYreznyrAV25jF_Q,656
|
|
5
5
|
zwarm/adapters/base.py,sha256=fZlQviTgVvOcwnxduTla6WuM6FzQJ_yoHMW5SxwVgQg,2527
|
|
@@ -9,29 +9,32 @@ zwarm/adapters/registry.py,sha256=EdyHECaNA5Kv1od64pYFBJyA_r_6I1r_eJTNP1XYLr4,17
|
|
|
9
9
|
zwarm/adapters/test_codex_mcp.py,sha256=0qhVzxn_KF-XUS30gXSJKwMdR3kWGsDY9iPk1Ihqn3w,10698
|
|
10
10
|
zwarm/adapters/test_registry.py,sha256=otxcVDONwFCMisyANToF3iy7Y8dSbCL8bTmZNhxNuF4,2383
|
|
11
11
|
zwarm/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
-
zwarm/cli/main.py,sha256=
|
|
12
|
+
zwarm/cli/main.py,sha256=ztGb0jNHO6NkRT9Qg-4_gAg1U1lMNZUyjJDY5wQyZ-k,97634
|
|
13
|
+
zwarm/cli/pilot.py,sha256=Gg0c2x-aUJuG2grpNWJd7f0K0NQna9cAOnjfiufDsHo,33035
|
|
13
14
|
zwarm/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
15
|
zwarm/core/compact.py,sha256=Y8C7Gs-5-WOU43WRvQ863Qzd5xtuEqR6Aw3r2p8_-i8,10907
|
|
15
16
|
zwarm/core/config.py,sha256=331i4io9uEnloFwUMjTPJ5_lQFKJR1nhTpA4SPfSpiI,11748
|
|
16
|
-
zwarm/core/environment.py,sha256=
|
|
17
|
+
zwarm/core/environment.py,sha256=zrgh0N3Ng4HI2F1gCYkcQVGzjQPKiIFWuRe1OPRuRn0,6558
|
|
17
18
|
zwarm/core/models.py,sha256=PrC3okRBVJxISUa1Fax4KkagqLT6Xub-kTxC9drN0sY,10083
|
|
18
19
|
zwarm/core/state.py,sha256=MzrvODKEiJovI7YI1jajW4uukineZ3ezmW5oQinMgjg,11563
|
|
19
20
|
zwarm/core/test_compact.py,sha256=WSdjCB5t4YMcknsrkmJIUsVOPY28s4y9GnDmu3Z4BFw,11878
|
|
20
21
|
zwarm/core/test_config.py,sha256=26ozyiFOdjFF2c9Q-HDfFM6GOLfgw_5FZ55nTDMNYA8,4888
|
|
21
22
|
zwarm/core/test_models.py,sha256=sWTIhMZvuLP5AooGR6y8OR2EyWydqVfhmGrE7NPBBnk,8450
|
|
22
|
-
zwarm/prompts/__init__.py,sha256=
|
|
23
|
-
zwarm/prompts/orchestrator.py,sha256
|
|
23
|
+
zwarm/prompts/__init__.py,sha256=DI307o712F8qQyDt5vwnFgpVBrxpKwjhr0MaBHLzr9E,334
|
|
24
|
+
zwarm/prompts/orchestrator.py,sha256=AkVbEpT91QbYFjUYOzm0d37wXrpm0esLBD1MG_W-3FI,15367
|
|
25
|
+
zwarm/prompts/pilot.py,sha256=BcaV04-43FZyrtmoqCbA7DqnTlQ330TcDp9wNGhRojo,5586
|
|
24
26
|
zwarm/sessions/__init__.py,sha256=jRibY8IfmNcnkgNmrgK2T81oa1w71wP_KQp9A1hPL7Q,568
|
|
25
|
-
zwarm/sessions/manager.py,sha256=
|
|
27
|
+
zwarm/sessions/manager.py,sha256=Aq7Wh-WW7ZMP8LgGa3g70wfGg6E2GYjJOBucy6HUfGc,27700
|
|
26
28
|
zwarm/tools/__init__.py,sha256=FpqxwXJA6-fQ7C-oLj30jjK_0qqcE7MbI0dQuaB56kU,290
|
|
27
|
-
zwarm/tools/delegation.py,sha256=
|
|
28
|
-
zwarm/watchers/__init__.py,sha256=
|
|
29
|
+
zwarm/tools/delegation.py,sha256=LUf48Z2aXVvDxgScYMTwOICJ2jq0KB1DWmf6VA7BhXU,26442
|
|
30
|
+
zwarm/watchers/__init__.py,sha256=a96s7X6ruYkF2ItWWOZ3Q5QUOMOoeCW4Vz8XXcYLXPM,956
|
|
29
31
|
zwarm/watchers/base.py,sha256=r1GoPlj06nOT2xp4fghfSjxbRyFFFQUB6HpZbEyO2OY,3834
|
|
30
32
|
zwarm/watchers/builtin.py,sha256=IL5QwwKOIqWEfJ_uQWb321Px4i5OLtI_vnWQMudqKoA,19064
|
|
33
|
+
zwarm/watchers/llm_watcher.py,sha256=yJGpE3BGKNZX3qgPsiNtJ5d3UJpiTT1V-A-Rh4AiMYM,11029
|
|
31
34
|
zwarm/watchers/manager.py,sha256=XZjBVeHjgCUlkTUeHqdvBvHoBC862U1ik0fG6nlRGog,5587
|
|
32
35
|
zwarm/watchers/registry.py,sha256=A9iBIVIFNtO7KPX0kLpUaP8dAK7ozqWLA44ocJGnOw4,1219
|
|
33
36
|
zwarm/watchers/test_watchers.py,sha256=zOsxumBqKfR5ZVGxrNlxz6KcWjkcdp0QhW9WB0_20zM,7855
|
|
34
|
-
zwarm-
|
|
35
|
-
zwarm-
|
|
36
|
-
zwarm-
|
|
37
|
-
zwarm-
|
|
37
|
+
zwarm-3.0.dist-info/METADATA,sha256=ZpFa_QdNIUBX2Ay0lAuT7fP_5WOSomtcF1dynknQOQA,7678
|
|
38
|
+
zwarm-3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
39
|
+
zwarm-3.0.dist-info/entry_points.txt,sha256=u0OXq4q8d3yJ3EkUXwZfkS-Y8Lcy0F8cWrcQfoRxM6Q,46
|
|
40
|
+
zwarm-3.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|