zwarm 2.0.2__tar.gz → 2.3.5__tar.gz
This diff shows the changes between publicly released versions of this package as they appear in the supported public registries, and is provided for informational purposes only.
- {zwarm-2.0.2 → zwarm-2.3.5}/PKG-INFO +1 -1
- {zwarm-2.0.2 → zwarm-2.3.5}/pyproject.toml +1 -1
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/cli/main.py +103 -1
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/environment.py +51 -32
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/orchestrator.py +8 -3
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/sessions/manager.py +125 -1
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/tools/delegation.py +126 -7
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/__init__.py +5 -0
- zwarm-2.3.5/src/zwarm/watchers/llm_watcher.py +319 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/.gitignore +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/README.md +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/__init__.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/__init__.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/base.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/claude_code.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/codex_mcp.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/registry.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/test_codex_mcp.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/test_registry.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/cli/__init__.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/__init__.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/compact.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/config.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/models.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/state.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/test_compact.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/test_config.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/test_models.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/prompts/__init__.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/prompts/orchestrator.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/sessions/__init__.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/test_orchestrator_watchers.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/tools/__init__.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/base.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/builtin.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/manager.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/registry.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/test_watchers.py +0 -0

{zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/cli/main.py

@@ -1151,6 +1151,7 @@ def interactive(
     [cyan]ls[/] / [cyan]list[/]             Dashboard of all sessions
     [cyan]?[/] ID                           Quick peek (status + latest message)
     [cyan]show[/] ID                        Full session details & history
+    [cyan]traj[/] ID                        Show trajectory (all steps taken)
     [cyan]c[/] / [cyan]continue[/] ID "msg"  Continue a sync conversation
     [cyan]kill[/] ID                        Stop a session (keeps in history)
     [cyan]rm[/] ID                          Delete session entirely

@@ -1230,11 +1231,14 @@ def interactive(
         help_table.add_row(" --async", "Background mode (don't wait)")
         help_table.add_row("", "")
         help_table.add_row("ls / list", "Dashboard of all sessions")
-        help_table.add_row("? /
+        help_table.add_row("? ID / peek ID", "Quick peek (status + latest message)")
+        help_table.add_row("show ID", "Full session details & messages")
+        help_table.add_row("traj ID [--full]", "Show trajectory (all steps taken)")
         help_table.add_row('c ID "msg"', "Continue conversation (wait for response)")
         help_table.add_row('ca ID "msg"', "Continue async (fire-and-forget)")
         help_table.add_row("check ID", "Check session status")
         help_table.add_row("kill ID", "Stop a running session")
+        help_table.add_row("rm ID", "Delete session entirely")
         help_table.add_row("killall", "Stop all running sessions")
         help_table.add_row("clean", "Remove old completed sessions")
         help_table.add_row("q / quit", "Exit")

@@ -1619,6 +1623,93 @@ def interactive(
         if session.error:
             console.print(f"[red]Error:[/] {session.error}")
 
+    def do_trajectory(session_id: str, full: bool = False):
+        """Show the full trajectory of a session - all steps in order."""
+        from zwarm.sessions import CodexSessionManager
+
+        manager = CodexSessionManager(default_dir / ".zwarm")
+        session = manager.get_session(session_id)
+
+        if not session:
+            console.print(f" [red]Session not found:[/] {session_id}")
+            return
+
+        trajectory = manager.get_trajectory(session_id, full=full)
+
+        if not trajectory:
+            console.print("[dim]No trajectory data available.[/]")
+            return
+
+        mode = "[bold](full)[/] " if full else ""
+        console.print(f"\n[bold cyan]Trajectory: {session.short_id}[/] {mode}({len(trajectory)} steps)")
+        console.print(f"[dim]Task: {session.task[:60]}{'...' if len(session.task) > 60 else ''}[/]")
+        console.print()
+
+        # Display each step
+        for step in trajectory:
+            turn = step.get("turn", 1)
+            step_num = step.get("step", 0)
+            step_type = step.get("type", "unknown")
+
+            prefix = f"[dim]T{turn}.{step_num:02d}[/]"
+
+            if step_type == "reasoning":
+                if full and step.get("full_text"):
+                    console.print(f"{prefix} [yellow]thinking:[/]")
+                    console.print(f" {step['full_text']}")
+                else:
+                    summary = step.get("summary", "")
+                    console.print(f"{prefix} [yellow]thinking:[/] {summary}")
+
+            elif step_type == "command":
+                cmd = step.get("command", "")
+                output = step.get("output", "")
+                exit_code = step.get("exit_code", "?")
+                # Show command
+                console.print(f"{prefix} [cyan]$ {cmd}[/]")
+                if output:
+                    if full:
+                        # Show all output
+                        for line in output.split("\n"):
+                            console.print(f" [dim]{line}[/]")
+                    else:
+                        # Indent output, max 5 lines
+                        for line in output.split("\n")[:5]:
+                            console.print(f" [dim]{line}[/]")
+                        if output.count("\n") > 5:
+                            console.print(f" [dim]... ({output.count(chr(10))} lines)[/]")
+                if exit_code != 0 and exit_code is not None:
+                    console.print(f" [red]exit: {exit_code}[/]")
+
+            elif step_type == "tool_call":
+                tool = step.get("tool", "unknown")
+                if full and step.get("full_args"):
+                    import json
+                    console.print(f"{prefix} [magenta]tool:[/] {tool}")
+                    console.print(f" {json.dumps(step['full_args'], indent=2)}")
+                else:
+                    args = step.get("args_preview", "")
+                    console.print(f"{prefix} [magenta]tool:[/] {tool}({args})")
+
+            elif step_type == "tool_output":
+                output = step.get("output", "")
+                if not full:
+                    output = output[:100]
+                console.print(f"{prefix} [dim]→ {output}[/]")
+
+            elif step_type == "message":
+                if full and step.get("full_text"):
+                    console.print(f"{prefix} [green]response:[/]")
+                    console.print(f" {step['full_text']}")
+                else:
+                    summary = step.get("summary", "")
+                    full_len = step.get("full_length", 0)
+                    console.print(f"{prefix} [green]response:[/] {summary}")
+                    if full_len > 200:
+                        console.print(f" [dim]({full_len} chars total)[/]")
+
+        console.print()
+
     def do_continue(session_id: str, message: str, wait: bool = True):
         """
         Continue a conversation using CodexSessionManager.inject_message().

@@ -1872,6 +1963,17 @@ def interactive(
             else:
                 do_show(args[0])
 
+        elif cmd in ("traj", "trajectory"):
+            if not args:
+                console.print(" [red]Usage:[/] traj SESSION_ID [--full]")
+            else:
+                full_mode = "--full" in args
+                session_arg = [a for a in args if a != "--full"]
+                if session_arg:
+                    do_trajectory(session_arg[0], full=full_mode)
+                else:
+                    console.print(" [red]Usage:[/] traj SESSION_ID [--full]")
+
         elif cmd in ("c", "continue"):
             # Sync continue - waits for response
             if len(args) < 2:

{zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/environment.py

@@ -17,6 +17,7 @@ from wbal.environment import Environment
 
 if TYPE_CHECKING:
     from zwarm.core.models import ConversationSession
+    from zwarm.sessions import CodexSessionManager
 
 
 class OrchestratorEnv(Environment):

@@ -36,7 +37,10 @@ class OrchestratorEnv(Environment):
     working_dir: Path = Path(".")
     output_handler: Callable[[str], None] = lambda x: print(x)
 
-    # Session
+    # Session manager (set by orchestrator) - pulls live data each observe()
+    _session_manager: "CodexSessionManager | None" = PrivateAttr(default=None)
+
+    # Legacy: old sessions dict (deprecated, for backwards compat)
     _sessions: dict[str, "ConversationSession"] | None = PrivateAttr(default=None)
 
     # Progress tracking (updated by orchestrator each step)

@@ -48,8 +52,12 @@ class OrchestratorEnv(Environment):
     # Budget config (set from config)
     _budget_max_sessions: int | None = PrivateAttr(default=None)
 
+    def set_session_manager(self, manager: "CodexSessionManager") -> None:
+        """Set the session manager for live session visibility in observe()."""
+        self._session_manager = manager
+
     def set_sessions(self, sessions: dict[str, "ConversationSession"]) -> None:
-        """Set the sessions dict for observe() visibility."""
+        """Legacy: Set the sessions dict for observe() visibility."""
         self._sessions = sessions
 
     def update_progress(

@@ -75,7 +83,7 @@ class OrchestratorEnv(Environment):
 
         Shows:
         - Progress (steps, tokens)
-        - Session summary
+        - Session summary (pulled LIVE from CodexSessionManager)
         - Active sessions with their status
         - Working directory
 

@@ -108,45 +116,56 @@ class OrchestratorEnv(Environment):
 
         parts.append("## Progress\n" + "\n".join(progress_lines))
 
-        # Session summary
-        if self.
-
-
-            )
-            completed = sum(
-
-            )
-
-
-            )
-            total = len(self._sessions)
-
-            summary = f"Sessions: {active} active, {completed} done, {failed} failed ({total} total)"
+        # Session summary - pull LIVE from CodexSessionManager
+        if self._session_manager is not None:
+            sessions = self._session_manager.list_sessions()
+
+            running = sum(1 for s in sessions if s.status.value == "running")
+            completed = sum(1 for s in sessions if s.status.value == "completed")
+            failed = sum(1 for s in sessions if s.status.value == "failed")
+            total = len(sessions)
+
+            summary = f"Sessions: {running} running, {completed} done, {failed} failed ({total} total)"
             if self._budget_max_sessions:
                 summary += f" [limit: {self._budget_max_sessions}]"
 
             parts.append(f"## Resources\n{summary}")
 
-            #
-
-
-
-
-
-
+            # Running sessions detail
+            running_sessions = [s for s in sessions if s.status.value == "running"]
+            if running_sessions:
+                session_lines = []
+                for session in running_sessions:
+                    task_preview = (
+                        session.task[:50] + "..."
+                        if len(session.task) > 50
+                        else session.task
+                    )
+                    tokens = session.token_usage.get("total_tokens", 0)
+                    token_info = f", {tokens:,} tok" if tokens else ""
+                    session_lines.append(
+                        f" • {session.short_id} (turn {session.turn}{token_info}): {task_preview}"
+                    )
+                parts.append("## Running Sessions\n" + "\n".join(session_lines))
+
+            # Recently completed (for visibility)
+            recent_completed = [
+                s for s in sessions
+                if s.status.value == "completed"
+            ][:3] # Last 3 completed
+            if recent_completed:
                 session_lines = []
-                for
-                    mode_tag = "sync" if session.mode.value == "sync" else "async"
-                    turns = len([m for m in session.messages if m.role == "user"])
+                for session in recent_completed:
                     task_preview = (
-                        session.
-                        if len(session.
-                        else session.
+                        session.task[:40] + "..."
+                        if len(session.task) > 40
+                        else session.task
                     )
+                    tokens = session.token_usage.get("total_tokens", 0)
                     session_lines.append(
-                        f"
+                        f" • {session.short_id} ✓ ({tokens:,} tok): {task_preview}"
                    )
-                parts.append("##
+                parts.append("## Recently Completed\n" + "\n".join(session_lines))
 
         # Working directory (less prominent)
         parts.append(f"## Context\nWorking dir: {self.working_dir.absolute()}")

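The rewritten observe() assumes the orchestrator hands the environment the same CodexSessionManager that the delegation tools use. A minimal wiring sketch under that assumption (the `zwarm.core.environment` import path is inferred from the file layout; any other required OrchestratorEnv fields are omitted here):

```python
from pathlib import Path

from zwarm.core.environment import OrchestratorEnv  # path inferred from src/zwarm/core/environment.py
from zwarm.sessions import CodexSessionManager

working_dir = Path(".")

# One manager per working directory; session state lives under .zwarm/
manager = CodexSessionManager(working_dir / ".zwarm")

env = OrchestratorEnv(working_dir=working_dir)

# After this call, observe() counts running/completed/failed sessions from the
# manager on every invocation instead of reading the legacy _sessions dict.
env.set_session_manager(manager)
```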
{zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/orchestrator.py

@@ -127,9 +127,14 @@ class Orchestrator(YamlAgent):
             }
         )
 
-        #
-
-
+        # Initialize CodexSessionManager and link to environment
+        # This is the SAME manager used by delegation tools
+        from zwarm.sessions import CodexSessionManager
+        self._session_manager = CodexSessionManager(self.working_dir / ".zwarm")
+
+        # Link session manager to environment for live session visibility in observe()
+        if hasattr(self.env, "set_session_manager"):
+            self.env.set_session_manager(self._session_manager)
 
         # Set budget limits in environment
         if hasattr(self.env, "set_budget"):

{zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/sessions/manager.py

@@ -301,9 +301,18 @@ class CodexSessionManager:
             session.messages = messages
             session.token_usage = usage
 
-            if
+            # Check if we got actual assistant responses
+            has_response = any(m.role == "assistant" for m in messages)
+
+            if error and not has_response:
+                # Only mark as failed if we have an error AND no response
                 session.status = SessionStatus.FAILED
                 session.error = error
+            elif error and has_response:
+                # Got response but also an error (e.g., network disconnect at end)
+                # Treat as completed but note the error
+                session.status = SessionStatus.COMPLETED
+                session.error = f"Completed with error: {error}"
             else:
                 session.status = SessionStatus.COMPLETED
         else:

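The net effect of the new branch is that an error event only fails a session when no assistant message was captured. A standalone illustration of that decision (not the manager's actual code; SessionStatus is redeclared here only to keep the snippet self-contained):

```python
from enum import Enum


class SessionStatus(str, Enum):  # stand-in for zwarm.sessions.SessionStatus
    COMPLETED = "completed"
    FAILED = "failed"


def resolve_status(error: str | None, has_response: bool) -> tuple[SessionStatus, str | None]:
    """Error with no response -> FAILED; error with a response -> COMPLETED, error noted."""
    if error and not has_response:
        return SessionStatus.FAILED, error
    if error and has_response:
        return SessionStatus.COMPLETED, f"Completed with error: {error}"
    return SessionStatus.COMPLETED, None


assert resolve_status("stream closed", has_response=False)[0] is SessionStatus.FAILED
assert resolve_status("stream closed", has_response=True)[0] is SessionStatus.COMPLETED
```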
@@ -634,12 +643,127 @@ Continue from where you left off, addressing the user's new message."""
                 turn_usage = event.get("usage", {})
                 for key, value in turn_usage.items():
                     usage[key] = usage.get(key, 0) + value
+                # Compute total_tokens if not present
+                if "total_tokens" not in usage:
+                    usage["total_tokens"] = usage.get("input_tokens", 0) + usage.get("output_tokens", 0)
 
             elif event_type == "error":
                 error = event.get("message", str(event))
 
         return messages, usage, error
 
+    def get_trajectory(self, session_id: str, full: bool = False, max_output_len: int = 200) -> list[dict]:
+        """
+        Get the full trajectory of a session - all steps in order.
+
+        Args:
+            session_id: Session to get trajectory for
+            full: If True, include full untruncated content
+            max_output_len: Max length for outputs when full=False
+
+        Returns a list of step dicts with type, summary, and details.
+        This shows the "broad strokes" of what the agent did.
+        """
+        if full:
+            max_output_len = 999999 # Effectively unlimited
+        session = self.get_session(session_id)
+        if not session:
+            return []
+
+        trajectory = []
+
+        for turn in range(1, session.turn + 1):
+            output_path = self._output_path(session.id, turn)
+            if not output_path.exists():
+                continue
+
+            content = output_path.read_text()
+            step_num = 0
+
+            for line in content.strip().split("\n"):
+                if not line.strip():
+                    continue
+
+                try:
+                    event = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+
+                event_type = event.get("type", "")
+
+                if event_type == "item.completed":
+                    item = event.get("item", {})
+                    item_type = item.get("type", "")
+                    step_num += 1
+
+                    if item_type == "reasoning":
+                        text = item.get("text", "")
+                        summary_len = max_output_len if full else 100
+                        trajectory.append({
+                            "turn": turn,
+                            "step": step_num,
+                            "type": "reasoning",
+                            "summary": text[:summary_len] + ("..." if len(text) > summary_len else ""),
+                            "full_text": text if full else None,
+                        })
+
+                    elif item_type == "command_execution":
+                        cmd = item.get("command", "")
+                        output = item.get("aggregated_output", "")
+                        exit_code = item.get("exit_code")
+                        # Truncate output
+                        output_preview = output[:max_output_len]
+                        if len(output) > max_output_len:
+                            output_preview += "..."
+                        trajectory.append({
+                            "turn": turn,
+                            "step": step_num,
+                            "type": "command",
+                            "command": cmd,
+                            "output": output_preview.strip(),
+                            "exit_code": exit_code,
+                        })
+
+                    elif item_type == "function_call":
+                        func_name = item.get("name", "unknown")
+                        args = item.get("arguments", {})
+                        args_str = str(args)
+                        args_len = max_output_len if full else 100
+                        trajectory.append({
+                            "turn": turn,
+                            "step": step_num,
+                            "type": "tool_call",
+                            "tool": func_name,
+                            "args_preview": args_str[:args_len] + ("..." if len(args_str) > args_len else ""),
+                            "full_args": args if full else None,
+                        })
+
+                    elif item_type == "function_call_output":
+                        output = item.get("output", "")
+                        output_preview = output[:max_output_len]
+                        if len(output) > max_output_len:
+                            output_preview += "..."
+                        trajectory.append({
+                            "turn": turn,
+                            "step": step_num,
+                            "type": "tool_output",
+                            "output": output_preview,
+                        })
+
+                    elif item_type == "agent_message":
+                        text = item.get("text", "")
+                        summary_len = max_output_len if full else 200
+                        trajectory.append({
+                            "turn": turn,
+                            "step": step_num,
+                            "type": "message",
+                            "summary": text[:summary_len] + ("..." if len(text) > summary_len else ""),
+                            "full_text": text if full else None,
+                            "full_length": len(text),
+                        })
+
+        return trajectory
+
     def cleanup_completed(self, keep_days: int = 7) -> int:
         """
         Remove old completed/failed/killed sessions.

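A usage sketch of the new manager API, assuming a `.zwarm` state directory with at least one recorded session (the session id below is a placeholder):

```python
from pathlib import Path

from zwarm.sessions import CodexSessionManager

manager = CodexSessionManager(Path(".") / ".zwarm")

# Summary view: previews truncated to max_output_len (default 200 chars).
for step in manager.get_trajectory("abc123", full=False):
    kind = step["type"]  # "reasoning" | "command" | "tool_call" | "tool_output" | "message"
    if kind == "command":
        print(f'T{step["turn"]}.{step["step"]:02d} $ {step["command"]} (exit {step["exit_code"]})')
    elif kind in ("reasoning", "message"):
        print(f'T{step["turn"]}.{step["step"]:02d} {kind}: {step["summary"]}')

# Full view: untruncated content arrives in "full_text" / "full_args".
full_steps = manager.get_trajectory("abc123", full=True)
```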
{zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/tools/delegation.py

@@ -33,8 +33,12 @@ def _get_session_manager(orchestrator: "Orchestrator"):
 
     Both `zwarm interactive` and `zwarm orchestrate` use the same session manager.
     The orchestrator is just another user that happens to be an LLM.
+
+    The session manager is created eagerly in Orchestrator.model_post_init()
+    and shared with the environment for observe() visibility.
     """
-
+    # Should already exist from model_post_init, but create if not
+    if not hasattr(orchestrator, "_session_manager") or orchestrator._session_manager is None:
         from zwarm.sessions import CodexSessionManager
         orchestrator._session_manager = CodexSessionManager(orchestrator.working_dir / ".zwarm")
     return orchestrator._session_manager

@@ -83,6 +87,14 @@ def _format_session_header(session) -> str:
     return f"[{session.short_id}] codex ({session.status.value})"
 
 
+def _get_total_tokens(session) -> int:
+    """Get total tokens, computing from input+output if not present."""
+    usage = session.token_usage
+    if "total_tokens" in usage:
+        return usage["total_tokens"]
+    return usage.get("input_tokens", 0) + usage.get("output_tokens", 0)
+
+
 def _validate_working_dir(
     requested_dir: Path | str | None,
     default_dir: Path,

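The helper exists because aggregated codex usage does not always carry a precomputed total; the fallback is simply input plus output tokens:

```python
# Illustration of the fallback path, with a usage dict that lacks "total_tokens".
usage = {"input_tokens": 1200, "output_tokens": 350}
total = usage.get("total_tokens", usage.get("input_tokens", 0) + usage.get("output_tokens", 0))
assert total == 1550
```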
@@ -238,6 +250,25 @@ def delegate(
                 response_text = msg.content
                 break # Take first assistant message
 
+        # Build log path for debugging
+        log_path = str(manager._output_path(session.id, session.turn))
+
+        # Check if session failed
+        from zwarm.sessions import SessionStatus
+        if session.status == SessionStatus.FAILED:
+            return {
+                "success": False,
+                "session": _format_session_header(session),
+                "session_id": session.id,
+                "status": "failed",
+                "task": _truncate(task, 100),
+                "error": session.error or "Unknown error",
+                "response": response_text or "(no response captured)",
+                "tokens": _get_total_tokens(session),
+                "log_file": log_path,
+                "hint": "Check log_file for raw codex output. Use bash('cat <log_file>') to inspect.",
+            }
+
         return {
             "success": True,
             "session": _format_session_header(session),

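Because the failure dict now carries `log_file`, a caller can jump straight to the raw codex output. A hedged sketch of how a caller might react to this return value; `bash` here is a placeholder for whatever shell tool the orchestrator exposes, as the hint string suggests:

```python
from typing import Callable


def handle_delegate_result(result: dict, bash: Callable[[str], str]) -> str:
    # Happy path: pass the assistant's response text along.
    if result.get("success"):
        return result["response"]
    # Failure path: surface the error and pull the raw codex log for debugging.
    raw_log = bash(f"cat {result['log_file']}")
    return f"delegate failed: {result['error']}\n--- raw codex log ---\n{raw_log}"
```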
@@ -245,7 +276,8 @@ def delegate(
             "status": session.status.value,
             "task": _truncate(task, 100),
             "response": response_text or "(no response captured)",
-            "tokens": session
+            "tokens": _get_total_tokens(session),
+            "log_file": log_path,
             "hint": "Use converse(session_id, message) to send follow-up messages",
         }
     else:

@@ -382,7 +414,7 @@ def converse(
         "turn": session.turn,
         "you_said": _truncate(message, 100),
         "response": response_text or "(no response captured)",
-        "tokens": session
+        "tokens": _get_total_tokens(session),
     }
 
 

@@ -423,7 +455,10 @@ def check_session(
             response_text = msg.content
             break
 
-
+    # Build log path
+    log_path = str(manager._output_path(session.id, session.turn))
+
+    result = {
         "success": True,
         "session": _format_session_header(session),
         "session_id": session_id,

@@ -433,10 +468,19 @@
         "message_count": len(messages),
         "task": _truncate(session.task, 80),
         "response": _truncate(response_text, 500) if response_text else "(no response yet)",
-        "tokens": session
+        "tokens": _get_total_tokens(session),
         "runtime": session.runtime,
+        "log_file": log_path,
     }
 
+    # Add error info if failed
+    from zwarm.sessions import SessionStatus
+    if session.status == SessionStatus.FAILED:
+        result["success"] = False
+        result["error"] = session.error or "Unknown error"
+
+    return result
+
 
 @weaveTool
 def peek_session(

@@ -477,6 +521,81 @@ def peek_session(
     }
 
 
+@weaveTool
+def get_trajectory(
+    self: "Orchestrator",
+    session_id: str,
+    full: bool = False,
+) -> dict[str, Any]:
+    """
+    Get the full trajectory of a session - all steps the agent took.
+
+    Shows reasoning, commands, tool calls, and responses in order.
+    Useful for understanding HOW the agent completed a task, not just
+    the final result.
+
+    Args:
+        session_id: The session to get trajectory for.
+        full: If True, include full untruncated content (default: False for summary view).
+
+    Returns:
+        {steps: [...], step_count}
+    """
+    manager = _get_session_manager(self)
+
+    session = manager.get_session(session_id)
+    if not session:
+        return {"success": False, "error": f"Unknown session: {session_id}"}
+
+    trajectory = manager.get_trajectory(session_id, full=full)
+
+    # Format steps for easy reading
+    formatted_steps = []
+    for step in trajectory:
+        step_type = step.get("type", "unknown")
+
+        if step_type == "reasoning":
+            text = step.get("full_text") if full else step.get("summary", "")
+            formatted_steps.append(f"[thinking] {text}")
+        elif step_type == "command":
+            cmd = step.get("command", "")
+            output = step.get("output", "")
+            exit_code = step.get("exit_code")
+            step_str = f"[command] $ {cmd}"
+            if output:
+                if full:
+                    step_str += f"\n → {output}"
+                else:
+                    step_str += f"\n → {output[:100]}{'...' if len(output) > 100 else ''}"
+            if exit_code and exit_code != 0:
+                step_str += f" (exit: {exit_code})"
+            formatted_steps.append(step_str)
+        elif step_type == "tool_call":
+            if full and step.get("full_args"):
+                import json
+                args_str = json.dumps(step["full_args"], indent=2)
+                formatted_steps.append(f"[tool] {step.get('tool', 'unknown')}\n {args_str}")
+            else:
+                formatted_steps.append(f"[tool] {step.get('tool', 'unknown')}({step.get('args_preview', '')})")
+        elif step_type == "tool_output":
+            output = step.get("output", "")
+            if not full:
+                output = output[:100]
+            formatted_steps.append(f"[result] {output}")
+        elif step_type == "message":
+            text = step.get("full_text") if full else step.get("summary", "")
+            formatted_steps.append(f"[response] {text}")
+
+    return {
+        "success": True,
+        "session_id": session.short_id,
+        "task": _truncate(session.task, 80),
+        "step_count": len(trajectory),
+        "steps": formatted_steps,
+        "mode": "full" if full else "summary",
+    }
+
+
 @weaveTool
 def end_session(
     self: "Orchestrator",

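If the @weaveTool decorator preserves the plain call signature, the tool can also be exercised directly against an orchestrator instance; a sketch with a placeholder session id:

```python
# orchestrator: an existing zwarm Orchestrator instance
result = get_trajectory(orchestrator, "abc123", full=False)
if result["success"]:
    print(f'{result["session_id"]}: {result["task"]} '
          f'({result["step_count"]} steps, {result["mode"]} view)')
    for line in result["steps"]:
        # Pre-formatted lines: "[thinking] ...", "[command] $ ...", "[tool] ...", "[response] ..."
        print(line)
else:
    print(result["error"])  # e.g. "Unknown session: abc123"
```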
@@ -539,7 +658,7 @@ def end_session(
         "status": session.status.value,
         "reason": reason or "ended by orchestrator",
         "turn": session.turn,
-        "tokens": session
+        "tokens": _get_total_tokens(session),
     }
 
 

@@ -646,7 +765,7 @@ def list_sessions(
             "updated_secs": int(updated_secs),
             "last_message": _truncate(last_message, 100) if last_message else "(no response yet)",
             "needs_attention": needs_attention,
-            "tokens": s
+            "tokens": _get_total_tokens(s),
         })
 
     # Summary counts

{zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/__init__.py

@@ -11,6 +11,10 @@ from zwarm.watchers.manager import WatcherManager, WatcherConfig, build_watcher_
 
 # Import built-in watchers to register them
 from zwarm.watchers import builtin as _builtin  # noqa: F401
+from zwarm.watchers import llm_watcher as _llm_watcher  # noqa: F401
+
+# Export trajectory compression utility
+from zwarm.watchers.llm_watcher import compress_trajectory
 
 __all__ = [
     "Watcher",

@@ -23,4 +27,5 @@ __all__ = [
     "get_watcher",
     "list_watchers",
     "build_watcher_manager",
+    "compress_trajectory",
 ]

zwarm-2.3.5/src/zwarm/watchers/llm_watcher.py (new file)

@@ -0,0 +1,319 @@
+"""
+LLM-based watcher for nuanced trajectory analysis.
+
+Unlike rule-based watchers, this watcher uses a language model to assess
+the orchestrator's trajectory and provide context-aware guidance.
+
+The watcher compresses the full message history into a compact trajectory
+representation (similar to what Codex shows in its UI) to minimize token
+usage while preserving the "shape" of the agent's behavior.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from typing import Any
+
+from zwarm.watchers.base import Watcher, WatcherContext, WatcherResult
+from zwarm.watchers.registry import register_watcher
+
+logger = logging.getLogger(__name__)
+
+
+def _get_field(item: Any, name: str, default: Any = None) -> Any:
+    """Get field from dict or object."""
+    if isinstance(item, dict):
+        return item.get(name, default)
+    return getattr(item, name, default)
+
+
+def _extract_tool_call_summary(tc: Any) -> str:
+    """Extract a compact summary of a tool call."""
+    if isinstance(tc, dict):
+        func = tc.get("function", tc)
+        name = func.get("name", tc.get("name", "?"))
+        args = func.get("arguments", tc.get("arguments", ""))
+    else:
+        name = getattr(tc, "name", "?")
+        args = getattr(tc, "arguments", "")
+
+    # Parse args if JSON string
+    if isinstance(args, str):
+        try:
+            args = json.loads(args)
+        except (json.JSONDecodeError, TypeError):
+            pass
+
+    # Create compact arg summary
+    if isinstance(args, dict):
+        # Show key args based on tool type
+        if name == "delegate":
+            task = args.get("task", "")[:50]
+            mode = args.get("mode", "sync")
+            return f"delegate({mode}): {task}..."
+        elif name == "converse":
+            msg = args.get("message", "")[:40]
+            return f"converse: {msg}..."
+        elif name == "bash":
+            cmd = args.get("command", "")[:60]
+            return f"$ {cmd}"
+        elif name in ("check_session", "peek_session", "end_session"):
+            sid = args.get("session_id", "")[:8]
+            return f"{name}({sid})"
+        elif name == "list_sessions":
+            return "list_sessions()"
+        else:
+            # Generic: show first arg
+            first_val = next(iter(args.values()), "") if args else ""
+            if isinstance(first_val, str) and len(first_val) > 30:
+                first_val = first_val[:30] + "..."
+            return f"{name}({first_val})"
+    else:
+        return f"{name}({str(args)[:30]})"
+
+
+def compress_trajectory(messages: list[dict[str, Any]], max_steps: int = 50) -> str:
+    """
+    Compress full message history into a compact trajectory representation.
+
+    Output format (similar to Codex UI):
+    ```
+    [1] thinking: "preparing to inspect the codebase"
+        → delegate(sync): Add authentication to...
+    [2] thinking: "checking session status"
+        → check_session(abc123)
+    [3] thinking: "session completed, verifying"
+        → $ pytest tests/
+    ```
+
+    Args:
+        messages: Full message history from orchestrator
+        max_steps: Maximum steps to include (most recent)
+
+    Returns:
+        Compact trajectory string
+    """
+    steps = []
+    step_num = 0
+
+    for msg in messages:
+        role = _get_field(msg, "role", "")
+
+        if role == "system":
+            continue # Skip system messages
+
+        if role == "assistant":
+            step_num += 1
+            content = _get_field(msg, "content", "")
+            tool_calls = _get_field(msg, "tool_calls", [])
+
+            # Extract thinking/reasoning summary
+            thinking = ""
+            if content:
+                # Take first line or first 80 chars as "thinking"
+                first_line = content.split("\n")[0].strip()
+                if len(first_line) > 80:
+                    thinking = first_line[:80] + "..."
+                else:
+                    thinking = first_line
+
+            # Extract tool calls
+            actions = []
+            if tool_calls:
+                for tc in tool_calls[:3]: # Max 3 tool calls per step
+                    actions.append(_extract_tool_call_summary(tc))
+                if len(tool_calls) > 3:
+                    actions.append(f"... +{len(tool_calls) - 3} more")
+
+            # Format step
+            step_lines = [f"[{step_num}]"]
+            if thinking:
+                step_lines[0] += f' thinking: "{thinking}"'
+            for action in actions:
+                step_lines.append(f" → {action}")
+
+            steps.append("\n".join(step_lines))
+
+        elif role == "tool":
+            # Tool results - just note if error
+            content = str(_get_field(msg, "content", ""))
+            if "error" in content.lower() or "failed" in content.lower():
+                steps.append(f" ⚠ tool returned error")
+
+        elif role == "user" and step_num > 0:
+            # User message mid-conversation (watcher nudge, etc.)
+            content = _get_field(msg, "content", "")
+            if content and "[WATCHER" in content:
+                steps.append(f" 📍 watcher nudge")
+            elif content:
+                preview = content[:50].replace("\n", " ")
+                steps.append(f" 💬 user: {preview}...")
+
+    # Take most recent steps
+    if len(steps) > max_steps:
+        steps = ["... (earlier steps omitted)"] + steps[-max_steps:]
+
+    return "\n".join(steps)
+
+
+def _build_watcher_prompt(
+    trajectory: str,
+    task: str,
+    step: int,
+    max_steps: int,
+    session_summary: str,
+) -> str:
+    """Build the prompt for the LLM watcher."""
+    return f"""You are a trajectory watcher observing an orchestrator agent. Your job is to assess whether the agent is on track and provide guidance if needed.
+
+## Original Task
+{task}
+
+## Progress
+Step {step}/{max_steps}
+
+## Active Sessions
+{session_summary}
+
+## Trajectory (recent steps)
+{trajectory}
+
+---
+
+Analyze this trajectory and respond with a JSON object:
+{{
+    "status": "ok" | "concern" | "problem",
+    "assessment": "Brief 1-2 sentence assessment of trajectory health",
+    "guidance": "If status is concern/problem, specific actionable guidance. Otherwise null."
+}}
+
+Things to watch for:
+- Is the agent making progress toward the task?
+- Is it spinning or repeating actions?
+- Is it going off on tangents unrelated to the task?
+- Is it delegating appropriately or trying to do everything directly?
+- Are sessions being completed or just started and abandoned?
+
+Be concise. Only flag real issues, not minor inefficiencies."""
+
+
+@register_watcher("llm")
+class LLMWatcher(Watcher):
+    """
+    LLM-based watcher for nuanced trajectory analysis.
+
+    Uses a language model to assess the orchestrator's trajectory
+    and provide context-aware guidance that rule-based watchers can't.
+
+    Config options:
+        model: Model to use (default: gpt-4o-mini)
+        threshold: How often to run (every N steps, default: 5)
+        temperature: LLM temperature (default: 0.3)
+    """
+
+    name = "llm"
+    description = "LLM-based trajectory analysis for nuanced guidance"
+
+    async def observe(self, ctx: WatcherContext) -> WatcherResult:
+        config = self.config
+        threshold = config.get("threshold", 5)
+        model = config.get("model", "gpt-4o-mini")
+        temperature = config.get("temperature", 0.3)
+
+        # Only run every N steps to save costs
+        if ctx.step % threshold != 0 or ctx.step == 0:
+            return WatcherResult.ok()
+
+        try:
+            # Compress trajectory
+            trajectory = compress_trajectory(ctx.messages)
+
+            # Build session summary
+            active = [s for s in ctx.sessions if s.get("status") == "running"]
+            completed = [s for s in ctx.sessions if s.get("status") == "completed"]
+            failed = [s for s in ctx.sessions if s.get("status") == "failed"]
+            session_summary = f"{len(active)} running, {len(completed)} completed, {len(failed)} failed"
+
+            # Build prompt
+            prompt = _build_watcher_prompt(
+                trajectory=trajectory,
+                task=ctx.task,
+                step=ctx.step,
+                max_steps=ctx.max_steps,
+                session_summary=session_summary,
+            )
+
+            # Call LLM
+            response = await self._call_llm(prompt, model, temperature)
+
+            # Parse response
+            result = self._parse_response(response)
+
+            if result["status"] == "ok":
+                return WatcherResult.ok()
+            elif result["status"] == "concern":
+                return WatcherResult.nudge(
+                    guidance=result.get("guidance", result["assessment"]),
+                    reason=f"LLM assessment: {result['assessment']}",
+                    metadata={"llm_response": result},
+                )
+            else: # problem
+                return WatcherResult.nudge(
+                    guidance=result.get("guidance", result["assessment"]),
+                    reason=f"LLM detected problem: {result['assessment']}",
+                    priority=10, # Higher priority for problems
+                    metadata={"llm_response": result},
+                )
+
+        except Exception as e:
+            logger.warning(f"LLM watcher failed: {e}")
+            return WatcherResult.ok() # Don't block on watcher failure
+
+    async def _call_llm(self, prompt: str, model: str, temperature: float) -> str:
+        """Call the LLM using OpenAI Responses API."""
+        import openai
+
+        client = openai.AsyncOpenAI()
+
+        # Use Responses API (consistent with wbal)
+        response = await client.responses.create(
+            model=model,
+            input=[{"role": "user", "content": prompt}],
+            temperature=temperature,
+            text={"format": {"type": "json_object"}},
+        )
+
+        # Extract text from response
+        output_text = getattr(response, "output_text", None)
+        if output_text:
+            return output_text
+
+        # Fallback: look through output items
+        for item in getattr(response, "output", []):
+            if getattr(item, "type", None) == "message":
+                for content in getattr(item, "content", []):
+                    if getattr(content, "type", None) == "output_text":
+                        return getattr(content, "text", "{}")
+            # Also check for direct text attribute
+            text = getattr(item, "text", None)
+            if text:
+                return text
+
+        return "{}"
+
+    def _parse_response(self, response: str) -> dict[str, Any]:
+        """Parse LLM response JSON."""
+        try:
+            result = json.loads(response)
+            # Validate required fields
+            if "status" not in result:
+                result["status"] = "ok"
+            if "assessment" not in result:
+                result["assessment"] = "No assessment provided"
+            return result
+        except json.JSONDecodeError:
+            return {
+                "status": "ok",
+                "assessment": "Failed to parse LLM response",
+            }

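compress_trajectory is a pure function over chat-style message dicts, so it can be tried in isolation. A sketch with fabricated messages (the exact message shape produced by the orchestrator may differ):

```python
from zwarm.watchers import compress_trajectory

messages = [
    {"role": "system", "content": "You are the orchestrator."},  # skipped by the compressor
    {
        "role": "assistant",
        "content": "Delegating the auth work to a codex session.",
        "tool_calls": [
            {"function": {"name": "delegate",
                          "arguments": '{"task": "Add authentication", "mode": "sync"}'}}
        ],
    },
    {"role": "tool", "content": '{"success": true, "status": "completed"}'},
    {
        "role": "assistant",
        "content": "Session finished; verifying with tests.",
        "tool_calls": [{"function": {"name": "bash", "arguments": '{"command": "pytest tests/"}'}}],
    },
]

print(compress_trajectory(messages, max_steps=50))
# Roughly:
# [1] thinking: "Delegating the auth work to a codex session."
#  → delegate(sync): Add authentication...
# [2] thinking: "Session finished; verifying with tests."
#  → $ pytest tests/
```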