zwarm 2.0.2__tar.gz → 2.3.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {zwarm-2.0.2 → zwarm-2.3.5}/PKG-INFO +1 -1
  2. {zwarm-2.0.2 → zwarm-2.3.5}/pyproject.toml +1 -1
  3. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/cli/main.py +103 -1
  4. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/environment.py +51 -32
  5. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/orchestrator.py +8 -3
  6. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/sessions/manager.py +125 -1
  7. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/tools/delegation.py +126 -7
  8. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/__init__.py +5 -0
  9. zwarm-2.3.5/src/zwarm/watchers/llm_watcher.py +319 -0
  10. {zwarm-2.0.2 → zwarm-2.3.5}/.gitignore +0 -0
  11. {zwarm-2.0.2 → zwarm-2.3.5}/README.md +0 -0
  12. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/__init__.py +0 -0
  13. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/__init__.py +0 -0
  14. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/base.py +0 -0
  15. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/claude_code.py +0 -0
  16. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/codex_mcp.py +0 -0
  17. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/registry.py +0 -0
  18. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/test_codex_mcp.py +0 -0
  19. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/test_registry.py +0 -0
  20. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/cli/__init__.py +0 -0
  21. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/__init__.py +0 -0
  22. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/compact.py +0 -0
  23. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/config.py +0 -0
  24. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/models.py +0 -0
  25. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/state.py +0 -0
  26. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/test_compact.py +0 -0
  27. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/test_config.py +0 -0
  28. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/test_models.py +0 -0
  29. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/prompts/__init__.py +0 -0
  30. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/prompts/orchestrator.py +0 -0
  31. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/sessions/__init__.py +0 -0
  32. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/test_orchestrator_watchers.py +0 -0
  33. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/tools/__init__.py +0 -0
  34. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/base.py +0 -0
  35. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/builtin.py +0 -0
  36. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/manager.py +0 -0
  37. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/registry.py +0 -0
  38. {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/test_watchers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: zwarm
3
- Version: 2.0.2
3
+ Version: 2.3.5
4
4
  Summary: Multi-Agent CLI Orchestration Research Platform
5
5
  Requires-Python: <3.14,>=3.13
6
6
  Requires-Dist: python-dotenv>=1.0.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "zwarm"
3
- version = "2.0.2"
3
+ version = "2.3.5"
4
4
  description = "Multi-Agent CLI Orchestration Research Platform"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.13,<3.14"
@@ -1151,6 +1151,7 @@ def interactive(
1151
1151
  [cyan]ls[/] / [cyan]list[/] Dashboard of all sessions
1152
1152
  [cyan]?[/] ID Quick peek (status + latest message)
1153
1153
  [cyan]show[/] ID Full session details & history
1154
+ [cyan]traj[/] ID Show trajectory (all steps taken)
1154
1155
  [cyan]c[/] / [cyan]continue[/] ID "msg" Continue a sync conversation
1155
1156
  [cyan]kill[/] ID Stop a session (keeps in history)
1156
1157
  [cyan]rm[/] ID Delete session entirely
@@ -1230,11 +1231,14 @@ def interactive(
1230
1231
  help_table.add_row(" --async", "Background mode (don't wait)")
1231
1232
  help_table.add_row("", "")
1232
1233
  help_table.add_row("ls / list", "Dashboard of all sessions")
1233
- help_table.add_row("? / show ID", "Show session details & messages")
1234
+ help_table.add_row("? ID / peek ID", "Quick peek (status + latest message)")
1235
+ help_table.add_row("show ID", "Full session details & messages")
1236
+ help_table.add_row("traj ID [--full]", "Show trajectory (all steps taken)")
1234
1237
  help_table.add_row('c ID "msg"', "Continue conversation (wait for response)")
1235
1238
  help_table.add_row('ca ID "msg"', "Continue async (fire-and-forget)")
1236
1239
  help_table.add_row("check ID", "Check session status")
1237
1240
  help_table.add_row("kill ID", "Stop a running session")
1241
+ help_table.add_row("rm ID", "Delete session entirely")
1238
1242
  help_table.add_row("killall", "Stop all running sessions")
1239
1243
  help_table.add_row("clean", "Remove old completed sessions")
1240
1244
  help_table.add_row("q / quit", "Exit")
@@ -1619,6 +1623,93 @@ def interactive(
1619
1623
  if session.error:
1620
1624
  console.print(f"[red]Error:[/] {session.error}")
1621
1625
 
1626
def do_trajectory(session_id: str, full: bool = False):
    """
    Print the trajectory of a session - every recorded step, in order.

    Args:
        session_id: Session to display.
        full: If True, print untruncated reasoning, command output, tool
            arguments and responses; otherwise print the summary fields.

    Prints an error line and returns if the session is unknown, and a
    notice if the session manager has no trajectory data for it.
    """
    import json

    from zwarm.sessions import CodexSessionManager

    manager = CodexSessionManager(default_dir / ".zwarm")
    session = manager.get_session(session_id)

    if not session:
        console.print(f" [red]Session not found:[/] {session_id}")
        return

    trajectory = manager.get_trajectory(session_id, full=full)

    if not trajectory:
        console.print("[dim]No trajectory data available.[/]")
        return

    mode = "[bold](full)[/] " if full else ""
    console.print(f"\n[bold cyan]Trajectory: {session.short_id}[/] {mode}({len(trajectory)} steps)")
    console.print(f"[dim]Task: {session.task[:60]}{'...' if len(session.task) > 60 else ''}[/]")
    console.print()

    # How many lines of command output the summary view shows
    max_preview_lines = 5

    # Display each step
    # NOTE(review): step text is interpolated straight into markup strings;
    # if `console` is a rich Console, output containing "[...]" may be parsed
    # as markup - confirm and escape if so.
    for step in trajectory:
        turn = step.get("turn", 1)
        step_num = step.get("step", 0)
        step_type = step.get("type", "unknown")

        prefix = f"[dim]T{turn}.{step_num:02d}[/]"

        if step_type == "reasoning":
            if full and step.get("full_text"):
                console.print(f"{prefix} [yellow]thinking:[/]")
                console.print(f" {step['full_text']}")
            else:
                summary = step.get("summary", "")
                console.print(f"{prefix} [yellow]thinking:[/] {summary}")

        elif step_type == "command":
            cmd = step.get("command", "")
            output = step.get("output", "")
            exit_code = step.get("exit_code", "?")
            # Show command
            console.print(f"{prefix} [cyan]$ {cmd}[/]")
            if output:
                lines = output.split("\n")
                shown = lines if full else lines[:max_preview_lines]
                for line in shown:
                    console.print(f" [dim]{line}[/]")
                # Compare line counts (not newline counts) so that output
                # with exactly one hidden line still gets the marker, and
                # report the real number of lines.
                if len(lines) > len(shown):
                    console.print(f" [dim]... ({len(lines)} lines)[/]")
            if exit_code != 0 and exit_code is not None:
                console.print(f" [red]exit: {exit_code}[/]")

        elif step_type == "tool_call":
            tool = step.get("tool", "unknown")
            if full and step.get("full_args"):
                console.print(f"{prefix} [magenta]tool:[/] {tool}")
                console.print(f" {json.dumps(step['full_args'], indent=2)}")
            else:
                args = step.get("args_preview", "")
                console.print(f"{prefix} [magenta]tool:[/] {tool}({args})")

        elif step_type == "tool_output":
            output = step.get("output", "")
            if not full:
                output = output[:100]
            console.print(f"{prefix} [dim]→ {output}[/]")

        elif step_type == "message":
            if full and step.get("full_text"):
                console.print(f"{prefix} [green]response:[/]")
                console.print(f" {step['full_text']}")
            else:
                summary = step.get("summary", "")
                full_len = step.get("full_length", 0)
                console.print(f"{prefix} [green]response:[/] {summary}")
                if full_len > 200:
                    console.print(f" [dim]({full_len} chars total)[/]")

    console.print()
1712
+
1622
1713
  def do_continue(session_id: str, message: str, wait: bool = True):
1623
1714
  """
1624
1715
  Continue a conversation using CodexSessionManager.inject_message().
@@ -1872,6 +1963,17 @@ def interactive(
1872
1963
  else:
1873
1964
  do_show(args[0])
1874
1965
 
1966
+ elif cmd in ("traj", "trajectory"):
1967
+ if not args:
1968
+ console.print(" [red]Usage:[/] traj SESSION_ID [--full]")
1969
+ else:
1970
+ full_mode = "--full" in args
1971
+ session_arg = [a for a in args if a != "--full"]
1972
+ if session_arg:
1973
+ do_trajectory(session_arg[0], full=full_mode)
1974
+ else:
1975
+ console.print(" [red]Usage:[/] traj SESSION_ID [--full]")
1976
+
1875
1977
  elif cmd in ("c", "continue"):
1876
1978
  # Sync continue - waits for response
1877
1979
  if len(args) < 2:
@@ -17,6 +17,7 @@ from wbal.environment import Environment
17
17
 
18
18
  if TYPE_CHECKING:
19
19
  from zwarm.core.models import ConversationSession
20
+ from zwarm.sessions import CodexSessionManager
20
21
 
21
22
 
22
23
  class OrchestratorEnv(Environment):
@@ -36,7 +37,10 @@ class OrchestratorEnv(Environment):
36
37
  working_dir: Path = Path(".")
37
38
  output_handler: Callable[[str], None] = lambda x: print(x)
38
39
 
39
- # Session tracking (set by orchestrator)
40
+ # Session manager (set by orchestrator) - pulls live data each observe()
41
+ _session_manager: "CodexSessionManager | None" = PrivateAttr(default=None)
42
+
43
+ # Legacy: old sessions dict (deprecated, for backwards compat)
40
44
  _sessions: dict[str, "ConversationSession"] | None = PrivateAttr(default=None)
41
45
 
42
46
  # Progress tracking (updated by orchestrator each step)
@@ -48,8 +52,12 @@ class OrchestratorEnv(Environment):
48
52
  # Budget config (set from config)
49
53
  _budget_max_sessions: int | None = PrivateAttr(default=None)
50
54
 
55
+ def set_session_manager(self, manager: "CodexSessionManager") -> None:
56
+ """Set the session manager for live session visibility in observe()."""
57
+ self._session_manager = manager
58
+
51
59
  def set_sessions(self, sessions: dict[str, "ConversationSession"]) -> None:
52
- """Set the sessions dict for observe() visibility."""
60
+ """Legacy: Set the sessions dict for observe() visibility."""
53
61
  self._sessions = sessions
54
62
 
55
63
  def update_progress(
@@ -75,7 +83,7 @@ class OrchestratorEnv(Environment):
75
83
 
76
84
  Shows:
77
85
  - Progress (steps, tokens)
78
- - Session summary
86
+ - Session summary (pulled LIVE from CodexSessionManager)
79
87
  - Active sessions with their status
80
88
  - Working directory
81
89
 
@@ -108,45 +116,56 @@ class OrchestratorEnv(Environment):
108
116
 
109
117
  parts.append("## Progress\n" + "\n".join(progress_lines))
110
118
 
111
- # Session summary
112
- if self._sessions is not None:
113
- active = sum(
114
- 1 for s in self._sessions.values() if s.status.value == "active"
115
- )
116
- completed = sum(
117
- 1 for s in self._sessions.values() if s.status.value == "completed"
118
- )
119
- failed = sum(
120
- 1 for s in self._sessions.values() if s.status.value == "failed"
121
- )
122
- total = len(self._sessions)
123
-
124
- summary = f"Sessions: {active} active, {completed} done, {failed} failed ({total} total)"
119
+ # Session summary - pull LIVE from CodexSessionManager
120
+ if self._session_manager is not None:
121
+ sessions = self._session_manager.list_sessions()
122
+
123
+ running = sum(1 for s in sessions if s.status.value == "running")
124
+ completed = sum(1 for s in sessions if s.status.value == "completed")
125
+ failed = sum(1 for s in sessions if s.status.value == "failed")
126
+ total = len(sessions)
127
+
128
+ summary = f"Sessions: {running} running, {completed} done, {failed} failed ({total} total)"
125
129
  if self._budget_max_sessions:
126
130
  summary += f" [limit: {self._budget_max_sessions}]"
127
131
 
128
132
  parts.append(f"## Resources\n{summary}")
129
133
 
130
- # Active sessions detail
131
- active_sessions = [
132
- (sid, s)
133
- for sid, s in self._sessions.items()
134
- if s.status.value == "active"
135
- ]
136
- if active_sessions:
134
+ # Running sessions detail
135
+ running_sessions = [s for s in sessions if s.status.value == "running"]
136
+ if running_sessions:
137
+ session_lines = []
138
+ for session in running_sessions:
139
+ task_preview = (
140
+ session.task[:50] + "..."
141
+ if len(session.task) > 50
142
+ else session.task
143
+ )
144
+ tokens = session.token_usage.get("total_tokens", 0)
145
+ token_info = f", {tokens:,} tok" if tokens else ""
146
+ session_lines.append(
147
+ f" • {session.short_id} (turn {session.turn}{token_info}): {task_preview}"
148
+ )
149
+ parts.append("## Running Sessions\n" + "\n".join(session_lines))
150
+
151
+ # Recently completed (for visibility)
152
+ recent_completed = [
153
+ s for s in sessions
154
+ if s.status.value == "completed"
155
+ ][:3] # Last 3 completed
156
+ if recent_completed:
137
157
  session_lines = []
138
- for sid, session in active_sessions:
139
- mode_tag = "sync" if session.mode.value == "sync" else "async"
140
- turns = len([m for m in session.messages if m.role == "user"])
158
+ for session in recent_completed:
141
159
  task_preview = (
142
- session.task_description[:50] + "..."
143
- if len(session.task_description) > 50
144
- else session.task_description
160
+ session.task[:40] + "..."
161
+ if len(session.task) > 40
162
+ else session.task
145
163
  )
164
+ tokens = session.token_usage.get("total_tokens", 0)
146
165
  session_lines.append(
147
- f"\n • {sid[:8]} ({session.adapter}, {mode_tag}, {turns} turns): {task_preview}"
166
+ f" • {session.short_id} ({tokens:,} tok): {task_preview}"
148
167
  )
149
- parts.append("## Active Sessions\n" + "\n".join(session_lines))
168
+ parts.append("## Recently Completed\n" + "\n".join(session_lines))
150
169
 
151
170
  # Working directory (less prominent)
152
171
  parts.append(f"## Context\nWorking dir: {self.working_dir.absolute()}")
@@ -127,9 +127,14 @@ class Orchestrator(YamlAgent):
127
127
  }
128
128
  )
129
129
 
130
- # Link sessions to environment for observe()
131
- if hasattr(self.env, "set_sessions"):
132
- self.env.set_sessions(self._sessions)
130
+ # Initialize CodexSessionManager and link to environment
131
+ # This is the SAME manager used by delegation tools
132
+ from zwarm.sessions import CodexSessionManager
133
+ self._session_manager = CodexSessionManager(self.working_dir / ".zwarm")
134
+
135
+ # Link session manager to environment for live session visibility in observe()
136
+ if hasattr(self.env, "set_session_manager"):
137
+ self.env.set_session_manager(self._session_manager)
133
138
 
134
139
  # Set budget limits in environment
135
140
  if hasattr(self.env, "set_budget"):
@@ -301,9 +301,18 @@ class CodexSessionManager:
301
301
  session.messages = messages
302
302
  session.token_usage = usage
303
303
 
304
- if error:
304
+ # Check if we got actual assistant responses
305
+ has_response = any(m.role == "assistant" for m in messages)
306
+
307
+ if error and not has_response:
308
+ # Only mark as failed if we have an error AND no response
305
309
  session.status = SessionStatus.FAILED
306
310
  session.error = error
311
+ elif error and has_response:
312
+ # Got response but also an error (e.g., network disconnect at end)
313
+ # Treat as completed but note the error
314
+ session.status = SessionStatus.COMPLETED
315
+ session.error = f"Completed with error: {error}"
307
316
  else:
308
317
  session.status = SessionStatus.COMPLETED
309
318
  else:
@@ -634,12 +643,127 @@ Continue from where you left off, addressing the user's new message."""
634
643
  turn_usage = event.get("usage", {})
635
644
  for key, value in turn_usage.items():
636
645
  usage[key] = usage.get(key, 0) + value
646
+ # Compute total_tokens if not present
647
+ if "total_tokens" not in usage:
648
+ usage["total_tokens"] = usage.get("input_tokens", 0) + usage.get("output_tokens", 0)
637
649
 
638
650
  elif event_type == "error":
639
651
  error = event.get("message", str(event))
640
652
 
641
653
  return messages, usage, error
642
654
 
655
def get_trajectory(self, session_id: str, full: bool = False, max_output_len: int = 200) -> list[dict]:
    """
    Get the trajectory of a session - every completed step, in order.

    Reads each turn's output file (one JSON event per line) and turns every
    "item.completed" event of a known item type into a step dict. Lines that
    are blank, not valid JSON, or not JSON objects are skipped.

    Args:
        session_id: Session to get trajectory for
        full: If True, nothing is truncated and the raw text / arguments
            are included as "full_text" / "full_args"
        max_output_len: Max length for command and tool outputs when
            full=False (ignored when full=True)

    Returns:
        A list of step dicts. Each has "turn", "step" and "type" (one of
        "reasoning", "command", "tool_call", "tool_output", "message") plus
        type-specific keys. Empty list if the session is not found.
    """
    session = self.get_session(session_id)
    if not session:
        return []

    def clip(value, limit):
        """Return value as text, cut to `limit` chars plus "..." unless full."""
        if value is None:
            text = ""
        elif isinstance(value, str):
            text = value
        else:
            text = str(value)
        if full or len(text) <= limit:
            return text
        return text[:limit] + "..."

    trajectory = []

    for turn in range(1, session.turn + 1):
        output_path = self._output_path(session.id, turn)
        if not output_path.exists():
            continue

        content = output_path.read_text()
        step_num = 0

        for line in content.strip().split("\n"):
            if not line.strip():
                continue

            try:
                event = json.loads(line)
            except json.JSONDecodeError:
                continue

            # A line can be valid JSON without being an event object
            if not isinstance(event, dict):
                continue
            if event.get("type", "") != "item.completed":
                continue

            item = event.get("item")
            if not isinstance(item, dict):
                item = {}
            item_type = item.get("type", "")
            # Every completed item consumes a step number, including item
            # types that are not reported, so numbering may have gaps.
            step_num += 1

            if item_type == "reasoning":
                text = item.get("text") or ""
                trajectory.append({
                    "turn": turn,
                    "step": step_num,
                    "type": "reasoning",
                    "summary": clip(text, 100),
                    "full_text": text if full else None,
                })

            elif item_type == "command_execution":
                trajectory.append({
                    "turn": turn,
                    "step": step_num,
                    "type": "command",
                    "command": item.get("command", ""),
                    "output": clip(item.get("aggregated_output"), max_output_len).strip(),
                    "exit_code": item.get("exit_code"),
                })

            elif item_type == "function_call":
                args = item.get("arguments", {})
                trajectory.append({
                    "turn": turn,
                    "step": step_num,
                    "type": "tool_call",
                    "tool": item.get("name", "unknown"),
                    "args_preview": clip(str(args), 100),
                    "full_args": args if full else None,
                })

            elif item_type == "function_call_output":
                trajectory.append({
                    "turn": turn,
                    "step": step_num,
                    "type": "tool_output",
                    "output": clip(item.get("output"), max_output_len),
                })

            elif item_type == "agent_message":
                text = item.get("text") or ""
                trajectory.append({
                    "turn": turn,
                    "step": step_num,
                    "type": "message",
                    "summary": clip(text, 200),
                    "full_text": text if full else None,
                    "full_length": len(text),
                })

    return trajectory
766
+
643
767
  def cleanup_completed(self, keep_days: int = 7) -> int:
644
768
  """
645
769
  Remove old completed/failed/killed sessions.
@@ -33,8 +33,12 @@ def _get_session_manager(orchestrator: "Orchestrator"):
33
33
 
34
34
  Both `zwarm interactive` and `zwarm orchestrate` use the same session manager.
35
35
  The orchestrator is just another user that happens to be an LLM.
36
+
37
+ The session manager is created eagerly in Orchestrator.model_post_init()
38
+ and shared with the environment for observe() visibility.
36
39
  """
37
- if not hasattr(orchestrator, "_session_manager"):
40
+ # Should already exist from model_post_init, but create if not
41
+ if not hasattr(orchestrator, "_session_manager") or orchestrator._session_manager is None:
38
42
  from zwarm.sessions import CodexSessionManager
39
43
  orchestrator._session_manager = CodexSessionManager(orchestrator.working_dir / ".zwarm")
40
44
  return orchestrator._session_manager
@@ -83,6 +87,14 @@ def _format_session_header(session) -> str:
83
87
  return f"[{session.short_id}] codex ({session.status.value})"
84
88
 
85
89
 
90
+ def _get_total_tokens(session) -> int:
91
+ """Get total tokens, computing from input+output if not present."""
92
+ usage = session.token_usage
93
+ if "total_tokens" in usage:
94
+ return usage["total_tokens"]
95
+ return usage.get("input_tokens", 0) + usage.get("output_tokens", 0)
96
+
97
+
86
98
  def _validate_working_dir(
87
99
  requested_dir: Path | str | None,
88
100
  default_dir: Path,
@@ -238,6 +250,25 @@ def delegate(
238
250
  response_text = msg.content
239
251
  break # Take first assistant message
240
252
 
253
+ # Build log path for debugging
254
+ log_path = str(manager._output_path(session.id, session.turn))
255
+
256
+ # Check if session failed
257
+ from zwarm.sessions import SessionStatus
258
+ if session.status == SessionStatus.FAILED:
259
+ return {
260
+ "success": False,
261
+ "session": _format_session_header(session),
262
+ "session_id": session.id,
263
+ "status": "failed",
264
+ "task": _truncate(task, 100),
265
+ "error": session.error or "Unknown error",
266
+ "response": response_text or "(no response captured)",
267
+ "tokens": _get_total_tokens(session),
268
+ "log_file": log_path,
269
+ "hint": "Check log_file for raw codex output. Use bash('cat <log_file>') to inspect.",
270
+ }
271
+
241
272
  return {
242
273
  "success": True,
243
274
  "session": _format_session_header(session),
@@ -245,7 +276,8 @@ def delegate(
245
276
  "status": session.status.value,
246
277
  "task": _truncate(task, 100),
247
278
  "response": response_text or "(no response captured)",
248
- "tokens": session.token_usage.get("total_tokens", 0),
279
+ "tokens": _get_total_tokens(session),
280
+ "log_file": log_path,
249
281
  "hint": "Use converse(session_id, message) to send follow-up messages",
250
282
  }
251
283
  else:
@@ -382,7 +414,7 @@ def converse(
382
414
  "turn": session.turn,
383
415
  "you_said": _truncate(message, 100),
384
416
  "response": response_text or "(no response captured)",
385
- "tokens": session.token_usage.get("total_tokens", 0),
417
+ "tokens": _get_total_tokens(session),
386
418
  }
387
419
 
388
420
 
@@ -423,7 +455,10 @@ def check_session(
423
455
  response_text = msg.content
424
456
  break
425
457
 
426
- return {
458
+ # Build log path
459
+ log_path = str(manager._output_path(session.id, session.turn))
460
+
461
+ result = {
427
462
  "success": True,
428
463
  "session": _format_session_header(session),
429
464
  "session_id": session_id,
@@ -433,10 +468,19 @@ def check_session(
433
468
  "message_count": len(messages),
434
469
  "task": _truncate(session.task, 80),
435
470
  "response": _truncate(response_text, 500) if response_text else "(no response yet)",
436
- "tokens": session.token_usage.get("total_tokens", 0),
471
+ "tokens": _get_total_tokens(session),
437
472
  "runtime": session.runtime,
473
+ "log_file": log_path,
438
474
  }
439
475
 
476
+ # Add error info if failed
477
+ from zwarm.sessions import SessionStatus
478
+ if session.status == SessionStatus.FAILED:
479
+ result["success"] = False
480
+ result["error"] = session.error or "Unknown error"
481
+
482
+ return result
483
+
440
484
 
441
485
  @weaveTool
442
486
  def peek_session(
@@ -477,6 +521,81 @@ def peek_session(
477
521
  }
478
522
 
479
523
 
524
@weaveTool
def get_trajectory(
    self: "Orchestrator",
    session_id: str,
    full: bool = False,
) -> dict[str, Any]:
    """
    Get the full trajectory of a session - all steps the agent took.

    Shows reasoning, commands, tool calls, and responses in order.
    Useful for understanding HOW the agent completed a task, not just
    the final result.

    Args:
        session_id: The session to get trajectory for.
        full: If True, include full untruncated content (default: False for summary view).

    Returns:
        {steps: [...], step_count}
    """
    import json

    manager = _get_session_manager(self)

    session = manager.get_session(session_id)
    if not session:
        return {"success": False, "error": f"Unknown session: {session_id}"}

    trajectory = manager.get_trajectory(session_id, full=full)

    def render(step):
        """Turn one trajectory step into a display line; None for unknown types."""
        kind = step.get("type", "unknown")

        if kind == "reasoning":
            body = step.get("full_text") if full else step.get("summary", "")
            return f"[thinking] {body}"

        if kind == "command":
            rendered = f"[command] $ {step.get('command', '')}"
            output = step.get("output", "")
            exit_code = step.get("exit_code")
            if output and full:
                rendered += f"\n → {output}"
            elif output:
                # Summary view: first 100 chars, marked when cut
                rendered += f"\n → {output[:100]}{'...' if len(output) > 100 else ''}"
            # Only a non-zero exit code is worth reporting
            if exit_code and exit_code != 0:
                rendered += f" (exit: {exit_code})"
            return rendered

        if kind == "tool_call":
            tool_name = step.get("tool", "unknown")
            if full and step.get("full_args"):
                pretty_args = json.dumps(step["full_args"], indent=2)
                return f"[tool] {tool_name}\n {pretty_args}"
            return f"[tool] {tool_name}({step.get('args_preview', '')})"

        if kind == "tool_output":
            output = step.get("output", "")
            return f"[result] {output if full else output[:100]}"

        if kind == "message":
            body = step.get("full_text") if full else step.get("summary", "")
            return f"[response] {body}"

        return None

    # Format steps for easy reading, dropping any type we do not render
    formatted_steps = [text for text in map(render, trajectory) if text is not None]

    return {
        "success": True,
        "session_id": session.short_id,
        "task": _truncate(session.task, 80),
        "step_count": len(trajectory),
        "steps": formatted_steps,
        "mode": "full" if full else "summary",
    }
597
+
598
+
480
599
  @weaveTool
481
600
  def end_session(
482
601
  self: "Orchestrator",
@@ -539,7 +658,7 @@ def end_session(
539
658
  "status": session.status.value,
540
659
  "reason": reason or "ended by orchestrator",
541
660
  "turn": session.turn,
542
- "tokens": session.token_usage.get("total_tokens", 0),
661
+ "tokens": _get_total_tokens(session),
543
662
  }
544
663
 
545
664
 
@@ -646,7 +765,7 @@ def list_sessions(
646
765
  "updated_secs": int(updated_secs),
647
766
  "last_message": _truncate(last_message, 100) if last_message else "(no response yet)",
648
767
  "needs_attention": needs_attention,
649
- "tokens": s.token_usage.get("total_tokens", 0),
768
+ "tokens": _get_total_tokens(s),
650
769
  })
651
770
 
652
771
  # Summary counts
@@ -11,6 +11,10 @@ from zwarm.watchers.manager import WatcherManager, WatcherConfig, build_watcher_
11
11
 
12
12
  # Import built-in watchers to register them
13
13
  from zwarm.watchers import builtin as _builtin # noqa: F401
14
+ from zwarm.watchers import llm_watcher as _llm_watcher # noqa: F401
15
+
16
+ # Export trajectory compression utility
17
+ from zwarm.watchers.llm_watcher import compress_trajectory
14
18
 
15
19
  __all__ = [
16
20
  "Watcher",
@@ -23,4 +27,5 @@ __all__ = [
23
27
  "get_watcher",
24
28
  "list_watchers",
25
29
  "build_watcher_manager",
30
+ "compress_trajectory",
26
31
  ]
@@ -0,0 +1,319 @@
1
+ """
2
+ LLM-based watcher for nuanced trajectory analysis.
3
+
4
+ Unlike rule-based watchers, this watcher uses a language model to assess
5
+ the orchestrator's trajectory and provide context-aware guidance.
6
+
7
+ The watcher compresses the full message history into a compact trajectory
8
+ representation (similar to what Codex shows in its UI) to minimize token
9
+ usage while preserving the "shape" of the agent's behavior.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import logging
16
+ from typing import Any
17
+
18
+ from zwarm.watchers.base import Watcher, WatcherContext, WatcherResult
19
+ from zwarm.watchers.registry import register_watcher
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ def _get_field(item: Any, name: str, default: Any = None) -> Any:
25
+ """Get field from dict or object."""
26
+ if isinstance(item, dict):
27
+ return item.get(name, default)
28
+ return getattr(item, name, default)
29
+
30
+
31
+ def _extract_tool_call_summary(tc: Any) -> str:
32
+ """Extract a compact summary of a tool call."""
33
+ if isinstance(tc, dict):
34
+ func = tc.get("function", tc)
35
+ name = func.get("name", tc.get("name", "?"))
36
+ args = func.get("arguments", tc.get("arguments", ""))
37
+ else:
38
+ name = getattr(tc, "name", "?")
39
+ args = getattr(tc, "arguments", "")
40
+
41
+ # Parse args if JSON string
42
+ if isinstance(args, str):
43
+ try:
44
+ args = json.loads(args)
45
+ except (json.JSONDecodeError, TypeError):
46
+ pass
47
+
48
+ # Create compact arg summary
49
+ if isinstance(args, dict):
50
+ # Show key args based on tool type
51
+ if name == "delegate":
52
+ task = args.get("task", "")[:50]
53
+ mode = args.get("mode", "sync")
54
+ return f"delegate({mode}): {task}..."
55
+ elif name == "converse":
56
+ msg = args.get("message", "")[:40]
57
+ return f"converse: {msg}..."
58
+ elif name == "bash":
59
+ cmd = args.get("command", "")[:60]
60
+ return f"$ {cmd}"
61
+ elif name in ("check_session", "peek_session", "end_session"):
62
+ sid = args.get("session_id", "")[:8]
63
+ return f"{name}({sid})"
64
+ elif name == "list_sessions":
65
+ return "list_sessions()"
66
+ else:
67
+ # Generic: show first arg
68
+ first_val = next(iter(args.values()), "") if args else ""
69
+ if isinstance(first_val, str) and len(first_val) > 30:
70
+ first_val = first_val[:30] + "..."
71
+ return f"{name}({first_val})"
72
+ else:
73
+ return f"{name}({str(args)[:30]})"
74
+
75
+
76
def compress_trajectory(messages: list[dict[str, Any]], max_steps: int = 50) -> str:
    """
    Compress full message history into a compact trajectory representation.

    Output format (similar to Codex UI):
    ```
    [1] thinking: "preparing to inspect the codebase"
     → delegate(sync): Add authentication to...
    [2] thinking: "checking session status"
     → check_session(abc123)
    [3] thinking: "session completed, verifying"
     → $ pytest tests/
    ```

    System messages are skipped. Each assistant message becomes one numbered
    step (first line of its content plus up to three tool-call summaries).
    Tool results only appear when their text mentions "error" or "failed".
    User messages are ignored until the first assistant step; after that they
    are recorded as a watcher nudge or a short preview.

    Args:
        messages: Full message history from orchestrator (dicts or objects
            exposing ``role``/``content``/``tool_calls``).
        max_steps: Maximum number of trajectory entries to keep (most recent).
            Tool warnings and user notes count as entries too. Values below 1
            keep no entries.

    Returns:
        Compact trajectory string; empty when there is nothing to report.
    """

    def _as_text(content: Any) -> str:
        # Content may be None or a list of content parts rather than a string;
        # flatten it so slicing/splitting below cannot raise.
        if content is None:
            return ""
        if isinstance(content, str):
            return content
        if isinstance(content, (list, tuple)):
            parts = []
            for part in content:
                if isinstance(part, str):
                    parts.append(part)
                else:
                    text = _get_field(part, "text", None)
                    if text:
                        parts.append(str(text))
            return "\n".join(parts)
        return str(content)

    steps = []
    step_num = 0

    for msg in messages:
        role = _get_field(msg, "role", "")

        if role == "system":
            continue  # Skip system messages

        if role == "assistant":
            step_num += 1
            content = _as_text(_get_field(msg, "content", ""))
            tool_calls = _get_field(msg, "tool_calls", []) or []

            # Use the first line (capped at 80 chars) as the "thinking" summary.
            thinking = ""
            if content:
                first_line = content.split("\n")[0].strip()
                thinking = first_line[:80] + "..." if len(first_line) > 80 else first_line

            # Summarise at most 3 tool calls per step and note the remainder.
            actions = [_extract_tool_call_summary(tc) for tc in tool_calls[:3]]
            if len(tool_calls) > 3:
                actions.append(f"... +{len(tool_calls) - 3} more")

            header = f"[{step_num}]"
            if thinking:
                header += f' thinking: "{thinking}"'
            step_lines = [header]
            step_lines.extend(f" → {action}" for action in actions)

            steps.append("\n".join(step_lines))

        elif role == "tool":
            # Tool results - just note if the output looks like a failure.
            content = str(_get_field(msg, "content", "")).lower()
            if "error" in content or "failed" in content:
                steps.append(" ⚠ tool returned error")

        elif role == "user" and step_num > 0:
            # User message mid-conversation (watcher nudge, etc.)
            content = _as_text(_get_field(msg, "content", ""))
            if content and "[WATCHER" in content:
                steps.append(" 📍 watcher nudge")
            elif content:
                preview = content[:50].replace("\n", " ")
                steps.append(f" 💬 user: {preview}...")

    # Keep only the most recent entries. A plain ``steps[-max_steps:]`` would
    # return the WHOLE list for max_steps == 0, so clamp and slice explicitly.
    limit = max(max_steps, 0)
    if len(steps) > limit:
        recent = steps[-limit:] if limit else []
        steps = ["... (earlier steps omitted)"] + recent

    return "\n".join(steps)
158
+
159
+
160
+ def _build_watcher_prompt(
161
+ trajectory: str,
162
+ task: str,
163
+ step: int,
164
+ max_steps: int,
165
+ session_summary: str,
166
+ ) -> str:
167
+ """Build the prompt for the LLM watcher."""
168
+ return f"""You are a trajectory watcher observing an orchestrator agent. Your job is to assess whether the agent is on track and provide guidance if needed.
169
+
170
+ ## Original Task
171
+ {task}
172
+
173
+ ## Progress
174
+ Step {step}/{max_steps}
175
+
176
+ ## Active Sessions
177
+ {session_summary}
178
+
179
+ ## Trajectory (recent steps)
180
+ {trajectory}
181
+
182
+ ---
183
+
184
+ Analyze this trajectory and respond with a JSON object:
185
+ {{
186
+ "status": "ok" | "concern" | "problem",
187
+ "assessment": "Brief 1-2 sentence assessment of trajectory health",
188
+ "guidance": "If status is concern/problem, specific actionable guidance. Otherwise null."
189
+ }}
190
+
191
+ Things to watch for:
192
+ - Is the agent making progress toward the task?
193
+ - Is it spinning or repeating actions?
194
+ - Is it going off on tangents unrelated to the task?
195
+ - Is it delegating appropriately or trying to do everything directly?
196
+ - Are sessions being completed or just started and abandoned?
197
+
198
+ Be concise. Only flag real issues, not minor inefficiencies."""
199
+
200
+
201
@register_watcher("llm")
class LLMWatcher(Watcher):
    """
    LLM-based watcher for nuanced trajectory analysis.

    Uses a language model to assess the orchestrator's trajectory
    and provide context-aware guidance that rule-based watchers can't.

    Config options:
        model: Model to use (default: gpt-4o-mini)
        threshold: How often to run (every N steps, default: 5). Values that
            are not a positive integer fall back to the default.
        temperature: LLM temperature (default: 0.3)
    """

    name = "llm"
    description = "LLM-based trajectory analysis for nuanced guidance"

    async def observe(self, ctx: WatcherContext) -> WatcherResult:
        """Assess the trajectory every ``threshold`` steps.

        Returns ``WatcherResult.ok()`` on off-cycle steps, when the model
        reports status "ok", or when anything in the watcher itself fails.
        Status "concern" yields a nudge; "problem" yields a nudge with
        ``priority=10``.
        """
        config = self.config
        model = config.get("model", "gpt-4o-mini")
        temperature = config.get("temperature", 0.3)

        # Validate before the modulo below: a threshold of 0 would raise
        # ZeroDivisionError outside the try block and take the caller down.
        raw_threshold = config.get("threshold", 5)
        try:
            threshold = int(raw_threshold)
        except (TypeError, ValueError):
            threshold = 0
        if threshold < 1:
            logger.warning("LLM watcher: invalid threshold %r, using 5", raw_threshold)
            threshold = 5

        # Only run every N steps to save costs
        if ctx.step == 0 or ctx.step % threshold != 0:
            return WatcherResult.ok()

        try:
            # Compress trajectory
            trajectory = compress_trajectory(ctx.messages)

            # Build session summary
            active = [s for s in ctx.sessions if s.get("status") == "running"]
            completed = [s for s in ctx.sessions if s.get("status") == "completed"]
            failed = [s for s in ctx.sessions if s.get("status") == "failed"]
            session_summary = f"{len(active)} running, {len(completed)} completed, {len(failed)} failed"

            prompt = _build_watcher_prompt(
                trajectory=trajectory,
                task=ctx.task,
                step=ctx.step,
                max_steps=ctx.max_steps,
                session_summary=session_summary,
            )

            response = await self._call_llm(prompt, model, temperature)
            result = self._parse_response(response)

            status = result["status"]
            if status == "ok":
                return WatcherResult.ok()

            # The model may send "guidance": null (the prompt even invites
            # it), so fall back on falsy values, not only on a missing key.
            guidance = result.get("guidance") or result["assessment"]

            if status == "concern":
                return WatcherResult.nudge(
                    guidance=guidance,
                    reason=f"LLM assessment: {result['assessment']}",
                    metadata={"llm_response": result},
                )
            # status == "problem" (_parse_response guarantees a known status)
            return WatcherResult.nudge(
                guidance=guidance,
                reason=f"LLM detected problem: {result['assessment']}",
                priority=10,  # Higher priority for problems
                metadata={"llm_response": result},
            )

        except Exception as e:
            # Deliberate best-effort boundary: a broken watcher must never
            # block the orchestrator.
            logger.warning("LLM watcher failed: %s", e)
            return WatcherResult.ok()  # Don't block on watcher failure

    async def _call_llm(self, prompt: str, model: str, temperature: float) -> str:
        """Call the LLM using OpenAI Responses API and return its text output.

        Returns ``"{}"`` when no text can be found in the response.
        """
        import openai

        client = openai.AsyncOpenAI()

        # Use Responses API (consistent with wbal)
        response = await client.responses.create(
            model=model,
            input=[{"role": "user", "content": prompt}],
            temperature=temperature,
            text={"format": {"type": "json_object"}},
        )

        # Preferred path: the aggregated text convenience attribute.
        output_text = getattr(response, "output_text", None)
        if output_text:
            return output_text

        # Fallback: look through output items
        for item in getattr(response, "output", None) or []:
            if getattr(item, "type", None) == "message":
                for content in getattr(item, "content", None) or []:
                    if getattr(content, "type", None) == "output_text":
                        return getattr(content, "text", None) or "{}"
            # Also check for direct text attribute
            text = getattr(item, "text", None)
            if text:
                return text

        return "{}"

    def _parse_response(self, response: str) -> dict[str, Any]:
        """Parse LLM response JSON into a dict with normalized fields.

        The returned dict always has ``status`` set to one of "ok",
        "concern" or "problem", and a non-empty ``assessment``. Unparseable
        input, non-object JSON and unrecognised statuses are treated as "ok"
        so that a malformed reply never produces a nudge.
        """
        parse_failure = {
            "status": "ok",
            "assessment": "Failed to parse LLM response",
        }
        try:
            result = json.loads(response)
        except (json.JSONDecodeError, TypeError):
            return parse_failure
        if not isinstance(result, dict):
            # Valid JSON but not an object (list, string, null, ...).
            return parse_failure

        status = str(result.get("status") or "ok").strip().lower()
        if status not in ("ok", "concern", "problem"):
            logger.warning("LLM watcher: unrecognised status %r, treating as ok", status)
            status = "ok"
        result["status"] = status

        if not result.get("assessment"):
            result["assessment"] = "No assessment provided"
        return result
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes