zwarm 2.0.2__tar.gz → 2.3.5__tar.gz
This diff shows the changes between publicly released versions of this package as they appear in the supported public registries, and is provided for informational purposes only.
- {zwarm-2.0.2 → zwarm-2.3.5}/PKG-INFO +1 -1
- {zwarm-2.0.2 → zwarm-2.3.5}/pyproject.toml +1 -1
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/cli/main.py +103 -1
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/environment.py +51 -32
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/orchestrator.py +8 -3
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/sessions/manager.py +125 -1
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/tools/delegation.py +126 -7
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/__init__.py +5 -0
- zwarm-2.3.5/src/zwarm/watchers/llm_watcher.py +319 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/.gitignore +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/README.md +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/__init__.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/__init__.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/base.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/claude_code.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/codex_mcp.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/registry.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/test_codex_mcp.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/adapters/test_registry.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/cli/__init__.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/__init__.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/compact.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/config.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/models.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/state.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/test_compact.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/test_config.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/test_models.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/prompts/__init__.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/prompts/orchestrator.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/sessions/__init__.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/test_orchestrator_watchers.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/tools/__init__.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/base.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/builtin.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/manager.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/registry.py +0 -0
- {zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/test_watchers.py +0 -0

{zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/cli/main.py

@@ -1151,6 +1151,7 @@ def interactive(
     [cyan]ls[/] / [cyan]list[/]             Dashboard of all sessions
     [cyan]?[/] ID                           Quick peek (status + latest message)
     [cyan]show[/] ID                        Full session details & history
+    [cyan]traj[/] ID                        Show trajectory (all steps taken)
     [cyan]c[/] / [cyan]continue[/] ID "msg"  Continue a sync conversation
     [cyan]kill[/] ID                        Stop a session (keeps in history)
     [cyan]rm[/] ID                          Delete session entirely

@@ -1230,11 +1231,14 @@ def interactive(
         help_table.add_row(" --async", "Background mode (don't wait)")
         help_table.add_row("", "")
         help_table.add_row("ls / list", "Dashboard of all sessions")
-        help_table.add_row("? /
+        help_table.add_row("? ID / peek ID", "Quick peek (status + latest message)")
+        help_table.add_row("show ID", "Full session details & messages")
+        help_table.add_row("traj ID [--full]", "Show trajectory (all steps taken)")
         help_table.add_row('c ID "msg"', "Continue conversation (wait for response)")
         help_table.add_row('ca ID "msg"', "Continue async (fire-and-forget)")
         help_table.add_row("check ID", "Check session status")
         help_table.add_row("kill ID", "Stop a running session")
+        help_table.add_row("rm ID", "Delete session entirely")
         help_table.add_row("killall", "Stop all running sessions")
         help_table.add_row("clean", "Remove old completed sessions")
         help_table.add_row("q / quit", "Exit")

@@ -1619,6 +1623,93 @@ def interactive(
         if session.error:
             console.print(f"[red]Error:[/] {session.error}")
 
+    def do_trajectory(session_id: str, full: bool = False):
+        """Show the full trajectory of a session - all steps in order."""
+        from zwarm.sessions import CodexSessionManager
+
+        manager = CodexSessionManager(default_dir / ".zwarm")
+        session = manager.get_session(session_id)
+
+        if not session:
+            console.print(f" [red]Session not found:[/] {session_id}")
+            return
+
+        trajectory = manager.get_trajectory(session_id, full=full)
+
+        if not trajectory:
+            console.print("[dim]No trajectory data available.[/]")
+            return
+
+        mode = "[bold](full)[/] " if full else ""
+        console.print(f"\n[bold cyan]Trajectory: {session.short_id}[/] {mode}({len(trajectory)} steps)")
+        console.print(f"[dim]Task: {session.task[:60]}{'...' if len(session.task) > 60 else ''}[/]")
+        console.print()
+
+        # Display each step
+        for step in trajectory:
+            turn = step.get("turn", 1)
+            step_num = step.get("step", 0)
+            step_type = step.get("type", "unknown")
+
+            prefix = f"[dim]T{turn}.{step_num:02d}[/]"
+
+            if step_type == "reasoning":
+                if full and step.get("full_text"):
+                    console.print(f"{prefix} [yellow]thinking:[/]")
+                    console.print(f" {step['full_text']}")
+                else:
+                    summary = step.get("summary", "")
+                    console.print(f"{prefix} [yellow]thinking:[/] {summary}")
+
+            elif step_type == "command":
+                cmd = step.get("command", "")
+                output = step.get("output", "")
+                exit_code = step.get("exit_code", "?")
+                # Show command
+                console.print(f"{prefix} [cyan]$ {cmd}[/]")
+                if output:
+                    if full:
+                        # Show all output
+                        for line in output.split("\n"):
+                            console.print(f" [dim]{line}[/]")
+                    else:
+                        # Indent output, max 5 lines
+                        for line in output.split("\n")[:5]:
+                            console.print(f" [dim]{line}[/]")
+                        if output.count("\n") > 5:
+                            console.print(f" [dim]... ({output.count(chr(10))} lines)[/]")
+                if exit_code != 0 and exit_code is not None:
+                    console.print(f" [red]exit: {exit_code}[/]")
+
+            elif step_type == "tool_call":
+                tool = step.get("tool", "unknown")
+                if full and step.get("full_args"):
+                    import json
+                    console.print(f"{prefix} [magenta]tool:[/] {tool}")
+                    console.print(f" {json.dumps(step['full_args'], indent=2)}")
+                else:
+                    args = step.get("args_preview", "")
+                    console.print(f"{prefix} [magenta]tool:[/] {tool}({args})")
+
+            elif step_type == "tool_output":
+                output = step.get("output", "")
+                if not full:
+                    output = output[:100]
+                console.print(f"{prefix} [dim]→ {output}[/]")
+
+            elif step_type == "message":
+                if full and step.get("full_text"):
+                    console.print(f"{prefix} [green]response:[/]")
+                    console.print(f" {step['full_text']}")
+                else:
+                    summary = step.get("summary", "")
+                    full_len = step.get("full_length", 0)
+                    console.print(f"{prefix} [green]response:[/] {summary}")
+                    if full_len > 200:
+                        console.print(f" [dim]({full_len} chars total)[/]")
+
+        console.print()
+
     def do_continue(session_id: str, message: str, wait: bool = True):
         """
         Continue a conversation using CodexSessionManager.inject_message().

@@ -1872,6 +1963,17 @@ def interactive(
             else:
                 do_show(args[0])
 
+        elif cmd in ("traj", "trajectory"):
+            if not args:
+                console.print(" [red]Usage:[/] traj SESSION_ID [--full]")
+            else:
+                full_mode = "--full" in args
+                session_arg = [a for a in args if a != "--full"]
+                if session_arg:
+                    do_trajectory(session_arg[0], full=full_mode)
+                else:
+                    console.print(" [red]Usage:[/] traj SESSION_ID [--full]")
+
         elif cmd in ("c", "continue"):
             # Sync continue - waits for response
             if len(args) < 2:

{zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/core/environment.py

@@ -17,6 +17,7 @@ from wbal.environment import Environment
 
 if TYPE_CHECKING:
     from zwarm.core.models import ConversationSession
+    from zwarm.sessions import CodexSessionManager
 
 
 class OrchestratorEnv(Environment):

@@ -36,7 +37,10 @@ class OrchestratorEnv(Environment):
     working_dir: Path = Path(".")
     output_handler: Callable[[str], None] = lambda x: print(x)
 
-    # Session
+    # Session manager (set by orchestrator) - pulls live data each observe()
+    _session_manager: "CodexSessionManager | None" = PrivateAttr(default=None)
+
+    # Legacy: old sessions dict (deprecated, for backwards compat)
     _sessions: dict[str, "ConversationSession"] | None = PrivateAttr(default=None)
 
     # Progress tracking (updated by orchestrator each step)

@@ -48,8 +52,12 @@ class OrchestratorEnv(Environment):
     # Budget config (set from config)
     _budget_max_sessions: int | None = PrivateAttr(default=None)
 
+    def set_session_manager(self, manager: "CodexSessionManager") -> None:
+        """Set the session manager for live session visibility in observe()."""
+        self._session_manager = manager
+
     def set_sessions(self, sessions: dict[str, "ConversationSession"]) -> None:
-        """Set the sessions dict for observe() visibility."""
+        """Legacy: Set the sessions dict for observe() visibility."""
         self._sessions = sessions
 
     def update_progress(

@@ -75,7 +83,7 @@ class OrchestratorEnv(Environment):
 
         Shows:
         - Progress (steps, tokens)
-        - Session summary
+        - Session summary (pulled LIVE from CodexSessionManager)
         - Active sessions with their status
         - Working directory
 

@@ -108,45 +116,56 @@ class OrchestratorEnv(Environment):
 
         parts.append("## Progress\n" + "\n".join(progress_lines))
 
-        # Session summary
-        if self.
-
-
-            )
-            completed = sum(
-
-            )
-
-
-            )
-            total = len(self._sessions)
-
-            summary = f"Sessions: {active} active, {completed} done, {failed} failed ({total} total)"
+        # Session summary - pull LIVE from CodexSessionManager
+        if self._session_manager is not None:
+            sessions = self._session_manager.list_sessions()
+
+            running = sum(1 for s in sessions if s.status.value == "running")
+            completed = sum(1 for s in sessions if s.status.value == "completed")
+            failed = sum(1 for s in sessions if s.status.value == "failed")
+            total = len(sessions)
+
+            summary = f"Sessions: {running} running, {completed} done, {failed} failed ({total} total)"
             if self._budget_max_sessions:
                 summary += f" [limit: {self._budget_max_sessions}]"
 
             parts.append(f"## Resources\n{summary}")
 
-            #
-
-
-
-
-
-
+            # Running sessions detail
+            running_sessions = [s for s in sessions if s.status.value == "running"]
+            if running_sessions:
+                session_lines = []
+                for session in running_sessions:
+                    task_preview = (
+                        session.task[:50] + "..."
+                        if len(session.task) > 50
+                        else session.task
+                    )
+                    tokens = session.token_usage.get("total_tokens", 0)
+                    token_info = f", {tokens:,} tok" if tokens else ""
+                    session_lines.append(
+                        f" • {session.short_id} (turn {session.turn}{token_info}): {task_preview}"
+                    )
+                parts.append("## Running Sessions\n" + "\n".join(session_lines))
+
+            # Recently completed (for visibility)
+            recent_completed = [
+                s for s in sessions
+                if s.status.value == "completed"
+            ][:3] # Last 3 completed
+            if recent_completed:
                 session_lines = []
-                for
-                    mode_tag = "sync" if session.mode.value == "sync" else "async"
-                    turns = len([m for m in session.messages if m.role == "user"])
+                for session in recent_completed:
                     task_preview = (
-                        session.
-                        if len(session.
-                        else session.
+                        session.task[:40] + "..."
+                        if len(session.task) > 40
+                        else session.task
                     )
+                    tokens = session.token_usage.get("total_tokens", 0)
                     session_lines.append(
-                        f"
+                        f" • {session.short_id} ✓ ({tokens:,} tok): {task_preview}"
                    )
-                parts.append("##
+                parts.append("## Recently Completed\n" + "\n".join(session_lines))
 
         # Working directory (less prominent)
         parts.append(f"## Context\nWorking dir: {self.working_dir.absolute()}")

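The rewritten observe() assumes the orchestrator hands the environment the same CodexSessionManager that the delegation tools use. A minimal wiring sketch under that assumption (the `zwarm.core.environment` import path is inferred from the file layout; any other required OrchestratorEnv fields are omitted here):

```python
from pathlib import Path

from zwarm.core.environment import OrchestratorEnv  # path inferred from src/zwarm/core/environment.py
from zwarm.sessions import CodexSessionManager

working_dir = Path(".")

# One manager per working directory; session state lives under .zwarm/
manager = CodexSessionManager(working_dir / ".zwarm")

env = OrchestratorEnv(working_dir=working_dir)

# After this call, observe() counts running/completed/failed sessions from the
# manager on every invocation instead of reading the legacy _sessions dict.
env.set_session_manager(manager)
```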
{zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/orchestrator.py

@@ -127,9 +127,14 @@ class Orchestrator(YamlAgent):
             }
         )
 
-        #
-
-
+        # Initialize CodexSessionManager and link to environment
+        # This is the SAME manager used by delegation tools
+        from zwarm.sessions import CodexSessionManager
+        self._session_manager = CodexSessionManager(self.working_dir / ".zwarm")
+
+        # Link session manager to environment for live session visibility in observe()
+        if hasattr(self.env, "set_session_manager"):
+            self.env.set_session_manager(self._session_manager)
 
         # Set budget limits in environment
         if hasattr(self.env, "set_budget"):

{zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/sessions/manager.py

@@ -301,9 +301,18 @@ class CodexSessionManager:
             session.messages = messages
             session.token_usage = usage
 
-            if
+            # Check if we got actual assistant responses
+            has_response = any(m.role == "assistant" for m in messages)
+
+            if error and not has_response:
+                # Only mark as failed if we have an error AND no response
                 session.status = SessionStatus.FAILED
                 session.error = error
+            elif error and has_response:
+                # Got response but also an error (e.g., network disconnect at end)
+                # Treat as completed but note the error
+                session.status = SessionStatus.COMPLETED
+                session.error = f"Completed with error: {error}"
             else:
                 session.status = SessionStatus.COMPLETED
         else:

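The net effect of the new branch is that an error event only fails a session when no assistant message was captured. A standalone illustration of that decision (not the manager's actual code; SessionStatus is redeclared here only to keep the snippet self-contained):

```python
from enum import Enum


class SessionStatus(str, Enum):  # stand-in for zwarm.sessions.SessionStatus
    COMPLETED = "completed"
    FAILED = "failed"


def resolve_status(error: str | None, has_response: bool) -> tuple[SessionStatus, str | None]:
    """Error with no response -> FAILED; error with a response -> COMPLETED, error noted."""
    if error and not has_response:
        return SessionStatus.FAILED, error
    if error and has_response:
        return SessionStatus.COMPLETED, f"Completed with error: {error}"
    return SessionStatus.COMPLETED, None


assert resolve_status("stream closed", has_response=False)[0] is SessionStatus.FAILED
assert resolve_status("stream closed", has_response=True)[0] is SessionStatus.COMPLETED
```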
@@ -634,12 +643,127 @@ Continue from where you left off, addressing the user's new message."""
                 turn_usage = event.get("usage", {})
                 for key, value in turn_usage.items():
                     usage[key] = usage.get(key, 0) + value
+                # Compute total_tokens if not present
+                if "total_tokens" not in usage:
+                    usage["total_tokens"] = usage.get("input_tokens", 0) + usage.get("output_tokens", 0)
 
             elif event_type == "error":
                 error = event.get("message", str(event))
 
         return messages, usage, error
 
+    def get_trajectory(self, session_id: str, full: bool = False, max_output_len: int = 200) -> list[dict]:
+        """
+        Get the full trajectory of a session - all steps in order.
+
+        Args:
+            session_id: Session to get trajectory for
+            full: If True, include full untruncated content
+            max_output_len: Max length for outputs when full=False
+
+        Returns a list of step dicts with type, summary, and details.
+        This shows the "broad strokes" of what the agent did.
+        """
+        if full:
+            max_output_len = 999999 # Effectively unlimited
+        session = self.get_session(session_id)
+        if not session:
+            return []
+
+        trajectory = []
+
+        for turn in range(1, session.turn + 1):
+            output_path = self._output_path(session.id, turn)
+            if not output_path.exists():
+                continue
+
+            content = output_path.read_text()
+            step_num = 0
+
+            for line in content.strip().split("\n"):
+                if not line.strip():
+                    continue
+
+                try:
+                    event = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+
+                event_type = event.get("type", "")
+
+                if event_type == "item.completed":
+                    item = event.get("item", {})
+                    item_type = item.get("type", "")
+                    step_num += 1
+
+                    if item_type == "reasoning":
+                        text = item.get("text", "")
+                        summary_len = max_output_len if full else 100
+                        trajectory.append({
+                            "turn": turn,
+                            "step": step_num,
+                            "type": "reasoning",
+                            "summary": text[:summary_len] + ("..." if len(text) > summary_len else ""),
+                            "full_text": text if full else None,
+                        })
+
+                    elif item_type == "command_execution":
+                        cmd = item.get("command", "")
+                        output = item.get("aggregated_output", "")
+                        exit_code = item.get("exit_code")
+                        # Truncate output
+                        output_preview = output[:max_output_len]
+                        if len(output) > max_output_len:
+                            output_preview += "..."
+                        trajectory.append({
+                            "turn": turn,
+                            "step": step_num,
+                            "type": "command",
+                            "command": cmd,
+                            "output": output_preview.strip(),
+                            "exit_code": exit_code,
+                        })
+
+                    elif item_type == "function_call":
+                        func_name = item.get("name", "unknown")
+                        args = item.get("arguments", {})
+                        args_str = str(args)
+                        args_len = max_output_len if full else 100
+                        trajectory.append({
+                            "turn": turn,
+                            "step": step_num,
+                            "type": "tool_call",
+                            "tool": func_name,
+                            "args_preview": args_str[:args_len] + ("..." if len(args_str) > args_len else ""),
+                            "full_args": args if full else None,
+                        })
+
+                    elif item_type == "function_call_output":
+                        output = item.get("output", "")
+                        output_preview = output[:max_output_len]
+                        if len(output) > max_output_len:
+                            output_preview += "..."
+                        trajectory.append({
+                            "turn": turn,
+                            "step": step_num,
+                            "type": "tool_output",
+                            "output": output_preview,
+                        })
+
+                    elif item_type == "agent_message":
+                        text = item.get("text", "")
+                        summary_len = max_output_len if full else 200
+                        trajectory.append({
+                            "turn": turn,
+                            "step": step_num,
+                            "type": "message",
+                            "summary": text[:summary_len] + ("..." if len(text) > summary_len else ""),
+                            "full_text": text if full else None,
+                            "full_length": len(text),
+                        })
+
+        return trajectory
+
     def cleanup_completed(self, keep_days: int = 7) -> int:
         """
         Remove old completed/failed/killed sessions.

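A usage sketch of the new manager API, assuming a `.zwarm` state directory with at least one recorded session (the session id below is a placeholder):

```python
from pathlib import Path

from zwarm.sessions import CodexSessionManager

manager = CodexSessionManager(Path(".") / ".zwarm")

# Summary view: previews truncated to max_output_len (default 200 chars).
for step in manager.get_trajectory("abc123", full=False):
    kind = step["type"]  # "reasoning" | "command" | "tool_call" | "tool_output" | "message"
    if kind == "command":
        print(f'T{step["turn"]}.{step["step"]:02d} $ {step["command"]} (exit {step["exit_code"]})')
    elif kind in ("reasoning", "message"):
        print(f'T{step["turn"]}.{step["step"]:02d} {kind}: {step["summary"]}')

# Full view: untruncated content arrives in "full_text" / "full_args".
full_steps = manager.get_trajectory("abc123", full=True)
```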
{zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/tools/delegation.py

@@ -33,8 +33,12 @@ def _get_session_manager(orchestrator: "Orchestrator"):
 
     Both `zwarm interactive` and `zwarm orchestrate` use the same session manager.
     The orchestrator is just another user that happens to be an LLM.
+
+    The session manager is created eagerly in Orchestrator.model_post_init()
+    and shared with the environment for observe() visibility.
     """
-
+    # Should already exist from model_post_init, but create if not
+    if not hasattr(orchestrator, "_session_manager") or orchestrator._session_manager is None:
         from zwarm.sessions import CodexSessionManager
         orchestrator._session_manager = CodexSessionManager(orchestrator.working_dir / ".zwarm")
     return orchestrator._session_manager

@@ -83,6 +87,14 @@ def _format_session_header(session) -> str:
     return f"[{session.short_id}] codex ({session.status.value})"
 
 
+def _get_total_tokens(session) -> int:
+    """Get total tokens, computing from input+output if not present."""
+    usage = session.token_usage
+    if "total_tokens" in usage:
+        return usage["total_tokens"]
+    return usage.get("input_tokens", 0) + usage.get("output_tokens", 0)
+
+
 def _validate_working_dir(
     requested_dir: Path | str | None,
     default_dir: Path,

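The helper exists because aggregated codex usage does not always carry a precomputed total; the fallback is simply input plus output tokens:

```python
# Illustration of the fallback path, with a usage dict that lacks "total_tokens".
usage = {"input_tokens": 1200, "output_tokens": 350}
total = usage.get("total_tokens", usage.get("input_tokens", 0) + usage.get("output_tokens", 0))
assert total == 1550
```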
@@ -238,6 +250,25 @@ def delegate(
                 response_text = msg.content
                 break # Take first assistant message
 
+        # Build log path for debugging
+        log_path = str(manager._output_path(session.id, session.turn))
+
+        # Check if session failed
+        from zwarm.sessions import SessionStatus
+        if session.status == SessionStatus.FAILED:
+            return {
+                "success": False,
+                "session": _format_session_header(session),
+                "session_id": session.id,
+                "status": "failed",
+                "task": _truncate(task, 100),
+                "error": session.error or "Unknown error",
+                "response": response_text or "(no response captured)",
+                "tokens": _get_total_tokens(session),
+                "log_file": log_path,
+                "hint": "Check log_file for raw codex output. Use bash('cat <log_file>') to inspect.",
+            }
+
         return {
             "success": True,
             "session": _format_session_header(session),

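Because the failure dict now carries `log_file`, a caller can jump straight to the raw codex output. A hedged sketch of how a caller might react to this return value; `bash` here is a placeholder for whatever shell tool the orchestrator exposes, as the hint string suggests:

```python
from typing import Callable


def handle_delegate_result(result: dict, bash: Callable[[str], str]) -> str:
    # Happy path: pass the assistant's response text along.
    if result.get("success"):
        return result["response"]
    # Failure path: surface the error and pull the raw codex log for debugging.
    raw_log = bash(f"cat {result['log_file']}")
    return f"delegate failed: {result['error']}\n--- raw codex log ---\n{raw_log}"
```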
@@ -245,7 +276,8 @@ def delegate(
             "status": session.status.value,
             "task": _truncate(task, 100),
             "response": response_text or "(no response captured)",
-            "tokens": session
+            "tokens": _get_total_tokens(session),
+            "log_file": log_path,
             "hint": "Use converse(session_id, message) to send follow-up messages",
         }
     else:

@@ -382,7 +414,7 @@ def converse(
         "turn": session.turn,
         "you_said": _truncate(message, 100),
         "response": response_text or "(no response captured)",
-        "tokens": session
+        "tokens": _get_total_tokens(session),
     }
 
 

@@ -423,7 +455,10 @@ def check_session(
             response_text = msg.content
             break
 
-
+    # Build log path
+    log_path = str(manager._output_path(session.id, session.turn))
+
+    result = {
         "success": True,
         "session": _format_session_header(session),
         "session_id": session_id,

@@ -433,10 +468,19 @@
         "message_count": len(messages),
         "task": _truncate(session.task, 80),
         "response": _truncate(response_text, 500) if response_text else "(no response yet)",
-        "tokens": session
+        "tokens": _get_total_tokens(session),
         "runtime": session.runtime,
+        "log_file": log_path,
     }
 
+    # Add error info if failed
+    from zwarm.sessions import SessionStatus
+    if session.status == SessionStatus.FAILED:
+        result["success"] = False
+        result["error"] = session.error or "Unknown error"
+
+    return result
+
 
 @weaveTool
 def peek_session(

@@ -477,6 +521,81 @@ def peek_session(
     }
 
 
+@weaveTool
+def get_trajectory(
+    self: "Orchestrator",
+    session_id: str,
+    full: bool = False,
+) -> dict[str, Any]:
+    """
+    Get the full trajectory of a session - all steps the agent took.
+
+    Shows reasoning, commands, tool calls, and responses in order.
+    Useful for understanding HOW the agent completed a task, not just
+    the final result.
+
+    Args:
+        session_id: The session to get trajectory for.
+        full: If True, include full untruncated content (default: False for summary view).
+
+    Returns:
+        {steps: [...], step_count}
+    """
+    manager = _get_session_manager(self)
+
+    session = manager.get_session(session_id)
+    if not session:
+        return {"success": False, "error": f"Unknown session: {session_id}"}
+
+    trajectory = manager.get_trajectory(session_id, full=full)
+
+    # Format steps for easy reading
+    formatted_steps = []
+    for step in trajectory:
+        step_type = step.get("type", "unknown")
+
+        if step_type == "reasoning":
+            text = step.get("full_text") if full else step.get("summary", "")
+            formatted_steps.append(f"[thinking] {text}")
+        elif step_type == "command":
+            cmd = step.get("command", "")
+            output = step.get("output", "")
+            exit_code = step.get("exit_code")
+            step_str = f"[command] $ {cmd}"
+            if output:
+                if full:
+                    step_str += f"\n → {output}"
+                else:
+                    step_str += f"\n → {output[:100]}{'...' if len(output) > 100 else ''}"
+            if exit_code and exit_code != 0:
+                step_str += f" (exit: {exit_code})"
+            formatted_steps.append(step_str)
+        elif step_type == "tool_call":
+            if full and step.get("full_args"):
+                import json
+                args_str = json.dumps(step["full_args"], indent=2)
+                formatted_steps.append(f"[tool] {step.get('tool', 'unknown')}\n {args_str}")
+            else:
+                formatted_steps.append(f"[tool] {step.get('tool', 'unknown')}({step.get('args_preview', '')})")
+        elif step_type == "tool_output":
+            output = step.get("output", "")
+            if not full:
+                output = output[:100]
+            formatted_steps.append(f"[result] {output}")
+        elif step_type == "message":
+            text = step.get("full_text") if full else step.get("summary", "")
+            formatted_steps.append(f"[response] {text}")
+
+    return {
+        "success": True,
+        "session_id": session.short_id,
+        "task": _truncate(session.task, 80),
+        "step_count": len(trajectory),
+        "steps": formatted_steps,
+        "mode": "full" if full else "summary",
+    }
+
+
 @weaveTool
 def end_session(
     self: "Orchestrator",

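If the @weaveTool decorator preserves the plain call signature, the tool can also be exercised directly against an orchestrator instance; a sketch with a placeholder session id:

```python
# orchestrator: an existing zwarm Orchestrator instance
result = get_trajectory(orchestrator, "abc123", full=False)
if result["success"]:
    print(f'{result["session_id"]}: {result["task"]} '
          f'({result["step_count"]} steps, {result["mode"]} view)')
    for line in result["steps"]:
        # Pre-formatted lines: "[thinking] ...", "[command] $ ...", "[tool] ...", "[response] ..."
        print(line)
else:
    print(result["error"])  # e.g. "Unknown session: abc123"
```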
@@ -539,7 +658,7 @@ def end_session(
         "status": session.status.value,
         "reason": reason or "ended by orchestrator",
         "turn": session.turn,
-        "tokens": session
+        "tokens": _get_total_tokens(session),
     }
 
 

@@ -646,7 +765,7 @@ def list_sessions(
             "updated_secs": int(updated_secs),
             "last_message": _truncate(last_message, 100) if last_message else "(no response yet)",
             "needs_attention": needs_attention,
-            "tokens": s
+            "tokens": _get_total_tokens(s),
         })
 
     # Summary counts

{zwarm-2.0.2 → zwarm-2.3.5}/src/zwarm/watchers/__init__.py

@@ -11,6 +11,10 @@ from zwarm.watchers.manager import WatcherManager, WatcherConfig, build_watcher_
 
 # Import built-in watchers to register them
 from zwarm.watchers import builtin as _builtin  # noqa: F401
+from zwarm.watchers import llm_watcher as _llm_watcher  # noqa: F401
+
+# Export trajectory compression utility
+from zwarm.watchers.llm_watcher import compress_trajectory
 
 __all__ = [
     "Watcher",

@@ -23,4 +27,5 @@ __all__ = [
     "get_watcher",
     "list_watchers",
     "build_watcher_manager",
+    "compress_trajectory",
 ]

zwarm-2.3.5/src/zwarm/watchers/llm_watcher.py (new file)

@@ -0,0 +1,319 @@
+"""
+LLM-based watcher for nuanced trajectory analysis.
+
+Unlike rule-based watchers, this watcher uses a language model to assess
+the orchestrator's trajectory and provide context-aware guidance.
+
+The watcher compresses the full message history into a compact trajectory
+representation (similar to what Codex shows in its UI) to minimize token
+usage while preserving the "shape" of the agent's behavior.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from typing import Any
+
+from zwarm.watchers.base import Watcher, WatcherContext, WatcherResult
+from zwarm.watchers.registry import register_watcher
+
+logger = logging.getLogger(__name__)
+
+
+def _get_field(item: Any, name: str, default: Any = None) -> Any:
+    """Get field from dict or object."""
+    if isinstance(item, dict):
+        return item.get(name, default)
+    return getattr(item, name, default)
+
+
+def _extract_tool_call_summary(tc: Any) -> str:
+    """Extract a compact summary of a tool call."""
+    if isinstance(tc, dict):
+        func = tc.get("function", tc)
+        name = func.get("name", tc.get("name", "?"))
+        args = func.get("arguments", tc.get("arguments", ""))
+    else:
+        name = getattr(tc, "name", "?")
+        args = getattr(tc, "arguments", "")
+
+    # Parse args if JSON string
+    if isinstance(args, str):
+        try:
+            args = json.loads(args)
+        except (json.JSONDecodeError, TypeError):
+            pass
+
+    # Create compact arg summary
+    if isinstance(args, dict):
+        # Show key args based on tool type
+        if name == "delegate":
+            task = args.get("task", "")[:50]
+            mode = args.get("mode", "sync")
+            return f"delegate({mode}): {task}..."
+        elif name == "converse":
+            msg = args.get("message", "")[:40]
+            return f"converse: {msg}..."
+        elif name == "bash":
+            cmd = args.get("command", "")[:60]
+            return f"$ {cmd}"
+        elif name in ("check_session", "peek_session", "end_session"):
+            sid = args.get("session_id", "")[:8]
+            return f"{name}({sid})"
+        elif name == "list_sessions":
+            return "list_sessions()"
+        else:
+            # Generic: show first arg
+            first_val = next(iter(args.values()), "") if args else ""
+            if isinstance(first_val, str) and len(first_val) > 30:
+                first_val = first_val[:30] + "..."
+            return f"{name}({first_val})"
+    else:
+        return f"{name}({str(args)[:30]})"
+
+
+def compress_trajectory(messages: list[dict[str, Any]], max_steps: int = 50) -> str:
+    """
+    Compress full message history into a compact trajectory representation.
+
+    Output format (similar to Codex UI):
+    ```
+    [1] thinking: "preparing to inspect the codebase"
+        → delegate(sync): Add authentication to...
+    [2] thinking: "checking session status"
+        → check_session(abc123)
+    [3] thinking: "session completed, verifying"
+        → $ pytest tests/
+    ```
+
+    Args:
+        messages: Full message history from orchestrator
+        max_steps: Maximum steps to include (most recent)
+
+    Returns:
+        Compact trajectory string
+    """
+    steps = []
+    step_num = 0
+
+    for msg in messages:
+        role = _get_field(msg, "role", "")
+
+        if role == "system":
+            continue # Skip system messages
+
+        if role == "assistant":
+            step_num += 1
+            content = _get_field(msg, "content", "")
+            tool_calls = _get_field(msg, "tool_calls", [])
+
+            # Extract thinking/reasoning summary
+            thinking = ""
+            if content:
+                # Take first line or first 80 chars as "thinking"
+                first_line = content.split("\n")[0].strip()
+                if len(first_line) > 80:
+                    thinking = first_line[:80] + "..."
+                else:
+                    thinking = first_line
+
+            # Extract tool calls
+            actions = []
+            if tool_calls:
+                for tc in tool_calls[:3]: # Max 3 tool calls per step
+                    actions.append(_extract_tool_call_summary(tc))
+                if len(tool_calls) > 3:
+                    actions.append(f"... +{len(tool_calls) - 3} more")
+
+            # Format step
+            step_lines = [f"[{step_num}]"]
+            if thinking:
+                step_lines[0] += f' thinking: "{thinking}"'
+            for action in actions:
+                step_lines.append(f" → {action}")
+
+            steps.append("\n".join(step_lines))
+
+        elif role == "tool":
+            # Tool results - just note if error
+            content = str(_get_field(msg, "content", ""))
+            if "error" in content.lower() or "failed" in content.lower():
+                steps.append(f" ⚠ tool returned error")
+
+        elif role == "user" and step_num > 0:
+            # User message mid-conversation (watcher nudge, etc.)
+            content = _get_field(msg, "content", "")
+            if content and "[WATCHER" in content:
+                steps.append(f" 📍 watcher nudge")
+            elif content:
+                preview = content[:50].replace("\n", " ")
+                steps.append(f" 💬 user: {preview}...")
+
+    # Take most recent steps
+    if len(steps) > max_steps:
+        steps = ["... (earlier steps omitted)"] + steps[-max_steps:]
+
+    return "\n".join(steps)
+
+
+def _build_watcher_prompt(
+    trajectory: str,
+    task: str,
+    step: int,
+    max_steps: int,
+    session_summary: str,
+) -> str:
+    """Build the prompt for the LLM watcher."""
+    return f"""You are a trajectory watcher observing an orchestrator agent. Your job is to assess whether the agent is on track and provide guidance if needed.
+
+## Original Task
+{task}
+
+## Progress
+Step {step}/{max_steps}
+
+## Active Sessions
+{session_summary}
+
+## Trajectory (recent steps)
+{trajectory}
+
+---
+
+Analyze this trajectory and respond with a JSON object:
+{{
+    "status": "ok" | "concern" | "problem",
+    "assessment": "Brief 1-2 sentence assessment of trajectory health",
+    "guidance": "If status is concern/problem, specific actionable guidance. Otherwise null."
+}}
+
+Things to watch for:
+- Is the agent making progress toward the task?
+- Is it spinning or repeating actions?
+- Is it going off on tangents unrelated to the task?
+- Is it delegating appropriately or trying to do everything directly?
+- Are sessions being completed or just started and abandoned?
+
+Be concise. Only flag real issues, not minor inefficiencies."""
+
+
+@register_watcher("llm")
+class LLMWatcher(Watcher):
+    """
+    LLM-based watcher for nuanced trajectory analysis.
+
+    Uses a language model to assess the orchestrator's trajectory
+    and provide context-aware guidance that rule-based watchers can't.
+
+    Config options:
+        model: Model to use (default: gpt-4o-mini)
+        threshold: How often to run (every N steps, default: 5)
+        temperature: LLM temperature (default: 0.3)
+    """
+
+    name = "llm"
+    description = "LLM-based trajectory analysis for nuanced guidance"
+
+    async def observe(self, ctx: WatcherContext) -> WatcherResult:
+        config = self.config
+        threshold = config.get("threshold", 5)
+        model = config.get("model", "gpt-4o-mini")
+        temperature = config.get("temperature", 0.3)
+
+        # Only run every N steps to save costs
+        if ctx.step % threshold != 0 or ctx.step == 0:
+            return WatcherResult.ok()
+
+        try:
+            # Compress trajectory
+            trajectory = compress_trajectory(ctx.messages)
+
+            # Build session summary
+            active = [s for s in ctx.sessions if s.get("status") == "running"]
+            completed = [s for s in ctx.sessions if s.get("status") == "completed"]
+            failed = [s for s in ctx.sessions if s.get("status") == "failed"]
+            session_summary = f"{len(active)} running, {len(completed)} completed, {len(failed)} failed"
+
+            # Build prompt
+            prompt = _build_watcher_prompt(
+                trajectory=trajectory,
+                task=ctx.task,
+                step=ctx.step,
+                max_steps=ctx.max_steps,
+                session_summary=session_summary,
+            )
+
+            # Call LLM
+            response = await self._call_llm(prompt, model, temperature)
+
+            # Parse response
+            result = self._parse_response(response)
+
+            if result["status"] == "ok":
+                return WatcherResult.ok()
+            elif result["status"] == "concern":
+                return WatcherResult.nudge(
+                    guidance=result.get("guidance", result["assessment"]),
+                    reason=f"LLM assessment: {result['assessment']}",
+                    metadata={"llm_response": result},
+                )
+            else: # problem
+                return WatcherResult.nudge(
+                    guidance=result.get("guidance", result["assessment"]),
+                    reason=f"LLM detected problem: {result['assessment']}",
+                    priority=10, # Higher priority for problems
+                    metadata={"llm_response": result},
+                )
+
+        except Exception as e:
+            logger.warning(f"LLM watcher failed: {e}")
+            return WatcherResult.ok() # Don't block on watcher failure
+
+    async def _call_llm(self, prompt: str, model: str, temperature: float) -> str:
+        """Call the LLM using OpenAI Responses API."""
+        import openai
+
+        client = openai.AsyncOpenAI()
+
+        # Use Responses API (consistent with wbal)
+        response = await client.responses.create(
+            model=model,
+            input=[{"role": "user", "content": prompt}],
+            temperature=temperature,
+            text={"format": {"type": "json_object"}},
+        )
+
+        # Extract text from response
+        output_text = getattr(response, "output_text", None)
+        if output_text:
+            return output_text
+
+        # Fallback: look through output items
+        for item in getattr(response, "output", []):
+            if getattr(item, "type", None) == "message":
+                for content in getattr(item, "content", []):
+                    if getattr(content, "type", None) == "output_text":
+                        return getattr(content, "text", "{}")
+            # Also check for direct text attribute
+            text = getattr(item, "text", None)
+            if text:
+                return text
+
+        return "{}"
+
+    def _parse_response(self, response: str) -> dict[str, Any]:
+        """Parse LLM response JSON."""
+        try:
+            result = json.loads(response)
+            # Validate required fields
+            if "status" not in result:
+                result["status"] = "ok"
+            if "assessment" not in result:
+                result["assessment"] = "No assessment provided"
+            return result
+        except json.JSONDecodeError:
+            return {
+                "status": "ok",
+                "assessment": "Failed to parse LLM response",
+            }

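compress_trajectory is a pure function over chat-style message dicts, so it can be tried in isolation. A sketch with fabricated messages (the exact message shape produced by the orchestrator may differ):

```python
from zwarm.watchers import compress_trajectory

messages = [
    {"role": "system", "content": "You are the orchestrator."},  # skipped by the compressor
    {
        "role": "assistant",
        "content": "Delegating the auth work to a codex session.",
        "tool_calls": [
            {"function": {"name": "delegate",
                          "arguments": '{"task": "Add authentication", "mode": "sync"}'}}
        ],
    },
    {"role": "tool", "content": '{"success": true, "status": "completed"}'},
    {
        "role": "assistant",
        "content": "Session finished; verifying with tests.",
        "tool_calls": [{"function": {"name": "bash", "arguments": '{"command": "pytest tests/"}'}}],
    },
]

print(compress_trajectory(messages, max_steps=50))
# Roughly:
# [1] thinking: "Delegating the auth work to a codex session."
#  → delegate(sync): Add authentication...
# [2] thinking: "Session finished; verifying with tests."
#  → $ pytest tests/
```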