zwarm 1.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,968 @@
+"""
+Codex MCP adapter for sync conversations.
+
+Uses codex mcp-server for true iterative conversations:
+- codex() to start a session with conversationId
+- codex-reply() to continue the conversation
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import queue
+import subprocess
+import threading
+import time
+from pathlib import Path
+from typing import Any, Literal
+
+import weave
+
+from zwarm.adapters.base import ExecutorAdapter
+from zwarm.adapters.registry import register_adapter
+from zwarm.core.models import (
+    ConversationSession,
+    SessionMode,
+    SessionStatus,
+)
+
+logger = logging.getLogger(__name__)
+
+
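For orientation, the flow named in the module docstring reduces to two MCP tools/call requests over the server's stdio. A minimal sketch of those payloads, using the tool names and argument keys this adapter sends (ids and values are placeholders, not part of the package):

start_request = {
    "jsonrpc": "2.0",
    "id": 1,  # illustrative id
    "method": "tools/call",
    "params": {
        "name": "codex",
        "arguments": {
            "prompt": "Refactor utils.py",      # placeholder task
            "cwd": "/tmp/repo",                 # placeholder working directory
            "sandbox": "workspace-write",
        },
    },
}

# The final result of the first call carries a conversationId, which a follow-up turn reuses:
reply_request = {
    "jsonrpc": "2.0",
    "id": 2,  # illustrative id
    "method": "tools/call",
    "params": {
        "name": "codex-reply",
        "arguments": {"conversationId": "<id from the first call>", "prompt": "Now add tests."},
    },
}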
+class MCPClient:
+    """
+    Robust MCP client for communicating with codex mcp-server.
+
+    Uses subprocess.Popen (NOT asyncio.subprocess) to avoid being tied to
+    any specific event loop. This allows the MCP server to stay alive across
+    multiple asyncio.run() calls, preserving conversation state.
+
+    Uses dedicated reader threads that queue lines, avoiding the race condition
+    of spawning new reader threads on timeout.
+    """
+
+    def __init__(self):
+        self._proc: subprocess.Popen | None = None
+        self._proc_pid: int | None = None  # Track PID to detect restarts
+        self._request_id = 0
+        self._initialized = False
+        self._stderr_thread: threading.Thread | None = None
+        self._stdout_thread: threading.Thread | None = None
+        self._stderr_lines: list[str] = []
+        self._stdout_queue: queue.Queue[str | None] = queue.Queue()
+        self._lock = threading.Lock()  # Protect writes only
+        self._start_count = 0  # Track how many times we've started
+
+    def start(self) -> None:
+        """Start the MCP server process."""
+        with self._lock:
+            if self._proc is not None and self._proc.poll() is None:
+                logger.debug(f"MCP server already running (pid={self._proc.pid}, start_count={self._start_count})")
+                return  # Already running
+
+            # Check if this is a restart (previous server died)
+            if self._proc_pid is not None:
+                logger.warning(
+                    f"MCP server restart detected! Previous pid={self._proc_pid}, "
+                    f"start_count={self._start_count}. All conversation state will be lost."
+                )
+
+            self._start_count += 1
+            logger.info(f"Starting codex mcp-server... (start_count={self._start_count})")
+            self._proc = subprocess.Popen(
+                ["codex", "mcp-server"],
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=False,  # Binary mode for explicit encoding control
+            )
+            self._proc_pid = self._proc.pid
+            self._initialized = False
+            self._stderr_lines = []
+            self._stdout_queue = queue.Queue()  # Fresh queue
+
+            # Start background thread to read stderr
+            self._stderr_thread = threading.Thread(
+                target=self._read_stderr_loop,
+                daemon=True,
+                name="mcp-stderr-reader",
+            )
+            self._stderr_thread.start()
+
+            # Start background thread to read stdout into queue
+            self._stdout_thread = threading.Thread(
+                target=self._read_stdout_loop,
+                daemon=True,
+                name="mcp-stdout-reader",
+            )
+            self._stdout_thread.start()
+
+            logger.info(f"MCP server started (pid={self._proc.pid})")
+
+    def _read_stderr_loop(self) -> None:
+        """Background thread to read stderr and log errors."""
+        if not self._proc or not self._proc.stderr:
+            return
+        try:
+            while True:
+                line = self._proc.stderr.readline()
+                if not line:
+                    break
+                decoded = line.decode().strip()
+                if decoded:
+                    self._stderr_lines.append(decoded)
+                    # Keep only last 100 lines
+                    if len(self._stderr_lines) > 100:
+                        self._stderr_lines = self._stderr_lines[-100:]
+                    # Log errors prominently
+                    if "error" in decoded.lower() or "ERROR" in decoded:
+                        logger.error(f"[MCP stderr] {decoded}")
+                    else:
+                        logger.debug(f"[MCP stderr] {decoded}")
+        except Exception as e:
+            logger.warning(f"stderr reader stopped: {e}")
+
+    def _read_stdout_loop(self) -> None:
+        """Background thread to read stdout and queue lines."""
+        if not self._proc or not self._proc.stdout:
+            return
+        try:
+            while True:
+                line = self._proc.stdout.readline()
+                if not line:
+                    # EOF - signal end
+                    self._stdout_queue.put(None)
+                    break
+                decoded = line.decode()
+                self._stdout_queue.put(decoded)
+        except Exception as e:
+            logger.warning(f"stdout reader stopped: {e}")
+            self._stdout_queue.put(None)  # Signal error
+
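The class docstring's point about dedicated reader threads can be shown in isolation: one long-lived thread owns the blocking readline() and feeds a queue, so a timed-out get() means "nothing yet" rather than a lost line. A minimal standalone sketch (throwaway child process, not part of the package):

import queue
import subprocess
import threading

proc = subprocess.Popen(
    ["python3", "-c", "import time; time.sleep(2); print('late line', flush=True)"],
    stdout=subprocess.PIPE,
)
lines = queue.Queue()

def pump() -> None:
    # Single owner of the blocking readline(); EOF is signalled with None.
    for raw in iter(proc.stdout.readline, b""):
        lines.put(raw)
    lines.put(None)

threading.Thread(target=pump, daemon=True).start()

try:
    item = lines.get(timeout=0.5)   # times out: the line is not lost, just not here yet
except queue.Empty:
    item = lines.get(timeout=5.0)   # the same reader thread delivers it once the child prints
print(item)                         # b'late line\n'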
+    def _next_id(self) -> int:
+        self._request_id += 1
+        return self._request_id
+
+    def _write(self, data: str) -> None:
+        """Write to stdin with error handling."""
+        if not self._proc or not self._proc.stdin:
+            raise RuntimeError("MCP server not running")
+        if self._proc.poll() is not None:
+            raise RuntimeError(f"MCP server died (exit code {self._proc.returncode})")
+
+        self._proc.stdin.write(data.encode())
+        self._proc.stdin.flush()
+
+    def _read_line(self, timeout: float = 120.0) -> str:
+        """
+        Read a line from the stdout queue with timeout.
+
+        Uses a dedicated reader thread that queues lines, so we never
+        lose data on timeout - we just haven't received it yet.
+        """
+        if not self._proc:
+            raise RuntimeError("MCP server not running")
+
+        try:
+            line = self._stdout_queue.get(timeout=timeout)
+        except queue.Empty:
+            # Timeout - check process health
+            if self._proc.poll() is not None:
+                stderr_context = "\n".join(self._stderr_lines[-10:]) if self._stderr_lines else "(no stderr)"
+                raise RuntimeError(
+                    f"MCP server died (exit code {self._proc.returncode}).\n"
+                    f"Recent stderr:\n{stderr_context}"
+                )
+            # Process still alive, just slow - return empty to let caller decide
+            return ""
+
+        if line is None:
+            # EOF or error from reader thread
+            stderr_context = "\n".join(self._stderr_lines[-10:]) if self._stderr_lines else "(no stderr)"
+            if self._proc.poll() is not None:
+                raise RuntimeError(
+                    f"MCP server exited (code {self._proc.returncode}).\n"
+                    f"Recent stderr:\n{stderr_context}"
+                )
+            raise RuntimeError(f"MCP stdout closed unexpectedly.\nRecent stderr:\n{stderr_context}")
+
+        return line
+
+    def _check_alive(self) -> None:
+        """Check if the MCP server is still alive, raise if not."""
+        if not self._proc:
+            raise RuntimeError("MCP server not started")
+        if self._proc.poll() is not None:
+            stderr_context = "\n".join(self._stderr_lines[-10:]) if self._stderr_lines else "(no stderr)"
+            raise RuntimeError(
+                f"MCP server died (exit code {self._proc.returncode}).\n"
+                f"Recent stderr:\n{stderr_context}"
+            )
+
+    def initialize(self) -> dict:
+        """Initialize MCP connection."""
+        self._check_alive()
+
+        request = {
+            "jsonrpc": "2.0",
+            "id": self._next_id(),
+            "method": "initialize",
+            "params": {
+                "protocolVersion": "2024-11-05",
+                "capabilities": {},
+                "clientInfo": {"name": "zwarm", "version": "0.1.0"},
+            },
+        }
+        with self._lock:
+            self._write(json.dumps(request) + "\n")
+
+        response_line = self._read_line(timeout=30.0)
+        if not response_line:
+            raise RuntimeError("No response from MCP server during init")
+
+        response = json.loads(response_line)
+        if "error" in response:
+            raise RuntimeError(f"MCP init error: {response['error']}")
+
+        # Send initialized notification
+        notif = {"jsonrpc": "2.0", "method": "notifications/initialized"}
+        with self._lock:
+            self._write(json.dumps(notif) + "\n")
+
+        self._initialized = True
+        logger.info("MCP connection initialized")
+        return response
+
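On the wire, the handshake implemented above is newline-delimited JSON: each message is one json.dumps(...) plus "\n". Roughly, the two client-to-server lines look like this (the id is illustrative; the server's single-line reply is read back the same way and only checked for an "error" key):

import json

init_line = json.dumps({
    "jsonrpc": "2.0",
    "id": 1,  # illustrative id
    "method": "initialize",
    "params": {
        "protocolVersion": "2024-11-05",
        "capabilities": {},
        "clientInfo": {"name": "zwarm", "version": "0.1.0"},
    },
}) + "\n"

initialized_line = json.dumps({"jsonrpc": "2.0", "method": "notifications/initialized"}) + "\n"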
+    def call_tool(self, name: str, arguments: dict, timeout: float = 300.0) -> dict:
+        """
+        Call an MCP tool and collect streaming events.
+
+        Args:
+            name: Tool name (codex, codex-reply)
+            arguments: Tool arguments
+            timeout: Overall timeout for the call (default 5 min)
+        """
+        self._check_alive()
+
+        if not self._initialized:
+            self.initialize()
+
+        request_id = self._next_id()
+        request = {
+            "jsonrpc": "2.0",
+            "id": request_id,
+            "method": "tools/call",
+            "params": {"name": name, "arguments": arguments},
+        }
+
+        logger.debug(f"Calling MCP tool: {name} with args: {list(arguments.keys())}")
+        with self._lock:
+            self._write(json.dumps(request) + "\n")
+
+        # Collect streaming events until final result
+        # Reader thread queues lines, we pull from queue with timeout
+        session_id = None
+        conversation_id = None  # Track conversation ID separately
+        agent_messages: list[str] = []
+        streaming_text: list[str] = []  # Accumulate streaming delta text
+        final_result = None
+        token_usage: dict[str, Any] = {}  # Track token usage
+        start_time = time.time()
+        all_events: list[dict] = []  # Keep ALL events for debugging
+
+        for event_count in range(1000):  # Safety limit on events
+            self._check_alive()
+
+            # Check overall timeout
+            elapsed = time.time() - start_time
+            if elapsed > timeout:
+                raise RuntimeError(f"MCP call timed out after {timeout}s ({event_count} events received)")
+
+            # Read from queue with per-event timeout
+            # Empty string = timeout (process still alive, just waiting)
+            # None sentinel is handled inside _read_line (raises RuntimeError)
+            line = self._read_line(timeout=30.0)
+
+            if not line:
+                # Timeout waiting for event - process is still alive, just slow
+                # This is normal during long codex operations
+                logger.debug(f"Waiting for MCP event... (elapsed: {elapsed:.0f}s, events: {event_count})")
+                continue
+
+            try:
+                event = json.loads(line)
+                all_events.append(event)  # Keep for debugging
+            except json.JSONDecodeError as e:
+                logger.warning(f"Invalid JSON from MCP: {line[:100]}... - {e}")
+                continue
+
+            # Check for final result (has matching id)
+            if event.get("id") == request_id:
+                if "result" in event:
+                    final_result = event["result"]
+                    # Extract conversation ID from final result
+                    if isinstance(final_result, dict):
+                        conversation_id = final_result.get("conversationId") or final_result.get("conversation_id")
+                    logger.debug(f"Got final result after {event_count} events, conversation_id={conversation_id}")
+                    break
+                elif "error" in event:
+                    error = event["error"]
+                    raise RuntimeError(f"MCP tool error: {error.get('message', error)}")
+
+            # Process streaming events
+            if event.get("method") == "codex/event":
+                params = event.get("params", {})
+                msg = params.get("msg", {})
+                msg_type = msg.get("type")
+
+                # Log ALL event types to help debug missing messages
+                logger.debug(f"MCP event: type={msg_type}, keys={list(msg.keys())}")
+
+                if msg_type == "session_configured":
+                    session_id = msg.get("session_id")
+                    logger.debug(f"Session configured: {session_id}")
+
+                elif msg_type == "item_completed":
+                    item = msg.get("item", {})
+                    item_type = item.get("type")
+
+                    # Log ALL item_completed events to help debug
+                    logger.debug(f"item_completed: type={item_type}, keys={list(item.keys())}")
+
+                    # Agent text responses - codex uses "AgentMessage" type
+                    if item_type == "AgentMessage":
+                        content = item.get("content", [])
+                        for block in content:
+                            if isinstance(block, dict) and block.get("text"):
+                                agent_messages.append(block["text"])
+                            elif isinstance(block, str):
+                                agent_messages.append(block)
+
+                    # Also check for "agent_message" (lowercase) variant
+                    elif item_type == "agent_message":
+                        text = item.get("text", "") or item.get("message", "")
+                        if text:
+                            agent_messages.append(text)
+                        # Also check content array
+                        content = item.get("content", [])
+                        for block in content:
+                            if isinstance(block, dict) and block.get("text"):
+                                agent_messages.append(block["text"])
+                            elif isinstance(block, str):
+                                agent_messages.append(block)
+
+                    # Legacy format check
+                    elif item_type == "message" and item.get("role") == "assistant":
+                        content = item.get("content", [])
+                        for block in content:
+                            if isinstance(block, dict) and block.get("text"):
+                                agent_messages.append(block["text"])
+                            elif isinstance(block, str):
+                                agent_messages.append(block)
+
+                    # Generic message type - check for text/content
+                    elif item_type == "message":
+                        text = item.get("text", "")
+                        if text:
+                            agent_messages.append(text)
+                        content = item.get("content", [])
+                        if isinstance(content, str):
+                            agent_messages.append(content)
+                        elif isinstance(content, list):
+                            for block in content:
+                                if isinstance(block, dict) and block.get("text"):
+                                    agent_messages.append(block["text"])
+                                elif isinstance(block, str):
+                                    agent_messages.append(block)
+
+                    # Function call outputs (for context)
+                    elif item_type == "function_call_output":
+                        output = item.get("output", "")
+                        if output and len(output) < 1000:
+                            agent_messages.append(f"[Tool output]: {output[:500]}")
+
+                    # Log other item types we're not handling
+                    elif item_type not in ("function_call", "tool_call", "UserMessage", "user_message"):
+                        logger.debug(f"Unhandled item_completed type: {item_type}, item={item}")
+
+                elif msg_type == "agent_message":
+                    # Direct agent message event
+                    message = msg.get("message", "")
+                    if message:
+                        agent_messages.append(message)
+
+                elif msg_type in ("task_complete", "task_completed"):
+                    # Task is done - capture last_agent_message as fallback
+                    last_msg = msg.get("last_agent_message")
+                    if last_msg and last_msg not in agent_messages:
+                        agent_messages.append(last_msg)
+                    logger.debug(f"Task complete after {event_count} events")
+                    break
+
+                elif msg_type == "token_count":
+                    # Capture token usage for cost tracking
+                    info = msg.get("info") or {}
+                    if info:
+                        usage = info.get("total_token_usage", {})
+                        if usage:
+                            token_usage = {
+                                "input_tokens": usage.get("input_tokens", 0),
+                                "output_tokens": usage.get("output_tokens", 0),
+                                "cached_input_tokens": usage.get("cached_input_tokens", 0),
+                                "reasoning_tokens": usage.get("reasoning_output_tokens", 0),
+                                "total_tokens": usage.get("total_tokens", 0),
+                            }
+                            logger.debug(f"Token usage: {token_usage}")
+
+                elif msg_type == "error":
+                    error_msg = msg.get("error", msg.get("message", str(msg)))
+                    raise RuntimeError(f"Codex error: {error_msg}")
+
+                # Handle streaming text events (various formats)
+                elif msg_type in ("text_delta", "content_block_delta", "message_delta"):
+                    delta = msg.get("delta", {})
+                    text = delta.get("text", "") or msg.get("text", "")
+                    if text:
+                        streaming_text.append(text)
+
+                elif msg_type == "text":
+                    text = msg.get("text", "")
+                    if text:
+                        streaming_text.append(text)
+
+                elif msg_type == "response":
+                    # Some versions send the full response this way
+                    response_text = msg.get("response", "") or msg.get("text", "")
+                    if response_text:
+                        agent_messages.append(response_text)
+
+                elif msg_type == "message":
+                    # Direct message event
+                    text = msg.get("text", "") or msg.get("content", "")
+                    if text:
+                        agent_messages.append(text)
+
+                else:
+                    # Log unknown event types at debug level to help diagnose
+                    if msg_type and msg_type not in ("session_started", "thinking", "tool_call", "function_call"):
+                        logger.debug(f"Unhandled MCP event type: {msg_type}, msg keys: {list(msg.keys())}")
+
+        # Merge streaming text into messages if we got any
+        if streaming_text:
+            full_streaming = "".join(streaming_text)
+            if full_streaming.strip():
+                agent_messages.append(full_streaming)
+                logger.debug(f"Captured {len(streaming_text)} streaming chunks ({len(full_streaming)} chars)")
+
+        # Try to extract content from final_result if we have no messages
+        if final_result and not agent_messages:
+            if "content" in final_result:
+                content = final_result["content"]
+                if isinstance(content, list):
+                    for block in content:
+                        if isinstance(block, dict) and block.get("text"):
+                            agent_messages.append(block["text"])
+                        elif isinstance(block, str):
+                            agent_messages.append(block)
+                elif isinstance(content, str):
+                    agent_messages.append(content)
+            # Also check for text field
+            if not agent_messages and "text" in final_result:
+                agent_messages.append(final_result["text"])
+
+        # Build result - prefer conversation_id from final result, fallback to session_id from events
+        effective_conversation_id = conversation_id or session_id
+        result = {
+            "conversationId": effective_conversation_id,
+            "messages": agent_messages,
+            "output": "\n".join(agent_messages) if agent_messages else "",
+            "usage": token_usage,  # Token usage for cost tracking
+        }
+
+        # Log detailed debug info if we didn't capture any messages
+        if not agent_messages:
+            event_types = [e.get("method") or f"id:{e.get('id')}" for e in all_events[:20]]
+            logger.warning(
+                f"MCP call returned no messages. "
+                f"conversation_id={effective_conversation_id}, "
+                f"session_id={session_id}, "
+                f"event_count={len(all_events)}, "
+                f"event_types={event_types}, "
+                f"final_result_keys={list(final_result.keys()) if final_result else 'None'}"
+            )
+            # Log codex/event details for debugging
+            codex_events = [e for e in all_events if e.get("method") == "codex/event"]
+            if codex_events:
+                for ce in codex_events[-5:]:  # Last 5 codex events
+                    msg = ce.get("params", {}).get("msg", {})
+                    logger.debug(f" codex/event: type={msg.get('type')}, keys={list(msg.keys())}")
+
+        logger.debug(f"MCP call complete: {len(agent_messages)} messages, conversation_id={effective_conversation_id}")
+        return result
+
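To make the event handling above concrete, a sketch of the kinds of lines the loop consumes for one tools/call. Only the fields the parser actually reads are shown; ids and values are placeholders, and real streams include other event types that are logged and skipped:

illustrative_stream = [
    # Streamed notifications, interleaved on the server's stdout:
    {"method": "codex/event", "params": {"msg": {"type": "session_configured", "session_id": "sess-1"}}},
    {"method": "codex/event", "params": {"msg": {"type": "agent_message", "message": "Working on it..."}}},
    {"method": "codex/event", "params": {"msg": {
        "type": "token_count",
        "info": {"total_token_usage": {"input_tokens": 900, "output_tokens": 150, "total_tokens": 1050}},
    }}},
    {"method": "codex/event", "params": {"msg": {"type": "task_complete", "last_agent_message": "Done."}}},
    # Final response, matched to the request by id, ends the loop:
    {"id": 7, "result": {"conversationId": "conv-abc"}},
]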
+    def close(self) -> None:
+        """Close the MCP connection gracefully."""
+        if self._proc and self._proc.poll() is None:
+            logger.info("Terminating MCP server...")
+            self._proc.terminate()
+            try:
+                self._proc.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                logger.warning("MCP server didn't terminate, killing...")
+                self._proc.kill()
+                self._proc.wait()
+
+        self._proc = None
+        self._initialized = False
+
+    @property
+    def is_alive(self) -> bool:
+        """Check if the MCP server is running."""
+        return self._proc is not None and self._proc.poll() is None
+
+
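A minimal usage sketch of the client class above, assuming MCPClient is importable from the packaged module; the tool names and argument keys come from this module, while the task text and path are placeholders:

client = MCPClient()
client.start()  # spawns `codex mcp-server` and the reader threads

first = client.call_tool("codex", {
    "prompt": "Summarize the repo layout",  # placeholder task
    "cwd": "/tmp/repo",                     # placeholder working directory
    "sandbox": "workspace-write",
})

conv_id = first["conversationId"]
if conv_id:
    follow_up = client.call_tool("codex-reply", {
        "conversationId": conv_id,
        "prompt": "Now list the public entry points.",
    })
    print(follow_up["output"])

client.close()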
+@register_adapter("codex_mcp")
+class CodexMCPAdapter(ExecutorAdapter):
+    """
+    Codex adapter using MCP server for sync conversations.
+
+    This is the recommended way to have iterative conversations with Codex.
+    The MCP client uses subprocess.Popen (not asyncio) so it persists across
+    multiple asyncio.run() calls, preserving conversation state.
+    """
+    DEFAULT_MODEL = "gpt-5.1-codex-mini"  # Default codex model
+
+    def __init__(self, model: str | None = None):
+        self._model = model or self.DEFAULT_MODEL
+        self._mcp_client: MCPClient | None = None
+        self._sessions: dict[str, str] = {}  # session_id -> conversationId
+        # Cumulative token usage for cost tracking
+        self._total_usage: dict[str, int] = {
+            "input_tokens": 0,
+            "output_tokens": 0,
+            "cached_input_tokens": 0,
+            "reasoning_tokens": 0,
+            "total_tokens": 0,
+        }
+
+    def _accumulate_usage(self, usage: dict[str, Any]) -> None:
+        """Add usage to cumulative totals."""
+        if not usage:
+            return
+        for key in self._total_usage:
+            self._total_usage[key] += usage.get(key, 0)
+
+    @property
+    def total_usage(self) -> dict[str, int]:
+        """Get cumulative token usage across all calls."""
+        return self._total_usage.copy()
+
+    def _ensure_client(self) -> MCPClient:
+        """Ensure MCP client is running and return it."""
+        if self._mcp_client is None:
+            self._mcp_client = MCPClient()
+
+        if not self._mcp_client.is_alive:
+            self._mcp_client.start()
+
+        return self._mcp_client
+
+    @weave.op()
+    def _call_codex(
+        self,
+        task: str,
+        cwd: str,
+        sandbox: str,
+        model: str | None = None,
+    ) -> dict[str, Any]:
+        """
+        Call codex MCP tool - traced by Weave.
+
+        This is synchronous (uses subprocess.Popen, not asyncio) so the MCP
+        server persists across calls.
+        """
+        client = self._ensure_client()
+
+        args: dict[str, Any] = {
+            "prompt": task,
+            "cwd": cwd,
+            "sandbox": sandbox,
+        }
+        if model:
+            args["model"] = model
+
+        logger.info(f"Calling codex with task_len={len(task)}, cwd={cwd}, model={model or 'default'}")
+
+        result = client.call_tool("codex", args)
+
+        # Log the result structure
+        conversation_id = result.get("conversationId")
+        messages_count = len(result.get("messages", []))
+        output_len = len(result.get("output", ""))
+        usage = result.get("usage", {})
+
+        logger.info(
+            f"codex result: conversation_id={conversation_id}, "
+            f"messages_count={messages_count}, output_len={output_len}, "
+            f"usage={usage.get('total_tokens', 0)} tokens"
+        )
+
+        # Warn if we got a conversation ID but no messages (agent did work but we lost output)
+        if conversation_id and not messages_count and not output_len:
+            logger.warning(
+                f"codex returned conversation_id={conversation_id} but NO messages/output! "
+                f"The agent processed {usage.get('total_tokens', 0)} tokens but we didn't capture the response. "
+                f"This may indicate an issue with event parsing."
+            )
+
+        # Track usage
+        self._accumulate_usage(usage)
+
+        return {
+            "conversation_id": conversation_id,
+            "response": self._extract_response(result),
+            "raw_messages": result.get("messages", []),
+            "usage": usage,
+            "total_usage": self.total_usage,
+        }
+
+    @weave.op()
+    def _call_codex_reply(
+        self,
+        conversation_id: str,
+        message: str,
+    ) -> dict[str, Any]:
+        """
+        Call codex-reply MCP tool - traced by Weave.
+
+        This is synchronous (uses subprocess.Popen, not asyncio) so the MCP
+        server persists across calls.
+        """
+        client = self._ensure_client()
+
+        logger.info(f"Calling codex-reply with conversation_id={conversation_id}, message_len={len(message)}")
+        logger.debug(f"MCP client alive: {client.is_alive}, initialized: {client._initialized}")
+
+        result = client.call_tool("codex-reply", {
+            "conversationId": conversation_id,
+            "prompt": message,
+        })
+
+        # Log the full result structure for debugging
+        logger.info(
+            f"codex-reply result: conversationId={result.get('conversationId')}, "
+            f"messages_count={len(result.get('messages', []))}, "
+            f"output_len={len(result.get('output', ''))}, "
+            f"usage={result.get('usage', {}).get('total_tokens', 0)} tokens"
+        )
+
+        # Check for conversation loss - MCP returns empty result when session not found
+        if not result.get("messages") and not result.get("output"):
+            logger.error(
+                f"codex-reply returned empty result for conversation_id={conversation_id}. "
+                f"The MCP server may have lost the conversation state. Result: {result}"
+            )
+
+        # Track usage
+        usage = result.get("usage", {})
+        self._accumulate_usage(usage)
+
+        response = self._extract_response(result)
+        logger.debug(f"codex-reply response length: {len(response)} chars")
+
+        return {
+            "response": response,
+            "raw_messages": result.get("messages", []),
+            "usage": usage,
+            "total_usage": self.total_usage,
+            "conversation_lost": not result.get("messages") and not result.get("output"),
+        }
+
+    @weave.op()
+    async def start_session(
+        self,
+        task: str,
+        working_dir: Path,
+        mode: Literal["sync", "async"] = "sync",
+        model: str | None = None,
+        sandbox: str = "workspace-write",
+        **kwargs,
+    ) -> ConversationSession:
+        """Start a Codex session (sync or async mode)."""
+        effective_model = model or self._model
+        session = ConversationSession(
+            adapter=self.name,
+            mode=SessionMode(mode),
+            working_dir=working_dir,
+            task_description=task,
+            model=effective_model,
+        )
+
+        if mode == "sync":
+            # Use traced codex call (synchronous - MCP client persists across calls)
+            result = self._call_codex(
+                task=task,
+                cwd=str(working_dir.absolute()),
+                sandbox=sandbox,
+                model=effective_model,
+            )
+
+            # Extract conversation ID and response
+            session.conversation_id = result["conversation_id"]
+            if session.conversation_id:
+                self._sessions[session.id] = session.conversation_id
+                logger.debug(f"Session {session.id[:8]} mapped to conversation {session.conversation_id}")
+            else:
+                # This is bad - we won't be able to continue this conversation
+                logger.warning(
+                    f"Session {session.id[:8]} started but MCP didn't return a conversation ID. "
+                    "Further converse() calls will fail."
+                )
+
+            session.add_message("user", task)
+            session.add_message("assistant", result["response"])
+
+            # Track token usage on the session
+            session.add_usage(result.get("usage", {}))
+
+        else:
+            # Async mode: use codex exec (fire-and-forget)
+            # This runs in a subprocess without MCP, outputs JSONL events
+            cmd = [
+                "codex", "exec",
+                "--dangerously-bypass-approvals-and-sandbox",
+                "--skip-git-repo-check",
+                "--json",
+                "--model", effective_model,
+                "-C", str(working_dir.absolute()),  # Explicit working directory
+                "--", task,
+            ]
+
+            logger.info(f"Starting async codex: {' '.join(cmd[:8])}...")
+
+            proc = subprocess.Popen(
+                cmd,
+                cwd=working_dir,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+            )
+            session.process = proc
+            session.add_message("user", task)
+
+        return session
+
+    async def send_message(
+        self,
+        session: ConversationSession,
+        message: str,
+    ) -> str:
+        """Send a message to continue a sync conversation."""
+        if session.mode != SessionMode.SYNC:
+            raise ValueError("Cannot send message to async session")
+        if session.status != SessionStatus.ACTIVE:
+            raise ValueError(f"Session is not active: {session.status}")
+        if not session.conversation_id:
+            raise ValueError("Session has no conversation ID")
+
+        # Use traced codex-reply call (synchronous - MCP client persists across calls)
+        result = self._call_codex_reply(
+            conversation_id=session.conversation_id,
+            message=message,
+        )
+
+        response_text = result["response"]
+
+        # Check if conversation was lost
+        if result.get("conversation_lost"):
+            logger.warning(
+                f"Conversation {session.conversation_id} was lost. "
+                f"Session {session.id} will be marked as needing re-delegation."
+            )
+            # Mark the session as having a lost conversation so orchestrator can handle it
+            session.conversation_id = None  # Clear the stale ID
+
+        session.add_message("user", message)
+        session.add_message("assistant", response_text)
+
+        # Track token usage on the session
+        session.add_usage(result.get("usage", {}))
+
+        return response_text
+
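A sketch of driving the adapter across separate asyncio.run() calls, which is exactly the scenario the class docstring is designed for. The API calls are the ones defined above (assuming CodexMCPAdapter is importable from the packaged module); the model, task strings, and path are illustrative:

import asyncio
from pathlib import Path

adapter = CodexMCPAdapter(model="gpt-5.1-codex-mini")

# First asyncio.run(): starts the MCP server and the first turn.
session = asyncio.run(adapter.start_session(
    task="Audit error handling in the adapters package",  # placeholder task
    working_dir=Path("/tmp/repo"),                         # placeholder path
    mode="sync",
))

# Second asyncio.run(): the MCP server (plain subprocess.Popen) is still alive,
# so the conversationId recorded on the session remains valid.
reply = asyncio.run(adapter.send_message(session, "Focus on codex_mcp.py only."))
print(reply)

asyncio.run(adapter.cleanup())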
+    @weave.op()
+    def _parse_jsonl_output(self, stdout: str) -> dict[str, Any]:
+        """
+        Parse JSONL output from codex exec --json.
+
+        Returns dict with:
+        - response: The agent's message text
+        - usage: Token usage stats
+        - thread_id: The conversation thread ID
+        - events: All parsed events (for debugging)
+        """
+        response_parts = []
+        usage = {}
+        thread_id = None
+        events = []
+
+        for line in stdout.strip().split("\n"):
+            if not line.strip():
+                continue
+            try:
+                event = json.loads(line)
+                events.append(event)
+
+                event_type = event.get("type", "")
+
+                if event_type == "thread.started":
+                    thread_id = event.get("thread_id")
+
+                elif event_type == "item.completed":
+                    item = event.get("item", {})
+                    if item.get("type") == "agent_message":
+                        response_parts.append(item.get("text", ""))
+
+                elif event_type == "turn.completed":
+                    usage = event.get("usage", {})
+
+            except json.JSONDecodeError:
+                logger.warning(f"Failed to parse JSONL line: {line[:100]}")
+                continue
+
+        return {
+            "response": "\n".join(response_parts),
+            "usage": usage,
+            "thread_id": thread_id,
+            "events": events,
+        }
+
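The parser above only acts on three event types. A few illustrative codex exec --json lines (placeholder values; real output carries additional event types, which are simply recorded and ignored) and the shape they reduce to, assuming an adapter instance named adapter:

sample_stdout = "\n".join([
    '{"type": "thread.started", "thread_id": "thread-123"}',
    '{"type": "item.completed", "item": {"type": "agent_message", "text": "Done: added the tests."}}',
    '{"type": "turn.completed", "usage": {"input_tokens": 1200, "output_tokens": 340}}',
])

parsed = adapter._parse_jsonl_output(sample_stdout)
# parsed["response"]  -> "Done: added the tests."
# parsed["thread_id"] -> "thread-123"
# parsed["usage"]     -> {"input_tokens": 1200, "output_tokens": 340}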
+    @weave.op()
+    async def check_status(
+        self,
+        session: ConversationSession,
+    ) -> dict:
+        """Check status of an async session."""
+        if session.mode != SessionMode.ASYNC:
+            return {"status": session.status.value}
+
+        if session.process is None:
+            return {"status": "unknown", "error": "No process handle"}
+
+        # Check if process is still running
+        poll = session.process.poll()
+        if poll is None:
+            return {"status": "running"}
+
+        # Process finished - parse the JSONL output
+        stdout, stderr = session.process.communicate()
+
+        if poll == 0:
+            # Parse JSONL to extract actual response
+            parsed = self._parse_jsonl_output(stdout)
+            response_text = parsed["response"] or "(no response captured)"
+
+            # Add the response as a message
+            session.add_message("assistant", response_text)
+
+            # Track token usage
+            if parsed["usage"]:
+                session.add_usage({
+                    "input_tokens": parsed["usage"].get("input_tokens", 0),
+                    "output_tokens": parsed["usage"].get("output_tokens", 0),
+                    "total_tokens": (
+                        parsed["usage"].get("input_tokens", 0) +
+                        parsed["usage"].get("output_tokens", 0)
+                    ),
+                })
+
+            session.complete(response_text[:500])
+            return {
+                "status": "completed",
+                "response": response_text,
+                "usage": parsed["usage"],
+                "thread_id": parsed["thread_id"],
+            }
+        else:
+            # Try to parse stderr or stdout for error info
+            error_msg = stderr.strip() if stderr else f"Exit code: {poll}"
+
+            # Sometimes errors come through stdout as JSONL too
+            if stdout and not stderr:
+                try:
+                    parsed = self._parse_jsonl_output(stdout)
+                    if not parsed["response"]:
+                        error_msg = f"Process failed with no response. Exit code: {poll}"
+                except Exception:
+                    error_msg = stdout[:500] if stdout else f"Exit code: {poll}"
+
+            session.fail(error_msg[:500])
+            return {"status": "failed", "error": error_msg, "exit_code": poll}
+
+    async def stop(
+        self,
+        session: ConversationSession,
+    ) -> None:
+        """Stop a session."""
+        import subprocess
+
+        if session.process and session.process.poll() is None:
+            session.process.terminate()
+            try:
+                session.process.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                session.process.kill()
+
+        session.fail("Stopped by user")
+
+        # Remove from tracking
+        if session.id in self._sessions:
+            del self._sessions[session.id]
+
+    async def cleanup(self) -> None:
+        """Clean up MCP server."""
+        if self._mcp_client:
+            self._mcp_client.close()
+            self._mcp_client = None
+
+    def _extract_response(self, result: dict) -> str:
+        """Extract response text from MCP result."""
+        # Check for error indicators - empty result suggests lost conversation
+        if (
+            result.get("conversationId") is None
+            and not result.get("messages")
+            and not result.get("output")
+        ):
+            logger.warning(f"MCP returned empty result - conversation may be lost: {result}")
+            return "[ERROR] Conversation lost - the MCP server no longer has this session. Please re-delegate the task."
+
+        # First check for our collected output
+        if result.get("output"):
+            return result["output"]
+
+        # Check for messages list
+        if result.get("messages"):
+            return "\n".join(result["messages"])
+
+        # Result may have different structures depending on codex version
+        if "content" in result:
+            content = result["content"]
+            if isinstance(content, list):
+                texts = []
+                for block in content:
+                    if isinstance(block, dict) and "text" in block:
+                        texts.append(block["text"])
+                    elif isinstance(block, str):
+                        texts.append(block)
+                if texts:
+                    return "\n".join(texts)
+            elif isinstance(content, str):
+                return content
+
+        if "text" in result:
+            return result["text"]
+
+        # Fallback: stringify the result (but log it as unexpected)
+        logger.warning(f"Unexpected MCP result format, returning raw: {list(result.keys())}")
+        return json.dumps(result, indent=2)
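For reference, the precedence _extract_response applies, shown on illustrative inputs (an adapter instance is assumed):

adapter = CodexMCPAdapter()

# 1. The collected "output" string wins.
adapter._extract_response({"conversationId": "c1", "messages": ["hi"], "output": "joined text"})
# -> "joined text"

# 2. Otherwise the raw messages list is joined.
adapter._extract_response({"conversationId": "c1", "messages": ["first", "second"], "output": ""})
# -> "first\nsecond"

# 3. No conversationId, no messages, no output: treated as a lost conversation.
adapter._extract_response({"conversationId": None, "messages": [], "output": ""})
# -> "[ERROR] Conversation lost - ..."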