zwarm 1.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,968 @@
+"""
+Codex MCP adapter for sync conversations.
+
+Uses codex mcp-server for true iterative conversations:
+- codex() to start a session with conversationId
+- codex-reply() to continue the conversation
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import queue
+import subprocess
+import threading
+import time
+from pathlib import Path
+from typing import Any, Literal
+
+import weave
+
+from zwarm.adapters.base import ExecutorAdapter
+from zwarm.adapters.registry import register_adapter
+from zwarm.core.models import (
+    ConversationSession,
+    SessionMode,
+    SessionStatus,
+)
+
+logger = logging.getLogger(__name__)
+
+
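For orientation, the flow named in the module docstring reduces to two MCP tools/call requests over the server's stdio. A minimal sketch of those payloads, using the tool names and argument keys this adapter sends (ids and values are placeholders, not part of the package):

start_request = {
    "jsonrpc": "2.0",
    "id": 1,  # illustrative id
    "method": "tools/call",
    "params": {
        "name": "codex",
        "arguments": {
            "prompt": "Refactor utils.py",      # placeholder task
            "cwd": "/tmp/repo",                 # placeholder working directory
            "sandbox": "workspace-write",
        },
    },
}

# The final result of the first call carries a conversationId, which a follow-up turn reuses:
reply_request = {
    "jsonrpc": "2.0",
    "id": 2,  # illustrative id
    "method": "tools/call",
    "params": {
        "name": "codex-reply",
        "arguments": {"conversationId": "<id from the first call>", "prompt": "Now add tests."},
    },
}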
+class MCPClient:
+    """
+    Robust MCP client for communicating with codex mcp-server.
+
+    Uses subprocess.Popen (NOT asyncio.subprocess) to avoid being tied to
+    any specific event loop. This allows the MCP server to stay alive across
+    multiple asyncio.run() calls, preserving conversation state.
+
+    Uses dedicated reader threads that queue lines, avoiding the race condition
+    of spawning new reader threads on timeout.
+    """
+
+    def __init__(self):
+        self._proc: subprocess.Popen | None = None
+        self._proc_pid: int | None = None  # Track PID to detect restarts
+        self._request_id = 0
+        self._initialized = False
+        self._stderr_thread: threading.Thread | None = None
+        self._stdout_thread: threading.Thread | None = None
+        self._stderr_lines: list[str] = []
+        self._stdout_queue: queue.Queue[str | None] = queue.Queue()
+        self._lock = threading.Lock()  # Protect writes only
+        self._start_count = 0  # Track how many times we've started
+
+    def start(self) -> None:
+        """Start the MCP server process."""
+        with self._lock:
+            if self._proc is not None and self._proc.poll() is None:
+                logger.debug(f"MCP server already running (pid={self._proc.pid}, start_count={self._start_count})")
+                return  # Already running
+
+            # Check if this is a restart (previous server died)
+            if self._proc_pid is not None:
+                logger.warning(
+                    f"MCP server restart detected! Previous pid={self._proc_pid}, "
+                    f"start_count={self._start_count}. All conversation state will be lost."
+                )
+
+            self._start_count += 1
+            logger.info(f"Starting codex mcp-server... (start_count={self._start_count})")
+            self._proc = subprocess.Popen(
+                ["codex", "mcp-server"],
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=False,  # Binary mode for explicit encoding control
+            )
+            self._proc_pid = self._proc.pid
+            self._initialized = False
+            self._stderr_lines = []
+            self._stdout_queue = queue.Queue()  # Fresh queue
+
+            # Start background thread to read stderr
+            self._stderr_thread = threading.Thread(
+                target=self._read_stderr_loop,
+                daemon=True,
+                name="mcp-stderr-reader",
+            )
+            self._stderr_thread.start()
+
+            # Start background thread to read stdout into queue
+            self._stdout_thread = threading.Thread(
+                target=self._read_stdout_loop,
+                daemon=True,
+                name="mcp-stdout-reader",
+            )
+            self._stdout_thread.start()
+
+            logger.info(f"MCP server started (pid={self._proc.pid})")
+
+    def _read_stderr_loop(self) -> None:
+        """Background thread to read stderr and log errors."""
+        if not self._proc or not self._proc.stderr:
+            return
+        try:
+            while True:
+                line = self._proc.stderr.readline()
+                if not line:
+                    break
+                decoded = line.decode().strip()
+                if decoded:
+                    self._stderr_lines.append(decoded)
+                    # Keep only last 100 lines
+                    if len(self._stderr_lines) > 100:
+                        self._stderr_lines = self._stderr_lines[-100:]
+                    # Log errors prominently
+                    if "error" in decoded.lower() or "ERROR" in decoded:
+                        logger.error(f"[MCP stderr] {decoded}")
+                    else:
+                        logger.debug(f"[MCP stderr] {decoded}")
+        except Exception as e:
+            logger.warning(f"stderr reader stopped: {e}")
+
+    def _read_stdout_loop(self) -> None:
+        """Background thread to read stdout and queue lines."""
+        if not self._proc or not self._proc.stdout:
+            return
+        try:
+            while True:
+                line = self._proc.stdout.readline()
+                if not line:
+                    # EOF - signal end
+                    self._stdout_queue.put(None)
+                    break
+                decoded = line.decode()
+                self._stdout_queue.put(decoded)
+        except Exception as e:
+            logger.warning(f"stdout reader stopped: {e}")
+            self._stdout_queue.put(None)  # Signal error
+
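The class docstring's point about dedicated reader threads can be shown in isolation: one long-lived thread owns the blocking readline() and feeds a queue, so a timed-out get() means "nothing yet" rather than a lost line. A minimal standalone sketch (throwaway child process, not part of the package):

import queue
import subprocess
import threading

proc = subprocess.Popen(
    ["python3", "-c", "import time; time.sleep(2); print('late line', flush=True)"],
    stdout=subprocess.PIPE,
)
lines = queue.Queue()

def pump() -> None:
    # Single owner of the blocking readline(); EOF is signalled with None.
    for raw in iter(proc.stdout.readline, b""):
        lines.put(raw)
    lines.put(None)

threading.Thread(target=pump, daemon=True).start()

try:
    item = lines.get(timeout=0.5)   # times out: the line is not lost, just not here yet
except queue.Empty:
    item = lines.get(timeout=5.0)   # the same reader thread delivers it once the child prints
print(item)                         # b'late line\n'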
+    def _next_id(self) -> int:
+        self._request_id += 1
+        return self._request_id
+
+    def _write(self, data: str) -> None:
+        """Write to stdin with error handling."""
+        if not self._proc or not self._proc.stdin:
+            raise RuntimeError("MCP server not running")
+        if self._proc.poll() is not None:
+            raise RuntimeError(f"MCP server died (exit code {self._proc.returncode})")
+
+        self._proc.stdin.write(data.encode())
+        self._proc.stdin.flush()
+
+    def _read_line(self, timeout: float = 120.0) -> str:
+        """
+        Read a line from the stdout queue with timeout.
+
+        Uses a dedicated reader thread that queues lines, so we never
+        lose data on timeout - we just haven't received it yet.
+        """
+        if not self._proc:
+            raise RuntimeError("MCP server not running")
+
+        try:
+            line = self._stdout_queue.get(timeout=timeout)
+        except queue.Empty:
+            # Timeout - check process health
+            if self._proc.poll() is not None:
+                stderr_context = "\n".join(self._stderr_lines[-10:]) if self._stderr_lines else "(no stderr)"
+                raise RuntimeError(
+                    f"MCP server died (exit code {self._proc.returncode}).\n"
+                    f"Recent stderr:\n{stderr_context}"
+                )
+            # Process still alive, just slow - return empty to let caller decide
+            return ""
+
+        if line is None:
+            # EOF or error from reader thread
+            stderr_context = "\n".join(self._stderr_lines[-10:]) if self._stderr_lines else "(no stderr)"
+            if self._proc.poll() is not None:
+                raise RuntimeError(
+                    f"MCP server exited (code {self._proc.returncode}).\n"
+                    f"Recent stderr:\n{stderr_context}"
+                )
+            raise RuntimeError(f"MCP stdout closed unexpectedly.\nRecent stderr:\n{stderr_context}")
+
+        return line
+
+    def _check_alive(self) -> None:
+        """Check if the MCP server is still alive, raise if not."""
+        if not self._proc:
+            raise RuntimeError("MCP server not started")
+        if self._proc.poll() is not None:
+            stderr_context = "\n".join(self._stderr_lines[-10:]) if self._stderr_lines else "(no stderr)"
+            raise RuntimeError(
+                f"MCP server died (exit code {self._proc.returncode}).\n"
+                f"Recent stderr:\n{stderr_context}"
+            )
+
+    def initialize(self) -> dict:
+        """Initialize MCP connection."""
+        self._check_alive()
+
+        request = {
+            "jsonrpc": "2.0",
+            "id": self._next_id(),
+            "method": "initialize",
+            "params": {
+                "protocolVersion": "2024-11-05",
+                "capabilities": {},
+                "clientInfo": {"name": "zwarm", "version": "0.1.0"},
+            },
+        }
+        with self._lock:
+            self._write(json.dumps(request) + "\n")
+
+        response_line = self._read_line(timeout=30.0)
+        if not response_line:
+            raise RuntimeError("No response from MCP server during init")
+
+        response = json.loads(response_line)
+        if "error" in response:
+            raise RuntimeError(f"MCP init error: {response['error']}")
+
+        # Send initialized notification
+        notif = {"jsonrpc": "2.0", "method": "notifications/initialized"}
+        with self._lock:
+            self._write(json.dumps(notif) + "\n")
+
+        self._initialized = True
+        logger.info("MCP connection initialized")
+        return response
+
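On the wire, the handshake implemented above is newline-delimited JSON: each message is one json.dumps(...) plus "\n". Roughly, the two client-to-server lines look like this (the id is illustrative; the server's single-line reply is read back the same way and only checked for an "error" key):

import json

init_line = json.dumps({
    "jsonrpc": "2.0",
    "id": 1,  # illustrative id
    "method": "initialize",
    "params": {
        "protocolVersion": "2024-11-05",
        "capabilities": {},
        "clientInfo": {"name": "zwarm", "version": "0.1.0"},
    },
}) + "\n"

initialized_line = json.dumps({"jsonrpc": "2.0", "method": "notifications/initialized"}) + "\n"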
+    def call_tool(self, name: str, arguments: dict, timeout: float = 300.0) -> dict:
+        """
+        Call an MCP tool and collect streaming events.
+
+        Args:
+            name: Tool name (codex, codex-reply)
+            arguments: Tool arguments
+            timeout: Overall timeout for the call (default 5 min)
+        """
+        self._check_alive()
+
+        if not self._initialized:
+            self.initialize()
+
+        request_id = self._next_id()
+        request = {
+            "jsonrpc": "2.0",
+            "id": request_id,
+            "method": "tools/call",
+            "params": {"name": name, "arguments": arguments},
+        }
+
+        logger.debug(f"Calling MCP tool: {name} with args: {list(arguments.keys())}")
+        with self._lock:
+            self._write(json.dumps(request) + "\n")
+
+        # Collect streaming events until final result
+        # Reader thread queues lines, we pull from queue with timeout
+        session_id = None
+        conversation_id = None  # Track conversation ID separately
+        agent_messages: list[str] = []
+        streaming_text: list[str] = []  # Accumulate streaming delta text
+        final_result = None
+        token_usage: dict[str, Any] = {}  # Track token usage
+        start_time = time.time()
+        all_events: list[dict] = []  # Keep ALL events for debugging
+
+        for event_count in range(1000):  # Safety limit on events
+            self._check_alive()
+
+            # Check overall timeout
+            elapsed = time.time() - start_time
+            if elapsed > timeout:
+                raise RuntimeError(f"MCP call timed out after {timeout}s ({event_count} events received)")
+
+            # Read from queue with per-event timeout
+            # Empty string = timeout (process still alive, just waiting)
+            # None sentinel is handled inside _read_line (raises RuntimeError)
+            line = self._read_line(timeout=30.0)
+
+            if not line:
+                # Timeout waiting for event - process is still alive, just slow
+                # This is normal during long codex operations
+                logger.debug(f"Waiting for MCP event... (elapsed: {elapsed:.0f}s, events: {event_count})")
+                continue
+
+            try:
+                event = json.loads(line)
+                all_events.append(event)  # Keep for debugging
+            except json.JSONDecodeError as e:
+                logger.warning(f"Invalid JSON from MCP: {line[:100]}... - {e}")
+                continue
+
+            # Check for final result (has matching id)
+            if event.get("id") == request_id:
+                if "result" in event:
+                    final_result = event["result"]
+                    # Extract conversation ID from final result
+                    if isinstance(final_result, dict):
+                        conversation_id = final_result.get("conversationId") or final_result.get("conversation_id")
+                    logger.debug(f"Got final result after {event_count} events, conversation_id={conversation_id}")
+                    break
+                elif "error" in event:
+                    error = event["error"]
+                    raise RuntimeError(f"MCP tool error: {error.get('message', error)}")
+
+            # Process streaming events
+            if event.get("method") == "codex/event":
+                params = event.get("params", {})
+                msg = params.get("msg", {})
+                msg_type = msg.get("type")
+
+                # Log ALL event types to help debug missing messages
+                logger.debug(f"MCP event: type={msg_type}, keys={list(msg.keys())}")
+
+                if msg_type == "session_configured":
+                    session_id = msg.get("session_id")
+                    logger.debug(f"Session configured: {session_id}")
+
+                elif msg_type == "item_completed":
+                    item = msg.get("item", {})
+                    item_type = item.get("type")
+
+                    # Log ALL item_completed events to help debug
+                    logger.debug(f"item_completed: type={item_type}, keys={list(item.keys())}")
+
+                    # Agent text responses - codex uses "AgentMessage" type
+                    if item_type == "AgentMessage":
+                        content = item.get("content", [])
+                        for block in content:
+                            if isinstance(block, dict) and block.get("text"):
+                                agent_messages.append(block["text"])
+                            elif isinstance(block, str):
+                                agent_messages.append(block)
+
+                    # Also check for "agent_message" (lowercase) variant
+                    elif item_type == "agent_message":
+                        text = item.get("text", "") or item.get("message", "")
+                        if text:
+                            agent_messages.append(text)
+                        # Also check content array
+                        content = item.get("content", [])
+                        for block in content:
+                            if isinstance(block, dict) and block.get("text"):
+                                agent_messages.append(block["text"])
+                            elif isinstance(block, str):
+                                agent_messages.append(block)
+
+                    # Legacy format check
+                    elif item_type == "message" and item.get("role") == "assistant":
+                        content = item.get("content", [])
+                        for block in content:
+                            if isinstance(block, dict) and block.get("text"):
+                                agent_messages.append(block["text"])
+                            elif isinstance(block, str):
+                                agent_messages.append(block)
+
+                    # Generic message type - check for text/content
+                    elif item_type == "message":
+                        text = item.get("text", "")
+                        if text:
+                            agent_messages.append(text)
+                        content = item.get("content", [])
+                        if isinstance(content, str):
+                            agent_messages.append(content)
+                        elif isinstance(content, list):
+                            for block in content:
+                                if isinstance(block, dict) and block.get("text"):
+                                    agent_messages.append(block["text"])
+                                elif isinstance(block, str):
+                                    agent_messages.append(block)
+
+                    # Function call outputs (for context)
+                    elif item_type == "function_call_output":
+                        output = item.get("output", "")
+                        if output and len(output) < 1000:
+                            agent_messages.append(f"[Tool output]: {output[:500]}")
+
+                    # Log other item types we're not handling
+                    elif item_type not in ("function_call", "tool_call", "UserMessage", "user_message"):
+                        logger.debug(f"Unhandled item_completed type: {item_type}, item={item}")
+
+                elif msg_type == "agent_message":
+                    # Direct agent message event
+                    message = msg.get("message", "")
+                    if message:
+                        agent_messages.append(message)
+
+                elif msg_type in ("task_complete", "task_completed"):
+                    # Task is done - capture last_agent_message as fallback
+                    last_msg = msg.get("last_agent_message")
+                    if last_msg and last_msg not in agent_messages:
+                        agent_messages.append(last_msg)
+                    logger.debug(f"Task complete after {event_count} events")
+                    break
+
+                elif msg_type == "token_count":
+                    # Capture token usage for cost tracking
+                    info = msg.get("info") or {}
+                    if info:
+                        usage = info.get("total_token_usage", {})
+                        if usage:
+                            token_usage = {
+                                "input_tokens": usage.get("input_tokens", 0),
+                                "output_tokens": usage.get("output_tokens", 0),
+                                "cached_input_tokens": usage.get("cached_input_tokens", 0),
+                                "reasoning_tokens": usage.get("reasoning_output_tokens", 0),
+                                "total_tokens": usage.get("total_tokens", 0),
+                            }
+                            logger.debug(f"Token usage: {token_usage}")
+
+                elif msg_type == "error":
+                    error_msg = msg.get("error", msg.get("message", str(msg)))
+                    raise RuntimeError(f"Codex error: {error_msg}")
+
+                # Handle streaming text events (various formats)
+                elif msg_type in ("text_delta", "content_block_delta", "message_delta"):
+                    delta = msg.get("delta", {})
+                    text = delta.get("text", "") or msg.get("text", "")
+                    if text:
+                        streaming_text.append(text)
+
+                elif msg_type == "text":
+                    text = msg.get("text", "")
+                    if text:
+                        streaming_text.append(text)
+
+                elif msg_type == "response":
+                    # Some versions send the full response this way
+                    response_text = msg.get("response", "") or msg.get("text", "")
+                    if response_text:
+                        agent_messages.append(response_text)
+
+                elif msg_type == "message":
+                    # Direct message event
+                    text = msg.get("text", "") or msg.get("content", "")
+                    if text:
+                        agent_messages.append(text)
+
+                else:
+                    # Log unknown event types at debug level to help diagnose
+                    if msg_type and msg_type not in ("session_started", "thinking", "tool_call", "function_call"):
+                        logger.debug(f"Unhandled MCP event type: {msg_type}, msg keys: {list(msg.keys())}")
+
+        # Merge streaming text into messages if we got any
+        if streaming_text:
+            full_streaming = "".join(streaming_text)
+            if full_streaming.strip():
+                agent_messages.append(full_streaming)
+                logger.debug(f"Captured {len(streaming_text)} streaming chunks ({len(full_streaming)} chars)")
+
+        # Try to extract content from final_result if we have no messages
+        if final_result and not agent_messages:
+            if "content" in final_result:
+                content = final_result["content"]
+                if isinstance(content, list):
+                    for block in content:
+                        if isinstance(block, dict) and block.get("text"):
+                            agent_messages.append(block["text"])
+                        elif isinstance(block, str):
+                            agent_messages.append(block)
+                elif isinstance(content, str):
+                    agent_messages.append(content)
+            # Also check for text field
+            if not agent_messages and "text" in final_result:
+                agent_messages.append(final_result["text"])
+
+        # Build result - prefer conversation_id from final result, fallback to session_id from events
+        effective_conversation_id = conversation_id or session_id
+        result = {
+            "conversationId": effective_conversation_id,
+            "messages": agent_messages,
+            "output": "\n".join(agent_messages) if agent_messages else "",
+            "usage": token_usage,  # Token usage for cost tracking
+        }
+
+        # Log detailed debug info if we didn't capture any messages
+        if not agent_messages:
+            event_types = [e.get("method") or f"id:{e.get('id')}" for e in all_events[:20]]
+            logger.warning(
+                f"MCP call returned no messages. "
+                f"conversation_id={effective_conversation_id}, "
+                f"session_id={session_id}, "
+                f"event_count={len(all_events)}, "
+                f"event_types={event_types}, "
+                f"final_result_keys={list(final_result.keys()) if final_result else 'None'}"
+            )
+            # Log codex/event details for debugging
+            codex_events = [e for e in all_events if e.get("method") == "codex/event"]
+            if codex_events:
+                for ce in codex_events[-5:]:  # Last 5 codex events
+                    msg = ce.get("params", {}).get("msg", {})
+                    logger.debug(f" codex/event: type={msg.get('type')}, keys={list(msg.keys())}")
+
+        logger.debug(f"MCP call complete: {len(agent_messages)} messages, conversation_id={effective_conversation_id}")
+        return result
+
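To make the event handling above concrete, a sketch of the kinds of lines the loop consumes for one tools/call. Only the fields the parser actually reads are shown; ids and values are placeholders, and real streams include other event types that are logged and skipped:

illustrative_stream = [
    # Streamed notifications, interleaved on the server's stdout:
    {"method": "codex/event", "params": {"msg": {"type": "session_configured", "session_id": "sess-1"}}},
    {"method": "codex/event", "params": {"msg": {"type": "agent_message", "message": "Working on it..."}}},
    {"method": "codex/event", "params": {"msg": {
        "type": "token_count",
        "info": {"total_token_usage": {"input_tokens": 900, "output_tokens": 150, "total_tokens": 1050}},
    }}},
    {"method": "codex/event", "params": {"msg": {"type": "task_complete", "last_agent_message": "Done."}}},
    # Final response, matched to the request by id, ends the loop:
    {"id": 7, "result": {"conversationId": "conv-abc"}},
]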
+    def close(self) -> None:
+        """Close the MCP connection gracefully."""
+        if self._proc and self._proc.poll() is None:
+            logger.info("Terminating MCP server...")
+            self._proc.terminate()
+            try:
+                self._proc.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                logger.warning("MCP server didn't terminate, killing...")
+                self._proc.kill()
+                self._proc.wait()
+
+        self._proc = None
+        self._initialized = False
+
+    @property
+    def is_alive(self) -> bool:
+        """Check if the MCP server is running."""
+        return self._proc is not None and self._proc.poll() is None
+
+
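A minimal usage sketch of the client class above, assuming MCPClient is importable from the packaged module; the tool names and argument keys come from this module, while the task text and path are placeholders:

client = MCPClient()
client.start()  # spawns `codex mcp-server` and the reader threads

first = client.call_tool("codex", {
    "prompt": "Summarize the repo layout",  # placeholder task
    "cwd": "/tmp/repo",                     # placeholder working directory
    "sandbox": "workspace-write",
})

conv_id = first["conversationId"]
if conv_id:
    follow_up = client.call_tool("codex-reply", {
        "conversationId": conv_id,
        "prompt": "Now list the public entry points.",
    })
    print(follow_up["output"])

client.close()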
+@register_adapter("codex_mcp")
+class CodexMCPAdapter(ExecutorAdapter):
+    """
+    Codex adapter using MCP server for sync conversations.
+
+    This is the recommended way to have iterative conversations with Codex.
+    The MCP client uses subprocess.Popen (not asyncio) so it persists across
+    multiple asyncio.run() calls, preserving conversation state.
+    """
+    DEFAULT_MODEL = "gpt-5.1-codex-mini"  # Default codex model
+
+    def __init__(self, model: str | None = None):
+        self._model = model or self.DEFAULT_MODEL
+        self._mcp_client: MCPClient | None = None
+        self._sessions: dict[str, str] = {}  # session_id -> conversationId
+        # Cumulative token usage for cost tracking
+        self._total_usage: dict[str, int] = {
+            "input_tokens": 0,
+            "output_tokens": 0,
+            "cached_input_tokens": 0,
+            "reasoning_tokens": 0,
+            "total_tokens": 0,
+        }
+
+    def _accumulate_usage(self, usage: dict[str, Any]) -> None:
+        """Add usage to cumulative totals."""
+        if not usage:
+            return
+        for key in self._total_usage:
+            self._total_usage[key] += usage.get(key, 0)
+
+    @property
+    def total_usage(self) -> dict[str, int]:
+        """Get cumulative token usage across all calls."""
+        return self._total_usage.copy()
+
+    def _ensure_client(self) -> MCPClient:
+        """Ensure MCP client is running and return it."""
+        if self._mcp_client is None:
+            self._mcp_client = MCPClient()
+
+        if not self._mcp_client.is_alive:
+            self._mcp_client.start()
+
+        return self._mcp_client
+
+    @weave.op()
+    def _call_codex(
+        self,
+        task: str,
+        cwd: str,
+        sandbox: str,
+        model: str | None = None,
+    ) -> dict[str, Any]:
+        """
+        Call codex MCP tool - traced by Weave.
+
+        This is synchronous (uses subprocess.Popen, not asyncio) so the MCP
+        server persists across calls.
+        """
+        client = self._ensure_client()
+
+        args: dict[str, Any] = {
+            "prompt": task,
+            "cwd": cwd,
+            "sandbox": sandbox,
+        }
+        if model:
+            args["model"] = model
+
+        logger.info(f"Calling codex with task_len={len(task)}, cwd={cwd}, model={model or 'default'}")
+
+        result = client.call_tool("codex", args)
+
+        # Log the result structure
+        conversation_id = result.get("conversationId")
+        messages_count = len(result.get("messages", []))
+        output_len = len(result.get("output", ""))
+        usage = result.get("usage", {})
+
+        logger.info(
+            f"codex result: conversation_id={conversation_id}, "
+            f"messages_count={messages_count}, output_len={output_len}, "
+            f"usage={usage.get('total_tokens', 0)} tokens"
+        )
+
+        # Warn if we got a conversation ID but no messages (agent did work but we lost output)
+        if conversation_id and not messages_count and not output_len:
+            logger.warning(
+                f"codex returned conversation_id={conversation_id} but NO messages/output! "
+                f"The agent processed {usage.get('total_tokens', 0)} tokens but we didn't capture the response. "
+                f"This may indicate an issue with event parsing."
+            )
+
+        # Track usage
+        self._accumulate_usage(usage)
+
+        return {
+            "conversation_id": conversation_id,
+            "response": self._extract_response(result),
+            "raw_messages": result.get("messages", []),
+            "usage": usage,
+            "total_usage": self.total_usage,
+        }
+
+    @weave.op()
+    def _call_codex_reply(
+        self,
+        conversation_id: str,
+        message: str,
+    ) -> dict[str, Any]:
+        """
+        Call codex-reply MCP tool - traced by Weave.
+
+        This is synchronous (uses subprocess.Popen, not asyncio) so the MCP
+        server persists across calls.
+        """
+        client = self._ensure_client()
+
+        logger.info(f"Calling codex-reply with conversation_id={conversation_id}, message_len={len(message)}")
+        logger.debug(f"MCP client alive: {client.is_alive}, initialized: {client._initialized}")
+
+        result = client.call_tool("codex-reply", {
+            "conversationId": conversation_id,
+            "prompt": message,
+        })
+
+        # Log the full result structure for debugging
+        logger.info(
+            f"codex-reply result: conversationId={result.get('conversationId')}, "
+            f"messages_count={len(result.get('messages', []))}, "
+            f"output_len={len(result.get('output', ''))}, "
+            f"usage={result.get('usage', {}).get('total_tokens', 0)} tokens"
+        )
+
+        # Check for conversation loss - MCP returns empty result when session not found
+        if not result.get("messages") and not result.get("output"):
+            logger.error(
+                f"codex-reply returned empty result for conversation_id={conversation_id}. "
+                f"The MCP server may have lost the conversation state. Result: {result}"
+            )
+
+        # Track usage
+        usage = result.get("usage", {})
+        self._accumulate_usage(usage)
+
+        response = self._extract_response(result)
+        logger.debug(f"codex-reply response length: {len(response)} chars")
+
+        return {
+            "response": response,
+            "raw_messages": result.get("messages", []),
+            "usage": usage,
+            "total_usage": self.total_usage,
+            "conversation_lost": not result.get("messages") and not result.get("output"),
+        }
+
+    @weave.op()
+    async def start_session(
+        self,
+        task: str,
+        working_dir: Path,
+        mode: Literal["sync", "async"] = "sync",
+        model: str | None = None,
+        sandbox: str = "workspace-write",
+        **kwargs,
+    ) -> ConversationSession:
+        """Start a Codex session (sync or async mode)."""
+        effective_model = model or self._model
+        session = ConversationSession(
+            adapter=self.name,
+            mode=SessionMode(mode),
+            working_dir=working_dir,
+            task_description=task,
+            model=effective_model,
+        )
+
+        if mode == "sync":
+            # Use traced codex call (synchronous - MCP client persists across calls)
+            result = self._call_codex(
+                task=task,
+                cwd=str(working_dir.absolute()),
+                sandbox=sandbox,
+                model=effective_model,
+            )
+
+            # Extract conversation ID and response
+            session.conversation_id = result["conversation_id"]
+            if session.conversation_id:
+                self._sessions[session.id] = session.conversation_id
+                logger.debug(f"Session {session.id[:8]} mapped to conversation {session.conversation_id}")
+            else:
+                # This is bad - we won't be able to continue this conversation
+                logger.warning(
+                    f"Session {session.id[:8]} started but MCP didn't return a conversation ID. "
+                    "Further converse() calls will fail."
+                )
+
+            session.add_message("user", task)
+            session.add_message("assistant", result["response"])
+
+            # Track token usage on the session
+            session.add_usage(result.get("usage", {}))
+
+        else:
+            # Async mode: use codex exec (fire-and-forget)
+            # This runs in a subprocess without MCP, outputs JSONL events
+            cmd = [
+                "codex", "exec",
+                "--dangerously-bypass-approvals-and-sandbox",
+                "--skip-git-repo-check",
+                "--json",
+                "--model", effective_model,
+                "-C", str(working_dir.absolute()),  # Explicit working directory
+                "--", task,
+            ]
+
+            logger.info(f"Starting async codex: {' '.join(cmd[:8])}...")
+
+            proc = subprocess.Popen(
+                cmd,
+                cwd=working_dir,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+            )
+            session.process = proc
+            session.add_message("user", task)
+
+        return session
+
+    async def send_message(
+        self,
+        session: ConversationSession,
+        message: str,
+    ) -> str:
+        """Send a message to continue a sync conversation."""
+        if session.mode != SessionMode.SYNC:
+            raise ValueError("Cannot send message to async session")
+        if session.status != SessionStatus.ACTIVE:
+            raise ValueError(f"Session is not active: {session.status}")
+        if not session.conversation_id:
+            raise ValueError("Session has no conversation ID")
+
+        # Use traced codex-reply call (synchronous - MCP client persists across calls)
+        result = self._call_codex_reply(
+            conversation_id=session.conversation_id,
+            message=message,
+        )
+
+        response_text = result["response"]
+
+        # Check if conversation was lost
+        if result.get("conversation_lost"):
+            logger.warning(
+                f"Conversation {session.conversation_id} was lost. "
+                f"Session {session.id} will be marked as needing re-delegation."
+            )
+            # Mark the session as having a lost conversation so orchestrator can handle it
+            session.conversation_id = None  # Clear the stale ID
+
+        session.add_message("user", message)
+        session.add_message("assistant", response_text)
+
+        # Track token usage on the session
+        session.add_usage(result.get("usage", {}))
+
+        return response_text
+
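A sketch of driving the adapter across separate asyncio.run() calls, which is exactly the scenario the class docstring is designed for. The API calls are the ones defined above (assuming CodexMCPAdapter is importable from the packaged module); the model, task strings, and path are illustrative:

import asyncio
from pathlib import Path

adapter = CodexMCPAdapter(model="gpt-5.1-codex-mini")

# First asyncio.run(): starts the MCP server and the first turn.
session = asyncio.run(adapter.start_session(
    task="Audit error handling in the adapters package",  # placeholder task
    working_dir=Path("/tmp/repo"),                         # placeholder path
    mode="sync",
))

# Second asyncio.run(): the MCP server (plain subprocess.Popen) is still alive,
# so the conversationId recorded on the session remains valid.
reply = asyncio.run(adapter.send_message(session, "Focus on codex_mcp.py only."))
print(reply)

asyncio.run(adapter.cleanup())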
+    @weave.op()
+    def _parse_jsonl_output(self, stdout: str) -> dict[str, Any]:
+        """
+        Parse JSONL output from codex exec --json.
+
+        Returns dict with:
+        - response: The agent's message text
+        - usage: Token usage stats
+        - thread_id: The conversation thread ID
+        - events: All parsed events (for debugging)
+        """
+        response_parts = []
+        usage = {}
+        thread_id = None
+        events = []
+
+        for line in stdout.strip().split("\n"):
+            if not line.strip():
+                continue
+            try:
+                event = json.loads(line)
+                events.append(event)
+
+                event_type = event.get("type", "")
+
+                if event_type == "thread.started":
+                    thread_id = event.get("thread_id")
+
+                elif event_type == "item.completed":
+                    item = event.get("item", {})
+                    if item.get("type") == "agent_message":
+                        response_parts.append(item.get("text", ""))
+
+                elif event_type == "turn.completed":
+                    usage = event.get("usage", {})
+
+            except json.JSONDecodeError:
+                logger.warning(f"Failed to parse JSONL line: {line[:100]}")
+                continue
+
+        return {
+            "response": "\n".join(response_parts),
+            "usage": usage,
+            "thread_id": thread_id,
+            "events": events,
+        }
+
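The parser above only acts on three event types. A few illustrative codex exec --json lines (placeholder values; real output carries additional event types, which are simply recorded and ignored) and the shape they reduce to, assuming an adapter instance named adapter:

sample_stdout = "\n".join([
    '{"type": "thread.started", "thread_id": "thread-123"}',
    '{"type": "item.completed", "item": {"type": "agent_message", "text": "Done: added the tests."}}',
    '{"type": "turn.completed", "usage": {"input_tokens": 1200, "output_tokens": 340}}',
])

parsed = adapter._parse_jsonl_output(sample_stdout)
# parsed["response"]  -> "Done: added the tests."
# parsed["thread_id"] -> "thread-123"
# parsed["usage"]     -> {"input_tokens": 1200, "output_tokens": 340}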
+    @weave.op()
+    async def check_status(
+        self,
+        session: ConversationSession,
+    ) -> dict:
+        """Check status of an async session."""
+        if session.mode != SessionMode.ASYNC:
+            return {"status": session.status.value}
+
+        if session.process is None:
+            return {"status": "unknown", "error": "No process handle"}
+
+        # Check if process is still running
+        poll = session.process.poll()
+        if poll is None:
+            return {"status": "running"}
+
+        # Process finished - parse the JSONL output
+        stdout, stderr = session.process.communicate()
+
+        if poll == 0:
+            # Parse JSONL to extract actual response
+            parsed = self._parse_jsonl_output(stdout)
+            response_text = parsed["response"] or "(no response captured)"
+
+            # Add the response as a message
+            session.add_message("assistant", response_text)
+
+            # Track token usage
+            if parsed["usage"]:
+                session.add_usage({
+                    "input_tokens": parsed["usage"].get("input_tokens", 0),
+                    "output_tokens": parsed["usage"].get("output_tokens", 0),
+                    "total_tokens": (
+                        parsed["usage"].get("input_tokens", 0) +
+                        parsed["usage"].get("output_tokens", 0)
+                    ),
+                })
+
+            session.complete(response_text[:500])
+            return {
+                "status": "completed",
+                "response": response_text,
+                "usage": parsed["usage"],
+                "thread_id": parsed["thread_id"],
+            }
+        else:
+            # Try to parse stderr or stdout for error info
+            error_msg = stderr.strip() if stderr else f"Exit code: {poll}"
+
+            # Sometimes errors come through stdout as JSONL too
+            if stdout and not stderr:
+                try:
+                    parsed = self._parse_jsonl_output(stdout)
+                    if not parsed["response"]:
+                        error_msg = f"Process failed with no response. Exit code: {poll}"
+                except Exception:
+                    error_msg = stdout[:500] if stdout else f"Exit code: {poll}"
+
+            session.fail(error_msg[:500])
+            return {"status": "failed", "error": error_msg, "exit_code": poll}
+
+    async def stop(
+        self,
+        session: ConversationSession,
+    ) -> None:
+        """Stop a session."""
+        import subprocess
+
+        if session.process and session.process.poll() is None:
+            session.process.terminate()
+            try:
+                session.process.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                session.process.kill()
+
+        session.fail("Stopped by user")
+
+        # Remove from tracking
+        if session.id in self._sessions:
+            del self._sessions[session.id]
+
+    async def cleanup(self) -> None:
+        """Clean up MCP server."""
+        if self._mcp_client:
+            self._mcp_client.close()
+            self._mcp_client = None
+
+    def _extract_response(self, result: dict) -> str:
+        """Extract response text from MCP result."""
+        # Check for error indicators - empty result suggests lost conversation
+        if (
+            result.get("conversationId") is None
+            and not result.get("messages")
+            and not result.get("output")
+        ):
+            logger.warning(f"MCP returned empty result - conversation may be lost: {result}")
+            return "[ERROR] Conversation lost - the MCP server no longer has this session. Please re-delegate the task."
+
+        # First check for our collected output
+        if result.get("output"):
+            return result["output"]
+
+        # Check for messages list
+        if result.get("messages"):
+            return "\n".join(result["messages"])
+
+        # Result may have different structures depending on codex version
+        if "content" in result:
+            content = result["content"]
+            if isinstance(content, list):
+                texts = []
+                for block in content:
+                    if isinstance(block, dict) and "text" in block:
+                        texts.append(block["text"])
+                    elif isinstance(block, str):
+                        texts.append(block)
+                if texts:
+                    return "\n".join(texts)
+            elif isinstance(content, str):
+                return content
+
+        if "text" in result:
+            return result["text"]
+
+        # Fallback: stringify the result (but log it as unexpected)
+        logger.warning(f"Unexpected MCP result format, returning raw: {list(result.keys())}")
+        return json.dumps(result, indent=2)
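For reference, the precedence _extract_response applies, shown on illustrative inputs (an adapter instance is assumed):

adapter = CodexMCPAdapter()

# 1. The collected "output" string wins.
adapter._extract_response({"conversationId": "c1", "messages": ["hi"], "output": "joined text"})
# -> "joined text"

# 2. Otherwise the raw messages list is joined.
adapter._extract_response({"conversationId": "c1", "messages": ["first", "second"], "output": ""})
# -> "first\nsecond"

# 3. No conversationId, no messages, no output: treated as a lost conversation.
adapter._extract_response({"conversationId": None, "messages": [], "output": ""})
# -> "[ERROR] Conversation lost - ..."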