zwarm 3.2.0__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1262 +0,0 @@
1
- """
2
- Codex MCP adapter for sync conversations.
3
-
4
- Uses codex mcp-server for true iterative conversations:
5
- - codex() to start a session with conversationId
6
- - codex-reply() to continue the conversation
7
- """
8
-
9
- from __future__ import annotations
10
-
11
- import hashlib
12
- import json
13
- import logging
14
- import queue
15
- import subprocess
16
- import threading
17
- import time
18
- from dataclasses import dataclass, field
19
- from pathlib import Path
20
- from typing import Any, Literal
21
-
22
- import weave
23
-
24
- from zwarm.adapters.base import ExecutorAdapter
25
- from zwarm.adapters.registry import register_adapter
26
- from zwarm.core.models import (
27
- ConversationSession,
28
- SessionMode,
29
- SessionStatus,
30
- )
31
-
32
- logger = logging.getLogger(__name__)
33
-
34
-
35
- # =============================================================================
36
- # MessageCollector: Robust event collection with deduplication
37
- # =============================================================================
38
-
39
-
40
@dataclass
class MessageSegment:
    """
    One segment of an assistant turn (kept for future segment-aware rendering).

    Fields:
        id: Identifier of the segment.
        kind: Category of the segment's content.
        text: The segment's text.
        status: "open" by default; "closed" is the other allowed value.
        source_event_ids: IDs of the events associated with this segment.
    """

    id: str
    kind: Literal[
        "assistant_text",
        "progress",
        "tool_call",
        "tool_result",
        "error",
    ]
    text: str
    status: Literal["open", "closed"] = "open"
    # default_factory gives every instance its own set instead of a shared one.
    source_event_ids: set[str] = field(default_factory=set)
48
-
49
-
50
class MessageCollector:
    """
    Collects and deduplicates assistant messages from an MCP event stream.

    Events are fed in one at a time through ``process_event``; the final
    transcript is read back with ``get_messages`` / ``get_response``.

    Message sources, highest priority first:
    - "definitive" messages (item_completed, agent_message, response,
      message, output and completion-variant events)
    - the fallback message (task_complete.last_agent_message), kept only
      when no definitive message has been collected yet
    - concatenated streaming deltas, used only when neither of the above
      produced anything

    Deduplication happens twice: by event ID (when an event carries one) and
    by a hash of the whitespace-normalized message text.

    Malformed payloads (non-dict ``params``/``msg``/``item``, non-string text)
    are skipped instead of raising, so one odd event cannot abort a call.
    """

    def __init__(self):
        # Deduplication
        self._seen_event_ids: set[str] = set()
        self._seen_content_hashes: set[str] = set()  # Content-based dedup

        # Message collection (priority-ordered)
        self._definitive_messages: list[str] = []  # Highest priority
        self._fallback_message: str | None = None  # From task_complete
        self._streaming_buffer: list[str] = []  # Streaming deltas

        # Metadata
        self._conversation_id: str | None = None
        self._session_id: str | None = None
        self._token_usage: dict[str, Any] = {}
        self._is_complete: bool = False

        # Debug tracking: (source, first 50 chars of the text)
        self._message_sources: list[tuple[str, str]] = []

    @staticmethod
    def _as_dict(value: Any) -> dict:
        """Return *value* when it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    def _extract_event_id(self, event: dict) -> str | None:
        """
        Return the event's ID as a string, or None when it carries none.

        Top-level keys are checked first, then the nested ``params.msg`` dict.
        """
        for key in ("id", "event_id", "item_id", "message_id"):
            if key in event:
                return str(event[key])

        # Nested events keep their identifiers under params.msg.
        msg = self._as_dict(self._as_dict(event.get("params")).get("msg"))
        for key in ("id", "event_id", "item_id"):
            if key in msg:
                return str(msg[key])

        return None

    def _content_hash(self, text: str) -> str:
        """Return a short hash of *text* with runs of whitespace collapsed."""
        normalized = " ".join(text.split())
        # md5 is used purely as a cheap dedup fingerprint, not for security.
        return hashlib.md5(normalized.encode()).hexdigest()[:16]

    def _is_duplicate_content(self, text: str) -> bool:
        """
        Report whether *text* was already collected.

        Empty/blank text counts as a duplicate so callers skip it. As a side
        effect, the hash of previously unseen text is recorded.
        """
        if not text or not text.strip():
            return True  # Empty is "duplicate" (skip it)

        content_hash = self._content_hash(text)
        if content_hash in self._seen_content_hashes:
            return True

        self._seen_content_hashes.add(content_hash)
        return False

    def _add_definitive_message(self, text: str, source: str) -> None:
        """Record a definitive message unless it is blank, non-text or a duplicate."""
        if not isinstance(text, str) or not text.strip():
            return

        if self._is_duplicate_content(text):
            logger.debug(f"Skipping duplicate message from {source}: {text[:50]}...")
            return

        self._definitive_messages.append(text)
        self._message_sources.append((source, text[:50]))
        logger.debug(f"Added definitive message from {source}: {text[:50]}...")

    def _set_fallback_message(self, text: str, source: str) -> None:
        """Record the fallback message; ignored once definitive messages exist."""
        if not isinstance(text, str) or not text.strip():
            return

        # Only set if we don't have definitive messages
        if self._definitive_messages:
            logger.debug(f"Ignoring fallback from {source}: have definitive messages")
            return

        if self._is_duplicate_content(text):
            logger.debug(f"Skipping duplicate fallback from {source}")
            return

        self._fallback_message = text
        self._message_sources.append((source, text[:50]))

    def _add_streaming_delta(self, text: str) -> None:
        """Buffer a streaming delta; consulted only when nothing better exists."""
        if text and isinstance(text, str):
            self._streaming_buffer.append(text)

    def _add_content_blocks(self, content: Any, block_source: str, str_source: str) -> None:
        """
        Add every text block found in an item's ``content`` field.

        *content* may be a list of blocks (dicts with a "text" key, or bare
        strings) or a single string. A single string is added as one message
        rather than being iterated character by character. Anything else is
        ignored.
        """
        if isinstance(content, str):
            self._add_definitive_message(content, str_source)
            return
        if not isinstance(content, (list, tuple)):
            return
        for block in content:
            if isinstance(block, dict) and block.get("text"):
                self._add_definitive_message(block["text"], block_source)
            elif isinstance(block, str):
                self._add_definitive_message(block, str_source)

    def process_event(self, event: dict) -> bool:
        """
        Process a single MCP event.

        Returns True if processing should continue, False once a
        task_complete / task_completed event has been seen.

        Raises:
            RuntimeError: if the event is a codex "error" event.
        """
        # 1. Dedupe by event ID when the event carries one.
        event_id = self._extract_event_id(event)
        if event_id:
            if event_id in self._seen_event_ids:
                logger.debug(f"Skipping duplicate event: {event_id}")
                return True
            self._seen_event_ids.add(event_id)

        # 2. Only codex/event notifications carry message content.
        if event.get("method") == "codex/event":
            msg = self._as_dict(self._as_dict(event.get("params")).get("msg"))
            msg_type = msg.get("type")

            self._handle_codex_event(msg, msg_type)

            # Completion events end the collection.
            if msg_type in ("task_complete", "task_completed"):
                self._is_complete = True
                return False

        return True

    def _handle_codex_event(self, msg: dict, msg_type: str | None) -> None:
        """
        Dispatch one codex/event payload by its ``type``.

        Raises:
            RuntimeError: for payloads of type "error".
        """
        if not msg_type:
            return

        # Session configuration
        if msg_type == "session_configured":
            self._session_id = msg.get("session_id")
            logger.debug(f"Session configured: {self._session_id}")

        # Item completed - DEFINITIVE SOURCE
        elif msg_type == "item_completed":
            self._handle_item_completed(msg)

        # Direct agent message - DEFINITIVE SOURCE
        elif msg_type == "agent_message":
            text = msg.get("message", "") or msg.get("text", "") or msg.get("content", "")
            self._add_definitive_message(text, "agent_message_event")

        # Task complete - FALLBACK SOURCE
        elif msg_type in ("task_complete", "task_completed"):
            last_msg = msg.get("last_agent_message")
            if last_msg:
                self._set_fallback_message(last_msg, "task_complete")

        # Token usage
        elif msg_type == "token_count":
            usage = self._as_dict(self._as_dict(msg.get("info")).get("total_token_usage"))
            if usage:
                self._token_usage = {
                    "input_tokens": usage.get("input_tokens", 0),
                    "output_tokens": usage.get("output_tokens", 0),
                    "cached_input_tokens": usage.get("cached_input_tokens", 0),
                    "reasoning_tokens": usage.get("reasoning_output_tokens", 0),
                    "total_tokens": usage.get("total_tokens", 0),
                }

        # Streaming deltas - LOWEST PRIORITY
        elif msg_type in ("text_delta", "content_block_delta", "message_delta", "text"):
            delta = msg.get("delta", {})
            if isinstance(delta, str):
                # The delta may be the text itself rather than {"text": ...}.
                text = delta or msg.get("text", "")
            elif isinstance(delta, dict):
                text = delta.get("text", "") or msg.get("text", "")
            else:
                text = msg.get("text", "")
            self._add_streaming_delta(text)

        # Response event - MEDIUM PRIORITY (treat as definitive)
        elif msg_type == "response":
            text = msg.get("response", "") or msg.get("text", "")
            self._add_definitive_message(text, "response_event")

        # Message event - accepted for assistant/agent or unspecified role
        elif msg_type == "message":
            role = msg.get("role") or ""
            role = role.lower() if isinstance(role, str) else None
            if role in ("assistant", "agent", ""):
                text = msg.get("text", "") or msg.get("content", "")
                if text:
                    self._add_definitive_message(text, "message_event")

        # Output event
        elif msg_type == "output":
            text = msg.get("output", "") or msg.get("text", "") or msg.get("content", "")
            self._add_definitive_message(text, "output_event")

        # Completion variants
        elif msg_type in ("item.completed", "response.completed"):
            item = self._as_dict(msg.get("item"))
            if item.get("type") == "agent_message":
                text = item.get("text", "")
                self._add_definitive_message(text, f"{msg_type}_event")
            elif "text" in msg:
                self._add_definitive_message(msg["text"], f"{msg_type}_direct")

        # Error
        elif msg_type == "error":
            error_msg = msg.get("error", msg.get("message", str(msg)))
            raise RuntimeError(f"Codex error: {error_msg}")

    def _handle_item_completed(self, msg: dict) -> None:
        """Handle an item_completed event - the primary source of messages."""
        item = self._as_dict(msg.get("item"))
        item_type = item.get("type")

        # AgentMessage - primary format
        if item_type == "AgentMessage":
            self._add_content_blocks(item.get("content"), "AgentMessage", "AgentMessage_str")

        # agent_message - variant spelling
        elif item_type == "agent_message":
            text = item.get("text", "") or item.get("message", "")
            if text:
                self._add_definitive_message(text, "agent_message")
            self._add_content_blocks(
                item.get("content"), "agent_message_content", "agent_message_content_str"
            )

        # Generic message with assistant role
        elif item_type == "message":
            if item.get("role", "") == "assistant":
                self._add_content_blocks(
                    item.get("content"), "message_assistant", "message_assistant_str"
                )
                # Also check text field directly
                text = item.get("text", "")
                if text:
                    self._add_definitive_message(text, "message_text")

        # Function call output: logged for context only, never collected
        elif item_type == "function_call_output":
            output = item.get("output", "")
            if isinstance(output, str) and output and len(output) < 1000:
                logger.debug(f"Tool output: {output[:100]}...")

    def set_conversation_id(self, conv_id: str | None) -> None:
        """Set the conversation ID from the final result; falsy values are ignored."""
        if conv_id:
            self._conversation_id = conv_id

    @property
    def conversation_id(self) -> str | None:
        """The explicit conversation ID, falling back to the configured session ID."""
        return self._conversation_id or self._session_id

    @property
    def token_usage(self) -> dict[str, Any]:
        """Token usage from the most recent token_count event ({} if none)."""
        return self._token_usage

    @property
    def is_complete(self) -> bool:
        """True once a completion event has been processed."""
        return self._is_complete

    def get_messages(self) -> list[str]:
        """
        Get the final deduplicated message list.

        Priority:
        1. Definitive messages
        2. Fallback message (from task_complete)
        3. Streaming buffer (only if no definitive or fallback)

        The returned list is always a fresh list, so callers may mutate it
        without affecting the collector.
        """
        # Prefer definitive messages
        if self._definitive_messages:
            logger.debug(f"Returning {len(self._definitive_messages)} definitive messages")
            return list(self._definitive_messages)

        # Fall back to task_complete message
        if self._fallback_message:
            logger.debug("Returning fallback message from task_complete")
            return [self._fallback_message]

        # Last resort: streaming buffer
        if self._streaming_buffer:
            full_text = "".join(self._streaming_buffer)
            if full_text.strip():
                logger.debug(f"Returning streaming buffer ({len(self._streaming_buffer)} chunks)")
                return [full_text]

        return []

    def get_response(self) -> str:
        """Get the final response as a single newline-joined string."""
        return "\n".join(self.get_messages())

    def get_debug_info(self) -> dict:
        """Get counters and message sources describing what was collected."""
        return {
            "seen_event_ids": len(self._seen_event_ids),
            "seen_content_hashes": len(self._seen_content_hashes),
            "definitive_messages": len(self._definitive_messages),
            "has_fallback": self._fallback_message is not None,
            "streaming_chunks": len(self._streaming_buffer),
            "message_sources": list(self._message_sources),
        }
371
-
372
-
373
class MCPClient:
    """
    Robust MCP client for communicating with codex mcp-server.

    Uses subprocess.Popen (NOT asyncio.subprocess) to avoid being tied to
    any specific event loop. This allows the MCP server to stay alive across
    multiple asyncio.run() calls, preserving conversation state.

    Two daemon threads drain the server's stdout (into a queue) and stderr
    (into a bounded list of recent lines), so a read timeout never loses
    data - it only means the data has not arrived yet. Each reader binds to
    the process and queue that existed when it started, so a thread left over
    from a previous server can never read from, or feed, a restarted one.
    """

    # Default config overrides for zwarm-managed codex sessions.
    # Only used as fallback if no config_path is provided.
    DEFAULT_CONFIG_OVERRIDES: dict[str, str] = {
        "model_reasoning_effort": "high",  # Use 'high' for compatibility with all models
    }

    # Safety limit on loop iterations (events, bad lines and idle polls) per call.
    MAX_EVENTS_PER_CALL: int = 1000
    # Longest single wait (seconds) for the next stdout line inside call_tool.
    EVENT_POLL_TIMEOUT: float = 30.0
    # Number of recent stderr lines retained for error messages.
    MAX_STDERR_LINES: int = 100

    def __init__(
        self,
        config_path: Path | None = None,
        config_overrides: dict[str, str] | None = None,
    ):
        """
        Args:
            config_path: Config file passed via ``--config`` when it exists.
            config_overrides: Extra ``-c key="value"`` overrides, merged over
                DEFAULT_CONFIG_OVERRIDES; used only when no config file is used.
        """
        self._proc: subprocess.Popen | None = None
        self._proc_pid: int | None = None  # Track PID to detect restarts
        self._request_id = 0
        self._initialized = False
        self._stderr_thread: threading.Thread | None = None
        self._stdout_thread: threading.Thread | None = None
        self._stderr_lines: list[str] = []
        self._stdout_queue: queue.Queue[str | None] = queue.Queue()
        self._lock = threading.Lock()  # Protect writes only
        self._start_count = 0  # Track how many times we've started
        # Config path for full isolation (preferred)
        self._config_path = config_path
        # Fallback: merge default overrides with any custom ones (used if no config_path)
        self._config_overrides = {**self.DEFAULT_CONFIG_OVERRIDES, **(config_overrides or {})}

    def start(self) -> None:
        """Start the MCP server process and its reader threads (no-op if running)."""
        with self._lock:
            if self._proc is not None and self._proc.poll() is None:
                logger.debug(f"MCP server already running (pid={self._proc.pid}, start_count={self._start_count})")
                return  # Already running

            # Check if this is a restart (previous server died)
            if self._proc_pid is not None:
                logger.warning(
                    f"MCP server restart detected! Previous pid={self._proc_pid}, "
                    f"start_count={self._start_count}. All conversation state will be lost."
                )

            self._start_count += 1

            # Build command - prefer config file for full isolation, fallback to overrides
            cmd = ["codex", "mcp-server"]
            if self._config_path and self._config_path.exists():
                cmd.extend(["--config", str(self._config_path)])
                logger.info(f"Starting codex mcp-server with config: {self._config_path} (start_count={self._start_count})")
            else:
                # Fallback to individual overrides
                for key, value in self._config_overrides.items():
                    cmd.extend(["-c", f'{key}="{value}"'])
                logger.info(f"Starting codex mcp-server with overrides: {self._config_overrides} (start_count={self._start_count})")

            self._proc = subprocess.Popen(
                cmd,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=False,  # Binary mode for explicit encoding control
            )
            self._proc_pid = self._proc.pid
            self._initialized = False
            self._stderr_lines = []
            self._stdout_queue = queue.Queue()  # Fresh queue

            # Start background thread to read stderr
            self._stderr_thread = threading.Thread(
                target=self._read_stderr_loop,
                daemon=True,
                name="mcp-stderr-reader",
            )
            self._stderr_thread.start()

            # Start background thread to read stdout into queue
            self._stdout_thread = threading.Thread(
                target=self._read_stdout_loop,
                daemon=True,
                name="mcp-stdout-reader",
            )
            self._stdout_thread.start()

            logger.info(f"MCP server started (pid={self._proc.pid})")

    def _read_stderr_loop(self) -> None:
        """Background thread body: keep recent stderr lines and log them."""
        proc = self._proc
        if not proc or not proc.stderr:
            return
        # Bind to this process's stream and line list once: close()/start()
        # may replace the attributes on self while this thread is running.
        stream = proc.stderr
        lines = self._stderr_lines
        try:
            for raw in iter(stream.readline, b""):
                # errors="replace": one bad byte must not stop stderr draining.
                decoded = raw.decode("utf-8", errors="replace").strip()
                if not decoded:
                    continue
                lines.append(decoded)
                # Keep only the most recent lines (trimmed in place).
                if len(lines) > self.MAX_STDERR_LINES:
                    del lines[: -self.MAX_STDERR_LINES]
                # Log errors prominently
                if "error" in decoded.lower():
                    logger.error(f"[MCP stderr] {decoded}")
                else:
                    logger.debug(f"[MCP stderr] {decoded}")
        except Exception as e:
            logger.warning(f"stderr reader stopped: {e}")

    def _read_stdout_loop(self) -> None:
        """Background thread body: queue stdout lines, then None at EOF/error."""
        proc = self._proc
        if not proc or not proc.stdout:
            return
        # Same binding rationale as in _read_stderr_loop.
        stream = proc.stdout
        out_queue = self._stdout_queue
        try:
            for raw in iter(stream.readline, b""):
                # errors="replace": a malformed byte yields a line that fails
                # JSON parsing (and is skipped) instead of killing the reader.
                out_queue.put(raw.decode("utf-8", errors="replace"))
            out_queue.put(None)  # EOF - signal end
        except Exception as e:
            logger.warning(f"stdout reader stopped: {e}")
            out_queue.put(None)  # Signal error

    def _next_id(self) -> int:
        """Return the next JSON-RPC request id (1, 2, 3, ...)."""
        self._request_id += 1
        return self._request_id

    def _stderr_tail(self) -> str:
        """Return the last 10 stderr lines joined by newlines, or a placeholder."""
        return "\n".join(self._stderr_lines[-10:]) if self._stderr_lines else "(no stderr)"

    def _write(self, data: str) -> None:
        """
        Write *data* to the server's stdin and flush.

        Raises:
            RuntimeError: if the server was never started or has exited.
        """
        if not self._proc or not self._proc.stdin:
            raise RuntimeError("MCP server not running")
        if self._proc.poll() is not None:
            raise RuntimeError(f"MCP server died (exit code {self._proc.returncode})")

        self._proc.stdin.write(data.encode())
        self._proc.stdin.flush()

    def _read_line(self, timeout: float = 120.0) -> str:
        """
        Read a line from the stdout queue with timeout.

        Returns the line, or "" when nothing arrived within *timeout* seconds
        and the server is still alive (the caller decides whether to retry).

        Raises:
            RuntimeError: if the server is not running, died, or closed stdout.
        """
        if not self._proc:
            raise RuntimeError("MCP server not running")

        try:
            line = self._stdout_queue.get(timeout=timeout)
        except queue.Empty:
            # Timeout - check process health
            if self._proc.poll() is not None:
                raise RuntimeError(
                    f"MCP server died (exit code {self._proc.returncode}).\n"
                    f"Recent stderr:\n{self._stderr_tail()}"
                )
            # Process still alive, just slow - return empty to let caller decide
            return ""

        if line is None:
            # EOF or error from reader thread
            stderr_context = self._stderr_tail()
            if self._proc.poll() is not None:
                raise RuntimeError(
                    f"MCP server exited (code {self._proc.returncode}).\n"
                    f"Recent stderr:\n{stderr_context}"
                )
            raise RuntimeError(f"MCP stdout closed unexpectedly.\nRecent stderr:\n{stderr_context}")

        return line

    def _check_alive(self) -> None:
        """Raise RuntimeError if the server was never started or has exited."""
        if not self._proc:
            raise RuntimeError("MCP server not started")
        if self._proc.poll() is not None:
            raise RuntimeError(
                f"MCP server died (exit code {self._proc.returncode}).\n"
                f"Recent stderr:\n{self._stderr_tail()}"
            )

    def initialize(self) -> dict:
        """
        Perform the MCP initialize handshake and return the server's response.

        Raises:
            RuntimeError: if the server is dead, does not answer within 30
                seconds, or answers with an error.
        """
        self._check_alive()

        request = {
            "jsonrpc": "2.0",
            "id": self._next_id(),
            "method": "initialize",
            "params": {
                "protocolVersion": "2024-11-05",
                "capabilities": {},
                "clientInfo": {"name": "zwarm", "version": "0.1.0"},
            },
        }
        with self._lock:
            self._write(json.dumps(request) + "\n")

        response_line = self._read_line(timeout=30.0)
        if not response_line:
            raise RuntimeError("No response from MCP server during init")

        response = json.loads(response_line)
        if "error" in response:
            raise RuntimeError(f"MCP init error: {response['error']}")

        # Send initialized notification
        notif = {"jsonrpc": "2.0", "method": "notifications/initialized"}
        with self._lock:
            self._write(json.dumps(notif) + "\n")

        self._initialized = True
        logger.info("MCP connection initialized")
        return response

    @staticmethod
    def _messages_from_result(final_result: dict) -> list[str]:
        """Extract message texts from a final result's "content" / "text" keys."""
        found: list[str] = []
        content = final_result.get("content") if "content" in final_result else None
        if isinstance(content, list):
            for block in content:
                if isinstance(block, dict) and block.get("text"):
                    found.append(block["text"])
                elif isinstance(block, str):
                    found.append(block)
        elif isinstance(content, str):
            found.append(content)
        # "text" is consulted only when "content" yielded nothing.
        if not found and "text" in final_result:
            found.append(final_result["text"])
        return found

    def call_tool(self, name: str, arguments: dict, timeout: float = 300.0) -> dict:
        """
        Call an MCP tool and collect streaming events.

        Events are run through a MessageCollector until either the JSON-RPC
        response with the matching id arrives or the collector reports
        completion.

        Args:
            name: Tool name (codex, codex-reply)
            arguments: Tool arguments
            timeout: Overall timeout for the call in seconds (default 5 min).
                A single wait for the next event never extends past it.

        Returns:
            Dict with "conversationId", "messages" (list of str), "output"
            (messages joined by newlines) and "usage".

        Raises:
            RuntimeError: if the server dies, the call times out, the tool
                reports an error, or a codex error event is received.
        """
        self._check_alive()

        if not self._initialized:
            self.initialize()

        request_id = self._next_id()
        request = {
            "jsonrpc": "2.0",
            "id": request_id,
            "method": "tools/call",
            "params": {"name": name, "arguments": arguments},
        }

        logger.debug(f"Calling MCP tool: {name} with args: {list(arguments.keys())}")
        with self._lock:
            self._write(json.dumps(request) + "\n")

        collector = MessageCollector()
        final_result = None
        # Monotonic clock: wall-clock adjustments must not distort the timeout.
        start_time = time.monotonic()
        all_events: list[dict] = []  # Keep ALL events for debugging

        for event_count in range(self.MAX_EVENTS_PER_CALL):
            self._check_alive()

            # Check overall timeout
            elapsed = time.monotonic() - start_time
            if elapsed > timeout:
                raise RuntimeError(f"MCP call timed out after {timeout}s ({event_count} events received)")

            # Per-event wait, capped so it cannot overshoot the overall timeout.
            line = self._read_line(timeout=min(self.EVENT_POLL_TIMEOUT, timeout - elapsed))

            if not line:
                # Timeout waiting for event - process is still alive, just slow
                logger.debug(f"Waiting for MCP event... (elapsed: {elapsed:.0f}s, events: {event_count})")
                continue

            try:
                event = json.loads(line)
            except json.JSONDecodeError as e:
                logger.warning(f"Invalid JSON from MCP: {line[:100]}... - {e}")
                continue
            if not isinstance(event, dict):
                # Valid JSON but not a JSON-RPC object; nothing to process.
                logger.warning(f"Invalid JSON from MCP: {line[:100]}... - not an object")
                continue
            all_events.append(event)  # Keep for debugging

            # Check for final result (has matching id)
            if event.get("id") == request_id:
                if "result" in event:
                    final_result = event["result"]
                    # Extract conversation ID from final result
                    if isinstance(final_result, dict):
                        conv_id = final_result.get("conversationId") or final_result.get("conversation_id")
                        collector.set_conversation_id(conv_id)
                    logger.debug(f"Got final result after {event_count} events")
                    break
                elif "error" in event:
                    error = event["error"]
                    detail = error.get("message", error) if isinstance(error, dict) else error
                    raise RuntimeError(f"MCP tool error: {detail}")

            # Collector raises RuntimeError for codex errors; let it propagate.
            if not collector.process_event(event):
                logger.debug(f"Collector signaled completion after {event_count} events")
                break
        else:
            # Loop ran out without a result or completion: return what we have,
            # but make the truncation visible.
            logger.warning(
                f"MCP call hit the safety limit of {self.MAX_EVENTS_PER_CALL} events "
                f"without a final result; returning partial output"
            )

        # Try to extract content from final_result if collector has no messages
        messages = list(collector.get_messages())
        if not messages and isinstance(final_result, dict):
            messages.extend(self._messages_from_result(final_result))

        # Build result
        result = {
            "conversationId": collector.conversation_id,
            "messages": messages,
            "output": "\n".join(messages) if messages else "",
            "usage": collector.token_usage,
        }

        # Log detailed debug info if we didn't capture any messages
        if not messages:
            debug_info = collector.get_debug_info()
            event_types = [e.get("method") or f"id:{e.get('id')}" for e in all_events[:20]]
            result_keys = list(final_result.keys()) if isinstance(final_result, dict) and final_result else "None"
            logger.warning(
                f"MCP call returned no messages. "
                f"conversation_id={collector.conversation_id}, "
                f"event_count={len(all_events)}, "
                f"event_types={event_types}, "
                f"collector_debug={debug_info}, "
                f"final_result_keys={result_keys}"
            )
            # Log codex/event details for debugging
            codex_events = [e for e in all_events if e.get("method") == "codex/event"]
            for ce in codex_events[-5:]:  # Last 5 codex events
                params = ce.get("params")
                msg = params.get("msg") if isinstance(params, dict) else None
                if not isinstance(msg, dict):
                    msg = {}
                logger.debug(f"  codex/event: type={msg.get('type')}, keys={list(msg.keys())}")

        logger.debug(f"MCP call complete: {len(messages)} messages, conversation_id={collector.conversation_id}")
        return result

    def close(self) -> None:
        """Terminate the server (kill after a 5 second grace period) and reset state."""
        if self._proc and self._proc.poll() is None:
            logger.info("Terminating MCP server...")
            self._proc.terminate()
            try:
                self._proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                logger.warning("MCP server didn't terminate, killing...")
                self._proc.kill()
                self._proc.wait()

        self._proc = None
        self._initialized = False

    @property
    def is_alive(self) -> bool:
        """True while the server process exists and has not exited."""
        return self._proc is not None and self._proc.poll() is None
752
-
753
-
754
- @register_adapter("codex_mcp")
755
- class CodexMCPAdapter(ExecutorAdapter):
756
- """
757
- Codex adapter using MCP server for sync conversations.
758
-
759
- This is the recommended way to have iterative conversations with Codex.
760
- The MCP client uses subprocess.Popen (not asyncio) so it persists across
761
- multiple asyncio.run() calls, preserving conversation state.
762
-
763
- Config isolation: Pass config_path to use a local codex.toml instead of
764
- the user's global ~/.codex/config.toml. This is the preferred approach.
765
- Falls back to config_overrides if no config_path is provided.
766
- """
767
- DEFAULT_MODEL = "gpt-5.1-codex-mini" # Default codex model
768
-
769
def __init__(
    self,
    model: str | None = None,
    config_path: Path | None = None,
    config_overrides: dict[str, str] | None = None,
):
    """
    Store the adapter configuration; no MCP client is created here.

    Args:
        model: Model name; DEFAULT_MODEL is used when falsy.
        config_path: Path to a local codex.toml used for config isolation.
        config_overrides: Override mapping handed to the MCP client
            (an empty dict when falsy).
    """
    self._model = model if model else self.DEFAULT_MODEL
    self._config_path = config_path
    self._config_overrides = config_overrides if config_overrides else {}
    # The client is built on demand by _ensure_client.
    self._mcp_client: MCPClient | None = None
    # Maps session_id -> conversationId.
    self._sessions: dict[str, str] = {}
    # Running token totals across all calls, used for cost tracking.
    self._total_usage: dict[str, int] = dict.fromkeys(
        (
            "input_tokens",
            "output_tokens",
            "cached_input_tokens",
            "reasoning_tokens",
            "total_tokens",
        ),
        0,
    )
788
-
789
def _accumulate_usage(self, usage: dict[str, Any]) -> None:
    """
    Fold one call's token usage into the running totals.

    Only counters already tracked in the totals are updated; a counter
    missing from *usage* contributes 0. A falsy *usage* is a no-op.
    """
    if usage:
        totals = self._total_usage
        for counter in totals:
            totals[counter] += usage.get(counter, 0)
795
-
796
@property
def total_usage(self) -> dict[str, int]:
    """Return a shallow copy of the token totals accumulated over all calls."""
    return dict(self._total_usage)
800
-
801
def _ensure_client(self) -> MCPClient:
    """
    Return the adapter's MCP client, creating and starting it as needed.

    The client is constructed on first use with this adapter's config path
    and overrides, and is (re)started whenever it reports not being alive.
    """
    client = self._mcp_client
    if client is None:
        client = MCPClient(
            config_path=self._config_path,
            config_overrides=self._config_overrides,
        )
        self._mcp_client = client

    if not client.is_alive:
        client.start()

    return client
813
-
814
@weave.op()
def _call_codex(
    self,
    task: str,
    cwd: str,
    sandbox: str,
    model: str | None = None,
    reasoning_effort: str | None = None,
) -> dict[str, Any]:
    """
    Start a new conversation through the "codex" MCP tool (traced by Weave).

    The call is synchronous and goes through the client returned by
    _ensure_client.

    Args:
        task: Prompt sent to the tool.
        cwd: Working directory passed to the tool.
        sandbox: Sandbox setting passed to the tool.
        model: Optional model name; omitted from the arguments when falsy.
        reasoning_effort: Optional value sent as "model_reasoning_effort";
            omitted when falsy.

    Returns:
        Dict with "conversation_id", "response", "raw_messages", "usage"
        (this call only) and "total_usage" (cumulative, including this call).
    """
    client = self._ensure_client()

    args: dict[str, Any] = {"prompt": task, "cwd": cwd, "sandbox": sandbox}
    if model:
        args["model"] = model
    if reasoning_effort:
        # NOTE(review): sent as a top-level tool argument named after the
        # codex config key - confirm the tool accepts it in this position.
        args["model_reasoning_effort"] = reasoning_effort

    logger.info(f"Calling codex with task_len={len(task)}, cwd={cwd}, model={model or 'default'}, reasoning_effort={reasoning_effort or 'default'}")
    logger.debug(f"Full codex args: {args}")

    result = client.call_tool("codex", args)

    # Summarize the shape of what came back.
    conv_id = result.get("conversationId")
    n_messages = len(result.get("messages", []))
    n_output_chars = len(result.get("output", ""))
    call_usage = result.get("usage", {})

    logger.info(
        f"codex result: conversation_id={conv_id}, "
        f"messages_count={n_messages}, output_len={n_output_chars}, "
        f"usage={call_usage.get('total_tokens', 0)} tokens"
    )

    # A conversation ID with no captured text means the response was lost.
    if conv_id and not (n_messages or n_output_chars):
        logger.warning(
            f"codex returned conversation_id={conv_id} but NO messages/output! "
            f"The agent processed {call_usage.get('total_tokens', 0)} tokens but we didn't capture the response. "
            "This may indicate an issue with event parsing."
        )

    # Totals are updated before the return value is built, so "total_usage"
    # already includes this call.
    self._accumulate_usage(call_usage)

    return {
        "conversation_id": conv_id,
        "response": self._extract_response(result),
        "raw_messages": result.get("messages", []),
        "usage": call_usage,
        "total_usage": self.total_usage,
    }
879
-
880
@weave.op()
def _call_codex_reply(
    self,
    conversation_id: str,
    message: str,
) -> dict[str, Any]:
    """
    Continue a conversation through the ``codex-reply`` MCP tool (traced by Weave).

    The tool is invoked with a plain blocking ``call_tool`` on the client
    returned by ``_ensure_client()``. Messages in the result that are empty,
    or that equal the sent ``message`` after lower-casing and whitespace
    collapsing, are treated as echoes of the prompt and dropped.

    Args:
        conversation_id: ID of the conversation to continue.
        message: Prompt text to send.

    Returns:
        Dict with ``response`` (extracted text), ``raw_messages`` (the
        filtered message list), ``usage`` for this call, ``total_usage``
        (``self.total_usage`` after accumulation) and ``conversation_lost``
        (True when the raw result carried neither messages nor output).
    """
    client = self._ensure_client()

    logger.info(f"Calling codex-reply with conversation_id={conversation_id}, message_len={len(message)}")
    logger.debug(f"MCP client alive: {client.is_alive}, initialized: {client._initialized}")

    result = client.call_tool("codex-reply", {
        "conversationId": conversation_id,
        "prompt": message,
    })

    # Log the full result structure for debugging
    logger.info(
        f"codex-reply result: conversationId={result.get('conversationId')}, "
        f"messages_count={len(result.get('messages', []))}, "
        f"output_len={len(result.get('output', ''))}, "
        f"usage={result.get('usage', {}).get('total_tokens', 0)} tokens"
    )

    # An empty result is how a lost conversation shows up; compute the flag
    # once so the log and the returned value cannot disagree.
    conversation_lost = not result.get("messages") and not result.get("output")
    if conversation_lost:
        logger.error(
            f"codex-reply returned empty result for conversation_id={conversation_id}. "
            f"The MCP server may have lost the conversation state. Result: {result}"
        )

    # Track usage
    usage = result.get("usage", {})
    self._accumulate_usage(usage)

    raw_messages = result.get("messages", [])

    def normalize_for_comparison(text: str) -> str:
        """Normalize text for comparison (lowercase, collapsed whitespace)."""
        return " ".join(text.lower().split())

    user_msg_normalized = normalize_for_comparison(message)

    def is_user_message_echo(text: str) -> bool:
        """Return True for empty text or an exact (normalized) copy of the prompt."""
        if not text:
            return True  # Empty is effectively an echo (skip it)
        # Full equality, not a prefix test: "Fix bug by X" must survive when
        # the user said "Fix bug". Comparing the normalized strings directly
        # replaces the former MD5 round-trip, which added nothing and raises
        # ValueError on FIPS-restricted Python builds.
        return normalize_for_comparison(text) == user_msg_normalized

    filtered_messages = [m for m in raw_messages if not is_user_message_echo(m)]

    # Build filtered result for extraction.
    # NOTE(review): when every message is filtered out this falls back to the
    # unfiltered "output" value - confirm that it cannot still contain the
    # echoed prompt.
    filtered_result = {
        **result,
        "messages": filtered_messages,
        "output": "\n".join(filtered_messages) if filtered_messages else result.get("output", ""),
    }

    response = self._extract_response(filtered_result)
    filtered_count = len(raw_messages) - len(filtered_messages)
    if filtered_count > 0:
        logger.debug(f"Filtered {filtered_count} user echo messages from response")
    logger.debug(f"codex-reply response length: {len(response)} chars")

    return {
        "response": response,
        "raw_messages": filtered_messages,  # Return filtered messages
        "usage": usage,
        "total_usage": self.total_usage,
        "conversation_lost": conversation_lost,
    }
974
-
975
@weave.op()
async def start_session(
    self,
    task: str,
    working_dir: Path,
    mode: Literal["sync", "async"] = "sync",
    model: str | None = None,
    sandbox: str = "workspace-write",
    **kwargs,
) -> ConversationSession:
    """
    Start a Codex session (sync or async mode).

    Sync mode runs the first turn through ``_call_codex`` and records the
    returned conversation ID, messages and usage on the session. Async mode
    spawns ``codex exec --json`` as a child process and returns immediately
    with the process handle stored on ``session.process``.

    Args:
        task: Prompt / task description for the agent.
        working_dir: Directory the agent should work in.
        mode: ``"sync"`` (MCP conversation) or anything else for the
            ``codex exec`` subprocess path.
        model: Model override; falls back to ``self._model`` when falsy.
        sandbox: Sandbox setting, forwarded in sync mode only.
        **kwargs: Only ``reasoning_effort`` is read, and only in sync mode.

    Returns:
        The newly created ``ConversationSession``.
    """
    effective_model = model or self._model
    session = ConversationSession(
        adapter=self.name,
        mode=SessionMode(mode),
        working_dir=working_dir,
        task_description=task,
        model=effective_model,
    )

    if mode == "sync":
        # Use traced codex call (synchronous - MCP client persists across calls)
        result = self._call_codex(
            task=task,
            cwd=str(working_dir.absolute()),
            sandbox=sandbox,
            model=effective_model,
            reasoning_effort=kwargs.get("reasoning_effort"),
        )

        # Extract conversation ID and response
        session.conversation_id = result["conversation_id"]
        if session.conversation_id:
            self._sessions[session.id] = session.conversation_id
            logger.debug(f"Session {session.id[:8]} mapped to conversation {session.conversation_id}")
        else:
            # This is bad - we won't be able to continue this conversation
            logger.warning(
                f"Session {session.id[:8]} started but MCP didn't return a conversation ID. "
                "Further converse() calls will fail."
            )

        session.add_message("user", task)
        session.add_message("assistant", result["response"])

        # Track token usage on the session
        session.add_usage(result.get("usage", {}))

    else:
        # Async mode: use codex exec (fire-and-forget)
        # This runs in a subprocess without MCP, outputs JSONL events
        # NOTE(review): this path always bypasses approvals and sandboxing;
        # the `sandbox` argument is not consulted here - confirm that is
        # intended.
        cmd = [
            "codex", "exec",
            "--dangerously-bypass-approvals-and-sandbox",
            "--skip-git-repo-check",
            "--json",
        ]
        # Only pass --model when one is known: a None entry in argv makes
        # Popen raise TypeError, whereas sync mode already treats the model
        # as optional.
        if effective_model:
            cmd += ["--model", effective_model]
        cmd += [
            "-C", str(working_dir.absolute()),  # Explicit working directory
            "--", task,
        ]

        # The slice keeps the task text (always the last element) out of the log.
        logger.info(f"Starting async codex: {' '.join(cmd[:8])}...")

        # NOTE(review): nothing in this method drains these pipes. If the
        # child fills the OS pipe buffer before its output is read it will
        # block - confirm the consumer reads promptly, or redirect to files.
        proc = subprocess.Popen(
            cmd,
            cwd=working_dir,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        session.process = proc
        session.add_message("user", task)

    return session
1049
-
1050
async def send_message(
    self,
    session: ConversationSession,
    message: str,
) -> str:
    """
    Send a follow-up message on a sync conversation and return the reply text.

    The exchange (user message, assistant reply) and the call's token usage
    are recorded on ``session``. If the reply is flagged as
    ``conversation_lost`` the session's conversation ID is cleared.

    Raises:
        ValueError: If the session is not in sync mode, is not active, or
            has no conversation ID.
    """
    # Guard clauses: only an active sync session with a known conversation
    # can be continued.
    if session.mode != SessionMode.SYNC:
        raise ValueError("Cannot send message to async session")
    if session.status != SessionStatus.ACTIVE:
        raise ValueError(f"Session is not active: {session.status}")
    conv_id = session.conversation_id
    if not conv_id:
        raise ValueError("Session has no conversation ID")

    # Traced codex-reply call (blocking).
    reply = self._call_codex_reply(conversation_id=conv_id, message=message)
    answer = reply["response"]

    if reply.get("conversation_lost"):
        logger.warning(
            f"Conversation {conv_id} was lost. "
            f"Session {session.id} will be marked as needing re-delegation."
        )
        # Drop the stale ID so the orchestrator can see the conversation is gone.
        session.conversation_id = None

    for role, text in (("user", message), ("assistant", answer)):
        session.add_message(role, text)

    # Token usage for this turn.
    session.add_usage(reply.get("usage", {}))

    return answer
1087
-
1088
@weave.op()
def _parse_jsonl_output(self, stdout: str) -> dict[str, Any]:
    """
    Parse JSONL output from codex exec --json.

    Blank lines, lines that are not valid JSON, and lines whose JSON value
    is not an object are skipped (the latter two with a warning) instead of
    aborting the parse.

    Returns dict with:
    - response: The agent's message text (``agent_message`` items from
      ``item.completed`` events, joined with newlines)
    - usage: Token usage stats (from the last ``turn.completed`` event)
    - thread_id: The conversation thread ID (from ``thread.started``)
    - events: All parsed event objects (for debugging)
    """
    response_parts: list[str] = []
    usage: dict[str, Any] = {}
    thread_id = None
    events: list[dict[str, Any]] = []

    for line in stdout.strip().split("\n"):
        if not line.strip():
            continue

        # Keep the try body to the one call that can raise JSONDecodeError.
        try:
            event = json.loads(line)
        except json.JSONDecodeError:
            logger.warning(f"Failed to parse JSONL line: {line[:100]}")
            continue

        # A line such as `42` or `"text"` is valid JSON but not an event;
        # calling .get() on it would raise AttributeError.
        if not isinstance(event, dict):
            logger.warning(f"Skipping non-object JSONL line: {line[:100]}")
            continue

        events.append(event)
        event_type = event.get("type", "")

        if event_type == "thread.started":
            thread_id = event.get("thread_id")

        elif event_type == "item.completed":
            item = event.get("item", {})
            # Tolerate a null or otherwise malformed "item" payload.
            if isinstance(item, dict) and item.get("type") == "agent_message":
                response_parts.append(item.get("text", ""))

        elif event_type == "turn.completed":
            # A null "usage" is normalised to an empty dict.
            usage = event.get("usage") or {}

    return {
        "response": "\n".join(response_parts),
        "usage": usage,
        "thread_id": thread_id,
        "events": events,
    }
1134
-
1135
@weave.op()
async def check_status(
    self,
    session: ConversationSession,
) -> dict:
    """
    Check status of an async session.

    For a non-async session this only reports ``session.status``. For an
    async session it polls the child process. Once the process has exited,
    its output is collected and the session is marked completed (exit code
    0) or failed (any other exit code).

    Returns:
        A dict whose ``status`` is the session's own status value (non-async
        sessions), ``"unknown"``, ``"running"``, ``"completed"`` (with
        ``response``, ``usage``, ``thread_id``) or ``"failed"`` (with
        ``error``, ``exit_code``).
    """
    if session.mode != SessionMode.ASYNC:
        return {"status": session.status.value}

    if session.process is None:
        return {"status": "unknown", "error": "No process handle"}

    # Check if process is still running
    exit_code = session.process.poll()
    if exit_code is None:
        return {"status": "running"}

    # NOTE(review): nothing here guards against being called again after the
    # process has already been reported finished; a repeat call re-enters
    # the path below - confirm callers stop polling once they get a result.
    stdout, stderr = session.process.communicate()

    if exit_code == 0:
        # Parse JSONL to extract actual response
        parsed = self._parse_jsonl_output(stdout)
        response_text = parsed["response"] or "(no response captured)"

        # Add the response as a message
        session.add_message("assistant", response_text)

        # Track token usage; total is derived from input + output.
        usage = parsed["usage"]
        if usage:
            input_tokens = usage.get("input_tokens", 0)
            output_tokens = usage.get("output_tokens", 0)
            session.add_usage({
                "input_tokens": input_tokens,
                "output_tokens": output_tokens,
                "total_tokens": input_tokens + output_tokens,
            })

        session.complete(response_text[:500])
        return {
            "status": "completed",
            "response": response_text,
            "usage": usage,
            "thread_id": parsed["thread_id"],
        }

    # Failure path. A whitespace-only stderr is truthy but strips to "",
    # which used to produce an empty error message; fall back to the exit
    # code in that case as well.
    error_msg = (stderr or "").strip() or f"Exit code: {exit_code}"

    # Sometimes errors come through stdout as JSONL too
    if stdout and not stderr:
        try:
            parsed = self._parse_jsonl_output(stdout)
            if not parsed["response"]:
                error_msg = f"Process failed with no response. Exit code: {exit_code}"
        except Exception:
            # Best effort: if stdout cannot be parsed at all, surface its head.
            error_msg = stdout[:500]

    session.fail(error_msg[:500])
    return {"status": "failed", "error": error_msg, "exit_code": exit_code}
1196
-
1197
async def stop(
    self,
    session: ConversationSession,
) -> None:
    """
    Stop a session.

    If the session has a child process that is still running, it is asked to
    terminate and given 5 seconds to exit; after that it is killed and
    reaped. The session is then marked failed with "Stopped by user" and its
    entry is removed from ``self._sessions``.
    """
    proc = session.process
    if proc and proc.poll() is None:
        proc.terminate()
        try:
            proc.wait(timeout=5)
        except subprocess.TimeoutExpired:
            proc.kill()
            # Reap the killed child so it does not linger as a zombie.
            proc.wait()

    session.fail("Stopped by user")

    # Remove from tracking (no-op when the session was never registered).
    self._sessions.pop(session.id, None)
1216
-
1217
async def cleanup(self) -> None:
    """Close the MCP client, if there is one, and forget the reference."""
    client = self._mcp_client
    if not client:
        return
    client.close()
    self._mcp_client = None
1222
-
1223
- def _extract_response(self, result: dict) -> str:
1224
- """Extract response text from MCP result."""
1225
- # Check for error indicators - empty result suggests lost conversation
1226
- if (
1227
- result.get("conversationId") is None
1228
- and not result.get("messages")
1229
- and not result.get("output")
1230
- ):
1231
- logger.warning(f"MCP returned empty result - conversation may be lost: {result}")
1232
- return "[ERROR] Conversation lost - the MCP server no longer has this session. Please re-delegate the task."
1233
-
1234
- # First check for our collected output
1235
- if result.get("output"):
1236
- return result["output"]
1237
-
1238
- # Check for messages list
1239
- if result.get("messages"):
1240
- return "\n".join(result["messages"])
1241
-
1242
- # Result may have different structures depending on codex version
1243
- if "content" in result:
1244
- content = result["content"]
1245
- if isinstance(content, list):
1246
- texts = []
1247
- for block in content:
1248
- if isinstance(block, dict) and "text" in block:
1249
- texts.append(block["text"])
1250
- elif isinstance(block, str):
1251
- texts.append(block)
1252
- if texts:
1253
- return "\n".join(texts)
1254
- elif isinstance(content, str):
1255
- return content
1256
-
1257
- if "text" in result:
1258
- return result["text"]
1259
-
1260
- # Fallback: stringify the result (but log it as unexpected)
1261
- logger.warning(f"Unexpected MCP result format, returning raw: {list(result.keys())}")
1262
- return json.dumps(result, indent=2)