zwarm 1.3.10__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,12 +8,14 @@ Uses codex mcp-server for true iterative conversations:
8
8
 
9
9
  from __future__ import annotations
10
10
 
11
+ import hashlib
11
12
  import json
12
13
  import logging
13
14
  import queue
14
15
  import subprocess
15
16
  import threading
16
17
  import time
18
+ from dataclasses import dataclass, field
17
19
  from pathlib import Path
18
20
  from typing import Any, Literal
19
21
 
@@ -30,6 +32,344 @@ from zwarm.core.models import (
30
32
  logger = logging.getLogger(__name__)
31
33
 
32
34
 
35
+ # =============================================================================
36
+ # MessageCollector: Robust event collection with deduplication
37
+ # =============================================================================
38
+
39
+
40
@dataclass
class MessageSegment:
    """One piece of an assistant turn, kept for future segment-aware rendering."""

    # Identifier of this segment.
    id: str
    # Category of content carried by the segment.
    kind: Literal["assistant_text", "progress", "tool_call", "tool_result", "error"]
    # Text payload of the segment.
    text: str
    # Lifecycle marker; a freshly created segment is "open".
    status: Literal["open", "closed"] = "open"
    # Presumably the IDs of the events that fed this segment (verify against
    # callers); default_factory gives every instance its own set.
    source_event_ids: set[str] = field(default_factory=set)
48
+
49
+
50
+ class MessageCollector:
51
+ """
52
+ Collects and deduplicates messages from MCP event stream.
53
+
54
+ Solves the transcript rendering bugs by:
55
+ 1. Deduplicating events by ID
56
+ 2. Using priority-based message selection (item_completed > task_complete > streaming)
57
+ 3. Tracking message sources for debugging
58
+ 4. Never mixing streaming deltas with finalized messages
59
+
60
+ Priority order (highest to lowest):
61
+ - item_completed with AgentMessage/agent_message → DEFINITIVE
62
+ - task_complete.last_agent_message → FALLBACK ONLY
63
+ - streaming deltas → ONLY IF NO DEFINITIVE SOURCE
64
+ """
65
+
66
+ def __init__(self):
67
+ # Deduplication
68
+ self._seen_event_ids: set[str] = set()
69
+ self._seen_content_hashes: set[str] = set() # Content-based dedup
70
+
71
+ # Message collection (priority-ordered)
72
+ self._definitive_messages: list[str] = [] # From item_completed
73
+ self._fallback_message: str | None = None # From task_complete
74
+ self._streaming_buffer: list[str] = [] # Streaming deltas
75
+
76
+ # Metadata
77
+ self._conversation_id: str | None = None
78
+ self._session_id: str | None = None
79
+ self._token_usage: dict[str, Any] = {}
80
+ self._is_complete: bool = False
81
+
82
+ # Debug tracking
83
+ self._message_sources: list[tuple[str, str]] = [] # (source, text_preview)
84
+
85
+ def _extract_event_id(self, event: dict) -> str | None:
86
+ """Extract a unique event ID for deduplication."""
87
+ # Try various ID fields that MCP events might have
88
+ for key in ("id", "event_id", "item_id", "message_id"):
89
+ if key in event:
90
+ return str(event[key])
91
+
92
+ # For nested events, try params
93
+ params = event.get("params", {})
94
+ msg = params.get("msg", {})
95
+ for key in ("id", "event_id", "item_id"):
96
+ if key in msg:
97
+ return str(msg[key])
98
+
99
+ return None
100
+
101
+ def _content_hash(self, text: str) -> str:
102
+ """Create a hash of content for deduplication."""
103
+ # Normalize whitespace for comparison
104
+ normalized = " ".join(text.split())
105
+ return hashlib.md5(normalized.encode()).hexdigest()[:16]
106
+
107
+ def _is_duplicate_content(self, text: str) -> bool:
108
+ """Check if this content was already collected."""
109
+ if not text or not text.strip():
110
+ return True # Empty is "duplicate" (skip it)
111
+
112
+ content_hash = self._content_hash(text)
113
+ if content_hash in self._seen_content_hashes:
114
+ return True
115
+
116
+ self._seen_content_hashes.add(content_hash)
117
+ return False
118
+
119
+ def _add_definitive_message(self, text: str, source: str) -> None:
120
+ """Add a definitive message (from item_completed)."""
121
+ if not text or not text.strip():
122
+ return
123
+
124
+ if self._is_duplicate_content(text):
125
+ logger.debug(f"Skipping duplicate message from {source}: {text[:50]}...")
126
+ return
127
+
128
+ self._definitive_messages.append(text)
129
+ self._message_sources.append((source, text[:50]))
130
+ logger.debug(f"Added definitive message from {source}: {text[:50]}...")
131
+
132
+ def _set_fallback_message(self, text: str, source: str) -> None:
133
+ """Set fallback message (from task_complete). Only used if no definitive."""
134
+ if not text or not text.strip():
135
+ return
136
+
137
+ # Only set if we don't have definitive messages
138
+ if self._definitive_messages:
139
+ logger.debug(f"Ignoring fallback from {source}: have definitive messages")
140
+ return
141
+
142
+ if self._is_duplicate_content(text):
143
+ logger.debug(f"Skipping duplicate fallback from {source}")
144
+ return
145
+
146
+ self._fallback_message = text
147
+ self._message_sources.append((source, text[:50]))
148
+
149
+ def _add_streaming_delta(self, text: str) -> None:
150
+ """Add streaming delta. Only used if no definitive messages at end."""
151
+ if text:
152
+ self._streaming_buffer.append(text)
153
+
154
+ def process_event(self, event: dict) -> bool:
155
+ """
156
+ Process a single MCP event.
157
+
158
+ Returns True if processing should continue, False if complete.
159
+ """
160
+ # 1. Check for event ID and dedupe
161
+ event_id = self._extract_event_id(event)
162
+ if event_id and event_id in self._seen_event_ids:
163
+ logger.debug(f"Skipping duplicate event: {event_id}")
164
+ return True
165
+ if event_id:
166
+ self._seen_event_ids.add(event_id)
167
+
168
+ # 2. Handle codex/event notifications
169
+ if event.get("method") == "codex/event":
170
+ params = event.get("params", {})
171
+ msg = params.get("msg", {})
172
+ msg_type = msg.get("type")
173
+
174
+ self._handle_codex_event(msg, msg_type)
175
+
176
+ # Check for completion events
177
+ if msg_type in ("task_complete", "task_completed"):
178
+ self._is_complete = True
179
+ return False
180
+
181
+ return True
182
+
183
+ def _handle_codex_event(self, msg: dict, msg_type: str | None) -> None:
184
+ """Handle a codex/event notification."""
185
+ if not msg_type:
186
+ return
187
+
188
+ # Session configuration
189
+ if msg_type == "session_configured":
190
+ self._session_id = msg.get("session_id")
191
+ logger.debug(f"Session configured: {self._session_id}")
192
+
193
+ # Item completed - DEFINITIVE SOURCE
194
+ elif msg_type == "item_completed":
195
+ self._handle_item_completed(msg)
196
+
197
+ # Direct agent message - DEFINITIVE SOURCE
198
+ elif msg_type == "agent_message":
199
+ text = msg.get("message", "") or msg.get("text", "") or msg.get("content", "")
200
+ self._add_definitive_message(text, "agent_message_event")
201
+
202
+ # Task complete - FALLBACK SOURCE
203
+ elif msg_type in ("task_complete", "task_completed"):
204
+ last_msg = msg.get("last_agent_message")
205
+ if last_msg:
206
+ self._set_fallback_message(last_msg, "task_complete")
207
+
208
+ # Token usage
209
+ elif msg_type == "token_count":
210
+ info = msg.get("info") or {}
211
+ if info:
212
+ usage = info.get("total_token_usage", {})
213
+ if usage:
214
+ self._token_usage = {
215
+ "input_tokens": usage.get("input_tokens", 0),
216
+ "output_tokens": usage.get("output_tokens", 0),
217
+ "cached_input_tokens": usage.get("cached_input_tokens", 0),
218
+ "reasoning_tokens": usage.get("reasoning_output_tokens", 0),
219
+ "total_tokens": usage.get("total_tokens", 0),
220
+ }
221
+
222
+ # Streaming deltas - LOWEST PRIORITY
223
+ elif msg_type in ("text_delta", "content_block_delta", "message_delta", "text"):
224
+ delta = msg.get("delta", {})
225
+ text = delta.get("text", "") or msg.get("text", "")
226
+ self._add_streaming_delta(text)
227
+
228
+ # Response event - MEDIUM PRIORITY (treat as definitive)
229
+ elif msg_type == "response":
230
+ text = msg.get("response", "") or msg.get("text", "")
231
+ self._add_definitive_message(text, "response_event")
232
+
233
+ # Message event - check role
234
+ elif msg_type == "message":
235
+ role = msg.get("role", "").lower()
236
+ if role in ("assistant", "agent", ""):
237
+ text = msg.get("text", "") or msg.get("content", "")
238
+ if text and role != "user":
239
+ self._add_definitive_message(text, "message_event")
240
+
241
+ # Output event
242
+ elif msg_type == "output":
243
+ text = msg.get("output", "") or msg.get("text", "") or msg.get("content", "")
244
+ self._add_definitive_message(text, "output_event")
245
+
246
+ # Completion variants
247
+ elif msg_type in ("item.completed", "response.completed"):
248
+ item = msg.get("item", {})
249
+ if item.get("type") == "agent_message":
250
+ text = item.get("text", "")
251
+ self._add_definitive_message(text, f"{msg_type}_event")
252
+ elif "text" in msg:
253
+ self._add_definitive_message(msg["text"], f"{msg_type}_direct")
254
+
255
+ # Error
256
+ elif msg_type == "error":
257
+ error_msg = msg.get("error", msg.get("message", str(msg)))
258
+ raise RuntimeError(f"Codex error: {error_msg}")
259
+
260
+ def _handle_item_completed(self, msg: dict) -> None:
261
+ """Handle item_completed event - the primary source of messages."""
262
+ item = msg.get("item", {})
263
+ item_type = item.get("type")
264
+
265
+ # AgentMessage - primary format
266
+ if item_type == "AgentMessage":
267
+ content = item.get("content", [])
268
+ for block in content:
269
+ if isinstance(block, dict) and block.get("text"):
270
+ self._add_definitive_message(block["text"], "AgentMessage")
271
+ elif isinstance(block, str):
272
+ self._add_definitive_message(block, "AgentMessage_str")
273
+
274
+ # agent_message - variant spelling
275
+ elif item_type == "agent_message":
276
+ text = item.get("text", "") or item.get("message", "")
277
+ if text:
278
+ self._add_definitive_message(text, "agent_message")
279
+ content = item.get("content", [])
280
+ for block in content:
281
+ if isinstance(block, dict) and block.get("text"):
282
+ self._add_definitive_message(block["text"], "agent_message_content")
283
+ elif isinstance(block, str):
284
+ self._add_definitive_message(block, "agent_message_content_str")
285
+
286
+ # Generic message with assistant role
287
+ elif item_type == "message":
288
+ role = item.get("role", "")
289
+ if role == "assistant":
290
+ content = item.get("content", [])
291
+ for block in content:
292
+ if isinstance(block, dict) and block.get("text"):
293
+ self._add_definitive_message(block["text"], "message_assistant")
294
+ elif isinstance(block, str):
295
+ self._add_definitive_message(block, "message_assistant_str")
296
+ # Also check text field directly
297
+ text = item.get("text", "")
298
+ if text:
299
+ self._add_definitive_message(text, "message_text")
300
+
301
+ # Function call output (for context, truncated)
302
+ elif item_type == "function_call_output":
303
+ output = item.get("output", "")
304
+ if output and len(output) < 1000:
305
+ # Don't add to messages, just log
306
+ logger.debug(f"Tool output: {output[:100]}...")
307
+
308
+ def set_conversation_id(self, conv_id: str | None) -> None:
309
+ """Set conversation ID from final result."""
310
+ if conv_id:
311
+ self._conversation_id = conv_id
312
+
313
+ @property
314
+ def conversation_id(self) -> str | None:
315
+ """Get the conversation ID."""
316
+ return self._conversation_id or self._session_id
317
+
318
+ @property
319
+ def token_usage(self) -> dict[str, Any]:
320
+ """Get token usage stats."""
321
+ return self._token_usage
322
+
323
+ @property
324
+ def is_complete(self) -> bool:
325
+ """Check if collection is complete."""
326
+ return self._is_complete
327
+
328
+ def get_messages(self) -> list[str]:
329
+ """
330
+ Get the final deduplicated message list.
331
+
332
+ Priority:
333
+ 1. Definitive messages (from item_completed)
334
+ 2. Fallback message (from task_complete)
335
+ 3. Streaming buffer (only if no definitive or fallback)
336
+ """
337
+ # Prefer definitive messages
338
+ if self._definitive_messages:
339
+ logger.debug(f"Returning {len(self._definitive_messages)} definitive messages")
340
+ return self._definitive_messages
341
+
342
+ # Fall back to task_complete message
343
+ if self._fallback_message:
344
+ logger.debug("Returning fallback message from task_complete")
345
+ return [self._fallback_message]
346
+
347
+ # Last resort: streaming buffer
348
+ if self._streaming_buffer:
349
+ full_text = "".join(self._streaming_buffer)
350
+ if full_text.strip():
351
+ logger.debug(f"Returning streaming buffer ({len(self._streaming_buffer)} chunks)")
352
+ return [full_text]
353
+
354
+ return []
355
+
356
+ def get_response(self) -> str:
357
+ """Get the final response as a single string."""
358
+ messages = self.get_messages()
359
+ return "\n".join(messages) if messages else ""
360
+
361
+ def get_debug_info(self) -> dict:
362
+ """Get debug information about message collection."""
363
+ return {
364
+ "seen_event_ids": len(self._seen_event_ids),
365
+ "seen_content_hashes": len(self._seen_content_hashes),
366
+ "definitive_messages": len(self._definitive_messages),
367
+ "has_fallback": self._fallback_message is not None,
368
+ "streaming_chunks": len(self._streaming_buffer),
369
+ "message_sources": self._message_sources,
370
+ }
371
+
372
+
33
373
  class MCPClient:
34
374
  """
35
375
  Robust MCP client for communicating with codex mcp-server.
@@ -42,7 +382,18 @@ class MCPClient:
42
382
  of spawning new reader threads on timeout.
43
383
  """
44
384
 
45
- def __init__(self):
385
+ # Default config overrides for zwarm-managed codex sessions
386
+ # These override ~/.codex/config.toml to ensure consistent behavior
387
+ # Only used as fallback if no config_path is provided
388
+ DEFAULT_CONFIG_OVERRIDES: dict[str, str] = {
389
+ "model_reasoning_effort": "high", # Use 'high' for compatibility with all models
390
+ }
391
+
392
+ def __init__(
393
+ self,
394
+ config_path: Path | None = None,
395
+ config_overrides: dict[str, str] | None = None,
396
+ ):
46
397
  self._proc: subprocess.Popen | None = None
47
398
  self._proc_pid: int | None = None # Track PID to detect restarts
48
399
  self._request_id = 0
@@ -53,6 +404,10 @@ class MCPClient:
53
404
  self._stdout_queue: queue.Queue[str | None] = queue.Queue()
54
405
  self._lock = threading.Lock() # Protect writes only
55
406
  self._start_count = 0 # Track how many times we've started
407
+ # Config path for full isolation (preferred)
408
+ self._config_path = config_path
409
+ # Fallback: merge default overrides with any custom ones (used if no config_path)
410
+ self._config_overrides = {**self.DEFAULT_CONFIG_OVERRIDES, **(config_overrides or {})}
56
411
 
57
412
  def start(self) -> None:
58
413
  """Start the MCP server process."""
@@ -69,9 +424,19 @@ class MCPClient:
69
424
  )
70
425
 
71
426
  self._start_count += 1
72
- logger.info(f"Starting codex mcp-server... (start_count={self._start_count})")
427
+
428
+ # Build command - prefer config file for full isolation, fallback to overrides
429
+ cmd = ["codex", "mcp-server"]
430
+ if self._config_path and self._config_path.exists():
431
+ cmd.extend(["--config", str(self._config_path)])
432
+ logger.info(f"Starting codex mcp-server with config: {self._config_path} (start_count={self._start_count})")
433
+ else:
434
+ # Fallback to individual overrides
435
+ for key, value in self._config_overrides.items():
436
+ cmd.extend(["-c", f'{key}="{value}"'])
437
+ logger.info(f"Starting codex mcp-server with overrides: {self._config_overrides} (start_count={self._start_count})")
73
438
  self._proc = subprocess.Popen(
74
- ["codex", "mcp-server"],
439
+ cmd,
75
440
  stdin=subprocess.PIPE,
76
441
  stdout=subprocess.PIPE,
77
442
  stderr=subprocess.PIPE,
@@ -238,6 +603,12 @@ class MCPClient:
238
603
  """
239
604
  Call an MCP tool and collect streaming events.
240
605
 
606
+ Uses MessageCollector for robust deduplication and priority-based
607
+ message selection. This prevents the transcript rendering bugs:
608
+ - Message duplication
609
+ - Role contamination
610
+ - Turn mis-association
611
+
241
612
  Args:
242
613
  name: Tool name (codex, codex-reply)
243
614
  arguments: Tool arguments
@@ -260,14 +631,9 @@ class MCPClient:
260
631
  with self._lock:
261
632
  self._write(json.dumps(request) + "\n")
262
633
 
263
- # Collect streaming events until final result
264
- # Reader thread queues lines, we pull from queue with timeout
265
- session_id = None
266
- conversation_id = None # Track conversation ID separately
267
- agent_messages: list[str] = []
268
- streaming_text: list[str] = [] # Accumulate streaming delta text
634
+ # Use MessageCollector for robust event handling
635
+ collector = MessageCollector()
269
636
  final_result = None
270
- token_usage: dict[str, Any] = {} # Track token usage
271
637
  start_time = time.time()
272
638
  all_events: list[dict] = [] # Keep ALL events for debugging
273
639
 
@@ -280,13 +646,10 @@ class MCPClient:
280
646
  raise RuntimeError(f"MCP call timed out after {timeout}s ({event_count} events received)")
281
647
 
282
648
  # Read from queue with per-event timeout
283
- # Empty string = timeout (process still alive, just waiting)
284
- # None sentinel is handled inside _read_line (raises RuntimeError)
285
649
  line = self._read_line(timeout=30.0)
286
650
 
287
651
  if not line:
288
652
  # Timeout waiting for event - process is still alive, just slow
289
- # This is normal during long codex operations
290
653
  logger.debug(f"Waiting for MCP event... (elapsed: {elapsed:.0f}s, events: {event_count})")
291
654
  continue
292
655
 
@@ -303,192 +666,58 @@ class MCPClient:
303
666
  final_result = event["result"]
304
667
  # Extract conversation ID from final result
305
668
  if isinstance(final_result, dict):
306
- conversation_id = final_result.get("conversationId") or final_result.get("conversation_id")
307
- logger.debug(f"Got final result after {event_count} events, conversation_id={conversation_id}")
669
+ conv_id = final_result.get("conversationId") or final_result.get("conversation_id")
670
+ collector.set_conversation_id(conv_id)
671
+ logger.debug(f"Got final result after {event_count} events")
308
672
  break
309
673
  elif "error" in event:
310
674
  error = event["error"]
311
675
  raise RuntimeError(f"MCP tool error: {error.get('message', error)}")
312
676
 
313
- # Process streaming events
314
- if event.get("method") == "codex/event":
315
- params = event.get("params", {})
316
- msg = params.get("msg", {})
317
- msg_type = msg.get("type")
318
-
319
- # Log ALL event types to help debug missing messages
320
- logger.debug(f"MCP event: type={msg_type}, keys={list(msg.keys())}")
321
-
322
- if msg_type == "session_configured":
323
- session_id = msg.get("session_id")
324
- logger.debug(f"Session configured: {session_id}")
325
-
326
- elif msg_type == "item_completed":
327
- item = msg.get("item", {})
328
- item_type = item.get("type")
329
-
330
- # Log ALL item_completed events to help debug
331
- logger.debug(f"item_completed: type={item_type}, keys={list(item.keys())}")
332
-
333
- # Agent text responses - codex uses "AgentMessage" type
334
- if item_type == "AgentMessage":
335
- content = item.get("content", [])
336
- for block in content:
337
- if isinstance(block, dict) and block.get("text"):
338
- agent_messages.append(block["text"])
339
- elif isinstance(block, str):
340
- agent_messages.append(block)
341
-
342
- # Also check for "agent_message" (lowercase) variant
343
- elif item_type == "agent_message":
344
- text = item.get("text", "") or item.get("message", "")
345
- if text:
346
- agent_messages.append(text)
347
- # Also check content array
348
- content = item.get("content", [])
349
- for block in content:
350
- if isinstance(block, dict) and block.get("text"):
351
- agent_messages.append(block["text"])
352
- elif isinstance(block, str):
353
- agent_messages.append(block)
354
-
355
- # Legacy format check
356
- elif item_type == "message" and item.get("role") == "assistant":
357
- content = item.get("content", [])
358
- for block in content:
359
- if isinstance(block, dict) and block.get("text"):
360
- agent_messages.append(block["text"])
361
- elif isinstance(block, str):
362
- agent_messages.append(block)
363
-
364
- # Generic message type - check for text/content
365
- elif item_type == "message":
366
- text = item.get("text", "")
367
- if text:
368
- agent_messages.append(text)
369
- content = item.get("content", [])
370
- if isinstance(content, str):
371
- agent_messages.append(content)
372
- elif isinstance(content, list):
373
- for block in content:
374
- if isinstance(block, dict) and block.get("text"):
375
- agent_messages.append(block["text"])
376
- elif isinstance(block, str):
377
- agent_messages.append(block)
378
-
379
- # Function call outputs (for context)
380
- elif item_type == "function_call_output":
381
- output = item.get("output", "")
382
- if output and len(output) < 1000:
383
- agent_messages.append(f"[Tool output]: {output[:500]}")
384
-
385
- # Log other item types we're not handling
386
- elif item_type not in ("function_call", "tool_call", "UserMessage", "user_message"):
387
- logger.debug(f"Unhandled item_completed type: {item_type}, item={item}")
388
-
389
- elif msg_type == "agent_message":
390
- # Direct agent message event
391
- message = msg.get("message", "")
392
- if message:
393
- agent_messages.append(message)
394
-
395
- elif msg_type in ("task_complete", "task_completed"):
396
- # Task is done - capture last_agent_message as fallback
397
- last_msg = msg.get("last_agent_message")
398
- if last_msg and last_msg not in agent_messages:
399
- agent_messages.append(last_msg)
400
- logger.debug(f"Task complete after {event_count} events")
677
+ # Process event through collector
678
+ try:
679
+ should_continue = collector.process_event(event)
680
+ if not should_continue:
681
+ logger.debug(f"Collector signaled completion after {event_count} events")
401
682
  break
683
+ except RuntimeError as e:
684
+ # Collector raises RuntimeError for codex errors
685
+ raise
402
686
 
403
- elif msg_type == "token_count":
404
- # Capture token usage for cost tracking
405
- info = msg.get("info") or {}
406
- if info:
407
- usage = info.get("total_token_usage", {})
408
- if usage:
409
- token_usage = {
410
- "input_tokens": usage.get("input_tokens", 0),
411
- "output_tokens": usage.get("output_tokens", 0),
412
- "cached_input_tokens": usage.get("cached_input_tokens", 0),
413
- "reasoning_tokens": usage.get("reasoning_output_tokens", 0),
414
- "total_tokens": usage.get("total_tokens", 0),
415
- }
416
- logger.debug(f"Token usage: {token_usage}")
417
-
418
- elif msg_type == "error":
419
- error_msg = msg.get("error", msg.get("message", str(msg)))
420
- raise RuntimeError(f"Codex error: {error_msg}")
421
-
422
- # Handle streaming text events (various formats)
423
- elif msg_type in ("text_delta", "content_block_delta", "message_delta"):
424
- delta = msg.get("delta", {})
425
- text = delta.get("text", "") or msg.get("text", "")
426
- if text:
427
- streaming_text.append(text)
428
-
429
- elif msg_type == "text":
430
- text = msg.get("text", "")
431
- if text:
432
- streaming_text.append(text)
433
-
434
- elif msg_type == "response":
435
- # Some versions send the full response this way
436
- response_text = msg.get("response", "") or msg.get("text", "")
437
- if response_text:
438
- agent_messages.append(response_text)
439
-
440
- elif msg_type == "message":
441
- # Direct message event
442
- text = msg.get("text", "") or msg.get("content", "")
443
- if text:
444
- agent_messages.append(text)
445
-
446
- else:
447
- # Log unknown event types at debug level to help diagnose
448
- if msg_type and msg_type not in ("session_started", "thinking", "tool_call", "function_call"):
449
- logger.debug(f"Unhandled MCP event type: {msg_type}, msg keys: {list(msg.keys())}")
450
-
451
- # Merge streaming text into messages if we got any
452
- if streaming_text:
453
- full_streaming = "".join(streaming_text)
454
- if full_streaming.strip():
455
- agent_messages.append(full_streaming)
456
- logger.debug(f"Captured {len(streaming_text)} streaming chunks ({len(full_streaming)} chars)")
457
-
458
- # Try to extract content from final_result if we have no messages
459
- if final_result and not agent_messages:
687
+ # Try to extract content from final_result if collector has no messages
688
+ messages = collector.get_messages()
689
+ if final_result and not messages:
460
690
  if "content" in final_result:
461
691
  content = final_result["content"]
462
692
  if isinstance(content, list):
463
693
  for block in content:
464
694
  if isinstance(block, dict) and block.get("text"):
465
- agent_messages.append(block["text"])
695
+ messages.append(block["text"])
466
696
  elif isinstance(block, str):
467
- agent_messages.append(block)
697
+ messages.append(block)
468
698
  elif isinstance(content, str):
469
- agent_messages.append(content)
470
- # Also check for text field
471
- if not agent_messages and "text" in final_result:
472
- agent_messages.append(final_result["text"])
699
+ messages.append(content)
700
+ if not messages and "text" in final_result:
701
+ messages.append(final_result["text"])
473
702
 
474
- # Build result - prefer conversation_id from final result, fallback to session_id from events
475
- effective_conversation_id = conversation_id or session_id
703
+ # Build result
476
704
  result = {
477
- "conversationId": effective_conversation_id,
478
- "messages": agent_messages,
479
- "output": "\n".join(agent_messages) if agent_messages else "",
480
- "usage": token_usage, # Token usage for cost tracking
705
+ "conversationId": collector.conversation_id,
706
+ "messages": messages,
707
+ "output": "\n".join(messages) if messages else "",
708
+ "usage": collector.token_usage,
481
709
  }
482
710
 
483
711
  # Log detailed debug info if we didn't capture any messages
484
- if not agent_messages:
712
+ if not messages:
713
+ debug_info = collector.get_debug_info()
485
714
  event_types = [e.get("method") or f"id:{e.get('id')}" for e in all_events[:20]]
486
715
  logger.warning(
487
716
  f"MCP call returned no messages. "
488
- f"conversation_id={effective_conversation_id}, "
489
- f"session_id={session_id}, "
717
+ f"conversation_id={collector.conversation_id}, "
490
718
  f"event_count={len(all_events)}, "
491
719
  f"event_types={event_types}, "
720
+ f"collector_debug={debug_info}, "
492
721
  f"final_result_keys={list(final_result.keys()) if final_result else 'None'}"
493
722
  )
494
723
  # Log codex/event details for debugging
@@ -498,7 +727,7 @@ class MCPClient:
498
727
  msg = ce.get("params", {}).get("msg", {})
499
728
  logger.debug(f" codex/event: type={msg.get('type')}, keys={list(msg.keys())}")
500
729
 
501
- logger.debug(f"MCP call complete: {len(agent_messages)} messages, conversation_id={effective_conversation_id}")
730
+ logger.debug(f"MCP call complete: {len(messages)} messages, conversation_id={collector.conversation_id}")
502
731
  return result
503
732
 
504
733
  def close(self) -> None:
@@ -530,11 +759,22 @@ class CodexMCPAdapter(ExecutorAdapter):
530
759
  This is the recommended way to have iterative conversations with Codex.
531
760
  The MCP client uses subprocess.Popen (not asyncio) so it persists across
532
761
  multiple asyncio.run() calls, preserving conversation state.
762
+
763
+ Config isolation: Pass config_path to use a local codex.toml instead of
764
+ the user's global ~/.codex/config.toml. This is the preferred approach.
765
+ Falls back to config_overrides if no config_path is provided.
533
766
  """
534
767
  DEFAULT_MODEL = "gpt-5.1-codex-mini" # Default codex model
535
768
 
536
- def __init__(self, model: str | None = None):
769
+ def __init__(
770
+ self,
771
+ model: str | None = None,
772
+ config_path: Path | None = None,
773
+ config_overrides: dict[str, str] | None = None,
774
+ ):
537
775
  self._model = model or self.DEFAULT_MODEL
776
+ self._config_path = config_path # Path to local codex.toml for isolation
777
+ self._config_overrides = config_overrides or {}
538
778
  self._mcp_client: MCPClient | None = None
539
779
  self._sessions: dict[str, str] = {} # session_id -> conversationId
540
780
  # Cumulative token usage for cost tracking
@@ -561,7 +801,10 @@ class CodexMCPAdapter(ExecutorAdapter):
561
801
  def _ensure_client(self) -> MCPClient:
562
802
  """Ensure MCP client is running and return it."""
563
803
  if self._mcp_client is None:
564
- self._mcp_client = MCPClient()
804
+ self._mcp_client = MCPClient(
805
+ config_path=self._config_path,
806
+ config_overrides=self._config_overrides,
807
+ )
565
808
 
566
809
  if not self._mcp_client.is_alive:
567
810
  self._mcp_client.start()
@@ -575,6 +818,7 @@ class CodexMCPAdapter(ExecutorAdapter):
575
818
  cwd: str,
576
819
  sandbox: str,
577
820
  model: str | None = None,
821
+ reasoning_effort: str | None = None,
578
822
  ) -> dict[str, Any]:
579
823
  """
580
824
  Call codex MCP tool - traced by Weave.
@@ -592,7 +836,13 @@ class CodexMCPAdapter(ExecutorAdapter):
592
836
  if model:
593
837
  args["model"] = model
594
838
 
595
- logger.info(f"Calling codex with task_len={len(task)}, cwd={cwd}, model={model or 'default'}")
839
+ # Pass reasoning_effort to override codex config defaults
840
+ # The config key is "model_reasoning_effort"
841
+ if reasoning_effort:
842
+ args["model_reasoning_effort"] = reasoning_effort
843
+
844
+ logger.info(f"Calling codex with task_len={len(task)}, cwd={cwd}, model={model or 'default'}, reasoning_effort={reasoning_effort or 'default'}")
845
+ logger.debug(f"Full codex args: {args}")
596
846
 
597
847
  result = client.call_tool("codex", args)
598
848
 
@@ -668,12 +918,55 @@ class CodexMCPAdapter(ExecutorAdapter):
668
918
  usage = result.get("usage", {})
669
919
  self._accumulate_usage(usage)
670
920
 
671
- response = self._extract_response(result)
921
+ # Filter out the sent message from the response using content hashing
922
+ # The MCP may echo our prompt back, but we use robust content comparison
923
+ raw_messages = result.get("messages", [])
924
+
925
+ # Create hash of user message for comparison (normalized)
926
+ def normalize_for_comparison(text: str) -> str:
927
+ """Normalize text for comparison (lowercase, collapsed whitespace)."""
928
+ return " ".join(text.lower().split())
929
+
930
+ user_msg_normalized = normalize_for_comparison(message)
931
+ user_msg_hash = hashlib.md5(user_msg_normalized.encode()).hexdigest()
932
+
933
+ def is_user_message_echo(text: str) -> bool:
934
+ """Check if text is just an echo of the user message."""
935
+ if not text:
936
+ return True # Empty is effectively an echo (skip it)
937
+
938
+ text_normalized = normalize_for_comparison(text)
939
+ text_hash = hashlib.md5(text_normalized.encode()).hexdigest()
940
+
941
+ # Exact match (case-insensitive, whitespace-normalized)
942
+ if text_hash == user_msg_hash:
943
+ return True
944
+
945
+ # Check if text IS the user message (not just starts with it)
946
+ # This avoids the bug where "Fix bug by X" gets filtered when user said "Fix bug"
947
+ if text_normalized == user_msg_normalized:
948
+ return True
949
+
950
+ return False
951
+
952
+ filtered_messages = [m for m in raw_messages if not is_user_message_echo(m)]
953
+
954
+ # Build filtered result for extraction
955
+ filtered_result = {
956
+ **result,
957
+ "messages": filtered_messages,
958
+ "output": "\n".join(filtered_messages) if filtered_messages else result.get("output", ""),
959
+ }
960
+
961
+ response = self._extract_response(filtered_result)
962
+ filtered_count = len(raw_messages) - len(filtered_messages)
963
+ if filtered_count > 0:
964
+ logger.debug(f"Filtered {filtered_count} user echo messages from response")
672
965
  logger.debug(f"codex-reply response length: {len(response)} chars")
673
966
 
674
967
  return {
675
968
  "response": response,
676
- "raw_messages": result.get("messages", []),
969
+ "raw_messages": filtered_messages, # Return filtered messages
677
970
  "usage": usage,
678
971
  "total_usage": self.total_usage,
679
972
  "conversation_lost": not result.get("messages") and not result.get("output"),
@@ -706,6 +999,7 @@ class CodexMCPAdapter(ExecutorAdapter):
706
999
  cwd=str(working_dir.absolute()),
707
1000
  sandbox=sandbox,
708
1001
  model=effective_model,
1002
+ reasoning_effort=kwargs.get("reasoning_effort"),
709
1003
  )
710
1004
 
711
1005
  # Extract conversation ID and response