zwarm 1.3.11__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zwarm/adapters/codex_mcp.py +475 -227
- zwarm/cli/main.py +483 -143
- zwarm/core/config.py +2 -0
- zwarm/orchestrator.py +41 -2
- zwarm/prompts/orchestrator.py +29 -13
- zwarm/sessions/__init__.py +2 -0
- zwarm/sessions/manager.py +87 -8
- zwarm/tools/delegation.py +356 -324
- zwarm-2.0.0.dist-info/METADATA +309 -0
- {zwarm-1.3.11.dist-info → zwarm-2.0.0.dist-info}/RECORD +12 -12
- zwarm-1.3.11.dist-info/METADATA +0 -525
- {zwarm-1.3.11.dist-info → zwarm-2.0.0.dist-info}/WHEEL +0 -0
- {zwarm-1.3.11.dist-info → zwarm-2.0.0.dist-info}/entry_points.txt +0 -0
zwarm/adapters/codex_mcp.py
CHANGED
|
@@ -8,12 +8,14 @@ Uses codex mcp-server for true iterative conversations:
|
|
|
8
8
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
|
+
import hashlib
|
|
11
12
|
import json
|
|
12
13
|
import logging
|
|
13
14
|
import queue
|
|
14
15
|
import subprocess
|
|
15
16
|
import threading
|
|
16
17
|
import time
|
|
18
|
+
from dataclasses import dataclass, field
|
|
17
19
|
from pathlib import Path
|
|
18
20
|
from typing import Any, Literal
|
|
19
21
|
|
|
@@ -30,6 +32,344 @@ from zwarm.core.models import (
|
|
|
30
32
|
logger = logging.getLogger(__name__)
|
|
31
33
|
|
|
32
34
|
|
|
35
|
+
# =============================================================================
|
|
36
|
+
# MessageCollector: Robust event collection with deduplication
|
|
37
|
+
# =============================================================================
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
|
|
41
|
+
class MessageSegment:
|
|
42
|
+
"""A segment within an assistant turn (for future segment-aware rendering)."""
|
|
43
|
+
id: str
|
|
44
|
+
kind: Literal["assistant_text", "progress", "tool_call", "tool_result", "error"]
|
|
45
|
+
text: str
|
|
46
|
+
status: Literal["open", "closed"] = "open"
|
|
47
|
+
source_event_ids: set[str] = field(default_factory=set)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class MessageCollector:
|
|
51
|
+
"""
|
|
52
|
+
Collects and deduplicates messages from MCP event stream.
|
|
53
|
+
|
|
54
|
+
Solves the transcript rendering bugs by:
|
|
55
|
+
1. Deduplicating events by ID
|
|
56
|
+
2. Using priority-based message selection (item_completed > task_complete > streaming)
|
|
57
|
+
3. Tracking message sources for debugging
|
|
58
|
+
4. Never mixing streaming deltas with finalized messages
|
|
59
|
+
|
|
60
|
+
Priority order (highest to lowest):
|
|
61
|
+
- item_completed with AgentMessage/agent_message → DEFINITIVE
|
|
62
|
+
- task_complete.last_agent_message → FALLBACK ONLY
|
|
63
|
+
- streaming deltas → ONLY IF NO DEFINITIVE SOURCE
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
def __init__(self):
|
|
67
|
+
# Deduplication
|
|
68
|
+
self._seen_event_ids: set[str] = set()
|
|
69
|
+
self._seen_content_hashes: set[str] = set() # Content-based dedup
|
|
70
|
+
|
|
71
|
+
# Message collection (priority-ordered)
|
|
72
|
+
self._definitive_messages: list[str] = [] # From item_completed
|
|
73
|
+
self._fallback_message: str | None = None # From task_complete
|
|
74
|
+
self._streaming_buffer: list[str] = [] # Streaming deltas
|
|
75
|
+
|
|
76
|
+
# Metadata
|
|
77
|
+
self._conversation_id: str | None = None
|
|
78
|
+
self._session_id: str | None = None
|
|
79
|
+
self._token_usage: dict[str, Any] = {}
|
|
80
|
+
self._is_complete: bool = False
|
|
81
|
+
|
|
82
|
+
# Debug tracking
|
|
83
|
+
self._message_sources: list[tuple[str, str]] = [] # (source, text_preview)
|
|
84
|
+
|
|
85
|
+
def _extract_event_id(self, event: dict) -> str | None:
|
|
86
|
+
"""Extract a unique event ID for deduplication."""
|
|
87
|
+
# Try various ID fields that MCP events might have
|
|
88
|
+
for key in ("id", "event_id", "item_id", "message_id"):
|
|
89
|
+
if key in event:
|
|
90
|
+
return str(event[key])
|
|
91
|
+
|
|
92
|
+
# For nested events, try params
|
|
93
|
+
params = event.get("params", {})
|
|
94
|
+
msg = params.get("msg", {})
|
|
95
|
+
for key in ("id", "event_id", "item_id"):
|
|
96
|
+
if key in msg:
|
|
97
|
+
return str(msg[key])
|
|
98
|
+
|
|
99
|
+
return None
|
|
100
|
+
|
|
101
|
+
def _content_hash(self, text: str) -> str:
|
|
102
|
+
"""Create a hash of content for deduplication."""
|
|
103
|
+
# Normalize whitespace for comparison
|
|
104
|
+
normalized = " ".join(text.split())
|
|
105
|
+
return hashlib.md5(normalized.encode()).hexdigest()[:16]
|
|
106
|
+
|
|
107
|
+
def _is_duplicate_content(self, text: str) -> bool:
|
|
108
|
+
"""Check if this content was already collected."""
|
|
109
|
+
if not text or not text.strip():
|
|
110
|
+
return True # Empty is "duplicate" (skip it)
|
|
111
|
+
|
|
112
|
+
content_hash = self._content_hash(text)
|
|
113
|
+
if content_hash in self._seen_content_hashes:
|
|
114
|
+
return True
|
|
115
|
+
|
|
116
|
+
self._seen_content_hashes.add(content_hash)
|
|
117
|
+
return False
|
|
118
|
+
|
|
119
|
+
def _add_definitive_message(self, text: str, source: str) -> None:
|
|
120
|
+
"""Add a definitive message (from item_completed)."""
|
|
121
|
+
if not text or not text.strip():
|
|
122
|
+
return
|
|
123
|
+
|
|
124
|
+
if self._is_duplicate_content(text):
|
|
125
|
+
logger.debug(f"Skipping duplicate message from {source}: {text[:50]}...")
|
|
126
|
+
return
|
|
127
|
+
|
|
128
|
+
self._definitive_messages.append(text)
|
|
129
|
+
self._message_sources.append((source, text[:50]))
|
|
130
|
+
logger.debug(f"Added definitive message from {source}: {text[:50]}...")
|
|
131
|
+
|
|
132
|
+
def _set_fallback_message(self, text: str, source: str) -> None:
|
|
133
|
+
"""Set fallback message (from task_complete). Only used if no definitive."""
|
|
134
|
+
if not text or not text.strip():
|
|
135
|
+
return
|
|
136
|
+
|
|
137
|
+
# Only set if we don't have definitive messages
|
|
138
|
+
if self._definitive_messages:
|
|
139
|
+
logger.debug(f"Ignoring fallback from {source}: have definitive messages")
|
|
140
|
+
return
|
|
141
|
+
|
|
142
|
+
if self._is_duplicate_content(text):
|
|
143
|
+
logger.debug(f"Skipping duplicate fallback from {source}")
|
|
144
|
+
return
|
|
145
|
+
|
|
146
|
+
self._fallback_message = text
|
|
147
|
+
self._message_sources.append((source, text[:50]))
|
|
148
|
+
|
|
149
|
+
def _add_streaming_delta(self, text: str) -> None:
|
|
150
|
+
"""Add streaming delta. Only used if no definitive messages at end."""
|
|
151
|
+
if text:
|
|
152
|
+
self._streaming_buffer.append(text)
|
|
153
|
+
|
|
154
|
+
def process_event(self, event: dict) -> bool:
|
|
155
|
+
"""
|
|
156
|
+
Process a single MCP event.
|
|
157
|
+
|
|
158
|
+
Returns True if processing should continue, False if complete.
|
|
159
|
+
"""
|
|
160
|
+
# 1. Check for event ID and dedupe
|
|
161
|
+
event_id = self._extract_event_id(event)
|
|
162
|
+
if event_id and event_id in self._seen_event_ids:
|
|
163
|
+
logger.debug(f"Skipping duplicate event: {event_id}")
|
|
164
|
+
return True
|
|
165
|
+
if event_id:
|
|
166
|
+
self._seen_event_ids.add(event_id)
|
|
167
|
+
|
|
168
|
+
# 2. Handle codex/event notifications
|
|
169
|
+
if event.get("method") == "codex/event":
|
|
170
|
+
params = event.get("params", {})
|
|
171
|
+
msg = params.get("msg", {})
|
|
172
|
+
msg_type = msg.get("type")
|
|
173
|
+
|
|
174
|
+
self._handle_codex_event(msg, msg_type)
|
|
175
|
+
|
|
176
|
+
# Check for completion events
|
|
177
|
+
if msg_type in ("task_complete", "task_completed"):
|
|
178
|
+
self._is_complete = True
|
|
179
|
+
return False
|
|
180
|
+
|
|
181
|
+
return True
|
|
182
|
+
|
|
183
|
+
def _handle_codex_event(self, msg: dict, msg_type: str | None) -> None:
|
|
184
|
+
"""Handle a codex/event notification."""
|
|
185
|
+
if not msg_type:
|
|
186
|
+
return
|
|
187
|
+
|
|
188
|
+
# Session configuration
|
|
189
|
+
if msg_type == "session_configured":
|
|
190
|
+
self._session_id = msg.get("session_id")
|
|
191
|
+
logger.debug(f"Session configured: {self._session_id}")
|
|
192
|
+
|
|
193
|
+
# Item completed - DEFINITIVE SOURCE
|
|
194
|
+
elif msg_type == "item_completed":
|
|
195
|
+
self._handle_item_completed(msg)
|
|
196
|
+
|
|
197
|
+
# Direct agent message - DEFINITIVE SOURCE
|
|
198
|
+
elif msg_type == "agent_message":
|
|
199
|
+
text = msg.get("message", "") or msg.get("text", "") or msg.get("content", "")
|
|
200
|
+
self._add_definitive_message(text, "agent_message_event")
|
|
201
|
+
|
|
202
|
+
# Task complete - FALLBACK SOURCE
|
|
203
|
+
elif msg_type in ("task_complete", "task_completed"):
|
|
204
|
+
last_msg = msg.get("last_agent_message")
|
|
205
|
+
if last_msg:
|
|
206
|
+
self._set_fallback_message(last_msg, "task_complete")
|
|
207
|
+
|
|
208
|
+
# Token usage
|
|
209
|
+
elif msg_type == "token_count":
|
|
210
|
+
info = msg.get("info") or {}
|
|
211
|
+
if info:
|
|
212
|
+
usage = info.get("total_token_usage", {})
|
|
213
|
+
if usage:
|
|
214
|
+
self._token_usage = {
|
|
215
|
+
"input_tokens": usage.get("input_tokens", 0),
|
|
216
|
+
"output_tokens": usage.get("output_tokens", 0),
|
|
217
|
+
"cached_input_tokens": usage.get("cached_input_tokens", 0),
|
|
218
|
+
"reasoning_tokens": usage.get("reasoning_output_tokens", 0),
|
|
219
|
+
"total_tokens": usage.get("total_tokens", 0),
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
# Streaming deltas - LOWEST PRIORITY
|
|
223
|
+
elif msg_type in ("text_delta", "content_block_delta", "message_delta", "text"):
|
|
224
|
+
delta = msg.get("delta", {})
|
|
225
|
+
text = delta.get("text", "") or msg.get("text", "")
|
|
226
|
+
self._add_streaming_delta(text)
|
|
227
|
+
|
|
228
|
+
# Response event - MEDIUM PRIORITY (treat as definitive)
|
|
229
|
+
elif msg_type == "response":
|
|
230
|
+
text = msg.get("response", "") or msg.get("text", "")
|
|
231
|
+
self._add_definitive_message(text, "response_event")
|
|
232
|
+
|
|
233
|
+
# Message event - check role
|
|
234
|
+
elif msg_type == "message":
|
|
235
|
+
role = msg.get("role", "").lower()
|
|
236
|
+
if role in ("assistant", "agent", ""):
|
|
237
|
+
text = msg.get("text", "") or msg.get("content", "")
|
|
238
|
+
if text and role != "user":
|
|
239
|
+
self._add_definitive_message(text, "message_event")
|
|
240
|
+
|
|
241
|
+
# Output event
|
|
242
|
+
elif msg_type == "output":
|
|
243
|
+
text = msg.get("output", "") or msg.get("text", "") or msg.get("content", "")
|
|
244
|
+
self._add_definitive_message(text, "output_event")
|
|
245
|
+
|
|
246
|
+
# Completion variants
|
|
247
|
+
elif msg_type in ("item.completed", "response.completed"):
|
|
248
|
+
item = msg.get("item", {})
|
|
249
|
+
if item.get("type") == "agent_message":
|
|
250
|
+
text = item.get("text", "")
|
|
251
|
+
self._add_definitive_message(text, f"{msg_type}_event")
|
|
252
|
+
elif "text" in msg:
|
|
253
|
+
self._add_definitive_message(msg["text"], f"{msg_type}_direct")
|
|
254
|
+
|
|
255
|
+
# Error
|
|
256
|
+
elif msg_type == "error":
|
|
257
|
+
error_msg = msg.get("error", msg.get("message", str(msg)))
|
|
258
|
+
raise RuntimeError(f"Codex error: {error_msg}")
|
|
259
|
+
|
|
260
|
+
def _handle_item_completed(self, msg: dict) -> None:
|
|
261
|
+
"""Handle item_completed event - the primary source of messages."""
|
|
262
|
+
item = msg.get("item", {})
|
|
263
|
+
item_type = item.get("type")
|
|
264
|
+
|
|
265
|
+
# AgentMessage - primary format
|
|
266
|
+
if item_type == "AgentMessage":
|
|
267
|
+
content = item.get("content", [])
|
|
268
|
+
for block in content:
|
|
269
|
+
if isinstance(block, dict) and block.get("text"):
|
|
270
|
+
self._add_definitive_message(block["text"], "AgentMessage")
|
|
271
|
+
elif isinstance(block, str):
|
|
272
|
+
self._add_definitive_message(block, "AgentMessage_str")
|
|
273
|
+
|
|
274
|
+
# agent_message - variant spelling
|
|
275
|
+
elif item_type == "agent_message":
|
|
276
|
+
text = item.get("text", "") or item.get("message", "")
|
|
277
|
+
if text:
|
|
278
|
+
self._add_definitive_message(text, "agent_message")
|
|
279
|
+
content = item.get("content", [])
|
|
280
|
+
for block in content:
|
|
281
|
+
if isinstance(block, dict) and block.get("text"):
|
|
282
|
+
self._add_definitive_message(block["text"], "agent_message_content")
|
|
283
|
+
elif isinstance(block, str):
|
|
284
|
+
self._add_definitive_message(block, "agent_message_content_str")
|
|
285
|
+
|
|
286
|
+
# Generic message with assistant role
|
|
287
|
+
elif item_type == "message":
|
|
288
|
+
role = item.get("role", "")
|
|
289
|
+
if role == "assistant":
|
|
290
|
+
content = item.get("content", [])
|
|
291
|
+
for block in content:
|
|
292
|
+
if isinstance(block, dict) and block.get("text"):
|
|
293
|
+
self._add_definitive_message(block["text"], "message_assistant")
|
|
294
|
+
elif isinstance(block, str):
|
|
295
|
+
self._add_definitive_message(block, "message_assistant_str")
|
|
296
|
+
# Also check text field directly
|
|
297
|
+
text = item.get("text", "")
|
|
298
|
+
if text:
|
|
299
|
+
self._add_definitive_message(text, "message_text")
|
|
300
|
+
|
|
301
|
+
# Function call output (for context, truncated)
|
|
302
|
+
elif item_type == "function_call_output":
|
|
303
|
+
output = item.get("output", "")
|
|
304
|
+
if output and len(output) < 1000:
|
|
305
|
+
# Don't add to messages, just log
|
|
306
|
+
logger.debug(f"Tool output: {output[:100]}...")
|
|
307
|
+
|
|
308
|
+
def set_conversation_id(self, conv_id: str | None) -> None:
|
|
309
|
+
"""Set conversation ID from final result."""
|
|
310
|
+
if conv_id:
|
|
311
|
+
self._conversation_id = conv_id
|
|
312
|
+
|
|
313
|
+
@property
|
|
314
|
+
def conversation_id(self) -> str | None:
|
|
315
|
+
"""Get the conversation ID."""
|
|
316
|
+
return self._conversation_id or self._session_id
|
|
317
|
+
|
|
318
|
+
@property
|
|
319
|
+
def token_usage(self) -> dict[str, Any]:
|
|
320
|
+
"""Get token usage stats."""
|
|
321
|
+
return self._token_usage
|
|
322
|
+
|
|
323
|
+
@property
|
|
324
|
+
def is_complete(self) -> bool:
|
|
325
|
+
"""Check if collection is complete."""
|
|
326
|
+
return self._is_complete
|
|
327
|
+
|
|
328
|
+
def get_messages(self) -> list[str]:
|
|
329
|
+
"""
|
|
330
|
+
Get the final deduplicated message list.
|
|
331
|
+
|
|
332
|
+
Priority:
|
|
333
|
+
1. Definitive messages (from item_completed)
|
|
334
|
+
2. Fallback message (from task_complete)
|
|
335
|
+
3. Streaming buffer (only if no definitive or fallback)
|
|
336
|
+
"""
|
|
337
|
+
# Prefer definitive messages
|
|
338
|
+
if self._definitive_messages:
|
|
339
|
+
logger.debug(f"Returning {len(self._definitive_messages)} definitive messages")
|
|
340
|
+
return self._definitive_messages
|
|
341
|
+
|
|
342
|
+
# Fall back to task_complete message
|
|
343
|
+
if self._fallback_message:
|
|
344
|
+
logger.debug("Returning fallback message from task_complete")
|
|
345
|
+
return [self._fallback_message]
|
|
346
|
+
|
|
347
|
+
# Last resort: streaming buffer
|
|
348
|
+
if self._streaming_buffer:
|
|
349
|
+
full_text = "".join(self._streaming_buffer)
|
|
350
|
+
if full_text.strip():
|
|
351
|
+
logger.debug(f"Returning streaming buffer ({len(self._streaming_buffer)} chunks)")
|
|
352
|
+
return [full_text]
|
|
353
|
+
|
|
354
|
+
return []
|
|
355
|
+
|
|
356
|
+
def get_response(self) -> str:
|
|
357
|
+
"""Get the final response as a single string."""
|
|
358
|
+
messages = self.get_messages()
|
|
359
|
+
return "\n".join(messages) if messages else ""
|
|
360
|
+
|
|
361
|
+
def get_debug_info(self) -> dict:
|
|
362
|
+
"""Get debug information about message collection."""
|
|
363
|
+
return {
|
|
364
|
+
"seen_event_ids": len(self._seen_event_ids),
|
|
365
|
+
"seen_content_hashes": len(self._seen_content_hashes),
|
|
366
|
+
"definitive_messages": len(self._definitive_messages),
|
|
367
|
+
"has_fallback": self._fallback_message is not None,
|
|
368
|
+
"streaming_chunks": len(self._streaming_buffer),
|
|
369
|
+
"message_sources": self._message_sources,
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
|
|
33
373
|
class MCPClient:
|
|
34
374
|
"""
|
|
35
375
|
Robust MCP client for communicating with codex mcp-server.
|
|
@@ -42,7 +382,18 @@ class MCPClient:
|
|
|
42
382
|
of spawning new reader threads on timeout.
|
|
43
383
|
"""
|
|
44
384
|
|
|
45
|
-
|
|
385
|
+
# Default config overrides for zwarm-managed codex sessions
|
|
386
|
+
# These override ~/.codex/config.toml to ensure consistent behavior
|
|
387
|
+
# Only used as fallback if no config_path is provided
|
|
388
|
+
DEFAULT_CONFIG_OVERRIDES: dict[str, str] = {
|
|
389
|
+
"model_reasoning_effort": "high", # Use 'high' for compatibility with all models
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
def __init__(
|
|
393
|
+
self,
|
|
394
|
+
config_path: Path | None = None,
|
|
395
|
+
config_overrides: dict[str, str] | None = None,
|
|
396
|
+
):
|
|
46
397
|
self._proc: subprocess.Popen | None = None
|
|
47
398
|
self._proc_pid: int | None = None # Track PID to detect restarts
|
|
48
399
|
self._request_id = 0
|
|
@@ -53,6 +404,10 @@ class MCPClient:
|
|
|
53
404
|
self._stdout_queue: queue.Queue[str | None] = queue.Queue()
|
|
54
405
|
self._lock = threading.Lock() # Protect writes only
|
|
55
406
|
self._start_count = 0 # Track how many times we've started
|
|
407
|
+
# Config path for full isolation (preferred)
|
|
408
|
+
self._config_path = config_path
|
|
409
|
+
# Fallback: merge default overrides with any custom ones (used if no config_path)
|
|
410
|
+
self._config_overrides = {**self.DEFAULT_CONFIG_OVERRIDES, **(config_overrides or {})}
|
|
56
411
|
|
|
57
412
|
def start(self) -> None:
|
|
58
413
|
"""Start the MCP server process."""
|
|
@@ -69,9 +424,19 @@ class MCPClient:
|
|
|
69
424
|
)
|
|
70
425
|
|
|
71
426
|
self._start_count += 1
|
|
72
|
-
|
|
427
|
+
|
|
428
|
+
# Build command - prefer config file for full isolation, fallback to overrides
|
|
429
|
+
cmd = ["codex", "mcp-server"]
|
|
430
|
+
if self._config_path and self._config_path.exists():
|
|
431
|
+
cmd.extend(["--config", str(self._config_path)])
|
|
432
|
+
logger.info(f"Starting codex mcp-server with config: {self._config_path} (start_count={self._start_count})")
|
|
433
|
+
else:
|
|
434
|
+
# Fallback to individual overrides
|
|
435
|
+
for key, value in self._config_overrides.items():
|
|
436
|
+
cmd.extend(["-c", f'{key}="{value}"'])
|
|
437
|
+
logger.info(f"Starting codex mcp-server with overrides: {self._config_overrides} (start_count={self._start_count})")
|
|
73
438
|
self._proc = subprocess.Popen(
|
|
74
|
-
|
|
439
|
+
cmd,
|
|
75
440
|
stdin=subprocess.PIPE,
|
|
76
441
|
stdout=subprocess.PIPE,
|
|
77
442
|
stderr=subprocess.PIPE,
|
|
@@ -238,6 +603,12 @@ class MCPClient:
|
|
|
238
603
|
"""
|
|
239
604
|
Call an MCP tool and collect streaming events.
|
|
240
605
|
|
|
606
|
+
Uses MessageCollector for robust deduplication and priority-based
|
|
607
|
+
message selection. This prevents the transcript rendering bugs:
|
|
608
|
+
- Message duplication
|
|
609
|
+
- Role contamination
|
|
610
|
+
- Turn mis-association
|
|
611
|
+
|
|
241
612
|
Args:
|
|
242
613
|
name: Tool name (codex, codex-reply)
|
|
243
614
|
arguments: Tool arguments
|
|
@@ -260,14 +631,9 @@ class MCPClient:
|
|
|
260
631
|
with self._lock:
|
|
261
632
|
self._write(json.dumps(request) + "\n")
|
|
262
633
|
|
|
263
|
-
#
|
|
264
|
-
|
|
265
|
-
session_id = None
|
|
266
|
-
conversation_id = None # Track conversation ID separately
|
|
267
|
-
agent_messages: list[str] = []
|
|
268
|
-
streaming_text: list[str] = [] # Accumulate streaming delta text
|
|
634
|
+
# Use MessageCollector for robust event handling
|
|
635
|
+
collector = MessageCollector()
|
|
269
636
|
final_result = None
|
|
270
|
-
token_usage: dict[str, Any] = {} # Track token usage
|
|
271
637
|
start_time = time.time()
|
|
272
638
|
all_events: list[dict] = [] # Keep ALL events for debugging
|
|
273
639
|
|
|
@@ -280,13 +646,10 @@ class MCPClient:
|
|
|
280
646
|
raise RuntimeError(f"MCP call timed out after {timeout}s ({event_count} events received)")
|
|
281
647
|
|
|
282
648
|
# Read from queue with per-event timeout
|
|
283
|
-
# Empty string = timeout (process still alive, just waiting)
|
|
284
|
-
# None sentinel is handled inside _read_line (raises RuntimeError)
|
|
285
649
|
line = self._read_line(timeout=30.0)
|
|
286
650
|
|
|
287
651
|
if not line:
|
|
288
652
|
# Timeout waiting for event - process is still alive, just slow
|
|
289
|
-
# This is normal during long codex operations
|
|
290
653
|
logger.debug(f"Waiting for MCP event... (elapsed: {elapsed:.0f}s, events: {event_count})")
|
|
291
654
|
continue
|
|
292
655
|
|
|
@@ -303,238 +666,58 @@ class MCPClient:
|
|
|
303
666
|
final_result = event["result"]
|
|
304
667
|
# Extract conversation ID from final result
|
|
305
668
|
if isinstance(final_result, dict):
|
|
306
|
-
|
|
307
|
-
|
|
669
|
+
conv_id = final_result.get("conversationId") or final_result.get("conversation_id")
|
|
670
|
+
collector.set_conversation_id(conv_id)
|
|
671
|
+
logger.debug(f"Got final result after {event_count} events")
|
|
308
672
|
break
|
|
309
673
|
elif "error" in event:
|
|
310
674
|
error = event["error"]
|
|
311
675
|
raise RuntimeError(f"MCP tool error: {error.get('message', error)}")
|
|
312
676
|
|
|
313
|
-
# Process
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
# Log ALL event types to help debug missing messages
|
|
320
|
-
logger.debug(f"MCP event: type={msg_type}, keys={list(msg.keys())}")
|
|
321
|
-
|
|
322
|
-
if msg_type == "session_configured":
|
|
323
|
-
session_id = msg.get("session_id")
|
|
324
|
-
logger.debug(f"Session configured: {session_id}")
|
|
325
|
-
|
|
326
|
-
elif msg_type == "item_completed":
|
|
327
|
-
item = msg.get("item", {})
|
|
328
|
-
item_type = item.get("type")
|
|
329
|
-
|
|
330
|
-
# Log ALL item_completed events to help debug
|
|
331
|
-
logger.debug(f"item_completed: type={item_type}, keys={list(item.keys())}")
|
|
332
|
-
|
|
333
|
-
# Agent text responses - codex uses "AgentMessage" type
|
|
334
|
-
if item_type == "AgentMessage":
|
|
335
|
-
content = item.get("content", [])
|
|
336
|
-
for block in content:
|
|
337
|
-
if isinstance(block, dict) and block.get("text"):
|
|
338
|
-
agent_messages.append(block["text"])
|
|
339
|
-
elif isinstance(block, str):
|
|
340
|
-
agent_messages.append(block)
|
|
341
|
-
|
|
342
|
-
# Also check for "agent_message" (lowercase) variant
|
|
343
|
-
elif item_type == "agent_message":
|
|
344
|
-
text = item.get("text", "") or item.get("message", "")
|
|
345
|
-
if text:
|
|
346
|
-
agent_messages.append(text)
|
|
347
|
-
# Also check content array
|
|
348
|
-
content = item.get("content", [])
|
|
349
|
-
for block in content:
|
|
350
|
-
if isinstance(block, dict) and block.get("text"):
|
|
351
|
-
agent_messages.append(block["text"])
|
|
352
|
-
elif isinstance(block, str):
|
|
353
|
-
agent_messages.append(block)
|
|
354
|
-
|
|
355
|
-
# Legacy format check
|
|
356
|
-
elif item_type == "message" and item.get("role") == "assistant":
|
|
357
|
-
content = item.get("content", [])
|
|
358
|
-
for block in content:
|
|
359
|
-
if isinstance(block, dict) and block.get("text"):
|
|
360
|
-
agent_messages.append(block["text"])
|
|
361
|
-
elif isinstance(block, str):
|
|
362
|
-
agent_messages.append(block)
|
|
363
|
-
|
|
364
|
-
# Generic message type - check for text/content
|
|
365
|
-
elif item_type == "message":
|
|
366
|
-
text = item.get("text", "")
|
|
367
|
-
if text:
|
|
368
|
-
agent_messages.append(text)
|
|
369
|
-
content = item.get("content", [])
|
|
370
|
-
if isinstance(content, str):
|
|
371
|
-
agent_messages.append(content)
|
|
372
|
-
elif isinstance(content, list):
|
|
373
|
-
for block in content:
|
|
374
|
-
if isinstance(block, dict) and block.get("text"):
|
|
375
|
-
agent_messages.append(block["text"])
|
|
376
|
-
elif isinstance(block, str):
|
|
377
|
-
agent_messages.append(block)
|
|
378
|
-
|
|
379
|
-
# Function call outputs (for context)
|
|
380
|
-
elif item_type == "function_call_output":
|
|
381
|
-
output = item.get("output", "")
|
|
382
|
-
if output and len(output) < 1000:
|
|
383
|
-
agent_messages.append(f"[Tool output]: {output[:500]}")
|
|
384
|
-
|
|
385
|
-
# Log other item types we're not handling
|
|
386
|
-
elif item_type not in ("function_call", "tool_call", "UserMessage", "user_message"):
|
|
387
|
-
logger.debug(f"Unhandled item_completed type: {item_type}, item={item}")
|
|
388
|
-
|
|
389
|
-
elif msg_type == "agent_message":
|
|
390
|
-
# Direct agent message event
|
|
391
|
-
message = msg.get("message", "")
|
|
392
|
-
if message:
|
|
393
|
-
agent_messages.append(message)
|
|
394
|
-
|
|
395
|
-
elif msg_type in ("task_complete", "task_completed"):
|
|
396
|
-
# Task is done - capture last_agent_message as fallback
|
|
397
|
-
last_msg = msg.get("last_agent_message")
|
|
398
|
-
if last_msg and last_msg not in agent_messages:
|
|
399
|
-
agent_messages.append(last_msg)
|
|
400
|
-
logger.debug(f"Task complete after {event_count} events")
|
|
677
|
+
# Process event through collector
|
|
678
|
+
try:
|
|
679
|
+
should_continue = collector.process_event(event)
|
|
680
|
+
if not should_continue:
|
|
681
|
+
logger.debug(f"Collector signaled completion after {event_count} events")
|
|
401
682
|
break
|
|
683
|
+
except RuntimeError as e:
|
|
684
|
+
# Collector raises RuntimeError for codex errors
|
|
685
|
+
raise
|
|
402
686
|
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
if info:
|
|
407
|
-
usage = info.get("total_token_usage", {})
|
|
408
|
-
if usage:
|
|
409
|
-
token_usage = {
|
|
410
|
-
"input_tokens": usage.get("input_tokens", 0),
|
|
411
|
-
"output_tokens": usage.get("output_tokens", 0),
|
|
412
|
-
"cached_input_tokens": usage.get("cached_input_tokens", 0),
|
|
413
|
-
"reasoning_tokens": usage.get("reasoning_output_tokens", 0),
|
|
414
|
-
"total_tokens": usage.get("total_tokens", 0),
|
|
415
|
-
}
|
|
416
|
-
logger.debug(f"Token usage: {token_usage}")
|
|
417
|
-
|
|
418
|
-
elif msg_type == "error":
|
|
419
|
-
error_msg = msg.get("error", msg.get("message", str(msg)))
|
|
420
|
-
raise RuntimeError(f"Codex error: {error_msg}")
|
|
421
|
-
|
|
422
|
-
# Handle streaming text events (various formats)
|
|
423
|
-
elif msg_type in ("text_delta", "content_block_delta", "message_delta"):
|
|
424
|
-
delta = msg.get("delta", {})
|
|
425
|
-
text = delta.get("text", "") or msg.get("text", "")
|
|
426
|
-
if text:
|
|
427
|
-
streaming_text.append(text)
|
|
428
|
-
|
|
429
|
-
elif msg_type == "text":
|
|
430
|
-
text = msg.get("text", "")
|
|
431
|
-
if text:
|
|
432
|
-
streaming_text.append(text)
|
|
433
|
-
|
|
434
|
-
elif msg_type == "response":
|
|
435
|
-
# Some versions send the full response this way
|
|
436
|
-
response_text = msg.get("response", "") or msg.get("text", "")
|
|
437
|
-
if response_text:
|
|
438
|
-
agent_messages.append(response_text)
|
|
439
|
-
|
|
440
|
-
elif msg_type == "message":
|
|
441
|
-
# Direct message event
|
|
442
|
-
text = msg.get("text", "") or msg.get("content", "")
|
|
443
|
-
if text:
|
|
444
|
-
agent_messages.append(text)
|
|
445
|
-
|
|
446
|
-
elif msg_type == "agent_message":
|
|
447
|
-
# Agent message output (common in newer codex versions)
|
|
448
|
-
text = msg.get("text", "") or msg.get("content", "") or msg.get("message", "")
|
|
449
|
-
if text:
|
|
450
|
-
agent_messages.append(text)
|
|
451
|
-
|
|
452
|
-
elif msg_type == "output":
|
|
453
|
-
# Direct output event
|
|
454
|
-
text = msg.get("output", "") or msg.get("text", "") or msg.get("content", "")
|
|
455
|
-
if text:
|
|
456
|
-
agent_messages.append(text)
|
|
457
|
-
|
|
458
|
-
elif msg_type in ("item.completed", "response.completed"):
|
|
459
|
-
# Completion events may contain the final response
|
|
460
|
-
item = msg.get("item", {})
|
|
461
|
-
if item.get("type") == "agent_message":
|
|
462
|
-
text = item.get("text", "")
|
|
463
|
-
if text:
|
|
464
|
-
agent_messages.append(text)
|
|
465
|
-
elif "text" in msg:
|
|
466
|
-
agent_messages.append(msg["text"])
|
|
467
|
-
elif "content" in msg:
|
|
468
|
-
content = msg["content"]
|
|
469
|
-
if isinstance(content, str):
|
|
470
|
-
agent_messages.append(content)
|
|
471
|
-
elif isinstance(content, list):
|
|
472
|
-
for block in content:
|
|
473
|
-
if isinstance(block, dict) and block.get("text"):
|
|
474
|
-
agent_messages.append(block["text"])
|
|
475
|
-
|
|
476
|
-
else:
|
|
477
|
-
# Try to extract text from unknown event types as fallback
|
|
478
|
-
extracted = None
|
|
479
|
-
for key in ("text", "content", "message", "output", "response"):
|
|
480
|
-
if key in msg:
|
|
481
|
-
val = msg[key]
|
|
482
|
-
if isinstance(val, str) and val.strip():
|
|
483
|
-
extracted = val
|
|
484
|
-
break
|
|
485
|
-
elif isinstance(val, list):
|
|
486
|
-
texts = [b.get("text", "") if isinstance(b, dict) else str(b) for b in val]
|
|
487
|
-
if any(texts):
|
|
488
|
-
extracted = "\n".join(t for t in texts if t)
|
|
489
|
-
break
|
|
490
|
-
|
|
491
|
-
if extracted:
|
|
492
|
-
agent_messages.append(extracted)
|
|
493
|
-
logger.debug(f"Extracted text from event type '{msg_type}': {len(extracted)} chars")
|
|
494
|
-
elif msg_type and msg_type not in ("session_started", "thinking", "tool_call", "function_call", "reasoning", "function_call_output"):
|
|
495
|
-
logger.debug(f"Unhandled MCP event type: {msg_type}, msg keys: {list(msg.keys())}")
|
|
496
|
-
|
|
497
|
-
# Merge streaming text into messages if we got any
|
|
498
|
-
if streaming_text:
|
|
499
|
-
full_streaming = "".join(streaming_text)
|
|
500
|
-
if full_streaming.strip():
|
|
501
|
-
agent_messages.append(full_streaming)
|
|
502
|
-
logger.debug(f"Captured {len(streaming_text)} streaming chunks ({len(full_streaming)} chars)")
|
|
503
|
-
|
|
504
|
-
# Try to extract content from final_result if we have no messages
|
|
505
|
-
if final_result and not agent_messages:
|
|
687
|
+
# Try to extract content from final_result if collector has no messages
|
|
688
|
+
messages = collector.get_messages()
|
|
689
|
+
if final_result and not messages:
|
|
506
690
|
if "content" in final_result:
|
|
507
691
|
content = final_result["content"]
|
|
508
692
|
if isinstance(content, list):
|
|
509
693
|
for block in content:
|
|
510
694
|
if isinstance(block, dict) and block.get("text"):
|
|
511
|
-
|
|
695
|
+
messages.append(block["text"])
|
|
512
696
|
elif isinstance(block, str):
|
|
513
|
-
|
|
697
|
+
messages.append(block)
|
|
514
698
|
elif isinstance(content, str):
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
agent_messages.append(final_result["text"])
|
|
699
|
+
messages.append(content)
|
|
700
|
+
if not messages and "text" in final_result:
|
|
701
|
+
messages.append(final_result["text"])
|
|
519
702
|
|
|
520
|
-
# Build result
|
|
521
|
-
effective_conversation_id = conversation_id or session_id
|
|
703
|
+
# Build result
|
|
522
704
|
result = {
|
|
523
|
-
"conversationId":
|
|
524
|
-
"messages":
|
|
525
|
-
"output": "\n".join(
|
|
526
|
-
"usage": token_usage,
|
|
705
|
+
"conversationId": collector.conversation_id,
|
|
706
|
+
"messages": messages,
|
|
707
|
+
"output": "\n".join(messages) if messages else "",
|
|
708
|
+
"usage": collector.token_usage,
|
|
527
709
|
}
|
|
528
710
|
|
|
529
711
|
# Log detailed debug info if we didn't capture any messages
|
|
530
|
-
if not
|
|
712
|
+
if not messages:
|
|
713
|
+
debug_info = collector.get_debug_info()
|
|
531
714
|
event_types = [e.get("method") or f"id:{e.get('id')}" for e in all_events[:20]]
|
|
532
715
|
logger.warning(
|
|
533
716
|
f"MCP call returned no messages. "
|
|
534
|
-
f"conversation_id={
|
|
535
|
-
f"session_id={session_id}, "
|
|
717
|
+
f"conversation_id={collector.conversation_id}, "
|
|
536
718
|
f"event_count={len(all_events)}, "
|
|
537
719
|
f"event_types={event_types}, "
|
|
720
|
+
f"collector_debug={debug_info}, "
|
|
538
721
|
f"final_result_keys={list(final_result.keys()) if final_result else 'None'}"
|
|
539
722
|
)
|
|
540
723
|
# Log codex/event details for debugging
|
|
@@ -544,7 +727,7 @@ class MCPClient:
|
|
|
544
727
|
msg = ce.get("params", {}).get("msg", {})
|
|
545
728
|
logger.debug(f" codex/event: type={msg.get('type')}, keys={list(msg.keys())}")
|
|
546
729
|
|
|
547
|
-
logger.debug(f"MCP call complete: {len(
|
|
730
|
+
logger.debug(f"MCP call complete: {len(messages)} messages, conversation_id={collector.conversation_id}")
|
|
548
731
|
return result
|
|
549
732
|
|
|
550
733
|
def close(self) -> None:
|
|
@@ -576,11 +759,22 @@ class CodexMCPAdapter(ExecutorAdapter):
|
|
|
576
759
|
This is the recommended way to have iterative conversations with Codex.
|
|
577
760
|
The MCP client uses subprocess.Popen (not asyncio) so it persists across
|
|
578
761
|
multiple asyncio.run() calls, preserving conversation state.
|
|
762
|
+
|
|
763
|
+
Config isolation: Pass config_path to use a local codex.toml instead of
|
|
764
|
+
the user's global ~/.codex/config.toml. This is the preferred approach.
|
|
765
|
+
Falls back to config_overrides if no config_path is provided.
|
|
579
766
|
"""
|
|
580
767
|
DEFAULT_MODEL = "gpt-5.1-codex-mini" # Default codex model
|
|
581
768
|
|
|
582
|
-
def __init__(
|
|
769
|
+
def __init__(
|
|
770
|
+
self,
|
|
771
|
+
model: str | None = None,
|
|
772
|
+
config_path: Path | None = None,
|
|
773
|
+
config_overrides: dict[str, str] | None = None,
|
|
774
|
+
):
|
|
583
775
|
self._model = model or self.DEFAULT_MODEL
|
|
776
|
+
self._config_path = config_path # Path to local codex.toml for isolation
|
|
777
|
+
self._config_overrides = config_overrides or {}
|
|
584
778
|
self._mcp_client: MCPClient | None = None
|
|
585
779
|
self._sessions: dict[str, str] = {} # session_id -> conversationId
|
|
586
780
|
# Cumulative token usage for cost tracking
|
|
@@ -607,7 +801,10 @@ class CodexMCPAdapter(ExecutorAdapter):
|
|
|
607
801
|
def _ensure_client(self) -> MCPClient:
|
|
608
802
|
"""Ensure MCP client is running and return it."""
|
|
609
803
|
if self._mcp_client is None:
|
|
610
|
-
self._mcp_client = MCPClient(
|
|
804
|
+
self._mcp_client = MCPClient(
|
|
805
|
+
config_path=self._config_path,
|
|
806
|
+
config_overrides=self._config_overrides,
|
|
807
|
+
)
|
|
611
808
|
|
|
612
809
|
if not self._mcp_client.is_alive:
|
|
613
810
|
self._mcp_client.start()
|
|
@@ -621,6 +818,7 @@ class CodexMCPAdapter(ExecutorAdapter):
|
|
|
621
818
|
cwd: str,
|
|
622
819
|
sandbox: str,
|
|
623
820
|
model: str | None = None,
|
|
821
|
+
reasoning_effort: str | None = None,
|
|
624
822
|
) -> dict[str, Any]:
|
|
625
823
|
"""
|
|
626
824
|
Call codex MCP tool - traced by Weave.
|
|
@@ -638,7 +836,13 @@ class CodexMCPAdapter(ExecutorAdapter):
|
|
|
638
836
|
if model:
|
|
639
837
|
args["model"] = model
|
|
640
838
|
|
|
641
|
-
|
|
839
|
+
# Pass reasoning_effort to override codex config defaults
|
|
840
|
+
# The config key is "model_reasoning_effort"
|
|
841
|
+
if reasoning_effort:
|
|
842
|
+
args["model_reasoning_effort"] = reasoning_effort
|
|
843
|
+
|
|
844
|
+
logger.info(f"Calling codex with task_len={len(task)}, cwd={cwd}, model={model or 'default'}, reasoning_effort={reasoning_effort or 'default'}")
|
|
845
|
+
logger.debug(f"Full codex args: {args}")
|
|
642
846
|
|
|
643
847
|
result = client.call_tool("codex", args)
|
|
644
848
|
|
|
@@ -714,12 +918,55 @@ class CodexMCPAdapter(ExecutorAdapter):
|
|
|
714
918
|
usage = result.get("usage", {})
|
|
715
919
|
self._accumulate_usage(usage)
|
|
716
920
|
|
|
717
|
-
response
|
|
921
|
+
# Filter out the sent message from the response using content hashing
|
|
922
|
+
# The MCP may echo our prompt back, but we use robust content comparison
|
|
923
|
+
raw_messages = result.get("messages", [])
|
|
924
|
+
|
|
925
|
+
# Create hash of user message for comparison (normalized)
|
|
926
|
+
def normalize_for_comparison(text: str) -> str:
|
|
927
|
+
"""Normalize text for comparison (lowercase, collapsed whitespace)."""
|
|
928
|
+
return " ".join(text.lower().split())
|
|
929
|
+
|
|
930
|
+
user_msg_normalized = normalize_for_comparison(message)
|
|
931
|
+
user_msg_hash = hashlib.md5(user_msg_normalized.encode()).hexdigest()
|
|
932
|
+
|
|
933
|
+
def is_user_message_echo(text: str) -> bool:
|
|
934
|
+
"""Check if text is just an echo of the user message."""
|
|
935
|
+
if not text:
|
|
936
|
+
return True # Empty is effectively an echo (skip it)
|
|
937
|
+
|
|
938
|
+
text_normalized = normalize_for_comparison(text)
|
|
939
|
+
text_hash = hashlib.md5(text_normalized.encode()).hexdigest()
|
|
940
|
+
|
|
941
|
+
# Exact match (case-insensitive, whitespace-normalized)
|
|
942
|
+
if text_hash == user_msg_hash:
|
|
943
|
+
return True
|
|
944
|
+
|
|
945
|
+
# Check if text IS the user message (not just starts with it)
|
|
946
|
+
# This avoids the bug where "Fix bug by X" gets filtered when user said "Fix bug"
|
|
947
|
+
if text_normalized == user_msg_normalized:
|
|
948
|
+
return True
|
|
949
|
+
|
|
950
|
+
return False
|
|
951
|
+
|
|
952
|
+
filtered_messages = [m for m in raw_messages if not is_user_message_echo(m)]
|
|
953
|
+
|
|
954
|
+
# Build filtered result for extraction
|
|
955
|
+
filtered_result = {
|
|
956
|
+
**result,
|
|
957
|
+
"messages": filtered_messages,
|
|
958
|
+
"output": "\n".join(filtered_messages) if filtered_messages else result.get("output", ""),
|
|
959
|
+
}
|
|
960
|
+
|
|
961
|
+
response = self._extract_response(filtered_result)
|
|
962
|
+
filtered_count = len(raw_messages) - len(filtered_messages)
|
|
963
|
+
if filtered_count > 0:
|
|
964
|
+
logger.debug(f"Filtered {filtered_count} user echo messages from response")
|
|
718
965
|
logger.debug(f"codex-reply response length: {len(response)} chars")
|
|
719
966
|
|
|
720
967
|
return {
|
|
721
968
|
"response": response,
|
|
722
|
-
"raw_messages":
|
|
969
|
+
"raw_messages": filtered_messages, # Return filtered messages
|
|
723
970
|
"usage": usage,
|
|
724
971
|
"total_usage": self.total_usage,
|
|
725
972
|
"conversation_lost": not result.get("messages") and not result.get("output"),
|
|
@@ -752,6 +999,7 @@ class CodexMCPAdapter(ExecutorAdapter):
|
|
|
752
999
|
cwd=str(working_dir.absolute()),
|
|
753
1000
|
sandbox=sandbox,
|
|
754
1001
|
model=effective_model,
|
|
1002
|
+
reasoning_effort=kwargs.get("reasoning_effort"),
|
|
755
1003
|
)
|
|
756
1004
|
|
|
757
1005
|
# Extract conversation ID and response
|