zwarm 1.3.10__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zwarm/adapters/codex_mcp.py +475 -181
- zwarm/cli/main.py +483 -143
- zwarm/core/config.py +2 -0
- zwarm/orchestrator.py +41 -2
- zwarm/prompts/orchestrator.py +29 -13
- zwarm/sessions/__init__.py +2 -0
- zwarm/sessions/manager.py +87 -8
- zwarm/tools/delegation.py +356 -324
- zwarm/watchers/builtin.py +100 -6
- zwarm-2.0.0.dist-info/METADATA +309 -0
- {zwarm-1.3.10.dist-info → zwarm-2.0.0.dist-info}/RECORD +13 -13
- zwarm-1.3.10.dist-info/METADATA +0 -525
- {zwarm-1.3.10.dist-info → zwarm-2.0.0.dist-info}/WHEEL +0 -0
- {zwarm-1.3.10.dist-info → zwarm-2.0.0.dist-info}/entry_points.txt +0 -0
zwarm/adapters/codex_mcp.py
CHANGED
|
@@ -8,12 +8,14 @@ Uses codex mcp-server for true iterative conversations:
|
|
|
8
8
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
|
+
import hashlib
|
|
11
12
|
import json
|
|
12
13
|
import logging
|
|
13
14
|
import queue
|
|
14
15
|
import subprocess
|
|
15
16
|
import threading
|
|
16
17
|
import time
|
|
18
|
+
from dataclasses import dataclass, field
|
|
17
19
|
from pathlib import Path
|
|
18
20
|
from typing import Any, Literal
|
|
19
21
|
|
|
@@ -30,6 +32,344 @@ from zwarm.core.models import (
|
|
|
30
32
|
logger = logging.getLogger(__name__)
|
|
31
33
|
|
|
32
34
|
|
|
35
|
+
# =============================================================================
|
|
36
|
+
# MessageCollector: Robust event collection with deduplication
|
|
37
|
+
# =============================================================================
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class MessageSegment:
    """One segment of an assistant turn (kept for future segment-aware rendering).

    Fields:
        id: Identifier of the segment.
        kind: Category of the segment; one of "assistant_text", "progress",
            "tool_call", "tool_result" or "error".
        text: Text carried by the segment.
        status: "open" or "closed"; new segments start out "open".
        source_event_ids: IDs of the events associated with this segment.
            Each instance gets its own empty set by default.
    """

    id: str
    kind: Literal["assistant_text", "progress", "tool_call", "tool_result", "error"]
    text: str
    status: Literal["open", "closed"] = "open"
    # default_factory gives every segment a fresh set instead of a shared one.
    source_event_ids: set[str] = field(default_factory=set)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class MessageCollector:
    """
    Collects and deduplicates messages from an MCP event stream.

    Events are fed in one at a time through ``process_event``; the final
    transcript is read back with ``get_messages`` / ``get_response``.

    Deduplication happens on two levels:
    1. By event ID (an event whose ID was already seen is skipped).
    2. By a hash of the whitespace-normalized message text.

    Message sources are kept apart and selected by priority
    (highest to lowest):
    - item_completed / agent_message / response / message / output events
      -> DEFINITIVE
    - task_complete.last_agent_message -> FALLBACK ONLY
    - streaming deltas -> ONLY IF NO DEFINITIVE OR FALLBACK SOURCE

    Streaming deltas are never mixed with finalized messages.
    """

    # codex/event types that end the task.
    _COMPLETION_TYPES = ("task_complete", "task_completed")
    # codex/event types that carry incremental text.
    _STREAMING_TYPES = ("text_delta", "content_block_delta", "message_delta", "text")

    def __init__(self):
        # Deduplication
        self._seen_event_ids: set[str] = set()
        self._seen_content_hashes: set[str] = set()  # Content-based dedup

        # Message collection (priority-ordered)
        self._definitive_messages: list[str] = []  # From item_completed & co.
        self._fallback_message: str | None = None  # From task_complete
        self._streaming_buffer: list[str] = []  # Streaming deltas

        # Metadata
        self._conversation_id: str | None = None
        self._session_id: str | None = None
        self._token_usage: dict[str, Any] = {}
        self._is_complete: bool = False

        # Debug tracking
        self._message_sources: list[tuple[str, str]] = []  # (source, text_preview)

    def _extract_event_id(self, event: dict) -> str | None:
        """Return the first usable ID found on the event, or None.

        The top-level event is checked first, then ``params.msg``. A key
        whose value is ``None`` is ignored so that an explicit ``null`` ID
        is not turned into the string "None" and used as a dedup key.
        """
        lookups: list[tuple[dict, tuple[str, ...]]] = [
            (event, ("id", "event_id", "item_id", "message_id")),
        ]
        params = event.get("params")
        msg = params.get("msg") if isinstance(params, dict) else None
        if isinstance(msg, dict):
            lookups.append((msg, ("id", "event_id", "item_id")))

        for container, keys in lookups:
            for key in keys:
                value = container.get(key)
                if value is not None:
                    return str(value)
        return None

    def _content_hash(self, text: str) -> str:
        """Return a short hash of ``text`` with whitespace runs collapsed."""
        normalized = " ".join(text.split())
        return hashlib.md5(normalized.encode()).hexdigest()[:16]

    def _is_duplicate_content(self, text: str) -> bool:
        """Report whether ``text`` was already collected.

        Empty or whitespace-only text counts as a duplicate so callers skip
        it. As a side effect, text seen for the first time is recorded, so a
        second call with the same text returns True.
        """
        if not text or not text.strip():
            return True

        content_hash = self._content_hash(text)
        if content_hash in self._seen_content_hashes:
            return True

        self._seen_content_hashes.add(content_hash)
        return False

    def _add_definitive_message(self, text: str, source: str) -> None:
        """Append a definitive message unless it is empty, non-text or a duplicate."""
        # Payloads come from untyped JSON; anything but a string cannot be
        # normalized or hashed, so it is dropped instead of raising.
        if not isinstance(text, str) or not text.strip():
            return

        if self._is_duplicate_content(text):
            logger.debug("Skipping duplicate message from %s: %s...", source, text[:50])
            return

        self._definitive_messages.append(text)
        self._message_sources.append((source, text[:50]))
        logger.debug("Added definitive message from %s: %s...", source, text[:50])

    def _add_content_blocks(self, content: Any, source: str) -> None:
        """Add every text block found in a ``content`` payload.

        ``content`` may be a list of blocks (dicts with a "text" key, or plain
        strings) or a single string, which is treated as one block rather
        than iterated character by character. Any other value (including
        ``None``) contributes nothing. String blocks are recorded under
        ``f"{source}_str"``, dict blocks under ``source``.
        """
        if isinstance(content, str):
            content = [content]
        elif not isinstance(content, (list, tuple)):
            return

        for block in content:
            if isinstance(block, dict):
                text = block.get("text")
                if text:
                    self._add_definitive_message(text, source)
            elif isinstance(block, str):
                self._add_definitive_message(block, f"{source}_str")

    def _add_definitive_payload(self, payload: Any, source: str) -> None:
        """Add an event-level payload that may be a string or a block list."""
        if isinstance(payload, str):
            self._add_definitive_message(payload, source)
        else:
            self._add_content_blocks(payload, source)

    def _set_fallback_message(self, text: str, source: str) -> None:
        """Set the fallback message (from task_complete).

        Ignored when the text is empty or non-text, when definitive messages
        already exist, or when the same content was already collected.
        """
        if not isinstance(text, str) or not text.strip():
            return

        # Only set if we don't have definitive messages
        if self._definitive_messages:
            logger.debug("Ignoring fallback from %s: have definitive messages", source)
            return

        if self._is_duplicate_content(text):
            logger.debug("Skipping duplicate fallback from %s", source)
            return

        self._fallback_message = text
        self._message_sources.append((source, text[:50]))

    def _add_streaming_delta(self, text: str) -> None:
        """Buffer a streaming delta; only used if nothing better arrives."""
        if isinstance(text, str) and text:
            self._streaming_buffer.append(text)

    def process_event(self, event: dict) -> bool:
        """
        Process a single MCP event.

        Returns True if processing should continue, False once a
        task_complete / task_completed event has been handled.

        Raises:
            RuntimeError: If the event is a codex/event of type "error".
        """
        # 1. Check for event ID and dedupe
        # NOTE(review): "item_id" is accepted as an event ID; if several
        # events legitimately share one item_id, later ones are skipped here.
        event_id = self._extract_event_id(event)
        if event_id:
            if event_id in self._seen_event_ids:
                logger.debug("Skipping duplicate event: %s", event_id)
                return True
            self._seen_event_ids.add(event_id)

        # 2. Handle codex/event notifications; everything else is ignored
        if event.get("method") != "codex/event":
            return True

        params = event.get("params")
        msg = params.get("msg") if isinstance(params, dict) else None
        if not isinstance(msg, dict):
            return True

        msg_type = msg.get("type")
        self._handle_codex_event(msg, msg_type)

        # Completion events end collection
        if msg_type in self._COMPLETION_TYPES:
            self._is_complete = True
            return False

        return True

    def _handle_codex_event(self, msg: dict, msg_type: str | None) -> None:
        """Dispatch one codex/event payload by its type.

        Raises:
            RuntimeError: For payloads of type "error".
        """
        if not msg_type:
            return

        # Session configuration
        if msg_type == "session_configured":
            self._session_id = msg.get("session_id")
            logger.debug("Session configured: %s", self._session_id)

        # Item completed - DEFINITIVE SOURCE
        elif msg_type == "item_completed":
            self._handle_item_completed(msg)

        # Direct agent message - DEFINITIVE SOURCE
        elif msg_type == "agent_message":
            payload = msg.get("message") or msg.get("text") or msg.get("content")
            self._add_definitive_payload(payload, "agent_message_event")

        # Task complete - FALLBACK SOURCE
        elif msg_type in self._COMPLETION_TYPES:
            last_msg = msg.get("last_agent_message")
            if last_msg:
                self._set_fallback_message(last_msg, "task_complete")

        # Token usage
        elif msg_type == "token_count":
            self._record_token_usage(msg)

        # Streaming deltas - LOWEST PRIORITY
        elif msg_type in self._STREAMING_TYPES:
            delta = msg.get("delta")
            if isinstance(delta, dict):
                text = delta.get("text", "") or msg.get("text", "")
            elif isinstance(delta, str) and delta:
                # Some payloads carry the delta text directly as a string.
                text = delta
            else:
                text = msg.get("text", "")
            self._add_streaming_delta(text)

        # Response event - MEDIUM PRIORITY (treat as definitive)
        elif msg_type == "response":
            payload = msg.get("response") or msg.get("text")
            self._add_definitive_payload(payload, "response_event")

        # Message event - only assistant/agent (or role-less) messages count
        elif msg_type == "message":
            role = str(msg.get("role") or "").lower()
            if role in ("assistant", "agent", ""):
                payload = msg.get("text") or msg.get("content")
                if payload:
                    self._add_definitive_payload(payload, "message_event")

        # Output event
        elif msg_type == "output":
            payload = msg.get("output") or msg.get("text") or msg.get("content")
            self._add_definitive_payload(payload, "output_event")

        # Completion variants
        elif msg_type in ("item.completed", "response.completed"):
            item = msg.get("item")
            if isinstance(item, dict) and item.get("type") == "agent_message":
                self._add_definitive_message(item.get("text", ""), f"{msg_type}_event")
            elif "text" in msg:
                self._add_definitive_message(msg["text"], f"{msg_type}_direct")

        # Error
        elif msg_type == "error":
            error_msg = msg.get("error", msg.get("message", str(msg)))
            raise RuntimeError(f"Codex error: {error_msg}")

    def _record_token_usage(self, msg: dict) -> None:
        """Store the totals from a token_count payload, if it carries any."""
        info = msg.get("info")
        usage = info.get("total_token_usage") if isinstance(info, dict) else None
        if not isinstance(usage, dict) or not usage:
            return

        self._token_usage = {
            "input_tokens": usage.get("input_tokens", 0),
            "output_tokens": usage.get("output_tokens", 0),
            "cached_input_tokens": usage.get("cached_input_tokens", 0),
            "reasoning_tokens": usage.get("reasoning_output_tokens", 0),
            "total_tokens": usage.get("total_tokens", 0),
        }

    def _handle_item_completed(self, msg: dict) -> None:
        """Handle an item_completed event - the primary source of messages."""
        item = msg.get("item")
        if not isinstance(item, dict):
            return
        item_type = item.get("type")

        # AgentMessage - primary format
        if item_type == "AgentMessage":
            self._add_content_blocks(item.get("content"), "AgentMessage")

        # agent_message - variant spelling
        elif item_type == "agent_message":
            text = item.get("text") or item.get("message")
            if text:
                self._add_definitive_message(text, "agent_message")
            self._add_content_blocks(item.get("content"), "agent_message_content")

        # Generic message - only the assistant role contributes text
        elif item_type == "message":
            if item.get("role", "") == "assistant":
                self._add_content_blocks(item.get("content"), "message_assistant")
                # Also check text field directly
                text = item.get("text", "")
                if text:
                    self._add_definitive_message(text, "message_text")

        # Function call output (for context, truncated)
        elif item_type == "function_call_output":
            output = item.get("output", "")
            if isinstance(output, str) and output and len(output) < 1000:
                # Don't add to messages, just log
                logger.debug("Tool output: %s...", output[:100])

    def set_conversation_id(self, conv_id: str | None) -> None:
        """Set the conversation ID from the final result; falsy values are ignored."""
        if conv_id:
            self._conversation_id = conv_id

    @property
    def conversation_id(self) -> str | None:
        """Conversation ID, falling back to the session ID when unset."""
        return self._conversation_id or self._session_id

    @property
    def token_usage(self) -> dict[str, Any]:
        """Token usage stats from the last token_count event (empty if none)."""
        return self._token_usage

    @property
    def is_complete(self) -> bool:
        """Whether a task_complete / task_completed event was processed."""
        return self._is_complete

    def get_messages(self) -> list[str]:
        """
        Get the final deduplicated message list.

        Priority:
        1. Definitive messages (from item_completed and similar events)
        2. Fallback message (from task_complete)
        3. Streaming buffer (only if no definitive or fallback)

        The returned list is always a new object, so callers may extend it
        without changing the collector's state.
        """
        # Prefer definitive messages
        if self._definitive_messages:
            logger.debug("Returning %d definitive messages", len(self._definitive_messages))
            return list(self._definitive_messages)

        # Fall back to task_complete message
        if self._fallback_message:
            logger.debug("Returning fallback message from task_complete")
            return [self._fallback_message]

        # Last resort: streaming buffer
        if self._streaming_buffer:
            full_text = "".join(self._streaming_buffer)
            if full_text.strip():
                logger.debug("Returning streaming buffer (%d chunks)", len(self._streaming_buffer))
                return [full_text]

        return []

    def get_response(self) -> str:
        """Get the final response as a single newline-joined string."""
        messages = self.get_messages()
        return "\n".join(messages) if messages else ""

    def get_debug_info(self) -> dict:
        """Get counters and message sources describing what was collected."""
        return {
            "seen_event_ids": len(self._seen_event_ids),
            "seen_content_hashes": len(self._seen_content_hashes),
            "definitive_messages": len(self._definitive_messages),
            "has_fallback": self._fallback_message is not None,
            "streaming_chunks": len(self._streaming_buffer),
            "message_sources": list(self._message_sources),
        }
|
|
371
|
+
|
|
372
|
+
|
|
33
373
|
class MCPClient:
|
|
34
374
|
"""
|
|
35
375
|
Robust MCP client for communicating with codex mcp-server.
|
|
@@ -42,7 +382,18 @@ class MCPClient:
|
|
|
42
382
|
of spawning new reader threads on timeout.
|
|
43
383
|
"""
|
|
44
384
|
|
|
45
|
-
|
|
385
|
+
# Default config overrides for zwarm-managed codex sessions
|
|
386
|
+
# These override ~/.codex/config.toml to ensure consistent behavior
|
|
387
|
+
# Only used as fallback if no config_path is provided
|
|
388
|
+
DEFAULT_CONFIG_OVERRIDES: dict[str, str] = {
|
|
389
|
+
"model_reasoning_effort": "high", # Use 'high' for compatibility with all models
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
def __init__(
|
|
393
|
+
self,
|
|
394
|
+
config_path: Path | None = None,
|
|
395
|
+
config_overrides: dict[str, str] | None = None,
|
|
396
|
+
):
|
|
46
397
|
self._proc: subprocess.Popen | None = None
|
|
47
398
|
self._proc_pid: int | None = None # Track PID to detect restarts
|
|
48
399
|
self._request_id = 0
|
|
@@ -53,6 +404,10 @@ class MCPClient:
|
|
|
53
404
|
self._stdout_queue: queue.Queue[str | None] = queue.Queue()
|
|
54
405
|
self._lock = threading.Lock() # Protect writes only
|
|
55
406
|
self._start_count = 0 # Track how many times we've started
|
|
407
|
+
# Config path for full isolation (preferred)
|
|
408
|
+
self._config_path = config_path
|
|
409
|
+
# Fallback: merge default overrides with any custom ones (used if no config_path)
|
|
410
|
+
self._config_overrides = {**self.DEFAULT_CONFIG_OVERRIDES, **(config_overrides or {})}
|
|
56
411
|
|
|
57
412
|
def start(self) -> None:
|
|
58
413
|
"""Start the MCP server process."""
|
|
@@ -69,9 +424,19 @@ class MCPClient:
|
|
|
69
424
|
)
|
|
70
425
|
|
|
71
426
|
self._start_count += 1
|
|
72
|
-
|
|
427
|
+
|
|
428
|
+
# Build command - prefer config file for full isolation, fallback to overrides
|
|
429
|
+
cmd = ["codex", "mcp-server"]
|
|
430
|
+
if self._config_path and self._config_path.exists():
|
|
431
|
+
cmd.extend(["--config", str(self._config_path)])
|
|
432
|
+
logger.info(f"Starting codex mcp-server with config: {self._config_path} (start_count={self._start_count})")
|
|
433
|
+
else:
|
|
434
|
+
# Fallback to individual overrides
|
|
435
|
+
for key, value in self._config_overrides.items():
|
|
436
|
+
cmd.extend(["-c", f'{key}="{value}"'])
|
|
437
|
+
logger.info(f"Starting codex mcp-server with overrides: {self._config_overrides} (start_count={self._start_count})")
|
|
73
438
|
self._proc = subprocess.Popen(
|
|
74
|
-
|
|
439
|
+
cmd,
|
|
75
440
|
stdin=subprocess.PIPE,
|
|
76
441
|
stdout=subprocess.PIPE,
|
|
77
442
|
stderr=subprocess.PIPE,
|
|
@@ -238,6 +603,12 @@ class MCPClient:
|
|
|
238
603
|
"""
|
|
239
604
|
Call an MCP tool and collect streaming events.
|
|
240
605
|
|
|
606
|
+
Uses MessageCollector for robust deduplication and priority-based
|
|
607
|
+
message selection. This prevents the transcript rendering bugs:
|
|
608
|
+
- Message duplication
|
|
609
|
+
- Role contamination
|
|
610
|
+
- Turn mis-association
|
|
611
|
+
|
|
241
612
|
Args:
|
|
242
613
|
name: Tool name (codex, codex-reply)
|
|
243
614
|
arguments: Tool arguments
|
|
@@ -260,14 +631,9 @@ class MCPClient:
|
|
|
260
631
|
with self._lock:
|
|
261
632
|
self._write(json.dumps(request) + "\n")
|
|
262
633
|
|
|
263
|
-
#
|
|
264
|
-
|
|
265
|
-
session_id = None
|
|
266
|
-
conversation_id = None # Track conversation ID separately
|
|
267
|
-
agent_messages: list[str] = []
|
|
268
|
-
streaming_text: list[str] = [] # Accumulate streaming delta text
|
|
634
|
+
# Use MessageCollector for robust event handling
|
|
635
|
+
collector = MessageCollector()
|
|
269
636
|
final_result = None
|
|
270
|
-
token_usage: dict[str, Any] = {} # Track token usage
|
|
271
637
|
start_time = time.time()
|
|
272
638
|
all_events: list[dict] = [] # Keep ALL events for debugging
|
|
273
639
|
|
|
@@ -280,13 +646,10 @@ class MCPClient:
|
|
|
280
646
|
raise RuntimeError(f"MCP call timed out after {timeout}s ({event_count} events received)")
|
|
281
647
|
|
|
282
648
|
# Read from queue with per-event timeout
|
|
283
|
-
# Empty string = timeout (process still alive, just waiting)
|
|
284
|
-
# None sentinel is handled inside _read_line (raises RuntimeError)
|
|
285
649
|
line = self._read_line(timeout=30.0)
|
|
286
650
|
|
|
287
651
|
if not line:
|
|
288
652
|
# Timeout waiting for event - process is still alive, just slow
|
|
289
|
-
# This is normal during long codex operations
|
|
290
653
|
logger.debug(f"Waiting for MCP event... (elapsed: {elapsed:.0f}s, events: {event_count})")
|
|
291
654
|
continue
|
|
292
655
|
|
|
@@ -303,192 +666,58 @@ class MCPClient:
|
|
|
303
666
|
final_result = event["result"]
|
|
304
667
|
# Extract conversation ID from final result
|
|
305
668
|
if isinstance(final_result, dict):
|
|
306
|
-
|
|
307
|
-
|
|
669
|
+
conv_id = final_result.get("conversationId") or final_result.get("conversation_id")
|
|
670
|
+
collector.set_conversation_id(conv_id)
|
|
671
|
+
logger.debug(f"Got final result after {event_count} events")
|
|
308
672
|
break
|
|
309
673
|
elif "error" in event:
|
|
310
674
|
error = event["error"]
|
|
311
675
|
raise RuntimeError(f"MCP tool error: {error.get('message', error)}")
|
|
312
676
|
|
|
313
|
-
# Process
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
# Log ALL event types to help debug missing messages
|
|
320
|
-
logger.debug(f"MCP event: type={msg_type}, keys={list(msg.keys())}")
|
|
321
|
-
|
|
322
|
-
if msg_type == "session_configured":
|
|
323
|
-
session_id = msg.get("session_id")
|
|
324
|
-
logger.debug(f"Session configured: {session_id}")
|
|
325
|
-
|
|
326
|
-
elif msg_type == "item_completed":
|
|
327
|
-
item = msg.get("item", {})
|
|
328
|
-
item_type = item.get("type")
|
|
329
|
-
|
|
330
|
-
# Log ALL item_completed events to help debug
|
|
331
|
-
logger.debug(f"item_completed: type={item_type}, keys={list(item.keys())}")
|
|
332
|
-
|
|
333
|
-
# Agent text responses - codex uses "AgentMessage" type
|
|
334
|
-
if item_type == "AgentMessage":
|
|
335
|
-
content = item.get("content", [])
|
|
336
|
-
for block in content:
|
|
337
|
-
if isinstance(block, dict) and block.get("text"):
|
|
338
|
-
agent_messages.append(block["text"])
|
|
339
|
-
elif isinstance(block, str):
|
|
340
|
-
agent_messages.append(block)
|
|
341
|
-
|
|
342
|
-
# Also check for "agent_message" (lowercase) variant
|
|
343
|
-
elif item_type == "agent_message":
|
|
344
|
-
text = item.get("text", "") or item.get("message", "")
|
|
345
|
-
if text:
|
|
346
|
-
agent_messages.append(text)
|
|
347
|
-
# Also check content array
|
|
348
|
-
content = item.get("content", [])
|
|
349
|
-
for block in content:
|
|
350
|
-
if isinstance(block, dict) and block.get("text"):
|
|
351
|
-
agent_messages.append(block["text"])
|
|
352
|
-
elif isinstance(block, str):
|
|
353
|
-
agent_messages.append(block)
|
|
354
|
-
|
|
355
|
-
# Legacy format check
|
|
356
|
-
elif item_type == "message" and item.get("role") == "assistant":
|
|
357
|
-
content = item.get("content", [])
|
|
358
|
-
for block in content:
|
|
359
|
-
if isinstance(block, dict) and block.get("text"):
|
|
360
|
-
agent_messages.append(block["text"])
|
|
361
|
-
elif isinstance(block, str):
|
|
362
|
-
agent_messages.append(block)
|
|
363
|
-
|
|
364
|
-
# Generic message type - check for text/content
|
|
365
|
-
elif item_type == "message":
|
|
366
|
-
text = item.get("text", "")
|
|
367
|
-
if text:
|
|
368
|
-
agent_messages.append(text)
|
|
369
|
-
content = item.get("content", [])
|
|
370
|
-
if isinstance(content, str):
|
|
371
|
-
agent_messages.append(content)
|
|
372
|
-
elif isinstance(content, list):
|
|
373
|
-
for block in content:
|
|
374
|
-
if isinstance(block, dict) and block.get("text"):
|
|
375
|
-
agent_messages.append(block["text"])
|
|
376
|
-
elif isinstance(block, str):
|
|
377
|
-
agent_messages.append(block)
|
|
378
|
-
|
|
379
|
-
# Function call outputs (for context)
|
|
380
|
-
elif item_type == "function_call_output":
|
|
381
|
-
output = item.get("output", "")
|
|
382
|
-
if output and len(output) < 1000:
|
|
383
|
-
agent_messages.append(f"[Tool output]: {output[:500]}")
|
|
384
|
-
|
|
385
|
-
# Log other item types we're not handling
|
|
386
|
-
elif item_type not in ("function_call", "tool_call", "UserMessage", "user_message"):
|
|
387
|
-
logger.debug(f"Unhandled item_completed type: {item_type}, item={item}")
|
|
388
|
-
|
|
389
|
-
elif msg_type == "agent_message":
|
|
390
|
-
# Direct agent message event
|
|
391
|
-
message = msg.get("message", "")
|
|
392
|
-
if message:
|
|
393
|
-
agent_messages.append(message)
|
|
394
|
-
|
|
395
|
-
elif msg_type in ("task_complete", "task_completed"):
|
|
396
|
-
# Task is done - capture last_agent_message as fallback
|
|
397
|
-
last_msg = msg.get("last_agent_message")
|
|
398
|
-
if last_msg and last_msg not in agent_messages:
|
|
399
|
-
agent_messages.append(last_msg)
|
|
400
|
-
logger.debug(f"Task complete after {event_count} events")
|
|
677
|
+
# Process event through collector
|
|
678
|
+
try:
|
|
679
|
+
should_continue = collector.process_event(event)
|
|
680
|
+
if not should_continue:
|
|
681
|
+
logger.debug(f"Collector signaled completion after {event_count} events")
|
|
401
682
|
break
|
|
683
|
+
except RuntimeError as e:
|
|
684
|
+
# Collector raises RuntimeError for codex errors
|
|
685
|
+
raise
|
|
402
686
|
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
if info:
|
|
407
|
-
usage = info.get("total_token_usage", {})
|
|
408
|
-
if usage:
|
|
409
|
-
token_usage = {
|
|
410
|
-
"input_tokens": usage.get("input_tokens", 0),
|
|
411
|
-
"output_tokens": usage.get("output_tokens", 0),
|
|
412
|
-
"cached_input_tokens": usage.get("cached_input_tokens", 0),
|
|
413
|
-
"reasoning_tokens": usage.get("reasoning_output_tokens", 0),
|
|
414
|
-
"total_tokens": usage.get("total_tokens", 0),
|
|
415
|
-
}
|
|
416
|
-
logger.debug(f"Token usage: {token_usage}")
|
|
417
|
-
|
|
418
|
-
elif msg_type == "error":
|
|
419
|
-
error_msg = msg.get("error", msg.get("message", str(msg)))
|
|
420
|
-
raise RuntimeError(f"Codex error: {error_msg}")
|
|
421
|
-
|
|
422
|
-
# Handle streaming text events (various formats)
|
|
423
|
-
elif msg_type in ("text_delta", "content_block_delta", "message_delta"):
|
|
424
|
-
delta = msg.get("delta", {})
|
|
425
|
-
text = delta.get("text", "") or msg.get("text", "")
|
|
426
|
-
if text:
|
|
427
|
-
streaming_text.append(text)
|
|
428
|
-
|
|
429
|
-
elif msg_type == "text":
|
|
430
|
-
text = msg.get("text", "")
|
|
431
|
-
if text:
|
|
432
|
-
streaming_text.append(text)
|
|
433
|
-
|
|
434
|
-
elif msg_type == "response":
|
|
435
|
-
# Some versions send the full response this way
|
|
436
|
-
response_text = msg.get("response", "") or msg.get("text", "")
|
|
437
|
-
if response_text:
|
|
438
|
-
agent_messages.append(response_text)
|
|
439
|
-
|
|
440
|
-
elif msg_type == "message":
|
|
441
|
-
# Direct message event
|
|
442
|
-
text = msg.get("text", "") or msg.get("content", "")
|
|
443
|
-
if text:
|
|
444
|
-
agent_messages.append(text)
|
|
445
|
-
|
|
446
|
-
else:
|
|
447
|
-
# Log unknown event types at debug level to help diagnose
|
|
448
|
-
if msg_type and msg_type not in ("session_started", "thinking", "tool_call", "function_call"):
|
|
449
|
-
logger.debug(f"Unhandled MCP event type: {msg_type}, msg keys: {list(msg.keys())}")
|
|
450
|
-
|
|
451
|
-
# Merge streaming text into messages if we got any
|
|
452
|
-
if streaming_text:
|
|
453
|
-
full_streaming = "".join(streaming_text)
|
|
454
|
-
if full_streaming.strip():
|
|
455
|
-
agent_messages.append(full_streaming)
|
|
456
|
-
logger.debug(f"Captured {len(streaming_text)} streaming chunks ({len(full_streaming)} chars)")
|
|
457
|
-
|
|
458
|
-
# Try to extract content from final_result if we have no messages
|
|
459
|
-
if final_result and not agent_messages:
|
|
687
|
+
# Try to extract content from final_result if collector has no messages
|
|
688
|
+
messages = collector.get_messages()
|
|
689
|
+
if final_result and not messages:
|
|
460
690
|
if "content" in final_result:
|
|
461
691
|
content = final_result["content"]
|
|
462
692
|
if isinstance(content, list):
|
|
463
693
|
for block in content:
|
|
464
694
|
if isinstance(block, dict) and block.get("text"):
|
|
465
|
-
|
|
695
|
+
messages.append(block["text"])
|
|
466
696
|
elif isinstance(block, str):
|
|
467
|
-
|
|
697
|
+
messages.append(block)
|
|
468
698
|
elif isinstance(content, str):
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
agent_messages.append(final_result["text"])
|
|
699
|
+
messages.append(content)
|
|
700
|
+
if not messages and "text" in final_result:
|
|
701
|
+
messages.append(final_result["text"])
|
|
473
702
|
|
|
474
|
-
# Build result
|
|
475
|
-
effective_conversation_id = conversation_id or session_id
|
|
703
|
+
# Build result
|
|
476
704
|
result = {
|
|
477
|
-
"conversationId":
|
|
478
|
-
"messages":
|
|
479
|
-
"output": "\n".join(
|
|
480
|
-
"usage": token_usage,
|
|
705
|
+
"conversationId": collector.conversation_id,
|
|
706
|
+
"messages": messages,
|
|
707
|
+
"output": "\n".join(messages) if messages else "",
|
|
708
|
+
"usage": collector.token_usage,
|
|
481
709
|
}
|
|
482
710
|
|
|
483
711
|
# Log detailed debug info if we didn't capture any messages
|
|
484
|
-
if not
|
|
712
|
+
if not messages:
|
|
713
|
+
debug_info = collector.get_debug_info()
|
|
485
714
|
event_types = [e.get("method") or f"id:{e.get('id')}" for e in all_events[:20]]
|
|
486
715
|
logger.warning(
|
|
487
716
|
f"MCP call returned no messages. "
|
|
488
|
-
f"conversation_id={
|
|
489
|
-
f"session_id={session_id}, "
|
|
717
|
+
f"conversation_id={collector.conversation_id}, "
|
|
490
718
|
f"event_count={len(all_events)}, "
|
|
491
719
|
f"event_types={event_types}, "
|
|
720
|
+
f"collector_debug={debug_info}, "
|
|
492
721
|
f"final_result_keys={list(final_result.keys()) if final_result else 'None'}"
|
|
493
722
|
)
|
|
494
723
|
# Log codex/event details for debugging
|
|
@@ -498,7 +727,7 @@ class MCPClient:
|
|
|
498
727
|
msg = ce.get("params", {}).get("msg", {})
|
|
499
728
|
logger.debug(f" codex/event: type={msg.get('type')}, keys={list(msg.keys())}")
|
|
500
729
|
|
|
501
|
-
logger.debug(f"MCP call complete: {len(
|
|
730
|
+
logger.debug(f"MCP call complete: {len(messages)} messages, conversation_id={collector.conversation_id}")
|
|
502
731
|
return result
|
|
503
732
|
|
|
504
733
|
def close(self) -> None:
|
|
@@ -530,11 +759,22 @@ class CodexMCPAdapter(ExecutorAdapter):
|
|
|
530
759
|
This is the recommended way to have iterative conversations with Codex.
|
|
531
760
|
The MCP client uses subprocess.Popen (not asyncio) so it persists across
|
|
532
761
|
multiple asyncio.run() calls, preserving conversation state.
|
|
762
|
+
|
|
763
|
+
Config isolation: Pass config_path to use a local codex.toml instead of
|
|
764
|
+
the user's global ~/.codex/config.toml. This is the preferred approach.
|
|
765
|
+
Falls back to config_overrides if no config_path is provided.
|
|
533
766
|
"""
|
|
534
767
|
DEFAULT_MODEL = "gpt-5.1-codex-mini" # Default codex model
|
|
535
768
|
|
|
536
|
-
def __init__(
|
|
769
|
+
def __init__(
|
|
770
|
+
self,
|
|
771
|
+
model: str | None = None,
|
|
772
|
+
config_path: Path | None = None,
|
|
773
|
+
config_overrides: dict[str, str] | None = None,
|
|
774
|
+
):
|
|
537
775
|
self._model = model or self.DEFAULT_MODEL
|
|
776
|
+
self._config_path = config_path # Path to local codex.toml for isolation
|
|
777
|
+
self._config_overrides = config_overrides or {}
|
|
538
778
|
self._mcp_client: MCPClient | None = None
|
|
539
779
|
self._sessions: dict[str, str] = {} # session_id -> conversationId
|
|
540
780
|
# Cumulative token usage for cost tracking
|
|
@@ -561,7 +801,10 @@ class CodexMCPAdapter(ExecutorAdapter):
|
|
|
561
801
|
def _ensure_client(self) -> MCPClient:
|
|
562
802
|
"""Ensure MCP client is running and return it."""
|
|
563
803
|
if self._mcp_client is None:
|
|
564
|
-
self._mcp_client = MCPClient(
|
|
804
|
+
self._mcp_client = MCPClient(
|
|
805
|
+
config_path=self._config_path,
|
|
806
|
+
config_overrides=self._config_overrides,
|
|
807
|
+
)
|
|
565
808
|
|
|
566
809
|
if not self._mcp_client.is_alive:
|
|
567
810
|
self._mcp_client.start()
|
|
@@ -575,6 +818,7 @@ class CodexMCPAdapter(ExecutorAdapter):
|
|
|
575
818
|
cwd: str,
|
|
576
819
|
sandbox: str,
|
|
577
820
|
model: str | None = None,
|
|
821
|
+
reasoning_effort: str | None = None,
|
|
578
822
|
) -> dict[str, Any]:
|
|
579
823
|
"""
|
|
580
824
|
Call codex MCP tool - traced by Weave.
|
|
@@ -592,7 +836,13 @@ class CodexMCPAdapter(ExecutorAdapter):
|
|
|
592
836
|
if model:
|
|
593
837
|
args["model"] = model
|
|
594
838
|
|
|
595
|
-
|
|
839
|
+
# Pass reasoning_effort to override codex config defaults
|
|
840
|
+
# The config key is "model_reasoning_effort"
|
|
841
|
+
if reasoning_effort:
|
|
842
|
+
args["model_reasoning_effort"] = reasoning_effort
|
|
843
|
+
|
|
844
|
+
logger.info(f"Calling codex with task_len={len(task)}, cwd={cwd}, model={model or 'default'}, reasoning_effort={reasoning_effort or 'default'}")
|
|
845
|
+
logger.debug(f"Full codex args: {args}")
|
|
596
846
|
|
|
597
847
|
result = client.call_tool("codex", args)
|
|
598
848
|
|
|
@@ -668,12 +918,55 @@ class CodexMCPAdapter(ExecutorAdapter):
|
|
|
668
918
|
usage = result.get("usage", {})
|
|
669
919
|
self._accumulate_usage(usage)
|
|
670
920
|
|
|
671
|
-
response
|
|
921
|
+
# Filter out the sent message from the response using content hashing
|
|
922
|
+
# The MCP may echo our prompt back, but we use robust content comparison
|
|
923
|
+
raw_messages = result.get("messages", [])
|
|
924
|
+
|
|
925
|
+
# Create hash of user message for comparison (normalized)
|
|
926
|
+
def normalize_for_comparison(text: str) -> str:
|
|
927
|
+
"""Normalize text for comparison (lowercase, collapsed whitespace)."""
|
|
928
|
+
return " ".join(text.lower().split())
|
|
929
|
+
|
|
930
|
+
user_msg_normalized = normalize_for_comparison(message)
|
|
931
|
+
user_msg_hash = hashlib.md5(user_msg_normalized.encode()).hexdigest()
|
|
932
|
+
|
|
933
|
+
def is_user_message_echo(text: str) -> bool:
|
|
934
|
+
"""Check if text is just an echo of the user message."""
|
|
935
|
+
if not text:
|
|
936
|
+
return True # Empty is effectively an echo (skip it)
|
|
937
|
+
|
|
938
|
+
text_normalized = normalize_for_comparison(text)
|
|
939
|
+
text_hash = hashlib.md5(text_normalized.encode()).hexdigest()
|
|
940
|
+
|
|
941
|
+
# Exact match (case-insensitive, whitespace-normalized)
|
|
942
|
+
if text_hash == user_msg_hash:
|
|
943
|
+
return True
|
|
944
|
+
|
|
945
|
+
# Check if text IS the user message (not just starts with it)
|
|
946
|
+
# This avoids the bug where "Fix bug by X" gets filtered when user said "Fix bug"
|
|
947
|
+
if text_normalized == user_msg_normalized:
|
|
948
|
+
return True
|
|
949
|
+
|
|
950
|
+
return False
|
|
951
|
+
|
|
952
|
+
filtered_messages = [m for m in raw_messages if not is_user_message_echo(m)]
|
|
953
|
+
|
|
954
|
+
# Build filtered result for extraction
|
|
955
|
+
filtered_result = {
|
|
956
|
+
**result,
|
|
957
|
+
"messages": filtered_messages,
|
|
958
|
+
"output": "\n".join(filtered_messages) if filtered_messages else result.get("output", ""),
|
|
959
|
+
}
|
|
960
|
+
|
|
961
|
+
response = self._extract_response(filtered_result)
|
|
962
|
+
filtered_count = len(raw_messages) - len(filtered_messages)
|
|
963
|
+
if filtered_count > 0:
|
|
964
|
+
logger.debug(f"Filtered {filtered_count} user echo messages from response")
|
|
672
965
|
logger.debug(f"codex-reply response length: {len(response)} chars")
|
|
673
966
|
|
|
674
967
|
return {
|
|
675
968
|
"response": response,
|
|
676
|
-
"raw_messages":
|
|
969
|
+
"raw_messages": filtered_messages, # Return filtered messages
|
|
677
970
|
"usage": usage,
|
|
678
971
|
"total_usage": self.total_usage,
|
|
679
972
|
"conversation_lost": not result.get("messages") and not result.get("output"),
|
|
@@ -706,6 +999,7 @@ class CodexMCPAdapter(ExecutorAdapter):
|
|
|
706
999
|
cwd=str(working_dir.absolute()),
|
|
707
1000
|
sandbox=sandbox,
|
|
708
1001
|
model=effective_model,
|
|
1002
|
+
reasoning_effort=kwargs.get("reasoning_effort"),
|
|
709
1003
|
)
|
|
710
1004
|
|
|
711
1005
|
# Extract conversation ID and response
|