fast-agent-mcp 0.2.34__py3-none-any.whl → 0.2.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,8 @@ from mcp.types import (
     ImageContent,
     TextContent,
 )
-from openai import AuthenticationError, OpenAI
+from openai import AsyncOpenAI, AuthenticationError
+from openai.lib.streaming.chat import ChatCompletionStreamState
 
 # from openai.types.beta.chat import
 from openai.types.chat import (
@@ -22,6 +23,7 @@ from rich.text import Text
 
 from mcp_agent.core.exceptions import ProviderKeyError
 from mcp_agent.core.prompt import Prompt
+from mcp_agent.event_progress import ProgressAction
 from mcp_agent.llm.augmented_llm import (
     AugmentedLLM,
     RequestParams,
@@ -103,9 +105,9 @@ class OpenAIAugmentedLLM(AugmentedLLM[ChatCompletionMessageParam, ChatCompletion
     def _base_url(self) -> str:
         return self.context.config.openai.base_url if self.context.config.openai else None
 
-    def _openai_client(self) -> OpenAI:
+    def _openai_client(self) -> AsyncOpenAI:
         try:
-            return OpenAI(api_key=self._api_key(), base_url=self._base_url())
+            return AsyncOpenAI(api_key=self._api_key(), base_url=self._base_url())
         except AuthenticationError as e:
             raise ProviderKeyError(
                 "Invalid OpenAI API key",
@@ -113,6 +115,182 @@ class OpenAIAugmentedLLM(AugmentedLLM[ChatCompletionMessageParam, ChatCompletion
                 "Please check that your API key is valid and not expired.",
             ) from e
 
+    async def _process_stream(self, stream, model: str):
+        """Process the streaming response and display real-time token usage."""
+        # Track estimated output tokens by counting text chunks
+        estimated_tokens = 0
+
+        # For non-OpenAI providers (like Ollama), ChatCompletionStreamState might not work correctly
+        # Fall back to manual accumulation if needed
+        # TODO -- consider this and whether to subclass instead
+        if self.provider in [Provider.GENERIC, Provider.OPENROUTER]:
+            return await self._process_stream_manual(stream, model)
+
+        # Use ChatCompletionStreamState helper for accumulation (OpenAI only)
+        state = ChatCompletionStreamState()
+
+        # Process the stream chunks
+        async for chunk in stream:
+            # Handle chunk accumulation
+            state.handle_chunk(chunk)
+
+            # Count tokens in real-time from content deltas
+            if chunk.choices and chunk.choices[0].delta.content:
+                content = chunk.choices[0].delta.content
+                # Use base class method for token estimation and progress emission
+                estimated_tokens = self._update_streaming_progress(content, model, estimated_tokens)
+
+        # Get the final completion with usage data
+        final_completion = state.get_final_completion()
+
+        # Log final usage information
+        if hasattr(final_completion, "usage") and final_completion.usage:
+            actual_tokens = final_completion.usage.completion_tokens
+            # Emit final progress with actual token count
+            token_str = str(actual_tokens).rjust(5)
+            data = {
+                "progress_action": ProgressAction.STREAMING,
+                "model": model,
+                "agent_name": self.name,
+                "chat_turn": self.chat_turn(),
+                "details": token_str.strip(),
+            }
+            self.logger.info("Streaming progress", data=data)
+
+            self.logger.info(
+                f"Streaming complete - Model: {model}, Input tokens: {final_completion.usage.prompt_tokens}, Output tokens: {final_completion.usage.completion_tokens}"
+            )
+
+        return final_completion
+
+    # TODO - as per other comment this needs to go in another class. There are a number of "special" cases dealt with
+    # here to deal with OpenRouter idiosyncrasies between e.g. Anthropic and Gemini models.
+    async def _process_stream_manual(self, stream, model: str):
+        """Manual stream processing for providers like Ollama that may not work with ChatCompletionStreamState."""
+        from openai.types.chat import ChatCompletionMessageToolCall
+        from openai.types.chat.chat_completion_message_tool_call import Function
+
+        # Track estimated output tokens by counting text chunks
+        estimated_tokens = 0
+
+        # Manual accumulation of response data
+        accumulated_content = ""
+        role = "assistant"
+        tool_calls_map = {}  # Use a map to accumulate tool calls by index
+        function_call = None
+        finish_reason = None
+        usage_data = None
+
+        # Process the stream chunks manually
+        async for chunk in stream:
+            # Count tokens in real-time from content deltas
+            if chunk.choices and chunk.choices[0].delta.content:
+                content = chunk.choices[0].delta.content
+                accumulated_content += content
+                # Use base class method for token estimation and progress emission
+                estimated_tokens = self._update_streaming_progress(content, model, estimated_tokens)
+
+            # Extract other fields from the chunk
+            if chunk.choices:
+                choice = chunk.choices[0]
+                if choice.delta.role:
+                    role = choice.delta.role
+                if choice.delta.tool_calls:
+                    # Accumulate tool call deltas
+                    for delta_tool_call in choice.delta.tool_calls:
+                        if delta_tool_call.index is not None:
+                            if delta_tool_call.index not in tool_calls_map:
+                                tool_calls_map[delta_tool_call.index] = {
+                                    "id": delta_tool_call.id,
+                                    "type": delta_tool_call.type or "function",
+                                    "function": {
+                                        "name": delta_tool_call.function.name
+                                        if delta_tool_call.function
+                                        else None,
+                                        "arguments": "",
+                                    },
+                                }
+
+                            # Always update if we have new data (needed for OpenRouter Gemini)
+                            if delta_tool_call.id:
+                                tool_calls_map[delta_tool_call.index]["id"] = delta_tool_call.id
+                            if delta_tool_call.function:
+                                if delta_tool_call.function.name:
+                                    tool_calls_map[delta_tool_call.index]["function"]["name"] = (
+                                        delta_tool_call.function.name
+                                    )
+                                # Handle arguments - they might come as None, empty string, or actual content
+                                if delta_tool_call.function.arguments is not None:
+                                    tool_calls_map[delta_tool_call.index]["function"][
+                                        "arguments"
+                                    ] += delta_tool_call.function.arguments
+
+                if choice.delta.function_call:
+                    function_call = choice.delta.function_call
+                if choice.finish_reason:
+                    finish_reason = choice.finish_reason
+
+            # Extract usage data if available
+            if hasattr(chunk, "usage") and chunk.usage:
+                usage_data = chunk.usage
+
+        # Convert accumulated tool calls to proper format.
+        tool_calls = None
+        if tool_calls_map:
+            tool_calls = []
+            for idx in sorted(tool_calls_map.keys()):
+                tool_call_data = tool_calls_map[idx]
+                # Only add tool calls that have valid data
+                if tool_call_data["id"] and tool_call_data["function"]["name"]:
+                    tool_calls.append(
+                        ChatCompletionMessageToolCall(
+                            id=tool_call_data["id"],
+                            type=tool_call_data["type"],
+                            function=Function(
+                                name=tool_call_data["function"]["name"],
+                                arguments=tool_call_data["function"]["arguments"],
+                            ),
+                        )
+                    )
+
+        # Create a ChatCompletionMessage manually
+        message = ChatCompletionMessage(
+            content=accumulated_content,
+            role=role,
+            tool_calls=tool_calls if tool_calls else None,
+            function_call=function_call,
+            refusal=None,
+            annotations=None,
+            audio=None,
+        )
+
+        from types import SimpleNamespace
+
+        final_completion = SimpleNamespace()
+        final_completion.choices = [SimpleNamespace()]
+        final_completion.choices[0].message = message
+        final_completion.choices[0].finish_reason = finish_reason
+        final_completion.usage = usage_data
+
+        # Log final usage information
+        if usage_data:
+            actual_tokens = getattr(usage_data, "completion_tokens", estimated_tokens)
+            token_str = str(actual_tokens).rjust(5)
+            data = {
+                "progress_action": ProgressAction.STREAMING,
+                "model": model,
+                "agent_name": self.name,
+                "chat_turn": self.chat_turn(),
+                "details": token_str.strip(),
+            }
+            self.logger.info("Streaming progress", data=data)
+
+            self.logger.info(
+                f"Streaming complete - Model: {model}, Input tokens: {getattr(usage_data, 'prompt_tokens', 0)}, Output tokens: {actual_tokens}"
+            )
+
+        return final_completion
+
     async def _openai_completion(
         self,
         message: OpenAIMessage,
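The new _process_stream path leans on the SDK's ChatCompletionStreamState helper, which rebuilds a complete ChatCompletion from the incremental chunks. A minimal standalone sketch of that accumulation pattern (placeholder model and prompt; the package wires this through _openai_client() and _update_streaming_progress instead):

    from openai import AsyncOpenAI
    from openai.lib.streaming.chat import ChatCompletionStreamState

    async def accumulate_stream(client: AsyncOpenAI) -> None:
        state = ChatCompletionStreamState()
        stream = await client.chat.completions.create(
            model="gpt-4o-mini",  # placeholder model
            messages=[{"role": "user", "content": "hello"}],
            stream=True,
            stream_options={"include_usage": True},
        )
        async for chunk in stream:
            state.handle_chunk(chunk)  # fold each delta into the accumulated completion
        final = state.get_final_completion()
        print(final.choices[0].message.content)
        if final.usage:
            print(final.usage.prompt_tokens, final.usage.completion_tokens)

The manual _process_stream_manual fallback exists because, per the comments above, some OpenAI-compatible backends (Ollama via the generic provider, OpenRouter) emit chunks that the helper does not always reassemble correctly, so the message and tool calls are rebuilt by hand.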
@@ -151,7 +329,10 @@ class OpenAIAugmentedLLM(AugmentedLLM[ChatCompletionMessageParam, ChatCompletion
         ]
 
         if not available_tools:
-            available_tools = None  # deepseek does not allow empty array
+            if self.provider == Provider.DEEPSEEK:
+                available_tools = None  # deepseek does not allow empty array
+            else:
+                available_tools = []
 
         # we do NOT send "stop sequences" as this causes errors with mutlimodal processing
         for i in range(request_params.max_iterations):
@@ -160,11 +341,10 @@ class OpenAIAugmentedLLM(AugmentedLLM[ChatCompletionMessageParam, ChatCompletion
 
             self._log_chat_progress(self.chat_turn(), model=self.default_request_params.model)
 
-            executor_result = await self.executor.execute(
-                self._openai_client().chat.completions.create, **arguments
-            )
-
-            response = executor_result[0]
+            # Use basic streaming API
+            stream = await self._openai_client().chat.completions.create(**arguments)
+            # Process the stream
+            response = await self._process_stream(stream, self.default_request_params.model)
 
             # Track usage if response is valid and has usage data
             if (
@@ -204,10 +384,11 @@ class OpenAIAugmentedLLM(AugmentedLLM[ChatCompletionMessageParam, ChatCompletion
             if message.content:
                 responses.append(TextContent(type="text", text=message.content))
 
-            converted_message = self.convert_message_to_message_param(message)
-            messages.append(converted_message)
+            # ParsedChatCompletionMessage is compatible with ChatCompletionMessage
+            # since it inherits from it, so we can use it directly
+            messages.append(message)
 
-            message_text = converted_message.content
+            message_text = message.content
             if choice.finish_reason in ["tool_calls", "function_call"] and message.tool_calls:
                 if message_text:
                     await self.show_assistant_message(
@@ -347,6 +528,8 @@ class OpenAIAugmentedLLM(AugmentedLLM[ChatCompletionMessageParam, ChatCompletion
             "model": self.default_request_params.model,
             "messages": messages,
             "tools": tools,
+            "stream": True,  # Enable basic streaming
+            "stream_options": {"include_usage": True},  # Required for usage data in streaming
         }
 
         if self._reasoning:
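With stream=True alone, OpenAI's API does not report token usage; stream_options={"include_usage": True} requests one extra terminal chunk whose usage field is populated and whose choices list is empty, which is what _process_stream reads for the final counts. A small sketch of consuming such a stream (illustrative helper, not package code):

    from openai import AsyncStream
    from openai.types.chat import ChatCompletionChunk

    async def read_stream(stream: AsyncStream[ChatCompletionChunk]) -> None:
        async for chunk in stream:
            if chunk.choices and chunk.choices[0].delta.content:
                print(chunk.choices[0].delta.content, end="")
            if chunk.usage is not None:  # only set on the final usage-only chunk
                print(f"\nprompt={chunk.usage.prompt_tokens} completion={chunk.usage.completion_tokens}")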
@@ -360,7 +360,7 @@ class OpenAIConverter:
             return {
                 "role": "tool",
                 "tool_call_id": tool_call_id,
-                "content": "[No content in tool result]",
+                "content": "[Tool completed successfully]",
             }
 
         # Separate text and non-text content
@@ -387,8 +387,9 @@ class OpenAIConverter:
                 converted.get("content", "")
             )
 
-        if not tool_message_content:
-            tool_message_content = "[Tool returned non-text content]"
+        # Ensure we always have non-empty content for compatibility
+        if not tool_message_content or tool_message_content.strip() == "":
+            tool_message_content = "[Tool completed successfully]"
 
         # Create the tool message with just the text
         tool_message = {
@@ -84,19 +84,32 @@ class TurnUsage(BaseModel):
     @computed_field
     @property
     def current_context_tokens(self) -> int:
-        """Current context size after this turn (input + output)"""
-        return self.input_tokens + self.output_tokens
+        """Current context size after this turn (total input including cache + output)"""
+        # For Anthropic: input_tokens + cache_read_tokens represents total input context
+        total_input = self.input_tokens + self.cache_usage.cache_read_tokens + self.cache_usage.cache_write_tokens
+        return total_input + self.output_tokens
 
     @computed_field
     @property
     def effective_input_tokens(self) -> int:
-        """Input tokens excluding cache reads (tokens actually processed)"""
-        return max(
-            0,
-            self.input_tokens
-            - self.cache_usage.cache_read_tokens
-            - self.cache_usage.cache_hit_tokens,
-        )
+        """Input tokens actually processed (new tokens, not from cache)"""
+        # For Anthropic: input_tokens already excludes cached content
+        # For other providers: subtract cache hits from input_tokens
+        if self.provider == Provider.ANTHROPIC:
+            return self.input_tokens
+        else:
+            return max(0, self.input_tokens - self.cache_usage.cache_hit_tokens)
+
+    @computed_field
+    @property
+    def display_input_tokens(self) -> int:
+        """Input tokens to display for 'Last turn' (total submitted tokens)"""
+        # For Anthropic: input_tokens excludes cache, so add cache tokens
+        if self.provider == Provider.ANTHROPIC:
+            return self.input_tokens + self.cache_usage.cache_read_tokens + self.cache_usage.cache_write_tokens
+        else:
+            # For OpenAI/Google: input_tokens already includes cached tokens
+            return self.input_tokens
 
     @classmethod
     def from_anthropic(cls, usage: AnthropicUsage, model: str) -> "TurnUsage":
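A worked example of the revised per-turn arithmetic, using made-up numbers: suppose an Anthropic turn reports input_tokens=500, cache_read_tokens=4000, cache_write_tokens=0 and output_tokens=300. Under the definitions above:

    total_input = 500 + 4000 + 0                  # uncached input + cache reads + cache writes
    current_context_tokens = total_input + 300    # 4800
    effective_input_tokens = 500                  # Anthropic input_tokens already exclude cached content
    display_input_tokens = 500 + 4000 + 0         # 4500 tokens actually submitted

For an OpenAI- or Google-style provider the cached portion is already counted inside input_tokens, so display_input_tokens is simply input_tokens and effective_input_tokens subtracts cache_hit_tokens instead.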
@@ -204,8 +217,11 @@ class UsageAccumulator(BaseModel):
     @computed_field
     @property
     def cumulative_input_tokens(self) -> int:
-        """Total input tokens charged across all turns"""
-        return sum(turn.input_tokens for turn in self.turns)
+        """Total input tokens charged across all turns (including cache tokens)"""
+        return sum(
+            turn.input_tokens + turn.cache_usage.cache_read_tokens + turn.cache_usage.cache_write_tokens
+            for turn in self.turns
+        )
 
     @computed_field
     @property
@@ -216,8 +232,8 @@ class UsageAccumulator(BaseModel):
     @computed_field
     @property
     def cumulative_billing_tokens(self) -> int:
-        """Total tokens charged across all turns"""
-        return sum(turn.total_tokens for turn in self.turns)
+        """Total tokens charged across all turns (including cache tokens)"""
+        return self.cumulative_input_tokens + self.cumulative_output_tokens
 
     @computed_field
     @property
@@ -258,11 +274,12 @@ class UsageAccumulator(BaseModel):
     @computed_field
     @property
     def cache_hit_rate(self) -> Optional[float]:
-        """Percentage of input tokens served from cache"""
-        if self.cumulative_input_tokens == 0:
-            return None
+        """Percentage of total input context served from cache"""
         cache_tokens = self.cumulative_cache_read_tokens + self.cumulative_cache_hit_tokens
-        return (cache_tokens / self.cumulative_input_tokens) * 100
+        total_input_context = self.cumulative_input_tokens + cache_tokens
+        if total_input_context == 0:
+            return None
+        return (cache_tokens / total_input_context) * 100
 
     @computed_field
     @property
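Applying the new formula with illustrative numbers: if the accumulated turns show cumulative_cache_read_tokens + cumulative_cache_hit_tokens = 4000 and cumulative_input_tokens = 1000, then total_input_context = 5000 and cache_hit_rate = 4000 / 5000 * 100 = 80.0. When nothing has been recorded at all, total_input_context is 0 and the property still returns None.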
@@ -117,3 +117,27 @@ class SamplingFilter(EventFilter):
         if not super().matches(event):
             return False
         return random.random() < self.sample_rate
+
+
+class StreamingExclusionFilter(EventFilter):
+    """
+    Event filter that excludes streaming progress events from logs.
+    This prevents token count updates from flooding the logs when info level is enabled.
+    """
+
+    def matches(self, event: Event) -> bool:
+        # First check if it passes the base filter
+        if not super().matches(event):
+            return False
+
+        # Exclude events with "Streaming progress" message
+        if event.message == "Streaming progress":
+            return False
+
+        # Also check for events with progress_action = STREAMING in data
+        if event.data and isinstance(event.data.get("data"), dict):
+            event_data = event.data["data"]
+            if event_data.get("progress_action") == "Streaming":
+                return False
+
+        return True
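The exclusion rule, restated as a standalone predicate for illustration (hypothetical helper, not part of the package): an event is suppressed when its message is exactly "Streaming progress" or when its nested data payload carries progress_action == "Streaming".

    from typing import Any, Mapping, Optional

    def is_streaming_progress(message: str, data: Optional[Mapping[str, Any]]) -> bool:
        # Mirrors StreamingExclusionFilter.matches, minus the base-filter check
        if message == "Streaming progress":
            return True
        inner = (data or {}).get("data")
        return isinstance(inner, dict) and inner.get("progress_action") == "Streaming"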
@@ -73,6 +73,7 @@ class RichProgressDisplay:
         ProgressAction.LOADED: "dim green",
         ProgressAction.INITIALIZED: "dim green",
         ProgressAction.CHATTING: "bold blue",
+        ProgressAction.STREAMING: "bold blue",  # Same color as chatting
         ProgressAction.ROUTING: "bold blue",
         ProgressAction.PLANNING: "bold blue",
         ProgressAction.READY: "dim green",
@@ -100,9 +101,16 @@ class RichProgressDisplay:
         task_id = self._taskmap[task_name]
 
         # Ensure no None values in the update
+        # For streaming, use custom description immediately to avoid flashing
+        if event.action == ProgressAction.STREAMING and event.streaming_tokens:
+            formatted_tokens = f"↓ {event.streaming_tokens.strip()}".ljust(15)
+            description = f"[{self._get_action_style(event.action)}]{formatted_tokens}"
+        else:
+            description = f"[{self._get_action_style(event.action)}]{event.action.value:<15}"
+
         self._progress.update(
             task_id,
-            description=f"[{self._get_action_style(event.action)}]{event.action.value:<15}",
+            description=description,
             target=event.target or task_name,  # Use task_name as fallback for target
             details=event.details or "",
             task_name=task_name,
@@ -21,7 +21,7 @@ from typing import (
     runtime_checkable,
 )
 
-from a2a_types.types import AgentCard
+from a2a.types import AgentCard
 from anyio.streams.memory import MemoryObjectReceiveStream, MemoryObjectSendStream
 from deprecated import deprecated
 from mcp import ClientSession