fast-agent-mcp 0.2.34__py3-none-any.whl → 0.2.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,8 @@ from mcp.types import (
     ImageContent,
     TextContent,
 )
-from openai import AuthenticationError, OpenAI
+from openai import AsyncOpenAI, AuthenticationError
+from openai.lib.streaming.chat import ChatCompletionStreamState
 
 # from openai.types.beta.chat import
 from openai.types.chat import (
@@ -22,6 +23,7 @@ from rich.text import Text
 
 from mcp_agent.core.exceptions import ProviderKeyError
 from mcp_agent.core.prompt import Prompt
+from mcp_agent.event_progress import ProgressAction
 from mcp_agent.llm.augmented_llm import (
     AugmentedLLM,
     RequestParams,
@@ -103,9 +105,9 @@ class OpenAIAugmentedLLM(AugmentedLLM[ChatCompletionMessageParam, ChatCompletion
     def _base_url(self) -> str:
         return self.context.config.openai.base_url if self.context.config.openai else None
 
-    def _openai_client(self) -> OpenAI:
+    def _openai_client(self) -> AsyncOpenAI:
         try:
-            return OpenAI(api_key=self._api_key(), base_url=self._base_url())
+            return AsyncOpenAI(api_key=self._api_key(), base_url=self._base_url())
         except AuthenticationError as e:
             raise ProviderKeyError(
                 "Invalid OpenAI API key",
@@ -113,6 +115,182 @@ class OpenAIAugmentedLLM(AugmentedLLM[ChatCompletionMessageParam, ChatCompletion
                 "Please check that your API key is valid and not expired.",
             ) from e
 
+    async def _process_stream(self, stream, model: str):
+        """Process the streaming response and display real-time token usage."""
+        # Track estimated output tokens by counting text chunks
+        estimated_tokens = 0
+
+        # For non-OpenAI providers (like Ollama), ChatCompletionStreamState might not work correctly
+        # Fall back to manual accumulation if needed
+        # TODO -- consider this and whether to subclass instead
+        if self.provider in [Provider.GENERIC, Provider.OPENROUTER]:
+            return await self._process_stream_manual(stream, model)
+
+        # Use ChatCompletionStreamState helper for accumulation (OpenAI only)
+        state = ChatCompletionStreamState()
+
+        # Process the stream chunks
+        async for chunk in stream:
+            # Handle chunk accumulation
+            state.handle_chunk(chunk)
+
+            # Count tokens in real-time from content deltas
+            if chunk.choices and chunk.choices[0].delta.content:
+                content = chunk.choices[0].delta.content
+                # Use base class method for token estimation and progress emission
+                estimated_tokens = self._update_streaming_progress(content, model, estimated_tokens)
+
+        # Get the final completion with usage data
+        final_completion = state.get_final_completion()
+
+        # Log final usage information
+        if hasattr(final_completion, "usage") and final_completion.usage:
+            actual_tokens = final_completion.usage.completion_tokens
+            # Emit final progress with actual token count
+            token_str = str(actual_tokens).rjust(5)
+            data = {
+                "progress_action": ProgressAction.STREAMING,
+                "model": model,
+                "agent_name": self.name,
+                "chat_turn": self.chat_turn(),
+                "details": token_str.strip(),
+            }
+            self.logger.info("Streaming progress", data=data)
+
+            self.logger.info(
+                f"Streaming complete - Model: {model}, Input tokens: {final_completion.usage.prompt_tokens}, Output tokens: {final_completion.usage.completion_tokens}"
+            )
+
+        return final_completion
+
+    # TODO - as per other comment this needs to go in another class. There are a number of "special" cases dealt with
+    # here to deal with OpenRouter idiosyncrasies between e.g. Anthropic and Gemini models.
+    async def _process_stream_manual(self, stream, model: str):
+        """Manual stream processing for providers like Ollama that may not work with ChatCompletionStreamState."""
+        from openai.types.chat import ChatCompletionMessageToolCall
+        from openai.types.chat.chat_completion_message_tool_call import Function
+
+        # Track estimated output tokens by counting text chunks
+        estimated_tokens = 0
+
+        # Manual accumulation of response data
+        accumulated_content = ""
+        role = "assistant"
+        tool_calls_map = {}  # Use a map to accumulate tool calls by index
+        function_call = None
+        finish_reason = None
+        usage_data = None
+
+        # Process the stream chunks manually
+        async for chunk in stream:
+            # Count tokens in real-time from content deltas
+            if chunk.choices and chunk.choices[0].delta.content:
+                content = chunk.choices[0].delta.content
+                accumulated_content += content
+                # Use base class method for token estimation and progress emission
+                estimated_tokens = self._update_streaming_progress(content, model, estimated_tokens)
+
+            # Extract other fields from the chunk
+            if chunk.choices:
+                choice = chunk.choices[0]
+                if choice.delta.role:
+                    role = choice.delta.role
+                if choice.delta.tool_calls:
+                    # Accumulate tool call deltas
+                    for delta_tool_call in choice.delta.tool_calls:
+                        if delta_tool_call.index is not None:
+                            if delta_tool_call.index not in tool_calls_map:
+                                tool_calls_map[delta_tool_call.index] = {
+                                    "id": delta_tool_call.id,
+                                    "type": delta_tool_call.type or "function",
+                                    "function": {
+                                        "name": delta_tool_call.function.name
+                                        if delta_tool_call.function
+                                        else None,
+                                        "arguments": "",
+                                    },
+                                }
+
+                            # Always update if we have new data (needed for OpenRouter Gemini)
+                            if delta_tool_call.id:
+                                tool_calls_map[delta_tool_call.index]["id"] = delta_tool_call.id
+                            if delta_tool_call.function:
+                                if delta_tool_call.function.name:
+                                    tool_calls_map[delta_tool_call.index]["function"]["name"] = (
+                                        delta_tool_call.function.name
+                                    )
+                                # Handle arguments - they might come as None, empty string, or actual content
+                                if delta_tool_call.function.arguments is not None:
+                                    tool_calls_map[delta_tool_call.index]["function"][
+                                        "arguments"
+                                    ] += delta_tool_call.function.arguments
+
+                if choice.delta.function_call:
+                    function_call = choice.delta.function_call
+                if choice.finish_reason:
+                    finish_reason = choice.finish_reason
+
+            # Extract usage data if available
+            if hasattr(chunk, "usage") and chunk.usage:
+                usage_data = chunk.usage
+
+        # Convert accumulated tool calls to proper format.
+        tool_calls = None
+        if tool_calls_map:
+            tool_calls = []
+            for idx in sorted(tool_calls_map.keys()):
+                tool_call_data = tool_calls_map[idx]
+                # Only add tool calls that have valid data
+                if tool_call_data["id"] and tool_call_data["function"]["name"]:
+                    tool_calls.append(
+                        ChatCompletionMessageToolCall(
+                            id=tool_call_data["id"],
+                            type=tool_call_data["type"],
+                            function=Function(
+                                name=tool_call_data["function"]["name"],
+                                arguments=tool_call_data["function"]["arguments"],
+                            ),
+                        )
+                    )
+
+        # Create a ChatCompletionMessage manually
+        message = ChatCompletionMessage(
+            content=accumulated_content,
+            role=role,
+            tool_calls=tool_calls if tool_calls else None,
+            function_call=function_call,
+            refusal=None,
+            annotations=None,
+            audio=None,
+        )
+
+        from types import SimpleNamespace
+
+        final_completion = SimpleNamespace()
+        final_completion.choices = [SimpleNamespace()]
+        final_completion.choices[0].message = message
+        final_completion.choices[0].finish_reason = finish_reason
+        final_completion.usage = usage_data
+
+        # Log final usage information
+        if usage_data:
+            actual_tokens = getattr(usage_data, "completion_tokens", estimated_tokens)
+            token_str = str(actual_tokens).rjust(5)
+            data = {
+                "progress_action": ProgressAction.STREAMING,
+                "model": model,
+                "agent_name": self.name,
+                "chat_turn": self.chat_turn(),
+                "details": token_str.strip(),
+            }
+            self.logger.info("Streaming progress", data=data)
+
+            self.logger.info(
+                f"Streaming complete - Model: {model}, Input tokens: {getattr(usage_data, 'prompt_tokens', 0)}, Output tokens: {actual_tokens}"
+            )
+
+        return final_completion
+
     async def _openai_completion(
         self,
         message: OpenAIMessage,
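The new _process_stream path leans on the SDK's ChatCompletionStreamState helper, which rebuilds a complete ChatCompletion from the incremental chunks. A minimal standalone sketch of that accumulation pattern (placeholder model and prompt; the package wires this through _openai_client() and _update_streaming_progress instead):

    from openai import AsyncOpenAI
    from openai.lib.streaming.chat import ChatCompletionStreamState

    async def accumulate_stream(client: AsyncOpenAI) -> None:
        state = ChatCompletionStreamState()
        stream = await client.chat.completions.create(
            model="gpt-4o-mini",  # placeholder model
            messages=[{"role": "user", "content": "hello"}],
            stream=True,
            stream_options={"include_usage": True},
        )
        async for chunk in stream:
            state.handle_chunk(chunk)  # fold each delta into the accumulated completion
        final = state.get_final_completion()
        print(final.choices[0].message.content)
        if final.usage:
            print(final.usage.prompt_tokens, final.usage.completion_tokens)

The manual _process_stream_manual fallback exists because, per the comments above, some OpenAI-compatible backends (Ollama via the generic provider, OpenRouter) emit chunks that the helper does not always reassemble correctly, so the message and tool calls are rebuilt by hand.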
@@ -151,7 +329,10 @@ class OpenAIAugmentedLLM(AugmentedLLM[ChatCompletionMessageParam, ChatCompletion
         ]
 
         if not available_tools:
-            available_tools = None  # deepseek does not allow empty array
+            if self.provider == Provider.DEEPSEEK:
+                available_tools = None  # deepseek does not allow empty array
+            else:
+                available_tools = []
 
         # we do NOT send "stop sequences" as this causes errors with mutlimodal processing
         for i in range(request_params.max_iterations):
@@ -160,11 +341,10 @@ class OpenAIAugmentedLLM(AugmentedLLM[ChatCompletionMessageParam, ChatCompletion
 
             self._log_chat_progress(self.chat_turn(), model=self.default_request_params.model)
 
-            executor_result = await self.executor.execute(
-                self._openai_client().chat.completions.create, **arguments
-            )
-
-            response = executor_result[0]
+            # Use basic streaming API
+            stream = await self._openai_client().chat.completions.create(**arguments)
+            # Process the stream
+            response = await self._process_stream(stream, self.default_request_params.model)
 
             # Track usage if response is valid and has usage data
             if (
@@ -204,10 +384,11 @@ class OpenAIAugmentedLLM(AugmentedLLM[ChatCompletionMessageParam, ChatCompletion
             if message.content:
                 responses.append(TextContent(type="text", text=message.content))
 
-            converted_message = self.convert_message_to_message_param(message)
-            messages.append(converted_message)
+            # ParsedChatCompletionMessage is compatible with ChatCompletionMessage
+            # since it inherits from it, so we can use it directly
+            messages.append(message)
 
-            message_text = converted_message.content
+            message_text = message.content
             if choice.finish_reason in ["tool_calls", "function_call"] and message.tool_calls:
                 if message_text:
                     await self.show_assistant_message(
@@ -347,6 +528,8 @@ class OpenAIAugmentedLLM(AugmentedLLM[ChatCompletionMessageParam, ChatCompletion
             "model": self.default_request_params.model,
             "messages": messages,
             "tools": tools,
+            "stream": True,  # Enable basic streaming
+            "stream_options": {"include_usage": True},  # Required for usage data in streaming
         }
 
         if self._reasoning:
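With stream=True alone, OpenAI's API does not report token usage; stream_options={"include_usage": True} requests one extra terminal chunk whose usage field is populated and whose choices list is empty, which is what _process_stream reads for the final counts. A small sketch of consuming such a stream (illustrative helper, not package code):

    from openai import AsyncStream
    from openai.types.chat import ChatCompletionChunk

    async def read_stream(stream: AsyncStream[ChatCompletionChunk]) -> None:
        async for chunk in stream:
            if chunk.choices and chunk.choices[0].delta.content:
                print(chunk.choices[0].delta.content, end="")
            if chunk.usage is not None:  # only set on the final usage-only chunk
                print(f"\nprompt={chunk.usage.prompt_tokens} completion={chunk.usage.completion_tokens}")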
@@ -360,7 +360,7 @@ class OpenAIConverter:
             return {
                 "role": "tool",
                 "tool_call_id": tool_call_id,
-                "content": "[No content in tool result]",
+                "content": "[Tool completed successfully]",
             }
 
         # Separate text and non-text content
@@ -387,8 +387,9 @@ class OpenAIConverter:
                 converted.get("content", "")
             )
 
-        if not tool_message_content:
-            tool_message_content = "[Tool returned non-text content]"
+        # Ensure we always have non-empty content for compatibility
+        if not tool_message_content or tool_message_content.strip() == "":
+            tool_message_content = "[Tool completed successfully]"
 
         # Create the tool message with just the text
         tool_message = {
@@ -84,19 +84,32 @@ class TurnUsage(BaseModel):
     @computed_field
     @property
     def current_context_tokens(self) -> int:
-        """Current context size after this turn (input + output)"""
-        return self.input_tokens + self.output_tokens
+        """Current context size after this turn (total input including cache + output)"""
+        # For Anthropic: input_tokens + cache_read_tokens represents total input context
+        total_input = self.input_tokens + self.cache_usage.cache_read_tokens + self.cache_usage.cache_write_tokens
+        return total_input + self.output_tokens
 
     @computed_field
     @property
     def effective_input_tokens(self) -> int:
-        """Input tokens excluding cache reads (tokens actually processed)"""
-        return max(
-            0,
-            self.input_tokens
-            - self.cache_usage.cache_read_tokens
-            - self.cache_usage.cache_hit_tokens,
-        )
+        """Input tokens actually processed (new tokens, not from cache)"""
+        # For Anthropic: input_tokens already excludes cached content
+        # For other providers: subtract cache hits from input_tokens
+        if self.provider == Provider.ANTHROPIC:
+            return self.input_tokens
+        else:
+            return max(0, self.input_tokens - self.cache_usage.cache_hit_tokens)
+
+    @computed_field
+    @property
+    def display_input_tokens(self) -> int:
+        """Input tokens to display for 'Last turn' (total submitted tokens)"""
+        # For Anthropic: input_tokens excludes cache, so add cache tokens
+        if self.provider == Provider.ANTHROPIC:
+            return self.input_tokens + self.cache_usage.cache_read_tokens + self.cache_usage.cache_write_tokens
+        else:
+            # For OpenAI/Google: input_tokens already includes cached tokens
+            return self.input_tokens
 
     @classmethod
     def from_anthropic(cls, usage: AnthropicUsage, model: str) -> "TurnUsage":
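A worked example of the revised per-turn arithmetic, using made-up numbers: suppose an Anthropic turn reports input_tokens=500, cache_read_tokens=4000, cache_write_tokens=0 and output_tokens=300. Under the definitions above:

    total_input = 500 + 4000 + 0                  # uncached input + cache reads + cache writes
    current_context_tokens = total_input + 300    # 4800
    effective_input_tokens = 500                  # Anthropic input_tokens already exclude cached content
    display_input_tokens = 500 + 4000 + 0         # 4500 tokens actually submitted

For an OpenAI- or Google-style provider the cached portion is already counted inside input_tokens, so display_input_tokens is simply input_tokens and effective_input_tokens subtracts cache_hit_tokens instead.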
@@ -204,8 +217,11 @@ class UsageAccumulator(BaseModel):
     @computed_field
     @property
     def cumulative_input_tokens(self) -> int:
-        """Total input tokens charged across all turns"""
-        return sum(turn.input_tokens for turn in self.turns)
+        """Total input tokens charged across all turns (including cache tokens)"""
+        return sum(
+            turn.input_tokens + turn.cache_usage.cache_read_tokens + turn.cache_usage.cache_write_tokens
+            for turn in self.turns
+        )
 
     @computed_field
     @property
@@ -216,8 +232,8 @@ class UsageAccumulator(BaseModel):
     @computed_field
     @property
     def cumulative_billing_tokens(self) -> int:
-        """Total tokens charged across all turns"""
-        return sum(turn.total_tokens for turn in self.turns)
+        """Total tokens charged across all turns (including cache tokens)"""
+        return self.cumulative_input_tokens + self.cumulative_output_tokens
 
     @computed_field
     @property
@@ -258,11 +274,12 @@ class UsageAccumulator(BaseModel):
     @computed_field
     @property
     def cache_hit_rate(self) -> Optional[float]:
-        """Percentage of input tokens served from cache"""
-        if self.cumulative_input_tokens == 0:
-            return None
+        """Percentage of total input context served from cache"""
         cache_tokens = self.cumulative_cache_read_tokens + self.cumulative_cache_hit_tokens
-        return (cache_tokens / self.cumulative_input_tokens) * 100
+        total_input_context = self.cumulative_input_tokens + cache_tokens
+        if total_input_context == 0:
+            return None
+        return (cache_tokens / total_input_context) * 100
 
     @computed_field
     @property
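Applying the new formula with illustrative numbers: if the accumulated turns show cumulative_cache_read_tokens + cumulative_cache_hit_tokens = 4000 and cumulative_input_tokens = 1000, then total_input_context = 5000 and cache_hit_rate = 4000 / 5000 * 100 = 80.0. When nothing has been recorded at all, total_input_context is 0 and the property still returns None.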
@@ -117,3 +117,27 @@ class SamplingFilter(EventFilter):
         if not super().matches(event):
             return False
         return random.random() < self.sample_rate
+
+
+class StreamingExclusionFilter(EventFilter):
+    """
+    Event filter that excludes streaming progress events from logs.
+    This prevents token count updates from flooding the logs when info level is enabled.
+    """
+
+    def matches(self, event: Event) -> bool:
+        # First check if it passes the base filter
+        if not super().matches(event):
+            return False
+
+        # Exclude events with "Streaming progress" message
+        if event.message == "Streaming progress":
+            return False
+
+        # Also check for events with progress_action = STREAMING in data
+        if event.data and isinstance(event.data.get("data"), dict):
+            event_data = event.data["data"]
+            if event_data.get("progress_action") == "Streaming":
+                return False
+
+        return True
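The exclusion rule, restated as a standalone predicate for illustration (hypothetical helper, not part of the package): an event is suppressed when its message is exactly "Streaming progress" or when its nested data payload carries progress_action == "Streaming".

    from typing import Any, Mapping, Optional

    def is_streaming_progress(message: str, data: Optional[Mapping[str, Any]]) -> bool:
        # Mirrors StreamingExclusionFilter.matches, minus the base-filter check
        if message == "Streaming progress":
            return True
        inner = (data or {}).get("data")
        return isinstance(inner, dict) and inner.get("progress_action") == "Streaming"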
@@ -73,6 +73,7 @@ class RichProgressDisplay:
         ProgressAction.LOADED: "dim green",
         ProgressAction.INITIALIZED: "dim green",
         ProgressAction.CHATTING: "bold blue",
+        ProgressAction.STREAMING: "bold blue",  # Same color as chatting
         ProgressAction.ROUTING: "bold blue",
         ProgressAction.PLANNING: "bold blue",
         ProgressAction.READY: "dim green",
@@ -100,9 +101,16 @@ class RichProgressDisplay:
         task_id = self._taskmap[task_name]
 
         # Ensure no None values in the update
+        # For streaming, use custom description immediately to avoid flashing
+        if event.action == ProgressAction.STREAMING and event.streaming_tokens:
+            formatted_tokens = f"↓ {event.streaming_tokens.strip()}".ljust(15)
+            description = f"[{self._get_action_style(event.action)}]{formatted_tokens}"
+        else:
+            description = f"[{self._get_action_style(event.action)}]{event.action.value:<15}"
+
         self._progress.update(
             task_id,
-            description=f"[{self._get_action_style(event.action)}]{event.action.value:<15}",
+            description=description,
             target=event.target or task_name,  # Use task_name as fallback for target
             details=event.details or "",
             task_name=task_name,
@@ -21,7 +21,7 @@ from typing import (
     runtime_checkable,
 )
 
-from a2a_types.types import AgentCard
+from a2a.types import AgentCard
 from anyio.streams.memory import MemoryObjectReceiveStream, MemoryObjectSendStream
 from deprecated import deprecated
 from mcp import ClientSession