fast-agent-mcp 0.2.34__py3-none-any.whl → 0.2.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -97,6 +97,7 @@ class AugmentedLLM(ContextDependent, AugmentedLLMProtocol, Generic[MessageParamT
     PARAM_USE_HISTORY = "use_history"
     PARAM_MAX_ITERATIONS = "max_iterations"
     PARAM_TEMPLATE_VARS = "template_vars"
+
     # Base set of fields that should always be excluded
     BASE_EXCLUDE_FIELDS = {PARAM_METADATA}
 
@@ -371,16 +372,28 @@ class AugmentedLLM(ContextDependent, AugmentedLLMProtocol, Generic[MessageParamT
         # Start with base arguments
         arguments = base_args.copy()
 
-        # Use provided exclude_fields or fall back to base exclusions
-        exclude_fields = exclude_fields or self.BASE_EXCLUDE_FIELDS.copy()
+        # Combine base exclusions with provider-specific exclusions
+        final_exclude_fields = self.BASE_EXCLUDE_FIELDS.copy()
+        if exclude_fields:
+            final_exclude_fields.update(exclude_fields)
 
         # Add all fields from params that aren't explicitly excluded
-        params_dict = request_params.model_dump(exclude=exclude_fields)
+        # Ensure model_dump only includes set fields if that's the desired behavior,
+        # or adjust exclude_unset=True/False as needed.
+        # Default Pydantic v2 model_dump is exclude_unset=False
+        params_dict = request_params.model_dump(exclude=final_exclude_fields)
+
         for key, value in params_dict.items():
+            # Only add if not None and not already in base_args (base_args take precedence)
+            # or if None is a valid value for the provider, this logic might need adjustment.
             if value is not None and key not in arguments:
                 arguments[key] = value
+            elif value is not None and key in arguments and arguments[key] is None:
+                # Allow overriding a None in base_args with a set value from params
+                arguments[key] = value
 
         # Finally, add any metadata fields as a last layer of overrides
+        # This ensures metadata can override anything previously set if keys conflict.
         if request_params.metadata:
             arguments.update(request_params.metadata)
 
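For context, the layering above resolves provider arguments in three passes: explicit `base_args` win, non-None request parameters fill the gaps (and may replace a None placeholder), and `metadata` is applied last as an override. Below is a minimal standalone sketch of that precedence, using a hypothetical `DemoRequestParams` model and `merge_arguments` helper rather than the package's real classes:

```python
from typing import Any, Dict, Optional, Set

from pydantic import BaseModel


class DemoRequestParams(BaseModel):
    """Stand-in for the real RequestParams model (illustration only)."""
    model: Optional[str] = None
    maxTokens: Optional[int] = None
    temperature: Optional[float] = None
    metadata: Optional[Dict[str, Any]] = None


def merge_arguments(
    base_args: Dict[str, Any],
    request_params: DemoRequestParams,
    exclude_fields: Optional[Set[str]] = None,
) -> Dict[str, Any]:
    arguments = base_args.copy()

    # Base exclusions plus any provider-specific ones
    final_exclude = {"metadata"}
    if exclude_fields:
        final_exclude.update(exclude_fields)

    params_dict = request_params.model_dump(exclude=final_exclude)
    for key, value in params_dict.items():
        if value is not None and key not in arguments:
            arguments[key] = value
        elif value is not None and key in arguments and arguments[key] is None:
            # A concrete value from params may replace a None placeholder
            arguments[key] = value

    # Metadata is the last layer and wins on key conflicts
    if request_params.metadata:
        arguments.update(request_params.metadata)
    return arguments


print(merge_arguments(
    {"model": "claude-sonnet-4-0", "temperature": None},
    DemoRequestParams(temperature=0.2, maxTokens=1024, metadata={"top_p": 0.9}),
))
# -> {'model': 'claude-sonnet-4-0', 'temperature': 0.2, 'maxTokens': 1024, 'top_p': 0.9}
```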
@@ -541,6 +554,37 @@ class AugmentedLLM(ContextDependent, AugmentedLLMProtocol, Generic[MessageParamT
         }
         self.logger.debug("Chat in progress", data=data)
 
+    def _update_streaming_progress(self, content: str, model: str, estimated_tokens: int) -> int:
+        """Update streaming progress with token estimation and formatting.
+
+        Args:
+            content: The text content from the streaming event
+            model: The model name
+            estimated_tokens: Current token count to update
+
+        Returns:
+            Updated estimated token count
+        """
+        # Rough estimate: 1 token per 4 characters (OpenAI's typical ratio)
+        text_length = len(content)
+        additional_tokens = max(1, text_length // 4)
+        new_total = estimated_tokens + additional_tokens
+
+        # Format token count for display
+        token_str = str(new_total).rjust(5)
+
+        # Emit progress event
+        data = {
+            "progress_action": ProgressAction.STREAMING,
+            "model": model,
+            "agent_name": self.name,
+            "chat_turn": self.chat_turn(),
+            "details": token_str.strip(),  # Token count goes in details for STREAMING action
+        }
+        self.logger.info("Streaming progress", data=data)
+
+        return new_total
+
     def _log_chat_finished(self, model: Optional[str] = None) -> None:
         """Log a chat finished event"""
         data = {
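The helper above estimates roughly one token per four characters of streamed text, clamped to at least one token per chunk; the provider's final usage report then supplies the exact count. A small sketch of the same arithmetic (the chunk strings are invented):

```python
def estimate_tokens(chunks: list[str]) -> int:
    """Accumulate the same rough estimate used above: max(1, len(chunk) // 4)."""
    total = 0
    for chunk in chunks:
        total += max(1, len(chunk) // 4)
    return total


print(estimate_tokens(["Hello, ", "world! ", "Streaming keeps the progress bar honest."]))
# 1 + 1 + 10 = 12 estimated tokens; the final usage data replaces this estimate
```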
mcp_agent/llm/memory.py CHANGED
@@ -35,6 +35,9 @@ class SimpleMemory(Memory, Generic[MessageParamT]):
     def __init__(self) -> None:
         self.history: List[MessageParamT] = []
         self.prompt_messages: List[MessageParamT] = []  # Always included
+        self.conversation_cache_positions: List[int] = []  # Track active conversation cache positions
+        self.cache_walk_distance: int = 6  # Messages between cache blocks
+        self.max_conversation_cache_blocks: int = 2  # Maximum conversation cache blocks
 
     def extend(self, messages: List[MessageParamT], is_prompt: bool = False) -> None:
         """
@@ -99,5 +102,122 @@ class SimpleMemory(Memory, Generic[MessageParamT]):
             clear_prompts: If True, also clear prompt messages
         """
         self.history = []
+        self.conversation_cache_positions = []  # Reset cache positions
         if clear_prompts:
             self.prompt_messages = []
+
+    def should_apply_conversation_cache(self) -> bool:
+        """
+        Determine if conversation caching should be applied based on walking algorithm.
+
+        Returns:
+            True if we should add or update cache blocks
+        """
+        total_messages = len(self.history)
+
+        # Need at least cache_walk_distance messages to start caching
+        if total_messages < self.cache_walk_distance:
+            return False
+
+        # Check if we need to add a new cache block
+        return len(self._calculate_cache_positions(total_messages)) != len(self.conversation_cache_positions)
+
+    def _calculate_cache_positions(self, total_conversation_messages: int) -> List[int]:
+        """
+        Calculate where cache blocks should be placed using walking algorithm.
+
+        Args:
+            total_conversation_messages: Number of conversation messages (not including prompts)
+
+        Returns:
+            List of positions (relative to conversation start) where cache should be placed
+        """
+        positions = []
+
+        # Place cache blocks every cache_walk_distance messages
+        for i in range(self.cache_walk_distance - 1, total_conversation_messages, self.cache_walk_distance):
+            positions.append(i)
+            if len(positions) >= self.max_conversation_cache_blocks:
+                break
+
+        # Keep only the most recent cache blocks (walking behavior)
+        if len(positions) > self.max_conversation_cache_blocks:
+            positions = positions[-self.max_conversation_cache_blocks:]
+
+        return positions
+
+    def get_conversation_cache_updates(self) -> dict:
+        """
+        Get cache position updates needed for the walking algorithm.
+
+        Returns:
+            Dict with 'add', 'remove', and 'active' position lists (relative to full message array)
+        """
+        total_conversation_messages = len(self.history)
+        new_positions = self._calculate_cache_positions(total_conversation_messages)
+
+        # Convert to absolute positions (including prompt messages)
+        prompt_offset = len(self.prompt_messages)
+        new_absolute_positions = [pos + prompt_offset for pos in new_positions]
+
+        old_positions_set = set(self.conversation_cache_positions)
+        new_positions_set = set(new_absolute_positions)
+
+        return {
+            'add': sorted(new_positions_set - old_positions_set),
+            'remove': sorted(old_positions_set - new_positions_set),
+            'active': sorted(new_absolute_positions)
+        }
+
+    def apply_conversation_cache_updates(self, updates: dict) -> None:
+        """
+        Apply cache position updates.
+
+        Args:
+            updates: Dict from get_conversation_cache_updates()
+        """
+        self.conversation_cache_positions = updates['active'].copy()
+
+    def remove_cache_control_from_messages(self, messages: List[MessageParamT], positions: List[int]) -> None:
+        """
+        Remove cache control from specified message positions.
+
+        Args:
+            messages: The message array to modify
+            positions: List of positions to remove cache control from
+        """
+        for pos in positions:
+            if pos < len(messages):
+                message = messages[pos]
+                if isinstance(message, dict) and "content" in message:
+                    content_list = message["content"]
+                    if isinstance(content_list, list):
+                        for content_block in content_list:
+                            if isinstance(content_block, dict) and "cache_control" in content_block:
+                                del content_block["cache_control"]
+
+    def add_cache_control_to_messages(self, messages: List[MessageParamT], positions: List[int]) -> int:
+        """
+        Add cache control to specified message positions.
+
+        Args:
+            messages: The message array to modify
+            positions: List of positions to add cache control to
+
+        Returns:
+            Number of cache blocks successfully applied
+        """
+        applied_count = 0
+        for pos in positions:
+            if pos < len(messages):
+                message = messages[pos]
+                if isinstance(message, dict) and "content" in message:
+                    content_list = message["content"]
+                    if isinstance(content_list, list) and content_list:
+                        # Apply cache control to the last content block
+                        for content_block in reversed(content_list):
+                            if isinstance(content_block, dict):
+                                content_block["cache_control"] = {"type": "ephemeral"}
+                                applied_count += 1
+                                break
+        return applied_count
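To see how the walking algorithm behaves with the defaults above (`cache_walk_distance=6`, `max_conversation_cache_blocks=2`), here is a standalone sketch of `_calculate_cache_positions` with the instance attributes turned into parameters; the resulting positions are later offset by the number of prompt messages. Note that the early `break` keeps the earliest qualifying positions once the cap is reached:

```python
def calculate_cache_positions(total_messages: int, walk_distance: int = 6, max_blocks: int = 2) -> list[int]:
    """Behavioral mirror of _calculate_cache_positions above, with defaults as parameters."""
    positions = []
    for i in range(walk_distance - 1, total_messages, walk_distance):
        positions.append(i)
        if len(positions) >= max_blocks:
            break
    return positions[-max_blocks:]


for n in (5, 6, 12, 20):
    print(n, calculate_cache_positions(n))
# 5 []        -> fewer than walk_distance messages, no caching yet
# 6 [5]       -> first conversation cache block
# 12 [5, 11]  -> second block added
# 20 [5, 11]  -> capped at max_blocks by the break
```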
@@ -109,11 +109,11 @@ class ModelDatabase:
 
     # TODO update to 32000
     ANTHROPIC_OPUS_4_VERSIONED = ModelParameters(
-        context_window=200000, max_output_tokens=16384, tokenizes=ANTHROPIC_MULTIMODAL
+        context_window=200000, max_output_tokens=32000, tokenizes=ANTHROPIC_MULTIMODAL
     )
     # TODO update to 64000
     ANTHROPIC_SONNET_4_VERSIONED = ModelParameters(
-        context_window=200000, max_output_tokens=16384, tokenizes=ANTHROPIC_MULTIMODAL
+        context_window=200000, max_output_tokens=64000, tokenizes=ANTHROPIC_MULTIMODAL
     )
 
     DEEPSEEK_CHAT_STANDARD = ModelParameters(
@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, List, Tuple, Type
 from mcp.types import EmbeddedResource, ImageContent, TextContent
 
 from mcp_agent.core.prompt import Prompt
+from mcp_agent.event_progress import ProgressAction
 from mcp_agent.llm.provider_types import Provider
 from mcp_agent.llm.providers.multipart_converter_anthropic import (
     AnthropicConverter,
@@ -18,7 +19,8 @@ if TYPE_CHECKING:
     from mcp import ListToolsResult
 
 
-from anthropic import Anthropic, AuthenticationError
+from anthropic import AsyncAnthropic, AuthenticationError
+from anthropic.lib.streaming import AsyncMessageStream
 from anthropic.types import (
     Message,
     MessageParam,
@@ -78,17 +80,69 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
         """Initialize Anthropic-specific default parameters"""
         # Get base defaults from parent (includes ModelDatabase lookup)
         base_params = super()._initialize_default_params(kwargs)
-
+
         # Override with Anthropic-specific settings
         chosen_model = kwargs.get("model", DEFAULT_ANTHROPIC_MODEL)
        base_params.model = chosen_model
-
+
         return base_params
 
     def _base_url(self) -> str | None:
         assert self.context.config
         return self.context.config.anthropic.base_url if self.context.config.anthropic else None
 
+    def _get_cache_mode(self) -> str:
+        """Get the cache mode configuration."""
+        cache_mode = "auto"  # Default to auto
+        if self.context.config and self.context.config.anthropic:
+            cache_mode = self.context.config.anthropic.cache_mode
+        return cache_mode
+
+    async def _process_stream(self, stream: AsyncMessageStream, model: str) -> Message:
+        """Process the streaming response and display real-time token usage."""
+        # Track estimated output tokens by counting text chunks
+        estimated_tokens = 0
+
+        # Process the raw event stream to get token counts
+        async for event in stream:
+            # Count tokens in real-time from content_block_delta events
+            if (
+                event.type == "content_block_delta"
+                and hasattr(event, "delta")
+                and event.delta.type == "text_delta"
+            ):
+                # Use base class method for token estimation and progress emission
+                estimated_tokens = self._update_streaming_progress(event.delta.text, model, estimated_tokens)
+
+            # Also check for final message_delta events with actual usage info
+            elif (
+                event.type == "message_delta"
+                and hasattr(event, "usage")
+                and event.usage.output_tokens
+            ):
+                actual_tokens = event.usage.output_tokens
+                # Emit final progress with actual token count
+                token_str = str(actual_tokens).rjust(5)
+                data = {
+                    "progress_action": ProgressAction.STREAMING,
+                    "model": model,
+                    "agent_name": self.name,
+                    "chat_turn": self.chat_turn(),
+                    "details": token_str.strip(),
+                }
+                self.logger.info("Streaming progress", data=data)
+
+        # Get the final message with complete usage data
+        message = await stream.get_final_message()
+
+        # Log final usage information
+        if hasattr(message, "usage") and message.usage:
+            self.logger.info(
+                f"Streaming complete - Model: {model}, Input tokens: {message.usage.input_tokens}, Output tokens: {message.usage.output_tokens}"
+            )
+
+        return message
+
     async def _anthropic_completion(
         self,
         message_param,
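For readers unfamiliar with the SDK pattern `_process_stream` consumes, here is a minimal standalone sketch of streaming with `AsyncAnthropic` (assumes `ANTHROPIC_API_KEY` is set in the environment; the model name and prompt are illustrative, not taken from this package):

```python
import asyncio

from anthropic import AsyncAnthropic


async def main() -> None:
    client = AsyncAnthropic()
    async with client.messages.stream(
        model="claude-sonnet-4-0",  # illustrative model id
        max_tokens=256,
        messages=[{"role": "user", "content": "Say hello."}],
    ) as stream:
        async for event in stream:
            if event.type == "content_block_delta" and event.delta.type == "text_delta":
                print(event.delta.text, end="", flush=True)
        # Complete Message object, including usage, once the stream is done
        message = await stream.get_final_message()
    print(f"\noutput tokens: {message.usage.output_tokens}")


asyncio.run(main())
```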
@@ -105,7 +159,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
             base_url = base_url.rstrip("/v1")
 
         try:
-            anthropic = Anthropic(api_key=api_key, base_url=base_url)
+            anthropic = AsyncAnthropic(api_key=api_key, base_url=base_url)
             messages: List[MessageParam] = []
             params = self.get_request_params(request_params)
         except AuthenticationError as e:
@@ -118,7 +172,11 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
         # if use_history is True
         messages.extend(self.history.get(include_completion_history=params.use_history))
 
-        messages.append(message_param)
+        messages.append(message_param)  # message_param is the current user turn
+
+        # Get cache mode configuration
+        cache_mode = self._get_cache_mode()
+        self.logger.debug(f"Anthropic cache_mode: {cache_mode}")
 
         tool_list: ListToolsResult = await self.aggregator.list_tools()
         available_tools: List[ToolParam] = [
@@ -134,8 +192,11 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
         model = self.default_request_params.model
 
+        # Note: We'll cache tools+system together by putting cache_control only on system prompt
+
         for i in range(params.max_iterations):
             self._log_chat_progress(self.chat_turn(), model=model)
+
             # Create base arguments dictionary
             base_args = {
                 "model": model,
@@ -145,6 +206,60 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                 "tools": available_tools,
             }
 
+            # Apply cache_control to system prompt if cache_mode is not "off"
+            # This caches both tools and system prompt together in one cache block
+            if cache_mode != "off" and base_args["system"]:
+                if isinstance(base_args["system"], str):
+                    base_args["system"] = [
+                        {
+                            "type": "text",
+                            "text": base_args["system"],
+                            "cache_control": {"type": "ephemeral"},
+                        }
+                    ]
+                    self.logger.debug(
+                        "Applied cache_control to system prompt (caches tools+system in one block)"
+                    )
+                else:
+                    self.logger.debug(f"System prompt is not a string: {type(base_args['system'])}")
+
+            # Apply conversation caching using walking algorithm if in auto mode
+            if cache_mode == "auto" and self.history.should_apply_conversation_cache():
+                cache_updates = self.history.get_conversation_cache_updates()
+
+                # Remove cache control from old positions
+                if cache_updates["remove"]:
+                    self.history.remove_cache_control_from_messages(
+                        messages, cache_updates["remove"]
+                    )
+                    self.logger.debug(
+                        f"Removed conversation cache_control from positions {cache_updates['remove']}"
+                    )
+
+                # Add cache control to new positions
+                if cache_updates["add"]:
+                    applied_count = self.history.add_cache_control_to_messages(
+                        messages, cache_updates["add"]
+                    )
+                    if applied_count > 0:
+                        self.history.apply_conversation_cache_updates(cache_updates)
+                        self.logger.debug(
+                            f"Applied conversation cache_control to positions {cache_updates['add']} ({applied_count} blocks)"
+                        )
+
+                        # Verify we don't exceed Anthropic's 4 cache block limit
+                        total_cache_blocks = applied_count
+                        if cache_mode != "off" and base_args["system"]:
+                            total_cache_blocks += 1  # tools+system cache block
+                        if total_cache_blocks > 4:
+                            self.logger.warning(
+                                f"Total cache blocks ({total_cache_blocks}) exceeds Anthropic limit of 4"
+                            )
+                    else:
+                        self.logger.debug(
+                            f"Failed to apply conversation cache_control to positions {cache_updates['add']}"
+                        )
+
             if params.maxTokens is not None:
                 base_args["max_tokens"] = params.maxTokens
 
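To make the cache budget concrete: under the defaults shown earlier, the worst case is one tools+system block plus two conversation blocks, which stays under Anthropic's four-block limit. A hedged sketch of what `base_args["system"]` looks like after the branch above runs (the prompt text is illustrative):

```python
system_with_cache = [
    {
        "type": "text",
        "text": "You are a careful coding agent.",  # illustrative system prompt
        "cache_control": {"type": "ephemeral"},  # caches tools + system in one block
    }
]

# Worst case with the defaults above:
# 1 (tools+system) + 2 (max_conversation_cache_blocks) = 3 cache blocks <= 4
```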
@@ -155,9 +270,10 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
             self.logger.debug(f"{arguments}")
 
-            executor_result = await self.executor.execute(anthropic.messages.create, **arguments)
-
-            response = executor_result[0]
+            # Use streaming API with helper
+            async with anthropic.messages.stream(**arguments) as stream:
+                # Process the stream
+                response = await self._process_stream(stream, model)
 
             # Track usage if response is valid and has usage data
             if (
@@ -170,27 +286,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                         response.usage, model or DEFAULT_ANTHROPIC_MODEL
                     )
                     self.usage_accumulator.add_turn(turn_usage)
-
-                    # # Print raw usage for debugging
-                    # print(f"\n=== USAGE DEBUG ({model}) ===")
-                    # print(f"Raw usage: {response.usage}")
-                    # print(
-                    #     f"Turn usage: input={turn_usage.input_tokens}, output={turn_usage.output_tokens}, current_context={turn_usage.current_context_tokens}"
-                    # )
-                    # print(
-                    #     f"Cache: read={turn_usage.cache_usage.cache_read_tokens}, write={turn_usage.cache_usage.cache_write_tokens}"
-                    # )
-                    # print(f"Effective input: {turn_usage.effective_input_tokens}")
-                    # print(
-                    #     f"Accumulator: total_turns={self.usage_accumulator.turn_count}, cumulative_billing={self.usage_accumulator.cumulative_billing_tokens}, current_context={self.usage_accumulator.current_context_tokens}"
-                    # )
-                    # if self.usage_accumulator.context_usage_percentage:
-                    #     print(
-                    #         f"Context usage: {self.usage_accumulator.context_usage_percentage:.1f}% of {self.usage_accumulator.context_window_size}"
-                    #     )
-                    # if self.usage_accumulator.cache_hit_rate:
-                    #     print(f"Cache hit rate: {self.usage_accumulator.cache_hit_rate:.1f}%")
-                    # print("===========================\n")
+                    # self._show_usage(response.usage, turn_usage)
                 except Exception as e:
                     self.logger.warning(f"Failed to track usage: {e}")
 
@@ -201,7 +297,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                 ) from response
             elif isinstance(response, BaseException):
                 error_details = str(response)
-                self.logger.error(f"Error: {error_details}", data=executor_result)
+                self.logger.error(f"Error: {error_details}", data=response)
 
                 # Try to extract more useful information for API errors
                 if hasattr(response, "status_code") and hasattr(response, "response"):
@@ -214,13 +310,13 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                 # Convert other errors to text response
                 error_message = f"Error during generation: {error_details}"
                 response = Message(
-                    id="error",  # Required field
-                    model="error",  # Required field
+                    id="error",
+                    model="error",
                     role="assistant",
                     type="message",
                     content=[TextBlock(type="text", text=error_message)],
-                    stop_reason="end_turn",  # Must be one of the allowed values
-                    usage=Usage(input_tokens=0, output_tokens=0),  # Required field
+                    stop_reason="end_turn",
+                    usage=Usage(input_tokens=0, output_tokens=0),
                 )
 
                 self.logger.debug(
@@ -230,7 +326,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
             response_as_message = self.convert_message_to_message_param(response)
             messages.append(response_as_message)
-            if response.content[0].type == "text":
+            if response.content and response.content[0].type == "text":
                 responses.append(TextContent(type="text", text=response.content[0].text))
 
             if response.stop_reason == "end_turn":
@@ -290,12 +386,13 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
                 # Process all tool calls and collect results
                 tool_results = []
-                for i, content in enumerate(tool_uses):
-                    tool_name = content.name
-                    tool_args = content.input
-                    tool_use_id = content.id
+                # Use a different loop variable for tool enumeration if 'i' is outer loop counter
+                for tool_idx, content_block in enumerate(tool_uses):
+                    tool_name = content_block.name
+                    tool_args = content_block.input
+                    tool_use_id = content_block.id
 
-                    if i == 0:  # Only show message for first tool use
+                    if tool_idx == 0:  # Only show message for first tool use
                         await self.show_assistant_message(message_text, tool_name)
 
                     self.show_tool_call(available_tools, tool_name, tool_args)
@@ -320,11 +417,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
         if params.use_history:
             # Get current prompt messages
             prompt_messages = self.history.get(include_completion_history=False)
-
-            # Calculate new conversation messages (excluding prompts)
             new_messages = messages[len(prompt_messages) :]
-
-            # Update conversation history
             self.history.set(new_messages)
 
         self._log_chat_finished(model=model)
@@ -362,8 +455,26 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
             multipart_messages[:-1] if last_message.role == "user" else multipart_messages
         )
         converted = []
+
+        # Get cache mode configuration
+        cache_mode = self._get_cache_mode()
+
         for msg in messages_to_add:
-            converted.append(AnthropicConverter.convert_to_anthropic(msg))
+            anthropic_msg = AnthropicConverter.convert_to_anthropic(msg)
+
+            # Apply caching to template messages if cache_mode is "prompt" or "auto"
+            if is_template and cache_mode in ["prompt", "auto"] and anthropic_msg.get("content"):
+                content_list = anthropic_msg["content"]
+                if isinstance(content_list, list) and content_list:
+                    # Apply cache control to the last content block
+                    last_block = content_list[-1]
+                    if isinstance(last_block, dict):
+                        last_block["cache_control"] = {"type": "ephemeral"}
+                        self.logger.debug(
+                            f"Applied cache_control to template message with role {anthropic_msg.get('role')}"
+                        )
+
+            converted.append(anthropic_msg)
 
         self.history.extend(converted, is_prompt=is_template)
 
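As an illustration of the branch above, a converted template message ends up with `cache_control` on its final content block only (the role and text here are invented):

```python
anthropic_msg = {
    "role": "user",
    "content": [
        {"type": "text", "text": "You will review pull requests."},
        {
            "type": "text",
            "text": "Focus on security and error handling.",
            "cache_control": {"type": "ephemeral"},  # only the last block is marked
        },
    ],
}
```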
@@ -398,6 +509,28 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
         )
         return self._structured_from_multipart(result, model)
 
+    def _show_usage(self, raw_usage: Usage, turn_usage: TurnUsage) -> None:
+        # Print raw usage for debugging
+        print(f"\n=== USAGE DEBUG ({turn_usage.model}) ===")
+        print(f"Raw usage: {raw_usage}")
+        print(
+            f"Turn usage: input={turn_usage.input_tokens}, output={turn_usage.output_tokens}, current_context={turn_usage.current_context_tokens}"
+        )
+        print(
+            f"Cache: read={turn_usage.cache_usage.cache_read_tokens}, write={turn_usage.cache_usage.cache_write_tokens}"
+        )
+        print(f"Effective input: {turn_usage.effective_input_tokens}")
+        print(
+            f"Accumulator: total_turns={self.usage_accumulator.turn_count}, cumulative_billing={self.usage_accumulator.cumulative_billing_tokens}, current_context={self.usage_accumulator.current_context_tokens}"
+        )
+        if self.usage_accumulator.context_usage_percentage:
+            print(
+                f"Context usage: {self.usage_accumulator.context_usage_percentage:.1f}% of {self.usage_accumulator.context_window_size}"
+            )
+        if self.usage_accumulator.cache_hit_rate:
+            print(f"Cache hit rate: {self.usage_accumulator.cache_hit_rate:.1f}%")
+        print("===========================\n")
+
     @classmethod
     def convert_message_to_message_param(cls, message: Message, **kwargs) -> MessageParam:
         """Convert a response object to an input parameter object to allow LLM calls to be chained."""
@@ -1,4 +1,4 @@
-from openai import AuthenticationError, AzureOpenAI, OpenAI
+from openai import AsyncAzureOpenAI, AsyncOpenAI, AuthenticationError
 
 from mcp_agent.core.exceptions import ProviderKeyError
 from mcp_agent.llm.provider_types import Provider
@@ -93,7 +93,7 @@ class AzureOpenAIAugmentedLLM(OpenAIAugmentedLLM):
         if not self.resource_name and self.base_url:
             self.resource_name = _extract_resource_name(self.base_url)
 
-    def _openai_client(self) -> OpenAI:
+    def _openai_client(self) -> AsyncOpenAI:
         """
         Returns an AzureOpenAI client, handling both API Key and DefaultAzureCredential.
         """
@@ -104,7 +104,7 @@ class AzureOpenAIAugmentedLLM(OpenAIAugmentedLLM):
                     "Missing Azure endpoint",
                     "azure_endpoint (base_url) is None at client creation time.",
                 )
-            return AzureOpenAI(
+            return AsyncAzureOpenAI(
                 azure_ad_token_provider=self.get_azure_token,
                 azure_endpoint=self.base_url,
                 api_version=self.api_version,
@@ -116,7 +116,7 @@ class AzureOpenAIAugmentedLLM(OpenAIAugmentedLLM):
                     "Missing Azure endpoint",
                     "azure_endpoint (base_url) is None at client creation time.",
                 )
-            return AzureOpenAI(
+            return AsyncAzureOpenAI(
                 api_key=self.api_key,
                 azure_endpoint=self.base_url,
                 api_version=self.api_version,