fast-agent-mcp 0.2.33__py3-none-any.whl → 0.2.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fast_agent_mcp-0.2.33.dist-info → fast_agent_mcp-0.2.35.dist-info}/METADATA +1 -1
- {fast_agent_mcp-0.2.33.dist-info → fast_agent_mcp-0.2.35.dist-info}/RECORD +28 -25
- mcp_agent/agents/base_agent.py +13 -0
- mcp_agent/config.py +8 -0
- mcp_agent/context.py +3 -2
- mcp_agent/core/agent_app.py +41 -1
- mcp_agent/core/enhanced_prompt.py +9 -0
- mcp_agent/core/fastagent.py +14 -2
- mcp_agent/core/interactive_prompt.py +59 -13
- mcp_agent/core/usage_display.py +193 -0
- mcp_agent/event_progress.py +22 -4
- mcp_agent/llm/augmented_llm.py +42 -9
- mcp_agent/llm/augmented_llm_passthrough.py +66 -4
- mcp_agent/llm/augmented_llm_playback.py +19 -0
- mcp_agent/llm/augmented_llm_slow.py +12 -1
- mcp_agent/llm/memory.py +120 -0
- mcp_agent/llm/model_database.py +236 -0
- mcp_agent/llm/model_factory.py +1 -0
- mcp_agent/llm/providers/augmented_llm_anthropic.py +211 -30
- mcp_agent/llm/providers/augmented_llm_google_native.py +18 -1
- mcp_agent/llm/providers/augmented_llm_openai.py +20 -7
- mcp_agent/llm/usage_tracking.py +402 -0
- mcp_agent/logging/events.py +24 -0
- mcp_agent/logging/rich_progress.py +9 -1
- mcp_agent/mcp/interfaces.py +6 -0
- {fast_agent_mcp-0.2.33.dist-info → fast_agent_mcp-0.2.35.dist-info}/WHEEL +0 -0
- {fast_agent_mcp-0.2.33.dist-info → fast_agent_mcp-0.2.35.dist-info}/entry_points.txt +0 -0
- {fast_agent_mcp-0.2.33.dist-info → fast_agent_mcp-0.2.35.dist-info}/licenses/LICENSE +0 -0

mcp_agent/llm/model_database.py
ADDED
@@ -0,0 +1,236 @@
+"""
+Model database for LLM parameters.
+
+This module provides a centralized lookup for model parameters including
+context windows, max output tokens, and supported tokenization types.
+"""
+
+from typing import Dict, List, Optional
+
+from pydantic import BaseModel
+
+
+class ModelParameters(BaseModel):
+    """Configuration parameters for a specific model"""
+
+    context_window: int
+    """Maximum context window size in tokens"""
+
+    max_output_tokens: int
+    """Maximum output tokens the model can generate"""
+
+    tokenizes: List[str]
+    """List of supported content types for tokenization"""
+
+
+class ModelDatabase:
+    """Centralized model configuration database"""
+
+    # Common parameter sets
+    OPENAI_MULTIMODAL = ["text/plain", "image/jpeg", "image/png", "image/webp", "application/pdf"]
+    OPENAI_VISION = ["text/plain", "image/jpeg", "image/png", "image/webp"]
+    ANTHROPIC_MULTIMODAL = [
+        "text/plain",
+        "image/jpeg",
+        "image/png",
+        "image/webp",
+        "application/pdf",
+    ]
+    GOOGLE_MULTIMODAL = [
+        "text/plain",
+        "image/jpeg",
+        "image/png",
+        "image/webp",
+        "application/pdf",
+        "audio/wav",
+        "audio/mp3",
+        "video/mp4",
+    ]
+    QWEN_MULTIMODAL = ["text/plain", "image/jpeg", "image/png", "image/webp"]
+    TEXT_ONLY = ["text/plain"]
+
+    # Common parameter configurations
+    OPENAI_STANDARD = ModelParameters(
+        context_window=128000, max_output_tokens=16384, tokenizes=OPENAI_MULTIMODAL
+    )
+
+    OPENAI_4_1_STANDARD = ModelParameters(
+        context_window=1047576, max_output_tokens=32768, tokenizes=OPENAI_MULTIMODAL
+    )
+
+    OPENAI_O_SERIES = ModelParameters(
+        context_window=200000, max_output_tokens=100000, tokenizes=OPENAI_VISION
+    )
+
+    ANTHROPIC_LEGACY = ModelParameters(
+        context_window=200000, max_output_tokens=4096, tokenizes=ANTHROPIC_MULTIMODAL
+    )
+
+    ANTHROPIC_35_SERIES = ModelParameters(
+        context_window=200000, max_output_tokens=8192, tokenizes=ANTHROPIC_MULTIMODAL
+    )
+
+    # TODO--- TO USE 64,000 NEED TO SUPPORT STREAMING
+    ANTHROPIC_37_SERIES = ModelParameters(
+        context_window=200000, max_output_tokens=16384, tokenizes=ANTHROPIC_MULTIMODAL
+    )
+
+    GEMINI_FLASH = ModelParameters(
+        context_window=1048576, max_output_tokens=8192, tokenizes=GOOGLE_MULTIMODAL
+    )
+
+    GEMINI_PRO = ModelParameters(
+        context_window=2097152, max_output_tokens=8192, tokenizes=GOOGLE_MULTIMODAL
+    )
+
+    QWEN_STANDARD = ModelParameters(
+        context_window=32000, max_output_tokens=8192, tokenizes=QWEN_MULTIMODAL
+    )
+
+    FAST_AGENT_STANDARD = ModelParameters(
+        context_window=1000000, max_output_tokens=100000, tokenizes=TEXT_ONLY
+    )
+
+    OPENAI_4_1_SERIES = ModelParameters(
+        context_window=1047576, max_output_tokens=32768, tokenizes=OPENAI_MULTIMODAL
+    )
+
+    OPENAI_4O_SERIES = ModelParameters(
+        context_window=128000, max_output_tokens=16384, tokenizes=OPENAI_VISION
+    )
+
+    OPENAI_O3_SERIES = ModelParameters(
+        context_window=200000, max_output_tokens=100000, tokenizes=OPENAI_MULTIMODAL
+    )
+
+    OPENAI_O3_MINI_SERIES = ModelParameters(
+        context_window=200000, max_output_tokens=100000, tokenizes=TEXT_ONLY
+    )
+
+    # TODO update to 32000
+    ANTHROPIC_OPUS_4_VERSIONED = ModelParameters(
+        context_window=200000, max_output_tokens=32000, tokenizes=ANTHROPIC_MULTIMODAL
+    )
+    # TODO update to 64000
+    ANTHROPIC_SONNET_4_VERSIONED = ModelParameters(
+        context_window=200000, max_output_tokens=64000, tokenizes=ANTHROPIC_MULTIMODAL
+    )
+
+    DEEPSEEK_CHAT_STANDARD = ModelParameters(
+        context_window=65536, max_output_tokens=8192, tokenizes=TEXT_ONLY
+    )
+
+    DEEPSEEK_REASONER = ModelParameters(
+        context_window=65536, max_output_tokens=32768, tokenizes=TEXT_ONLY
+    )
+
+    GEMINI_2_5_PRO = ModelParameters(
+        context_window=2097152, max_output_tokens=8192, tokenizes=GOOGLE_MULTIMODAL
+    )
+
+    # Model configuration database
+    MODELS: Dict[str, ModelParameters] = {
+        # internal models
+        "passthrough": FAST_AGENT_STANDARD,
+        "playback": FAST_AGENT_STANDARD,
+        "slow": FAST_AGENT_STANDARD,
+        # aliyun models
+        "qwen-turbo": QWEN_STANDARD,
+        "qwen-plus": QWEN_STANDARD,
+        "qwen-max": QWEN_STANDARD,
+        "qwen-long": ModelParameters(
+            context_window=10000000, max_output_tokens=8192, tokenizes=TEXT_ONLY
+        ),
+        # OpenAI Models (vanilla aliases and versioned)
+        "gpt-4.1": OPENAI_4_1_SERIES,
+        "gpt-4.1-mini": OPENAI_4_1_SERIES,
+        "gpt-4.1-nano": OPENAI_4_1_SERIES,
+        "gpt-4.1-2025-04-14": OPENAI_4_1_SERIES,
+        "gpt-4.1-mini-2025-04-14": OPENAI_4_1_SERIES,
+        "gpt-4.1-nano-2025-04-14": OPENAI_4_1_SERIES,
+        "gpt-4o": OPENAI_4O_SERIES,
+        "gpt-4o-2024-11-20": OPENAI_4O_SERIES,
+        "gpt-4o-mini-2024-07-18": OPENAI_4O_SERIES,
+        "o1": OPENAI_O_SERIES,
+        "o1-2024-12-17": OPENAI_O_SERIES,
+        "o3": OPENAI_O3_SERIES,
+        "o3-pro": ModelParameters(
+            context_window=200_000, max_output_tokens=100_000, tokenizes=TEXT_ONLY
+        ),
+        "o3-mini": OPENAI_O3_MINI_SERIES,
+        "o4-mini": OPENAI_O3_SERIES,
+        "o3-2025-04-16": OPENAI_O3_SERIES,
+        "o3-mini-2025-01-31": OPENAI_O3_MINI_SERIES,
+        "o4-mini-2025-04-16": OPENAI_O3_SERIES,
+        # Anthropic Models
+        "claude-3-haiku": ANTHROPIC_35_SERIES,
+        "claude-3-haiku-20240307": ANTHROPIC_LEGACY,
+        "claude-3-sonnet": ANTHROPIC_LEGACY,
+        "claude-3-opus": ANTHROPIC_LEGACY,
+        "claude-3-opus-20240229": ANTHROPIC_LEGACY,
+        "claude-3-opus-latest": ANTHROPIC_LEGACY,
+        "claude-3-5-haiku": ANTHROPIC_35_SERIES,
+        "claude-3-5-haiku-20241022": ANTHROPIC_35_SERIES,
+        "claude-3-5-haiku-latest": ANTHROPIC_35_SERIES,
+        "claude-3-sonnet-20240229": ANTHROPIC_LEGACY,
+        "claude-3-5-sonnet": ANTHROPIC_35_SERIES,
+        "claude-3-5-sonnet-20240620": ANTHROPIC_35_SERIES,
+        "claude-3-5-sonnet-20241022": ANTHROPIC_35_SERIES,
+        "claude-3-5-sonnet-latest": ANTHROPIC_35_SERIES,
+        "claude-3-7-sonnet": ANTHROPIC_37_SERIES,
+        "claude-3-7-sonnet-20250219": ANTHROPIC_37_SERIES,
+        "claude-3-7-sonnet-latest": ANTHROPIC_37_SERIES,
+        "claude-sonnet-4": ANTHROPIC_SONNET_4_VERSIONED,
+        "claude-sonnet-4-0": ANTHROPIC_SONNET_4_VERSIONED,
+        "claude-sonnet-4-20250514": ANTHROPIC_SONNET_4_VERSIONED,
+        "claude-opus-4": ANTHROPIC_OPUS_4_VERSIONED,
+        "claude-opus-4-0": ANTHROPIC_OPUS_4_VERSIONED,
+        "claude-opus-4-20250514": ANTHROPIC_OPUS_4_VERSIONED,
+        # DeepSeek Models
+        "deepseek-chat": DEEPSEEK_CHAT_STANDARD,
+        # Google Gemini Models (vanilla aliases and versioned)
+        "gemini-2.0-flash": GEMINI_FLASH,
+        "gemini-2.5-flash-preview": GEMINI_FLASH,
+        "gemini-2.5-pro-preview": GEMINI_2_5_PRO,
+        "gemini-2.5-flash-preview-05-20": GEMINI_FLASH,
+        "gemini-2.5-pro-preview-05-06": GEMINI_PRO,
+    }
+
+    @classmethod
+    def get_model_params(cls, model: str) -> Optional[ModelParameters]:
+        """Get model parameters for a given model name"""
+        return cls.MODELS.get(model)
+
+    @classmethod
+    def get_context_window(cls, model: str) -> Optional[int]:
+        """Get context window size for a model"""
+        params = cls.get_model_params(model)
+        return params.context_window if params else None
+
+    @classmethod
+    def get_max_output_tokens(cls, model: str) -> Optional[int]:
+        """Get maximum output tokens for a model"""
+        params = cls.get_model_params(model)
+        return params.max_output_tokens if params else None
+
+    @classmethod
+    def get_tokenizes(cls, model: str) -> Optional[List[str]]:
+        """Get supported tokenization types for a model"""
+        params = cls.get_model_params(model)
+        return params.tokenizes if params else None
+
+    @classmethod
+    def get_default_max_tokens(cls, model: str) -> int:
+        """Get default max_tokens for RequestParams based on model"""
+        if not model:
+            return 2048  # Fallback when no model specified
+
+        params = cls.get_model_params(model)
+        if params:
+            return params.max_output_tokens
+        return 2048  # Fallback for unknown models
+
+    @classmethod
+    def list_models(cls) -> List[str]:
+        """List all available model names"""
+        return list(cls.MODELS.keys())
mcp_agent/llm/model_factory.py
CHANGED
@@ -87,6 +87,7 @@ class ModelFactory:
         "o1-preview": Provider.OPENAI,
         "o3": Provider.OPENAI,
         "o3-mini": Provider.OPENAI,
+        "o4-mini": Provider.OPENAI,
         "claude-3-haiku-20240307": Provider.ANTHROPIC,
         "claude-3-5-haiku-20241022": Provider.ANTHROPIC,
         "claude-3-5-haiku-latest": Provider.ANTHROPIC,

mcp_agent/llm/providers/augmented_llm_anthropic.py
CHANGED
@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, List, Tuple, Type
 from mcp.types import EmbeddedResource, ImageContent, TextContent
 
 from mcp_agent.core.prompt import Prompt
+from mcp_agent.event_progress import ProgressAction
 from mcp_agent.llm.provider_types import Provider
 from mcp_agent.llm.providers.multipart_converter_anthropic import (
     AnthropicConverter,
@@ -10,6 +11,7 @@ from mcp_agent.llm.providers.multipart_converter_anthropic import (
 from mcp_agent.llm.providers.sampling_converter_anthropic import (
     AnthropicSamplingConverter,
 )
+from mcp_agent.llm.usage_tracking import TurnUsage
 from mcp_agent.mcp.interfaces import ModelT
 from mcp_agent.mcp.prompt_message_multipart import PromptMessageMultipart
 
@@ -17,7 +19,8 @@ if TYPE_CHECKING:
     from mcp import ListToolsResult
 
 
-from anthropic import
+from anthropic import AsyncAnthropic, AuthenticationError
+from anthropic.lib.streaming import AsyncMessageStream
 from anthropic.types import (
     Message,
     MessageParam,
@@ -75,19 +78,83 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
     def _initialize_default_params(self, kwargs: dict) -> RequestParams:
         """Initialize Anthropic-specific default parameters"""
-
-
-
-
-
-
-
-
+        # Get base defaults from parent (includes ModelDatabase lookup)
+        base_params = super()._initialize_default_params(kwargs)
+
+        # Override with Anthropic-specific settings
+        chosen_model = kwargs.get("model", DEFAULT_ANTHROPIC_MODEL)
+        base_params.model = chosen_model
+
+        return base_params
 
     def _base_url(self) -> str | None:
         assert self.context.config
         return self.context.config.anthropic.base_url if self.context.config.anthropic else None
 
+    def _get_cache_mode(self) -> str:
+        """Get the cache mode configuration."""
+        cache_mode = "auto"  # Default to auto
+        if self.context.config and self.context.config.anthropic:
+            cache_mode = self.context.config.anthropic.cache_mode
+        return cache_mode
+
+    async def _process_stream(self, stream: AsyncMessageStream, model: str) -> Message:
+        """Process the streaming response and display real-time token usage."""
+        # Track estimated output tokens by counting text chunks
+        estimated_tokens = 0
+
+        # Process the raw event stream to get token counts
+        async for event in stream:
+            # Count tokens in real-time from content_block_delta events
+            if (
+                event.type == "content_block_delta"
+                and hasattr(event, "delta")
+                and event.delta.type == "text_delta"
+            ):
+                # Rough estimate: 1 token per 4 characters (OpenAI's typical ratio)
+                text_length = len(event.delta.text)
+                estimated_tokens += max(1, text_length // 4)
+
+                # Update progress on every token for real-time display
+                token_str = str(estimated_tokens).rjust(5)
+                # print(f"DEBUG: Streaming tokens: {token_str}")
+                self._emit_streaming_progress(model, token_str)
+
+            # Also check for final message_delta events with actual usage info
+            elif (
+                event.type == "message_delta"
+                and hasattr(event, "usage")
+                and event.usage.output_tokens
+            ):
+                actual_tokens = event.usage.output_tokens
+                token_str = str(actual_tokens).rjust(5)
+                # print(f"DEBUG: Final actual tokens: {token_str}")
+                self._emit_streaming_progress(model, token_str)
+
+        # Get the final message with complete usage data
+        message = await stream.get_final_message()
+
+        # Log final usage information
+        if hasattr(message, "usage") and message.usage:
+            self.logger.info(
+                f"Streaming complete - Model: {model}, Input tokens: {message.usage.input_tokens}, Output tokens: {message.usage.output_tokens}"
+            )
+
+        return message
+
+    def _emit_streaming_progress(self, model: str, token_str: str) -> None:
+        """Emit a streaming progress event that goes directly to progress display."""
+        data = {
+            "progress_action": ProgressAction.STREAMING,
+            "model": model,
+            "agent_name": self.name,
+            "chat_turn": self.chat_turn(),
+            "details": token_str.strip(),  # Token count goes in details for STREAMING action
+        }
+        # print(f"DEBUG: Emitting streaming progress event with data: {data}")
+        # Use a special logger level or namespace to avoid polluting regular logs
+        self.logger.info("Streaming progress", data=data)
+
     async def _anthropic_completion(
         self,
         message_param,
@@ -104,7 +171,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
             base_url = base_url.rstrip("/v1")
 
         try:
-            anthropic =
+            anthropic = AsyncAnthropic(api_key=api_key, base_url=base_url)
             messages: List[MessageParam] = []
             params = self.get_request_params(request_params)
         except AuthenticationError as e:
@@ -117,7 +184,11 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
         # if use_history is True
         messages.extend(self.history.get(include_completion_history=params.use_history))
 
-        messages.append(message_param)
+        messages.append(message_param)  # message_param is the current user turn
+
+        # Get cache mode configuration
+        cache_mode = self._get_cache_mode()
+        self.logger.debug(f"Anthropic cache_mode: {cache_mode}")
 
         tool_list: ListToolsResult = await self.aggregator.list_tools()
         available_tools: List[ToolParam] = [
@@ -133,8 +204,11 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
         model = self.default_request_params.model
 
+        # Note: We'll cache tools+system together by putting cache_control only on system prompt
+
         for i in range(params.max_iterations):
             self._log_chat_progress(self.chat_turn(), model=model)
+
             # Create base arguments dictionary
             base_args = {
                 "model": model,
@@ -144,6 +218,60 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                 "tools": available_tools,
             }
 
+            # Apply cache_control to system prompt if cache_mode is not "off"
+            # This caches both tools and system prompt together in one cache block
+            if cache_mode != "off" and base_args["system"]:
+                if isinstance(base_args["system"], str):
+                    base_args["system"] = [
+                        {
+                            "type": "text",
+                            "text": base_args["system"],
+                            "cache_control": {"type": "ephemeral"},
+                        }
+                    ]
+                    self.logger.debug(
+                        "Applied cache_control to system prompt (caches tools+system in one block)"
+                    )
+                else:
+                    self.logger.debug(f"System prompt is not a string: {type(base_args['system'])}")
+
+            # Apply conversation caching using walking algorithm if in auto mode
+            if cache_mode == "auto" and self.history.should_apply_conversation_cache():
+                cache_updates = self.history.get_conversation_cache_updates()
+
+                # Remove cache control from old positions
+                if cache_updates["remove"]:
+                    self.history.remove_cache_control_from_messages(
+                        messages, cache_updates["remove"]
+                    )
+                    self.logger.debug(
+                        f"Removed conversation cache_control from positions {cache_updates['remove']}"
+                    )
+
+                # Add cache control to new positions
+                if cache_updates["add"]:
+                    applied_count = self.history.add_cache_control_to_messages(
+                        messages, cache_updates["add"]
+                    )
+                    if applied_count > 0:
+                        self.history.apply_conversation_cache_updates(cache_updates)
+                        self.logger.debug(
+                            f"Applied conversation cache_control to positions {cache_updates['add']} ({applied_count} blocks)"
+                        )
+
+                        # Verify we don't exceed Anthropic's 4 cache block limit
+                        total_cache_blocks = applied_count
+                        if cache_mode != "off" and base_args["system"]:
+                            total_cache_blocks += 1  # tools+system cache block
+                        if total_cache_blocks > 4:
+                            self.logger.warning(
+                                f"Total cache blocks ({total_cache_blocks}) exceeds Anthropic limit of 4"
+                            )
+                    else:
+                        self.logger.debug(
+                            f"Failed to apply conversation cache_control to positions {cache_updates['add']}"
+                        )
+
             if params.maxTokens is not None:
                 base_args["max_tokens"] = params.maxTokens
 
@@ -154,9 +282,25 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
             self.logger.debug(f"{arguments}")
 
-
-
-
+            # Use streaming API with helper
+            async with anthropic.messages.stream(**arguments) as stream:
+                # Process the stream
+                response = await self._process_stream(stream, model)
+
+            # Track usage if response is valid and has usage data
+            if (
+                hasattr(response, "usage")
+                and response.usage
+                and not isinstance(response, BaseException)
+            ):
+                try:
+                    turn_usage = TurnUsage.from_anthropic(
+                        response.usage, model or DEFAULT_ANTHROPIC_MODEL
+                    )
+                    self.usage_accumulator.add_turn(turn_usage)
+                    # self._show_usage(response.usage, turn_usage)
+                except Exception as e:
+                    self.logger.warning(f"Failed to track usage: {e}")
 
             if isinstance(response, AuthenticationError):
                 raise ProviderKeyError(
@@ -165,7 +309,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                 ) from response
             elif isinstance(response, BaseException):
                 error_details = str(response)
-                self.logger.error(f"Error: {error_details}", data=
+                self.logger.error(f"Error: {error_details}", data=BaseException)
 
                 # Try to extract more useful information for API errors
                 if hasattr(response, "status_code") and hasattr(response, "response"):
@@ -178,13 +322,13 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                 # Convert other errors to text response
                 error_message = f"Error during generation: {error_details}"
                 response = Message(
-                    id="error",
-                    model="error",
+                    id="error",
+                    model="error",
                     role="assistant",
                     type="message",
                     content=[TextBlock(type="text", text=error_message)],
-                    stop_reason="end_turn",
-                    usage=Usage(input_tokens=0, output_tokens=0),
+                    stop_reason="end_turn",
+                    usage=Usage(input_tokens=0, output_tokens=0),
                 )
 
             self.logger.debug(
@@ -194,7 +338,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
             response_as_message = self.convert_message_to_message_param(response)
             messages.append(response_as_message)
-            if response.content[0].type == "text":
+            if response.content and response.content[0].type == "text":
                 responses.append(TextContent(type="text", text=response.content[0].text))
 
             if response.stop_reason == "end_turn":
@@ -254,12 +398,13 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
             # Process all tool calls and collect results
             tool_results = []
-            for i
-
-
-
+            # Use a different loop variable for tool enumeration if 'i' is outer loop counter
+            for tool_idx, content_block in enumerate(tool_uses):
+                tool_name = content_block.name
+                tool_args = content_block.input
+                tool_use_id = content_block.id
 
-                if
+                if tool_idx == 0:  # Only show message for first tool use
                     await self.show_assistant_message(message_text, tool_name)
 
                 self.show_tool_call(available_tools, tool_name, tool_args)
@@ -284,11 +429,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
         if params.use_history:
             # Get current prompt messages
            prompt_messages = self.history.get(include_completion_history=False)
-
-            # Calculate new conversation messages (excluding prompts)
            new_messages = messages[len(prompt_messages) :]
-
-            # Update conversation history
            self.history.set(new_messages)
 
         self._log_chat_finished(model=model)
@@ -326,8 +467,26 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
             multipart_messages[:-1] if last_message.role == "user" else multipart_messages
         )
         converted = []
+
+        # Get cache mode configuration
+        cache_mode = self._get_cache_mode()
+
         for msg in messages_to_add:
-
+            anthropic_msg = AnthropicConverter.convert_to_anthropic(msg)
+
+            # Apply caching to template messages if cache_mode is "prompt" or "auto"
+            if is_template and cache_mode in ["prompt", "auto"] and anthropic_msg.get("content"):
+                content_list = anthropic_msg["content"]
+                if isinstance(content_list, list) and content_list:
+                    # Apply cache control to the last content block
+                    last_block = content_list[-1]
+                    if isinstance(last_block, dict):
+                        last_block["cache_control"] = {"type": "ephemeral"}
+                        self.logger.debug(
+                            f"Applied cache_control to template message with role {anthropic_msg.get('role')}"
+                        )
+
+            converted.append(anthropic_msg)
 
         self.history.extend(converted, is_prompt=is_template)
 
@@ -362,6 +521,28 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
         )
         return self._structured_from_multipart(result, model)
 
+    def _show_usage(self, raw_usage: Usage, turn_usage: TurnUsage) -> None:
+        # Print raw usage for debugging
+        print(f"\n=== USAGE DEBUG ({turn_usage.model}) ===")
+        print(f"Raw usage: {raw_usage}")
+        print(
+            f"Turn usage: input={turn_usage.input_tokens}, output={turn_usage.output_tokens}, current_context={turn_usage.current_context_tokens}"
+        )
+        print(
+            f"Cache: read={turn_usage.cache_usage.cache_read_tokens}, write={turn_usage.cache_usage.cache_write_tokens}"
+        )
+        print(f"Effective input: {turn_usage.effective_input_tokens}")
+        print(
+            f"Accumulator: total_turns={self.usage_accumulator.turn_count}, cumulative_billing={self.usage_accumulator.cumulative_billing_tokens}, current_context={self.usage_accumulator.current_context_tokens}"
+        )
+        if self.usage_accumulator.context_usage_percentage:
+            print(
+                f"Context usage: {self.usage_accumulator.context_usage_percentage:.1f}% of {self.usage_accumulator.context_window_size}"
+            )
+        if self.usage_accumulator.cache_hit_rate:
+            print(f"Cache hit rate: {self.usage_accumulator.cache_hit_rate:.1f}%")
+        print("===========================\n")
+
     @classmethod
     def convert_message_to_message_param(cls, message: Message, **kwargs) -> MessageParam:
         """Convert a response object to an input parameter object to allow LLM calls to be chained."""

mcp_agent/llm/providers/augmented_llm_google_native.py
CHANGED
@@ -24,6 +24,7 @@ from mcp_agent.llm.provider_types import Provider
 
 # Import the new converter class
 from mcp_agent.llm.providers.google_converter import GoogleConverter
+from mcp_agent.llm.usage_tracking import TurnUsage
 from mcp_agent.mcp.prompt_message_multipart import PromptMessageMultipart
 
 # Define default model and potentially other Google-specific defaults
@@ -220,6 +221,7 @@ class GoogleNativeAugmentedLLM(AugmentedLLM[types.Content, types.Content]):
             parallel_tool_calls=True,  # Assume parallel tool calls are supported by default with native API
             max_iterations=20,
             use_history=True,
+            maxTokens=65536,  # Default max tokens for Google models
             # Include other relevant default parameters
         )
 
@@ -281,10 +283,25 @@
             )
             self.logger.debug("Google generate_content response:", data=api_response)
 
+            # Track usage if response is valid and has usage data
+            if (
+                hasattr(api_response, "usage_metadata")
+                and api_response.usage_metadata
+                and not isinstance(api_response, BaseException)
+            ):
+                try:
+                    turn_usage = TurnUsage.from_google(
+                        api_response.usage_metadata, request_params.model
+                    )
+                    self.usage_accumulator.add_turn(turn_usage)
+
+                except Exception as e:
+                    self.logger.warning(f"Failed to track usage: {e}")
+
         except errors.APIError as e:
             # Handle specific Google API errors
             self.logger.error(f"Google API Error: {e.code} - {e.message}")
-            raise ProviderKeyError(f"Google API Error: {e.code}", e.message) from e
+            raise ProviderKeyError(f"Google API Error: {e.code}", e.message or "") from e
         except Exception as e:
             self.logger.error(f"Error during Google generate_content call: {e}")
             # Decide how to handle other exceptions - potentially re-raise or return an error message