agno-2.3.11-py3-none-any.whl → agno-2.3.12-py3-none-any.whl
- agno/compression/manager.py +87 -16
- agno/db/mongo/async_mongo.py +1 -1
- agno/db/mongo/mongo.py +1 -1
- agno/exceptions.py +1 -0
- agno/knowledge/knowledge.py +83 -20
- agno/knowledge/reader/csv_reader.py +2 -2
- agno/knowledge/reader/text_reader.py +15 -3
- agno/knowledge/reader/wikipedia_reader.py +33 -1
- agno/memory/strategies/base.py +3 -4
- agno/models/anthropic/claude.py +44 -0
- agno/models/aws/bedrock.py +60 -0
- agno/models/base.py +124 -30
- agno/models/google/gemini.py +141 -23
- agno/models/litellm/chat.py +25 -0
- agno/models/openai/responses.py +44 -0
- agno/os/routers/knowledge/knowledge.py +0 -1
- agno/run/agent.py +17 -0
- agno/run/requirement.py +89 -6
- agno/utils/print_response/agent.py +4 -4
- agno/utils/print_response/team.py +12 -12
- agno/utils/tokens.py +643 -27
- agno/vectordb/chroma/chromadb.py +6 -2
- agno/vectordb/lancedb/lance_db.py +3 -37
- agno/vectordb/milvus/milvus.py +6 -32
- agno/vectordb/mongodb/mongodb.py +0 -27
- agno/vectordb/pgvector/pgvector.py +15 -5
- agno/vectordb/pineconedb/pineconedb.py +0 -17
- agno/vectordb/qdrant/qdrant.py +6 -29
- agno/vectordb/redis/redisdb.py +0 -26
- agno/vectordb/singlestore/singlestore.py +16 -8
- agno/vectordb/surrealdb/surrealdb.py +0 -36
- agno/vectordb/weaviate/weaviate.py +6 -2
- {agno-2.3.11.dist-info → agno-2.3.12.dist-info}/METADATA +4 -1
- {agno-2.3.11.dist-info → agno-2.3.12.dist-info}/RECORD +37 -37
- {agno-2.3.11.dist-info → agno-2.3.12.dist-info}/WHEEL +0 -0
- {agno-2.3.11.dist-info → agno-2.3.12.dist-info}/licenses/LICENSE +0 -0
- {agno-2.3.11.dist-info → agno-2.3.12.dist-info}/top_level.txt +0 -0
agno/models/base.py
CHANGED
@@ -8,6 +8,7 @@ from pathlib import Path
 from time import sleep, time
 from types import AsyncGeneratorType, GeneratorType
 from typing import (
+    TYPE_CHECKING,
     Any,
     AsyncIterator,
     Dict,
@@ -15,11 +16,15 @@ from typing import (
     List,
     Literal,
     Optional,
+    Sequence,
     Tuple,
     Type,
     Union,
     get_args,
 )
+
+if TYPE_CHECKING:
+    from agno.compression.manager import CompressionManager
 from uuid import uuid4

 from pydantic import BaseModel
@@ -156,6 +161,8 @@ class Model(ABC):
     # Enable retrying a model invocation once with a guidance message.
     # This is useful for known errors avoidable with extra instructions.
     retry_with_guidance: bool = True
+    # Set the number of times to retry the model invocation with guidance.
+    retry_with_guidance_limit: int = 1

     def __post_init__(self):
         if self.provider is None and self.name is not None:
@@ -178,6 +185,7 @@ class Model(ABC):

         for attempt in range(self.retries + 1):
             try:
+                retries_with_guidance_count = kwargs.pop("retries_with_guidance_count", 0)
                 return self.invoke(**kwargs)
             except ModelProviderError as e:
                 last_exception = e
@@ -190,8 +198,20 @@ class Model(ABC):
                 else:
                     log_error(f"Model provider error after {self.retries + 1} attempts: {e}")
             except RetryableModelProviderError as e:
+                current_count = retries_with_guidance_count
+                if current_count >= self.retry_with_guidance_limit:
+                    raise ModelProviderError(
+                        message=f"Max retries with guidance reached. Error: {e.original_error}",
+                        model_name=self.name,
+                        model_id=self.id,
+                    )
+                kwargs.pop("retry_with_guidance", None)
+                kwargs["retries_with_guidance_count"] = current_count + 1
+
+                # Append the guidance message to help the model avoid the error in the next invoke.
                 kwargs["messages"].append(Message(role="user", content=e.retry_guidance_message, temporary=True))
-
+
+                return self._invoke_with_retry(**kwargs, retry_with_guidance=True)

         # If we've exhausted all retries, raise the last exception
         raise last_exception  # type: ignore
@@ -207,6 +227,7 @@ class Model(ABC):

         for attempt in range(self.retries + 1):
             try:
+                retries_with_guidance_count = kwargs.pop("retries_with_guidance_count", 0)
                 return await self.ainvoke(**kwargs)
             except ModelProviderError as e:
                 last_exception = e
@@ -219,8 +240,21 @@ class Model(ABC):
                 else:
                     log_error(f"Model provider error after {self.retries + 1} attempts: {e}")
             except RetryableModelProviderError as e:
+                current_count = retries_with_guidance_count
+                if current_count >= self.retry_with_guidance_limit:
+                    raise ModelProviderError(
+                        message=f"Max retries with guidance reached. Error: {e.original_error}",
+                        model_name=self.name,
+                        model_id=self.id,
+                    )
+
+                kwargs.pop("retry_with_guidance", None)
+                kwargs["retries_with_guidance_count"] = current_count + 1
+
+                # Append the guidance message to help the model avoid the error in the next invoke.
                 kwargs["messages"].append(Message(role="user", content=e.retry_guidance_message, temporary=True))
-
+
+                return await self._ainvoke_with_retry(**kwargs, retry_with_guidance=True)

         # If we've exhausted all retries, raise the last exception
         raise last_exception  # type: ignore
@@ -236,6 +270,7 @@ class Model(ABC):

         for attempt in range(self.retries + 1):
             try:
+                retries_with_guidance_count = kwargs.pop("retries_with_guidance_count", 0)
                 yield from self.invoke_stream(**kwargs)
                 return  # Success, exit the retry loop
             except ModelProviderError as e:
@@ -250,8 +285,21 @@ class Model(ABC):
                 else:
                     log_error(f"Model provider error after {self.retries + 1} attempts: {e}")
             except RetryableModelProviderError as e:
+                current_count = retries_with_guidance_count
+                if current_count >= self.retry_with_guidance_limit:
+                    raise ModelProviderError(
+                        message=f"Max retries with guidance reached. Error: {e.original_error}",
+                        model_name=self.name,
+                        model_id=self.id,
+                    )
+
+                kwargs.pop("retry_with_guidance", None)
+                kwargs["retries_with_guidance_count"] = current_count + 1
+
+                # Append the guidance message to help the model avoid the error in the next invoke.
                 kwargs["messages"].append(Message(role="user", content=e.retry_guidance_message, temporary=True))
-
+
+                yield from self._invoke_stream_with_retry(**kwargs, retry_with_guidance=True)
                 return  # Success, exit after regeneration

         # If we've exhausted all retries, raise the last exception
@@ -268,6 +316,7 @@ class Model(ABC):

         for attempt in range(self.retries + 1):
             try:
+                retries_with_guidance_count = kwargs.pop("retries_with_guidance_count", 0)
                 async for response in self.ainvoke_stream(**kwargs):
                     yield response
                 return  # Success, exit the retry loop
@@ -283,8 +332,21 @@ class Model(ABC):
                 else:
                     log_error(f"Model provider error after {self.retries + 1} attempts: {e}")
             except RetryableModelProviderError as e:
+                current_count = retries_with_guidance_count
+                if current_count >= self.retry_with_guidance_limit:
+                    raise ModelProviderError(
+                        message=f"Max retries with guidance reached. Error: {e.original_error}",
+                        model_name=self.name,
+                        model_id=self.id,
+                    )
+
+                kwargs.pop("retry_with_guidance", None)
+                kwargs["retries_with_guidance_count"] = current_count + 1
+
+                # Append the guidance message to help the model avoid the error in the next invoke.
                 kwargs["messages"].append(Message(role="user", content=e.retry_guidance_message, temporary=True))
-
+
+                async for response in self._ainvoke_stream_with_retry(**kwargs, retry_with_guidance=True):
                     yield response
                 return  # Success, exit after regeneration

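The hunks above bound the guidance retry: the attempt counter rides along in `kwargs` as `retries_with_guidance_count`, and once it reaches the new `retry_with_guidance_limit` the retryable error is re-raised as a plain `ModelProviderError`. A minimal, self-contained sketch of that control flow (the `RetryableError` class and `call_model` callable are hypothetical stand-ins, not agno's API):

```python
# Hypothetical illustration of the bounded retry-with-guidance pattern.
class RetryableError(Exception):
    def __init__(self, guidance: str):
        super().__init__(guidance)
        self.guidance = guidance


def invoke_with_retry(call_model, messages, limit: int = 1, _count: int = 0):
    try:
        return call_model(messages)
    except RetryableError as e:
        if _count >= limit:
            # Give up: surface the failure instead of retrying forever.
            raise RuntimeError(f"Max retries with guidance reached: {e.guidance}")
        # Append the guidance as a temporary user message and invoke again.
        messages = messages + [{"role": "user", "content": e.guidance, "temporary": True}]
        return invoke_with_retry(call_model, messages, limit=limit, _count=_count + 1)
```

Because the counter is threaded through the recursive call, the guidance retry is bounded independently of the ordinary provider-error retry loop.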
@@ -296,8 +358,8 @@ class Model(ABC):
         _dict = {field: getattr(self, field) for field in fields if getattr(self, field) is not None}
         return _dict

-    def
-    """Remove
+    def _remove_temporary_messages(self, messages: List[Message]) -> None:
+        """Remove temporary messages from the given list.

         Args:
             messages: The list of messages to filter (modified in place).
@@ -453,6 +515,29 @@ class Model(ABC):
             _tool_dicts.append(tool)
         return _tool_dicts

+    def count_tokens(
+        self,
+        messages: List[Message],
+        tools: Optional[Sequence[Union[Function, Dict[str, Any]]]] = None,
+        output_schema: Optional[Union[Dict, Type[BaseModel]]] = None,
+    ) -> int:
+        from agno.utils.tokens import count_tokens
+
+        return count_tokens(
+            messages,
+            tools=list(tools) if tools else None,
+            model_id=self.id,
+            output_schema=output_schema,
+        )
+
+    async def acount_tokens(
+        self,
+        messages: List[Message],
+        tools: Optional[Sequence[Union[Function, Dict[str, Any]]]] = None,
+        output_schema: Optional[Union[Dict, Type[BaseModel]]] = None,
+    ) -> int:
+        return self.count_tokens(messages, tools, output_schema=output_schema)
+
     def response(
         self,
         messages: List[Message],
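The new `count_tokens` / `acount_tokens` hooks give every model a provider-agnostic default estimate backed by `agno.utils.tokens.count_tokens`, which providers can override (Gemini and LiteLLM do so below). A hedged usage sketch; the `OpenAIChat` model id and message contents are illustrative assumptions:

```python
from agno.models.message import Message
from agno.models.openai import OpenAIChat  # assumed import path, for illustration only

model = OpenAIChat(id="gpt-4o-mini")
messages = [
    Message(role="system", content="You are a terse assistant."),
    Message(role="user", content="Summarize the 2.3.12 changes."),
]

# The base implementation delegates to
# agno.utils.tokens.count_tokens(messages, tools=..., model_id=..., output_schema=...)
prompt_tokens = model.count_tokens(messages)
print(prompt_tokens)
```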
@@ -462,7 +547,7 @@ class Model(ABC):
         tool_call_limit: Optional[int] = None,
         run_response: Optional[Union[RunOutput, TeamRunOutput]] = None,
         send_media_to_model: bool = True,
-        compression_manager: Optional[
+        compression_manager: Optional["CompressionManager"] = None,
     ) -> ModelResponse:
         """
         Generate a response from the model.
@@ -500,8 +585,15 @@ class Model(ABC):
         _functions = {tool.name: tool for tool in tools if isinstance(tool, Function)} if tools is not None else {}

         _compress_tool_results = compression_manager is not None and compression_manager.compress_tool_results
+        _compression_manager = compression_manager if _compress_tool_results else None

         while True:
+            # Compress tool results if compression is enabled and threshold is met
+            if _compression_manager is not None and _compression_manager.should_compress(
+                messages, tools, model=self, response_format=response_format
+            ):
+                _compression_manager.compress(messages)
+
             # Get response from model
             assistant_message = Message(role=self.assistant_message_role)
             self._process_model_response(
@@ -600,11 +692,6 @@ class Model(ABC):
                 # Add a function call for each successful execution
                 function_call_count += len(function_call_results)

-                all_messages = messages + function_call_results
-                # Compress tool results
-                if compression_manager and compression_manager.should_compress(all_messages):
-                    compression_manager.compress(all_messages)
-
                 # Format and add results to messages
                 self.format_function_call_results(
                     messages=messages,
@@ -674,11 +761,12 @@ class Model(ABC):
         tool_call_limit: Optional[int] = None,
         run_response: Optional[Union[RunOutput, TeamRunOutput]] = None,
         send_media_to_model: bool = True,
-        compression_manager: Optional[
+        compression_manager: Optional["CompressionManager"] = None,
     ) -> ModelResponse:
         """
         Generate an asynchronous response from the model.
         """
+
         try:
             # Check cache if enabled
             if self.cache_response:
@@ -700,10 +788,17 @@ class Model(ABC):
         _functions = {tool.name: tool for tool in tools if isinstance(tool, Function)} if tools is not None else {}

         _compress_tool_results = compression_manager is not None and compression_manager.compress_tool_results
+        _compression_manager = compression_manager if _compress_tool_results else None

         function_call_count = 0

         while True:
+            # Compress existing tool results BEFORE making API call to avoid context overflow
+            if _compression_manager is not None and await _compression_manager.ashould_compress(
+                messages, tools, model=self, response_format=response_format
+            ):
+                await _compression_manager.acompress(messages)
+
             # Get response from model
             assistant_message = Message(role=self.assistant_message_role)
             await self._aprocess_model_response(
@@ -801,11 +896,6 @@ class Model(ABC):
                 # Add a function call for each successful execution
                 function_call_count += len(function_call_results)

-                all_messages = messages + function_call_results
-                # Compress tool results
-                if compression_manager and compression_manager.should_compress(all_messages):
-                    await compression_manager.acompress(all_messages)
-
                 # Format and add results to messages
                 self.format_function_call_results(
                     messages=messages,
@@ -1093,7 +1183,7 @@ class Model(ABC):
         stream_model_response: bool = True,
         run_response: Optional[Union[RunOutput, TeamRunOutput]] = None,
         send_media_to_model: bool = True,
-        compression_manager: Optional[
+        compression_manager: Optional["CompressionManager"] = None,
     ) -> Iterator[Union[ModelResponse, RunOutputEvent, TeamRunOutputEvent]]:
         """
         Generate a streaming response from the model.
@@ -1127,10 +1217,17 @@ class Model(ABC):
         _functions = {tool.name: tool for tool in tools if isinstance(tool, Function)} if tools is not None else {}

         _compress_tool_results = compression_manager is not None and compression_manager.compress_tool_results
+        _compression_manager = compression_manager if _compress_tool_results else None

         function_call_count = 0

         while True:
+            # Compress existing tool results BEFORE invoke
+            if _compression_manager is not None and _compression_manager.should_compress(
+                messages, tools, model=self, response_format=response_format
+            ):
+                _compression_manager.compress(messages)
+
             assistant_message = Message(role=self.assistant_message_role)
             # Create assistant message and stream data
             stream_data = MessageData()
@@ -1192,11 +1289,6 @@ class Model(ABC):
                 # Add a function call for each successful execution
                 function_call_count += len(function_call_results)

-                all_messages = messages + function_call_results
-                # Compress tool results
-                if compression_manager and compression_manager.should_compress(all_messages):
-                    compression_manager.compress(all_messages)
-
                 # Format and add results to messages
                 if stream_data and stream_data.extra is not None:
                     self.format_function_call_results(
@@ -1311,7 +1403,7 @@ class Model(ABC):
         stream_model_response: bool = True,
         run_response: Optional[Union[RunOutput, TeamRunOutput]] = None,
         send_media_to_model: bool = True,
-        compression_manager: Optional[
+        compression_manager: Optional["CompressionManager"] = None,
     ) -> AsyncIterator[Union[ModelResponse, RunOutputEvent, TeamRunOutputEvent]]:
         """
         Generate an asynchronous streaming response from the model.
@@ -1345,10 +1437,17 @@ class Model(ABC):
         _functions = {tool.name: tool for tool in tools if isinstance(tool, Function)} if tools is not None else {}

         _compress_tool_results = compression_manager is not None and compression_manager.compress_tool_results
+        _compression_manager = compression_manager if _compress_tool_results else None

         function_call_count = 0

         while True:
+            # Compress existing tool results BEFORE making API call to avoid context overflow
+            if _compression_manager is not None and await _compression_manager.ashould_compress(
+                messages, tools, model=self, response_format=response_format
+            ):
+                await _compression_manager.acompress(messages)
+
             # Create assistant message and stream data
             assistant_message = Message(role=self.assistant_message_role)
             stream_data = MessageData()
@@ -1410,11 +1509,6 @@ class Model(ABC):
                 # Add a function call for each successful execution
                 function_call_count += len(function_call_results)

-                all_messages = messages + function_call_results
-                # Compress tool results
-                if compression_manager and compression_manager.should_compress(all_messages):
-                    await compression_manager.acompress(all_messages)
-
                 # Format and add results to messages
                 if stream_data and stream_data.extra is not None:
                     self.format_function_call_results(
agno/models/google/gemini.py
CHANGED
@@ -19,8 +19,10 @@ from agno.models.message import Citations, Message, UrlCitation
 from agno.models.metrics import Metrics
 from agno.models.response import ModelResponse
 from agno.run.agent import RunOutput
+from agno.tools.function import Function
 from agno.utils.gemini import format_function_definitions, format_image_for_message, prepare_response_schema
 from agno.utils.log import log_debug, log_error, log_info, log_warning
+from agno.utils.tokens import count_schema_tokens, count_text_tokens, count_tool_tokens

 try:
     from google import genai
@@ -310,6 +312,113 @@ class Gemini(Model):
         log_debug(f"Calling {self.provider} with request parameters: {request_params}", log_level=2)
         return request_params

+    def count_tokens(
+        self,
+        messages: List[Message],
+        tools: Optional[List[Union[Function, Dict[str, Any]]]] = None,
+        output_schema: Optional[Union[Dict, Type[BaseModel]]] = None,
+    ) -> int:
+        contents, system_instruction = self._format_messages(messages, compress_tool_results=True)
+        schema_tokens = count_schema_tokens(output_schema, self.id)
+
+        if self.vertexai:
+            # VertexAI supports full token counting with system_instruction and tools
+            config: Dict[str, Any] = {}
+            if system_instruction:
+                config["system_instruction"] = system_instruction
+            if tools:
+                formatted_tools = self._format_tools(tools)
+                gemini_tools = format_function_definitions(formatted_tools)
+                if gemini_tools:
+                    config["tools"] = [gemini_tools]
+
+            response = self.get_client().models.count_tokens(
+                model=self.id,
+                contents=contents,
+                config=config if config else None,  # type: ignore
+            )
+            return (response.total_tokens or 0) + schema_tokens
+        else:
+            # Google AI Studio: Use API for content tokens + local estimation for system/tools
+            # The API doesn't support system_instruction or tools in config, so we use a hybrid approach:
+            # 1. Get accurate token count for contents (text + multimodal) from API
+            # 2. Add estimated tokens for system_instruction and tools locally
+            try:
+                response = self.get_client().models.count_tokens(
+                    model=self.id,
+                    contents=contents,
+                )
+                total = response.total_tokens or 0
+            except Exception as e:
+                log_warning(f"Gemini count_tokens API failed: {e}. Falling back to tiktoken-based estimation.")
+                return super().count_tokens(messages, tools, output_schema)
+
+            # Add estimated tokens for system instruction (not supported by Google AI Studio API)
+            if system_instruction:
+                system_text = system_instruction if isinstance(system_instruction, str) else str(system_instruction)
+                total += count_text_tokens(system_text, self.id)
+
+            # Add estimated tokens for tools (not supported by Google AI Studio API)
+            if tools:
+                total += count_tool_tokens(tools, self.id)
+
+            # Add estimated tokens for response_format/output_schema
+            total += schema_tokens
+
+            return total
+
+    async def acount_tokens(
+        self,
+        messages: List[Message],
+        tools: Optional[List[Union[Function, Dict[str, Any]]]] = None,
+        output_schema: Optional[Union[Dict, Type[BaseModel]]] = None,
+    ) -> int:
+        contents, system_instruction = self._format_messages(messages, compress_tool_results=True)
+        schema_tokens = count_schema_tokens(output_schema, self.id)
+
+        # VertexAI supports full token counting with system_instruction and tools
+        if self.vertexai:
+            config: Dict[str, Any] = {}
+            if system_instruction:
+                config["system_instruction"] = system_instruction
+            if tools:
+                formatted_tools = self._format_tools(tools)
+                gemini_tools = format_function_definitions(formatted_tools)
+                if gemini_tools:
+                    config["tools"] = [gemini_tools]
+
+            response = await self.get_client().aio.models.count_tokens(
+                model=self.id,
+                contents=contents,
+                config=config if config else None,  # type: ignore
+            )
+            return (response.total_tokens or 0) + schema_tokens
+        else:
+            # Hybrid approach - Google AI Studio does not support system_instruction or tools in config
+            try:
+                response = await self.get_client().aio.models.count_tokens(
+                    model=self.id,
+                    contents=contents,
+                )
+                total = response.total_tokens or 0
+            except Exception as e:
+                log_warning(f"Gemini count_tokens API failed: {e}. Falling back to tiktoken-based estimation.")
+                return await super().acount_tokens(messages, tools, output_schema)

+            # Add estimated tokens for system instruction
+            if system_instruction:
+                system_text = system_instruction if isinstance(system_instruction, str) else str(system_instruction)
+                total += count_text_tokens(system_text, self.id)
+
+            # Add estimated tokens for tools
+            if tools:
+                total += count_tool_tokens(tools, self.id)
+
+            # Add estimated tokens for response_format/output_schema
+            total += schema_tokens
+
+            return total
+
     def invoke(
         self,
         messages: List[Message],
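Gemini overrides the new hooks with a split strategy: on Vertex AI the `count_tokens` endpoint accepts the system instruction and tools directly, while on Google AI Studio only the contents are counted server-side and the system instruction, tools, and output schema are estimated locally, with a full fallback to the base tiktoken-style estimate if the API call fails. A hedged usage sketch; the model id and schema below are illustrative assumptions:

```python
from pydantic import BaseModel

from agno.models.google import Gemini  # assumed import path, for illustration only
from agno.models.message import Message


class Answer(BaseModel):
    summary: str


gemini = Gemini(id="gemini-2.0-flash")  # hypothetical model id
messages = [Message(role="user", content="Summarize this changelog.")]

# Server-side count for the contents, plus local estimates for the schema (and,
# on Google AI Studio, for the system instruction and tools as well).
total = gemini.count_tokens(messages, tools=None, output_schema=Answer)
print(total)
```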
@@ -319,7 +428,7 @@ class Gemini(Model):
         tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
         run_response: Optional[RunOutput] = None,
         compress_tool_results: bool = False,
-
+        retry_with_guidance: bool = False,
     ) -> ModelResponse:
         """
         Invokes the model with a list of messages and returns the response.
@@ -341,12 +450,12 @@ class Gemini(Model):
         assistant_message.metrics.stop_timer()

         model_response = self._parse_provider_response(
-            provider_response, response_format=response_format,
+            provider_response, response_format=response_format, retry_with_guidance=retry_with_guidance
         )

         # If we were retrying the invoke with guidance, remove the guidance message
-        if
-        self.
+        if retry_with_guidance is True:
+            self._remove_temporary_messages(messages)

         return model_response

@@ -374,7 +483,7 @@ class Gemini(Model):
         tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
         run_response: Optional[RunOutput] = None,
         compress_tool_results: bool = False,
-
+        retry_with_guidance: bool = False,
     ) -> Iterator[ModelResponse]:
         """
         Invokes the model with a list of messages and returns the response as a stream.
@@ -394,11 +503,11 @@ class Gemini(Model):
             contents=formatted_messages,
             **request_kwargs,
         ):
-            yield self._parse_provider_response_delta(response,
+            yield self._parse_provider_response_delta(response, retry_with_guidance=retry_with_guidance)

         # If we were retrying the invoke with guidance, remove the guidance message
-        if
-        self.
+        if retry_with_guidance is True:
+            self._remove_temporary_messages(messages)

         assistant_message.metrics.stop_timer()

@@ -425,7 +534,7 @@ class Gemini(Model):
         tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
         run_response: Optional[RunOutput] = None,
         compress_tool_results: bool = False,
-
+        retry_with_guidance: bool = False,
     ) -> ModelResponse:
         """
         Invokes the model with a list of messages and returns the response.
@@ -449,12 +558,12 @@ class Gemini(Model):
         assistant_message.metrics.stop_timer()

         model_response = self._parse_provider_response(
-            provider_response, response_format=response_format,
+            provider_response, response_format=response_format, retry_with_guidance=retry_with_guidance
         )

         # If we were retrying the invoke with guidance, remove the guidance message
-        if
-        self.
+        if retry_with_guidance is True:
+            self._remove_temporary_messages(messages)

         return model_response

@@ -481,7 +590,7 @@ class Gemini(Model):
         tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
         run_response: Optional[RunOutput] = None,
         compress_tool_results: bool = False,
-
+        retry_with_guidance: bool = False,
     ) -> AsyncIterator[ModelResponse]:
         """
         Invokes the model with a list of messages and returns the response as a stream.
@@ -504,11 +613,11 @@ class Gemini(Model):
             **request_kwargs,
         )
         async for chunk in async_stream:
-            yield self._parse_provider_response_delta(chunk,
+            yield self._parse_provider_response_delta(chunk, retry_with_guidance=retry_with_guidance)

         # If we were retrying the invoke with guidance, remove the guidance message
-        if
-        self.
+        if retry_with_guidance is True:
+            self._remove_temporary_messages(messages)

         assistant_message.metrics.stop_timer()

@@ -874,6 +983,8 @@ class Gemini(Model):
         """
         combined_original_content: List = []
         combined_function_result: List = []
+        tool_names: List[str] = []
+
         message_metrics = Metrics()

         if len(function_call_results) > 0:
@@ -883,13 +994,18 @@ class Gemini(Model):
                 combined_function_result.append(
                     {"tool_call_id": result.tool_call_id, "tool_name": result.tool_name, "content": compressed_content}
                 )
+                if result.tool_name:
+                    tool_names.append(result.tool_name)
                 message_metrics += result.metrics

+        tool_name = ", ".join(tool_names) if tool_names else None
+
         if combined_original_content:
             messages.append(
                 Message(
                     role="tool",
                     content=combined_original_content,
+                    tool_name=tool_name,
                     tool_calls=combined_function_result,
                     metrics=message_metrics,
                 )
@@ -915,11 +1031,11 @@ class Gemini(Model):
             # Raise if the request failed because of a malformed function call
             if hasattr(candidate, "finish_reason") and candidate.finish_reason:
                 if candidate.finish_reason == GeminiFinishReason.MALFORMED_FUNCTION_CALL.value:
-                    # We only want to raise errors that trigger regeneration attempts once
-                    if kwargs.get("retrying_with_guidance") is True:
-                        pass
                     if self.retry_with_guidance:
-                        raise RetryableModelProviderError(
+                        raise RetryableModelProviderError(
+                            retry_guidance_message=MALFORMED_FUNCTION_CALL_GUIDANCE,
+                            original_error=f"Generation ended with finish reason: {candidate.finish_reason}",
+                        )

             if candidate.content:
                 response_message = candidate.content
@@ -1079,9 +1195,11 @@ class Gemini(Model):
             # Raise if the request failed because of a malformed function call
             if hasattr(candidate, "finish_reason") and candidate.finish_reason:
                 if candidate.finish_reason == GeminiFinishReason.MALFORMED_FUNCTION_CALL.value:
-                    if
-
-
+                    if self.retry_with_guidance:
+                        raise RetryableModelProviderError(
+                            retry_guidance_message=MALFORMED_FUNCTION_CALL_GUIDANCE,
+                            original_error=f"Generation ended with finish reason: {candidate.finish_reason}",
+                        )

         response_message: Content = Content(role="model", parts=[])
         if candidate_content is not None:
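Besides threading `retry_with_guidance` through every invoke path and raising `RetryableModelProviderError` for a `MALFORMED_FUNCTION_CALL` finish reason whenever `retry_with_guidance` is enabled, these hunks also label the combined Gemini tool message with the names of all tool results it contains. A minimal sketch of that aggregation (the `ToolResult` type is a hypothetical stand-in):

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class ToolResult:
    tool_call_id: str
    tool_name: Optional[str]
    content: str


def combined_tool_name(results: List[ToolResult]) -> Optional[str]:
    # Collect the non-empty tool names and join them for the combined message.
    names = [r.tool_name for r in results if r.tool_name]
    return ", ".join(names) if names else None


print(combined_tool_name([ToolResult("1", "search", "ok"), ToolResult("2", "fetch", "ok")]))
# -> "search, fetch"
```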
agno/models/litellm/chat.py
CHANGED
@@ -10,8 +10,10 @@ from agno.models.message import Message
 from agno.models.metrics import Metrics
 from agno.models.response import ModelResponse
 from agno.run.agent import RunOutput
+from agno.tools.function import Function
 from agno.utils.log import log_debug, log_error, log_warning
 from agno.utils.openai import _format_file_for_message, audio_to_message, images_to_message
+from agno.utils.tokens import count_schema_tokens

 try:
     import litellm
@@ -476,3 +478,26 @@
         metrics.total_tokens = metrics.input_tokens + metrics.output_tokens

         return metrics
+
+    def count_tokens(
+        self,
+        messages: List[Message],
+        tools: Optional[List[Union[Function, Dict[str, Any]]]] = None,
+        response_format: Optional[Union[Dict, Type[BaseModel]]] = None,
+    ) -> int:
+        formatted_messages = self._format_messages(messages, compress_tool_results=True)
+        formatted_tools = self._format_tools(tools) if tools else None
+        tokens = litellm.token_counter(
+            model=self.id,
+            messages=formatted_messages,
+            tools=formatted_tools,  # type: ignore
+        )
+        return tokens + count_schema_tokens(response_format, self.id)
+
+    async def acount_tokens(
+        self,
+        messages: List[Message],
+        tools: Optional[List[Union[Function, Dict[str, Any]]]] = None,
+        response_format: Optional[Union[Dict, Type[BaseModel]]] = None,
+    ) -> int:
+        return self.count_tokens(messages, tools, response_format)