agno 2.2.13__py3-none-any.whl → 2.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (383)
  1. agno/agent/__init__.py +6 -0
  2. agno/agent/agent.py +5252 -3145
  3. agno/agent/remote.py +525 -0
  4. agno/api/api.py +2 -0
  5. agno/client/__init__.py +3 -0
  6. agno/client/a2a/__init__.py +10 -0
  7. agno/client/a2a/client.py +554 -0
  8. agno/client/a2a/schemas.py +112 -0
  9. agno/client/a2a/utils.py +369 -0
  10. agno/client/os.py +2669 -0
  11. agno/compression/__init__.py +3 -0
  12. agno/compression/manager.py +247 -0
  13. agno/culture/manager.py +2 -2
  14. agno/db/base.py +927 -6
  15. agno/db/dynamo/dynamo.py +788 -2
  16. agno/db/dynamo/schemas.py +128 -0
  17. agno/db/dynamo/utils.py +26 -3
  18. agno/db/firestore/firestore.py +674 -50
  19. agno/db/firestore/schemas.py +41 -0
  20. agno/db/firestore/utils.py +25 -10
  21. agno/db/gcs_json/gcs_json_db.py +506 -3
  22. agno/db/gcs_json/utils.py +14 -2
  23. agno/db/in_memory/in_memory_db.py +203 -4
  24. agno/db/in_memory/utils.py +14 -2
  25. agno/db/json/json_db.py +498 -2
  26. agno/db/json/utils.py +14 -2
  27. agno/db/migrations/manager.py +199 -0
  28. agno/db/migrations/utils.py +19 -0
  29. agno/db/migrations/v1_to_v2.py +54 -16
  30. agno/db/migrations/versions/__init__.py +0 -0
  31. agno/db/migrations/versions/v2_3_0.py +977 -0
  32. agno/db/mongo/async_mongo.py +1013 -39
  33. agno/db/mongo/mongo.py +684 -4
  34. agno/db/mongo/schemas.py +48 -0
  35. agno/db/mongo/utils.py +17 -0
  36. agno/db/mysql/__init__.py +2 -1
  37. agno/db/mysql/async_mysql.py +2958 -0
  38. agno/db/mysql/mysql.py +722 -53
  39. agno/db/mysql/schemas.py +77 -11
  40. agno/db/mysql/utils.py +151 -8
  41. agno/db/postgres/async_postgres.py +1254 -137
  42. agno/db/postgres/postgres.py +2316 -93
  43. agno/db/postgres/schemas.py +153 -21
  44. agno/db/postgres/utils.py +22 -7
  45. agno/db/redis/redis.py +531 -3
  46. agno/db/redis/schemas.py +36 -0
  47. agno/db/redis/utils.py +31 -15
  48. agno/db/schemas/evals.py +1 -0
  49. agno/db/schemas/memory.py +20 -9
  50. agno/db/singlestore/schemas.py +70 -1
  51. agno/db/singlestore/singlestore.py +737 -74
  52. agno/db/singlestore/utils.py +13 -3
  53. agno/db/sqlite/async_sqlite.py +1069 -89
  54. agno/db/sqlite/schemas.py +133 -1
  55. agno/db/sqlite/sqlite.py +2203 -165
  56. agno/db/sqlite/utils.py +21 -11
  57. agno/db/surrealdb/models.py +25 -0
  58. agno/db/surrealdb/surrealdb.py +603 -1
  59. agno/db/utils.py +60 -0
  60. agno/eval/__init__.py +26 -3
  61. agno/eval/accuracy.py +25 -12
  62. agno/eval/agent_as_judge.py +871 -0
  63. agno/eval/base.py +29 -0
  64. agno/eval/performance.py +10 -4
  65. agno/eval/reliability.py +22 -13
  66. agno/eval/utils.py +2 -1
  67. agno/exceptions.py +42 -0
  68. agno/hooks/__init__.py +3 -0
  69. agno/hooks/decorator.py +164 -0
  70. agno/integrations/discord/client.py +13 -2
  71. agno/knowledge/__init__.py +4 -0
  72. agno/knowledge/chunking/code.py +90 -0
  73. agno/knowledge/chunking/document.py +65 -4
  74. agno/knowledge/chunking/fixed.py +4 -1
  75. agno/knowledge/chunking/markdown.py +102 -11
  76. agno/knowledge/chunking/recursive.py +2 -2
  77. agno/knowledge/chunking/semantic.py +130 -48
  78. agno/knowledge/chunking/strategy.py +18 -0
  79. agno/knowledge/embedder/azure_openai.py +0 -1
  80. agno/knowledge/embedder/google.py +1 -1
  81. agno/knowledge/embedder/mistral.py +1 -1
  82. agno/knowledge/embedder/nebius.py +1 -1
  83. agno/knowledge/embedder/openai.py +16 -12
  84. agno/knowledge/filesystem.py +412 -0
  85. agno/knowledge/knowledge.py +4261 -1199
  86. agno/knowledge/protocol.py +134 -0
  87. agno/knowledge/reader/arxiv_reader.py +3 -2
  88. agno/knowledge/reader/base.py +9 -7
  89. agno/knowledge/reader/csv_reader.py +91 -42
  90. agno/knowledge/reader/docx_reader.py +9 -10
  91. agno/knowledge/reader/excel_reader.py +225 -0
  92. agno/knowledge/reader/field_labeled_csv_reader.py +38 -48
  93. agno/knowledge/reader/firecrawl_reader.py +3 -2
  94. agno/knowledge/reader/json_reader.py +16 -22
  95. agno/knowledge/reader/markdown_reader.py +15 -14
  96. agno/knowledge/reader/pdf_reader.py +33 -28
  97. agno/knowledge/reader/pptx_reader.py +9 -10
  98. agno/knowledge/reader/reader_factory.py +135 -1
  99. agno/knowledge/reader/s3_reader.py +8 -16
  100. agno/knowledge/reader/tavily_reader.py +3 -3
  101. agno/knowledge/reader/text_reader.py +15 -14
  102. agno/knowledge/reader/utils/__init__.py +17 -0
  103. agno/knowledge/reader/utils/spreadsheet.py +114 -0
  104. agno/knowledge/reader/web_search_reader.py +8 -65
  105. agno/knowledge/reader/website_reader.py +16 -13
  106. agno/knowledge/reader/wikipedia_reader.py +36 -3
  107. agno/knowledge/reader/youtube_reader.py +3 -2
  108. agno/knowledge/remote_content/__init__.py +33 -0
  109. agno/knowledge/remote_content/config.py +266 -0
  110. agno/knowledge/remote_content/remote_content.py +105 -17
  111. agno/knowledge/utils.py +76 -22
  112. agno/learn/__init__.py +71 -0
  113. agno/learn/config.py +463 -0
  114. agno/learn/curate.py +185 -0
  115. agno/learn/machine.py +725 -0
  116. agno/learn/schemas.py +1114 -0
  117. agno/learn/stores/__init__.py +38 -0
  118. agno/learn/stores/decision_log.py +1156 -0
  119. agno/learn/stores/entity_memory.py +3275 -0
  120. agno/learn/stores/learned_knowledge.py +1583 -0
  121. agno/learn/stores/protocol.py +117 -0
  122. agno/learn/stores/session_context.py +1217 -0
  123. agno/learn/stores/user_memory.py +1495 -0
  124. agno/learn/stores/user_profile.py +1220 -0
  125. agno/learn/utils.py +209 -0
  126. agno/media.py +22 -6
  127. agno/memory/__init__.py +14 -1
  128. agno/memory/manager.py +223 -8
  129. agno/memory/strategies/__init__.py +15 -0
  130. agno/memory/strategies/base.py +66 -0
  131. agno/memory/strategies/summarize.py +196 -0
  132. agno/memory/strategies/types.py +37 -0
  133. agno/models/aimlapi/aimlapi.py +17 -0
  134. agno/models/anthropic/claude.py +434 -59
  135. agno/models/aws/bedrock.py +121 -20
  136. agno/models/aws/claude.py +131 -274
  137. agno/models/azure/ai_foundry.py +10 -6
  138. agno/models/azure/openai_chat.py +33 -10
  139. agno/models/base.py +1162 -561
  140. agno/models/cerebras/cerebras.py +120 -24
  141. agno/models/cerebras/cerebras_openai.py +21 -2
  142. agno/models/cohere/chat.py +65 -6
  143. agno/models/cometapi/cometapi.py +18 -1
  144. agno/models/dashscope/dashscope.py +2 -3
  145. agno/models/deepinfra/deepinfra.py +18 -1
  146. agno/models/deepseek/deepseek.py +69 -3
  147. agno/models/fireworks/fireworks.py +18 -1
  148. agno/models/google/gemini.py +959 -89
  149. agno/models/google/utils.py +22 -0
  150. agno/models/groq/groq.py +48 -18
  151. agno/models/huggingface/huggingface.py +17 -6
  152. agno/models/ibm/watsonx.py +16 -6
  153. agno/models/internlm/internlm.py +18 -1
  154. agno/models/langdb/langdb.py +13 -1
  155. agno/models/litellm/chat.py +88 -9
  156. agno/models/litellm/litellm_openai.py +18 -1
  157. agno/models/message.py +24 -5
  158. agno/models/meta/llama.py +40 -13
  159. agno/models/meta/llama_openai.py +22 -21
  160. agno/models/metrics.py +12 -0
  161. agno/models/mistral/mistral.py +8 -4
  162. agno/models/n1n/__init__.py +3 -0
  163. agno/models/n1n/n1n.py +57 -0
  164. agno/models/nebius/nebius.py +6 -7
  165. agno/models/nvidia/nvidia.py +20 -3
  166. agno/models/ollama/__init__.py +2 -0
  167. agno/models/ollama/chat.py +17 -6
  168. agno/models/ollama/responses.py +100 -0
  169. agno/models/openai/__init__.py +2 -0
  170. agno/models/openai/chat.py +117 -26
  171. agno/models/openai/open_responses.py +46 -0
  172. agno/models/openai/responses.py +110 -32
  173. agno/models/openrouter/__init__.py +2 -0
  174. agno/models/openrouter/openrouter.py +67 -2
  175. agno/models/openrouter/responses.py +146 -0
  176. agno/models/perplexity/perplexity.py +19 -1
  177. agno/models/portkey/portkey.py +7 -6
  178. agno/models/requesty/requesty.py +19 -2
  179. agno/models/response.py +20 -2
  180. agno/models/sambanova/sambanova.py +20 -3
  181. agno/models/siliconflow/siliconflow.py +19 -2
  182. agno/models/together/together.py +20 -3
  183. agno/models/vercel/v0.py +20 -3
  184. agno/models/vertexai/claude.py +124 -4
  185. agno/models/vllm/vllm.py +19 -14
  186. agno/models/xai/xai.py +19 -2
  187. agno/os/app.py +467 -137
  188. agno/os/auth.py +253 -5
  189. agno/os/config.py +22 -0
  190. agno/os/interfaces/a2a/a2a.py +7 -6
  191. agno/os/interfaces/a2a/router.py +635 -26
  192. agno/os/interfaces/a2a/utils.py +32 -33
  193. agno/os/interfaces/agui/agui.py +5 -3
  194. agno/os/interfaces/agui/router.py +26 -16
  195. agno/os/interfaces/agui/utils.py +97 -57
  196. agno/os/interfaces/base.py +7 -7
  197. agno/os/interfaces/slack/router.py +16 -7
  198. agno/os/interfaces/slack/slack.py +7 -7
  199. agno/os/interfaces/whatsapp/router.py +35 -7
  200. agno/os/interfaces/whatsapp/security.py +3 -1
  201. agno/os/interfaces/whatsapp/whatsapp.py +11 -8
  202. agno/os/managers.py +326 -0
  203. agno/os/mcp.py +652 -79
  204. agno/os/middleware/__init__.py +4 -0
  205. agno/os/middleware/jwt.py +718 -115
  206. agno/os/middleware/trailing_slash.py +27 -0
  207. agno/os/router.py +105 -1558
  208. agno/os/routers/agents/__init__.py +3 -0
  209. agno/os/routers/agents/router.py +655 -0
  210. agno/os/routers/agents/schema.py +288 -0
  211. agno/os/routers/components/__init__.py +3 -0
  212. agno/os/routers/components/components.py +475 -0
  213. agno/os/routers/database.py +155 -0
  214. agno/os/routers/evals/evals.py +111 -18
  215. agno/os/routers/evals/schemas.py +38 -5
  216. agno/os/routers/evals/utils.py +80 -11
  217. agno/os/routers/health.py +3 -3
  218. agno/os/routers/knowledge/knowledge.py +284 -35
  219. agno/os/routers/knowledge/schemas.py +14 -2
  220. agno/os/routers/memory/memory.py +274 -11
  221. agno/os/routers/memory/schemas.py +44 -3
  222. agno/os/routers/metrics/metrics.py +30 -15
  223. agno/os/routers/metrics/schemas.py +10 -6
  224. agno/os/routers/registry/__init__.py +3 -0
  225. agno/os/routers/registry/registry.py +337 -0
  226. agno/os/routers/session/session.py +143 -14
  227. agno/os/routers/teams/__init__.py +3 -0
  228. agno/os/routers/teams/router.py +550 -0
  229. agno/os/routers/teams/schema.py +280 -0
  230. agno/os/routers/traces/__init__.py +3 -0
  231. agno/os/routers/traces/schemas.py +414 -0
  232. agno/os/routers/traces/traces.py +549 -0
  233. agno/os/routers/workflows/__init__.py +3 -0
  234. agno/os/routers/workflows/router.py +757 -0
  235. agno/os/routers/workflows/schema.py +139 -0
  236. agno/os/schema.py +157 -584
  237. agno/os/scopes.py +469 -0
  238. agno/os/settings.py +3 -0
  239. agno/os/utils.py +574 -185
  240. agno/reasoning/anthropic.py +85 -1
  241. agno/reasoning/azure_ai_foundry.py +93 -1
  242. agno/reasoning/deepseek.py +102 -2
  243. agno/reasoning/default.py +6 -7
  244. agno/reasoning/gemini.py +87 -3
  245. agno/reasoning/groq.py +109 -2
  246. agno/reasoning/helpers.py +6 -7
  247. agno/reasoning/manager.py +1238 -0
  248. agno/reasoning/ollama.py +93 -1
  249. agno/reasoning/openai.py +115 -1
  250. agno/reasoning/vertexai.py +85 -1
  251. agno/registry/__init__.py +3 -0
  252. agno/registry/registry.py +68 -0
  253. agno/remote/__init__.py +3 -0
  254. agno/remote/base.py +581 -0
  255. agno/run/__init__.py +2 -4
  256. agno/run/agent.py +134 -19
  257. agno/run/base.py +49 -1
  258. agno/run/cancel.py +65 -52
  259. agno/run/cancellation_management/__init__.py +9 -0
  260. agno/run/cancellation_management/base.py +78 -0
  261. agno/run/cancellation_management/in_memory_cancellation_manager.py +100 -0
  262. agno/run/cancellation_management/redis_cancellation_manager.py +236 -0
  263. agno/run/requirement.py +181 -0
  264. agno/run/team.py +111 -19
  265. agno/run/workflow.py +2 -1
  266. agno/session/agent.py +57 -92
  267. agno/session/summary.py +1 -1
  268. agno/session/team.py +62 -115
  269. agno/session/workflow.py +353 -57
  270. agno/skills/__init__.py +17 -0
  271. agno/skills/agent_skills.py +377 -0
  272. agno/skills/errors.py +32 -0
  273. agno/skills/loaders/__init__.py +4 -0
  274. agno/skills/loaders/base.py +27 -0
  275. agno/skills/loaders/local.py +216 -0
  276. agno/skills/skill.py +65 -0
  277. agno/skills/utils.py +107 -0
  278. agno/skills/validator.py +277 -0
  279. agno/table.py +10 -0
  280. agno/team/__init__.py +5 -1
  281. agno/team/remote.py +447 -0
  282. agno/team/team.py +3769 -2202
  283. agno/tools/brandfetch.py +27 -18
  284. agno/tools/browserbase.py +225 -16
  285. agno/tools/crawl4ai.py +3 -0
  286. agno/tools/duckduckgo.py +25 -71
  287. agno/tools/exa.py +0 -21
  288. agno/tools/file.py +14 -13
  289. agno/tools/file_generation.py +12 -6
  290. agno/tools/firecrawl.py +15 -7
  291. agno/tools/function.py +94 -113
  292. agno/tools/google_bigquery.py +11 -2
  293. agno/tools/google_drive.py +4 -3
  294. agno/tools/knowledge.py +9 -4
  295. agno/tools/mcp/mcp.py +301 -18
  296. agno/tools/mcp/multi_mcp.py +269 -14
  297. agno/tools/mem0.py +11 -10
  298. agno/tools/memory.py +47 -46
  299. agno/tools/mlx_transcribe.py +10 -7
  300. agno/tools/models/nebius.py +5 -5
  301. agno/tools/models_labs.py +20 -10
  302. agno/tools/nano_banana.py +151 -0
  303. agno/tools/parallel.py +0 -7
  304. agno/tools/postgres.py +76 -36
  305. agno/tools/python.py +14 -6
  306. agno/tools/reasoning.py +30 -23
  307. agno/tools/redshift.py +406 -0
  308. agno/tools/shopify.py +1519 -0
  309. agno/tools/spotify.py +919 -0
  310. agno/tools/tavily.py +4 -1
  311. agno/tools/toolkit.py +253 -18
  312. agno/tools/websearch.py +93 -0
  313. agno/tools/website.py +1 -1
  314. agno/tools/wikipedia.py +1 -1
  315. agno/tools/workflow.py +56 -48
  316. agno/tools/yfinance.py +12 -11
  317. agno/tracing/__init__.py +12 -0
  318. agno/tracing/exporter.py +161 -0
  319. agno/tracing/schemas.py +276 -0
  320. agno/tracing/setup.py +112 -0
  321. agno/utils/agent.py +251 -10
  322. agno/utils/cryptography.py +22 -0
  323. agno/utils/dttm.py +33 -0
  324. agno/utils/events.py +264 -7
  325. agno/utils/hooks.py +111 -3
  326. agno/utils/http.py +161 -2
  327. agno/utils/mcp.py +49 -8
  328. agno/utils/media.py +22 -1
  329. agno/utils/models/ai_foundry.py +9 -2
  330. agno/utils/models/claude.py +20 -5
  331. agno/utils/models/cohere.py +9 -2
  332. agno/utils/models/llama.py +9 -2
  333. agno/utils/models/mistral.py +4 -2
  334. agno/utils/os.py +0 -0
  335. agno/utils/print_response/agent.py +99 -16
  336. agno/utils/print_response/team.py +223 -24
  337. agno/utils/print_response/workflow.py +0 -2
  338. agno/utils/prompts.py +8 -6
  339. agno/utils/remote.py +23 -0
  340. agno/utils/response.py +1 -13
  341. agno/utils/string.py +91 -2
  342. agno/utils/team.py +62 -12
  343. agno/utils/tokens.py +657 -0
  344. agno/vectordb/base.py +15 -2
  345. agno/vectordb/cassandra/cassandra.py +1 -1
  346. agno/vectordb/chroma/__init__.py +2 -1
  347. agno/vectordb/chroma/chromadb.py +468 -23
  348. agno/vectordb/clickhouse/clickhousedb.py +1 -1
  349. agno/vectordb/couchbase/couchbase.py +6 -2
  350. agno/vectordb/lancedb/lance_db.py +7 -38
  351. agno/vectordb/lightrag/lightrag.py +7 -6
  352. agno/vectordb/milvus/milvus.py +118 -84
  353. agno/vectordb/mongodb/__init__.py +2 -1
  354. agno/vectordb/mongodb/mongodb.py +14 -31
  355. agno/vectordb/pgvector/pgvector.py +120 -66
  356. agno/vectordb/pineconedb/pineconedb.py +2 -19
  357. agno/vectordb/qdrant/__init__.py +2 -1
  358. agno/vectordb/qdrant/qdrant.py +33 -56
  359. agno/vectordb/redis/__init__.py +2 -1
  360. agno/vectordb/redis/redisdb.py +19 -31
  361. agno/vectordb/singlestore/singlestore.py +17 -9
  362. agno/vectordb/surrealdb/surrealdb.py +2 -38
  363. agno/vectordb/weaviate/__init__.py +2 -1
  364. agno/vectordb/weaviate/weaviate.py +7 -3
  365. agno/workflow/__init__.py +5 -1
  366. agno/workflow/agent.py +2 -2
  367. agno/workflow/condition.py +12 -10
  368. agno/workflow/loop.py +28 -9
  369. agno/workflow/parallel.py +21 -13
  370. agno/workflow/remote.py +362 -0
  371. agno/workflow/router.py +12 -9
  372. agno/workflow/step.py +261 -36
  373. agno/workflow/steps.py +12 -8
  374. agno/workflow/types.py +40 -77
  375. agno/workflow/workflow.py +939 -213
  376. {agno-2.2.13.dist-info → agno-2.4.3.dist-info}/METADATA +134 -181
  377. agno-2.4.3.dist-info/RECORD +677 -0
  378. {agno-2.2.13.dist-info → agno-2.4.3.dist-info}/WHEEL +1 -1
  379. agno/tools/googlesearch.py +0 -98
  380. agno/tools/memori.py +0 -339
  381. agno-2.2.13.dist-info/RECORD +0 -575
  382. {agno-2.2.13.dist-info → agno-2.4.3.dist-info}/licenses/LICENSE +0 -0
  383. {agno-2.2.13.dist-info → agno-2.4.3.dist-info}/top_level.txt +0 -0
agno/models/base.py CHANGED
@@ -5,9 +5,10 @@ from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from hashlib import md5
 from pathlib import Path
-from time import time
+from time import sleep, time
 from types import AsyncGeneratorType, GeneratorType
 from typing import (
+    TYPE_CHECKING,
     Any,
     AsyncIterator,
     Dict,
@@ -15,21 +16,26 @@ from typing import (
     List,
     Literal,
     Optional,
+    Sequence,
     Tuple,
     Type,
     Union,
     get_args,
 )
+
+if TYPE_CHECKING:
+    from agno.compression.manager import CompressionManager
 from uuid import uuid4
 
 from pydantic import BaseModel
 
-from agno.exceptions import AgentRunException
+from agno.exceptions import AgentRunException, ModelProviderError, RetryableModelProviderError
 from agno.media import Audio, File, Image, Video
 from agno.models.message import Citations, Message
 from agno.models.metrics import Metrics
 from agno.models.response import ModelResponse, ModelResponseEvent, ToolExecution
 from agno.run.agent import CustomEvent, RunContentEvent, RunOutput, RunOutputEvent
+from agno.run.requirement import RunRequirement
 from agno.run.team import RunContentEvent as TeamRunContentEvent
 from agno.run.team import TeamRunOutput, TeamRunOutputEvent
 from agno.run.workflow import WorkflowRunOutputEvent
@@ -145,15 +151,284 @@ class Model(ABC):
     cache_ttl: Optional[int] = None
     cache_dir: Optional[str] = None
 
+    # Retry configuration for model provider errors
+    # Number of retries to attempt when a ModelProviderError occurs
+    retries: int = 0
+    # Delay between retries (in seconds)
+    delay_between_retries: int = 1
+    # Exponential backoff: if True, the delay between retries is doubled each time
+    exponential_backoff: bool = False
+    # Enable retrying a model invocation once with a guidance message.
+    # This is useful for known errors avoidable with extra instructions.
+    retry_with_guidance: bool = True
+    # Set the number of times to retry the model invocation with guidance.
+    retry_with_guidance_limit: int = 1
+
     def __post_init__(self):
         if self.provider is None and self.name is not None:
             self.provider = f"{self.name} ({self.id})"
 
+    def _get_retry_delay(self, attempt: int) -> float:
+        """Calculate the delay before the next retry attempt."""
+        if self.exponential_backoff:
+            return self.delay_between_retries * (2**attempt)
+        return self.delay_between_retries
+
+    def _is_retryable_error(self, error: ModelProviderError) -> bool:
+        """Determine if an error is worth retrying.
+
+        Non-retryable errors include:
+        - Client errors (400, 401, 403, 413, 422) that won't change on retry
+        - Context window/token limit exceeded errors
+        - Payload too large errors
+
+        Retryable errors include:
+        - Rate limit errors (429)
+        - Server errors (500, 502, 503, 504)
+
+        Args:
+            error: The ModelProviderError to evaluate.
+
+        Returns:
+            True if the error is transient and worth retrying, False otherwise.
+        """
+        # Non-retryable status codes (client errors that won't change)
+        non_retryable_codes = {400, 401, 403, 404, 413, 422}
+        if error.status_code in non_retryable_codes:
+            return False
+
+        # Non-retryable error message patterns (context/token limits)
+        non_retryable_patterns = [
+            "context_length_exceeded",
+            "context window",
+            "maximum context length",
+            "token limit",
+            "max_tokens",
+            "too many tokens",
+            "payload too large",
+            "content_too_large",
+            "request too large",
+            "input too long",
+            "exceeds the model",
+        ]
+        error_msg = str(error.message).lower()
+        if any(pattern in error_msg for pattern in non_retryable_patterns):
+            return False
+
+        return True
+
+    def _invoke_with_retry(self, **kwargs) -> ModelResponse:
+        """
+        Invoke the model with retry logic for ModelProviderError.
+
+        This method wraps the invoke() call and retries on ModelProviderError
+        with optional exponential backoff.
+        """
+        last_exception: Optional[ModelProviderError] = None
+
+        for attempt in range(self.retries + 1):
+            try:
+                retries_with_guidance_count = kwargs.pop("retries_with_guidance_count", 0)
+                return self.invoke(**kwargs)
+            except ModelProviderError as e:
+                last_exception = e
+                # Check if error is non-retryable
+                if not self._is_retryable_error(e):
+                    log_error(f"Non-retryable model provider error: {e}")
+                    raise
+                if attempt < self.retries:
+                    delay = self._get_retry_delay(attempt)
+                    log_warning(
+                        f"Model provider error (attempt {attempt + 1}/{self.retries + 1}): {e}. Retrying in {delay}s..."
+                    )
+                    sleep(delay)
+                else:
+                    if self.retries > 0:
+                        log_error(f"Model provider error after {self.retries + 1} attempts: {e}")
+            except RetryableModelProviderError as e:
+                current_count = retries_with_guidance_count
+                if current_count >= self.retry_with_guidance_limit:
+                    raise ModelProviderError(
+                        message=f"Max retries with guidance reached. Error: {e.original_error}",
+                        model_name=self.name,
+                        model_id=self.id,
+                    )
+                kwargs.pop("retry_with_guidance", None)
+                kwargs["retries_with_guidance_count"] = current_count + 1
+
+                # Append the guidance message to help the model avoid the error in the next invoke.
+                kwargs["messages"].append(Message(role="user", content=e.retry_guidance_message, temporary=True))
+
+                return self._invoke_with_retry(**kwargs, retry_with_guidance=True)
+
+        # If we've exhausted all retries, raise the last exception
+        raise last_exception  # type: ignore
+
+    async def _ainvoke_with_retry(self, **kwargs) -> ModelResponse:
+        """
+        Asynchronously invoke the model with retry logic for ModelProviderError.
+
+        This method wraps the ainvoke() call and retries on ModelProviderError
+        with optional exponential backoff.
+        """
+        last_exception: Optional[ModelProviderError] = None
+
+        for attempt in range(self.retries + 1):
+            try:
+                retries_with_guidance_count = kwargs.pop("retries_with_guidance_count", 0)
+                return await self.ainvoke(**kwargs)
+            except ModelProviderError as e:
+                last_exception = e
+                # Check if error is non-retryable
+                if not self._is_retryable_error(e):
+                    log_error(f"Non-retryable model provider error: {e}")
+                    raise
+                if attempt < self.retries:
+                    delay = self._get_retry_delay(attempt)
+                    log_warning(
+                        f"Model provider error (attempt {attempt + 1}/{self.retries + 1}): {e}. Retrying in {delay}s..."
+                    )
+                    await asyncio.sleep(delay)
+                else:
+                    if self.retries > 0:
+                        log_error(f"Model provider error after {self.retries + 1} attempts: {e}")
+            except RetryableModelProviderError as e:
+                current_count = retries_with_guidance_count
+                if current_count >= self.retry_with_guidance_limit:
+                    raise ModelProviderError(
+                        message=f"Max retries with guidance reached. Error: {e.original_error}",
+                        model_name=self.name,
+                        model_id=self.id,
+                    )
+
+                kwargs.pop("retry_with_guidance", None)
+                kwargs["retries_with_guidance_count"] = current_count + 1
+
+                # Append the guidance message to help the model avoid the error in the next invoke.
+                kwargs["messages"].append(Message(role="user", content=e.retry_guidance_message, temporary=True))
+
+                return await self._ainvoke_with_retry(**kwargs, retry_with_guidance=True)
+
+        # If we've exhausted all retries, raise the last exception
+        raise last_exception  # type: ignore
+
+    def _invoke_stream_with_retry(self, **kwargs) -> Iterator[ModelResponse]:
+        """
+        Invoke the model stream with retry logic for ModelProviderError.
+
+        This method wraps the invoke_stream() call and retries on ModelProviderError
+        with optional exponential backoff. Note that retries restart the entire stream.
+        """
+        last_exception: Optional[ModelProviderError] = None
+
+        for attempt in range(self.retries + 1):
+            try:
+                retries_with_guidance_count = kwargs.pop("retries_with_guidance_count", 0)
+                yield from self.invoke_stream(**kwargs)
+                return  # Success, exit the retry loop
+            except ModelProviderError as e:
+                last_exception = e
+                # Check if error is non-retryable (e.g., context window exceeded, auth errors)
+                if not self._is_retryable_error(e):
+                    log_error(f"Non-retryable model provider error: {e}")
+                    raise
+                if attempt < self.retries:
+                    delay = self._get_retry_delay(attempt)
+                    log_warning(
+                        f"Model provider error during stream (attempt {attempt + 1}/{self.retries + 1}): {e}. "
+                        f"Retrying in {delay}s..."
+                    )
+                    sleep(delay)
+                else:
+                    if self.retries > 0:
+                        log_error(f"Model provider error after {self.retries + 1} attempts: {e}")
+            except RetryableModelProviderError as e:
+                current_count = retries_with_guidance_count
+                if current_count >= self.retry_with_guidance_limit:
+                    raise ModelProviderError(
+                        message=f"Max retries with guidance reached. Error: {e.original_error}",
+                        model_name=self.name,
+                        model_id=self.id,
+                    )
+
+                kwargs.pop("retry_with_guidance", None)
+                kwargs["retries_with_guidance_count"] = current_count + 1
+
+                # Append the guidance message to help the model avoid the error in the next invoke.
+                kwargs["messages"].append(Message(role="user", content=e.retry_guidance_message, temporary=True))
+
+                yield from self._invoke_stream_with_retry(**kwargs, retry_with_guidance=True)
+                return  # Success, exit after regeneration
+
+        # If we've exhausted all retries, raise the last exception
+        raise last_exception  # type: ignore
+
+    async def _ainvoke_stream_with_retry(self, **kwargs) -> AsyncIterator[ModelResponse]:
+        """
+        Asynchronously invoke the model stream with retry logic for ModelProviderError.
+
+        This method wraps the ainvoke_stream() call and retries on ModelProviderError
+        with optional exponential backoff. Note that retries restart the entire stream.
+        """
+        last_exception: Optional[ModelProviderError] = None
+
+        for attempt in range(self.retries + 1):
+            try:
+                retries_with_guidance_count = kwargs.pop("retries_with_guidance_count", 0)
+                async for response in self.ainvoke_stream(**kwargs):
+                    yield response
+                return  # Success, exit the retry loop
+            except ModelProviderError as e:
+                last_exception = e
+                # Check if error is non-retryable
+                if not self._is_retryable_error(e):
+                    log_error(f"Non-retryable model provider error: {e}")
+                    raise
+                if attempt < self.retries:
+                    delay = self._get_retry_delay(attempt)
+                    log_warning(
+                        f"Model provider error during stream (attempt {attempt + 1}/{self.retries + 1}): {e}. "
+                        f"Retrying in {delay}s..."
+                    )
+                    await asyncio.sleep(delay)
+                else:
+                    if self.retries > 0:
+                        log_error(f"Model provider error after {self.retries + 1} attempts: {e}")
+            except RetryableModelProviderError as e:
+                current_count = retries_with_guidance_count
+                if current_count >= self.retry_with_guidance_limit:
+                    raise ModelProviderError(
+                        message=f"Max retries with guidance reached. Error: {e.original_error}",
+                        model_name=self.name,
+                        model_id=self.id,
+                    )
+
+                kwargs.pop("retry_with_guidance", None)
+                kwargs["retries_with_guidance_count"] = current_count + 1
+
+                # Append the guidance message to help the model avoid the error in the next invoke.
+                kwargs["messages"].append(Message(role="user", content=e.retry_guidance_message, temporary=True))
+
+                async for response in self._ainvoke_stream_with_retry(**kwargs, retry_with_guidance=True):
+                    yield response
+                return  # Success, exit after regeneration
+
+        # If we've exhausted all retries, raise the last exception
+        raise last_exception  # type: ignore
+
     def to_dict(self) -> Dict[str, Any]:
         fields = {"name", "id", "provider"}
         _dict = {field: getattr(self, field) for field in fields if getattr(self, field) is not None}
         return _dict
 
+    def _remove_temporary_messages(self, messages: List[Message]) -> None:
+        """Remove temporary messages from the given list.
+
+        Args:
+            messages: The list of messages to filter (modified in place).
+        """
+        messages[:] = [m for m in messages if not m.temporary]
+
     def get_provider(self) -> str:
         return self.provider or self.name or self.__class__.__name__
 
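Note: the retry knobs above are ordinary dataclass fields on Model, so every provider subclass inherits them. A minimal sketch of enabling them, assuming a provider class such as OpenAIChat accepts the inherited fields as keyword arguments:

# Sketch: opting into transient-error retries. OpenAIChat is illustrative;
# any Model subclass should inherit the same dataclass fields.
from agno.models.openai import OpenAIChat

model = OpenAIChat(
    id="gpt-4o-mini",
    retries=3,                 # up to 3 retries after the first attempt
    delay_between_retries=1,   # base delay in seconds
    exponential_backoff=True,  # waits of 1s, 2s, 4s between attempts
)

Per _is_retryable_error, rate limits (429) and server errors (5xx) are retried, while client errors (400/401/403/404/413/422) and context-window or payload-size failures raise immediately.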
@@ -303,6 +578,29 @@ class Model(ABC):
             _tool_dicts.append(tool)
         return _tool_dicts
 
+    def count_tokens(
+        self,
+        messages: List[Message],
+        tools: Optional[Sequence[Union[Function, Dict[str, Any]]]] = None,
+        output_schema: Optional[Union[Dict, Type[BaseModel]]] = None,
+    ) -> int:
+        from agno.utils.tokens import count_tokens
+
+        return count_tokens(
+            messages,
+            tools=list(tools) if tools else None,
+            model_id=self.id,
+            output_schema=output_schema,
+        )
+
+    async def acount_tokens(
+        self,
+        messages: List[Message],
+        tools: Optional[Sequence[Union[Function, Dict[str, Any]]]] = None,
+        output_schema: Optional[Union[Dict, Type[BaseModel]]] = None,
+    ) -> int:
+        return self.count_tokens(messages, tools, output_schema=output_schema)
+
     def response(
         self,
         messages: List[Message],
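Note: count_tokens delegates to the new agno.utils.tokens module listed above. A quick sketch of estimating prompt size before a run (the model choice is illustrative):

# Sketch: estimating prompt tokens with the new Model.count_tokens helper.
from agno.models.message import Message
from agno.models.openai import OpenAIChat

model = OpenAIChat(id="gpt-4o-mini")
messages = [
    Message(role="system", content="You are a concise assistant."),
    Message(role="user", content="Summarize the plot of Hamlet in two sentences."),
]
# tools and output_schema are optional; both feed into the estimate.
print(model.count_tokens(messages))

acount_tokens simply reuses the synchronous path, so it can be called from async code without a thread hop.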
@@ -312,6 +610,7 @@ class Model(ABC):
         tool_call_limit: Optional[int] = None,
         run_response: Optional[Union[RunOutput, TeamRunOutput]] = None,
         send_media_to_model: bool = True,
+        compression_manager: Optional["CompressionManager"] = None,
     ) -> ModelResponse:
         """
         Generate a response from the model.
@@ -325,155 +624,194 @@ class Model(ABC):
             run_response: Run response to use
             send_media_to_model: Whether to send media to the model
         """
+        try:
+            # Check cache if enabled
+            if self.cache_response:
+                cache_key = self._get_model_cache_key(
+                    messages, stream=False, response_format=response_format, tools=tools
+                )
+                cached_data = self._get_cached_model_response(cache_key)
 
-        # Check cache if enabled
-        if self.cache_response:
-            cache_key = self._get_model_cache_key(messages, stream=False, response_format=response_format, tools=tools)
-            cached_data = self._get_cached_model_response(cache_key)
-
-            if cached_data:
-                log_info("Cache hit for model response")
-                return self._model_response_from_cache(cached_data)
-
-        log_debug(f"{self.get_provider()} Response Start", center=True, symbol="-")
-        log_debug(f"Model: {self.id}", center=True, symbol="-")
-
-        _log_messages(messages)
-        model_response = ModelResponse()
-
-        function_call_count = 0
-
-        _tool_dicts = self._format_tools(tools) if tools is not None else []
-        _functions = {tool.name: tool for tool in tools if isinstance(tool, Function)} if tools is not None else {}
-
-        while True:
-            # Get response from model
-            assistant_message = Message(role=self.assistant_message_role)
-            self._process_model_response(
-                messages=messages,
-                assistant_message=assistant_message,
-                model_response=model_response,
-                response_format=response_format,
-                tools=_tool_dicts,
-                tool_choice=tool_choice or self._tool_choice,
-                run_response=run_response,
-            )
+                if cached_data:
+                    log_info("Cache hit for model response")
+                    return self._model_response_from_cache(cached_data)
 
-            # Add assistant message to messages
-            messages.append(assistant_message)
+            log_debug(f"{self.get_provider()} Response Start", center=True, symbol="-")
+            log_debug(f"Model: {self.id}", center=True, symbol="-")
 
-            # Log response and metrics
-            assistant_message.log(metrics=True)
+            _log_messages(messages)
+            model_response = ModelResponse()
 
-            # Handle tool calls if present
-            if assistant_message.tool_calls:
-                # Prepare function calls
-                function_calls_to_run = self._prepare_function_calls(
-                    assistant_message=assistant_message,
+            function_call_count = 0
+
+            _tool_dicts = self._format_tools(tools) if tools is not None else []
+            _functions = {tool.name: tool for tool in tools if isinstance(tool, Function)} if tools is not None else {}
+
+            _compress_tool_results = compression_manager is not None and compression_manager.compress_tool_results
+            _compression_manager = compression_manager if _compress_tool_results else None
+
+            while True:
+                # Compress tool results if compression is enabled and threshold is met
+                if _compression_manager is not None and _compression_manager.should_compress(
+                    messages, tools, model=self, response_format=response_format
+                ):
+                    _compression_manager.compress(messages)
+
+                # Get response from model
+                assistant_message = Message(role=self.assistant_message_role)
+                self._process_model_response(
                     messages=messages,
+                    assistant_message=assistant_message,
                     model_response=model_response,
-                    functions=_functions,
+                    response_format=response_format,
+                    tools=_tool_dicts,
+                    tool_choice=tool_choice or self._tool_choice,
+                    run_response=run_response,
+                    compress_tool_results=_compress_tool_results,
                 )
-                function_call_results: List[Message] = []
-
-                # Execute function calls
-                for function_call_response in self.run_function_calls(
-                    function_calls=function_calls_to_run,
-                    function_call_results=function_call_results,
-                    current_function_call_count=function_call_count,
-                    function_call_limit=tool_call_limit,
-                ):
-                    if isinstance(function_call_response, ModelResponse):
-                        # The session state is updated by the function call
-                        if function_call_response.updated_session_state is not None:
-                            model_response.updated_session_state = function_call_response.updated_session_state
-
-                        # Media artifacts are generated by the function call
-                        if function_call_response.images is not None:
-                            if model_response.images is None:
-                                model_response.images = []
-                            model_response.images.extend(function_call_response.images)
-
-                        if function_call_response.audios is not None:
-                            if model_response.audios is None:
-                                model_response.audios = []
-                            model_response.audios.extend(function_call_response.audios)
-
-                        if function_call_response.videos is not None:
-                            if model_response.videos is None:
-                                model_response.videos = []
-                            model_response.videos.extend(function_call_response.videos)
-
-                        if function_call_response.files is not None:
-                            if model_response.files is None:
-                                model_response.files = []
-                            model_response.files.extend(function_call_response.files)
-
-                        if (
-                            function_call_response.event
-                            in [
+
+                # Add assistant message to messages
+                messages.append(assistant_message)
+
+                # Log response and metrics
+                assistant_message.log(metrics=True, use_compressed_content=_compress_tool_results)
+
+                # Handle tool calls if present
+                if assistant_message.tool_calls:
+                    # Prepare function calls
+                    function_calls_to_run = self._prepare_function_calls(
+                        assistant_message=assistant_message,
+                        messages=messages,
+                        model_response=model_response,
+                        functions=_functions,
+                    )
+                    function_call_results: List[Message] = []
+
+                    # Execute function calls
+                    for function_call_response in self.run_function_calls(
+                        function_calls=function_calls_to_run,
+                        function_call_results=function_call_results,
+                        current_function_call_count=function_call_count,
+                        function_call_limit=tool_call_limit,
+                    ):
+                        if isinstance(function_call_response, ModelResponse):
+                            # The session state is updated by the function call
+                            if function_call_response.updated_session_state is not None:
+                                model_response.updated_session_state = function_call_response.updated_session_state
+
+                            # Media artifacts are generated by the function call
+                            if function_call_response.images is not None:
+                                if model_response.images is None:
+                                    model_response.images = []
+                                model_response.images.extend(function_call_response.images)
+
+                            if function_call_response.audios is not None:
+                                if model_response.audios is None:
+                                    model_response.audios = []
+                                model_response.audios.extend(function_call_response.audios)
+
+                            if function_call_response.videos is not None:
+                                if model_response.videos is None:
+                                    model_response.videos = []
+                                model_response.videos.extend(function_call_response.videos)
+
+                            if function_call_response.files is not None:
+                                if model_response.files is None:
+                                    model_response.files = []
+                                model_response.files.extend(function_call_response.files)
+
+                            if (
+                                function_call_response.event
+                                in [
+                                    ModelResponseEvent.tool_call_completed.value,
+                                    ModelResponseEvent.tool_call_paused.value,
+                                ]
+                                and function_call_response.tool_executions is not None
+                            ):
+                                # Record the tool execution in the model response
+                                if model_response.tool_executions is None:
+                                    model_response.tool_executions = []
+                                model_response.tool_executions.extend(function_call_response.tool_executions)
+
+                                # If the tool is currently paused (HITL flow), add the requirement to the run response
+                                if (
+                                    function_call_response.event == ModelResponseEvent.tool_call_paused.value
+                                    and run_response is not None
+                                ):
+                                    current_tool_execution = function_call_response.tool_executions[-1]
+                                    if run_response.requirements is None:
+                                        run_response.requirements = []
+                                    run_response.requirements.append(
+                                        RunRequirement(tool_execution=current_tool_execution)
+                                    )
+
+                            elif function_call_response.event not in [
+                                ModelResponseEvent.tool_call_started.value,
                                 ModelResponseEvent.tool_call_completed.value,
-                                ModelResponseEvent.tool_call_paused.value,
-                            ]
-                            and function_call_response.tool_executions is not None
-                        ):
-                            if model_response.tool_executions is None:
-                                model_response.tool_executions = []
-                            model_response.tool_executions.extend(function_call_response.tool_executions)
-
-                        elif function_call_response.event not in [
-                            ModelResponseEvent.tool_call_started.value,
-                            ModelResponseEvent.tool_call_completed.value,
-                        ]:
-                            if function_call_response.content:
-                                model_response.content += function_call_response.content  # type: ignore
-
-                # Add a function call for each successful execution
-                function_call_count += len(function_call_results)
-
-                # Format and add results to messages
-                self.format_function_call_results(
-                    messages=messages, function_call_results=function_call_results, **model_response.extra or {}
-                )
+                            ]:
+                                if function_call_response.content:
+                                    model_response.content += function_call_response.content  # type: ignore
 
-                if any(msg.images or msg.videos or msg.audio or msg.files for msg in function_call_results):
-                    # Handle function call media
-                    self._handle_function_call_media(
+                    # Add a function call for each successful execution
+                    function_call_count += len(function_call_results)
+
+                    # Format and add results to messages
+                    self.format_function_call_results(
                         messages=messages,
                         function_call_results=function_call_results,
-                        send_media_to_model=send_media_to_model,
+                        compress_tool_results=_compress_tool_results,
+                        **model_response.extra or {},
                     )
 
-                for function_call_result in function_call_results:
-                    function_call_result.log(metrics=True)
+                    if any(msg.images or msg.videos or msg.audio or msg.files for msg in function_call_results):
+                        # Handle function call media
+                        self._handle_function_call_media(
+                            messages=messages,
+                            function_call_results=function_call_results,
+                            send_media_to_model=send_media_to_model,
+                        )
+
+                    for function_call_result in function_call_results:
+                        function_call_result.log(metrics=True, use_compressed_content=_compress_tool_results)
 
-                # Check if we should stop after tool calls
-                if any(m.stop_after_tool_call for m in function_call_results):
-                    break
+                    # Check if we should stop after tool calls
+                    if any(m.stop_after_tool_call for m in function_call_results):
+                        break
 
-                # If we have any tool calls that require confirmation, break the loop
-                if any(tc.requires_confirmation for tc in model_response.tool_executions or []):
-                    break
+                    # If we have any tool calls that require confirmation, break the loop
+                    if any(tc.requires_confirmation for tc in model_response.tool_executions or []):
+                        break
 
-                # If we have any tool calls that require external execution, break the loop
-                if any(tc.external_execution_required for tc in model_response.tool_executions or []):
-                    break
+                    # If we have any tool calls that require external execution, break the loop
+                    if any(tc.external_execution_required for tc in model_response.tool_executions or []):
+                        break
 
-                # If we have any tool calls that require user input, break the loop
-                if any(tc.requires_user_input for tc in model_response.tool_executions or []):
-                    break
+                    # If we have any tool calls that require user input, break the loop
+                    if any(tc.requires_user_input for tc in model_response.tool_executions or []):
+                        break
 
-                # Continue loop to get next response
-                continue
+                    # Continue loop to get next response
+                    continue
 
-            # No tool calls or finished processing them
-            break
+                # No tool calls or finished processing them
+                break
 
-        log_debug(f"{self.get_provider()} Response End", center=True, symbol="-")
+            log_debug(f"{self.get_provider()} Response End", center=True, symbol="-")
 
-        # Save to cache if enabled
-        if self.cache_response:
-            self._save_model_response_to_cache(cache_key, model_response, is_streaming=False)
+            # Save to cache if enabled
+            if self.cache_response:
+                self._save_model_response_to_cache(cache_key, model_response, is_streaming=False)
+        finally:
+            # Close the Gemini client
+            if self.__class__.__name__ == "Gemini" and self.client is not None:  # type: ignore
+                try:
+                    self.client.close()  # type: ignore
+                    self.client = None
+                except AttributeError:
+                    log_warning(
+                        "Your Gemini client is outdated. For Agno to properly handle the lifecycle of the client,"
+                        " please upgrade Gemini to the latest version: pip install -U google-genai"
+                    )
 
         return model_response
 
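Note: the loop now consults an optional CompressionManager before every provider call. A sketch of wiring one in; only the members used above (compress_tool_results, should_compress, compress) are confirmed by this diff, and the construction details are assumptions:

# Sketch: passing a CompressionManager so oversized tool results are
# compressed before each invoke. Construction is a hypothetical default;
# see agno/compression/manager.py for the real signature.
from agno.compression.manager import CompressionManager

compression = CompressionManager()        # hypothetical default construction
compression.compress_tool_results = True  # the flag Model.response() checks

model_response = model.response(
    messages=messages,
    compression_manager=compression,  # should_compress()/compress() run each loop turn
)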
@@ -486,157 +824,198 @@ class Model(ABC):
486
824
  tool_call_limit: Optional[int] = None,
487
825
  run_response: Optional[Union[RunOutput, TeamRunOutput]] = None,
488
826
  send_media_to_model: bool = True,
827
+ compression_manager: Optional["CompressionManager"] = None,
489
828
  ) -> ModelResponse:
490
829
  """
491
830
  Generate an asynchronous response from the model.
492
831
  """
493
832
 
494
- # Check cache if enabled
495
- if self.cache_response:
496
- cache_key = self._get_model_cache_key(messages, stream=False, response_format=response_format, tools=tools)
497
- cached_data = self._get_cached_model_response(cache_key)
498
-
499
- if cached_data:
500
- log_info("Cache hit for model response")
501
- return self._model_response_from_cache(cached_data)
502
-
503
- log_debug(f"{self.get_provider()} Async Response Start", center=True, symbol="-")
504
- log_debug(f"Model: {self.id}", center=True, symbol="-")
505
- _log_messages(messages)
506
- model_response = ModelResponse()
507
-
508
- _tool_dicts = self._format_tools(tools) if tools is not None else []
509
- _functions = {tool.name: tool for tool in tools if isinstance(tool, Function)} if tools is not None else {}
510
-
511
- function_call_count = 0
512
-
513
- while True:
514
- # Get response from model
515
- assistant_message = Message(role=self.assistant_message_role)
516
- await self._aprocess_model_response(
517
- messages=messages,
518
- assistant_message=assistant_message,
519
- model_response=model_response,
520
- response_format=response_format,
521
- tools=_tool_dicts,
522
- tool_choice=tool_choice or self._tool_choice,
523
- run_response=run_response,
524
- )
833
+ try:
834
+ # Check cache if enabled
835
+ if self.cache_response:
836
+ cache_key = self._get_model_cache_key(
837
+ messages, stream=False, response_format=response_format, tools=tools
838
+ )
839
+ cached_data = self._get_cached_model_response(cache_key)
525
840
 
526
- # Add assistant message to messages
527
- messages.append(assistant_message)
841
+ if cached_data:
842
+ log_info("Cache hit for model response")
843
+ return self._model_response_from_cache(cached_data)
528
844
 
529
- # Log response and metrics
530
- assistant_message.log(metrics=True)
845
+ log_debug(f"{self.get_provider()} Async Response Start", center=True, symbol="-")
846
+ log_debug(f"Model: {self.id}", center=True, symbol="-")
847
+ _log_messages(messages)
848
+ model_response = ModelResponse()
531
849
 
532
- # Handle tool calls if present
533
- if assistant_message.tool_calls:
534
- # Prepare function calls
535
- function_calls_to_run = self._prepare_function_calls(
536
- assistant_message=assistant_message,
850
+ _tool_dicts = self._format_tools(tools) if tools is not None else []
851
+ _functions = {tool.name: tool for tool in tools if isinstance(tool, Function)} if tools is not None else {}
852
+
853
+ _compress_tool_results = compression_manager is not None and compression_manager.compress_tool_results
854
+ _compression_manager = compression_manager if _compress_tool_results else None
855
+
856
+ function_call_count = 0
857
+
858
+ while True:
859
+ # Compress existing tool results BEFORE making API call to avoid context overflow
860
+ if _compression_manager is not None and await _compression_manager.ashould_compress(
861
+ messages, tools, model=self, response_format=response_format
862
+ ):
863
+ await _compression_manager.acompress(messages)
864
+
865
+ # Get response from model
866
+ assistant_message = Message(role=self.assistant_message_role)
867
+ await self._aprocess_model_response(
537
868
  messages=messages,
869
+ assistant_message=assistant_message,
538
870
  model_response=model_response,
539
- functions=_functions,
871
+ response_format=response_format,
872
+ tools=_tool_dicts,
873
+ tool_choice=tool_choice or self._tool_choice,
874
+ run_response=run_response,
875
+ compress_tool_results=_compress_tool_results,
540
876
  )
541
- function_call_results: List[Message] = []
542
-
543
- # Execute function calls
544
- async for function_call_response in self.arun_function_calls(
545
- function_calls=function_calls_to_run,
546
- function_call_results=function_call_results,
547
- current_function_call_count=function_call_count,
548
- function_call_limit=tool_call_limit,
549
- ):
550
- if isinstance(function_call_response, ModelResponse):
551
- # The session state is updated by the function call
552
- if function_call_response.updated_session_state is not None:
553
- model_response.updated_session_state = function_call_response.updated_session_state
554
-
555
- # Media artifacts are generated by the function call
556
- if function_call_response.images is not None:
557
- if model_response.images is None:
558
- model_response.images = []
559
- model_response.images.extend(function_call_response.images)
560
-
561
- if function_call_response.audios is not None:
562
- if model_response.audios is None:
563
- model_response.audios = []
564
- model_response.audios.extend(function_call_response.audios)
565
-
566
- if function_call_response.videos is not None:
567
- if model_response.videos is None:
568
- model_response.videos = []
569
- model_response.videos.extend(function_call_response.videos)
570
-
571
- if function_call_response.files is not None:
572
- if model_response.files is None:
573
- model_response.files = []
574
- model_response.files.extend(function_call_response.files)
575
-
576
- if (
577
- function_call_response.event
578
- in [
877
+
878
+ # Add assistant message to messages
879
+ messages.append(assistant_message)
880
+
881
+ # Log response and metrics
882
+ assistant_message.log(metrics=True)
883
+
884
+ # Handle tool calls if present
885
+ if assistant_message.tool_calls:
886
+ # Prepare function calls
887
+ function_calls_to_run = self._prepare_function_calls(
888
+ assistant_message=assistant_message,
889
+ messages=messages,
890
+ model_response=model_response,
891
+ functions=_functions,
892
+ )
893
+ function_call_results: List[Message] = []
894
+
895
+ # Execute function calls
896
+ async for function_call_response in self.arun_function_calls(
897
+ function_calls=function_calls_to_run,
898
+ function_call_results=function_call_results,
899
+ current_function_call_count=function_call_count,
900
+ function_call_limit=tool_call_limit,
901
+ ):
902
+ if isinstance(function_call_response, ModelResponse):
903
+ # The session state is updated by the function call
904
+ if function_call_response.updated_session_state is not None:
905
+ model_response.updated_session_state = function_call_response.updated_session_state
906
+
907
+ # Media artifacts are generated by the function call
908
+ if function_call_response.images is not None:
909
+ if model_response.images is None:
910
+ model_response.images = []
911
+ model_response.images.extend(function_call_response.images)
912
+
913
+ if function_call_response.audios is not None:
914
+ if model_response.audios is None:
915
+ model_response.audios = []
916
+ model_response.audios.extend(function_call_response.audios)
917
+
918
+ if function_call_response.videos is not None:
919
+ if model_response.videos is None:
920
+ model_response.videos = []
921
+ model_response.videos.extend(function_call_response.videos)
922
+
923
+ if function_call_response.files is not None:
924
+ if model_response.files is None:
925
+ model_response.files = []
926
+ model_response.files.extend(function_call_response.files)
927
+
928
+ if (
929
+ function_call_response.event
930
+ in [
931
+ ModelResponseEvent.tool_call_completed.value,
932
+ ModelResponseEvent.tool_call_paused.value,
933
+ ]
934
+ and function_call_response.tool_executions is not None
935
+ ):
936
+ if model_response.tool_executions is None:
937
+ model_response.tool_executions = []
938
+ model_response.tool_executions.extend(function_call_response.tool_executions)
939
+
940
+ # If the tool is currently paused (HITL flow), add the requirement to the run response
941
+ if (
942
+ function_call_response.event == ModelResponseEvent.tool_call_paused.value
943
+ and run_response is not None
944
+ ):
945
+ current_tool_execution = function_call_response.tool_executions[-1]
946
+ if run_response.requirements is None:
947
+ run_response.requirements = []
948
+ run_response.requirements.append(
949
+ RunRequirement(tool_execution=current_tool_execution)
950
+ )
951
+
952
+ elif function_call_response.event not in [
953
+ ModelResponseEvent.tool_call_started.value,
579
954
  ModelResponseEvent.tool_call_completed.value,
580
- ModelResponseEvent.tool_call_paused.value,
581
- ]
582
- and function_call_response.tool_executions is not None
583
- ):
584
- if model_response.tool_executions is None:
585
- model_response.tool_executions = []
586
- model_response.tool_executions.extend(function_call_response.tool_executions)
587
- elif function_call_response.event not in [
588
-                        ModelResponseEvent.tool_call_started.value,
-                        ModelResponseEvent.tool_call_completed.value,
-                    ]:
-                        if function_call_response.content:
-                            model_response.content += function_call_response.content  # type: ignore
-
-                # Add a function call for each successful execution
-                function_call_count += len(function_call_results)
-
-                # Format and add results to messages
-                self.format_function_call_results(
-                    messages=messages, function_call_results=function_call_results, **model_response.extra or {}
-                )
+                        ]:
+                            if function_call_response.content:
+                                model_response.content += function_call_response.content  # type: ignore
 
-                if any(msg.images or msg.videos or msg.audio or msg.files for msg in function_call_results):
-                    # Handle function call media
-                    self._handle_function_call_media(
+                    # Add a function call for each successful execution
+                    function_call_count += len(function_call_results)
+
+                    # Format and add results to messages
+                    self.format_function_call_results(
                         messages=messages,
                         function_call_results=function_call_results,
-                        send_media_to_model=send_media_to_model,
+                        compress_tool_results=_compress_tool_results,
+                        **model_response.extra or {},
                     )
 
-                for function_call_result in function_call_results:
-                    function_call_result.log(metrics=True)
+                    if any(msg.images or msg.videos or msg.audio or msg.files for msg in function_call_results):
+                        # Handle function call media
+                        self._handle_function_call_media(
+                            messages=messages,
+                            function_call_results=function_call_results,
+                            send_media_to_model=send_media_to_model,
+                        )
 
-                # Check if we should stop after tool calls
-                if any(m.stop_after_tool_call for m in function_call_results):
-                    break
+                    for function_call_result in function_call_results:
+                        function_call_result.log(metrics=True, use_compressed_content=_compress_tool_results)
 
-                # If we have any tool calls that require confirmation, break the loop
-                if any(tc.requires_confirmation for tc in model_response.tool_executions or []):
-                    break
+                    # Check if we should stop after tool calls
+                    if any(m.stop_after_tool_call for m in function_call_results):
+                        break
 
-                # If we have any tool calls that require external execution, break the loop
-                if any(tc.external_execution_required for tc in model_response.tool_executions or []):
-                    break
+                    # If we have any tool calls that require confirmation, break the loop
+                    if any(tc.requires_confirmation for tc in model_response.tool_executions or []):
+                        break
 
-                # If we have any tool calls that require user input, break the loop
-                if any(tc.requires_user_input for tc in model_response.tool_executions or []):
-                    break
+                    # If we have any tool calls that require external execution, break the loop
+                    if any(tc.external_execution_required for tc in model_response.tool_executions or []):
+                        break
 
-                # Continue loop to get next response
-                continue
+                    # If we have any tool calls that require user input, break the loop
+                    if any(tc.requires_user_input for tc in model_response.tool_executions or []):
+                        break
 
-            # No tool calls or finished processing them
-            break
+                    # Continue loop to get next response
+                    continue
 
-        log_debug(f"{self.get_provider()} Async Response End", center=True, symbol="-")
+                # No tool calls or finished processing them
+                break
 
-        # Save to cache if enabled
-        if self.cache_response:
-            self._save_model_response_to_cache(cache_key, model_response, is_streaming=False)
+            log_debug(f"{self.get_provider()} Async Response End", center=True, symbol="-")
+
+            # Save to cache if enabled
+            if self.cache_response:
+                self._save_model_response_to_cache(cache_key, model_response, is_streaming=False)
+        finally:
+            # Close the Gemini client
+            if self.__class__.__name__ == "Gemini" and self.client is not None:
+                try:
+                    await self.client.aio.aclose()  # type: ignore
+                    self.client = None
+                except AttributeError:
+                    log_warning(
+                        "Your Gemini client is outdated. For Agno to properly handle the lifecycle of the client,"
+                        " please upgrade Gemini to the latest version: pip install -U google-genai"
+                    )
 
         return model_response
 
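The `try`/`finally` added above ties the Gemini client's lifetime to the request: the client is closed after each call, and older `google-genai` SDKs that lack `aclose()` produce a warning instead of a crash. A rough sketch of the same pattern in isolation (the `make_client` factory is hypothetical; agno keeps the client on `self.client`):

```python
# Sketch: close an async google-genai client after use, tolerating old SDKs.
from google import genai

async def call_and_cleanup(contents: str):
    client = genai.Client()  # hypothetical standalone client; agno reuses self.client
    try:
        return await client.aio.models.generate_content(
            model="gemini-2.0-flash", contents=contents
        )
    finally:
        try:
            await client.aio.aclose()  # present in recent google-genai releases
        except AttributeError:
            print("google-genai too old to close explicitly; upgrade: pip install -U google-genai")
```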
@@ -649,6 +1028,7 @@ class Model(ABC):
         tools: Optional[List[Dict[str, Any]]] = None,
         tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
         run_response: Optional[Union[RunOutput, TeamRunOutput]] = None,
+        compress_tool_results: bool = False,
     ) -> None:
         """
         Process a single model response and return the assistant message and whether to continue.
@@ -656,14 +1036,15 @@ class Model(ABC):
         Returns:
             Tuple[Message, bool]: (assistant_message, should_continue)
         """
-        # Generate response
-        provider_response = self.invoke(
+        # Generate response with retry logic for ModelProviderError
+        provider_response = self._invoke_with_retry(
             assistant_message=assistant_message,
             messages=messages,
             response_format=response_format,
             tools=tools,
             tool_choice=tool_choice or self._tool_choice,
             run_response=run_response,
+            compress_tool_results=compress_tool_results,
         )
 
         # Populate the assistant message
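The bare `invoke` call is now routed through `_invoke_with_retry`, so transient `ModelProviderError`s no longer fail the run on the first attempt. The wrapper's implementation is outside this hunk; a minimal sketch of what such a wrapper typically looks like (attempt count and backoff values here are illustrative assumptions, not agno's actual defaults):

```python
import time

class ModelProviderError(Exception):
    """Stand-in for agno.exceptions.ModelProviderError."""

def invoke_with_retry(invoke, *, max_attempts: int = 3, base_delay: float = 1.0, **kwargs):
    # Retry the provider call on ModelProviderError with exponential backoff.
    for attempt in range(1, max_attempts + 1):
        try:
            return invoke(**kwargs)
        except ModelProviderError:
            if attempt == max_attempts:
                raise  # out of attempts; surface the provider error
            time.sleep(base_delay * 2 ** (attempt - 1))
```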
@@ -694,6 +1075,8 @@ class Model(ABC):
                 model_response.extra.update(provider_response.extra)
             if provider_response.provider_data is not None:
                 model_response.provider_data = provider_response.provider_data
+            if provider_response.response_usage is not None:
+                model_response.response_usage = provider_response.response_usage
 
     async def _aprocess_model_response(
         self,
@@ -704,6 +1087,7 @@ class Model(ABC):
         tools: Optional[List[Dict[str, Any]]] = None,
         tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
         run_response: Optional[Union[RunOutput, TeamRunOutput]] = None,
+        compress_tool_results: bool = False,
     ) -> None:
         """
         Process a single async model response and return the assistant message and whether to continue.
@@ -711,14 +1095,15 @@ class Model(ABC):
         Returns:
             Tuple[Message, bool]: (assistant_message, should_continue)
         """
-        # Generate response
-        provider_response = await self.ainvoke(
+        # Generate response with retry logic for ModelProviderError
+        provider_response = await self._ainvoke_with_retry(
             messages=messages,
             response_format=response_format,
             tools=tools,
             tool_choice=tool_choice or self._tool_choice,
             assistant_message=assistant_message,
             run_response=run_response,
+            compress_tool_results=compress_tool_results,
         )
 
         # Populate the assistant message
@@ -749,6 +1134,8 @@ class Model(ABC):
                 model_response.extra.update(provider_response.extra)
             if provider_response.provider_data is not None:
                 model_response.provider_data = provider_response.provider_data
+            if provider_response.response_usage is not None:
+                model_response.response_usage = provider_response.response_usage
 
     def _populate_assistant_message(
         self,
@@ -829,18 +1216,20 @@ class Model(ABC):
         tools: Optional[List[Dict[str, Any]]] = None,
         tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
         run_response: Optional[Union[RunOutput, TeamRunOutput]] = None,
+        compress_tool_results: bool = False,
     ) -> Iterator[ModelResponse]:
         """
-        Process a streaming response from the model.
+        Process a streaming response from the model with retry logic for ModelProviderError.
         """
 
-        for response_delta in self.invoke_stream(
+        for response_delta in self._invoke_stream_with_retry(
             messages=messages,
             assistant_message=assistant_message,
             response_format=response_format,
             tools=tools,
             tool_choice=tool_choice or self._tool_choice,
             run_response=run_response,
+            compress_tool_results=compress_tool_results,
         ):
             for model_response_delta in self._populate_stream_data(
                 stream_data=stream_data,
@@ -861,147 +1250,207 @@ class Model(ABC):
         stream_model_response: bool = True,
         run_response: Optional[Union[RunOutput, TeamRunOutput]] = None,
         send_media_to_model: bool = True,
+        compression_manager: Optional["CompressionManager"] = None,
     ) -> Iterator[Union[ModelResponse, RunOutputEvent, TeamRunOutputEvent]]:
         """
         Generate a streaming response from the model.
         """
+        try:
+            # Check cache if enabled - capture key BEFORE streaming to avoid mismatch
+            cache_key = None
+            if self.cache_response:
+                cache_key = self._get_model_cache_key(
+                    messages, stream=True, response_format=response_format, tools=tools
+                )
+                cached_data = self._get_cached_model_response(cache_key)
 
-        # Check cache if enabled - capture key BEFORE streaming to avoid mismatch
-        cache_key = None
-        if self.cache_response:
-            cache_key = self._get_model_cache_key(messages, stream=True, response_format=response_format, tools=tools)
-            cached_data = self._get_cached_model_response(cache_key)
+                if cached_data:
+                    log_info("Cache hit for streaming model response")
+                    # Yield cached responses
+                    for response in self._streaming_responses_from_cache(cached_data["streaming_responses"]):
+                        yield response
+                    return
 
-            if cached_data:
-                log_info("Cache hit for streaming model response")
-                # Yield cached responses
-                for response in self._streaming_responses_from_cache(cached_data["streaming_responses"]):
-                    yield response
-                return
+            log_info("Cache miss for streaming model response")
 
-            log_info("Cache miss for streaming model response")
+            # Track streaming responses for caching
+            streaming_responses: List[ModelResponse] = []
 
-        # Track streaming responses for caching
-        streaming_responses: List[ModelResponse] = []
+            log_debug(f"{self.get_provider()} Response Stream Start", center=True, symbol="-")
+            log_debug(f"Model: {self.id}", center=True, symbol="-")
+            _log_messages(messages)
 
-        log_debug(f"{self.get_provider()} Response Stream Start", center=True, symbol="-")
-        log_debug(f"Model: {self.id}", center=True, symbol="-")
-        _log_messages(messages)
+            _tool_dicts = self._format_tools(tools) if tools is not None else []
+            _functions = {tool.name: tool for tool in tools if isinstance(tool, Function)} if tools is not None else {}
 
-        _tool_dicts = self._format_tools(tools) if tools is not None else []
-        _functions = {tool.name: tool for tool in tools if isinstance(tool, Function)} if tools is not None else {}
+            _compress_tool_results = compression_manager is not None and compression_manager.compress_tool_results
+            _compression_manager = compression_manager if _compress_tool_results else None
 
-        function_call_count = 0
+            function_call_count = 0
 
-        while True:
-            assistant_message = Message(role=self.assistant_message_role)
-            # Create assistant message and stream data
-            stream_data = MessageData()
-            model_response = ModelResponse()
-            if stream_model_response:
-                # Generate response
-                for response in self.process_response_stream(
-                    messages=messages,
-                    assistant_message=assistant_message,
-                    stream_data=stream_data,
-                    response_format=response_format,
-                    tools=_tool_dicts,
-                    tool_choice=tool_choice or self._tool_choice,
-                    run_response=run_response,
+            while True:
+                # Compress existing tool results BEFORE invoke
+                if _compression_manager is not None and _compression_manager.should_compress(
+                    messages, tools, model=self, response_format=response_format
                 ):
-                    if self.cache_response and isinstance(response, ModelResponse):
-                        streaming_responses.append(response)
-                    yield response
+                    # Emit compression started event
+                    yield ModelResponse(event=ModelResponseEvent.compression_started.value)
+                    _compression_manager.compress(messages)
+                    # Emit compression completed event with stats
+                    yield ModelResponse(
+                        event=ModelResponseEvent.compression_completed.value,
+                        compression_stats=_compression_manager.stats.copy(),
+                    )
 
-            else:
-                self._process_model_response(
-                    messages=messages,
-                    assistant_message=assistant_message,
-                    model_response=model_response,
-                    response_format=response_format,
-                    tools=_tool_dicts,
-                    tool_choice=tool_choice or self._tool_choice,
-                )
-                if self.cache_response:
-                    streaming_responses.append(model_response)
-                yield model_response
-
-            # Add assistant message to messages
-            messages.append(assistant_message)
-            assistant_message.log(metrics=True)
-
-            # Handle tool calls if present
-            if assistant_message.tool_calls is not None:
-                # Prepare function calls
-                function_calls_to_run: List[FunctionCall] = self.get_function_calls_to_run(
-                    assistant_message=assistant_message, messages=messages, functions=_functions
-                )
-                function_call_results: List[Message] = []
-
-                # Execute function calls
-                for function_call_response in self.run_function_calls(
-                    function_calls=function_calls_to_run,
-                    function_call_results=function_call_results,
-                    current_function_call_count=function_call_count,
-                    function_call_limit=tool_call_limit,
-                ):
-                    if self.cache_response and isinstance(function_call_response, ModelResponse):
-                        streaming_responses.append(function_call_response)
-                    yield function_call_response
+                assistant_message = Message(role=self.assistant_message_role)
+                # Create assistant message and stream data
+                stream_data = MessageData()
+                model_response = ModelResponse()
 
-                # Add a function call for each successful execution
-                function_call_count += len(function_call_results)
+                # Emit LLM request started event
+                yield ModelResponse(event=ModelResponseEvent.model_request_started.value)
 
-                # Format and add results to messages
-                if stream_data and stream_data.extra is not None:
-                    self.format_function_call_results(
-                        messages=messages, function_call_results=function_call_results, **stream_data.extra
+                if stream_model_response:
+                    # Generate response
+                    for response in self.process_response_stream(
+                        messages=messages,
+                        assistant_message=assistant_message,
+                        stream_data=stream_data,
+                        response_format=response_format,
+                        tools=_tool_dicts,
+                        tool_choice=tool_choice or self._tool_choice,
+                        run_response=run_response,
+                        compress_tool_results=_compress_tool_results,
+                    ):
+                        if self.cache_response and isinstance(response, ModelResponse):
+                            streaming_responses.append(response)
+                        yield response
+
+                else:
+                    self._process_model_response(
+                        messages=messages,
+                        assistant_message=assistant_message,
+                        model_response=model_response,
+                        response_format=response_format,
+                        tools=_tool_dicts,
+                        tool_choice=tool_choice or self._tool_choice,
+                        run_response=run_response,
+                        compress_tool_results=_compress_tool_results,
                     )
-                elif model_response and model_response.extra is not None:
-                    self.format_function_call_results(
-                        messages=messages, function_call_results=function_call_results, **model_response.extra
+                    if self.cache_response:
+                        streaming_responses.append(model_response)
+                    yield model_response
+
+                # Add assistant message to messages
+                messages.append(assistant_message)
+                assistant_message.log(metrics=True)
+
+                # Emit LLM request completed event with metrics
+                llm_metrics = assistant_message.metrics
+                yield ModelResponse(
+                    event=ModelResponseEvent.model_request_completed.value,
+                    input_tokens=llm_metrics.input_tokens if llm_metrics else None,
+                    output_tokens=llm_metrics.output_tokens if llm_metrics else None,
+                    total_tokens=llm_metrics.total_tokens if llm_metrics else None,
+                    time_to_first_token=llm_metrics.time_to_first_token if llm_metrics else None,
+                    reasoning_tokens=llm_metrics.reasoning_tokens if llm_metrics else None,
+                    cache_read_tokens=llm_metrics.cache_read_tokens if llm_metrics else None,
+                    cache_write_tokens=llm_metrics.cache_write_tokens if llm_metrics else None,
+                )
+
+                # Handle tool calls if present
+                if assistant_message.tool_calls is not None:
+                    # Prepare function calls
+                    function_calls_to_run: List[FunctionCall] = self.get_function_calls_to_run(
+                        assistant_message=assistant_message, messages=messages, functions=_functions
                     )
-                else:
-                    self.format_function_call_results(messages=messages, function_call_results=function_call_results)
+                    function_call_results: List[Message] = []
 
-                # Handle function call media
-                if any(msg.images or msg.videos or msg.audio or msg.files for msg in function_call_results):
-                    self._handle_function_call_media(
-                        messages=messages,
+                    # Execute function calls
+                    for function_call_response in self.run_function_calls(
+                        function_calls=function_calls_to_run,
                         function_call_results=function_call_results,
-                        send_media_to_model=send_media_to_model,
-                    )
+                        current_function_call_count=function_call_count,
+                        function_call_limit=tool_call_limit,
+                    ):
+                        if self.cache_response and isinstance(function_call_response, ModelResponse):
+                            streaming_responses.append(function_call_response)
+                        yield function_call_response
+
+                    # Add a function call for each successful execution
+                    function_call_count += len(function_call_results)
+
+                    # Format and add results to messages
+                    if stream_data and stream_data.extra is not None:
+                        self.format_function_call_results(
+                            messages=messages,
+                            function_call_results=function_call_results,
+                            compress_tool_results=_compress_tool_results,
+                            **stream_data.extra,
+                        )
+                    elif model_response and model_response.extra is not None:
+                        self.format_function_call_results(
+                            messages=messages,
+                            function_call_results=function_call_results,
+                            compress_tool_results=_compress_tool_results,
+                            **model_response.extra,
+                        )
+                    else:
+                        self.format_function_call_results(
+                            messages=messages,
+                            function_call_results=function_call_results,
+                            compress_tool_results=_compress_tool_results,
+                        )
 
-                for function_call_result in function_call_results:
-                    function_call_result.log(metrics=True)
+                    # Handle function call media
+                    if any(msg.images or msg.videos or msg.audio or msg.files for msg in function_call_results):
+                        self._handle_function_call_media(
+                            messages=messages,
+                            function_call_results=function_call_results,
+                            send_media_to_model=send_media_to_model,
+                        )
 
-                # Check if we should stop after tool calls
-                if any(m.stop_after_tool_call for m in function_call_results):
-                    break
+                    for function_call_result in function_call_results:
+                        function_call_result.log(metrics=True, use_compressed_content=_compress_tool_results)
 
-                # If we have any tool calls that require confirmation, break the loop
-                if any(fc.function.requires_confirmation for fc in function_calls_to_run):
-                    break
+                    # Check if we should stop after tool calls
+                    if any(m.stop_after_tool_call for m in function_call_results):
+                        break
 
-                # If we have any tool calls that require external execution, break the loop
-                if any(fc.function.external_execution for fc in function_calls_to_run):
-                    break
+                    # If we have any tool calls that require confirmation, break the loop
+                    if any(fc.function.requires_confirmation for fc in function_calls_to_run):
+                        break
 
-                # If we have any tool calls that require user input, break the loop
-                if any(fc.function.requires_user_input for fc in function_calls_to_run):
-                    break
+                    # If we have any tool calls that require external execution, break the loop
+                    if any(fc.function.external_execution for fc in function_calls_to_run):
+                        break
 
-                # Continue loop to get next response
-                continue
+                    # If we have any tool calls that require user input, break the loop
+                    if any(fc.function.requires_user_input for fc in function_calls_to_run):
+                        break
 
-            # No tool calls or finished processing them
-            break
+                    # Continue loop to get next response
+                    continue
 
-        log_debug(f"{self.get_provider()} Response Stream End", center=True, symbol="-")
+                # No tool calls or finished processing them
+                break
 
-        # Save streaming responses to cache if enabled
-        if self.cache_response and cache_key and streaming_responses:
-            self._save_streaming_responses_to_cache(cache_key, streaming_responses)
+            log_debug(f"{self.get_provider()} Response Stream End", center=True, symbol="-")
+
+            # Save streaming responses to cache if enabled
+            if self.cache_response and cache_key and streaming_responses:
+                self._save_streaming_responses_to_cache(cache_key, streaming_responses)
+        finally:
+            # Close the Gemini client
+            if self.__class__.__name__ == "Gemini" and self.client is not None:
+                try:
+                    self.client.close()  # type: ignore
+                    self.client = None
+                except AttributeError:
+                    log_warning(
+                        "Your Gemini client is outdated. For Agno to properly handle the lifecycle of the client,"
+                        " please upgrade Gemini to the latest version: pip install -U google-genai"
+                    )
 
     async def aprocess_response_stream(
         self,
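After this change the streaming generator interleaves bookkeeping events (`compression_started`/`compression_completed` and `model_request_started`/`model_request_completed`, the latter carrying token metrics) with ordinary content deltas. A caller can branch on `ModelResponse.event`; a hedged sketch, assuming `ModelResponseEvent` is importable from `agno.models.response` and that the streaming entry point is the generator shown above:

```python
from agno.models.response import ModelResponseEvent  # assumed import path

def report(stream):
    # `stream` is the iterator produced by the streaming generator above.
    for response in stream:
        event = getattr(response, "event", None)
        if event == ModelResponseEvent.compression_completed.value:
            print("tool results compressed:", response.compression_stats)
        elif event == ModelResponseEvent.model_request_completed.value:
            print("model request done, total tokens:", response.total_tokens)
        elif getattr(response, "content", None):
            print(response.content, end="")
```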
@@ -1012,18 +1461,20 @@ class Model(ABC):
         tools: Optional[List[Dict[str, Any]]] = None,
         tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
         run_response: Optional[Union[RunOutput, TeamRunOutput]] = None,
+        compress_tool_results: bool = False,
     ) -> AsyncIterator[ModelResponse]:
         """
-        Process a streaming response from the model.
+        Process a streaming response from the model with retry logic for ModelProviderError.
         """
-        async for response_delta in self.ainvoke_stream(
+        async for response_delta in self._ainvoke_stream_with_retry(
             messages=messages,
             assistant_message=assistant_message,
             response_format=response_format,
             tools=tools,
             tool_choice=tool_choice or self._tool_choice,
             run_response=run_response,
-        ):  # type: ignore
+            compress_tool_results=compress_tool_results,
+        ):
             for model_response_delta in self._populate_stream_data(
                 stream_data=stream_data,
                 model_response_delta=response_delta,
@@ -1043,148 +1494,208 @@ class Model(ABC):
         stream_model_response: bool = True,
         run_response: Optional[Union[RunOutput, TeamRunOutput]] = None,
         send_media_to_model: bool = True,
+        compression_manager: Optional["CompressionManager"] = None,
     ) -> AsyncIterator[Union[ModelResponse, RunOutputEvent, TeamRunOutputEvent]]:
         """
         Generate an asynchronous streaming response from the model.
         """
+        try:
+            # Check cache if enabled - capture key BEFORE streaming to avoid mismatch
+            cache_key = None
+            if self.cache_response:
+                cache_key = self._get_model_cache_key(
+                    messages, stream=True, response_format=response_format, tools=tools
+                )
+                cached_data = self._get_cached_model_response(cache_key)
 
-        # Check cache if enabled - capture key BEFORE streaming to avoid mismatch
-        cache_key = None
-        if self.cache_response:
-            cache_key = self._get_model_cache_key(messages, stream=True, response_format=response_format, tools=tools)
-            cached_data = self._get_cached_model_response(cache_key)
+                if cached_data:
+                    log_info("Cache hit for async streaming model response")
+                    # Yield cached responses
+                    for response in self._streaming_responses_from_cache(cached_data["streaming_responses"]):
+                        yield response
+                    return
 
-        if cached_data:
-            log_info("Cache hit for async streaming model response")
-            # Yield cached responses
-            for response in self._streaming_responses_from_cache(cached_data["streaming_responses"]):
-                yield response
-            return
+            log_info("Cache miss for async streaming model response")
 
-        log_info("Cache miss for async streaming model response")
+            # Track streaming responses for caching
+            streaming_responses: List[ModelResponse] = []
 
-        # Track streaming responses for caching
-        streaming_responses: List[ModelResponse] = []
+            log_debug(f"{self.get_provider()} Async Response Stream Start", center=True, symbol="-")
+            log_debug(f"Model: {self.id}", center=True, symbol="-")
+            _log_messages(messages)
 
-        log_debug(f"{self.get_provider()} Async Response Stream Start", center=True, symbol="-")
-        log_debug(f"Model: {self.id}", center=True, symbol="-")
-        _log_messages(messages)
+            _tool_dicts = self._format_tools(tools) if tools is not None else []
+            _functions = {tool.name: tool for tool in tools if isinstance(tool, Function)} if tools is not None else {}
 
-        _tool_dicts = self._format_tools(tools) if tools is not None else []
-        _functions = {tool.name: tool for tool in tools if isinstance(tool, Function)} if tools is not None else {}
+            _compress_tool_results = compression_manager is not None and compression_manager.compress_tool_results
+            _compression_manager = compression_manager if _compress_tool_results else None
 
-        function_call_count = 0
+            function_call_count = 0
 
-        while True:
-            # Create assistant message and stream data
-            assistant_message = Message(role=self.assistant_message_role)
-            stream_data = MessageData()
-            model_response = ModelResponse()
-            if stream_model_response:
-                # Generate response
-                async for model_response in self.aprocess_response_stream(
-                    messages=messages,
-                    assistant_message=assistant_message,
-                    stream_data=stream_data,
-                    response_format=response_format,
-                    tools=_tool_dicts,
-                    tool_choice=tool_choice or self._tool_choice,
-                    run_response=run_response,
+            while True:
+                # Compress existing tool results BEFORE making API call to avoid context overflow
+                if _compression_manager is not None and await _compression_manager.ashould_compress(
+                    messages, tools, model=self, response_format=response_format
                 ):
-                    if self.cache_response and isinstance(model_response, ModelResponse):
+                    # Emit compression started event
+                    yield ModelResponse(event=ModelResponseEvent.compression_started.value)
+                    await _compression_manager.acompress(messages)
+                    # Emit compression completed event with stats
+                    yield ModelResponse(
+                        event=ModelResponseEvent.compression_completed.value,
+                        compression_stats=_compression_manager.stats.copy(),
+                    )
+
+                # Create assistant message and stream data
+                assistant_message = Message(role=self.assistant_message_role)
+                stream_data = MessageData()
+                model_response = ModelResponse()
+
+                # Emit LLM request started event
+                yield ModelResponse(event=ModelResponseEvent.model_request_started.value)
+
+                if stream_model_response:
+                    # Generate response
+                    async for model_response in self.aprocess_response_stream(
+                        messages=messages,
+                        assistant_message=assistant_message,
+                        stream_data=stream_data,
+                        response_format=response_format,
+                        tools=_tool_dicts,
+                        tool_choice=tool_choice or self._tool_choice,
+                        run_response=run_response,
+                        compress_tool_results=_compress_tool_results,
+                    ):
+                        if self.cache_response and isinstance(model_response, ModelResponse):
+                            streaming_responses.append(model_response)
+                        yield model_response
+
+                else:
+                    await self._aprocess_model_response(
+                        messages=messages,
+                        assistant_message=assistant_message,
+                        model_response=model_response,
+                        response_format=response_format,
+                        tools=_tool_dicts,
+                        tool_choice=tool_choice or self._tool_choice,
+                        run_response=run_response,
+                        compress_tool_results=_compress_tool_results,
+                    )
+                    if self.cache_response:
                         streaming_responses.append(model_response)
                     yield model_response
 
-            else:
-                await self._aprocess_model_response(
-                    messages=messages,
-                    assistant_message=assistant_message,
-                    model_response=model_response,
-                    response_format=response_format,
-                    tools=_tool_dicts,
-                    tool_choice=tool_choice or self._tool_choice,
-                    run_response=run_response,
-                )
-                if self.cache_response:
-                    streaming_responses.append(model_response)
-                yield model_response
-
-            # Add assistant message to messages
-            messages.append(assistant_message)
-            assistant_message.log(metrics=True)
-
-            # Handle tool calls if present
-            if assistant_message.tool_calls is not None:
-                # Prepare function calls
-                function_calls_to_run: List[FunctionCall] = self.get_function_calls_to_run(
-                    assistant_message=assistant_message, messages=messages, functions=_functions
-                )
-                function_call_results: List[Message] = []
-
-                # Execute function calls
-                async for function_call_response in self.arun_function_calls(
-                    function_calls=function_calls_to_run,
-                    function_call_results=function_call_results,
-                    current_function_call_count=function_call_count,
-                    function_call_limit=tool_call_limit,
-                ):
-                    if self.cache_response and isinstance(function_call_response, ModelResponse):
-                        streaming_responses.append(function_call_response)
-                    yield function_call_response
+                # Add assistant message to messages
+                messages.append(assistant_message)
+                assistant_message.log(metrics=True)
 
-                # Add a function call for each successful execution
-                function_call_count += len(function_call_results)
+                # Emit LLM request completed event with metrics
+                llm_metrics = assistant_message.metrics
+                yield ModelResponse(
+                    event=ModelResponseEvent.model_request_completed.value,
+                    input_tokens=llm_metrics.input_tokens if llm_metrics else None,
+                    output_tokens=llm_metrics.output_tokens if llm_metrics else None,
+                    total_tokens=llm_metrics.total_tokens if llm_metrics else None,
+                    time_to_first_token=llm_metrics.time_to_first_token if llm_metrics else None,
+                    reasoning_tokens=llm_metrics.reasoning_tokens if llm_metrics else None,
+                    cache_read_tokens=llm_metrics.cache_read_tokens if llm_metrics else None,
+                    cache_write_tokens=llm_metrics.cache_write_tokens if llm_metrics else None,
+                )
 
-                # Format and add results to messages
-                if stream_data and stream_data.extra is not None:
-                    self.format_function_call_results(
-                        messages=messages, function_call_results=function_call_results, **stream_data.extra
-                    )
-                elif model_response and model_response.extra is not None:
-                    self.format_function_call_results(
-                        messages=messages, function_call_results=function_call_results, **model_response.extra or {}
+                # Handle tool calls if present
+                if assistant_message.tool_calls is not None:
+                    # Prepare function calls
+                    function_calls_to_run: List[FunctionCall] = self.get_function_calls_to_run(
+                        assistant_message=assistant_message, messages=messages, functions=_functions
                     )
-                else:
-                    self.format_function_call_results(messages=messages, function_call_results=function_call_results)
+                    function_call_results: List[Message] = []
 
-                # Handle function call media
-                if any(msg.images or msg.videos or msg.audio or msg.files for msg in function_call_results):
-                    self._handle_function_call_media(
-                        messages=messages,
+                    # Execute function calls
+                    async for function_call_response in self.arun_function_calls(
+                        function_calls=function_calls_to_run,
                         function_call_results=function_call_results,
-                        send_media_to_model=send_media_to_model,
-                    )
+                        current_function_call_count=function_call_count,
+                        function_call_limit=tool_call_limit,
+                    ):
+                        if self.cache_response and isinstance(function_call_response, ModelResponse):
+                            streaming_responses.append(function_call_response)
+                        yield function_call_response
+
+                    # Add a function call for each successful execution
+                    function_call_count += len(function_call_results)
+
+                    # Format and add results to messages
+                    if stream_data and stream_data.extra is not None:
+                        self.format_function_call_results(
+                            messages=messages,
+                            function_call_results=function_call_results,
+                            compress_tool_results=_compress_tool_results,
+                            **stream_data.extra,
+                        )
+                    elif model_response and model_response.extra is not None:
+                        self.format_function_call_results(
+                            messages=messages,
+                            function_call_results=function_call_results,
+                            compress_tool_results=_compress_tool_results,
+                            **model_response.extra or {},
+                        )
+                    else:
+                        self.format_function_call_results(
+                            messages=messages,
+                            function_call_results=function_call_results,
+                            compress_tool_results=_compress_tool_results,
+                        )
+
+                    # Handle function call media
+                    if any(msg.images or msg.videos or msg.audio or msg.files for msg in function_call_results):
+                        self._handle_function_call_media(
+                            messages=messages,
+                            function_call_results=function_call_results,
+                            send_media_to_model=send_media_to_model,
+                        )
 
-                for function_call_result in function_call_results:
-                    function_call_result.log(metrics=True)
+                    for function_call_result in function_call_results:
+                        function_call_result.log(metrics=True, use_compressed_content=_compress_tool_results)
 
-                # Check if we should stop after tool calls
-                if any(m.stop_after_tool_call for m in function_call_results):
-                    break
+                    # Check if we should stop after tool calls
+                    if any(m.stop_after_tool_call for m in function_call_results):
+                        break
 
-                # If we have any tool calls that require confirmation, break the loop
-                if any(fc.function.requires_confirmation for fc in function_calls_to_run):
-                    break
+                    # If we have any tool calls that require confirmation, break the loop
+                    if any(fc.function.requires_confirmation for fc in function_calls_to_run):
+                        break
 
-                # If we have any tool calls that require external execution, break the loop
-                if any(fc.function.external_execution for fc in function_calls_to_run):
-                    break
+                    # If we have any tool calls that require external execution, break the loop
+                    if any(fc.function.external_execution for fc in function_calls_to_run):
+                        break
 
-                # If we have any tool calls that require user input, break the loop
-                if any(fc.function.requires_user_input for fc in function_calls_to_run):
-                    break
+                    # If we have any tool calls that require user input, break the loop
+                    if any(fc.function.requires_user_input for fc in function_calls_to_run):
+                        break
 
-                # Continue loop to get next response
-                continue
+                    # Continue loop to get next response
+                    continue
 
-            # No tool calls or finished processing them
-            break
+                # No tool calls or finished processing them
+                break
 
-        log_debug(f"{self.get_provider()} Async Response Stream End", center=True, symbol="-")
+                log_debug(f"{self.get_provider()} Async Response Stream End", center=True, symbol="-")
 
-        # Save streaming responses to cache if enabled
-        if self.cache_response and cache_key and streaming_responses:
-            self._save_streaming_responses_to_cache(cache_key, streaming_responses)
+            # Save streaming responses to cache if enabled
+            if self.cache_response and cache_key and streaming_responses:
+                self._save_streaming_responses_to_cache(cache_key, streaming_responses)
+
+        finally:
+            # Close the Gemini client
+            if self.__class__.__name__ == "Gemini" and self.client is not None:
+                try:
+                    await self.client.aio.aclose()  # type: ignore
+                    self.client = None
+                except AttributeError:
+                    log_warning(
+                        "Your Gemini client is outdated. For Agno to properly handle the lifecycle of the client,"
+                        " please upgrade Gemini to the latest version: pip install -U google-genai"
+                    )
 
     def _populate_assistant_message_from_stream_data(
         self, assistant_message: Message, stream_data: MessageData
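The async path mirrors the sync one but awaits `ashould_compress`/`acompress`. Only a few members of `CompressionManager` are exercised by these loops: `compress_tool_results`, `should_compress`/`ashould_compress`, `compress`/`acompress`, and `stats`. A skeletal stand-in that satisfies that contract (the character-count trigger and truncation strategy are illustrative assumptions, not agno's actual token-aware implementation in `agno/compression/manager.py`):

```python
class MiniCompressionManager:
    """Illustrative stand-in for agno.compression.manager.CompressionManager."""

    def __init__(self, compress_tool_results: bool = True, max_chars: int = 20_000):
        self.compress_tool_results = compress_tool_results
        self.max_chars = max_chars  # assumed trigger; the real manager is token-aware
        self.stats: dict = {"compressed_messages": 0}

    def should_compress(self, messages, tools, model=None, response_format=None) -> bool:
        return sum(len(str(m.content or "")) for m in messages) > self.max_chars

    def compress(self, messages) -> None:
        # Shrink oversized tool results in place; real logic would summarize.
        for m in messages:
            if m.role == "tool" and m.content and len(str(m.content)) > 2_000:
                m.content = str(m.content)[:2_000] + " ...[truncated]"
                self.stats["compressed_messages"] += 1

    async def ashould_compress(self, messages, tools, model=None, response_format=None) -> bool:
        return self.should_compress(messages, tools, model=model, response_format=response_format)

    async def acompress(self, messages) -> None:
        self.compress(messages)
```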
@@ -1433,11 +1944,15 @@ class Model(ABC):
 
             # Run function calls sequentially
             function_execution_result: FunctionExecutionResult = FunctionExecutionResult(status="failure")
+            stop_after_tool_call_from_exception = False
             try:
                 function_execution_result = function_call.execute()
             except AgentRunException as a_exc:
                 # Update additional messages from function call
                 _handle_agent_exception(a_exc, additional_input)
+                # If stop_execution is True, mark that we should stop after this tool call
+                if a_exc.stop_execution:
+                    stop_after_tool_call_from_exception = True
             # Set function call success to False if an exception occurred
             except Exception as e:
                 log_error(f"Error executing function {function_call.function.name}: {e}")
@@ -1452,44 +1967,59 @@ class Model(ABC):
             function_call_output: str = ""
 
             if isinstance(function_execution_result.result, (GeneratorType, collections.abc.Iterator)):
-                for item in function_execution_result.result:
-                    # This function yields agent/team/workflow run events
-                    if (
-                        isinstance(item, tuple(get_args(RunOutputEvent)))
-                        or isinstance(item, tuple(get_args(TeamRunOutputEvent)))
-                        or isinstance(item, tuple(get_args(WorkflowRunOutputEvent)))
-                    ):
-                        # We only capture content events for output accumulation
-                        if isinstance(item, RunContentEvent) or isinstance(item, TeamRunContentEvent):
-                            if item.content is not None and isinstance(item.content, BaseModel):
-                                function_call_output += item.content.model_dump_json()
-                            else:
-                                # Capture output
-                                function_call_output += item.content or ""
+                try:
+                    for item in function_execution_result.result:
+                        # This function yields agent/team/workflow run events
+                        if (
+                            isinstance(item, tuple(get_args(RunOutputEvent)))
+                            or isinstance(item, tuple(get_args(TeamRunOutputEvent)))
+                            or isinstance(item, tuple(get_args(WorkflowRunOutputEvent)))
+                        ):
+                            # We only capture content events for output accumulation
+                            if isinstance(item, RunContentEvent) or isinstance(item, TeamRunContentEvent):
+                                if item.content is not None and isinstance(item.content, BaseModel):
+                                    function_call_output += item.content.model_dump_json()
+                                else:
+                                    # Capture output
+                                    function_call_output += item.content or ""
+
+                            if function_call.function.show_result and item.content is not None:
+                                yield ModelResponse(content=item.content)
 
-                        if function_call.function.show_result and item.content is not None:
-                            yield ModelResponse(content=item.content)
+                            if isinstance(item, CustomEvent):
+                                function_call_output += str(item)
+                                item.tool_call_id = function_call.call_id
 
-                    if isinstance(item, CustomEvent):
-                        function_call_output += str(item)
+                            # For WorkflowCompletedEvent, extract content for final output
+                            from agno.run.workflow import WorkflowCompletedEvent
 
-                    # For WorkflowCompletedEvent, extract content for final output
-                    from agno.run.workflow import WorkflowCompletedEvent
+                            if isinstance(item, WorkflowCompletedEvent):
+                                if item.content is not None:
+                                    if isinstance(item.content, BaseModel):
+                                        function_call_output += item.content.model_dump_json()
+                                    else:
+                                        function_call_output += str(item.content)
 
-                    if isinstance(item, WorkflowCompletedEvent):
-                        if item.content is not None:
-                            if isinstance(item.content, BaseModel):
-                                function_call_output += item.content.model_dump_json()
-                            else:
-                                function_call_output += str(item.content)
+                            # Yield the event itself to bubble it up
+                            yield item
 
-                    # Yield the event itself to bubble it up
-                    yield item
+                        else:
+                            function_call_output += str(item)
+                            if function_call.function.show_result and item is not None:
+                                yield ModelResponse(content=str(item))
+                except Exception as e:
+                    log_error(f"Error while iterating function result generator for {function_call.function.name}: {e}")
+                    function_call.error = str(e)
+                    function_call_success = False
 
-                    else:
-                        function_call_output += str(item)
-                        if function_call.function.show_result and item is not None:
-                            yield ModelResponse(content=str(item))
+                # For generators, re-capture updated_session_state after consumption
+                # since session_state modifications were made during iteration
+                if function_execution_result.updated_session_state is None:
+                    if (
+                        function_call.function._run_context is not None
+                        and function_call.function._run_context.session_state is not None
+                    ):
+                        function_execution_result.updated_session_state = function_call.function._run_context.session_state
             else:
                 from agno.tools.function import ToolResult
 
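Two fixes land in this hunk: a generator tool that raises mid-iteration no longer crashes the run (the error is recorded on the `FunctionCall` instead), and `updated_session_state` is re-read after the generator is fully consumed, since a generator body only mutates `session_state` while it is being iterated. A self-contained toy showing why the re-capture is necessary:

```python
# A generator tool mutates state only as it is consumed, so state captured
# when the generator object is created is stale.
def counting_tool(session_state: dict):
    for i in range(3):
        session_state["steps"] = i + 1  # happens lazily, during iteration
        yield f"step {i + 1}\n"

state = {"steps": 0}
gen = counting_tool(state)
print(state["steps"])   # 0 -- nothing has run yet
output = "".join(gen)   # draining the generator applies the mutations
print(state["steps"])   # 3 -- state must be re-read *after* consumption
```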
@@ -1521,6 +2051,9 @@ class Model(ABC):
                     timer=function_call_timer,
                     function_execution_result=function_execution_result,
                 )
+                # Override stop_after_tool_call if set by exception
+                if stop_after_tool_call_from_exception:
+                    function_call_result.stop_after_tool_call = True
                 yield ModelResponse(
                     content=f"{function_call.get_call_str()} completed in {function_call_timer.elapsed:.4f}s. ",
                     tool_executions=[
@@ -1568,7 +2101,7 @@ class Model(ABC):
 
         paused_tool_executions = []
 
-        # The function cannot be executed without user confirmation
+        # The function requires user confirmation (HITL)
         if fc.function.requires_confirmation:
             paused_tool_executions.append(
                 ToolExecution(
@@ -1578,7 +2111,8 @@ class Model(ABC):
                     requires_confirmation=True,
                 )
             )
-        # If the function requires user input, we yield a message to the user
+
+        # The function requires user input (HITL)
         if fc.function.requires_user_input:
             user_input_schema = fc.function.user_input_schema
             if fc.arguments and user_input_schema:
@@ -1596,15 +2130,26 @@ class Model(ABC):
                         user_input_schema=user_input_schema,
                     )
                 )
-        # If the function is from the user control flow tools, we handle it here
+
+        # If the function is from the user control flow (HITL) tools, we handle it here
         if fc.function.name == "get_user_input" and fc.arguments and fc.arguments.get("user_input_fields"):
             user_input_schema = []
             for input_field in fc.arguments.get("user_input_fields", []):
                 field_type = input_field.get("field_type")
-                try:
-                    python_type = eval(field_type) if isinstance(field_type, str) else field_type
-                except (NameError, SyntaxError):
-                    python_type = str  # Default to str if type is invalid
+                if isinstance(field_type, str):
+                    type_mapping = {
+                        "str": str,
+                        "int": int,
+                        "float": float,
+                        "bool": bool,
+                        "list": list,
+                        "dict": dict,
+                    }
+                    python_type = type_mapping.get(field_type, str)
+                elif isinstance(field_type, type):
+                    python_type = field_type
+                else:
+                    python_type = str
                 user_input_schema.append(
                     UserInputField(
                         name=input_field.get("field_name"),
@@ -1622,7 +2167,8 @@ class Model(ABC):
                         user_input_schema=user_input_schema,
                     )
                 )
-        # If the function requires external execution, we yield a message to the user
+
+        # The function requires external execution (HITL)
         if fc.function.external_execution:
             paused_tool_executions.append(
                 ToolExecution(
@@ -1755,10 +2301,20 @@ class Model(ABC):
             user_input_schema = []
             for input_field in fc.arguments.get("user_input_fields", []):
                 field_type = input_field.get("field_type")
-                try:
-                    python_type = eval(field_type) if isinstance(field_type, str) else field_type
-                except (NameError, SyntaxError):
-                    python_type = str  # Default to str if type is invalid
+                if isinstance(field_type, str):
+                    type_mapping = {
+                        "str": str,
+                        "int": int,
+                        "float": float,
+                        "bool": bool,
+                        "list": list,
+                        "dict": dict,
+                    }
+                    python_type = type_mapping.get(field_type, str)
+                elif isinstance(field_type, type):
+                    python_type = field_type
+                else:
+                    python_type = str
                 user_input_schema.append(
                     UserInputField(
                         name=input_field.get("field_name"),
@@ -1875,6 +2431,7 @@ class Model(ABC):
 
                         if isinstance(item, CustomEvent):
                             function_call_output += str(item)
+                            item.tool_call_id = function_call.call_id
 
                         # For WorkflowCompletedEvent, extract content for final output
                         from agno.run.workflow import WorkflowCompletedEvent
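`CustomEvent`s yielded from a tool are now stamped with the originating `tool_call_id`, so downstream consumers can correlate each event with the tool call that produced it. A hedged sketch of such a consumer (the `run_events` iterable is hypothetical; only the `tool_call_id` attribute is confirmed by this hunk):

```python
# Sketch: group custom events by the tool call that emitted them.
events_by_tool_call: dict[str, list] = {}
for event in run_events:  # hypothetical iterable of yielded events
    tool_call_id = getattr(event, "tool_call_id", None)
    if tool_call_id is not None:
        events_by_tool_call.setdefault(tool_call_id, []).append(event)
```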
@@ -1952,18 +2509,26 @@ class Model(ABC):
                 if async_gen_index in async_generator_outputs:
                     _, async_function_call_output, error = async_generator_outputs[async_gen_index]
                     if error:
-                        log_error(f"Error in async generator: {error}")
-                        raise error
+                        # Handle async generator exceptions gracefully like sync generators
+                        log_error(
+                            f"Error while iterating async generator for {function_call.function.name}: {error}"
+                        )
+                        function_call.error = str(error)
+                        function_call_success = False
                         break
                 async_gen_index += 1
 
             updated_session_state = function_execution_result.updated_session_state
 
             # Handle AgentRunException
+            stop_after_tool_call_from_exception = False
             if isinstance(function_call_success, AgentRunException):
                 a_exc = function_call_success
                 # Update additional messages from function call
                 _handle_agent_exception(a_exc, additional_input)
+                # If stop_execution is True, mark that we should stop after this tool call
+                if a_exc.stop_execution:
+                    stop_after_tool_call_from_exception = True
                 # Set function call success to False if an exception occurred
                 function_call_success = False
@@ -1975,33 +2540,62 @@ class Model(ABC):
                 function_call_output = async_function_call_output
                 # Events from async generators were already yielded in real-time above
             elif isinstance(function_call.result, (GeneratorType, collections.abc.Iterator)):
-                for item in function_call.result:
-                    # This function yields agent/team/workflow run events
-                    if isinstance(
-                        item,
-                        tuple(get_args(RunOutputEvent))
-                        + tuple(get_args(TeamRunOutputEvent))
-                        + tuple(get_args(WorkflowRunOutputEvent)),
+                try:
+                    for item in function_call.result:
+                        # This function yields agent/team/workflow run events
+                        if isinstance(
+                            item,
+                            tuple(get_args(RunOutputEvent))
+                            + tuple(get_args(TeamRunOutputEvent))
+                            + tuple(get_args(WorkflowRunOutputEvent)),
+                        ):
+                            # We only capture content events
+                            if isinstance(item, RunContentEvent) or isinstance(item, TeamRunContentEvent):
+                                if item.content is not None and isinstance(item.content, BaseModel):
+                                    function_call_output += item.content.model_dump_json()
+                                else:
+                                    # Capture output
+                                    function_call_output += item.content or ""
+
+                                if function_call.function.show_result and item.content is not None:
+                                    yield ModelResponse(content=item.content)
+                                    continue
+
+                        elif isinstance(item, CustomEvent):
+                            function_call_output += str(item)
+                            item.tool_call_id = function_call.call_id
+
+                            # Yield the event itself to bubble it up
+                            yield item
+                        else:
+                            function_call_output += str(item)
+                            if function_call.function.show_result and item is not None:
+                                yield ModelResponse(content=str(item))
+                except Exception as e:
+                    log_error(f"Error while iterating function result generator for {function_call.function.name}: {e}")
+                    function_call.error = str(e)
+                    function_call_success = False
+
+            # For generators (sync or async), re-capture updated_session_state after consumption
+            # since session_state modifications were made during iteration
+            if async_function_call_output is not None or isinstance(
+                function_call.result,
+                (GeneratorType, collections.abc.Iterator, AsyncGeneratorType, collections.abc.AsyncIterator),
+            ):
+                if updated_session_state is None:
+                    if (
+                        function_call.function._run_context is not None
+                        and function_call.function._run_context.session_state is not None
                     ):
-                        # We only capture content events
-                        if isinstance(item, RunContentEvent) or isinstance(item, TeamRunContentEvent):
-                            if item.content is not None and isinstance(item.content, BaseModel):
-                                function_call_output += item.content.model_dump_json()
-                            else:
-                                # Capture output
-                                function_call_output += item.content or ""
-
-                            if function_call.function.show_result and item.content is not None:
-                                yield ModelResponse(content=item.content)
-                                continue
+                        updated_session_state = function_call.function._run_context.session_state
 
-                        # Yield the event itself to bubble it up
-                        yield item
-                    else:
-                        function_call_output += str(item)
-                        if function_call.function.show_result and item is not None:
-                            yield ModelResponse(content=str(item))
-            else:
+            if not (
+                async_function_call_output is not None
+                or isinstance(
+                    function_call.result,
+                    (GeneratorType, collections.abc.Iterator, AsyncGeneratorType, collections.abc.AsyncIterator),
+                )
+            ):
                 from agno.tools.function import ToolResult
 
                 if isinstance(function_execution_result.result, ToolResult):
@@ -2030,6 +2624,9 @@ class Model(ABC):
                     timer=function_call_timer,
                     function_execution_result=function_execution_result,
                 )
+                # Override stop_after_tool_call if set by exception
+                if stop_after_tool_call_from_exception:
+                    function_call_result.stop_after_tool_call = True
                 yield ModelResponse(
                     content=f"{function_call.get_call_str()} completed in {function_call_timer.elapsed:.4f}s. ",
                     tool_executions=[
@@ -2079,7 +2676,11 @@ class Model(ABC):
         return function_calls_to_run
 
     def format_function_call_results(
-        self, messages: List[Message], function_call_results: List[Message], **kwargs
+        self,
+        messages: List[Message],
+        function_call_results: List[Message],
+        compress_tool_results: bool = False,
+        **kwargs,
     ) -> None:
         """
         Format function call results.
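`format_function_call_results` now takes an explicit `compress_tool_results` keyword instead of relying only on `**kwargs`, so provider subclasses that override it should accept and forward the new flag. A minimal sketch of a compatible override, assuming `Message` imports from `agno.models.message` and using `OpenAIChat` only as an example base class:

```python
from typing import List

from agno.models.message import Message  # assumed import path
from agno.models.openai import OpenAIChat  # example base; any provider model works

class MyChat(OpenAIChat):
    def format_function_call_results(
        self,
        messages: List[Message],
        function_call_results: List[Message],
        compress_tool_results: bool = False,
        **kwargs,
    ) -> None:
        # Accept the new flag and defer to the base implementation.
        super().format_function_call_results(
            messages,
            function_call_results,
            compress_tool_results=compress_tool_results,
            **kwargs,
        )
```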