fast-agent-mcp 0.2.33__py3-none-any.whl → 0.2.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fast_agent_mcp-0.2.33.dist-info → fast_agent_mcp-0.2.35.dist-info}/METADATA +1 -1
- {fast_agent_mcp-0.2.33.dist-info → fast_agent_mcp-0.2.35.dist-info}/RECORD +28 -25
- mcp_agent/agents/base_agent.py +13 -0
- mcp_agent/config.py +8 -0
- mcp_agent/context.py +3 -2
- mcp_agent/core/agent_app.py +41 -1
- mcp_agent/core/enhanced_prompt.py +9 -0
- mcp_agent/core/fastagent.py +14 -2
- mcp_agent/core/interactive_prompt.py +59 -13
- mcp_agent/core/usage_display.py +193 -0
- mcp_agent/event_progress.py +22 -4
- mcp_agent/llm/augmented_llm.py +42 -9
- mcp_agent/llm/augmented_llm_passthrough.py +66 -4
- mcp_agent/llm/augmented_llm_playback.py +19 -0
- mcp_agent/llm/augmented_llm_slow.py +12 -1
- mcp_agent/llm/memory.py +120 -0
- mcp_agent/llm/model_database.py +236 -0
- mcp_agent/llm/model_factory.py +1 -0
- mcp_agent/llm/providers/augmented_llm_anthropic.py +211 -30
- mcp_agent/llm/providers/augmented_llm_google_native.py +18 -1
- mcp_agent/llm/providers/augmented_llm_openai.py +20 -7
- mcp_agent/llm/usage_tracking.py +402 -0
- mcp_agent/logging/events.py +24 -0
- mcp_agent/logging/rich_progress.py +9 -1
- mcp_agent/mcp/interfaces.py +6 -0
- {fast_agent_mcp-0.2.33.dist-info → fast_agent_mcp-0.2.35.dist-info}/WHEEL +0 -0
- {fast_agent_mcp-0.2.33.dist-info → fast_agent_mcp-0.2.35.dist-info}/entry_points.txt +0 -0
- {fast_agent_mcp-0.2.33.dist-info → fast_agent_mcp-0.2.35.dist-info}/licenses/LICENSE +0 -0

mcp_agent/llm/model_database.py
ADDED
@@ -0,0 +1,236 @@
+"""
+Model database for LLM parameters.
+
+This module provides a centralized lookup for model parameters including
+context windows, max output tokens, and supported tokenization types.
+"""
+
+from typing import Dict, List, Optional
+
+from pydantic import BaseModel
+
+
+class ModelParameters(BaseModel):
+    """Configuration parameters for a specific model"""
+
+    context_window: int
+    """Maximum context window size in tokens"""
+
+    max_output_tokens: int
+    """Maximum output tokens the model can generate"""
+
+    tokenizes: List[str]
+    """List of supported content types for tokenization"""
+
+
+class ModelDatabase:
+    """Centralized model configuration database"""
+
+    # Common parameter sets
+    OPENAI_MULTIMODAL = ["text/plain", "image/jpeg", "image/png", "image/webp", "application/pdf"]
+    OPENAI_VISION = ["text/plain", "image/jpeg", "image/png", "image/webp"]
+    ANTHROPIC_MULTIMODAL = [
+        "text/plain",
+        "image/jpeg",
+        "image/png",
+        "image/webp",
+        "application/pdf",
+    ]
+    GOOGLE_MULTIMODAL = [
+        "text/plain",
+        "image/jpeg",
+        "image/png",
+        "image/webp",
+        "application/pdf",
+        "audio/wav",
+        "audio/mp3",
+        "video/mp4",
+    ]
+    QWEN_MULTIMODAL = ["text/plain", "image/jpeg", "image/png", "image/webp"]
+    TEXT_ONLY = ["text/plain"]
+
+    # Common parameter configurations
+    OPENAI_STANDARD = ModelParameters(
+        context_window=128000, max_output_tokens=16384, tokenizes=OPENAI_MULTIMODAL
+    )
+
+    OPENAI_4_1_STANDARD = ModelParameters(
+        context_window=1047576, max_output_tokens=32768, tokenizes=OPENAI_MULTIMODAL
+    )
+
+    OPENAI_O_SERIES = ModelParameters(
+        context_window=200000, max_output_tokens=100000, tokenizes=OPENAI_VISION
+    )
+
+    ANTHROPIC_LEGACY = ModelParameters(
+        context_window=200000, max_output_tokens=4096, tokenizes=ANTHROPIC_MULTIMODAL
+    )
+
+    ANTHROPIC_35_SERIES = ModelParameters(
+        context_window=200000, max_output_tokens=8192, tokenizes=ANTHROPIC_MULTIMODAL
+    )
+
+    # TODO--- TO USE 64,000 NEED TO SUPPORT STREAMING
+    ANTHROPIC_37_SERIES = ModelParameters(
+        context_window=200000, max_output_tokens=16384, tokenizes=ANTHROPIC_MULTIMODAL
+    )
+
+    GEMINI_FLASH = ModelParameters(
+        context_window=1048576, max_output_tokens=8192, tokenizes=GOOGLE_MULTIMODAL
+    )
+
+    GEMINI_PRO = ModelParameters(
+        context_window=2097152, max_output_tokens=8192, tokenizes=GOOGLE_MULTIMODAL
+    )
+
+    QWEN_STANDARD = ModelParameters(
+        context_window=32000, max_output_tokens=8192, tokenizes=QWEN_MULTIMODAL
+    )
+
+    FAST_AGENT_STANDARD = ModelParameters(
+        context_window=1000000, max_output_tokens=100000, tokenizes=TEXT_ONLY
+    )
+
+    OPENAI_4_1_SERIES = ModelParameters(
+        context_window=1047576, max_output_tokens=32768, tokenizes=OPENAI_MULTIMODAL
+    )
+
+    OPENAI_4O_SERIES = ModelParameters(
+        context_window=128000, max_output_tokens=16384, tokenizes=OPENAI_VISION
+    )
+
+    OPENAI_O3_SERIES = ModelParameters(
+        context_window=200000, max_output_tokens=100000, tokenizes=OPENAI_MULTIMODAL
+    )
+
+    OPENAI_O3_MINI_SERIES = ModelParameters(
+        context_window=200000, max_output_tokens=100000, tokenizes=TEXT_ONLY
+    )
+
+    # TODO update to 32000
+    ANTHROPIC_OPUS_4_VERSIONED = ModelParameters(
+        context_window=200000, max_output_tokens=32000, tokenizes=ANTHROPIC_MULTIMODAL
+    )
+    # TODO update to 64000
+    ANTHROPIC_SONNET_4_VERSIONED = ModelParameters(
+        context_window=200000, max_output_tokens=64000, tokenizes=ANTHROPIC_MULTIMODAL
+    )
+
+    DEEPSEEK_CHAT_STANDARD = ModelParameters(
+        context_window=65536, max_output_tokens=8192, tokenizes=TEXT_ONLY
+    )
+
+    DEEPSEEK_REASONER = ModelParameters(
+        context_window=65536, max_output_tokens=32768, tokenizes=TEXT_ONLY
+    )
+
+    GEMINI_2_5_PRO = ModelParameters(
+        context_window=2097152, max_output_tokens=8192, tokenizes=GOOGLE_MULTIMODAL
+    )
+
+    # Model configuration database
+    MODELS: Dict[str, ModelParameters] = {
+        # internal models
+        "passthrough": FAST_AGENT_STANDARD,
+        "playback": FAST_AGENT_STANDARD,
+        "slow": FAST_AGENT_STANDARD,
+        # aliyun models
+        "qwen-turbo": QWEN_STANDARD,
+        "qwen-plus": QWEN_STANDARD,
+        "qwen-max": QWEN_STANDARD,
+        "qwen-long": ModelParameters(
+            context_window=10000000, max_output_tokens=8192, tokenizes=TEXT_ONLY
+        ),
+        # OpenAI Models (vanilla aliases and versioned)
+        "gpt-4.1": OPENAI_4_1_SERIES,
+        "gpt-4.1-mini": OPENAI_4_1_SERIES,
+        "gpt-4.1-nano": OPENAI_4_1_SERIES,
+        "gpt-4.1-2025-04-14": OPENAI_4_1_SERIES,
+        "gpt-4.1-mini-2025-04-14": OPENAI_4_1_SERIES,
+        "gpt-4.1-nano-2025-04-14": OPENAI_4_1_SERIES,
+        "gpt-4o": OPENAI_4O_SERIES,
+        "gpt-4o-2024-11-20": OPENAI_4O_SERIES,
+        "gpt-4o-mini-2024-07-18": OPENAI_4O_SERIES,
+        "o1": OPENAI_O_SERIES,
+        "o1-2024-12-17": OPENAI_O_SERIES,
+        "o3": OPENAI_O3_SERIES,
+        "o3-pro": ModelParameters(
+            context_window=200_000, max_output_tokens=100_000, tokenizes=TEXT_ONLY
+        ),
+        "o3-mini": OPENAI_O3_MINI_SERIES,
+        "o4-mini": OPENAI_O3_SERIES,
+        "o3-2025-04-16": OPENAI_O3_SERIES,
+        "o3-mini-2025-01-31": OPENAI_O3_MINI_SERIES,
+        "o4-mini-2025-04-16": OPENAI_O3_SERIES,
+        # Anthropic Models
+        "claude-3-haiku": ANTHROPIC_35_SERIES,
+        "claude-3-haiku-20240307": ANTHROPIC_LEGACY,
+        "claude-3-sonnet": ANTHROPIC_LEGACY,
+        "claude-3-opus": ANTHROPIC_LEGACY,
+        "claude-3-opus-20240229": ANTHROPIC_LEGACY,
+        "claude-3-opus-latest": ANTHROPIC_LEGACY,
+        "claude-3-5-haiku": ANTHROPIC_35_SERIES,
+        "claude-3-5-haiku-20241022": ANTHROPIC_35_SERIES,
+        "claude-3-5-haiku-latest": ANTHROPIC_35_SERIES,
+        "claude-3-sonnet-20240229": ANTHROPIC_LEGACY,
+        "claude-3-5-sonnet": ANTHROPIC_35_SERIES,
+        "claude-3-5-sonnet-20240620": ANTHROPIC_35_SERIES,
+        "claude-3-5-sonnet-20241022": ANTHROPIC_35_SERIES,
+        "claude-3-5-sonnet-latest": ANTHROPIC_35_SERIES,
+        "claude-3-7-sonnet": ANTHROPIC_37_SERIES,
+        "claude-3-7-sonnet-20250219": ANTHROPIC_37_SERIES,
+        "claude-3-7-sonnet-latest": ANTHROPIC_37_SERIES,
+        "claude-sonnet-4": ANTHROPIC_SONNET_4_VERSIONED,
+        "claude-sonnet-4-0": ANTHROPIC_SONNET_4_VERSIONED,
+        "claude-sonnet-4-20250514": ANTHROPIC_SONNET_4_VERSIONED,
+        "claude-opus-4": ANTHROPIC_OPUS_4_VERSIONED,
+        "claude-opus-4-0": ANTHROPIC_OPUS_4_VERSIONED,
+        "claude-opus-4-20250514": ANTHROPIC_OPUS_4_VERSIONED,
+        # DeepSeek Models
+        "deepseek-chat": DEEPSEEK_CHAT_STANDARD,
+        # Google Gemini Models (vanilla aliases and versioned)
+        "gemini-2.0-flash": GEMINI_FLASH,
+        "gemini-2.5-flash-preview": GEMINI_FLASH,
+        "gemini-2.5-pro-preview": GEMINI_2_5_PRO,
+        "gemini-2.5-flash-preview-05-20": GEMINI_FLASH,
+        "gemini-2.5-pro-preview-05-06": GEMINI_PRO,
+    }
+
+    @classmethod
+    def get_model_params(cls, model: str) -> Optional[ModelParameters]:
+        """Get model parameters for a given model name"""
+        return cls.MODELS.get(model)
+
+    @classmethod
+    def get_context_window(cls, model: str) -> Optional[int]:
+        """Get context window size for a model"""
+        params = cls.get_model_params(model)
+        return params.context_window if params else None
+
+    @classmethod
+    def get_max_output_tokens(cls, model: str) -> Optional[int]:
+        """Get maximum output tokens for a model"""
+        params = cls.get_model_params(model)
+        return params.max_output_tokens if params else None
+
+    @classmethod
+    def get_tokenizes(cls, model: str) -> Optional[List[str]]:
+        """Get supported tokenization types for a model"""
+        params = cls.get_model_params(model)
+        return params.tokenizes if params else None
+
+    @classmethod
+    def get_default_max_tokens(cls, model: str) -> int:
+        """Get default max_tokens for RequestParams based on model"""
+        if not model:
+            return 2048  # Fallback when no model specified
+
+        params = cls.get_model_params(model)
+        if params:
+            return params.max_output_tokens
+        return 2048  # Fallback for unknown models
+
+    @classmethod
+    def list_models(cls) -> List[str]:
+        """List all available model names"""
+        return list(cls.MODELS.keys())
mcp_agent/llm/model_factory.py
CHANGED
@@ -87,6 +87,7 @@ class ModelFactory:
         "o1-preview": Provider.OPENAI,
         "o3": Provider.OPENAI,
         "o3-mini": Provider.OPENAI,
+        "o4-mini": Provider.OPENAI,
         "claude-3-haiku-20240307": Provider.ANTHROPIC,
         "claude-3-5-haiku-20241022": Provider.ANTHROPIC,
         "claude-3-5-haiku-latest": Provider.ANTHROPIC,

mcp_agent/llm/providers/augmented_llm_anthropic.py
CHANGED
@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, List, Tuple, Type
 from mcp.types import EmbeddedResource, ImageContent, TextContent
 
 from mcp_agent.core.prompt import Prompt
+from mcp_agent.event_progress import ProgressAction
 from mcp_agent.llm.provider_types import Provider
 from mcp_agent.llm.providers.multipart_converter_anthropic import (
     AnthropicConverter,
@@ -10,6 +11,7 @@ from mcp_agent.llm.providers.multipart_converter_anthropic import (
 from mcp_agent.llm.providers.sampling_converter_anthropic import (
     AnthropicSamplingConverter,
 )
+from mcp_agent.llm.usage_tracking import TurnUsage
 from mcp_agent.mcp.interfaces import ModelT
 from mcp_agent.mcp.prompt_message_multipart import PromptMessageMultipart
 
@@ -17,7 +19,8 @@ if TYPE_CHECKING:
     from mcp import ListToolsResult
 
 
-from anthropic import
+from anthropic import AsyncAnthropic, AuthenticationError
+from anthropic.lib.streaming import AsyncMessageStream
 from anthropic.types import (
     Message,
     MessageParam,
@@ -75,19 +78,83 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
     def _initialize_default_params(self, kwargs: dict) -> RequestParams:
         """Initialize Anthropic-specific default parameters"""
-
-
-
-
-
-
-
-
+        # Get base defaults from parent (includes ModelDatabase lookup)
+        base_params = super()._initialize_default_params(kwargs)
+
+        # Override with Anthropic-specific settings
+        chosen_model = kwargs.get("model", DEFAULT_ANTHROPIC_MODEL)
+        base_params.model = chosen_model
+
+        return base_params
 
     def _base_url(self) -> str | None:
         assert self.context.config
         return self.context.config.anthropic.base_url if self.context.config.anthropic else None
 
+    def _get_cache_mode(self) -> str:
+        """Get the cache mode configuration."""
+        cache_mode = "auto"  # Default to auto
+        if self.context.config and self.context.config.anthropic:
+            cache_mode = self.context.config.anthropic.cache_mode
+        return cache_mode
+
+    async def _process_stream(self, stream: AsyncMessageStream, model: str) -> Message:
+        """Process the streaming response and display real-time token usage."""
+        # Track estimated output tokens by counting text chunks
+        estimated_tokens = 0
+
+        # Process the raw event stream to get token counts
+        async for event in stream:
+            # Count tokens in real-time from content_block_delta events
+            if (
+                event.type == "content_block_delta"
+                and hasattr(event, "delta")
+                and event.delta.type == "text_delta"
+            ):
+                # Rough estimate: 1 token per 4 characters (OpenAI's typical ratio)
+                text_length = len(event.delta.text)
+                estimated_tokens += max(1, text_length // 4)
+
+                # Update progress on every token for real-time display
+                token_str = str(estimated_tokens).rjust(5)
+                # print(f"DEBUG: Streaming tokens: {token_str}")
+                self._emit_streaming_progress(model, token_str)
+
+            # Also check for final message_delta events with actual usage info
+            elif (
+                event.type == "message_delta"
+                and hasattr(event, "usage")
+                and event.usage.output_tokens
+            ):
+                actual_tokens = event.usage.output_tokens
+                token_str = str(actual_tokens).rjust(5)
+                # print(f"DEBUG: Final actual tokens: {token_str}")
+                self._emit_streaming_progress(model, token_str)
+
+        # Get the final message with complete usage data
+        message = await stream.get_final_message()
+
+        # Log final usage information
+        if hasattr(message, "usage") and message.usage:
+            self.logger.info(
+                f"Streaming complete - Model: {model}, Input tokens: {message.usage.input_tokens}, Output tokens: {message.usage.output_tokens}"
+            )
+
+        return message
+
+    def _emit_streaming_progress(self, model: str, token_str: str) -> None:
+        """Emit a streaming progress event that goes directly to progress display."""
+        data = {
+            "progress_action": ProgressAction.STREAMING,
+            "model": model,
+            "agent_name": self.name,
+            "chat_turn": self.chat_turn(),
+            "details": token_str.strip(),  # Token count goes in details for STREAMING action
+        }
+        # print(f"DEBUG: Emitting streaming progress event with data: {data}")
+        # Use a special logger level or namespace to avoid polluting regular logs
+        self.logger.info("Streaming progress", data=data)
+
     async def _anthropic_completion(
         self,
         message_param,
@@ -104,7 +171,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
             base_url = base_url.rstrip("/v1")
 
         try:
-            anthropic =
+            anthropic = AsyncAnthropic(api_key=api_key, base_url=base_url)
             messages: List[MessageParam] = []
             params = self.get_request_params(request_params)
         except AuthenticationError as e:
@@ -117,7 +184,11 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
         # if use_history is True
         messages.extend(self.history.get(include_completion_history=params.use_history))
 
-        messages.append(message_param)
+        messages.append(message_param)  # message_param is the current user turn
+
+        # Get cache mode configuration
+        cache_mode = self._get_cache_mode()
+        self.logger.debug(f"Anthropic cache_mode: {cache_mode}")
 
         tool_list: ListToolsResult = await self.aggregator.list_tools()
         available_tools: List[ToolParam] = [
@@ -133,8 +204,11 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
         model = self.default_request_params.model
 
+        # Note: We'll cache tools+system together by putting cache_control only on system prompt
+
         for i in range(params.max_iterations):
             self._log_chat_progress(self.chat_turn(), model=model)
+
             # Create base arguments dictionary
             base_args = {
                 "model": model,
@@ -144,6 +218,60 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                 "tools": available_tools,
             }
 
+            # Apply cache_control to system prompt if cache_mode is not "off"
+            # This caches both tools and system prompt together in one cache block
+            if cache_mode != "off" and base_args["system"]:
+                if isinstance(base_args["system"], str):
+                    base_args["system"] = [
+                        {
+                            "type": "text",
+                            "text": base_args["system"],
+                            "cache_control": {"type": "ephemeral"},
+                        }
+                    ]
+                    self.logger.debug(
+                        "Applied cache_control to system prompt (caches tools+system in one block)"
+                    )
+                else:
+                    self.logger.debug(f"System prompt is not a string: {type(base_args['system'])}")
+
+            # Apply conversation caching using walking algorithm if in auto mode
+            if cache_mode == "auto" and self.history.should_apply_conversation_cache():
+                cache_updates = self.history.get_conversation_cache_updates()
+
+                # Remove cache control from old positions
+                if cache_updates["remove"]:
+                    self.history.remove_cache_control_from_messages(
+                        messages, cache_updates["remove"]
+                    )
+                    self.logger.debug(
+                        f"Removed conversation cache_control from positions {cache_updates['remove']}"
+                    )
+
+                # Add cache control to new positions
+                if cache_updates["add"]:
+                    applied_count = self.history.add_cache_control_to_messages(
+                        messages, cache_updates["add"]
+                    )
+                    if applied_count > 0:
+                        self.history.apply_conversation_cache_updates(cache_updates)
+                        self.logger.debug(
+                            f"Applied conversation cache_control to positions {cache_updates['add']} ({applied_count} blocks)"
+                        )
+
+                        # Verify we don't exceed Anthropic's 4 cache block limit
+                        total_cache_blocks = applied_count
+                        if cache_mode != "off" and base_args["system"]:
+                            total_cache_blocks += 1  # tools+system cache block
+                        if total_cache_blocks > 4:
+                            self.logger.warning(
+                                f"Total cache blocks ({total_cache_blocks}) exceeds Anthropic limit of 4"
+                            )
+                    else:
+                        self.logger.debug(
+                            f"Failed to apply conversation cache_control to positions {cache_updates['add']}"
+                        )
+
             if params.maxTokens is not None:
                 base_args["max_tokens"] = params.maxTokens
 
@@ -154,9 +282,25 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
             self.logger.debug(f"{arguments}")
 
-
-
-
+            # Use streaming API with helper
+            async with anthropic.messages.stream(**arguments) as stream:
+                # Process the stream
+                response = await self._process_stream(stream, model)
+
+            # Track usage if response is valid and has usage data
+            if (
+                hasattr(response, "usage")
+                and response.usage
+                and not isinstance(response, BaseException)
+            ):
+                try:
+                    turn_usage = TurnUsage.from_anthropic(
+                        response.usage, model or DEFAULT_ANTHROPIC_MODEL
+                    )
+                    self.usage_accumulator.add_turn(turn_usage)
+                    # self._show_usage(response.usage, turn_usage)
+                except Exception as e:
+                    self.logger.warning(f"Failed to track usage: {e}")
 
             if isinstance(response, AuthenticationError):
                 raise ProviderKeyError(
@@ -165,7 +309,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                 ) from response
             elif isinstance(response, BaseException):
                 error_details = str(response)
-                self.logger.error(f"Error: {error_details}", data=
+                self.logger.error(f"Error: {error_details}", data=BaseException)
 
                 # Try to extract more useful information for API errors
                 if hasattr(response, "status_code") and hasattr(response, "response"):
@@ -178,13 +322,13 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                 # Convert other errors to text response
                 error_message = f"Error during generation: {error_details}"
                 response = Message(
-                    id="error",
-                    model="error",
+                    id="error",
+                    model="error",
                     role="assistant",
                     type="message",
                     content=[TextBlock(type="text", text=error_message)],
-                    stop_reason="end_turn",
-                    usage=Usage(input_tokens=0, output_tokens=0),
+                    stop_reason="end_turn",
+                    usage=Usage(input_tokens=0, output_tokens=0),
                 )
 
             self.logger.debug(
@@ -194,7 +338,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
             response_as_message = self.convert_message_to_message_param(response)
             messages.append(response_as_message)
-            if response.content[0].type == "text":
+            if response.content and response.content[0].type == "text":
                 responses.append(TextContent(type="text", text=response.content[0].text))
 
             if response.stop_reason == "end_turn":
@@ -254,12 +398,13 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
             # Process all tool calls and collect results
             tool_results = []
-            for i
-
-
-
+            # Use a different loop variable for tool enumeration if 'i' is outer loop counter
+            for tool_idx, content_block in enumerate(tool_uses):
+                tool_name = content_block.name
+                tool_args = content_block.input
+                tool_use_id = content_block.id
 
-                if
+                if tool_idx == 0:  # Only show message for first tool use
                     await self.show_assistant_message(message_text, tool_name)
 
                 self.show_tool_call(available_tools, tool_name, tool_args)
@@ -284,11 +429,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
         if params.use_history:
             # Get current prompt messages
            prompt_messages = self.history.get(include_completion_history=False)
-
-            # Calculate new conversation messages (excluding prompts)
            new_messages = messages[len(prompt_messages) :]
-
-            # Update conversation history
            self.history.set(new_messages)
 
         self._log_chat_finished(model=model)
@@ -326,8 +467,26 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
             multipart_messages[:-1] if last_message.role == "user" else multipart_messages
         )
         converted = []
+
+        # Get cache mode configuration
+        cache_mode = self._get_cache_mode()
+
         for msg in messages_to_add:
-
+            anthropic_msg = AnthropicConverter.convert_to_anthropic(msg)
+
+            # Apply caching to template messages if cache_mode is "prompt" or "auto"
+            if is_template and cache_mode in ["prompt", "auto"] and anthropic_msg.get("content"):
+                content_list = anthropic_msg["content"]
+                if isinstance(content_list, list) and content_list:
+                    # Apply cache control to the last content block
+                    last_block = content_list[-1]
+                    if isinstance(last_block, dict):
+                        last_block["cache_control"] = {"type": "ephemeral"}
+                        self.logger.debug(
+                            f"Applied cache_control to template message with role {anthropic_msg.get('role')}"
+                        )
+
+            converted.append(anthropic_msg)
 
         self.history.extend(converted, is_prompt=is_template)
 
@@ -362,6 +521,28 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
         )
         return self._structured_from_multipart(result, model)
 
+    def _show_usage(self, raw_usage: Usage, turn_usage: TurnUsage) -> None:
+        # Print raw usage for debugging
+        print(f"\n=== USAGE DEBUG ({turn_usage.model}) ===")
+        print(f"Raw usage: {raw_usage}")
+        print(
+            f"Turn usage: input={turn_usage.input_tokens}, output={turn_usage.output_tokens}, current_context={turn_usage.current_context_tokens}"
+        )
+        print(
+            f"Cache: read={turn_usage.cache_usage.cache_read_tokens}, write={turn_usage.cache_usage.cache_write_tokens}"
+        )
+        print(f"Effective input: {turn_usage.effective_input_tokens}")
+        print(
+            f"Accumulator: total_turns={self.usage_accumulator.turn_count}, cumulative_billing={self.usage_accumulator.cumulative_billing_tokens}, current_context={self.usage_accumulator.current_context_tokens}"
+        )
+        if self.usage_accumulator.context_usage_percentage:
+            print(
+                f"Context usage: {self.usage_accumulator.context_usage_percentage:.1f}% of {self.usage_accumulator.context_window_size}"
+            )
+        if self.usage_accumulator.cache_hit_rate:
+            print(f"Cache hit rate: {self.usage_accumulator.cache_hit_rate:.1f}%")
+        print("===========================\n")
+
     @classmethod
     def convert_message_to_message_param(cls, message: Message, **kwargs) -> MessageParam:
         """Convert a response object to an input parameter object to allow LLM calls to be chained."""

mcp_agent/llm/providers/augmented_llm_google_native.py
CHANGED
@@ -24,6 +24,7 @@ from mcp_agent.llm.provider_types import Provider
 
 # Import the new converter class
 from mcp_agent.llm.providers.google_converter import GoogleConverter
+from mcp_agent.llm.usage_tracking import TurnUsage
 from mcp_agent.mcp.prompt_message_multipart import PromptMessageMultipart
 
 # Define default model and potentially other Google-specific defaults
@@ -220,6 +221,7 @@ class GoogleNativeAugmentedLLM(AugmentedLLM[types.Content, types.Content]):
             parallel_tool_calls=True,  # Assume parallel tool calls are supported by default with native API
             max_iterations=20,
             use_history=True,
+            maxTokens=65536,  # Default max tokens for Google models
             # Include other relevant default parameters
         )
 
@@ -281,10 +283,25 @@
             )
             self.logger.debug("Google generate_content response:", data=api_response)
 
+            # Track usage if response is valid and has usage data
+            if (
+                hasattr(api_response, "usage_metadata")
+                and api_response.usage_metadata
+                and not isinstance(api_response, BaseException)
+            ):
+                try:
+                    turn_usage = TurnUsage.from_google(
+                        api_response.usage_metadata, request_params.model
+                    )
+                    self.usage_accumulator.add_turn(turn_usage)
+
+                except Exception as e:
+                    self.logger.warning(f"Failed to track usage: {e}")
+
         except errors.APIError as e:
             # Handle specific Google API errors
             self.logger.error(f"Google API Error: {e.code} - {e.message}")
-            raise ProviderKeyError(f"Google API Error: {e.code}", e.message) from e
+            raise ProviderKeyError(f"Google API Error: {e.code}", e.message or "") from e
         except Exception as e:
             self.logger.error(f"Error during Google generate_content call: {e}")
             # Decide how to handle other exceptions - potentially re-raise or return an error message