fast-agent-mcp 0.2.33__py3-none-any.whl → 0.2.35__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
@@ -0,0 +1,236 @@
+"""
+Model database for LLM parameters.
+
+This module provides a centralized lookup for model parameters including
+context windows, max output tokens, and supported tokenization types.
+"""
+
+from typing import Dict, List, Optional
+
+from pydantic import BaseModel
+
+
+class ModelParameters(BaseModel):
+    """Configuration parameters for a specific model"""
+
+    context_window: int
+    """Maximum context window size in tokens"""
+
+    max_output_tokens: int
+    """Maximum output tokens the model can generate"""
+
+    tokenizes: List[str]
+    """List of supported content types for tokenization"""
+
+
+class ModelDatabase:
+    """Centralized model configuration database"""
+
+    # Common parameter sets
+    OPENAI_MULTIMODAL = ["text/plain", "image/jpeg", "image/png", "image/webp", "application/pdf"]
+    OPENAI_VISION = ["text/plain", "image/jpeg", "image/png", "image/webp"]
+    ANTHROPIC_MULTIMODAL = [
+        "text/plain",
+        "image/jpeg",
+        "image/png",
+        "image/webp",
+        "application/pdf",
+    ]
+    GOOGLE_MULTIMODAL = [
+        "text/plain",
+        "image/jpeg",
+        "image/png",
+        "image/webp",
+        "application/pdf",
+        "audio/wav",
+        "audio/mp3",
+        "video/mp4",
+    ]
+    QWEN_MULTIMODAL = ["text/plain", "image/jpeg", "image/png", "image/webp"]
+    TEXT_ONLY = ["text/plain"]
+
+    # Common parameter configurations
+    OPENAI_STANDARD = ModelParameters(
+        context_window=128000, max_output_tokens=16384, tokenizes=OPENAI_MULTIMODAL
+    )
+
+    OPENAI_4_1_STANDARD = ModelParameters(
+        context_window=1047576, max_output_tokens=32768, tokenizes=OPENAI_MULTIMODAL
+    )
+
+    OPENAI_O_SERIES = ModelParameters(
+        context_window=200000, max_output_tokens=100000, tokenizes=OPENAI_VISION
+    )
+
+    ANTHROPIC_LEGACY = ModelParameters(
+        context_window=200000, max_output_tokens=4096, tokenizes=ANTHROPIC_MULTIMODAL
+    )
+
+    ANTHROPIC_35_SERIES = ModelParameters(
+        context_window=200000, max_output_tokens=8192, tokenizes=ANTHROPIC_MULTIMODAL
+    )
+
+    # TODO: raise to 64,000 max output tokens once streaming is supported
+    ANTHROPIC_37_SERIES = ModelParameters(
+        context_window=200000, max_output_tokens=16384, tokenizes=ANTHROPIC_MULTIMODAL
+    )
+
+    GEMINI_FLASH = ModelParameters(
+        context_window=1048576, max_output_tokens=8192, tokenizes=GOOGLE_MULTIMODAL
+    )
+
+    GEMINI_PRO = ModelParameters(
+        context_window=2097152, max_output_tokens=8192, tokenizes=GOOGLE_MULTIMODAL
+    )
+
+    QWEN_STANDARD = ModelParameters(
+        context_window=32000, max_output_tokens=8192, tokenizes=QWEN_MULTIMODAL
+    )
+
+    FAST_AGENT_STANDARD = ModelParameters(
+        context_window=1000000, max_output_tokens=100000, tokenizes=TEXT_ONLY
+    )
+
+    OPENAI_4_1_SERIES = ModelParameters(
+        context_window=1047576, max_output_tokens=32768, tokenizes=OPENAI_MULTIMODAL
+    )
+
+    OPENAI_4O_SERIES = ModelParameters(
+        context_window=128000, max_output_tokens=16384, tokenizes=OPENAI_VISION
+    )
+
+    OPENAI_O3_SERIES = ModelParameters(
+        context_window=200000, max_output_tokens=100000, tokenizes=OPENAI_MULTIMODAL
+    )
+
+    OPENAI_O3_MINI_SERIES = ModelParameters(
+        context_window=200000, max_output_tokens=100000, tokenizes=TEXT_ONLY
+    )
+
+    ANTHROPIC_OPUS_4_VERSIONED = ModelParameters(
+        context_window=200000, max_output_tokens=32000, tokenizes=ANTHROPIC_MULTIMODAL
+    )
+    ANTHROPIC_SONNET_4_VERSIONED = ModelParameters(
+        context_window=200000, max_output_tokens=64000, tokenizes=ANTHROPIC_MULTIMODAL
+    )
+
+    DEEPSEEK_CHAT_STANDARD = ModelParameters(
+        context_window=65536, max_output_tokens=8192, tokenizes=TEXT_ONLY
+    )
+
+    DEEPSEEK_REASONER = ModelParameters(
+        context_window=65536, max_output_tokens=32768, tokenizes=TEXT_ONLY
+    )
+
+    GEMINI_2_5_PRO = ModelParameters(
+        context_window=2097152, max_output_tokens=8192, tokenizes=GOOGLE_MULTIMODAL
+    )
+
+    # Model configuration database
+    MODELS: Dict[str, ModelParameters] = {
+        # internal models
+        "passthrough": FAST_AGENT_STANDARD,
+        "playback": FAST_AGENT_STANDARD,
+        "slow": FAST_AGENT_STANDARD,
+        # aliyun models
+        "qwen-turbo": QWEN_STANDARD,
+        "qwen-plus": QWEN_STANDARD,
+        "qwen-max": QWEN_STANDARD,
+        "qwen-long": ModelParameters(
+            context_window=10000000, max_output_tokens=8192, tokenizes=TEXT_ONLY
+        ),
+        # OpenAI Models (vanilla aliases and versioned)
+        "gpt-4.1": OPENAI_4_1_SERIES,
+        "gpt-4.1-mini": OPENAI_4_1_SERIES,
+        "gpt-4.1-nano": OPENAI_4_1_SERIES,
+        "gpt-4.1-2025-04-14": OPENAI_4_1_SERIES,
+        "gpt-4.1-mini-2025-04-14": OPENAI_4_1_SERIES,
+        "gpt-4.1-nano-2025-04-14": OPENAI_4_1_SERIES,
+        "gpt-4o": OPENAI_4O_SERIES,
+        "gpt-4o-2024-11-20": OPENAI_4O_SERIES,
+        "gpt-4o-mini-2024-07-18": OPENAI_4O_SERIES,
+        "o1": OPENAI_O_SERIES,
+        "o1-2024-12-17": OPENAI_O_SERIES,
+        "o3": OPENAI_O3_SERIES,
+        "o3-pro": ModelParameters(
+            context_window=200_000, max_output_tokens=100_000, tokenizes=TEXT_ONLY
+        ),
+        "o3-mini": OPENAI_O3_MINI_SERIES,
+        "o4-mini": OPENAI_O3_SERIES,
+        "o3-2025-04-16": OPENAI_O3_SERIES,
+        "o3-mini-2025-01-31": OPENAI_O3_MINI_SERIES,
+        "o4-mini-2025-04-16": OPENAI_O3_SERIES,
+        # Anthropic Models
+        "claude-3-haiku": ANTHROPIC_35_SERIES,
+        "claude-3-haiku-20240307": ANTHROPIC_LEGACY,
+        "claude-3-sonnet": ANTHROPIC_LEGACY,
+        "claude-3-opus": ANTHROPIC_LEGACY,
+        "claude-3-opus-20240229": ANTHROPIC_LEGACY,
+        "claude-3-opus-latest": ANTHROPIC_LEGACY,
+        "claude-3-5-haiku": ANTHROPIC_35_SERIES,
+        "claude-3-5-haiku-20241022": ANTHROPIC_35_SERIES,
+        "claude-3-5-haiku-latest": ANTHROPIC_35_SERIES,
+        "claude-3-sonnet-20240229": ANTHROPIC_LEGACY,
+        "claude-3-5-sonnet": ANTHROPIC_35_SERIES,
+        "claude-3-5-sonnet-20240620": ANTHROPIC_35_SERIES,
+        "claude-3-5-sonnet-20241022": ANTHROPIC_35_SERIES,
+        "claude-3-5-sonnet-latest": ANTHROPIC_35_SERIES,
+        "claude-3-7-sonnet": ANTHROPIC_37_SERIES,
+        "claude-3-7-sonnet-20250219": ANTHROPIC_37_SERIES,
+        "claude-3-7-sonnet-latest": ANTHROPIC_37_SERIES,
+        "claude-sonnet-4": ANTHROPIC_SONNET_4_VERSIONED,
+        "claude-sonnet-4-0": ANTHROPIC_SONNET_4_VERSIONED,
+        "claude-sonnet-4-20250514": ANTHROPIC_SONNET_4_VERSIONED,
+        "claude-opus-4": ANTHROPIC_OPUS_4_VERSIONED,
+        "claude-opus-4-0": ANTHROPIC_OPUS_4_VERSIONED,
+        "claude-opus-4-20250514": ANTHROPIC_OPUS_4_VERSIONED,
+        # DeepSeek Models
+        "deepseek-chat": DEEPSEEK_CHAT_STANDARD,
+        # Google Gemini Models (vanilla aliases and versioned)
+        "gemini-2.0-flash": GEMINI_FLASH,
+        "gemini-2.5-flash-preview": GEMINI_FLASH,
+        "gemini-2.5-pro-preview": GEMINI_2_5_PRO,
+        "gemini-2.5-flash-preview-05-20": GEMINI_FLASH,
+        "gemini-2.5-pro-preview-05-06": GEMINI_PRO,
+    }
+
+    @classmethod
+    def get_model_params(cls, model: str) -> Optional[ModelParameters]:
+        """Get model parameters for a given model name"""
+        return cls.MODELS.get(model)
+
+    @classmethod
+    def get_context_window(cls, model: str) -> Optional[int]:
+        """Get context window size for a model"""
+        params = cls.get_model_params(model)
+        return params.context_window if params else None
+
+    @classmethod
+    def get_max_output_tokens(cls, model: str) -> Optional[int]:
+        """Get maximum output tokens for a model"""
+        params = cls.get_model_params(model)
+        return params.max_output_tokens if params else None
+
+    @classmethod
+    def get_tokenizes(cls, model: str) -> Optional[List[str]]:
+        """Get supported tokenization types for a model"""
+        params = cls.get_model_params(model)
+        return params.tokenizes if params else None
+
+    @classmethod
+    def get_default_max_tokens(cls, model: str) -> int:
+        """Get default max_tokens for RequestParams based on model"""
+        if not model:
+            return 2048  # Fallback when no model specified
+
+        params = cls.get_model_params(model)
+        if params:
+            return params.max_output_tokens
+        return 2048  # Fallback for unknown models
+
+    @classmethod
+    def list_models(cls) -> List[str]:
+        """List all available model names"""
+        return list(cls.MODELS.keys())
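The new module gives every provider implementation one place to look up context limits and output ceilings. A minimal usage sketch follows; the import path `mcp_agent.llm.model_database` is an assumption, since the diff does not show where the new file lives in the package.

```python
# Hypothetical import path; the diff does not name the new module's file.
from mcp_agent.llm.model_database import ModelDatabase

print(ModelDatabase.get_context_window("claude-sonnet-4-0"))       # 200000
print(ModelDatabase.get_max_output_tokens("gpt-4.1-mini"))         # 32768
print(ModelDatabase.get_default_max_tokens("some-unknown-model"))  # 2048 fallback
print("application/pdf" in (ModelDatabase.get_tokenizes("gemini-2.0-flash") or []))  # True
```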
@@ -87,6 +87,7 @@ class ModelFactory:
         "o1-preview": Provider.OPENAI,
         "o3": Provider.OPENAI,
         "o3-mini": Provider.OPENAI,
+        "o4-mini": Provider.OPENAI,
         "claude-3-haiku-20240307": Provider.ANTHROPIC,
         "claude-3-5-haiku-20241022": Provider.ANTHROPIC,
         "claude-3-5-haiku-latest": Provider.ANTHROPIC,
@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, List, Tuple, Type
 from mcp.types import EmbeddedResource, ImageContent, TextContent
 
 from mcp_agent.core.prompt import Prompt
+from mcp_agent.event_progress import ProgressAction
 from mcp_agent.llm.provider_types import Provider
 from mcp_agent.llm.providers.multipart_converter_anthropic import (
     AnthropicConverter,
@@ -10,6 +11,7 @@ from mcp_agent.llm.providers.multipart_converter_anthropic import (
 from mcp_agent.llm.providers.sampling_converter_anthropic import (
     AnthropicSamplingConverter,
 )
+from mcp_agent.llm.usage_tracking import TurnUsage
 from mcp_agent.mcp.interfaces import ModelT
 from mcp_agent.mcp.prompt_message_multipart import PromptMessageMultipart
 
@@ -17,7 +19,8 @@ if TYPE_CHECKING:
     from mcp import ListToolsResult
 
 
-from anthropic import Anthropic, AuthenticationError
+from anthropic import AsyncAnthropic, AuthenticationError
+from anthropic.lib.streaming import AsyncMessageStream
 from anthropic.types import (
     Message,
     MessageParam,
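The import changes swap the synchronous `Anthropic` client for `AsyncAnthropic` and bring in the streaming helper types. Outside fast-agent, the underlying SDK pattern the new code relies on looks roughly like this sketch (the model name and prompt are illustrative, and an `ANTHROPIC_API_KEY` environment variable is assumed):

```python
import asyncio

from anthropic import AsyncAnthropic


async def main() -> None:
    client = AsyncAnthropic()  # reads ANTHROPIC_API_KEY from the environment
    async with client.messages.stream(
        model="claude-3-5-haiku-latest",
        max_tokens=256,
        messages=[{"role": "user", "content": "Say hello"}],
    ) as stream:
        async for event in stream:
            # Text arrives incrementally as content_block_delta events
            if event.type == "content_block_delta" and event.delta.type == "text_delta":
                print(event.delta.text, end="", flush=True)
        final = await stream.get_final_message()
    print(f"\noutput tokens: {final.usage.output_tokens}")


asyncio.run(main())
```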
@@ -75,19 +78,83 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
     def _initialize_default_params(self, kwargs: dict) -> RequestParams:
         """Initialize Anthropic-specific default parameters"""
-        return RequestParams(
-            model=kwargs.get("model", DEFAULT_ANTHROPIC_MODEL),
-            maxTokens=4096,  # default haiku3
-            systemPrompt=self.instruction,
-            parallel_tool_calls=True,
-            max_iterations=20,
-            use_history=True,
-        )
+        # Get base defaults from parent (includes ModelDatabase lookup)
+        base_params = super()._initialize_default_params(kwargs)
+
+        # Override with Anthropic-specific settings
+        chosen_model = kwargs.get("model", DEFAULT_ANTHROPIC_MODEL)
+        base_params.model = chosen_model
+
+        return base_params
 
     def _base_url(self) -> str | None:
         assert self.context.config
         return self.context.config.anthropic.base_url if self.context.config.anthropic else None
 
+    def _get_cache_mode(self) -> str:
+        """Get the cache mode configuration."""
+        cache_mode = "auto"  # Default to auto
+        if self.context.config and self.context.config.anthropic:
+            cache_mode = self.context.config.anthropic.cache_mode
+        return cache_mode
+
+    async def _process_stream(self, stream: AsyncMessageStream, model: str) -> Message:
+        """Process the streaming response and display real-time token usage."""
+        # Track estimated output tokens by counting text chunks
+        estimated_tokens = 0
+
+        # Process the raw event stream to get token counts
+        async for event in stream:
+            # Count tokens in real time from content_block_delta events
+            if (
+                event.type == "content_block_delta"
+                and hasattr(event, "delta")
+                and event.delta.type == "text_delta"
+            ):
+                # Rough estimate: 1 token per 4 characters (a common rough ratio)
+                text_length = len(event.delta.text)
+                estimated_tokens += max(1, text_length // 4)
+
+                # Update progress on every chunk for real-time display
+                token_str = str(estimated_tokens).rjust(5)
+                # print(f"DEBUG: Streaming tokens: {token_str}")
+                self._emit_streaming_progress(model, token_str)
+
+            # Also check for final message_delta events with actual usage info
+            elif (
+                event.type == "message_delta"
+                and hasattr(event, "usage")
+                and event.usage.output_tokens
+            ):
+                actual_tokens = event.usage.output_tokens
+                token_str = str(actual_tokens).rjust(5)
+                # print(f"DEBUG: Final actual tokens: {token_str}")
+                self._emit_streaming_progress(model, token_str)
+
+        # Get the final message with complete usage data
+        message = await stream.get_final_message()
+
+        # Log final usage information
+        if hasattr(message, "usage") and message.usage:
+            self.logger.info(
+                f"Streaming complete - Model: {model}, Input tokens: {message.usage.input_tokens}, Output tokens: {message.usage.output_tokens}"
+            )
+
+        return message
+
+    def _emit_streaming_progress(self, model: str, token_str: str) -> None:
+        """Emit a streaming progress event that goes directly to the progress display."""
+        data = {
+            "progress_action": ProgressAction.STREAMING,
+            "model": model,
+            "agent_name": self.name,
+            "chat_turn": self.chat_turn(),
+            "details": token_str.strip(),  # Token count goes in details for STREAMING action
+        }
+        # print(f"DEBUG: Emitting streaming progress event with data: {data}")
+        # Use a special logger namespace to avoid polluting regular logs
+        self.logger.info("Streaming progress", data=data)
+
     async def _anthropic_completion(
         self,
         message_param,
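The streaming progress counter above is an estimate until the API reports real usage: roughly one token per four characters of streamed text, later superseded by `output_tokens` from a `message_delta` event. A small sketch of that arithmetic (the chunk contents are illustrative):

```python
chunks = ["Hello ", "world, ", "this is a streamed ", "reply."]
estimated = 0
for chunk in chunks:
    estimated += max(1, len(chunk) // 4)  # ~1 token per 4 characters
    print(f"estimated so far: {str(estimated).rjust(5)}")
# A final message_delta event carrying usage.output_tokens then replaces the estimate.
```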
@@ -104,7 +171,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
             base_url = base_url.rstrip("/v1")
 
         try:
-            anthropic = Anthropic(api_key=api_key, base_url=base_url)
+            anthropic = AsyncAnthropic(api_key=api_key, base_url=base_url)
             messages: List[MessageParam] = []
             params = self.get_request_params(request_params)
         except AuthenticationError as e:
@@ -117,7 +184,11 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
         # if use_history is True
         messages.extend(self.history.get(include_completion_history=params.use_history))
 
-        messages.append(message_param)
+        messages.append(message_param)  # message_param is the current user turn
+
+        # Get cache mode configuration
+        cache_mode = self._get_cache_mode()
+        self.logger.debug(f"Anthropic cache_mode: {cache_mode}")
 
         tool_list: ListToolsResult = await self.aggregator.list_tools()
         available_tools: List[ToolParam] = [
@@ -133,8 +204,11 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
         model = self.default_request_params.model
 
+        # Note: We'll cache tools+system together by putting cache_control only on system prompt
+
         for i in range(params.max_iterations):
             self._log_chat_progress(self.chat_turn(), model=model)
+
             # Create base arguments dictionary
             base_args = {
                 "model": model,
@@ -144,6 +218,60 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                 "tools": available_tools,
             }
 
+            # Apply cache_control to system prompt if cache_mode is not "off"
+            # This caches both tools and system prompt together in one cache block
+            if cache_mode != "off" and base_args["system"]:
+                if isinstance(base_args["system"], str):
+                    base_args["system"] = [
+                        {
+                            "type": "text",
+                            "text": base_args["system"],
+                            "cache_control": {"type": "ephemeral"},
+                        }
+                    ]
+                    self.logger.debug(
+                        "Applied cache_control to system prompt (caches tools+system in one block)"
+                    )
+                else:
+                    self.logger.debug(f"System prompt is not a string: {type(base_args['system'])}")
+
+            # Apply conversation caching using walking algorithm if in auto mode
+            if cache_mode == "auto" and self.history.should_apply_conversation_cache():
+                cache_updates = self.history.get_conversation_cache_updates()
+
+                # Remove cache control from old positions
+                if cache_updates["remove"]:
+                    self.history.remove_cache_control_from_messages(
+                        messages, cache_updates["remove"]
+                    )
+                    self.logger.debug(
+                        f"Removed conversation cache_control from positions {cache_updates['remove']}"
+                    )
+
+                # Add cache control to new positions
+                if cache_updates["add"]:
+                    applied_count = self.history.add_cache_control_to_messages(
+                        messages, cache_updates["add"]
+                    )
+                    if applied_count > 0:
+                        self.history.apply_conversation_cache_updates(cache_updates)
+                        self.logger.debug(
+                            f"Applied conversation cache_control to positions {cache_updates['add']} ({applied_count} blocks)"
+                        )
+
+                        # Verify we don't exceed Anthropic's 4 cache block limit
+                        total_cache_blocks = applied_count
+                        if cache_mode != "off" and base_args["system"]:
+                            total_cache_blocks += 1  # tools+system cache block
+                        if total_cache_blocks > 4:
+                            self.logger.warning(
+                                f"Total cache blocks ({total_cache_blocks}) exceeds Anthropic limit of 4"
+                            )
+                    else:
+                        self.logger.debug(
+                            f"Failed to apply conversation cache_control to positions {cache_updates['add']}"
+                        )
+
             if params.maxTokens is not None:
                 base_args["max_tokens"] = params.maxTokens
 
@@ -154,9 +282,25 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
             self.logger.debug(f"{arguments}")
 
-            executor_result = await self.executor.execute(anthropic.messages.create, **arguments)
-
-            response = executor_result[0]
+            # Use streaming API with helper
+            async with anthropic.messages.stream(**arguments) as stream:
+                # Process the stream
+                response = await self._process_stream(stream, model)
+
+            # Track usage if response is valid and has usage data
+            if (
+                hasattr(response, "usage")
+                and response.usage
+                and not isinstance(response, BaseException)
+            ):
+                try:
+                    turn_usage = TurnUsage.from_anthropic(
+                        response.usage, model or DEFAULT_ANTHROPIC_MODEL
+                    )
+                    self.usage_accumulator.add_turn(turn_usage)
+                    # self._show_usage(response.usage, turn_usage)
+                except Exception as e:
+                    self.logger.warning(f"Failed to track usage: {e}")
 
             if isinstance(response, AuthenticationError):
                 raise ProviderKeyError(
@@ -165,7 +309,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                 ) from response
             elif isinstance(response, BaseException):
                 error_details = str(response)
-                self.logger.error(f"Error: {error_details}", data=executor_result)
+                self.logger.error(f"Error: {error_details}", data=BaseException)
 
                 # Try to extract more useful information for API errors
                 if hasattr(response, "status_code") and hasattr(response, "response"):
@@ -178,13 +322,13 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                 # Convert other errors to text response
                 error_message = f"Error during generation: {error_details}"
                 response = Message(
-                    id="error",  # Required field
-                    model="error",  # Required field
+                    id="error",
+                    model="error",
                     role="assistant",
                     type="message",
                     content=[TextBlock(type="text", text=error_message)],
-                    stop_reason="end_turn",  # Must be one of the allowed values
-                    usage=Usage(input_tokens=0, output_tokens=0),  # Required field
+                    stop_reason="end_turn",
+                    usage=Usage(input_tokens=0, output_tokens=0),
                 )
 
             self.logger.debug(
@@ -194,7 +338,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
             response_as_message = self.convert_message_to_message_param(response)
             messages.append(response_as_message)
-            if response.content[0].type == "text":
+            if response.content and response.content[0].type == "text":
                 responses.append(TextContent(type="text", text=response.content[0].text))
 
             if response.stop_reason == "end_turn":
@@ -254,12 +398,13 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
                 # Process all tool calls and collect results
                 tool_results = []
-                for i, content in enumerate(tool_uses):
-                    tool_name = content.name
-                    tool_args = content.input
-                    tool_use_id = content.id
+                # Use a different loop variable for tool enumeration, since 'i' is the outer loop counter
+                for tool_idx, content_block in enumerate(tool_uses):
+                    tool_name = content_block.name
+                    tool_args = content_block.input
+                    tool_use_id = content_block.id
 
-                    if i == 0:  # Only show message for first tool use
+                    if tool_idx == 0:  # Only show message for first tool use
                         await self.show_assistant_message(message_text, tool_name)
 
                     self.show_tool_call(available_tools, tool_name, tool_args)
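The renamed loop variable matters because the inner `enumerate` previously rebound `i`, the outer iteration counter. A tiny sketch of the hazard, with hypothetical values:

```python
for i in range(3):  # outer iteration counter
    tool_uses = ["tool_a", "tool_b"]
    for i, tool in enumerate(tool_uses):  # old code: rebinds the outer `i`
        pass
    print(i)  # 1 here: the outer counter has been clobbered by the inner loop
```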
@@ -284,11 +429,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
         if params.use_history:
             # Get current prompt messages
             prompt_messages = self.history.get(include_completion_history=False)
-
-            # Calculate new conversation messages (excluding prompts)
             new_messages = messages[len(prompt_messages) :]
-
-            # Update conversation history
             self.history.set(new_messages)
 
         self._log_chat_finished(model=model)
@@ -326,8 +467,26 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
             multipart_messages[:-1] if last_message.role == "user" else multipart_messages
         )
         converted = []
+
+        # Get cache mode configuration
+        cache_mode = self._get_cache_mode()
+
         for msg in messages_to_add:
-            converted.append(AnthropicConverter.convert_to_anthropic(msg))
+            anthropic_msg = AnthropicConverter.convert_to_anthropic(msg)
+
+            # Apply caching to template messages if cache_mode is "prompt" or "auto"
+            if is_template and cache_mode in ["prompt", "auto"] and anthropic_msg.get("content"):
+                content_list = anthropic_msg["content"]
+                if isinstance(content_list, list) and content_list:
+                    # Apply cache control to the last content block
+                    last_block = content_list[-1]
+                    if isinstance(last_block, dict):
+                        last_block["cache_control"] = {"type": "ephemeral"}
+                        self.logger.debug(
+                            f"Applied cache_control to template message with role {anthropic_msg.get('role')}"
+                        )
+
+            converted.append(anthropic_msg)
 
         self.history.extend(converted, is_prompt=is_template)
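Applying the marker to the last content block of a converted template message looks roughly like the sketch below. Because a cache breakpoint covers the whole prefix up to and including the marked block, a multi-part template spends only one of the four available cache blocks; the message content here is illustrative.

```python
anthropic_msg = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Reference document, part 1"},
        {"type": "text", "text": "Reference document, part 2"},
    ],
}
# Mark only the final block; the cached prefix still includes the earlier blocks.
anthropic_msg["content"][-1]["cache_control"] = {"type": "ephemeral"}
```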
@@ -362,6 +521,28 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
             )
         return self._structured_from_multipart(result, model)
 
+    def _show_usage(self, raw_usage: Usage, turn_usage: TurnUsage) -> None:
+        # Print raw usage for debugging
+        print(f"\n=== USAGE DEBUG ({turn_usage.model}) ===")
+        print(f"Raw usage: {raw_usage}")
+        print(
+            f"Turn usage: input={turn_usage.input_tokens}, output={turn_usage.output_tokens}, current_context={turn_usage.current_context_tokens}"
+        )
+        print(
+            f"Cache: read={turn_usage.cache_usage.cache_read_tokens}, write={turn_usage.cache_usage.cache_write_tokens}"
+        )
+        print(f"Effective input: {turn_usage.effective_input_tokens}")
+        print(
+            f"Accumulator: total_turns={self.usage_accumulator.turn_count}, cumulative_billing={self.usage_accumulator.cumulative_billing_tokens}, current_context={self.usage_accumulator.current_context_tokens}"
+        )
+        if self.usage_accumulator.context_usage_percentage:
+            print(
+                f"Context usage: {self.usage_accumulator.context_usage_percentage:.1f}% of {self.usage_accumulator.context_window_size}"
+            )
+        if self.usage_accumulator.cache_hit_rate:
+            print(f"Cache hit rate: {self.usage_accumulator.cache_hit_rate:.1f}%")
+        print("===========================\n")
+
     @classmethod
     def convert_message_to_message_param(cls, message: Message, **kwargs) -> MessageParam:
         """Convert a response object to an input parameter object to allow LLM calls to be chained."""
@@ -24,6 +24,7 @@ from mcp_agent.llm.provider_types import Provider
 
 # Import the new converter class
 from mcp_agent.llm.providers.google_converter import GoogleConverter
+from mcp_agent.llm.usage_tracking import TurnUsage
 from mcp_agent.mcp.prompt_message_multipart import PromptMessageMultipart
 
 # Define default model and potentially other Google-specific defaults
@@ -220,6 +221,7 @@ class GoogleNativeAugmentedLLM(AugmentedLLM[types.Content, types.Content]):
             parallel_tool_calls=True,  # Assume parallel tool calls are supported by default with native API
             max_iterations=20,
             use_history=True,
+            maxTokens=65536,  # Default max tokens for Google models
             # Include other relevant default parameters
         )
 
@@ -281,10 +283,25 @@ class GoogleNativeAugmentedLLM(AugmentedLLM[types.Content, types.Content]):
             )
             self.logger.debug("Google generate_content response:", data=api_response)
 
+            # Track usage if response is valid and has usage data
+            if (
+                hasattr(api_response, "usage_metadata")
+                and api_response.usage_metadata
+                and not isinstance(api_response, BaseException)
+            ):
+                try:
+                    turn_usage = TurnUsage.from_google(
+                        api_response.usage_metadata, request_params.model
+                    )
+                    self.usage_accumulator.add_turn(turn_usage)
+
+                except Exception as e:
+                    self.logger.warning(f"Failed to track usage: {e}")
+
         except errors.APIError as e:
             # Handle specific Google API errors
             self.logger.error(f"Google API Error: {e.code} - {e.message}")
-            raise ProviderKeyError(f"Google API Error: {e.code}", e.message) from e
+            raise ProviderKeyError(f"Google API Error: {e.code}", e.message or "") from e
         except Exception as e:
             self.logger.error(f"Error during Google generate_content call: {e}")
             # Decide how to handle other exceptions - potentially re-raise or return an error message