fast-agent-mcp 0.2.34__py3-none-any.whl → 0.2.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -97,6 +97,7 @@ class AugmentedLLM(ContextDependent, AugmentedLLMProtocol, Generic[MessageParamT
     PARAM_USE_HISTORY = "use_history"
     PARAM_MAX_ITERATIONS = "max_iterations"
     PARAM_TEMPLATE_VARS = "template_vars"
+
     # Base set of fields that should always be excluded
     BASE_EXCLUDE_FIELDS = {PARAM_METADATA}
 
@@ -371,16 +372,28 @@ class AugmentedLLM(ContextDependent, AugmentedLLMProtocol, Generic[MessageParamT
         # Start with base arguments
         arguments = base_args.copy()
 
-        # Use provided exclude_fields or fall back to base exclusions
-        exclude_fields = exclude_fields or self.BASE_EXCLUDE_FIELDS.copy()
+        # Combine base exclusions with provider-specific exclusions
+        final_exclude_fields = self.BASE_EXCLUDE_FIELDS.copy()
+        if exclude_fields:
+            final_exclude_fields.update(exclude_fields)
 
         # Add all fields from params that aren't explicitly excluded
-        params_dict = request_params.model_dump(exclude=exclude_fields)
+        # Ensure model_dump only includes set fields if that's the desired behavior,
+        # or adjust exclude_unset=True/False as needed.
+        # Default Pydantic v2 model_dump is exclude_unset=False
+        params_dict = request_params.model_dump(exclude=final_exclude_fields)
+
         for key, value in params_dict.items():
+            # Only add if not None and not already in base_args (base_args take precedence)
+            # or if None is a valid value for the provider, this logic might need adjustment.
             if value is not None and key not in arguments:
                 arguments[key] = value
+            elif value is not None and key in arguments and arguments[key] is None:
+                # Allow overriding a None in base_args with a set value from params
+                arguments[key] = value
 
         # Finally, add any metadata fields as a last layer of overrides
+        # This ensures metadata can override anything previously set if keys conflict.
         if request_params.metadata:
             arguments.update(request_params.metadata)
 
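For context, the layering above resolves provider arguments in three passes: explicit `base_args` win, non-None request parameters fill the gaps (and may replace a None placeholder), and `metadata` is applied last as an override. Below is a minimal standalone sketch of that precedence, using a hypothetical `DemoRequestParams` model and `merge_arguments` helper rather than the package's real classes:

```python
from typing import Any, Dict, Optional, Set

from pydantic import BaseModel


class DemoRequestParams(BaseModel):
    """Stand-in for the real RequestParams model (illustration only)."""
    model: Optional[str] = None
    maxTokens: Optional[int] = None
    temperature: Optional[float] = None
    metadata: Optional[Dict[str, Any]] = None


def merge_arguments(
    base_args: Dict[str, Any],
    request_params: DemoRequestParams,
    exclude_fields: Optional[Set[str]] = None,
) -> Dict[str, Any]:
    arguments = base_args.copy()

    # Base exclusions plus any provider-specific ones
    final_exclude = {"metadata"}
    if exclude_fields:
        final_exclude.update(exclude_fields)

    params_dict = request_params.model_dump(exclude=final_exclude)
    for key, value in params_dict.items():
        if value is not None and key not in arguments:
            arguments[key] = value
        elif value is not None and key in arguments and arguments[key] is None:
            # A concrete value from params may replace a None placeholder
            arguments[key] = value

    # Metadata is the last layer and wins on key conflicts
    if request_params.metadata:
        arguments.update(request_params.metadata)
    return arguments


print(merge_arguments(
    {"model": "claude-sonnet-4-0", "temperature": None},
    DemoRequestParams(temperature=0.2, maxTokens=1024, metadata={"top_p": 0.9}),
))
# -> {'model': 'claude-sonnet-4-0', 'temperature': 0.2, 'maxTokens': 1024, 'top_p': 0.9}
```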
@@ -541,6 +554,37 @@ class AugmentedLLM(ContextDependent, AugmentedLLMProtocol, Generic[MessageParamT
         }
         self.logger.debug("Chat in progress", data=data)
 
+    def _update_streaming_progress(self, content: str, model: str, estimated_tokens: int) -> int:
+        """Update streaming progress with token estimation and formatting.
+
+        Args:
+            content: The text content from the streaming event
+            model: The model name
+            estimated_tokens: Current token count to update
+
+        Returns:
+            Updated estimated token count
+        """
+        # Rough estimate: 1 token per 4 characters (OpenAI's typical ratio)
+        text_length = len(content)
+        additional_tokens = max(1, text_length // 4)
+        new_total = estimated_tokens + additional_tokens
+
+        # Format token count for display
+        token_str = str(new_total).rjust(5)
+
+        # Emit progress event
+        data = {
+            "progress_action": ProgressAction.STREAMING,
+            "model": model,
+            "agent_name": self.name,
+            "chat_turn": self.chat_turn(),
+            "details": token_str.strip(),  # Token count goes in details for STREAMING action
+        }
+        self.logger.info("Streaming progress", data=data)
+
+        return new_total
+
     def _log_chat_finished(self, model: Optional[str] = None) -> None:
         """Log a chat finished event"""
         data = {
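The helper above estimates roughly one token per four characters of streamed text, clamped to at least one token per chunk; the provider's final usage report then supplies the exact count. A small sketch of the same arithmetic (the chunk strings are invented):

```python
def estimate_tokens(chunks: list[str]) -> int:
    """Accumulate the same rough estimate used above: max(1, len(chunk) // 4)."""
    total = 0
    for chunk in chunks:
        total += max(1, len(chunk) // 4)
    return total


print(estimate_tokens(["Hello, ", "world! ", "Streaming keeps the progress bar honest."]))
# 1 + 1 + 10 = 12 estimated tokens; the final usage data replaces this estimate
```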
mcp_agent/llm/memory.py CHANGED
@@ -35,6 +35,9 @@ class SimpleMemory(Memory, Generic[MessageParamT]):
     def __init__(self) -> None:
         self.history: List[MessageParamT] = []
         self.prompt_messages: List[MessageParamT] = []  # Always included
+        self.conversation_cache_positions: List[int] = []  # Track active conversation cache positions
+        self.cache_walk_distance: int = 6  # Messages between cache blocks
+        self.max_conversation_cache_blocks: int = 2  # Maximum conversation cache blocks
 
     def extend(self, messages: List[MessageParamT], is_prompt: bool = False) -> None:
         """
@@ -99,5 +102,122 @@ class SimpleMemory(Memory, Generic[MessageParamT]):
             clear_prompts: If True, also clear prompt messages
         """
         self.history = []
+        self.conversation_cache_positions = []  # Reset cache positions
         if clear_prompts:
             self.prompt_messages = []
+
+    def should_apply_conversation_cache(self) -> bool:
+        """
+        Determine if conversation caching should be applied based on walking algorithm.
+
+        Returns:
+            True if we should add or update cache blocks
+        """
+        total_messages = len(self.history)
+
+        # Need at least cache_walk_distance messages to start caching
+        if total_messages < self.cache_walk_distance:
+            return False
+
+        # Check if we need to add a new cache block
+        return len(self._calculate_cache_positions(total_messages)) != len(self.conversation_cache_positions)
+
+    def _calculate_cache_positions(self, total_conversation_messages: int) -> List[int]:
+        """
+        Calculate where cache blocks should be placed using walking algorithm.
+
+        Args:
+            total_conversation_messages: Number of conversation messages (not including prompts)
+
+        Returns:
+            List of positions (relative to conversation start) where cache should be placed
+        """
+        positions = []
+
+        # Place cache blocks every cache_walk_distance messages
+        for i in range(self.cache_walk_distance - 1, total_conversation_messages, self.cache_walk_distance):
+            positions.append(i)
+            if len(positions) >= self.max_conversation_cache_blocks:
+                break
+
+        # Keep only the most recent cache blocks (walking behavior)
+        if len(positions) > self.max_conversation_cache_blocks:
+            positions = positions[-self.max_conversation_cache_blocks:]
+
+        return positions
+
+    def get_conversation_cache_updates(self) -> dict:
+        """
+        Get cache position updates needed for the walking algorithm.
+
+        Returns:
+            Dict with 'add', 'remove', and 'active' position lists (relative to full message array)
+        """
+        total_conversation_messages = len(self.history)
+        new_positions = self._calculate_cache_positions(total_conversation_messages)
+
+        # Convert to absolute positions (including prompt messages)
+        prompt_offset = len(self.prompt_messages)
+        new_absolute_positions = [pos + prompt_offset for pos in new_positions]
+
+        old_positions_set = set(self.conversation_cache_positions)
+        new_positions_set = set(new_absolute_positions)
+
+        return {
+            'add': sorted(new_positions_set - old_positions_set),
+            'remove': sorted(old_positions_set - new_positions_set),
+            'active': sorted(new_absolute_positions)
+        }
+
+    def apply_conversation_cache_updates(self, updates: dict) -> None:
+        """
+        Apply cache position updates.
+
+        Args:
+            updates: Dict from get_conversation_cache_updates()
+        """
+        self.conversation_cache_positions = updates['active'].copy()
+
+    def remove_cache_control_from_messages(self, messages: List[MessageParamT], positions: List[int]) -> None:
+        """
+        Remove cache control from specified message positions.
+
+        Args:
+            messages: The message array to modify
+            positions: List of positions to remove cache control from
+        """
+        for pos in positions:
+            if pos < len(messages):
+                message = messages[pos]
+                if isinstance(message, dict) and "content" in message:
+                    content_list = message["content"]
+                    if isinstance(content_list, list):
+                        for content_block in content_list:
+                            if isinstance(content_block, dict) and "cache_control" in content_block:
+                                del content_block["cache_control"]
+
+    def add_cache_control_to_messages(self, messages: List[MessageParamT], positions: List[int]) -> int:
+        """
+        Add cache control to specified message positions.
+
+        Args:
+            messages: The message array to modify
+            positions: List of positions to add cache control to
+
+        Returns:
+            Number of cache blocks successfully applied
+        """
+        applied_count = 0
+        for pos in positions:
+            if pos < len(messages):
+                message = messages[pos]
+                if isinstance(message, dict) and "content" in message:
+                    content_list = message["content"]
+                    if isinstance(content_list, list) and content_list:
+                        # Apply cache control to the last content block
+                        for content_block in reversed(content_list):
+                            if isinstance(content_block, dict):
+                                content_block["cache_control"] = {"type": "ephemeral"}
+                                applied_count += 1
+                                break
+        return applied_count
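To see how the walking algorithm behaves with the defaults above (`cache_walk_distance=6`, `max_conversation_cache_blocks=2`), here is a standalone sketch of `_calculate_cache_positions` with the instance attributes turned into parameters; the resulting positions are later offset by the number of prompt messages. Note that the early `break` keeps the earliest qualifying positions once the cap is reached:

```python
def calculate_cache_positions(total_messages: int, walk_distance: int = 6, max_blocks: int = 2) -> list[int]:
    """Behavioral mirror of _calculate_cache_positions above, with defaults as parameters."""
    positions = []
    for i in range(walk_distance - 1, total_messages, walk_distance):
        positions.append(i)
        if len(positions) >= max_blocks:
            break
    return positions[-max_blocks:]


for n in (5, 6, 12, 20):
    print(n, calculate_cache_positions(n))
# 5 []        -> fewer than walk_distance messages, no caching yet
# 6 [5]       -> first conversation cache block
# 12 [5, 11]  -> second block added
# 20 [5, 11]  -> capped at max_blocks by the break
```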
@@ -109,11 +109,11 @@ class ModelDatabase:
 
     # TODO update to 32000
     ANTHROPIC_OPUS_4_VERSIONED = ModelParameters(
-        context_window=200000, max_output_tokens=16384, tokenizes=ANTHROPIC_MULTIMODAL
+        context_window=200000, max_output_tokens=32000, tokenizes=ANTHROPIC_MULTIMODAL
     )
     # TODO update to 64000
     ANTHROPIC_SONNET_4_VERSIONED = ModelParameters(
-        context_window=200000, max_output_tokens=16384, tokenizes=ANTHROPIC_MULTIMODAL
+        context_window=200000, max_output_tokens=64000, tokenizes=ANTHROPIC_MULTIMODAL
     )
 
     DEEPSEEK_CHAT_STANDARD = ModelParameters(
@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, List, Tuple, Type
 from mcp.types import EmbeddedResource, ImageContent, TextContent
 
 from mcp_agent.core.prompt import Prompt
+from mcp_agent.event_progress import ProgressAction
 from mcp_agent.llm.provider_types import Provider
 from mcp_agent.llm.providers.multipart_converter_anthropic import (
     AnthropicConverter,
@@ -18,7 +19,8 @@ if TYPE_CHECKING:
     from mcp import ListToolsResult
 
 
-from anthropic import Anthropic, AuthenticationError
+from anthropic import AsyncAnthropic, AuthenticationError
+from anthropic.lib.streaming import AsyncMessageStream
 from anthropic.types import (
     Message,
     MessageParam,
@@ -78,17 +80,69 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
         """Initialize Anthropic-specific default parameters"""
         # Get base defaults from parent (includes ModelDatabase lookup)
         base_params = super()._initialize_default_params(kwargs)
-
+
         # Override with Anthropic-specific settings
         chosen_model = kwargs.get("model", DEFAULT_ANTHROPIC_MODEL)
        base_params.model = chosen_model
-
+
         return base_params
 
     def _base_url(self) -> str | None:
         assert self.context.config
         return self.context.config.anthropic.base_url if self.context.config.anthropic else None
 
+    def _get_cache_mode(self) -> str:
+        """Get the cache mode configuration."""
+        cache_mode = "auto"  # Default to auto
+        if self.context.config and self.context.config.anthropic:
+            cache_mode = self.context.config.anthropic.cache_mode
+        return cache_mode
+
+    async def _process_stream(self, stream: AsyncMessageStream, model: str) -> Message:
+        """Process the streaming response and display real-time token usage."""
+        # Track estimated output tokens by counting text chunks
+        estimated_tokens = 0
+
+        # Process the raw event stream to get token counts
+        async for event in stream:
+            # Count tokens in real-time from content_block_delta events
+            if (
+                event.type == "content_block_delta"
+                and hasattr(event, "delta")
+                and event.delta.type == "text_delta"
+            ):
+                # Use base class method for token estimation and progress emission
+                estimated_tokens = self._update_streaming_progress(event.delta.text, model, estimated_tokens)
+
+            # Also check for final message_delta events with actual usage info
+            elif (
+                event.type == "message_delta"
+                and hasattr(event, "usage")
+                and event.usage.output_tokens
+            ):
+                actual_tokens = event.usage.output_tokens
+                # Emit final progress with actual token count
+                token_str = str(actual_tokens).rjust(5)
+                data = {
+                    "progress_action": ProgressAction.STREAMING,
+                    "model": model,
+                    "agent_name": self.name,
+                    "chat_turn": self.chat_turn(),
+                    "details": token_str.strip(),
+                }
+                self.logger.info("Streaming progress", data=data)
+
+        # Get the final message with complete usage data
+        message = await stream.get_final_message()
+
+        # Log final usage information
+        if hasattr(message, "usage") and message.usage:
+            self.logger.info(
+                f"Streaming complete - Model: {model}, Input tokens: {message.usage.input_tokens}, Output tokens: {message.usage.output_tokens}"
+            )
+
+        return message
+
     async def _anthropic_completion(
         self,
         message_param,
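For readers unfamiliar with the SDK pattern `_process_stream` consumes, here is a minimal standalone sketch of streaming with `AsyncAnthropic` (assumes `ANTHROPIC_API_KEY` is set in the environment; the model name and prompt are illustrative, not taken from this package):

```python
import asyncio

from anthropic import AsyncAnthropic


async def main() -> None:
    client = AsyncAnthropic()
    async with client.messages.stream(
        model="claude-sonnet-4-0",  # illustrative model id
        max_tokens=256,
        messages=[{"role": "user", "content": "Say hello."}],
    ) as stream:
        async for event in stream:
            if event.type == "content_block_delta" and event.delta.type == "text_delta":
                print(event.delta.text, end="", flush=True)
        # Complete Message object, including usage, once the stream is done
        message = await stream.get_final_message()
    print(f"\noutput tokens: {message.usage.output_tokens}")


asyncio.run(main())
```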
@@ -105,7 +159,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
             base_url = base_url.rstrip("/v1")
 
         try:
-            anthropic = Anthropic(api_key=api_key, base_url=base_url)
+            anthropic = AsyncAnthropic(api_key=api_key, base_url=base_url)
             messages: List[MessageParam] = []
             params = self.get_request_params(request_params)
         except AuthenticationError as e:
@@ -118,7 +172,11 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
         # if use_history is True
         messages.extend(self.history.get(include_completion_history=params.use_history))
 
-        messages.append(message_param)
+        messages.append(message_param)  # message_param is the current user turn
+
+        # Get cache mode configuration
+        cache_mode = self._get_cache_mode()
+        self.logger.debug(f"Anthropic cache_mode: {cache_mode}")
 
         tool_list: ListToolsResult = await self.aggregator.list_tools()
         available_tools: List[ToolParam] = [
@@ -134,8 +192,11 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
         model = self.default_request_params.model
 
+        # Note: We'll cache tools+system together by putting cache_control only on system prompt
+
         for i in range(params.max_iterations):
             self._log_chat_progress(self.chat_turn(), model=model)
+
             # Create base arguments dictionary
             base_args = {
                 "model": model,
@@ -145,6 +206,60 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                 "tools": available_tools,
             }
 
+            # Apply cache_control to system prompt if cache_mode is not "off"
+            # This caches both tools and system prompt together in one cache block
+            if cache_mode != "off" and base_args["system"]:
+                if isinstance(base_args["system"], str):
+                    base_args["system"] = [
+                        {
+                            "type": "text",
+                            "text": base_args["system"],
+                            "cache_control": {"type": "ephemeral"},
+                        }
+                    ]
+                    self.logger.debug(
+                        "Applied cache_control to system prompt (caches tools+system in one block)"
+                    )
+                else:
+                    self.logger.debug(f"System prompt is not a string: {type(base_args['system'])}")
+
+            # Apply conversation caching using walking algorithm if in auto mode
+            if cache_mode == "auto" and self.history.should_apply_conversation_cache():
+                cache_updates = self.history.get_conversation_cache_updates()
+
+                # Remove cache control from old positions
+                if cache_updates["remove"]:
+                    self.history.remove_cache_control_from_messages(
+                        messages, cache_updates["remove"]
+                    )
+                    self.logger.debug(
+                        f"Removed conversation cache_control from positions {cache_updates['remove']}"
+                    )
+
+                # Add cache control to new positions
+                if cache_updates["add"]:
+                    applied_count = self.history.add_cache_control_to_messages(
+                        messages, cache_updates["add"]
+                    )
+                    if applied_count > 0:
+                        self.history.apply_conversation_cache_updates(cache_updates)
+                        self.logger.debug(
+                            f"Applied conversation cache_control to positions {cache_updates['add']} ({applied_count} blocks)"
+                        )
+
+                        # Verify we don't exceed Anthropic's 4 cache block limit
+                        total_cache_blocks = applied_count
+                        if cache_mode != "off" and base_args["system"]:
+                            total_cache_blocks += 1  # tools+system cache block
+                        if total_cache_blocks > 4:
+                            self.logger.warning(
+                                f"Total cache blocks ({total_cache_blocks}) exceeds Anthropic limit of 4"
+                            )
+                    else:
+                        self.logger.debug(
+                            f"Failed to apply conversation cache_control to positions {cache_updates['add']}"
+                        )
+
             if params.maxTokens is not None:
                 base_args["max_tokens"] = params.maxTokens
 
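To make the cache budget concrete: under the defaults shown earlier, the worst case is one tools+system block plus two conversation blocks, which stays under Anthropic's four-block limit. A hedged sketch of what `base_args["system"]` looks like after the branch above runs (the prompt text is illustrative):

```python
system_with_cache = [
    {
        "type": "text",
        "text": "You are a careful coding agent.",  # illustrative system prompt
        "cache_control": {"type": "ephemeral"},  # caches tools + system in one block
    }
]

# Worst case with the defaults above:
# 1 (tools+system) + 2 (max_conversation_cache_blocks) = 3 cache blocks <= 4
```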
@@ -155,9 +270,10 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
             self.logger.debug(f"{arguments}")
 
-            executor_result = await self.executor.execute(anthropic.messages.create, **arguments)
-
-            response = executor_result[0]
+            # Use streaming API with helper
+            async with anthropic.messages.stream(**arguments) as stream:
+                # Process the stream
+                response = await self._process_stream(stream, model)
 
             # Track usage if response is valid and has usage data
             if (
@@ -170,27 +286,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                         response.usage, model or DEFAULT_ANTHROPIC_MODEL
                     )
                     self.usage_accumulator.add_turn(turn_usage)
-
-                    # # Print raw usage for debugging
-                    # print(f"\n=== USAGE DEBUG ({model}) ===")
-                    # print(f"Raw usage: {response.usage}")
-                    # print(
-                    #     f"Turn usage: input={turn_usage.input_tokens}, output={turn_usage.output_tokens}, current_context={turn_usage.current_context_tokens}"
-                    # )
-                    # print(
-                    #     f"Cache: read={turn_usage.cache_usage.cache_read_tokens}, write={turn_usage.cache_usage.cache_write_tokens}"
-                    # )
-                    # print(f"Effective input: {turn_usage.effective_input_tokens}")
-                    # print(
-                    #     f"Accumulator: total_turns={self.usage_accumulator.turn_count}, cumulative_billing={self.usage_accumulator.cumulative_billing_tokens}, current_context={self.usage_accumulator.current_context_tokens}"
-                    # )
-                    # if self.usage_accumulator.context_usage_percentage:
-                    #     print(
-                    #         f"Context usage: {self.usage_accumulator.context_usage_percentage:.1f}% of {self.usage_accumulator.context_window_size}"
-                    #     )
-                    # if self.usage_accumulator.cache_hit_rate:
-                    #     print(f"Cache hit rate: {self.usage_accumulator.cache_hit_rate:.1f}%")
-                    # print("===========================\n")
+                    # self._show_usage(response.usage, turn_usage)
                 except Exception as e:
                     self.logger.warning(f"Failed to track usage: {e}")
 
@@ -201,7 +297,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                 ) from response
             elif isinstance(response, BaseException):
                 error_details = str(response)
-                self.logger.error(f"Error: {error_details}", data=executor_result)
+                self.logger.error(f"Error: {error_details}", data=response)
 
                 # Try to extract more useful information for API errors
                 if hasattr(response, "status_code") and hasattr(response, "response"):
@@ -214,13 +310,13 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
                 # Convert other errors to text response
                 error_message = f"Error during generation: {error_details}"
                 response = Message(
-                    id="error",  # Required field
-                    model="error",  # Required field
+                    id="error",
+                    model="error",
                     role="assistant",
                     type="message",
                     content=[TextBlock(type="text", text=error_message)],
-                    stop_reason="end_turn",  # Must be one of the allowed values
-                    usage=Usage(input_tokens=0, output_tokens=0),  # Required field
+                    stop_reason="end_turn",
+                    usage=Usage(input_tokens=0, output_tokens=0),
                 )
 
                 self.logger.debug(
@@ -230,7 +326,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
             response_as_message = self.convert_message_to_message_param(response)
             messages.append(response_as_message)
-            if response.content[0].type == "text":
+            if response.content and response.content[0].type == "text":
                 responses.append(TextContent(type="text", text=response.content[0].text))
 
             if response.stop_reason == "end_turn":
@@ -290,12 +386,13 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
 
                 # Process all tool calls and collect results
                 tool_results = []
-                for i, content in enumerate(tool_uses):
-                    tool_name = content.name
-                    tool_args = content.input
-                    tool_use_id = content.id
+                # Use a different loop variable for tool enumeration if 'i' is outer loop counter
+                for tool_idx, content_block in enumerate(tool_uses):
+                    tool_name = content_block.name
+                    tool_args = content_block.input
+                    tool_use_id = content_block.id
 
-                    if i == 0:  # Only show message for first tool use
+                    if tool_idx == 0:  # Only show message for first tool use
                         await self.show_assistant_message(message_text, tool_name)
 
                     self.show_tool_call(available_tools, tool_name, tool_args)
@@ -320,11 +417,7 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
         if params.use_history:
             # Get current prompt messages
             prompt_messages = self.history.get(include_completion_history=False)
-
-            # Calculate new conversation messages (excluding prompts)
             new_messages = messages[len(prompt_messages) :]
-
-            # Update conversation history
             self.history.set(new_messages)
 
         self._log_chat_finished(model=model)
@@ -362,8 +455,26 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
             multipart_messages[:-1] if last_message.role == "user" else multipart_messages
         )
         converted = []
+
+        # Get cache mode configuration
+        cache_mode = self._get_cache_mode()
+
         for msg in messages_to_add:
-            converted.append(AnthropicConverter.convert_to_anthropic(msg))
+            anthropic_msg = AnthropicConverter.convert_to_anthropic(msg)
+
+            # Apply caching to template messages if cache_mode is "prompt" or "auto"
+            if is_template and cache_mode in ["prompt", "auto"] and anthropic_msg.get("content"):
+                content_list = anthropic_msg["content"]
+                if isinstance(content_list, list) and content_list:
+                    # Apply cache control to the last content block
+                    last_block = content_list[-1]
+                    if isinstance(last_block, dict):
+                        last_block["cache_control"] = {"type": "ephemeral"}
+                        self.logger.debug(
+                            f"Applied cache_control to template message with role {anthropic_msg.get('role')}"
+                        )
+
+            converted.append(anthropic_msg)
 
         self.history.extend(converted, is_prompt=is_template)
 
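As an illustration of the branch above, a converted template message ends up with `cache_control` on its final content block only (the role and text here are invented):

```python
anthropic_msg = {
    "role": "user",
    "content": [
        {"type": "text", "text": "You will review pull requests."},
        {
            "type": "text",
            "text": "Focus on security and error handling.",
            "cache_control": {"type": "ephemeral"},  # only the last block is marked
        },
    ],
}
```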
@@ -398,6 +509,28 @@ class AnthropicAugmentedLLM(AugmentedLLM[MessageParam, Message]):
         )
         return self._structured_from_multipart(result, model)
 
+    def _show_usage(self, raw_usage: Usage, turn_usage: TurnUsage) -> None:
+        # Print raw usage for debugging
+        print(f"\n=== USAGE DEBUG ({turn_usage.model}) ===")
+        print(f"Raw usage: {raw_usage}")
+        print(
+            f"Turn usage: input={turn_usage.input_tokens}, output={turn_usage.output_tokens}, current_context={turn_usage.current_context_tokens}"
+        )
+        print(
+            f"Cache: read={turn_usage.cache_usage.cache_read_tokens}, write={turn_usage.cache_usage.cache_write_tokens}"
+        )
+        print(f"Effective input: {turn_usage.effective_input_tokens}")
+        print(
+            f"Accumulator: total_turns={self.usage_accumulator.turn_count}, cumulative_billing={self.usage_accumulator.cumulative_billing_tokens}, current_context={self.usage_accumulator.current_context_tokens}"
+        )
+        if self.usage_accumulator.context_usage_percentage:
+            print(
+                f"Context usage: {self.usage_accumulator.context_usage_percentage:.1f}% of {self.usage_accumulator.context_window_size}"
+            )
+        if self.usage_accumulator.cache_hit_rate:
+            print(f"Cache hit rate: {self.usage_accumulator.cache_hit_rate:.1f}%")
+        print("===========================\n")
+
     @classmethod
     def convert_message_to_message_param(cls, message: Message, **kwargs) -> MessageParam:
         """Convert a response object to an input parameter object to allow LLM calls to be chained."""
@@ -1,4 +1,4 @@
-from openai import AuthenticationError, AzureOpenAI, OpenAI
+from openai import AsyncAzureOpenAI, AsyncOpenAI, AuthenticationError
 
 from mcp_agent.core.exceptions import ProviderKeyError
 from mcp_agent.llm.provider_types import Provider
@@ -93,7 +93,7 @@ class AzureOpenAIAugmentedLLM(OpenAIAugmentedLLM):
         if not self.resource_name and self.base_url:
             self.resource_name = _extract_resource_name(self.base_url)
 
-    def _openai_client(self) -> OpenAI:
+    def _openai_client(self) -> AsyncOpenAI:
         """
         Returns an AzureOpenAI client, handling both API Key and DefaultAzureCredential.
         """
@@ -104,7 +104,7 @@ class AzureOpenAIAugmentedLLM(OpenAIAugmentedLLM):
                     "Missing Azure endpoint",
                     "azure_endpoint (base_url) is None at client creation time.",
                 )
-            return AzureOpenAI(
+            return AsyncAzureOpenAI(
                 azure_ad_token_provider=self.get_azure_token,
                 azure_endpoint=self.base_url,
                 api_version=self.api_version,
@@ -116,7 +116,7 @@ class AzureOpenAIAugmentedLLM(OpenAIAugmentedLLM):
                     "Missing Azure endpoint",
                     "azure_endpoint (base_url) is None at client creation time.",
                 )
-            return AzureOpenAI(
+            return AsyncAzureOpenAI(
                 api_key=self.api_key,
                 azure_endpoint=self.base_url,
                 api_version=self.api_version,