llama-index-llms-bedrock-converse 0.5.4__py3-none-any.whl → 0.12.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_index/llms/bedrock_converse/base.py +415 -66
- llama_index/llms/bedrock_converse/utils.py +320 -52
- {llama_index_llms_bedrock_converse-0.5.4.dist-info → llama_index_llms_bedrock_converse-0.12.3.dist-info}/METADATA +53 -4
- llama_index_llms_bedrock_converse-0.12.3.dist-info/RECORD +7 -0
- {llama_index_llms_bedrock_converse-0.5.4.dist-info → llama_index_llms_bedrock_converse-0.12.3.dist-info}/WHEEL +1 -1
- llama_index_llms_bedrock_converse-0.5.4.dist-info/RECORD +0 -7
- {llama_index_llms_bedrock_converse-0.5.4.dist-info → llama_index_llms_bedrock_converse-0.12.3.dist-info}/licenses/LICENSE +0 -0
@@ -1,8 +1,10 @@
+import warnings
 from typing import (
     Any,
     Callable,
     Dict,
     List,
+    Literal,
     Optional,
     Sequence,
     Tuple,
@@ -20,6 +22,9 @@ from llama_index.core.base.llms.types import (
     CompletionResponseGen,
     LLMMetadata,
     MessageRole,
+    TextBlock,
+    ThinkingBlock,
+    ToolCallBlock,
 )
 from llama_index.core.bridge.pydantic import Field, PrivateAttr
 from llama_index.core.callbacks import CallbackManager
@@ -46,6 +51,8 @@ from llama_index.llms.bedrock_converse.utils import (
     join_two_dicts,
     messages_to_converse_messages,
     tools_to_converse_tools,
+    is_reasoning,
+    ThinkingDict,
 )

 if TYPE_CHECKING:
@@ -138,18 +145,41 @@ class BedrockConverse(FunctionCallingLLM):
         default=60.0,
         description="The timeout for the Bedrock API request in seconds. It will be used for both connect and read timeouts.",
     )
+    system_prompt_caching: bool = Field(
+        default=False,
+        description="Whether to cache the system prompt. If you are using a system prompt, you should set this to True.",
+    )
+    tool_caching: bool = Field(
+        default=False,
+        description="Whether to cache the tools. If you are using tools, you should set this to True.",
+    )
     guardrail_identifier: Optional[str] = Field(
         description="The unique identifier of the guardrail that you want to use. If you don't provide a value, no guardrail is applied to the invocation."
     )
     guardrail_version: Optional[str] = Field(
         description="The version number for the guardrail. The value can also be DRAFT"
     )
+    guardrail_stream_processing_mode: Optional[Literal["sync", "async"]] = Field(
+        description=(
+            "The stream processing mode to use when leveraging a guardrail in a streaming request (ConverseStream). "
+            "If set, the specified mode will be included in the request's guardrail configuration object, altering the streaming response behavior. "
+            "If a value is not provided, no mode will be explicitly included in the request's guardrail configuration object, and thus Amazon Bedrock's default, Synchronous Mode, will be used."
+        )
+    )
     application_inference_profile_arn: Optional[str] = Field(
         description="The ARN of an application inference profile to invoke in place of the model. If provided, make sure the model argument refers to the same one underlying the application inference profile."
     )
     trace: Optional[str] = Field(
         description="Specifies whether to enable or disable the Bedrock trace. If enabled, you can see the full Bedrock trace."
     )
+    thinking: Optional[ThinkingDict] = Field(
+        description="Specifies the thinking configuration of a reasoning model. Only applicable to Anthropic and DeepSeek models",
+        default=None,
+    )
+    supports_forced_tool_calls: bool = Field(
+        default=True,
+        description="Whether the model supports forced tool calls. If True, the model can be forced to call at least 1 or more tools.",
+    )
     additional_kwargs: Dict[str, Any] = Field(
         default_factory=dict,
         description="Additional kwargs for the bedrock invokeModel request.",
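
The fields added above are the surface of three new features: prompt and tool caching, a guardrail stream processing mode, and extended thinking. A minimal sketch of wiring them together at construction time follows; the model ID and the {"type": "enabled", "budget_tokens": ...} shape for ThinkingDict follow Bedrock's Anthropic reasoning convention and are assumptions, not something this diff pins down.

from llama_index.llms.bedrock_converse import BedrockConverse

# Hypothetical setup; requires AWS credentials and Bedrock model access.
llm = BedrockConverse(
    model="us.anthropic.claude-sonnet-4-20250514-v1:0",  # assumed model ID
    region_name="us-east-1",
    system_prompt="You are a terse assistant.",
    system_prompt_caching=True,  # cache the system prompt across requests
    tool_caching=True,  # cache tool definitions across requests
    thinking={"type": "enabled", "budget_tokens": 1024},  # assumed ThinkingDict shape
)
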
@@ -182,14 +212,19 @@ class BedrockConverse(FunctionCallingLLM):
         additional_kwargs: Optional[Dict[str, Any]] = None,
         callback_manager: Optional[CallbackManager] = None,
         system_prompt: Optional[str] = None,
+        system_prompt_caching: Optional[bool] = False,
+        tool_caching: Optional[bool] = False,
         messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
         completion_to_prompt: Optional[Callable[[str], str]] = None,
         pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
         output_parser: Optional[BaseOutputParser] = None,
         guardrail_identifier: Optional[str] = None,
         guardrail_version: Optional[str] = None,
+        guardrail_stream_processing_mode: Optional[Literal["sync", "async"]] = None,
         application_inference_profile_arn: Optional[str] = None,
         trace: Optional[str] = None,
+        thinking: Optional[ThinkingDict] = None,
+        supports_forced_tool_calls: bool = True,
     ) -> None:
         additional_kwargs = additional_kwargs or {}
         callback_manager = callback_manager or CallbackManager([])
@@ -203,6 +238,13 @@ class BedrockConverse(FunctionCallingLLM):
             "botocore_session": botocore_session,
         }

+        if not is_reasoning(model) and thinking is not None:
+            thinking = None
+            warnings.warn(
+                "You set thinking parameters for a non-reasoning models, they will be ignored",
+                UserWarning,
+            )
+
         super().__init__(
             temperature=temperature,
             max_tokens=max_tokens,
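
The guard above silently drops thinking settings when is_reasoning(model) is False, warning instead of raising. A sketch of observing that behavior in a test; the model ID is an assumed non-reasoning one.

import warnings

from llama_index.llms.bedrock_converse import BedrockConverse

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # A non-reasoning model plus a thinking config triggers the UserWarning above.
    llm = BedrockConverse(
        model="amazon.titan-text-express-v1",  # assumed non-reasoning model ID
        region_name="us-east-1",
        thinking={"type": "enabled", "budget_tokens": 1024},  # assumed shape
    )

assert any(issubclass(w.category, UserWarning) for w in caught)
assert llm.thinking is None  # the config was discarded, not forwarded
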
@@ -212,6 +254,8 @@ class BedrockConverse(FunctionCallingLLM):
             model=model,
             callback_manager=callback_manager,
             system_prompt=system_prompt,
+            system_prompt_caching=system_prompt_caching,
+            tool_caching=tool_caching,
             messages_to_prompt=messages_to_prompt,
             completion_to_prompt=completion_to_prompt,
             pydantic_program_mode=pydantic_program_mode,
@@ -229,8 +273,11 @@ class BedrockConverse(FunctionCallingLLM):
             botocore_config=botocore_config,
             guardrail_identifier=guardrail_identifier,
             guardrail_version=guardrail_version,
+            guardrail_stream_processing_mode=guardrail_stream_processing_mode,
             application_inference_profile_arn=application_inference_profile_arn,
             trace=trace,
+            thinking=thinking,
+            supports_forced_tool_calls=supports_forced_tool_calls,
         )

         self._config = None
@@ -252,6 +299,7 @@ class BedrockConverse(FunctionCallingLLM):
                 retries={"max_attempts": max_retries, "mode": "standard"},
                 connect_timeout=timeout,
                 read_timeout=timeout,
+                user_agent_extra="x-client-framework:llama_index",
             )
             if botocore_config is None
             else botocore_config
@@ -317,30 +365,49 @@ class BedrockConverse(FunctionCallingLLM):

     def _get_content_and_tool_calls(
         self, response: Optional[Dict[str, Any]] = None, content: Dict[str, Any] = None
-    ) -> Tuple[
-        …
+    ) -> Tuple[
+        List[Union[TextBlock, ThinkingBlock, ToolCallBlock]], List[str], List[str]
+    ]:
+        assert response is not None or content is not None, (
+            f"Either response or content must be provided. Got response: {response}, content: {content}"
+        )
+        assert response is None or content is None, (
+            f"Only one of response or content should be provided. Got response: {response}, content: {content}"
+        )
         tool_call_ids = []
         status = []
-        …
+        blocks: List[TextBlock | ThinkingBlock | ToolCallBlock] = []
         if content is not None:
             content_list = [content]
         else:
             content_list = response["output"]["message"]["content"]
+
         for content_block in content_list:
             if text := content_block.get("text", None):
-                …
+                blocks.append(TextBlock(text=text))
+            if thinking := content_block.get("reasoningContent", None):
+                blocks.append(
+                    ThinkingBlock(
+                        content=thinking.get("reasoningText", {}).get("text", None),
+                        additional_information={
+                            "signature": thinking.get("reasoningText", {}).get(
+                                "signature", None
+                            )
+                        },
+                    )
+                )
             if tool_usage := content_block.get("toolUse", None):
                 if "toolUseId" not in tool_usage:
                     tool_usage["toolUseId"] = content_block["toolUseId"]
                 if "name" not in tool_usage:
                     tool_usage["name"] = content_block["name"]
-                …
+                blocks.append(
+                    ToolCallBlock(
+                        tool_name=tool_usage.get("name", ""),
+                        tool_call_id=tool_usage.get("toolUseId"),
+                        tool_kwargs=tool_usage.get("input", {}),
+                    )
+                )
             if tool_result := content_block.get("toolResult", None):
                 for tool_result_content in tool_result["content"]:
                     if text := tool_result_content.get("text", None):
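
The rewritten _get_content_and_tool_calls folds Converse content blocks (text, reasoningContent, toolUse) into typed llama-index blocks instead of a flat string plus tool-call dicts. A standalone sketch of the same mapping over an invented Converse-style payload, using only the keys the method reads:

from llama_index.core.base.llms.types import TextBlock, ThinkingBlock, ToolCallBlock

# Invented Converse-style content blocks, mirroring the shapes handled above.
content_list = [
    {"reasoningContent": {"reasoningText": {"text": "Let me check...", "signature": "abc"}}},
    {"text": "The weather is sunny."},
    {"toolUse": {"toolUseId": "tu_1", "name": "get_weather", "input": {"city": "Paris"}}},
]

blocks = []
for block in content_list:
    if text := block.get("text"):
        blocks.append(TextBlock(text=text))
    if thinking := block.get("reasoningContent"):
        reasoning_text = thinking.get("reasoningText", {})
        blocks.append(
            ThinkingBlock(
                content=reasoning_text.get("text"),
                additional_information={"signature": reasoning_text.get("signature")},
            )
        )
    if tool_use := block.get("toolUse"):
        blocks.append(
            ToolCallBlock(
                tool_name=tool_use.get("name", ""),
                tool_call_id=tool_use.get("toolUseId"),
                tool_kwargs=tool_use.get("input", {}),
            )
        )

print([type(b).__name__ for b in blocks])  # ['ThinkingBlock', 'TextBlock', 'ToolCallBlock']
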
@@ -348,19 +415,25 @@ class BedrockConverse(FunctionCallingLLM):
                         tool_call_ids.append(tool_result_content.get("toolUseId", ""))
                         status.append(tool_result.get("status", ""))

-        return …
+        return blocks, tool_call_ids, status

     @llm_chat_callback()
     def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
         # convert Llama Index messages to AWS Bedrock Converse messages
-        converse_messages, system_prompt = messages_to_converse_messages(messages)
+        converse_messages, system_prompt = messages_to_converse_messages(
+            messages, self.model
+        )
         all_kwargs = self._get_all_kwargs(**kwargs)
+        if self.thinking is not None:
+            all_kwargs["thinking"] = self.thinking

         # invoke LLM in AWS Bedrock Converse with retry
         response = converse_with_retry(
             client=self._client,
             messages=converse_messages,
             system_prompt=system_prompt,
+            system_prompt_caching=self.system_prompt_caching,
+            tool_caching=self.tool_caching,
             max_retries=self.max_retries,
             stream=False,
             guardrail_identifier=self.guardrail_identifier,
@@ -369,16 +442,13 @@ class BedrockConverse(FunctionCallingLLM):
             **all_kwargs,
         )

-        …
-            response
-        )
+        blocks, tool_call_ids, status = self._get_content_and_tool_calls(response)

         return ChatResponse(
             message=ChatMessage(
                 role=MessageRole.ASSISTANT,
-                …
+                blocks=blocks,
                 additional_kwargs={
-                    "tool_calls": tool_calls,
                     "tool_call_id": tool_call_ids,
                     "status": status,
                 },
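
Since chat() now builds the assistant message from blocks rather than a content string, callers can pattern-match on block types. A sketch; the model ID is an assumption and AWS credentials are required.

from llama_index.core.base.llms.types import (
    ChatMessage,
    TextBlock,
    ThinkingBlock,
    ToolCallBlock,
)
from llama_index.llms.bedrock_converse import BedrockConverse

llm = BedrockConverse(model="us.anthropic.claude-sonnet-4-20250514-v1:0")  # assumed model ID
resp = llm.chat([ChatMessage(role="user", content="Say hi.")])

for block in resp.message.blocks:
    if isinstance(block, ThinkingBlock):
        print("thinking:", block.content)
    elif isinstance(block, TextBlock):
        print("text:", block.text)
    elif isinstance(block, ToolCallBlock):
        print("tool call:", block.tool_name, block.tool_kwargs)
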
@@ -399,18 +469,25 @@ class BedrockConverse(FunctionCallingLLM):
         self, messages: Sequence[ChatMessage], **kwargs: Any
     ) -> ChatResponseGen:
         # convert Llama Index messages to AWS Bedrock Converse messages
-        converse_messages, system_prompt = messages_to_converse_messages(messages)
+        converse_messages, system_prompt = messages_to_converse_messages(
+            messages, self.model
+        )
         all_kwargs = self._get_all_kwargs(**kwargs)
+        if self.thinking is not None:
+            all_kwargs["thinking"] = self.thinking

         # invoke LLM in AWS Bedrock Converse with retry
         response = converse_with_retry(
             client=self._client,
             messages=converse_messages,
             system_prompt=system_prompt,
+            system_prompt_caching=self.system_prompt_caching,
+            tool_caching=self.tool_caching,
             max_retries=self.max_retries,
             stream=True,
             guardrail_identifier=self.guardrail_identifier,
             guardrail_version=self.guardrail_version,
+            guardrail_stream_processing_mode=self.guardrail_stream_processing_mode,
             trace=self.trace,
             **all_kwargs,
         )
@@ -420,11 +497,25 @@ class BedrockConverse(FunctionCallingLLM):
             tool_calls = []  # Track tool calls separately
             current_tool_call = None  # Track the current tool call being built
             role = MessageRole.ASSISTANT
+            thinking = ""
+            thinking_signature = ""
+
             for chunk in response["stream"]:
                 if content_block_delta := chunk.get("contentBlockDelta"):
                     content_delta = content_block_delta["delta"]
                     content = join_two_dicts(content, content_delta)

+                    thinking_delta_value = None
+                    if "reasoningContent" in content_delta:
+                        reasoning_text = content_delta.get("reasoningContent", {}).get(
+                            "text", ""
+                        )
+                        thinking += reasoning_text
+                        thinking_delta_value = reasoning_text
+                        thinking_signature += content_delta.get(
+                            "reasoningContent", {}
+                        ).get("signature", "")
+
                     # If this delta contains tool call info, update current tool call
                     if "toolUse" in content_delta:
                         tool_use_delta = content_delta["toolUse"]
@@ -433,31 +524,73 @@ class BedrockConverse(FunctionCallingLLM):
                         # Handle the input field specially - concatenate partial JSON strings
                         if "input" in tool_use_delta:
                             if "input" in current_tool_call:
-                                current_tool_call["input"] += tool_use_delta["input"]
+                                current_tool_call["input"] += tool_use_delta[
+                                    "input"
+                                ]
                             else:
                                 current_tool_call["input"] = tool_use_delta["input"]

                             # Remove input from the delta to prevent it from being processed again
-                            tool_use_without_input = {k: v for k, v in tool_use_delta.items() if k != "input"}
+                            tool_use_without_input = {
+                                k: v
+                                for k, v in tool_use_delta.items()
+                                if k != "input"
+                            }
                             if tool_use_without_input:
-                                current_tool_call = join_two_dicts(current_tool_call, tool_use_without_input)
+                                current_tool_call = join_two_dicts(
+                                    current_tool_call, tool_use_without_input
+                                )
                         else:
                             # For other fields, use the normal joining
-                            current_tool_call = join_two_dicts(current_tool_call, tool_use_delta)
+                            current_tool_call = join_two_dicts(
+                                current_tool_call, tool_use_delta
+                            )
+
+                    blocks: List[Union[TextBlock, ThinkingBlock, ToolCallBlock]] = [
+                        TextBlock(text=content.get("text", ""))
+                    ]
+                    if thinking != "":
+                        blocks.insert(
+                            0,
+                            ThinkingBlock(
+                                content=thinking,
+                                additional_information={
+                                    "signature": thinking_signature
+                                },
+                            ),
+                        )
+                    if tool_calls:
+                        for tool_call in tool_calls:
+                            blocks.append(
+                                ToolCallBlock(
+                                    tool_kwargs=tool_call.get("input", {}),
+                                    tool_name=tool_call.get("name", ""),
+                                    tool_call_id=tool_call.get("toolUseId"),
+                                )
+                            )
+
+                    response_additional_kwargs = self._get_response_token_counts(
+                        dict(chunk)
+                    )
+                    if thinking_delta_value is not None:
+                        response_additional_kwargs["thinking_delta"] = (
+                            thinking_delta_value
+                        )

                     yield ChatResponse(
                         message=ChatMessage(
                             role=role,
-                            …
+                            blocks=blocks,
                             additional_kwargs={
-                                "…
-                                …
+                                "tool_call_id": [
+                                    tc.get("toolUseId", "") for tc in tool_calls
+                                ],
                                 "status": [],  # Will be populated when tool results come in
                             },
                         ),
                         delta=content_delta.get("text", ""),
                         raw=chunk,
-                        additional_kwargs=self._get_response_token_counts(dict(chunk)),
+                        additional_kwargs=response_additional_kwargs,
                     )
                 elif content_block_start := chunk.get("contentBlockStart"):
                     # New tool call starting
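
During streaming, reasoning text accumulates into a ThinkingBlock while answer text goes out via delta; the new thinking_delta key in each chunk's additional_kwargs exposes the incremental reasoning text. A sketch of consuming both; model ID and ThinkingDict shape are assumptions.

from llama_index.core.base.llms.types import ChatMessage
from llama_index.llms.bedrock_converse import BedrockConverse

llm = BedrockConverse(
    model="us.anthropic.claude-sonnet-4-20250514-v1:0",  # assumed model ID
    thinking={"type": "enabled", "budget_tokens": 1024},  # assumed ThinkingDict shape
)

for chunk in llm.stream_chat([ChatMessage(role="user", content="Why is the sky blue?")]):
    # Reasoning tokens arrive separately from answer tokens.
    if thinking_delta := chunk.additional_kwargs.get("thinking_delta"):
        print(f"[thinking] {thinking_delta}", end="", flush=True)
    if chunk.delta:
        print(chunk.delta, end="", flush=True)
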
@@ -468,18 +601,90 @@ class BedrockConverse(FunctionCallingLLM):
                     # Add to our list of tool calls
                     tool_calls.append(current_tool_call)

+                    blocks: List[Union[TextBlock, ThinkingBlock, ToolCallBlock]] = [
+                        TextBlock(text=content.get("text", ""))
+                    ]
+                    if thinking != "":
+                        blocks.insert(
+                            0,
+                            ThinkingBlock(
+                                content=thinking,
+                                additional_information={
+                                    "signature": thinking_signature
+                                },
+                            ),
+                        )
+
+                    if tool_calls:
+                        for tool_call in tool_calls:
+                            blocks.append(
+                                ToolCallBlock(
+                                    tool_kwargs=tool_call.get("input", {}),
+                                    tool_name=tool_call.get("name", ""),
+                                    tool_call_id=tool_call.get("toolUseId"),
+                                )
+                            )
+
                     yield ChatResponse(
                         message=ChatMessage(
                             role=role,
-                            …
+                            blocks=blocks,
                             additional_kwargs={
-                                "…
-                                …
+                                "tool_call_id": [
+                                    tc.get("toolUseId", "") for tc in tool_calls
+                                ],
                                 "status": [],  # Will be populated when tool results come in
                             },
                         ),
                         raw=chunk,
                     )
+                elif message_stop := chunk.get("messageStop"):
+                    # Handle messageStop event - this contains the stop reason
+                    # We don't yield here, just track the event
+                    pass
+                elif metadata := chunk.get("metadata"):
+                    # Handle metadata event - this contains the final token usage
+                    if usage := metadata.get("usage"):
+                        # Yield a final response with correct token usage
+                        blocks: List[Union[TextBlock, ThinkingBlock, ToolCallBlock]] = [
+                            TextBlock(text=content.get("text", ""))
+                        ]
+                        if thinking != "":
+                            blocks.insert(
+                                0,
+                                ThinkingBlock(
+                                    content=thinking,
+                                    additional_information={
+                                        "signature": thinking_signature
+                                    },
+                                ),
+                            )
+                        if tool_calls:
+                            for tool_call in tool_calls:
+                                blocks.append(
+                                    ToolCallBlock(
+                                        tool_kwargs=tool_call.get("input", {}),
+                                        tool_name=tool_call.get("name", ""),
+                                        tool_call_id=tool_call.get("toolUseId"),
+                                    )
+                                )
+
+                        yield ChatResponse(
+                            message=ChatMessage(
+                                role=role,
+                                blocks=blocks,
+                                additional_kwargs={
+                                    "tool_call_id": [
+                                        tc.get("toolUseId", "") for tc in tool_calls
+                                    ],
+                                    "status": [],
+                                },
+                            ),
+                            delta="",
+                            thinking_delta=None,
+                            raw=chunk,
+                            additional_kwargs=self._get_response_token_counts(metadata),
+                        )

         return gen()

@@ -495,8 +700,12 @@ class BedrockConverse(FunctionCallingLLM):
         self, messages: Sequence[ChatMessage], **kwargs: Any
     ) -> ChatResponse:
         # convert Llama Index messages to AWS Bedrock Converse messages
-        converse_messages, system_prompt = messages_to_converse_messages(messages)
+        converse_messages, system_prompt = messages_to_converse_messages(
+            messages, self.model
+        )
         all_kwargs = self._get_all_kwargs(**kwargs)
+        if self.thinking is not None:
+            all_kwargs["thinking"] = self.thinking

         # invoke LLM in AWS Bedrock Converse with retry
         response = await converse_with_retry_async(
@@ -504,6 +713,8 @@ class BedrockConverse(FunctionCallingLLM):
             config=self._config,
             messages=converse_messages,
             system_prompt=system_prompt,
+            system_prompt_caching=self.system_prompt_caching,
+            tool_caching=self.tool_caching,
             max_retries=self.max_retries,
             stream=False,
             guardrail_identifier=self.guardrail_identifier,
@@ -513,16 +724,13 @@ class BedrockConverse(FunctionCallingLLM):
             **all_kwargs,
         )

-        …
-            response
-        )
+        blocks, tool_call_ids, status = self._get_content_and_tool_calls(response)

         return ChatResponse(
             message=ChatMessage(
                 role=MessageRole.ASSISTANT,
-                …
+                blocks=blocks,
                 additional_kwargs={
-                    "tool_calls": tool_calls,
                     "tool_call_id": tool_call_ids,
                     "status": status,
                 },
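
The async path mirrors the synchronous one, including the caching kwargs and the block-based response. A sketch, with an assumed model ID:

import asyncio

from llama_index.core.base.llms.types import ChatMessage
from llama_index.llms.bedrock_converse import BedrockConverse

async def main() -> None:
    llm = BedrockConverse(model="us.anthropic.claude-sonnet-4-20250514-v1:0")  # assumed model ID
    resp = await llm.achat([ChatMessage(role="user", content="Say hi.")])
    # ChatMessage.content concatenates the text blocks of the response.
    print(resp.message.content)

asyncio.run(main())
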
@@ -543,8 +751,12 @@ class BedrockConverse(FunctionCallingLLM):
         self, messages: Sequence[ChatMessage], **kwargs: Any
     ) -> ChatResponseAsyncGen:
         # convert Llama Index messages to AWS Bedrock Converse messages
-        converse_messages, system_prompt = messages_to_converse_messages(messages)
+        converse_messages, system_prompt = messages_to_converse_messages(
+            messages, self.model
+        )
         all_kwargs = self._get_all_kwargs(**kwargs)
+        if self.thinking is not None:
+            all_kwargs["thinking"] = self.thinking

         # invoke LLM in AWS Bedrock Converse with retry
         response_gen = await converse_with_retry_async(
@@ -552,10 +764,13 @@ class BedrockConverse(FunctionCallingLLM):
             config=self._config,
             messages=converse_messages,
             system_prompt=system_prompt,
+            system_prompt_caching=self.system_prompt_caching,
+            tool_caching=self.tool_caching,
             max_retries=self.max_retries,
             stream=True,
             guardrail_identifier=self.guardrail_identifier,
             guardrail_version=self.guardrail_version,
+            guardrail_stream_processing_mode=self.guardrail_stream_processing_mode,
             trace=self.trace,
             boto_client_kwargs=self._boto_client_kwargs,
             **all_kwargs,
@@ -566,11 +781,25 @@ class BedrockConverse(FunctionCallingLLM):
             tool_calls = []  # Track tool calls separately
             current_tool_call = None  # Track the current tool call being built
             role = MessageRole.ASSISTANT
+            thinking = ""
+            thinking_signature = ""
+
             async for chunk in response_gen:
                 if content_block_delta := chunk.get("contentBlockDelta"):
                     content_delta = content_block_delta["delta"]
                     content = join_two_dicts(content, content_delta)

+                    thinking_delta_value = None
+                    if "reasoningContent" in content_delta:
+                        reasoning_text = content_delta.get("reasoningContent", {}).get(
+                            "text", ""
+                        )
+                        thinking += reasoning_text
+                        thinking_delta_value = reasoning_text
+                        thinking_signature += content_delta.get(
+                            "reasoningContent", {}
+                        ).get("signature", "")
+
                     # If this delta contains tool call info, update current tool call
                     if "toolUse" in content_delta:
                         tool_use_delta = content_delta["toolUse"]
@@ -579,31 +808,73 @@ class BedrockConverse(FunctionCallingLLM):
                         # Handle the input field specially - concatenate partial JSON strings
                         if "input" in tool_use_delta:
                             if "input" in current_tool_call:
-                                current_tool_call["input"] += tool_use_delta["input"]
+                                current_tool_call["input"] += tool_use_delta[
+                                    "input"
+                                ]
                             else:
                                 current_tool_call["input"] = tool_use_delta["input"]

                             # Remove input from the delta to prevent it from being processed again
-                            tool_use_without_input = {k: v for k, v in tool_use_delta.items() if k != "input"}
+                            tool_use_without_input = {
+                                k: v
+                                for k, v in tool_use_delta.items()
+                                if k != "input"
+                            }
                             if tool_use_without_input:
-                                current_tool_call = join_two_dicts(current_tool_call, tool_use_without_input)
+                                current_tool_call = join_two_dicts(
+                                    current_tool_call, tool_use_without_input
+                                )
                         else:
                             # For other fields, use the normal joining
-                            current_tool_call = join_two_dicts(current_tool_call, tool_use_delta)
+                            current_tool_call = join_two_dicts(
+                                current_tool_call, tool_use_delta
+                            )
+                    blocks: List[Union[TextBlock, ThinkingBlock, ToolCallBlock]] = [
+                        TextBlock(text=content.get("text", ""))
+                    ]
+                    if thinking != "":
+                        blocks.insert(
+                            0,
+                            ThinkingBlock(
+                                content=thinking,
+                                additional_information={
+                                    "signature": thinking_signature
+                                },
+                            ),
+                        )
+
+                    if tool_calls:
+                        for tool_call in tool_calls:
+                            blocks.append(
+                                ToolCallBlock(
+                                    tool_kwargs=tool_call.get("input", {}),
+                                    tool_name=tool_call.get("name", ""),
+                                    tool_call_id=tool_call.get("toolUseId"),
+                                )
+                            )
+
+                    response_additional_kwargs = self._get_response_token_counts(
+                        dict(chunk)
+                    )
+                    if thinking_delta_value is not None:
+                        response_additional_kwargs["thinking_delta"] = (
+                            thinking_delta_value
+                        )

                     yield ChatResponse(
                         message=ChatMessage(
                             role=role,
-                            …
+                            blocks=blocks,
                             additional_kwargs={
-                                "…
-                                …
+                                "tool_call_id": [
+                                    tc.get("toolUseId", "") for tc in tool_calls
+                                ],
                                 "status": [],  # Will be populated when tool results come in
                             },
                         ),
                         delta=content_delta.get("text", ""),
                         raw=chunk,
-                        additional_kwargs=self._get_response_token_counts(dict(chunk)),
+                        additional_kwargs=response_additional_kwargs,
                     )
                 elif content_block_start := chunk.get("contentBlockStart"):
                     # New tool call starting
@@ -614,18 +885,91 @@ class BedrockConverse(FunctionCallingLLM):
                     # Add to our list of tool calls
                     tool_calls.append(current_tool_call)

+                    blocks: List[Union[TextBlock, ThinkingBlock, ToolCallBlock]] = [
+                        TextBlock(text=content.get("text", ""))
+                    ]
+                    if thinking != "":
+                        blocks.insert(
+                            0,
+                            ThinkingBlock(
+                                content=thinking,
+                                additional_information={
+                                    "signature": thinking_signature
+                                },
+                            ),
+                        )
+
+                    if tool_calls:
+                        for tool_call in tool_calls:
+                            blocks.append(
+                                ToolCallBlock(
+                                    tool_kwargs=tool_call.get("input", {}),
+                                    tool_name=tool_call.get("name", ""),
+                                    tool_call_id=tool_call.get("toolUseId"),
+                                )
+                            )
+
                     yield ChatResponse(
                         message=ChatMessage(
                             role=role,
-                            …
+                            blocks=blocks,
                             additional_kwargs={
-                                "…
-                                …
+                                "tool_call_id": [
+                                    tc.get("toolUseId", "") for tc in tool_calls
+                                ],
                                 "status": [],  # Will be populated when tool results come in
                             },
                         ),
                         raw=chunk,
                     )
+                elif chunk.get("messageStop"):
+                    # Handle messageStop event - this contains the stop reason
+                    # We don't yield here, just track the event
+                    pass
+                elif metadata := chunk.get("metadata"):
+                    # Handle metadata event - this contains the final token usage
+                    if usage := metadata.get("usage"):
+                        # Yield a final response with correct token usage
+                        blocks: List[Union[TextBlock, ThinkingBlock, ToolCallBlock]] = [
+                            TextBlock(text=content.get("text", ""))
+                        ]
+                        if thinking != "":
+                            blocks.insert(
+                                0,
+                                ThinkingBlock(
+                                    content=thinking,
+                                    additional_information={
+                                        "signature": thinking_signature
+                                    },
+                                ),
+                            )
+
+                        if tool_calls:
+                            for tool_call in tool_calls:
+                                blocks.append(
+                                    ToolCallBlock(
+                                        tool_kwargs=tool_call.get("input", {}),
+                                        tool_name=tool_call.get("name", ""),
+                                        tool_call_id=tool_call.get("toolUseId"),
+                                    )
+                                )
+
+                        yield ChatResponse(
+                            message=ChatMessage(
+                                role=role,
+                                blocks=blocks,
+                                additional_kwargs={
+                                    "tool_call_id": [
+                                        tc.get("toolUseId", "") for tc in tool_calls
+                                    ],
+                                    "status": [],
+                                },
+                            ),
+                            delta="",
+                            thinking_delta=None,
+                            raw=chunk,
+                            additional_kwargs=self._get_response_token_counts(metadata),
+                        )

         return gen()

@@ -643,6 +987,8 @@ class BedrockConverse(FunctionCallingLLM):
         chat_history: Optional[List[ChatMessage]] = None,
         verbose: bool = False,
         allow_parallel_tool_calls: bool = False,
+        tool_required: bool = False,
+        tool_caching: bool = False,
         tool_choice: Optional[dict] = None,
         **kwargs: Any,
     ) -> Dict[str, Any]:
@@ -656,11 +1002,13 @@ class BedrockConverse(FunctionCallingLLM):
             chat_history.append(user_msg)

         # convert Llama Index tools to AWS Bedrock Converse tools
-        tool_config = tools_to_converse_tools(
-            …
-        )
+        tool_config = tools_to_converse_tools(
+            tools,
+            tool_choice=tool_choice,
+            tool_required=tool_required,
+            tool_caching=tool_caching,
+            supports_forced_tool_calls=self.supports_forced_tool_calls,
+        )

         return {
             "messages": chat_history,
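
With tool_required and tool_caching threaded through to tools_to_converse_tools, a forced tool call can be requested from the standard function-calling API. A sketch; the weather tool is invented, and the keyword arguments are forwarded to _prepare_chat_with_tools:

from llama_index.core.tools import FunctionTool
from llama_index.llms.bedrock_converse import BedrockConverse

def get_weather(city: str) -> str:
    """Return a fake weather report for a city."""
    return f"It is sunny in {city}."

weather_tool = FunctionTool.from_defaults(fn=get_weather)

llm = BedrockConverse(model="us.anthropic.claude-sonnet-4-20250514-v1:0")  # assumed model ID

# tool_required forces at least one tool call when supports_forced_tool_calls=True.
resp = llm.chat_with_tools(
    tools=[weather_tool],
    user_msg="What's the weather in Paris?",
    tool_required=True,
    tool_caching=True,
)
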
@@ -688,7 +1036,11 @@ class BedrockConverse(FunctionCallingLLM):
         **kwargs: Any,
     ) -> List[ToolSelection]:
         """Predict and call the tool."""
-        tool_calls = response.message.additional_kwargs.get("tool_calls", [])
+        tool_calls = [
+            block
+            for block in response.message.blocks
+            if isinstance(block, ToolCallBlock)
+        ]

         if len(tool_calls) < 1:
             if error_on_no_tool_call:
@@ -700,29 +1052,23 @@ class BedrockConverse(FunctionCallingLLM):

         tool_selections = []
         for tool_call in tool_calls:
-            if (
-                "toolUseId" not in tool_call
-                or "name" not in tool_call
-            ):
-                raise ValueError("Invalid tool call.")
-
             # handle empty inputs
             argument_dict = {}
-            if isinstance(tool_call["input"], str):
+            if isinstance(tool_call.tool_kwargs, str):
                 # TODO parse_partial_json is not perfect
                 try:
-                    argument_dict = parse_partial_json(tool_call["input"])
+                    argument_dict = parse_partial_json(tool_call.tool_kwargs)
                 except ValueError:
                     argument_dict = {}
-            elif isinstance(tool_call["input"], dict):
-                argument_dict = tool_call["input"]
+            elif isinstance(tool_call.tool_kwargs, dict):
+                argument_dict = tool_call.tool_kwargs
             else:
                 continue

             tool_selections.append(
                 ToolSelection(
-                    tool_id=tool_call["toolUseId"],
-                    tool_name=tool_call["name"],
+                    tool_id=tool_call.tool_call_id or "",
+                    tool_name=tool_call.tool_name,
                     tool_kwargs=argument_dict,
                 )
             )
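
get_tool_calls_from_response now pulls typed ToolCallBlocks off message.blocks rather than dicts out of additional_kwargs, so ToolSelection fields come from block attributes. Continuing the chat_with_tools sketch above:

# Continuing the chat_with_tools sketch above.
tool_selections = llm.get_tool_calls_from_response(resp, error_on_no_tool_call=False)
for selection in tool_selections:
    print(selection.tool_name, selection.tool_kwargs)  # e.g. get_weather {'city': 'Paris'}
    print(get_weather(**selection.tool_kwargs))
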
@@ -741,8 +1087,11 @@ class BedrockConverse(FunctionCallingLLM):
             return {}

         # Convert Bedrock's token count format to match OpenAI's format
+        # Cache token formats respecting Anthropic format
         return {
             "prompt_tokens": usage.get("inputTokens", 0),
             "completion_tokens": usage.get("outputTokens", 0),
             "total_tokens": usage.get("totalTokens", 0),
+            "cache_read_input_tokens": usage.get("cacheReadInputTokens", 0),
+            "cache_creation_input_tokens": usage.get("cacheWriteInputTokens", 0),
         }
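
The usage mapping above adds Anthropic-style cache counters alongside the OpenAI-style token counts, and chat()/achat() attach the result to ChatResponse.additional_kwargs. A sketch of reading it, continuing the earlier setup:

from llama_index.core.base.llms.types import ChatMessage

resp = llm.chat([ChatMessage(role="user", content="Hello again.")])
usage = resp.additional_kwargs  # populated via _get_response_token_counts
print(usage.get("prompt_tokens"), usage.get("completion_tokens"))
print(usage.get("cache_read_input_tokens"))      # tokens read from the prompt cache
print(usage.get("cache_creation_input_tokens"))  # tokens written to the prompt cache
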