letta-nightly 0.11.7.dev20251006104136__py3-none-any.whl → 0.11.7.dev20251008104128__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. letta/adapters/letta_llm_adapter.py +1 -0
  2. letta/adapters/letta_llm_request_adapter.py +0 -1
  3. letta/adapters/letta_llm_stream_adapter.py +7 -2
  4. letta/adapters/simple_llm_request_adapter.py +88 -0
  5. letta/adapters/simple_llm_stream_adapter.py +192 -0
  6. letta/agents/agent_loop.py +6 -0
  7. letta/agents/ephemeral_summary_agent.py +2 -1
  8. letta/agents/helpers.py +142 -6
  9. letta/agents/letta_agent.py +13 -33
  10. letta/agents/letta_agent_batch.py +2 -4
  11. letta/agents/letta_agent_v2.py +87 -77
  12. letta/agents/letta_agent_v3.py +899 -0
  13. letta/agents/voice_agent.py +2 -6
  14. letta/constants.py +8 -4
  15. letta/errors.py +40 -0
  16. letta/functions/function_sets/base.py +84 -4
  17. letta/functions/function_sets/multi_agent.py +0 -3
  18. letta/functions/schema_generator.py +113 -71
  19. letta/groups/dynamic_multi_agent.py +3 -2
  20. letta/groups/helpers.py +1 -2
  21. letta/groups/round_robin_multi_agent.py +3 -2
  22. letta/groups/sleeptime_multi_agent.py +3 -2
  23. letta/groups/sleeptime_multi_agent_v2.py +1 -1
  24. letta/groups/sleeptime_multi_agent_v3.py +17 -17
  25. letta/groups/supervisor_multi_agent.py +84 -80
  26. letta/helpers/converters.py +3 -0
  27. letta/helpers/message_helper.py +4 -0
  28. letta/helpers/tool_rule_solver.py +92 -5
  29. letta/interfaces/anthropic_streaming_interface.py +409 -0
  30. letta/interfaces/gemini_streaming_interface.py +296 -0
  31. letta/interfaces/openai_streaming_interface.py +752 -1
  32. letta/llm_api/anthropic_client.py +126 -16
  33. letta/llm_api/bedrock_client.py +4 -2
  34. letta/llm_api/deepseek_client.py +4 -1
  35. letta/llm_api/google_vertex_client.py +123 -42
  36. letta/llm_api/groq_client.py +4 -1
  37. letta/llm_api/llm_api_tools.py +11 -4
  38. letta/llm_api/llm_client_base.py +6 -2
  39. letta/llm_api/openai.py +32 -2
  40. letta/llm_api/openai_client.py +423 -18
  41. letta/llm_api/xai_client.py +4 -1
  42. letta/main.py +9 -5
  43. letta/memory.py +1 -0
  44. letta/orm/__init__.py +1 -1
  45. letta/orm/agent.py +10 -0
  46. letta/orm/block.py +7 -16
  47. letta/orm/blocks_agents.py +8 -2
  48. letta/orm/files_agents.py +2 -0
  49. letta/orm/job.py +7 -5
  50. letta/orm/mcp_oauth.py +1 -0
  51. letta/orm/message.py +21 -6
  52. letta/orm/organization.py +2 -0
  53. letta/orm/provider.py +6 -2
  54. letta/orm/run.py +71 -0
  55. letta/orm/sandbox_config.py +7 -1
  56. letta/orm/sqlalchemy_base.py +0 -306
  57. letta/orm/step.py +6 -5
  58. letta/orm/step_metrics.py +5 -5
  59. letta/otel/tracing.py +28 -3
  60. letta/plugins/defaults.py +4 -4
  61. letta/prompts/system_prompts/__init__.py +2 -0
  62. letta/prompts/system_prompts/letta_v1.py +25 -0
  63. letta/schemas/agent.py +3 -2
  64. letta/schemas/agent_file.py +9 -3
  65. letta/schemas/block.py +23 -10
  66. letta/schemas/enums.py +21 -2
  67. letta/schemas/job.py +17 -4
  68. letta/schemas/letta_message_content.py +71 -2
  69. letta/schemas/letta_stop_reason.py +5 -5
  70. letta/schemas/llm_config.py +53 -3
  71. letta/schemas/memory.py +1 -1
  72. letta/schemas/message.py +504 -117
  73. letta/schemas/openai/responses_request.py +64 -0
  74. letta/schemas/providers/__init__.py +2 -0
  75. letta/schemas/providers/anthropic.py +16 -0
  76. letta/schemas/providers/ollama.py +115 -33
  77. letta/schemas/providers/openrouter.py +52 -0
  78. letta/schemas/providers/vllm.py +2 -1
  79. letta/schemas/run.py +48 -42
  80. letta/schemas/step.py +2 -2
  81. letta/schemas/step_metrics.py +1 -1
  82. letta/schemas/tool.py +15 -107
  83. letta/schemas/tool_rule.py +88 -5
  84. letta/serialize_schemas/marshmallow_agent.py +1 -0
  85. letta/server/db.py +86 -408
  86. letta/server/rest_api/app.py +61 -10
  87. letta/server/rest_api/dependencies.py +14 -0
  88. letta/server/rest_api/redis_stream_manager.py +19 -8
  89. letta/server/rest_api/routers/v1/agents.py +364 -292
  90. letta/server/rest_api/routers/v1/blocks.py +14 -20
  91. letta/server/rest_api/routers/v1/identities.py +45 -110
  92. letta/server/rest_api/routers/v1/internal_templates.py +21 -0
  93. letta/server/rest_api/routers/v1/jobs.py +23 -6
  94. letta/server/rest_api/routers/v1/messages.py +1 -1
  95. letta/server/rest_api/routers/v1/runs.py +126 -85
  96. letta/server/rest_api/routers/v1/sandbox_configs.py +10 -19
  97. letta/server/rest_api/routers/v1/tools.py +281 -594
  98. letta/server/rest_api/routers/v1/voice.py +1 -1
  99. letta/server/rest_api/streaming_response.py +29 -29
  100. letta/server/rest_api/utils.py +122 -64
  101. letta/server/server.py +160 -887
  102. letta/services/agent_manager.py +236 -919
  103. letta/services/agent_serialization_manager.py +16 -0
  104. letta/services/archive_manager.py +0 -100
  105. letta/services/block_manager.py +211 -168
  106. letta/services/file_manager.py +1 -1
  107. letta/services/files_agents_manager.py +24 -33
  108. letta/services/group_manager.py +0 -142
  109. letta/services/helpers/agent_manager_helper.py +7 -2
  110. letta/services/helpers/run_manager_helper.py +85 -0
  111. letta/services/job_manager.py +96 -411
  112. letta/services/lettuce/__init__.py +6 -0
  113. letta/services/lettuce/lettuce_client_base.py +86 -0
  114. letta/services/mcp_manager.py +38 -6
  115. letta/services/message_manager.py +165 -362
  116. letta/services/organization_manager.py +0 -36
  117. letta/services/passage_manager.py +0 -345
  118. letta/services/provider_manager.py +0 -80
  119. letta/services/run_manager.py +301 -0
  120. letta/services/sandbox_config_manager.py +0 -234
  121. letta/services/step_manager.py +62 -39
  122. letta/services/summarizer/summarizer.py +9 -7
  123. letta/services/telemetry_manager.py +0 -16
  124. letta/services/tool_executor/builtin_tool_executor.py +35 -0
  125. letta/services/tool_executor/core_tool_executor.py +397 -2
  126. letta/services/tool_executor/files_tool_executor.py +3 -3
  127. letta/services/tool_executor/multi_agent_tool_executor.py +30 -15
  128. letta/services/tool_executor/tool_execution_manager.py +6 -8
  129. letta/services/tool_executor/tool_executor_base.py +3 -3
  130. letta/services/tool_manager.py +85 -339
  131. letta/services/tool_sandbox/base.py +24 -13
  132. letta/services/tool_sandbox/e2b_sandbox.py +16 -1
  133. letta/services/tool_schema_generator.py +123 -0
  134. letta/services/user_manager.py +0 -99
  135. letta/settings.py +20 -4
  136. {letta_nightly-0.11.7.dev20251006104136.dist-info → letta_nightly-0.11.7.dev20251008104128.dist-info}/METADATA +3 -5
  137. {letta_nightly-0.11.7.dev20251006104136.dist-info → letta_nightly-0.11.7.dev20251008104128.dist-info}/RECORD +140 -132
  138. letta/agents/temporal/activities/__init__.py +0 -4
  139. letta/agents/temporal/activities/example_activity.py +0 -7
  140. letta/agents/temporal/activities/prepare_messages.py +0 -10
  141. letta/agents/temporal/temporal_agent_workflow.py +0 -56
  142. letta/agents/temporal/types.py +0 -25
  143. {letta_nightly-0.11.7.dev20251006104136.dist-info → letta_nightly-0.11.7.dev20251008104128.dist-info}/WHEEL +0 -0
  144. {letta_nightly-0.11.7.dev20251006104136.dist-info → letta_nightly-0.11.7.dev20251008104128.dist-info}/entry_points.txt +0 -0
  145. {letta_nightly-0.11.7.dev20251006104136.dist-info → letta_nightly-0.11.7.dev20251008104128.dist-info}/licenses/LICENSE +0 -0
letta/llm_api/anthropic_client.py
@@ -10,7 +10,7 @@ from anthropic.types.beta.message_create_params import MessageCreateParamsNonStr
  from anthropic.types.beta.messages import BetaMessageBatch
  from anthropic.types.beta.messages.batch_create_params import Request

- from letta.constants import FUNC_FAILED_HEARTBEAT_MESSAGE, REQ_HEARTBEAT_MESSAGE
+ from letta.constants import FUNC_FAILED_HEARTBEAT_MESSAGE, REQ_HEARTBEAT_MESSAGE, REQUEST_HEARTBEAT_PARAM
  from letta.errors import (
      ContextWindowExceededError,
      ErrorCode,
@@ -31,6 +31,7 @@ from letta.llm_api.llm_client_base import LLMClientBase
  from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION
  from letta.log import get_logger
  from letta.otel.tracing import trace_method
+ from letta.schemas.agent import AgentType
  from letta.schemas.llm_config import LLMConfig
  from letta.schemas.message import Message as PydanticMessage
  from letta.schemas.openai.chat_completion_request import Tool as OpenAITool
@@ -54,15 +55,46 @@ class AnthropicClient(LLMClientBase):
      @deprecated("Synchronous version of this is no longer valid. Will result in model_dump of coroutine")
      def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
          client = self._get_anthropic_client(llm_config, async_client=False)
-         response = client.beta.messages.create(**request_data)
+         betas: list[str] = []
+         # 1M context beta for Sonnet 4/4.5 when enabled
+         try:
+             from letta.settings import model_settings
+
+             if model_settings.anthropic_sonnet_1m and (
+                 llm_config.model.startswith("claude-sonnet-4") or llm_config.model.startswith("claude-sonnet-4-5")
+             ):
+                 betas.append("context-1m-2025-08-07")
+         except Exception:
+             pass
+
+         if betas:
+             response = client.beta.messages.create(**request_data, betas=betas)
+         else:
+             response = client.beta.messages.create(**request_data)
          return response.model_dump()

      @trace_method
      async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
          client = await self._get_anthropic_client_async(llm_config, async_client=True)

+         betas: list[str] = []
+         # interleaved thinking for reasoner
          if llm_config.enable_reasoner:
-             response = await client.beta.messages.create(**request_data, betas=["interleaved-thinking-2025-05-14"])
+             betas.append("interleaved-thinking-2025-05-14")
+
+         # 1M context beta for Sonnet 4/4.5 when enabled
+         try:
+             from letta.settings import model_settings
+
+             if model_settings.anthropic_sonnet_1m and (
+                 llm_config.model.startswith("claude-sonnet-4") or llm_config.model.startswith("claude-sonnet-4-5")
+             ):
+                 betas.append("context-1m-2025-08-07")
+         except Exception:
+             pass
+
+         if betas:
+             response = await client.beta.messages.create(**request_data, betas=betas)
          else:
              response = await client.beta.messages.create(**request_data)

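Note: the Sonnet 1M-context gating above is repeated verbatim in the async and streaming paths; as a minimal sketch (helper name hypothetical, not part of the diff), the check reduces to:

    def sonnet_1m_betas(model: str, enabled: bool) -> list[str]:
        # "claude-sonnet-4-5-..." already matches startswith("claude-sonnet-4"),
        # so the second startswith check in the diff is redundant but harmless
        if enabled and model.startswith("claude-sonnet-4"):
            return ["context-1m-2025-08-07"]
        return []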
@@ -83,11 +115,23 @@ class AnthropicClient(LLMClientBase):
          if llm_config.enable_reasoner:
              betas.append("interleaved-thinking-2025-05-14")

+         # 1M context beta for Sonnet 4/4.5 when enabled
+         try:
+             from letta.settings import model_settings
+
+             if model_settings.anthropic_sonnet_1m and (
+                 llm_config.model.startswith("claude-sonnet-4") or llm_config.model.startswith("claude-sonnet-4-5")
+             ):
+                 betas.append("context-1m-2025-08-07")
+         except Exception:
+             pass
+
          return await client.beta.messages.create(**request_data, betas=betas)

      @trace_method
      async def send_llm_batch_request_async(
          self,
+         agent_type: AgentType,
          agent_messages_mapping: Dict[str, List[PydanticMessage]],
          agent_tools_mapping: Dict[str, List[dict]],
          agent_llm_config_mapping: Dict[str, LLMConfig],
@@ -114,6 +158,7 @@ class AnthropicClient(LLMClientBase):
          try:
              requests = {
                  agent_id: self.build_request_data(
+                     agent_type=agent_type,
                      messages=agent_messages_mapping[agent_id],
                      llm_config=agent_llm_config_mapping[agent_id],
                      tools=agent_tools_mapping[agent_id],
@@ -175,14 +220,19 @@ class AnthropicClient(LLMClientBase):
      @trace_method
      def build_request_data(
          self,
+         agent_type: AgentType,  # if react, use native content + strip heartbeats
          messages: List[PydanticMessage],
          llm_config: LLMConfig,
          tools: Optional[List[dict]] = None,
          force_tool_call: Optional[str] = None,
+         requires_subsequent_tool_call: bool = False,
      ) -> dict:
          # TODO: This needs to get cleaned up. The logic here is pretty confusing.
          # TODO: I really want to get rid of prefixing, it's a recipe for disaster code maintenance wise
-         prefix_fill = True
+         prefix_fill = True if agent_type != AgentType.letta_v1_agent else False
+         is_v1 = agent_type == AgentType.letta_v1_agent
+         # Determine local behavior for putting inner thoughts in kwargs without mutating llm_config
+         put_kwargs = bool(llm_config.put_inner_thoughts_in_kwargs) and not is_v1
          if not self.use_tool_naming:
              raise NotImplementedError("Only tool calling supported on Anthropic API requests")

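Note: the two flags introduced above can be read as one predicate; a behavior-preserving sketch (not part of the diff):

    is_v1 = agent_type == AgentType.letta_v1_agent
    prefix_fill = not is_v1  # same as: True if agent_type != AgentType.letta_v1_agent else False
    put_kwargs = bool(llm_config.put_inner_thoughts_in_kwargs) and not is_v1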
@@ -222,8 +272,9 @@ class AnthropicClient(LLMClientBase):
              # Special case for summarization path
              tools_for_request = None
              tool_choice = None
-         elif self.is_reasoning_model(llm_config) and llm_config.enable_reasoner:
+         elif self.is_reasoning_model(llm_config) and llm_config.enable_reasoner or agent_type == AgentType.letta_v1_agent:
              # NOTE: reasoning models currently do not allow for `any`
+             # NOTE: react agents should always have auto on, since the presence/absence of tool calls controls chaining
              tool_choice = {"type": "auto", "disable_parallel_tool_use": True}
              tools_for_request = [OpenAITool(function=f) for f in tools]
          elif force_tool_call is not None:
@@ -231,11 +282,17 @@ class AnthropicClient(LLMClientBase):
              tools_for_request = [OpenAITool(function=f) for f in tools if f["name"] == force_tool_call]

              # need to have this setting to be able to put inner thoughts in kwargs
-             if not llm_config.put_inner_thoughts_in_kwargs:
-                 logger.warning(
-                     f"Force setting put_inner_thoughts_in_kwargs to True for Claude because there is a forced tool call: {force_tool_call}"
-                 )
-                 llm_config.put_inner_thoughts_in_kwargs = True
+             if not put_kwargs:
+                 if is_v1:
+                     # For v1 agents, native content is used and kwargs must remain disabled to avoid conflicts
+                     logger.warning(
+                         "Forced tool call requested but inner_thoughts_in_kwargs is disabled for v1 agent; proceeding without inner thoughts in kwargs."
+                     )
+                 else:
+                     logger.warning(
+                         f"Force enabling inner thoughts in kwargs for Claude due to forced tool call: {force_tool_call} (local override only)"
+                     )
+                     put_kwargs = True
          else:
              tool_choice = {"type": "any", "disable_parallel_tool_use": True}
              tools_for_request = [OpenAITool(function=f) for f in tools] if tools is not None else None
@@ -246,7 +303,7 @@ class AnthropicClient(LLMClientBase):

          # Add inner thoughts kwarg
          # TODO: Can probably make this more efficient
-         if tools_for_request and len(tools_for_request) > 0 and llm_config.put_inner_thoughts_in_kwargs:
+         if tools_for_request and len(tools_for_request) > 0 and put_kwargs:
              tools_with_inner_thoughts = add_inner_thoughts_to_functions(
                  functions=[t.function.model_dump() for t in tools_for_request],
                  inner_thoughts_key=INNER_THOUGHTS_KWARG,
@@ -269,7 +326,10 @@ class AnthropicClient(LLMClientBase):
          data["messages"] = PydanticMessage.to_anthropic_dicts_from_list(
              messages=messages[1:],
              inner_thoughts_xml_tag=inner_thoughts_xml_tag,
-             put_inner_thoughts_in_kwargs=bool(llm_config.put_inner_thoughts_in_kwargs),
+             put_inner_thoughts_in_kwargs=put_kwargs,
+             # if react, use native content + strip heartbeats
+             native_content=is_v1,
+             strip_request_heartbeat=is_v1,
          )

          # Ensure first message is user
@@ -279,15 +339,27 @@ class AnthropicClient(LLMClientBase):
          # Handle alternating messages
          data["messages"] = merge_tool_results_into_user_messages(data["messages"])

-         # Strip heartbeat pings if extended thinking
-         if llm_config.enable_reasoner:
-             data["messages"] = merge_heartbeats_into_tool_responses(data["messages"])
+         if agent_type == AgentType.letta_v1_agent:
+             # Drop heartbeats from the message payload
+             data["messages"] = drop_heartbeats(data["messages"])
+             # ... and drop the heartbeat param from the tool schemas
+             if "tools" in data:
+                 for tool in data["tools"]:
+                     tool["input_schema"]["properties"].pop(REQUEST_HEARTBEAT_PARAM, None)
+                     if "required" in tool["input_schema"] and REQUEST_HEARTBEAT_PARAM in tool["input_schema"]["required"]:
+                         # NOTE: required is not always present
+                         tool["input_schema"]["required"].remove(REQUEST_HEARTBEAT_PARAM)
+
+         else:
+             # Strip heartbeat pings if extended thinking
+             if llm_config.enable_reasoner:
+                 data["messages"] = merge_heartbeats_into_tool_responses(data["messages"])

          # Prefix fill
          # https://docs.anthropic.com/en/api/messages#body-messages
          # NOTE: cannot prefill with tools for opus:
          # Your API request included an `assistant` message in the final position, which would pre-fill the `assistant` response. When using tools with "claude-3-opus-20240229"
-         if prefix_fill and not llm_config.put_inner_thoughts_in_kwargs and "opus" not in data["model"]:
+         if prefix_fill and not put_kwargs and "opus" not in data["model"]:
              data["messages"].append(
                  # Start the thinking process for the assistant
                  {"role": "assistant", "content": f"<{inner_thoughts_xml_tag}>"},
@@ -716,6 +788,44 @@ def is_heartbeat(message: dict, is_ping: bool = False) -> bool:
      return False


+ def drop_heartbeats(messages: List[dict]):
+     cleaned_messages = []
+
+     # Loop through messages
+     # For messages with role 'user' and len(content) > 1,
+     # Check if content[0].type == 'tool_result'
+     # If so, iterate over content[1:] and while content.type == 'text' and is_heartbeat(content.text),
+     # merge into content[0].content
+
+     for message in messages:
+         if "role" in message and "content" in message and message["role"] == "user":
+             content_parts = message["content"]
+
+             if isinstance(content_parts, str):
+                 if is_heartbeat({"role": "user", "content": content_parts}):
+                     continue
+             elif isinstance(content_parts, list) and len(content_parts) == 1 and "text" in content_parts[0]:
+                 if is_heartbeat({"role": "user", "content": content_parts[0]["text"]}):
+                     continue  # skip
+             else:
+                 cleaned_parts = []
+                 # Drop all the heartbeat parts
+                 for content_part in content_parts:
+                     if "text" in content_part and is_heartbeat({"role": "user", "content": content_part["text"]}):
+                         continue  # skip
+                     else:
+                         cleaned_parts.append(content_part)
+
+                 if len(cleaned_parts) == 0:
+                     continue
+                 else:
+                     message["content"] = cleaned_parts
+
+         cleaned_messages.append(message)
+
+     return cleaned_messages
+
+
  def merge_heartbeats_into_tool_responses(messages: List[dict]):
      """For extended thinking mode, we don't want anything other than tool responses in-between assistant actions
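Note: drop_heartbeats removes heartbeat user messages entirely and heartbeat text parts from mixed-content messages. Sketch on hypothetical data (the exact heartbeat JSON matched by is_heartbeat is assumed):

    heartbeat = '{"type": "heartbeat", "reason": "request_heartbeat=true"}'  # assumed shape
    messages = [
        {"role": "user", "content": heartbeat},                       # whole message dropped
        {"role": "user", "content": [
            {"type": "tool_result", "tool_use_id": "t1", "content": "OK"},
            {"type": "text", "text": heartbeat},                      # this part dropped
        ]},
        {"role": "assistant", "content": "done"},                     # untouched
    ]
    cleaned = drop_heartbeats(messages)  # -> 2 messages; the second keeps only its tool_result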
letta/llm_api/bedrock_client.py
@@ -6,7 +6,7 @@ from aioboto3.session import Session
  from letta.llm_api.anthropic_client import AnthropicClient
  from letta.log import get_logger
  from letta.otel.tracing import trace_method
- from letta.schemas.enums import ProviderCategory
+ from letta.schemas.enums import AgentType, ProviderCategory
  from letta.schemas.llm_config import LLMConfig
  from letta.schemas.message import Message as PydanticMessage
  from letta.services.provider_manager import ProviderManager
@@ -65,12 +65,14 @@ class BedrockClient(AnthropicClient):
      @trace_method
      def build_request_data(
          self,
+         agent_type: AgentType,
          messages: List[PydanticMessage],
          llm_config: LLMConfig,
          tools: Optional[List[dict]] = None,
          force_tool_call: Optional[str] = None,
+         requires_subsequent_tool_call: bool = False,
      ) -> dict:
-         data = super().build_request_data(messages, llm_config, tools, force_tool_call)
+         data = super().build_request_data(agent_type, messages, llm_config, tools, force_tool_call, requires_subsequent_tool_call)
          # remove disallowed fields
          if "tool_choice" in data:
              del data["tool_choice"]["disable_parallel_tool_use"]
letta/llm_api/deepseek_client.py
@@ -10,6 +10,7 @@ from openai.types.chat.chat_completion_chunk import ChatCompletionChunk

  from letta.llm_api.openai_client import OpenAIClient
  from letta.otel.tracing import trace_method
+ from letta.schemas.enums import AgentType
  from letta.schemas.llm_config import LLMConfig
  from letta.schemas.message import Message as PydanticMessage
  from letta.schemas.openai.chat_completion_request import (
@@ -331,15 +332,17 @@ class DeepseekClient(OpenAIClient):
      @trace_method
      def build_request_data(
          self,
+         agent_type: AgentType,
          messages: List[PydanticMessage],
          llm_config: LLMConfig,
          tools: Optional[List[dict]] = None,
          force_tool_call: Optional[str] = None,
+         requires_subsequent_tool_call: bool = False,
      ) -> dict:
          # Override put_inner_thoughts_in_kwargs to False for DeepSeek
          llm_config.put_inner_thoughts_in_kwargs = False

-         data = super().build_request_data(messages, llm_config, tools, force_tool_call)
+         data = super().build_request_data(agent_type, messages, llm_config, tools, force_tool_call, requires_subsequent_tool_call)

          def add_functions_to_system_message(system_message: ChatMessage):
              system_message.content += f"<available functions> {''.join(json.dumps(f) for f in tools)} </available functions>"
letta/llm_api/google_vertex_client.py
@@ -1,6 +1,7 @@
+ import base64
  import json
  import uuid
- from typing import List, Optional
+ from typing import AsyncIterator, List, Optional

  from google import genai
  from google.genai import errors
@@ -34,6 +35,7 @@ from letta.local_llm.json_parser import clean_json_string_extra_backslash
  from letta.local_llm.utils import count_tokens
  from letta.log import get_logger
  from letta.otel.tracing import trace_method
+ from letta.schemas.agent import AgentType
  from letta.schemas.llm_config import LLMConfig
  from letta.schemas.message import Message as PydanticMessage
  from letta.schemas.openai.chat_completion_request import Tool
@@ -137,6 +139,15 @@ class GoogleVertexClient(LLMClientBase):
              raise RuntimeError("Failed to get response data after all retries")
          return response_data

+     @trace_method
+     async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncIterator[GenerateContentResponse]:
+         client = self._get_client()
+         return await client.aio.models.generate_content_stream(
+             model=llm_config.model,
+             contents=request_data["contents"],
+             config=request_data["config"],
+         )
+
      @staticmethod
      def add_dummy_model_messages(messages: List[dict]) -> List[dict]:
          """Google AI API requires all function call returns are immediately followed by a 'model' role message.
@@ -274,14 +285,19 @@
      @trace_method
      def build_request_data(
          self,
+         agent_type: AgentType,  # if react, use native content + strip heartbeats
          messages: List[PydanticMessage],
          llm_config: LLMConfig,
          tools: List[dict],
          force_tool_call: Optional[str] = None,
+         requires_subsequent_tool_call: bool = False,
      ) -> dict:
          """
          Constructs a request object in the expected data format for this client.
          """
+         # NOTE: forcing inner thoughts in kwargs off
+         if agent_type == AgentType.letta_v1_agent:
+             llm_config.put_inner_thoughts_in_kwargs = False

          if tools:
              tool_objs = [Tool(type="function", function=t) for t in tools]
@@ -293,7 +309,11 @@
              tool_names = []

          contents = self.add_dummy_model_messages(
-             PydanticMessage.to_google_dicts_from_list(messages),
+             PydanticMessage.to_google_dicts_from_list(
+                 messages,
+                 put_inner_thoughts_in_kwargs=False if agent_type == AgentType.letta_v1_agent else True,
+                 native_content=True if agent_type == AgentType.letta_v1_agent else False,
+             ),
          )

          request_data = {
@@ -312,16 +332,42 @@
              request_data["config"]["response_schema"] = self.get_function_call_response_schema(tools[0])
              del request_data["config"]["tools"]
          elif tools:
-             tool_config = ToolConfig(
-                 function_calling_config=FunctionCallingConfig(
-                     # ANY mode forces the model to predict only function calls
-                     mode=FunctionCallingConfigMode.ANY,
-                     # Provide the list of tools (though empty should also work, it seems not to)
-                     allowed_function_names=tool_names,
+             if agent_type == AgentType.letta_v1_agent:
+                 # don't require tools
+                 tool_call_mode = FunctionCallingConfigMode.AUTO
+                 tool_config = ToolConfig(
+                     function_calling_config=FunctionCallingConfig(
+                         mode=tool_call_mode,
+                     )
                  )
-             )
+             else:
+                 # require tools
+                 tool_call_mode = FunctionCallingConfigMode.ANY
+                 tool_config = ToolConfig(
+                     function_calling_config=FunctionCallingConfig(
+                         mode=tool_call_mode,
+                         # Provide the list of tools (though empty should also work, it seems not to)
+                         allowed_function_names=tool_names,
+                     )
+                 )
+
              request_data["config"]["tool_config"] = tool_config.model_dump()

+         # https://ai.google.dev/gemini-api/docs/thinking#set-budget
+         # 2.5 Pro
+         # - Default: dynamic thinking
+         # - Dynamic thinking that cannot be disabled
+         # - Range: -1 (for dynamic), or 128-32768
+         # 2.5 Flash
+         # - Default: dynamic thinking
+         # - Dynamic thinking that *can* be disabled
+         # - Range: -1, 0, or 0-24576
+         # 2.5 Flash Lite
+         # - Default: no thinking
+         # - Dynamic thinking that *can* be disabled
+         # - Range: -1, 0, or 512-24576
+         # TODO when using v3 agent loop, properly support the native thinking in Gemini
+
          # Add thinking_config for flash
          # If enable_reasoner is False, set thinking_budget to 0
          # Otherwise, use the value from max_reasoning_tokens
@@ -334,6 +380,7 @@ class GoogleVertexClient(LLMClientBase):
              )
              thinking_config = ThinkingConfig(
                  thinking_budget=(thinking_budget),
+                 include_thoughts=(thinking_budget > 1),
              )
              request_data["config"]["thinking_config"] = thinking_config.model_dump()

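Note: the budget rules in the comment block above reduce to the mapping already used below; a minimal sketch (helper name hypothetical):

    def resolve_thinking_budget(enable_reasoner: bool, max_reasoning_tokens: int) -> int:
        # 0 disables thinking where the model allows it (2.5 Flash / Flash Lite);
        # -1 would request dynamic thinking; positive values set an explicit cap
        return max_reasoning_tokens if enable_reasoner else 0

    budget = resolve_thinking_budget(enable_reasoner=True, max_reasoning_tokens=1024)
    thinking_config = {"thinking_budget": budget, "include_thoughts": budget > 1}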
@@ -395,13 +442,15 @@
          # NOTE(Apr 9, 2025): there's a very strange bug on 2.5 where the response has a part with broken text
          # {'candidates': [{'content': {'parts': [{'functionCall': {'name': 'send_message', 'args': {'request_heartbeat': False, 'message': 'Hello! How can I make your day better?', 'inner_thoughts': 'User has initiated contact. Sending a greeting.'}}}], 'role': 'model'}, 'finishReason': 'STOP', 'avgLogprobs': -0.25891534213362066}], 'usageMetadata': {'promptTokenCount': 2493, 'candidatesTokenCount': 29, 'totalTokenCount': 2522, 'promptTokensDetails': [{'modality': 'TEXT', 'tokenCount': 2493}], 'candidatesTokensDetails': [{'modality': 'TEXT', 'tokenCount': 29}]}, 'modelVersion': 'gemini-1.5-pro-002'}
          # To patch this, if we have multiple parts we can take the last one
-         if len(parts) > 1:
+         if len(parts) > 1 and not llm_config.enable_reasoner:
              logger.warning(f"Unexpected multiple parts in response from Google AI: {parts}")
+             # only truncate if reasoning is off
              parts = [parts[-1]]

          # TODO support parts / multimodal
          # TODO support parallel tool calling natively
          # TODO Alternative here is to throw away everything else except for the first part
+         openai_response_message = None
          for response_message in parts:
              # Convert the actual message style to OpenAI style
              if response_message.function_call:
@@ -410,8 +459,10 @@
                  function_args = function_call.args
                  assert isinstance(function_args, dict), function_args

-                 # NOTE: this also involves stripping the inner monologue out of the function
+                 # TODO this is kind of funky - really, we should be passing 'native_content' as a kwarg to fork behavior
+                 inner_thoughts = response_message.text
                  if llm_config.put_inner_thoughts_in_kwargs:
+                     # NOTE: this also involves stripping the inner monologue out of the function
                      from letta.local_llm.constants import INNER_THOUGHTS_KWARG_VERTEX

                      assert INNER_THOUGHTS_KWARG_VERTEX in function_args, (
@@ -420,25 +471,44 @@
                      inner_thoughts = function_args.pop(INNER_THOUGHTS_KWARG_VERTEX)
                      assert inner_thoughts is not None, f"Expected non-null inner thoughts function arg:\n{function_call}"
                  else:
-                     inner_thoughts = None
+                     pass
+                     # inner_thoughts = None
+                     # inner_thoughts = response_message.text

                  # Google AI API doesn't generate tool call IDs
-                 openai_response_message = Message(
-                     role="assistant",  # NOTE: "model" -> "assistant"
-                     content=inner_thoughts,
-                     tool_calls=[
-                         ToolCall(
-                             id=get_tool_call_id(),
-                             type="function",
-                             function=FunctionCall(
-                                 name=function_name,
-                                 arguments=clean_json_string_extra_backslash(json_dumps(function_args)),
-                             ),
-                         )
-                     ],
+                 tool_call = ToolCall(
+                     id=get_tool_call_id(),
+                     type="function",
+                     function=FunctionCall(
+                         name=function_name,
+                         arguments=clean_json_string_extra_backslash(json_dumps(function_args)),
+                     ),
                  )

+                 if openai_response_message is None:
+                     openai_response_message = Message(
+                         role="assistant",  # NOTE: "model" -> "assistant"
+                         content=inner_thoughts,
+                         tool_calls=[tool_call],
+                     )
+                 else:
+                     openai_response_message.content = inner_thoughts
+                     if openai_response_message.tool_calls is None:
+                         openai_response_message.tool_calls = []
+                     openai_response_message.tool_calls.append(tool_call)
+                 if response_message.thought_signature:
+                     thought_signature = base64.b64encode(response_message.thought_signature).decode("utf-8")
+                     openai_response_message.reasoning_content_signature = thought_signature
+
              else:
+                 if response_message.thought:
+                     if openai_response_message is None:
+                         openai_response_message = Message(
+                             role="assistant",  # NOTE: "model" -> "assistant"
+                             reasoning_content=response_message.text,
+                         )
+                     else:
+                         openai_response_message.reasoning_content = response_message.text
                  try:
                      # Structured output tool call
                      function_call = json_loads(response_message.text)
@@ -459,20 +529,25 @@
                          inner_thoughts = None

                      # Google AI API doesn't generate tool call IDs
-                     openai_response_message = Message(
-                         role="assistant",  # NOTE: "model" -> "assistant"
-                         content=inner_thoughts,
-                         tool_calls=[
-                             ToolCall(
-                                 id=get_tool_call_id(),
-                                 type="function",
-                                 function=FunctionCall(
-                                     name=function_name,
-                                     arguments=clean_json_string_extra_backslash(json_dumps(function_args)),
-                                 ),
-                             )
-                         ],
+                     tool_call = ToolCall(
+                         id=get_tool_call_id(),
+                         type="function",
+                         function=FunctionCall(
+                             name=function_name,
+                             arguments=clean_json_string_extra_backslash(json_dumps(function_args)),
+                         ),
                      )
+                     if openai_response_message is None:
+                         openai_response_message = Message(
+                             role="assistant",  # NOTE: "model" -> "assistant"
+                             content=inner_thoughts,
+                             tool_calls=[tool_call],
+                         )
+                     else:
+                         openai_response_message.content = inner_thoughts
+                         if openai_response_message.tool_calls is None:
+                             openai_response_message.tool_calls = []
+                         openai_response_message.tool_calls.append(tool_call)

                  except json.decoder.JSONDecodeError:
@@ -481,10 +556,16 @@
                          inner_thoughts = response_message.text

                          # Google AI API doesn't generate tool call IDs
-                         openai_response_message = Message(
-                             role="assistant",  # NOTE: "model" -> "assistant"
-                             content=inner_thoughts,
-                         )
+                         if openai_response_message is None:
+                             openai_response_message = Message(
+                                 role="assistant",  # NOTE: "model" -> "assistant"
+                                 content=inner_thoughts,
+                             )
+                         else:
+                             openai_response_message.content = inner_thoughts
+                         if response_message.thought_signature:
+                             thought_signature = base64.b64encode(response_message.thought_signature).decode("utf-8")
+                             openai_response_message.reasoning_content_signature = thought_signature

          # Google AI API uses different finish reason strings than OpenAI
          # OpenAI: 'stop', 'length', 'function_call', 'content_filter', null
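Note: the three hunks above share one accumulation pattern: openai_response_message starts as None and each part either creates it or merges into it. Schematic only (to_tool_call is a hypothetical stand-in for the ToolCall construction shown above):

    openai_response_message = None
    for part in parts:
        if part.function_call:
            tool_call = to_tool_call(part.function_call)
            if openai_response_message is None:
                openai_response_message = Message(role="assistant", tool_calls=[tool_call])
            else:
                if openai_response_message.tool_calls is None:
                    openai_response_message.tool_calls = []
                openai_response_message.tool_calls.append(tool_call)
        elif part.thought:
            if openai_response_message is None:
                openai_response_message = Message(role="assistant", reasoning_content=part.text)
            else:
                openai_response_message.reasoning_content = part.text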
letta/llm_api/groq_client.py
@@ -8,6 +8,7 @@ from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
  from letta.llm_api.openai_client import OpenAIClient
  from letta.otel.tracing import trace_method
  from letta.schemas.embedding_config import EmbeddingConfig
+ from letta.schemas.enums import AgentType
  from letta.schemas.llm_config import LLMConfig
  from letta.schemas.message import Message as PydanticMessage
  from letta.settings import model_settings
@@ -23,12 +24,14 @@ class GroqClient(OpenAIClient):
      @trace_method
      def build_request_data(
          self,
+         agent_type: AgentType,
          messages: List[PydanticMessage],
          llm_config: LLMConfig,
          tools: Optional[List[dict]] = None,
          force_tool_call: Optional[str] = None,
+         requires_subsequent_tool_call: bool = False,
      ) -> dict:
-         data = super().build_request_data(messages, llm_config, tools, force_tool_call)
+         data = super().build_request_data(agent_type, messages, llm_config, tools, force_tool_call, requires_subsequent_tool_call)

          # Groq validation - these fields are not supported and will cause 400 errors
          # https://console.groq.com/docs/openai
letta/llm_api/llm_api_tools.py
@@ -1,4 +1,5 @@
  import json
+ import os
  import random
  import time
  from typing import List, Optional, Union
@@ -174,11 +175,17 @@

          actor = UserManager().get_user_or_default(user_id=user_id)
          api_key = ProviderManager().get_override_key(llm_config.provider_name, actor=actor)
-     elif model_settings.openai_api_key is None:
-         # the openai python client requires a dummy API key
-         api_key = "DUMMY_API_KEY"
      else:
-         api_key = model_settings.openai_api_key
+         # Prefer OpenRouter key when targeting OpenRouter
+         is_openrouter = (llm_config.model_endpoint and "openrouter.ai" in llm_config.model_endpoint) or (
+             llm_config.provider_name == "openrouter"
+         )
+         if is_openrouter:
+             api_key = model_settings.openrouter_api_key or os.environ.get("OPENROUTER_API_KEY")
+         if not is_openrouter or not api_key:
+             api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
+         # the openai python client requires some API key string
+         api_key = api_key or "DUMMY_API_KEY"

      if function_call is None and functions is not None and len(functions) > 0:
          # force function calling for reliability, see https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice
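Note: the key-resolution order above, as standalone logic (field names mirror the diff; the model_settings attributes are taken from it):

    import os

    def resolve_api_key(model_endpoint, provider_name, openrouter_key, openai_key):
        is_openrouter = (model_endpoint and "openrouter.ai" in model_endpoint) or provider_name == "openrouter"
        api_key = None
        if is_openrouter:
            api_key = openrouter_key or os.environ.get("OPENROUTER_API_KEY")
        if not is_openrouter or not api_key:
            api_key = openai_key or os.environ.get("OPENAI_API_KEY")
        return api_key or "DUMMY_API_KEY"  # the openai client requires some key string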