PyPI - letta-nightly - Versions diffs - 0.6.23.dev20250211104055__tar.gz → 0.6.24.dev20250212072610__tar.gz - Mend

letta-nightly 0.6.23.dev20250211104055__tar.gz → 0.6.24.dev20250212072610__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of letta-nightly might be problematic. Click here for more details.

Files changed (246) hide show

{letta_nightly-0.6.23.dev20250211104055 → letta_nightly-0.6.24.dev20250212072610}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: letta-nightly
-Version: 0.6.23.dev20250211104055
+Version: 0.6.24.dev20250212072610
 Summary: Create LLM agents with long-term memory and custom tools
 License: Apache License
 Author: Letta Team
@@ -34,6 +34,7 @@ Requires-Dist: docker (>=7.1.0,<8.0.0) ; extra == "external-tools" or extra == "
 Requires-Dist: docstring-parser (>=0.16,<0.17)
 Requires-Dist: docx2txt (>=0.8,<0.9)
 Requires-Dist: e2b-code-interpreter (>=1.0.3,<2.0.0) ; extra == "cloud-tool-sandbox"
+Requires-Dist: faker (>=36.1.0,<37.0.0)
 Requires-Dist: fastapi (>=0.115.6,<0.116.0) ; extra == "server" or extra == "all"
 Requires-Dist: grpcio (>=1.68.1,<2.0.0)
 Requires-Dist: grpcio-tools (>=1.68.1,<2.0.0)

{letta_nightly-0.6.23.dev20250211104055 → letta_nightly-0.6.24.dev20250212072610}/letta/__init__.py RENAMED Viewed

@@ -1,5 +1,4 @@
-__version__ = "0.6.23"
+__version__ = "0.6.24"
 # import clients
 from letta.client.client import LocalClient, RESTClient, create_client

{letta_nightly-0.6.23.dev20250211104055 → letta_nightly-0.6.24.dev20250212072610}/letta/agent.py RENAMED Viewed

@@ -260,6 +260,7 @@ class Agent(BaseAgent):
         error_msg: str,
         tool_call_id: str,
         function_name: str,
+        function_args: dict,
         function_response: str,
         messages: List[Message],
         include_function_failed_message: bool = False,
@@ -394,6 +395,7 @@ class Agent(BaseAgent):
         messages = []  # append these to the history when done
         function_name = None
+        function_args = {}
         # Step 2: check if LLM wanted to call a function
         if response_message.function_call or (response_message.tool_calls is not None and len(response_message.tool_calls) > 0):
@@ -445,8 +447,6 @@ class Agent(BaseAgent):
             function_call = (
                 response_message.function_call if response_message.function_call is not None else response_message.tool_calls[0].function
             )
-            # Get the name of the function
             function_name = function_call.name
             self.logger.info(f"Request to call function {function_name} with tool_call_id: {tool_call_id}")
@@ -459,7 +459,9 @@ class Agent(BaseAgent):
             if not target_letta_tool:
                 error_msg = f"No function named {function_name}"
                 function_response = "None"  # more like "never ran?"
-                messages = self._handle_function_error_response(error_msg, tool_call_id, function_name, function_response, messages)
+                messages = self._handle_function_error_response(
+                    error_msg, tool_call_id, function_name, function_args, function_response, messages
+                )
                 return messages, False, True  # force a heartbeat to allow agent to handle error
             # Failure case 2: function name is OK, but function args are bad JSON
@@ -469,7 +471,9 @@ class Agent(BaseAgent):
             except Exception:
                 error_msg = f"Error parsing JSON for function '{function_name}' arguments: {function_call.arguments}"
                 function_response = "None"  # more like "never ran?"
-                messages = self._handle_function_error_response(error_msg, tool_call_id, function_name, function_response, messages)
+                messages = self._handle_function_error_response(
+                    error_msg, tool_call_id, function_name, function_args, function_response, messages
+                )
                 return messages, False, True  # force a heartbeat to allow agent to handle error
             # Check if inner thoughts is in the function call arguments (possible apparently if you are using Azure)
@@ -506,7 +510,7 @@ class Agent(BaseAgent):
                 if sandbox_run_result and sandbox_run_result.status == "error":
                     messages = self._handle_function_error_response(
-                        function_response, tool_call_id, function_name, function_response, messages
+                        function_response, tool_call_id, function_name, function_args, function_response, messages
                     )
                     return messages, False, True  # force a heartbeat to allow agent to handle error
@@ -535,7 +539,7 @@ class Agent(BaseAgent):
                 error_msg_user = f"{error_msg}\n{traceback.format_exc()}"
                 self.logger.error(error_msg_user)
                 messages = self._handle_function_error_response(
-                    error_msg, tool_call_id, function_name, function_response, messages, include_function_failed_message=True
+                    error_msg, tool_call_id, function_name, function_args, function_response, messages, include_function_failed_message=True
                 )
                 return messages, False, True  # force a heartbeat to allow agent to handle error
@@ -543,7 +547,7 @@ class Agent(BaseAgent):
             if function_response_string.startswith(ERROR_MESSAGE_PREFIX):
                 error_msg = function_response_string
                 messages = self._handle_function_error_response(
-                    error_msg, tool_call_id, function_name, function_response, messages, include_function_failed_message=True
+                    error_msg, tool_call_id, function_name, function_args, function_response, messages, include_function_failed_message=True
                 )
                 return messages, False, True  # force a heartbeat to allow agent to handle error

{letta_nightly-0.6.23.dev20250211104055 → letta_nightly-0.6.24.dev20250212072610}/letta/cli/cli.py RENAMED Viewed

@@ -15,7 +15,6 @@ from letta.local_llm.constants import ASSISTANT_MESSAGE_CLI_SYMBOL
 from letta.log import get_logger
 from letta.schemas.enums import OptionState
 from letta.schemas.memory import ChatMemory, Memory
-from letta.server.server import logger as server_logger
 # from letta.interface import CLIInterface as interface  # for printing to terminal
 from letta.streaming_interface import StreamingRefreshCLIInterface as interface  # for printing to terminal
@@ -119,6 +118,8 @@ def run(
     utils.DEBUG = debug
     # TODO: add logging command line options for runtime log level
+    from letta.server.server import logger as server_logger
     if debug:
         logger.setLevel(logging.DEBUG)
         server_logger.setLevel(logging.DEBUG)
@@ -360,4 +361,4 @@ def delete_agent(
 def version() -> str:
     import letta
-    return letta.__version__
+    print(letta.__version__)

letta_nightly-0.6.24.dev20250212072610/letta/client/streaming.py ADDED Viewed

@@ -0,0 +1,94 @@
+import json
+from typing import Generator, Union, get_args
+import httpx
+from httpx_sse import SSEError, connect_sse
+from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
+from letta.constants import OPENAI_CONTEXT_WINDOW_ERROR_SUBSTRING
+from letta.errors import LLMError
+from letta.log import get_logger
+from letta.schemas.enums import MessageStreamStatus
+from letta.schemas.letta_message import AssistantMessage, ReasoningMessage, ToolCallMessage, ToolReturnMessage
+from letta.schemas.letta_response import LettaStreamingResponse
+from letta.schemas.usage import LettaUsageStatistics
+logger = get_logger(__name__)
+def _sse_post(url: str, data: dict, headers: dict) -> Generator[Union[LettaStreamingResponse, ChatCompletionChunk], None, None]:
+    """
+    Sends an SSE POST request and yields parsed response chunks.
+    """
+    # TODO: Please note his is a very generous timeout for e2b reasons
+    with httpx.Client(timeout=httpx.Timeout(5 * 60.0, read=5 * 60.0)) as client:
+        with connect_sse(client, method="POST", url=url, json=data, headers=headers) as event_source:
+            # Check for immediate HTTP errors before processing the SSE stream
+            if not event_source.response.is_success:
+                response_bytes = event_source.response.read()
+                logger.warning(f"SSE request error: {vars(event_source.response)}")
+                logger.warning(response_bytes.decode("utf-8"))
+                try:
+                    response_dict = json.loads(response_bytes.decode("utf-8"))
+                    error_message = response_dict.get("error", {}).get("message", "")
+                    if OPENAI_CONTEXT_WINDOW_ERROR_SUBSTRING in error_message:
+                        logger.error(error_message)
+                        raise LLMError(error_message)
+                except LLMError:
+                    raise
+                except Exception:
+                    logger.error("Failed to parse SSE message, raising HTTP error")
+                    event_source.response.raise_for_status()
+            try:
+                for sse in event_source.iter_sse():
+                    if sse.data in {status.value for status in MessageStreamStatus}:
+                        yield MessageStreamStatus(sse.data)
+                        if sse.data == MessageStreamStatus.done.value:
+                            # We received the [DONE], so stop reading the stream.
+                            break
+                    else:
+                        chunk_data = json.loads(sse.data)
+                        if "reasoning" in chunk_data:
+                            yield ReasoningMessage(**chunk_data)
+                        elif chunk_data.get("message_type") == "assistant_message":
+                            yield AssistantMessage(**chunk_data)
+                        elif "tool_call" in chunk_data:
+                            yield ToolCallMessage(**chunk_data)
+                        elif "tool_return" in chunk_data:
+                            yield ToolReturnMessage(**chunk_data)
+                        elif "step_count" in chunk_data:
+                            yield LettaUsageStatistics(**chunk_data)
+                        elif chunk_data.get("object") == get_args(ChatCompletionChunk.__annotations__["object"])[0]:
+                            yield ChatCompletionChunk(**chunk_data)
+                        else:
+                            raise ValueError(f"Unknown message type in chunk_data: {chunk_data}")
+            except SSEError as e:
+                logger.error(f"SSE stream error: {e}")
+                if "application/json" in str(e):
+                    response = client.post(url=url, json=data, headers=headers)
+                    if response.headers.get("Content-Type", "").startswith("application/json"):
+                        error_details = response.json()
+                        logger.error(f"POST Error: {error_details}")
+                    else:
+                        logger.error("Failed to retrieve JSON error message via retry.")
+                raise e
+            except Exception as e:
+                logger.error(f"Unexpected exception: {e}")
+                if event_source.response.request:
+                    logger.error(f"HTTP Request: {vars(event_source.response.request)}")
+                if event_source.response:
+                    logger.error(f"HTTP Status: {event_source.response.status_code}")
+                    logger.error(f"HTTP Headers: {event_source.response.headers}")
+                raise e

{letta_nightly-0.6.23.dev20250211104055 → letta_nightly-0.6.24.dev20250212072610}/letta/constants.py RENAMED Viewed

@@ -51,9 +51,6 @@ BASE_TOOLS = ["send_message", "conversation_search", "archival_memory_insert", "
 BASE_MEMORY_TOOLS = ["core_memory_append", "core_memory_replace"]
 # Multi agent tools
 MULTI_AGENT_TOOLS = ["send_message_to_agent_and_wait_for_reply", "send_message_to_agents_matching_all_tags", "send_message_to_agent_async"]
-MULTI_AGENT_SEND_MESSAGE_MAX_RETRIES = 3
-MULTI_AGENT_SEND_MESSAGE_TIMEOUT = 20 * 60
-MULTI_AGENT_CONCURRENT_SENDS = 15
 # The name of the tool used to send message to the user
 # May not be relevant in cases where the agent has multiple ways to message to user (send_imessage, send_discord_mesasge, ...)

{letta_nightly-0.6.23.dev20250211104055 → letta_nightly-0.6.24.dev20250212072610}/letta/embeddings.py RENAMED Viewed

@@ -167,6 +167,27 @@ class OllamaEmbeddings:
         return response_json["embedding"]
+class GoogleEmbeddings:
+    def __init__(self, api_key: str, model: str, base_url: str):
+        self.api_key = api_key
+        self.model = model
+        self.base_url = base_url  # Expected to be "https://generativelanguage.googleapis.com"
+    def get_text_embedding(self, text: str):
+        import httpx
+        headers = {"Content-Type": "application/json"}
+        # Build the URL based on the provided base_url, model, and API key.
+        url = f"{self.base_url}/v1beta/models/{self.model}:embedContent?key={self.api_key}"
+        payload = {"model": self.model, "content": {"parts": [{"text": text}]}}
+        with httpx.Client() as client:
+            response = client.post(url, headers=headers, json=payload)
+        # Raise an error for non-success HTTP status codes.
+        response.raise_for_status()
+        response_json = response.json()
+        return response_json["embedding"]["values"]
 def query_embedding(embedding_model, query_text: str):
     """Generate padded embedding for querying database"""
     query_vec = embedding_model.get_text_embedding(query_text)
@@ -237,5 +258,14 @@ def embedding_model(config: EmbeddingConfig, user_id: Optional[uuid.UUID] = None
         )
         return model
+    elif endpoint_type == "google_ai":
+        assert all([model_settings.gemini_api_key is not None, model_settings.gemini_base_url is not None])
+        model = GoogleEmbeddings(
+            model=config.embedding_model,
+            api_key=model_settings.gemini_api_key,
+            base_url=model_settings.gemini_base_url,
+        )
+        return model
     else:
         raise ValueError(f"Unknown endpoint type {endpoint_type}")

{letta_nightly-0.6.23.dev20250211104055 → letta_nightly-0.6.24.dev20250212072610}/letta/llm_api/anthropic.py RENAMED Viewed

@@ -19,6 +19,8 @@ from anthropic.types.beta import (
 from letta.errors import BedrockError, BedrockPermissionError
 from letta.llm_api.aws_bedrock import get_bedrock_client
+from letta.llm_api.helpers import add_inner_thoughts_to_functions
+from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION
 from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
 from letta.schemas.message import Message as _Message
 from letta.schemas.message import MessageRole as _MessageRole
@@ -513,9 +515,23 @@ def convert_anthropic_stream_event_to_chatcompletion(
 def _prepare_anthropic_request(
     data: ChatCompletionRequest,
     inner_thoughts_xml_tag: Optional[str] = "thinking",
+    # if true, prefix fill the generation with the thinking tag
+    prefix_fill: bool = True,
+    # if true, put COT inside the tool calls instead of inside the content
+    put_inner_thoughts_in_kwargs: bool = False,
 ) -> dict:
     """Prepare the request data for Anthropic API format."""
-    # convert the tools
+    # if needed, put inner thoughts as a kwarg for all tools
+    if data.tools and put_inner_thoughts_in_kwargs:
+        functions = add_inner_thoughts_to_functions(
+            functions=[t.function.model_dump() for t in data.tools],
+            inner_thoughts_key=INNER_THOUGHTS_KWARG,
+            inner_thoughts_description=INNER_THOUGHTS_KWARG_DESCRIPTION,
+        )
+        data.tools = [Tool(function=f) for f in functions]
+    # convert the tools to Anthropic's payload format
     anthropic_tools = None if data.tools is None else convert_tools_to_anthropic_format(data.tools)
     # pydantic -> dict
@@ -529,11 +545,25 @@ def _prepare_anthropic_request(
         data.pop("tools")
         data.pop("tool_choice", None)
     elif anthropic_tools is not None:
+        # TODO eventually enable parallel tool use
         data["tools"] = anthropic_tools
-        if len(anthropic_tools) == 1:
+        # tool_choice_type other than "auto" only plays nice if thinking goes inside the tool calls
+        if put_inner_thoughts_in_kwargs:
+            if len(anthropic_tools) == 1:
+                data["tool_choice"] = {
+                    "type": "tool",
+                    "name": anthropic_tools[0]["name"],
+                    "disable_parallel_tool_use": True,
+                }
+            else:
+                data["tool_choice"] = {
+                    "type": "any",
+                    "disable_parallel_tool_use": True,
+                }
+        else:
             data["tool_choice"] = {
-                "type": "tool",
-                "name": anthropic_tools[0]["name"],
+                "type": "auto",
                 "disable_parallel_tool_use": True,
             }
@@ -548,8 +578,21 @@ def _prepare_anthropic_request(
             message["content"] = None
     # Convert to Anthropic format
-    msg_objs = [_Message.dict_to_message(user_id=None, agent_id=None, openai_message_dict=m) for m in data["messages"]]
-    data["messages"] = [m.to_anthropic_dict(inner_thoughts_xml_tag=inner_thoughts_xml_tag) for m in msg_objs]
+    msg_objs = [
+        _Message.dict_to_message(
+            user_id=None,
+            agent_id=None,
+            openai_message_dict=m,
+        )
+        for m in data["messages"]
+    ]
+    data["messages"] = [
+        m.to_anthropic_dict(
+            inner_thoughts_xml_tag=inner_thoughts_xml_tag,
+            put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
+        )
+        for m in msg_objs
+    ]
     # Ensure first message is user
     if data["messages"][0]["role"] != "user":
@@ -558,6 +601,16 @@ def _prepare_anthropic_request(
     # Handle alternating messages
     data["messages"] = merge_tool_results_into_user_messages(data["messages"])
+    # Handle prefix fill (not compatible with inner-thouguhts-in-kwargs)
+    # https://docs.anthropic.com/en/api/messages#body-messages
+    # NOTE: cannot prefill with tools for opus:
+    # Your API request included an `assistant` message in the final position, which would pre-fill the `assistant` response. When using tools with "claude-3-opus-20240229"
+    if prefix_fill and not put_inner_thoughts_in_kwargs and "opus" not in data["model"]:
+        data["messages"].append(
+            # Start the thinking process for the assistant
+            {"role": "assistant", "content": f"<{inner_thoughts_xml_tag}>"},
+        )
     # Validate max_tokens
     assert "max_tokens" in data, data
@@ -571,6 +624,7 @@ def _prepare_anthropic_request(
 def anthropic_chat_completions_request(
     data: ChatCompletionRequest,
     inner_thoughts_xml_tag: Optional[str] = "thinking",
+    put_inner_thoughts_in_kwargs: bool = False,
     betas: List[str] = ["tools-2024-04-04"],
 ) -> ChatCompletionResponse:
     """https://docs.anthropic.com/claude/docs/tool-use"""
@@ -580,7 +634,11 @@ def anthropic_chat_completions_request(
         anthropic_client = anthropic.Anthropic(api_key=anthropic_override_key)
     elif model_settings.anthropic_api_key:
         anthropic_client = anthropic.Anthropic()
-    data = _prepare_anthropic_request(data, inner_thoughts_xml_tag)
+    data = _prepare_anthropic_request(
+        data=data,
+        inner_thoughts_xml_tag=inner_thoughts_xml_tag,
+        put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
+    )
     response = anthropic_client.beta.messages.create(
         **data,
         betas=betas,
@@ -611,6 +669,7 @@ def anthropic_bedrock_chat_completions_request(
 def anthropic_chat_completions_request_stream(
     data: ChatCompletionRequest,
     inner_thoughts_xml_tag: Optional[str] = "thinking",
+    put_inner_thoughts_in_kwargs: bool = False,
     betas: List[str] = ["tools-2024-04-04"],
 ) -> Generator[ChatCompletionChunkResponse, None, None]:
     """Stream chat completions from Anthropic API.
@@ -618,7 +677,11 @@ def anthropic_chat_completions_request_stream(
     Similar to OpenAI's streaming, but using Anthropic's native streaming support.
     See: https://docs.anthropic.com/claude/reference/messages-streaming
     """
-    data = _prepare_anthropic_request(data, inner_thoughts_xml_tag)
+    data = _prepare_anthropic_request(
+        data=data,
+        inner_thoughts_xml_tag=inner_thoughts_xml_tag,
+        put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
+    )
     anthropic_override_key = ProviderManager().get_anthropic_override_key()
     if anthropic_override_key:
@@ -666,6 +729,7 @@ def anthropic_chat_completions_process_stream(
     chat_completion_request: ChatCompletionRequest,
     stream_interface: Optional[Union[AgentChunkStreamingInterface, AgentRefreshStreamingInterface]] = None,
     inner_thoughts_xml_tag: Optional[str] = "thinking",
+    put_inner_thoughts_in_kwargs: bool = False,
     create_message_id: bool = True,
     create_message_datetime: bool = True,
     betas: List[str] = ["tools-2024-04-04"],
@@ -743,6 +807,7 @@ def anthropic_chat_completions_process_stream(
             anthropic_chat_completions_request_stream(
                 data=chat_completion_request,
                 inner_thoughts_xml_tag=inner_thoughts_xml_tag,
+                put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
                 betas=betas,
             )
         ):

{letta_nightly-0.6.23.dev20250211104055 → letta_nightly-0.6.24.dev20250212072610}/letta/llm_api/llm_api_tools.py RENAMED Viewed

@@ -111,7 +111,6 @@ def create(
     # streaming?
     stream: bool = False,
     stream_interface: Optional[Union[AgentRefreshStreamingInterface, AgentChunkStreamingInterface]] = None,
-    max_tokens: Optional[int] = None,
     model_settings: Optional[dict] = None,  # TODO: eventually pass from server
 ) -> ChatCompletionResponse:
     """Return response to chat completion with backoff"""
@@ -157,7 +156,7 @@ def create(
             else:
                 function_call = "required"
-        data = build_openai_chat_completions_request(llm_config, messages, user_id, functions, function_call, use_tool_naming, max_tokens)
+        data = build_openai_chat_completions_request(llm_config, messages, user_id, functions, function_call, use_tool_naming)
         if stream:  # Client requested token streaming
             data.stream = True
             assert isinstance(stream_interface, AgentChunkStreamingInterface) or isinstance(
@@ -212,7 +211,7 @@ def create(
         # For Azure, this model_endpoint is required to be configured via env variable, so users don't need to provide it in the LLM config
         llm_config.model_endpoint = model_settings.azure_base_url
         chat_completion_request = build_openai_chat_completions_request(
-            llm_config, messages, user_id, functions, function_call, use_tool_naming, max_tokens
+            llm_config, messages, user_id, functions, function_call, use_tool_naming
         )
         response = azure_openai_chat_completions_request(
@@ -248,7 +247,7 @@ def create(
             data=dict(
                 contents=[m.to_google_ai_dict() for m in messages],
                 tools=tools,
-                generation_config={"temperature": llm_config.temperature},
+                generation_config={"temperature": llm_config.temperature, "max_output_tokens": llm_config.max_tokens},
             ),
             inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,
         )
@@ -268,7 +267,7 @@ def create(
             messages=[cast_message_to_subtype(m.to_openai_dict()) for m in messages],
             tools=([{"type": "function", "function": f} for f in functions] if functions else None),
             tool_choice=tool_call,
-            max_tokens=1024,  # TODO make dynamic
+            max_tokens=llm_config.max_tokens,  # Note: max_tokens is required for Anthropic API
             temperature=llm_config.temperature,
             stream=stream,
         )
@@ -279,14 +278,21 @@ def create(
             response = anthropic_chat_completions_process_stream(
                 chat_completion_request=chat_completion_request,
+                put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,
                 stream_interface=stream_interface,
             )
-            return response
-        # Client did not request token streaming (expect a blocking backend response)
-        return anthropic_chat_completions_request(
-            data=chat_completion_request,
-        )
+        else:
+            # Client did not request token streaming (expect a blocking backend response)
+            response = anthropic_chat_completions_request(
+                data=chat_completion_request,
+                put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,
+            )
+        if llm_config.put_inner_thoughts_in_kwargs:
+            response = unpack_all_inner_thoughts_from_kwargs(response=response, inner_thoughts_key=INNER_THOUGHTS_KWARG)
+        return response
     # elif llm_config.model_endpoint_type == "cohere":
     #     if stream:
@@ -416,7 +422,7 @@ def create(
                 tool_choice=tool_call,
                 # user=str(user_id),
                 # NOTE: max_tokens is required for Anthropic API
-                max_tokens=1024,  # TODO make dynamic
+                max_tokens=llm_config.max_tokens,
             ),
         )

{letta_nightly-0.6.23.dev20250211104055 → letta_nightly-0.6.24.dev20250212072610}/letta/llm_api/openai.py RENAMED Viewed

@@ -7,6 +7,7 @@ from openai import OpenAI
 from letta.llm_api.helpers import add_inner_thoughts_to_functions, convert_to_structured_output, make_post_request
 from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION, INNER_THOUGHTS_KWARG_DESCRIPTION_GO_FIRST
 from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
+from letta.log import get_logger
 from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message as _Message
 from letta.schemas.message import MessageRole as _MessageRole
@@ -26,7 +27,7 @@ from letta.schemas.openai.embedding_response import EmbeddingResponse
 from letta.streaming_interface import AgentChunkStreamingInterface, AgentRefreshStreamingInterface
 from letta.utils import get_tool_call_id, smart_urljoin
-OPENAI_SSE_DONE = "[DONE]"
+logger = get_logger(__name__)
 def openai_get_model_list(
@@ -93,7 +94,6 @@ def build_openai_chat_completions_request(
     functions: Optional[list],
     function_call: Optional[str],
     use_tool_naming: bool,
-    max_tokens: Optional[int],
 ) -> ChatCompletionRequest:
     if functions and llm_config.put_inner_thoughts_in_kwargs:
         # Special case for LM Studio backend since it needs extra guidance to force out the thoughts first
@@ -130,7 +130,7 @@ def build_openai_chat_completions_request(
             tools=[Tool(type="function", function=f) for f in functions] if functions else None,
             tool_choice=tool_choice,
             user=str(user_id),
-            max_completion_tokens=max_tokens,
+            max_completion_tokens=llm_config.max_tokens,
             temperature=llm_config.temperature,
         )
     else:
@@ -140,7 +140,7 @@ def build_openai_chat_completions_request(
             functions=functions,
             function_call=function_call,
             user=str(user_id),
-            max_completion_tokens=max_tokens,
+            max_completion_tokens=llm_config.max_tokens,
             temperature=llm_config.temperature,
         )
         # https://platform.openai.com/docs/guides/text-generation/json-mode
@@ -354,9 +354,10 @@ def openai_chat_completions_process_stream(
     except Exception as e:
         if stream_interface:
             stream_interface.stream_end()
-        print(f"Parsing ChatCompletion stream failed with error:\n{str(e)}")
+        logger.error(f"Parsing ChatCompletion stream failed with error:\n{str(e)}")
         raise e
     finally:
+        logger.info(f"Finally ending streaming interface.")
         if stream_interface:
             stream_interface.stream_end()

{letta_nightly-0.6.23.dev20250211104055 → letta_nightly-0.6.24.dev20250212072610}/letta/schemas/llm_config.py RENAMED Viewed

@@ -15,6 +15,7 @@ class LLMConfig(BaseModel):
         context_window (int): The context window size for the model.
         put_inner_thoughts_in_kwargs (bool): Puts `inner_thoughts` as a kwarg in the function call if this is set to True. This helps with function calling performance and also the generation of inner thoughts.
         temperature (float): The temperature to use when generating text with the model. A higher temperature will result in more random text.
+        max_tokens (int): The maximum number of tokens to generate.
     """
     # TODO: 🤮 don't default to a vendor! bug city!
@@ -51,6 +52,10 @@ class LLMConfig(BaseModel):
         0.7,
         description="The temperature to use when generating text with the model. A higher temperature will result in more random text.",
     )
+    max_tokens: Optional[int] = Field(
+        1024,
+        description="The maximum number of tokens to generate. If not set, the model will use its default value.",
+    )
     # FIXME hack to silence pydantic protected namespace warning
     model_config = ConfigDict(protected_namespaces=())

{letta_nightly-0.6.23.dev20250211104055 → letta_nightly-0.6.24.dev20250212072610}/letta/schemas/message.py RENAMED Viewed

@@ -542,7 +542,11 @@ class Message(BaseMessage):
         return openai_message
-    def to_anthropic_dict(self, inner_thoughts_xml_tag="thinking") -> dict:
+    def to_anthropic_dict(
+        self,
+        inner_thoughts_xml_tag="thinking",
+        put_inner_thoughts_in_kwargs: bool = False,
+    ) -> dict:
         """
         Convert to an Anthropic message dictionary
@@ -586,26 +590,38 @@ class Message(BaseMessage):
                 "role": self.role,
             }
             content = []
-            if self.text is not None:
+            # COT / reasoning / thinking
+            if self.text is not None and not put_inner_thoughts_in_kwargs:
                 content.append(
                     {
                         "type": "text",
                         "text": add_xml_tag(string=self.text, xml_tag=inner_thoughts_xml_tag),
                     }
                 )
+            # Tool calling
             if self.tool_calls is not None:
                 for tool_call in self.tool_calls:
+                    if put_inner_thoughts_in_kwargs:
+                        tool_call_input = add_inner_thoughts_to_tool_call(
+                            tool_call,
+                            inner_thoughts=self.text,
+                            inner_thoughts_key=INNER_THOUGHTS_KWARG,
+                        ).model_dump()
+                    else:
+                        tool_call_input = json.loads(tool_call.function.arguments)
                     content.append(
                         {
                             "type": "tool_use",
                             "id": tool_call.id,
                             "name": tool_call.function.name,
-                            "input": json.loads(tool_call.function.arguments),
+                            "input": tool_call_input,
                         }
                     )
             # If the only content was text, unpack it back into a singleton
-            # TODO
+            # TODO support multi-modal
             anthropic_message["content"] = content
             # Optional fields, do not include if null

{letta_nightly-0.6.23.dev20250211104055 → letta_nightly-0.6.24.dev20250212072610}/letta/schemas/providers.py RENAMED Viewed

@@ -347,6 +347,15 @@ class AnthropicProvider(Provider):
         configs = []
         for model in models:
+            # We set this to false by default, because Anthropic can
+            # natively support <thinking> tags inside of content fields
+            # However, putting COT inside of tool calls can make it more
+            # reliable for tool calling (no chance of a non-tool call step)
+            # Since tool_choice_type 'any' doesn't work with in-content COT
+            # NOTE For Haiku, it can be flaky if we don't enable this by default
+            inner_thoughts_in_kwargs = True if "haiku" in model["name"] else False
             configs.append(
                 LLMConfig(
                     model=model["name"],
@@ -354,6 +363,7 @@ class AnthropicProvider(Provider):
                     model_endpoint=self.base_url,
                     context_window=model["context_window"],
                     handle=self.get_handle(model["name"]),
+                    put_inner_thoughts_in_kwargs=inner_thoughts_in_kwargs,
                 )
             )
         return configs

letta-nightly 0.6.23.dev20250211104055__tar.gz → 0.6.24.dev20250212072610__tar.gz

Potentially problematic release.

letta-nightly 0.6.23.dev20250211104055__tar.gz → 0.6.24.dev20250212072610__tar.gz