letta-nightly 0.7.1.dev20250423104245__py3-none-any.whl → 0.7.3.dev20250424054013__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- letta/__init__.py +1 -1
- letta/agent.py +2 -1
- letta/agents/letta_agent.py +2 -1
- letta/agents/letta_agent_batch.py +8 -3
- letta/agents/voice_agent.py +2 -2
- letta/client/client.py +3 -0
- letta/functions/functions.py +2 -1
- letta/functions/schema_generator.py +5 -0
- letta/helpers/composio_helpers.py +1 -1
- letta/helpers/datetime_helpers.py +9 -0
- letta/jobs/llm_batch_job_polling.py +2 -1
- letta/llm_api/anthropic.py +10 -6
- letta/llm_api/anthropic_client.py +7 -6
- letta/llm_api/cohere.py +2 -2
- letta/llm_api/google_ai_client.py +11 -45
- letta/llm_api/google_vertex_client.py +8 -7
- letta/llm_api/llm_client.py +8 -14
- letta/llm_api/llm_client_base.py +17 -16
- letta/llm_api/openai.py +11 -4
- letta/llm_api/openai_client.py +47 -14
- letta/local_llm/chat_completion_proxy.py +2 -2
- letta/memory.py +2 -1
- letta/personas/examples/sleeptime_memory_persona.txt +5 -0
- letta/schemas/enums.py +3 -0
- letta/schemas/letta_message_content.py +2 -1
- letta/schemas/llm_config.py +12 -2
- letta/schemas/message.py +17 -0
- letta/schemas/openai/chat_completion_response.py +52 -3
- letta/server/rest_api/chat_completions_interface.py +2 -2
- letta/server/rest_api/interface.py +1 -1
- letta/server/rest_api/routers/v1/messages.py +9 -1
- letta/server/server.py +1 -6
- letta/services/agent_manager.py +6 -1
- {letta_nightly-0.7.1.dev20250423104245.dist-info → letta_nightly-0.7.3.dev20250424054013.dist-info}/METADATA +1 -1
- {letta_nightly-0.7.1.dev20250423104245.dist-info → letta_nightly-0.7.3.dev20250424054013.dist-info}/RECORD +38 -38
- letta/personas/examples/offline_memory_persona.txt +0 -4
- {letta_nightly-0.7.1.dev20250423104245.dist-info → letta_nightly-0.7.3.dev20250424054013.dist-info}/LICENSE +0 -0
- {letta_nightly-0.7.1.dev20250423104245.dist-info → letta_nightly-0.7.3.dev20250424054013.dist-info}/WHEEL +0 -0
- {letta_nightly-0.7.1.dev20250423104245.dist-info → letta_nightly-0.7.3.dev20250424054013.dist-info}/entry_points.txt +0 -0
letta/llm_api/llm_client_base.py
CHANGED
@@ -20,17 +20,16 @@ class LLMClientBase:
 
     def __init__(
         self,
-        llm_config: LLMConfig,
         put_inner_thoughts_first: Optional[bool] = True,
         use_tool_naming: bool = True,
     ):
-        self.llm_config = llm_config
         self.put_inner_thoughts_first = put_inner_thoughts_first
         self.use_tool_naming = use_tool_naming
 
     def send_llm_request(
         self,
         messages: List[Message],
+        llm_config: LLMConfig,
         tools: Optional[List[dict]] = None,  # TODO: change to Tool object
         stream: bool = False,
         force_tool_call: Optional[str] = None,
@@ -40,23 +39,24 @@ class LLMClientBase:
         If stream=True, returns a Stream[ChatCompletionChunk] that can be iterated over.
         Otherwise returns a ChatCompletionResponse.
         """
-        request_data = self.build_request_data(messages,
+        request_data = self.build_request_data(messages, llm_config, tools, force_tool_call)
 
         try:
             log_event(name="llm_request_sent", attributes=request_data)
             if stream:
-                return self.stream(request_data)
+                return self.stream(request_data, llm_config)
             else:
-                response_data = self.request(request_data)
+                response_data = self.request(request_data, llm_config)
                 log_event(name="llm_response_received", attributes=response_data)
         except Exception as e:
             raise self.handle_llm_error(e)
 
-        return self.convert_response_to_chat_completion(response_data, messages)
+        return self.convert_response_to_chat_completion(response_data, messages, llm_config)
 
     async def send_llm_request_async(
         self,
         messages: List[Message],
+        llm_config: LLMConfig,
         tools: Optional[List[dict]] = None,  # TODO: change to Tool object
         stream: bool = False,
         force_tool_call: Optional[str] = None,
@@ -66,19 +66,19 @@ class LLMClientBase:
         If stream=True, returns an AsyncStream[ChatCompletionChunk] that can be async iterated over.
         Otherwise returns a ChatCompletionResponse.
         """
-        request_data = self.build_request_data(messages,
+        request_data = self.build_request_data(messages, llm_config, tools, force_tool_call)
 
         try:
             log_event(name="llm_request_sent", attributes=request_data)
             if stream:
-                return await self.stream_async(request_data)
+                return await self.stream_async(request_data, llm_config)
             else:
-                response_data = await self.request_async(request_data)
+                response_data = await self.request_async(request_data, llm_config)
                 log_event(name="llm_response_received", attributes=response_data)
         except Exception as e:
             raise self.handle_llm_error(e)
 
-        return self.convert_response_to_chat_completion(response_data, messages)
+        return self.convert_response_to_chat_completion(response_data, messages, llm_config)
 
     async def send_llm_batch_request_async(
         self,
@@ -102,14 +102,14 @@ class LLMClientBase:
         raise NotImplementedError
 
     @abstractmethod
-    def request(self, request_data: dict) -> dict:
+    def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
         """
         Performs underlying request to llm and returns raw response.
         """
         raise NotImplementedError
 
     @abstractmethod
-    async def request_async(self, request_data: dict) -> dict:
+    async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
         """
         Performs underlying request to llm and returns raw response.
         """
@@ -120,6 +120,7 @@ class LLMClientBase:
         self,
         response_data: dict,
         input_messages: List[Message],
+        llm_config: LLMConfig,
     ) -> ChatCompletionResponse:
         """
         Converts custom response format from llm client into an OpenAI
@@ -128,18 +129,18 @@ class LLMClientBase:
         raise NotImplementedError
 
     @abstractmethod
-    def stream(self, request_data: dict) -> Stream[ChatCompletionChunk]:
+    def stream(self, request_data: dict, llm_config: LLMConfig) -> Stream[ChatCompletionChunk]:
         """
         Performs underlying streaming request to llm and returns raw response.
         """
-        raise NotImplementedError(f"Streaming is not supported for {
+        raise NotImplementedError(f"Streaming is not supported for {llm_config.model_endpoint_type}")
 
     @abstractmethod
-    async def stream_async(self, request_data: dict) -> AsyncStream[ChatCompletionChunk]:
+    async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk]:
         """
         Performs underlying streaming request to llm and returns raw response.
         """
-        raise NotImplementedError(f"Streaming is not supported for {
+        raise NotImplementedError(f"Streaming is not supported for {llm_config.model_endpoint_type}")
 
     @abstractmethod
     def handle_llm_error(self, e: Exception) -> Exception:
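The practical effect of this change is that an `LLMClientBase` subclass no longer carries an `llm_config` on the instance; the config is now threaded through every call. A minimal sketch of the new calling pattern, mirroring how `letta/memory.py` uses it later in this diff (the `agent_state` and `in_context_messages` names are placeholders, and the `LLMClient` import path is assumed from the file list, not shown here):

```python
from letta.llm_api.llm_client import LLMClient  # assumed location of the factory

# Placeholder inputs for illustration only
llm_config = agent_state.llm_config
llm_client = LLMClient.create(
    provider=llm_config.model_endpoint_type,
    put_inner_thoughts_first=True,
)

if llm_client:
    # llm_config is now a per-request argument rather than client state
    response = llm_client.send_llm_request(
        messages=in_context_messages,
        llm_config=llm_config,
        stream=False,
    )
```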
letta/llm_api/openai.py
CHANGED
@@ -4,7 +4,9 @@ from typing import Generator, List, Optional, Union
 import requests
 from openai import OpenAI
 
+from letta.helpers.datetime_helpers import timestamp_to_datetime
 from letta.llm_api.helpers import add_inner_thoughts_to_functions, convert_to_structured_output, make_post_request
+from letta.llm_api.openai_client import supports_parallel_tool_calling, supports_temperature_param
 from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION, INNER_THOUGHTS_KWARG_DESCRIPTION_GO_FIRST
 from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
 from letta.log import get_logger
@@ -135,7 +137,7 @@ def build_openai_chat_completions_request(
             tool_choice=tool_choice,
             user=str(user_id),
             max_completion_tokens=llm_config.max_tokens,
-            temperature=
+            temperature=llm_config.temperature if supports_temperature_param(model) else None,
             reasoning_effort=llm_config.reasoning_effort,
         )
     else:
@@ -237,7 +239,7 @@ def openai_chat_completions_process_stream(
     chat_completion_response = ChatCompletionResponse(
         id=dummy_message.id if create_message_id else TEMP_STREAM_RESPONSE_ID,
         choices=[],
-        created=dummy_message.created_at,  # NOTE: doesn't matter since both will do get_utc_time()
+        created=int(dummy_message.created_at.timestamp()),  # NOTE: doesn't matter since both will do get_utc_time()
         model=chat_completion_request.model,
         usage=UsageStatistics(
             completion_tokens=0,
@@ -274,7 +276,11 @@ def openai_chat_completions_process_stream(
             message_type = stream_interface.process_chunk(
                 chat_completion_chunk,
                 message_id=chat_completion_response.id if create_message_id else chat_completion_chunk.id,
-                message_date=
+                message_date=(
+                    timestamp_to_datetime(chat_completion_response.created)
+                    if create_message_datetime
+                    else timestamp_to_datetime(chat_completion_chunk.created)
+                ),
                 expect_reasoning_content=expect_reasoning_content,
                 name=name,
                 message_index=message_idx,
@@ -489,6 +495,7 @@ def prepare_openai_payload(chat_completion_request: ChatCompletionRequest):
         # except ValueError as e:
         # warnings.warn(f"Failed to convert tool function to structured output, tool={tool}, error={e}")
 
-    if
+    if not supports_parallel_tool_calling(chat_completion_request.model):
         data.pop("parallel_tool_calls", None)
+
     return data
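`openai.py` now leans on `letta.helpers.datetime_helpers` for converting between integer `created` timestamps and datetimes (that module gained 9 lines in this release, but its hunk is not captured here). The sketch below is only an assumption about what `timestamp_to_datetime` and `get_utc_time_int` plausibly do, inferred from how they are called in this diff:

```python
# Assumed behavior of the helpers referenced in this diff; the real
# implementations in letta/helpers/datetime_helpers.py are not shown here.
from datetime import datetime, timezone


def get_utc_time_int() -> int:
    # Current UTC time as a whole-second Unix timestamp, matching the
    # integer `created` field used by OpenAI-style chat completions.
    return int(datetime.now(timezone.utc).timestamp())


def timestamp_to_datetime(ts: int) -> datetime:
    # Inverse direction: turn an integer `created` timestamp back into a
    # timezone-aware UTC datetime for message bookkeeping.
    return datetime.fromtimestamp(ts, tz=timezone.utc)
```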
letta/llm_api/openai_client.py
CHANGED
@@ -34,12 +34,39 @@ from letta.settings import model_settings
 logger = get_logger(__name__)
 
 
+def is_openai_reasoning_model(model: str) -> bool:
+    """Utility function to check if the model is a 'reasoner'"""
+
+    # NOTE: needs to be updated with new model releases
+    return model.startswith("o1") or model.startswith("o3")
+
+
+def supports_temperature_param(model: str) -> bool:
+    """Certain OpenAI models don't support configuring the temperature.
+
+    Example error: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is not supported with this model.", 'type': 'invalid_request_error', 'param': 'temperature', 'code': 'unsupported_parameter'}}
+    """
+    if is_openai_reasoning_model(model):
+        return False
+    else:
+        return True
+
+
+def supports_parallel_tool_calling(model: str) -> bool:
+    """Certain OpenAI models don't support parallel tool calls."""
+
+    if is_openai_reasoning_model(model):
+        return False
+    else:
+        return True
+
+
 class OpenAIClient(LLMClientBase):
-    def _prepare_client_kwargs(self) -> dict:
+    def _prepare_client_kwargs(self, llm_config: LLMConfig) -> dict:
         api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
         # supposedly the openai python client requires a dummy API key
         api_key = api_key or "DUMMY_API_KEY"
-        kwargs = {"api_key": api_key, "base_url":
+        kwargs = {"api_key": api_key, "base_url": llm_config.model_endpoint}
 
         return kwargs
 
@@ -66,7 +93,8 @@ class OpenAIClient(LLMClientBase):
             put_inner_thoughts_first=True,
         )
 
-        use_developer_message =
+        use_developer_message = is_openai_reasoning_model(llm_config.model)
+
         openai_message_list = [
             cast_message_to_subtype(
                 m.to_openai_dict(
@@ -87,7 +115,7 @@ class OpenAIClient(LLMClientBase):
         # TODO(matt) move into LLMConfig
         # TODO: This vllm checking is very brittle and is a patch at most
         tool_choice = None
-        if llm_config.model_endpoint == "https://inference.memgpt.ai" or (llm_config.handle and "vllm" in
+        if llm_config.model_endpoint == "https://inference.memgpt.ai" or (llm_config.handle and "vllm" in llm_config.handle):
             tool_choice = "auto"  # TODO change to "required" once proxy supports it
         elif tools:
             # only set if tools is non-Null
@@ -103,7 +131,7 @@ class OpenAIClient(LLMClientBase):
             tool_choice=tool_choice,
             user=str(),
             max_completion_tokens=llm_config.max_tokens,
-            temperature=llm_config.temperature,
+            temperature=llm_config.temperature if supports_temperature_param(model) else None,
         )
 
         if "inference.memgpt.ai" in llm_config.model_endpoint:
@@ -124,20 +152,20 @@ class OpenAIClient(LLMClientBase):
 
         return data.model_dump(exclude_unset=True)
 
-    def request(self, request_data: dict) -> dict:
+    def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
         """
         Performs underlying synchronous request to OpenAI API and returns raw response dict.
         """
-        client = OpenAI(**self._prepare_client_kwargs())
+        client = OpenAI(**self._prepare_client_kwargs(llm_config))
 
         response: ChatCompletion = client.chat.completions.create(**request_data)
         return response.model_dump()
 
-    async def request_async(self, request_data: dict) -> dict:
+    async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
         """
         Performs underlying asynchronous request to OpenAI API and returns raw response dict.
         """
-        client = AsyncOpenAI(**self._prepare_client_kwargs())
+        client = AsyncOpenAI(**self._prepare_client_kwargs(llm_config))
         response: ChatCompletion = await client.chat.completions.create(**request_data)
         return response.model_dump()
 
@@ -145,6 +173,7 @@ class OpenAIClient(LLMClientBase):
         self,
         response_data: dict,
         input_messages: List[PydanticMessage],  # Included for consistency, maybe used later
+        llm_config: LLMConfig,
     ) -> ChatCompletionResponse:
         """
         Converts raw OpenAI response dict into the ChatCompletionResponse Pydantic model.
@@ -155,26 +184,30 @@ class OpenAIClient(LLMClientBase):
         chat_completion_response = ChatCompletionResponse(**response_data)
 
         # Unpack inner thoughts if they were embedded in function arguments
-        if
+        if llm_config.put_inner_thoughts_in_kwargs:
             chat_completion_response = unpack_all_inner_thoughts_from_kwargs(
                 response=chat_completion_response, inner_thoughts_key=INNER_THOUGHTS_KWARG
             )
 
+        # If we used a reasoning model, create a content part for the ommitted reasoning
+        if is_openai_reasoning_model(llm_config.model):
+            chat_completion_response.choices[0].message.ommitted_reasoning_content = True
+
         return chat_completion_response
 
-    def stream(self, request_data: dict) -> Stream[ChatCompletionChunk]:
+    def stream(self, request_data: dict, llm_config: LLMConfig) -> Stream[ChatCompletionChunk]:
         """
         Performs underlying streaming request to OpenAI and returns the stream iterator.
         """
-        client = OpenAI(**self._prepare_client_kwargs())
+        client = OpenAI(**self._prepare_client_kwargs(llm_config))
         response_stream: Stream[ChatCompletionChunk] = client.chat.completions.create(**request_data, stream=True)
         return response_stream
 
-    async def stream_async(self, request_data: dict) -> AsyncStream[ChatCompletionChunk]:
+    async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk]:
         """
         Performs underlying asynchronous streaming request to OpenAI and returns the async stream iterator.
        """
-        client = AsyncOpenAI(**self._prepare_client_kwargs())
+        client = AsyncOpenAI(**self._prepare_client_kwargs(llm_config))
         response_stream: AsyncStream[ChatCompletionChunk] = await client.chat.completions.create(**request_data, stream=True)
         return response_stream
 
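The three helpers added at the top of `openai_client.py` are pure functions of the model name, so their effect is easy to check in isolation; the expected values below follow directly from the definitions in the hunk above:

```python
from letta.llm_api.openai_client import (
    is_openai_reasoning_model,
    supports_parallel_tool_calling,
    supports_temperature_param,
)

# o1/o3-style models are treated as "reasoners": no temperature, no parallel tool calls
assert is_openai_reasoning_model("o3-mini")
assert not supports_temperature_param("o3-mini")
assert not supports_parallel_tool_calling("o1-preview")

# Other models keep the previous behavior
assert supports_temperature_param("gpt-4o-mini")
assert supports_parallel_tool_calling("gpt-4o-mini")
```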
letta/local_llm/chat_completion_proxy.py
CHANGED
@@ -6,7 +6,7 @@ import requests
 
 from letta.constants import CLI_WARNING_PREFIX
 from letta.errors import LocalLLMConnectionError, LocalLLMError
-from letta.helpers.datetime_helpers import
+from letta.helpers.datetime_helpers import get_utc_time_int
 from letta.helpers.json_helpers import json_dumps
 from letta.local_llm.constants import DEFAULT_WRAPPER
 from letta.local_llm.function_parser import patch_function
@@ -241,7 +241,7 @@ def get_chat_completion(
                 ),
             )
         ],
-        created=
+        created=get_utc_time_int(),
         model=model,
         # "This fingerprint represents the backend configuration that the model runs with."
         # system_fingerprint=user if user is not None else "null",
letta/memory.py
CHANGED
@@ -79,7 +79,7 @@ def summarize_messages(
     llm_config_no_inner_thoughts.put_inner_thoughts_in_kwargs = False
 
     llm_client = LLMClient.create(
-
+        provider=llm_config_no_inner_thoughts.model_endpoint_type,
         put_inner_thoughts_first=False,
     )
     # try to use new client, otherwise fallback to old flow
@@ -87,6 +87,7 @@ def summarize_messages(
     if llm_client:
         response = llm_client.send_llm_request(
             messages=message_sequence,
+            llm_config=llm_config_no_inner_thoughts,
             stream=False,
         )
     else:
letta/personas/examples/sleeptime_memory_persona.txt
ADDED
@@ -0,0 +1,5 @@
+I am an expert conversation memory agent that can do the following:
+- Consolidate memories into more concise blocks
+- Identify patterns in user behavior
+- Make inferences based on the memory
+I manage the memory blocks such that they contain everything that is important about the conversation.
letta/schemas/letta_message_content.py
CHANGED
@@ -145,7 +145,8 @@ class OmittedReasoningContent(MessageContent):
     type: Literal[MessageContentType.omitted_reasoning] = Field(
         MessageContentType.omitted_reasoning, description="Indicates this is an omitted reasoning step."
     )
-
+    # NOTE: dropping because we don't track this kind of information for the other reasoning types
+    # tokens: int = Field(..., description="The reasoning token count for intermediate reasoning content.")
 
 
 LettaMessageContentUnion = Annotated[
letta/schemas/llm_config.py
CHANGED
@@ -81,8 +81,11 @@ class LLMConfig(BaseModel):
     @model_validator(mode="before")
     @classmethod
     def set_default_enable_reasoner(cls, values):
-
-
+        # NOTE: this is really only applicable for models that can toggle reasoning on-and-off, like 3.7
+        # We can also use this field to identify if a model is a "reasoning" model (o1/o3, etc.) if we want
+        # if any(openai_reasoner_model in values.get("model", "") for openai_reasoner_model in ["o3-mini", "o1"]):
+        #     values["enable_reasoner"] = True
+        #     values["put_inner_thoughts_in_kwargs"] = False
         return values
 
     @model_validator(mode="before")
@@ -100,6 +103,13 @@ class LLMConfig(BaseModel):
         if values.get("put_inner_thoughts_in_kwargs") is None:
             values["put_inner_thoughts_in_kwargs"] = False if model in avoid_put_inner_thoughts_in_kwargs else True
 
+        # For the o1/o3 series from OpenAI, set to False by default
+        # We can set this flag to `true` if desired, which will enable "double-think"
+        from letta.llm_api.openai_client import is_openai_reasoning_model
+
+        if is_openai_reasoning_model(model):
+            values["put_inner_thoughts_in_kwargs"] = False
+
         return values
 
     @model_validator(mode="after")
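With the validator change above, `put_inner_thoughts_in_kwargs` now defaults to False whenever the model name matches the o1/o3 pattern. A rough illustration; the exact set of required `LLMConfig` fields used here is an assumption, the point is the resulting flag:

```python
from letta.schemas.llm_config import LLMConfig

# Field values are illustrative; only the model name drives the new default.
config = LLMConfig(
    model="o3-mini",
    model_endpoint_type="openai",
    model_endpoint="https://api.openai.com/v1",
    context_window=200000,
)

# The "before" validator detects an OpenAI reasoning model and keeps inner
# thoughts out of tool-call kwargs unless explicitly re-enabled.
assert config.put_inner_thoughts_in_kwargs is False
```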
letta/schemas/message.py
CHANGED
@@ -31,6 +31,7 @@ from letta.schemas.letta_message import (
 )
 from letta.schemas.letta_message_content import (
     LettaMessageContentUnion,
+    OmittedReasoningContent,
     ReasoningContent,
     RedactedReasoningContent,
     TextContent,
@@ -295,6 +296,18 @@ class Message(BaseMessage):
                         sender_id=self.sender_id,
                     )
                 )
+            elif isinstance(content_part, OmittedReasoningContent):
+                # Special case for "hidden reasoning" models like o1/o3
+                # NOTE: we also have to think about how to return this during streaming
+                messages.append(
+                    HiddenReasoningMessage(
+                        id=self.id,
+                        date=self.created_at,
+                        state="omitted",
+                        name=self.name,
+                        otid=otid,
+                    )
+                )
             else:
                 warnings.warn(f"Unrecognized content part in assistant message: {content_part}")
 
@@ -464,6 +477,10 @@ class Message(BaseMessage):
                     data=openai_message_dict["redacted_reasoning_content"] if "redacted_reasoning_content" in openai_message_dict else None,
                 ),
             )
+        if "omitted_reasoning_content" in openai_message_dict and openai_message_dict["omitted_reasoning_content"]:
+            content.append(
+                OmittedReasoningContent(),
+            )
 
         # If we're going from deprecated function form
         if openai_message_dict["role"] == "function":
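Taken together, these hunks mean an OpenAI response flagged with `omitted_reasoning_content` round-trips into an `OmittedReasoningContent` part, which the assistant-message conversion then surfaces as a `HiddenReasoningMessage` with `state="omitted"`. A tiny sketch of the content-part side only (illustrative; the full conversion path is not reproduced here):

```python
from letta.schemas.letta_message_content import OmittedReasoningContent

# The new content part carries no payload; it only records that the provider
# performed reasoning but did not return it (OpenAI o1/o3 behavior).
part = OmittedReasoningContent()
print(part.type)  # MessageContentType.omitted_reasoning (the default set in the hunk above)
```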
letta/schemas/openai/chat_completion_response.py
CHANGED
@@ -39,9 +39,10 @@ class Message(BaseModel):
     tool_calls: Optional[List[ToolCall]] = None
     role: str
     function_call: Optional[FunctionCall] = None  # Deprecated
-    reasoning_content: Optional[str] = None  # Used in newer reasoning APIs
+    reasoning_content: Optional[str] = None  # Used in newer reasoning APIs, e.g. DeepSeek
     reasoning_content_signature: Optional[str] = None  # NOTE: for Anthropic
     redacted_reasoning_content: Optional[str] = None  # NOTE: for Anthropic
+    ommitted_reasoning_content: bool = False  # NOTE: for OpenAI o1/o3
 
 
 class Choice(BaseModel):
@@ -52,16 +53,64 @@ class Choice(BaseModel):
     seed: Optional[int] = None  # found in TogetherAI
 
 
+class UsageStatisticsPromptTokenDetails(BaseModel):
+    cached_tokens: int = 0
+    # NOTE: OAI specific
+    # audio_tokens: int = 0
+
+    def __add__(self, other: "UsageStatisticsPromptTokenDetails") -> "UsageStatisticsPromptTokenDetails":
+        return UsageStatisticsPromptTokenDetails(
+            cached_tokens=self.cached_tokens + other.cached_tokens,
+        )
+
+
+class UsageStatisticsCompletionTokenDetails(BaseModel):
+    reasoning_tokens: int = 0
+    # NOTE: OAI specific
+    # audio_tokens: int = 0
+    # accepted_prediction_tokens: int = 0
+    # rejected_prediction_tokens: int = 0
+
+    def __add__(self, other: "UsageStatisticsCompletionTokenDetails") -> "UsageStatisticsCompletionTokenDetails":
+        return UsageStatisticsCompletionTokenDetails(
+            reasoning_tokens=self.reasoning_tokens + other.reasoning_tokens,
+        )
+
+
 class UsageStatistics(BaseModel):
     completion_tokens: int = 0
     prompt_tokens: int = 0
     total_tokens: int = 0
 
+    prompt_tokens_details: Optional[UsageStatisticsPromptTokenDetails] = None
+    completion_tokens_details: Optional[UsageStatisticsCompletionTokenDetails] = None
+
     def __add__(self, other: "UsageStatistics") -> "UsageStatistics":
+
+        if self.prompt_tokens_details is None and other.prompt_tokens_details is None:
+            total_prompt_tokens_details = None
+        elif self.prompt_tokens_details is None:
+            total_prompt_tokens_details = other.prompt_tokens_details
+        elif other.prompt_tokens_details is None:
+            total_prompt_tokens_details = self.prompt_tokens_details
+        else:
+            total_prompt_tokens_details = self.prompt_tokens_details + other.prompt_tokens_details
+
+        if self.completion_tokens_details is None and other.completion_tokens_details is None:
+            total_completion_tokens_details = None
+        elif self.completion_tokens_details is None:
+            total_completion_tokens_details = other.completion_tokens_details
+        elif other.completion_tokens_details is None:
+            total_completion_tokens_details = self.completion_tokens_details
+        else:
+            total_completion_tokens_details = self.completion_tokens_details + other.completion_tokens_details
+
         return UsageStatistics(
             completion_tokens=self.completion_tokens + other.completion_tokens,
             prompt_tokens=self.prompt_tokens + other.prompt_tokens,
             total_tokens=self.total_tokens + other.total_tokens,
+            prompt_tokens_details=total_prompt_tokens_details,
+            completion_tokens_details=total_completion_tokens_details,
         )
 
 
@@ -70,7 +119,7 @@ class ChatCompletionResponse(BaseModel):
 
     id: str
     choices: List[Choice]
-    created: datetime.datetime
+    created: Union[datetime.datetime, int]
     model: Optional[str] = None  # NOTE: this is not consistent with OpenAI API standard, however is necessary to support local LLMs
     # system_fingerprint: str # docs say this is mandatory, but in reality API returns None
     system_fingerprint: Optional[str] = None
@@ -138,7 +187,7 @@ class ChatCompletionChunkResponse(BaseModel):
 
     id: str
     choices: List[ChunkChoice]
-    created: Union[datetime.datetime,
+    created: Union[datetime.datetime, int]
     model: str
     # system_fingerprint: str # docs say this is mandatory, but in reality API returns None
     system_fingerprint: Optional[str] = None
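The `__add__` overloads above keep per-step usage aggregatable even when only some responses report the nested token details; a quick example using only the classes defined in this hunk:

```python
from letta.schemas.openai.chat_completion_response import (
    UsageStatistics,
    UsageStatisticsCompletionTokenDetails,
)

first = UsageStatistics(completion_tokens=10, prompt_tokens=100, total_tokens=110)
second = UsageStatistics(
    completion_tokens=20,
    prompt_tokens=200,
    total_tokens=220,
    completion_tokens_details=UsageStatisticsCompletionTokenDetails(reasoning_tokens=15),
)

total = first + second
# Plain counters sum; a details object missing on one side is carried over
# from the other side rather than being dropped.
assert total.total_tokens == 330
assert total.completion_tokens_details.reasoning_tokens == 15
```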
letta/server/rest_api/chat_completions_interface.py
CHANGED
@@ -238,7 +238,7 @@ class ChatCompletionsStreamingInterface(AgentChunkStreamingInterface):
         return ChatCompletionChunk(
             id=chunk.id,
             object=chunk.object,
-            created=chunk.created
+            created=chunk.created,
             model=chunk.model,
             choices=[
                 Choice(
@@ -256,7 +256,7 @@ class ChatCompletionsStreamingInterface(AgentChunkStreamingInterface):
         return ChatCompletionChunk(
             id=chunk.id,
             object=chunk.object,
-            created=chunk.created
+            created=chunk.created,
             model=chunk.model,
             choices=[
                 Choice(
letta/server/rest_api/interface.py
CHANGED
@@ -1001,7 +1001,7 @@ class StreamingServerInterface(AgentChunkStreamingInterface):
             # Example case that would trigger here:
             # id='chatcmpl-AKtUvREgRRvgTW6n8ZafiKuV0mxhQ'
             # choices=[ChunkChoice(finish_reason=None, index=0, delta=MessageDelta(content=None, tool_calls=None, function_call=None), logprobs=None)]
-            # created=
+            # created=1713216662
             # model='gpt-4o-mini-2024-07-18'
             # object='chat.completion.chunk'
             warnings.warn(f"Couldn't find delta in chunk: {chunk}")
letta/server/rest_api/routers/v1/messages.py
CHANGED
@@ -1,6 +1,6 @@
 from typing import List, Optional
 
-from fastapi import APIRouter, Body, Depends, Header
+from fastapi import APIRouter, Body, Depends, Header, status
 from fastapi.exceptions import HTTPException
 from starlette.requests import Request
 
@@ -11,6 +11,7 @@ from letta.schemas.job import BatchJob, JobStatus, JobType, JobUpdate
 from letta.schemas.letta_request import CreateBatch
 from letta.server.rest_api.utils import get_letta_server
 from letta.server.server import SyncServer
+from letta.settings import settings
 
 router = APIRouter(prefix="/messages", tags=["messages"])
 
@@ -43,6 +44,13 @@ async def create_messages_batch(
     if length > max_bytes:
         raise HTTPException(status_code=413, detail=f"Request too large ({length} bytes). Max is {max_bytes} bytes.")
 
+    # Reject request if env var is not set
+    if not settings.enable_batch_job_polling:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Server misconfiguration: LETTA_ENABLE_BATCH_JOB_POLLING is set to False.",
+        )
+
     actor = server.user_manager.get_user_or_default(user_id=actor_id)
     batch_job = BatchJob(
         user_id=actor.id,
letta/server/server.py
CHANGED
@@ -766,12 +766,7 @@ class SyncServer(Server):
             memory_blocks=[
                 CreateBlock(
                     label="memory_persona",
-                    value=(
-                        "I am an expert conversation memory manager. "
-                        "I manage the memory blocks such that they "
-                        "contain everything that is important about "
-                        "the conversation."
-                    ),
+                    value=get_persona_text("sleeptime_memory_persona"),
                 ),
             ],
             llm_config=main_agent.llm_config,
letta/services/agent_manager.py
CHANGED
@@ -161,7 +161,7 @@ class AgentManager:
     # Basic CRUD operations
     # ======================================================================================================================
     @trace_method
-    def create_agent(self, agent_create: CreateAgent, actor: PydanticUser) -> PydanticAgentState:
+    def create_agent(self, agent_create: CreateAgent, actor: PydanticUser, _test_only_force_id: Optional[str] = None) -> PydanticAgentState:
         # validate required configs
         if not agent_create.llm_config or not agent_create.embedding_config:
             raise ValueError("llm_config and embedding_config are required")
@@ -236,9 +236,14 @@ class AgentManager:
                 base_template_id=agent_create.base_template_id,
                 message_buffer_autoclear=agent_create.message_buffer_autoclear,
                 enable_sleeptime=agent_create.enable_sleeptime,
+                response_format=agent_create.response_format,
                 created_by_id=actor.id,
                 last_updated_by_id=actor.id,
             )
+
+            if _test_only_force_id:
+                new_agent.id = _test_only_force_id
+
             session.add(new_agent)
             session.flush()
             aid = new_agent.id
|