letta-nightly 0.7.1.dev20250423104245__py3-none-any.whl → 0.7.3.dev20250424054013__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. letta/__init__.py +1 -1
  2. letta/agent.py +2 -1
  3. letta/agents/letta_agent.py +2 -1
  4. letta/agents/letta_agent_batch.py +8 -3
  5. letta/agents/voice_agent.py +2 -2
  6. letta/client/client.py +3 -0
  7. letta/functions/functions.py +2 -1
  8. letta/functions/schema_generator.py +5 -0
  9. letta/helpers/composio_helpers.py +1 -1
  10. letta/helpers/datetime_helpers.py +9 -0
  11. letta/jobs/llm_batch_job_polling.py +2 -1
  12. letta/llm_api/anthropic.py +10 -6
  13. letta/llm_api/anthropic_client.py +7 -6
  14. letta/llm_api/cohere.py +2 -2
  15. letta/llm_api/google_ai_client.py +11 -45
  16. letta/llm_api/google_vertex_client.py +8 -7
  17. letta/llm_api/llm_client.py +8 -14
  18. letta/llm_api/llm_client_base.py +17 -16
  19. letta/llm_api/openai.py +11 -4
  20. letta/llm_api/openai_client.py +47 -14
  21. letta/local_llm/chat_completion_proxy.py +2 -2
  22. letta/memory.py +2 -1
  23. letta/personas/examples/sleeptime_memory_persona.txt +5 -0
  24. letta/schemas/enums.py +3 -0
  25. letta/schemas/letta_message_content.py +2 -1
  26. letta/schemas/llm_config.py +12 -2
  27. letta/schemas/message.py +17 -0
  28. letta/schemas/openai/chat_completion_response.py +52 -3
  29. letta/server/rest_api/chat_completions_interface.py +2 -2
  30. letta/server/rest_api/interface.py +1 -1
  31. letta/server/rest_api/routers/v1/messages.py +9 -1
  32. letta/server/server.py +1 -6
  33. letta/services/agent_manager.py +6 -1
  34. {letta_nightly-0.7.1.dev20250423104245.dist-info → letta_nightly-0.7.3.dev20250424054013.dist-info}/METADATA +1 -1
  35. {letta_nightly-0.7.1.dev20250423104245.dist-info → letta_nightly-0.7.3.dev20250424054013.dist-info}/RECORD +38 -38
  36. letta/personas/examples/offline_memory_persona.txt +0 -4
  37. {letta_nightly-0.7.1.dev20250423104245.dist-info → letta_nightly-0.7.3.dev20250424054013.dist-info}/LICENSE +0 -0
  38. {letta_nightly-0.7.1.dev20250423104245.dist-info → letta_nightly-0.7.3.dev20250424054013.dist-info}/WHEEL +0 -0
  39. {letta_nightly-0.7.1.dev20250423104245.dist-info → letta_nightly-0.7.3.dev20250424054013.dist-info}/entry_points.txt +0 -0
letta/llm_api/llm_client_base.py CHANGED
@@ -20,17 +20,16 @@ class LLMClientBase:
 
     def __init__(
         self,
-        llm_config: LLMConfig,
         put_inner_thoughts_first: Optional[bool] = True,
         use_tool_naming: bool = True,
     ):
-        self.llm_config = llm_config
        self.put_inner_thoughts_first = put_inner_thoughts_first
        self.use_tool_naming = use_tool_naming
 
     def send_llm_request(
         self,
         messages: List[Message],
+        llm_config: LLMConfig,
         tools: Optional[List[dict]] = None,  # TODO: change to Tool object
         stream: bool = False,
         force_tool_call: Optional[str] = None,
@@ -40,23 +39,24 @@ class LLMClientBase:
         If stream=True, returns a Stream[ChatCompletionChunk] that can be iterated over.
         Otherwise returns a ChatCompletionResponse.
         """
-        request_data = self.build_request_data(messages, self.llm_config, tools, force_tool_call)
+        request_data = self.build_request_data(messages, llm_config, tools, force_tool_call)
 
         try:
             log_event(name="llm_request_sent", attributes=request_data)
             if stream:
-                return self.stream(request_data)
+                return self.stream(request_data, llm_config)
             else:
-                response_data = self.request(request_data)
+                response_data = self.request(request_data, llm_config)
                 log_event(name="llm_response_received", attributes=response_data)
         except Exception as e:
             raise self.handle_llm_error(e)
 
-        return self.convert_response_to_chat_completion(response_data, messages)
+        return self.convert_response_to_chat_completion(response_data, messages, llm_config)
 
     async def send_llm_request_async(
         self,
         messages: List[Message],
+        llm_config: LLMConfig,
         tools: Optional[List[dict]] = None,  # TODO: change to Tool object
         stream: bool = False,
         force_tool_call: Optional[str] = None,
@@ -66,19 +66,19 @@ class LLMClientBase:
         If stream=True, returns an AsyncStream[ChatCompletionChunk] that can be async iterated over.
         Otherwise returns a ChatCompletionResponse.
         """
-        request_data = self.build_request_data(messages, self.llm_config, tools, force_tool_call)
+        request_data = self.build_request_data(messages, llm_config, tools, force_tool_call)
 
         try:
             log_event(name="llm_request_sent", attributes=request_data)
             if stream:
-                return await self.stream_async(request_data)
+                return await self.stream_async(request_data, llm_config)
             else:
-                response_data = await self.request_async(request_data)
+                response_data = await self.request_async(request_data, llm_config)
                 log_event(name="llm_response_received", attributes=response_data)
         except Exception as e:
             raise self.handle_llm_error(e)
 
-        return self.convert_response_to_chat_completion(response_data, messages)
+        return self.convert_response_to_chat_completion(response_data, messages, llm_config)
 
     async def send_llm_batch_request_async(
         self,
@@ -102,14 +102,14 @@ class LLMClientBase:
         raise NotImplementedError
 
     @abstractmethod
-    def request(self, request_data: dict) -> dict:
+    def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
         """
         Performs underlying request to llm and returns raw response.
         """
         raise NotImplementedError
 
     @abstractmethod
-    async def request_async(self, request_data: dict) -> dict:
+    async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
         """
         Performs underlying request to llm and returns raw response.
         """
@@ -120,6 +120,7 @@ class LLMClientBase:
         self,
         response_data: dict,
         input_messages: List[Message],
+        llm_config: LLMConfig,
     ) -> ChatCompletionResponse:
         """
         Converts custom response format from llm client into an OpenAI
@@ -128,18 +129,18 @@ class LLMClientBase:
         raise NotImplementedError
 
     @abstractmethod
-    def stream(self, request_data: dict) -> Stream[ChatCompletionChunk]:
+    def stream(self, request_data: dict, llm_config: LLMConfig) -> Stream[ChatCompletionChunk]:
         """
         Performs underlying streaming request to llm and returns raw response.
         """
-        raise NotImplementedError(f"Streaming is not supported for {self.llm_config.model_endpoint_type}")
+        raise NotImplementedError(f"Streaming is not supported for {llm_config.model_endpoint_type}")
 
     @abstractmethod
-    async def stream_async(self, request_data: dict) -> AsyncStream[ChatCompletionChunk]:
+    async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk]:
         """
         Performs underlying streaming request to llm and returns raw response.
         """
-        raise NotImplementedError(f"Streaming is not supported for {self.llm_config.model_endpoint_type}")
+        raise NotImplementedError(f"Streaming is not supported for {llm_config.model_endpoint_type}")
 
     @abstractmethod
     def handle_llm_error(self, e: Exception) -> Exception:
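Note on the refactor above: LLMClientBase subclasses no longer capture an LLMConfig at construction time; the config is passed with each request instead. A minimal sketch of the new calling convention, mirroring the memory.py hunk later in this diff (the import path and the llm_config/message_sequence variables are illustrative, not taken from this diff):

    from letta.llm_api.llm_client import LLMClient

    llm_client = LLMClient.create(
        provider=llm_config.model_endpoint_type,  # provider type, not the full config
        put_inner_thoughts_first=False,
    )
    if llm_client:
        response = llm_client.send_llm_request(
            messages=message_sequence,
            llm_config=llm_config,  # the config now travels with each call
            stream=False,
        )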
letta/llm_api/openai.py CHANGED
@@ -4,7 +4,9 @@ from typing import Generator, List, Optional, Union
 import requests
 from openai import OpenAI
 
+from letta.helpers.datetime_helpers import timestamp_to_datetime
 from letta.llm_api.helpers import add_inner_thoughts_to_functions, convert_to_structured_output, make_post_request
+from letta.llm_api.openai_client import supports_parallel_tool_calling, supports_temperature_param
 from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION, INNER_THOUGHTS_KWARG_DESCRIPTION_GO_FIRST
 from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
 from letta.log import get_logger
@@ -135,7 +137,7 @@
             tool_choice=tool_choice,
             user=str(user_id),
             max_completion_tokens=llm_config.max_tokens,
-            temperature=1.0 if llm_config.enable_reasoner else llm_config.temperature,
+            temperature=llm_config.temperature if supports_temperature_param(model) else None,
             reasoning_effort=llm_config.reasoning_effort,
         )
     else:
@@ -237,7 +239,7 @@
     chat_completion_response = ChatCompletionResponse(
         id=dummy_message.id if create_message_id else TEMP_STREAM_RESPONSE_ID,
         choices=[],
-        created=dummy_message.created_at,  # NOTE: doesn't matter since both will do get_utc_time()
+        created=int(dummy_message.created_at.timestamp()),  # NOTE: doesn't matter since both will do get_utc_time()
         model=chat_completion_request.model,
         usage=UsageStatistics(
             completion_tokens=0,
@@ -274,7 +276,11 @@
                 message_type = stream_interface.process_chunk(
                     chat_completion_chunk,
                     message_id=chat_completion_response.id if create_message_id else chat_completion_chunk.id,
-                    message_date=chat_completion_response.created if create_message_datetime else chat_completion_chunk.created,
+                    message_date=(
+                        timestamp_to_datetime(chat_completion_response.created)
+                        if create_message_datetime
+                        else timestamp_to_datetime(chat_completion_chunk.created)
+                    ),
                     expect_reasoning_content=expect_reasoning_content,
                     name=name,
                     message_index=message_idx,
@@ -489,6 +495,7 @@ def prepare_openai_payload(chat_completion_request: ChatCompletionRequest):
     # except ValueError as e:
     #     warnings.warn(f"Failed to convert tool function to structured output, tool={tool}, error={e}")
 
-    if "o3-mini" in chat_completion_request.model or "o1" in chat_completion_request.model:
+    if not supports_parallel_tool_calling(chat_completion_request.model):
         data.pop("parallel_tool_calls", None)
+
     return data
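For context on the created/message_date changes above: created now carries Unix epoch seconds (an int) rather than a datetime, and timestamp_to_datetime, the helper added to letta/helpers/datetime_helpers.py, converts it back for the streaming interface. A rough sketch of the assumed conversion, using a local stand-in for that helper:

    from datetime import datetime, timezone

    # stand-in for letta.helpers.datetime_helpers.timestamp_to_datetime (assumed behavior)
    def timestamp_to_datetime(ts: int) -> datetime:
        return datetime.fromtimestamp(ts, tz=timezone.utc)

    created = int(datetime.now(tz=timezone.utc).timestamp())  # what responses/chunks now store
    message_date = timestamp_to_datetime(created)  # what stream_interface.process_chunk receives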
letta/llm_api/openai_client.py CHANGED
@@ -34,12 +34,39 @@ from letta.settings import model_settings
 logger = get_logger(__name__)
 
 
+def is_openai_reasoning_model(model: str) -> bool:
+    """Utility function to check if the model is a 'reasoner'"""
+
+    # NOTE: needs to be updated with new model releases
+    return model.startswith("o1") or model.startswith("o3")
+
+
+def supports_temperature_param(model: str) -> bool:
+    """Certain OpenAI models don't support configuring the temperature.
+
+    Example error: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is not supported with this model.", 'type': 'invalid_request_error', 'param': 'temperature', 'code': 'unsupported_parameter'}}
+    """
+    if is_openai_reasoning_model(model):
+        return False
+    else:
+        return True
+
+
+def supports_parallel_tool_calling(model: str) -> bool:
+    """Certain OpenAI models don't support parallel tool calls."""
+
+    if is_openai_reasoning_model(model):
+        return False
+    else:
+        return True
+
+
 class OpenAIClient(LLMClientBase):
-    def _prepare_client_kwargs(self) -> dict:
+    def _prepare_client_kwargs(self, llm_config: LLMConfig) -> dict:
         api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
         # supposedly the openai python client requires a dummy API key
         api_key = api_key or "DUMMY_API_KEY"
-        kwargs = {"api_key": api_key, "base_url": self.llm_config.model_endpoint}
+        kwargs = {"api_key": api_key, "base_url": llm_config.model_endpoint}
 
         return kwargs
 
@@ -66,7 +93,8 @@ class OpenAIClient(LLMClientBase):
             put_inner_thoughts_first=True,
         )
 
-        use_developer_message = llm_config.model.startswith("o1") or llm_config.model.startswith("o3")  # o-series models
+        use_developer_message = is_openai_reasoning_model(llm_config.model)
+
         openai_message_list = [
             cast_message_to_subtype(
                 m.to_openai_dict(
@@ -87,7 +115,7 @@ class OpenAIClient(LLMClientBase):
         # TODO(matt) move into LLMConfig
         # TODO: This vllm checking is very brittle and is a patch at most
         tool_choice = None
-        if llm_config.model_endpoint == "https://inference.memgpt.ai" or (llm_config.handle and "vllm" in self.llm_config.handle):
+        if llm_config.model_endpoint == "https://inference.memgpt.ai" or (llm_config.handle and "vllm" in llm_config.handle):
             tool_choice = "auto"  # TODO change to "required" once proxy supports it
         elif tools:
             # only set if tools is non-Null
@@ -103,7 +131,7 @@ class OpenAIClient(LLMClientBase):
             tool_choice=tool_choice,
             user=str(),
             max_completion_tokens=llm_config.max_tokens,
-            temperature=llm_config.temperature,
+            temperature=llm_config.temperature if supports_temperature_param(model) else None,
         )
 
         if "inference.memgpt.ai" in llm_config.model_endpoint:
@@ -124,20 +152,20 @@ class OpenAIClient(LLMClientBase):
 
         return data.model_dump(exclude_unset=True)
 
-    def request(self, request_data: dict) -> dict:
+    def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
         """
         Performs underlying synchronous request to OpenAI API and returns raw response dict.
         """
-        client = OpenAI(**self._prepare_client_kwargs())
+        client = OpenAI(**self._prepare_client_kwargs(llm_config))
 
         response: ChatCompletion = client.chat.completions.create(**request_data)
         return response.model_dump()
 
-    async def request_async(self, request_data: dict) -> dict:
+    async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
         """
         Performs underlying asynchronous request to OpenAI API and returns raw response dict.
         """
-        client = AsyncOpenAI(**self._prepare_client_kwargs())
+        client = AsyncOpenAI(**self._prepare_client_kwargs(llm_config))
         response: ChatCompletion = await client.chat.completions.create(**request_data)
         return response.model_dump()
 
@@ -145,6 +173,7 @@ class OpenAIClient(LLMClientBase):
         self,
         response_data: dict,
         input_messages: List[PydanticMessage],  # Included for consistency, maybe used later
+        llm_config: LLMConfig,
     ) -> ChatCompletionResponse:
         """
         Converts raw OpenAI response dict into the ChatCompletionResponse Pydantic model.
@@ -155,26 +184,30 @@ class OpenAIClient(LLMClientBase):
         chat_completion_response = ChatCompletionResponse(**response_data)
 
         # Unpack inner thoughts if they were embedded in function arguments
-        if self.llm_config.put_inner_thoughts_in_kwargs:
+        if llm_config.put_inner_thoughts_in_kwargs:
             chat_completion_response = unpack_all_inner_thoughts_from_kwargs(
                 response=chat_completion_response, inner_thoughts_key=INNER_THOUGHTS_KWARG
             )
 
+        # If we used a reasoning model, create a content part for the ommitted reasoning
+        if is_openai_reasoning_model(llm_config.model):
+            chat_completion_response.choices[0].message.ommitted_reasoning_content = True
+
         return chat_completion_response
 
-    def stream(self, request_data: dict) -> Stream[ChatCompletionChunk]:
+    def stream(self, request_data: dict, llm_config: LLMConfig) -> Stream[ChatCompletionChunk]:
         """
         Performs underlying streaming request to OpenAI and returns the stream iterator.
         """
-        client = OpenAI(**self._prepare_client_kwargs())
+        client = OpenAI(**self._prepare_client_kwargs(llm_config))
         response_stream: Stream[ChatCompletionChunk] = client.chat.completions.create(**request_data, stream=True)
         return response_stream
 
-    async def stream_async(self, request_data: dict) -> AsyncStream[ChatCompletionChunk]:
+    async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk]:
         """
         Performs underlying asynchronous streaming request to OpenAI and returns the async stream iterator.
         """
-        client = AsyncOpenAI(**self._prepare_client_kwargs())
+        client = AsyncOpenAI(**self._prepare_client_kwargs(llm_config))
         response_stream: AsyncStream[ChatCompletionChunk] = await client.chat.completions.create(**request_data, stream=True)
         return response_stream
 
letta/local_llm/chat_completion_proxy.py CHANGED
@@ -6,7 +6,7 @@ import requests
 
 from letta.constants import CLI_WARNING_PREFIX
 from letta.errors import LocalLLMConnectionError, LocalLLMError
-from letta.helpers.datetime_helpers import get_utc_time
+from letta.helpers.datetime_helpers import get_utc_time_int
 from letta.helpers.json_helpers import json_dumps
 from letta.local_llm.constants import DEFAULT_WRAPPER
 from letta.local_llm.function_parser import patch_function
@@ -241,7 +241,7 @@ def get_chat_completion(
                 ),
             )
         ],
-        created=get_utc_time(),
+        created=get_utc_time_int(),
         model=model,
         # "This fingerprint represents the backend configuration that the model runs with."
        # system_fingerprint=user if user is not None else "null",
letta/memory.py CHANGED
@@ -79,7 +79,7 @@ def summarize_messages(
     llm_config_no_inner_thoughts.put_inner_thoughts_in_kwargs = False
 
     llm_client = LLMClient.create(
-        llm_config=llm_config_no_inner_thoughts,
+        provider=llm_config_no_inner_thoughts.model_endpoint_type,
         put_inner_thoughts_first=False,
     )
     # try to use new client, otherwise fallback to old flow
@@ -87,6 +87,7 @@
     if llm_client:
         response = llm_client.send_llm_request(
             messages=message_sequence,
+            llm_config=llm_config_no_inner_thoughts,
             stream=False,
         )
     else:
letta/personas/examples/sleeptime_memory_persona.txt ADDED
@@ -0,0 +1,5 @@
+I am an expert conversation memory agent that can do the following:
+- Consolidate memories into more concise blocks
+- Identify patterns in user behavior
+- Make inferences based on the memory
+I manage the memory blocks such that they contain everything that is important about the conversation.
letta/schemas/enums.py CHANGED
@@ -3,6 +3,9 @@ from enum import Enum
 
 class ProviderType(str, Enum):
     anthropic = "anthropic"
+    google_ai = "google_ai"
+    google_vertex = "google_vertex"
+    openai = "openai"
 
 
 class MessageRole(str, Enum):
letta/schemas/letta_message_content.py CHANGED
@@ -145,7 +145,8 @@ class OmittedReasoningContent(MessageContent):
     type: Literal[MessageContentType.omitted_reasoning] = Field(
         MessageContentType.omitted_reasoning, description="Indicates this is an omitted reasoning step."
     )
-    tokens: int = Field(..., description="The reasoning token count for intermediate reasoning content.")
+    # NOTE: dropping because we don't track this kind of information for the other reasoning types
+    # tokens: int = Field(..., description="The reasoning token count for intermediate reasoning content.")
 
 
 LettaMessageContentUnion = Annotated[
letta/schemas/llm_config.py CHANGED
@@ -81,8 +81,11 @@ class LLMConfig(BaseModel):
     @model_validator(mode="before")
     @classmethod
     def set_default_enable_reasoner(cls, values):
-        if any(openai_reasoner_model in values.get("model", "") for openai_reasoner_model in ["o3-mini", "o1"]):
-            values["enable_reasoner"] = True
+        # NOTE: this is really only applicable for models that can toggle reasoning on-and-off, like 3.7
+        # We can also use this field to identify if a model is a "reasoning" model (o1/o3, etc.) if we want
+        # if any(openai_reasoner_model in values.get("model", "") for openai_reasoner_model in ["o3-mini", "o1"]):
+        #     values["enable_reasoner"] = True
+        #     values["put_inner_thoughts_in_kwargs"] = False
         return values
 
     @model_validator(mode="before")
@@ -100,6 +103,13 @@
         if values.get("put_inner_thoughts_in_kwargs") is None:
             values["put_inner_thoughts_in_kwargs"] = False if model in avoid_put_inner_thoughts_in_kwargs else True
 
+        # For the o1/o3 series from OpenAI, set to False by default
+        # We can set this flag to `true` if desired, which will enable "double-think"
+        from letta.llm_api.openai_client import is_openai_reasoning_model
+
+        if is_openai_reasoning_model(model):
+            values["put_inner_thoughts_in_kwargs"] = False
+
         return values
 
     @model_validator(mode="after")
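One consequence of the validator change above: inner thoughts in kwargs is now disabled by default for OpenAI reasoning models. A hedged sketch of the expected behavior (the endpoint and context_window values are illustrative and assumed; only model and put_inner_thoughts_in_kwargs matter here):

    config = LLMConfig(
        model="o3-mini",
        model_endpoint_type="openai",
        model_endpoint="https://api.openai.com/v1",  # assumed default endpoint
        context_window=200000,
    )
    assert config.put_inner_thoughts_in_kwargs is False  # defaulted off for o1/o3-style models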
letta/schemas/message.py CHANGED
@@ -31,6 +31,7 @@ from letta.schemas.letta_message import (
 )
 from letta.schemas.letta_message_content import (
     LettaMessageContentUnion,
+    OmittedReasoningContent,
     ReasoningContent,
     RedactedReasoningContent,
     TextContent,
@@ -295,6 +296,18 @@ class Message(BaseMessage):
                         sender_id=self.sender_id,
                     )
                 )
+            elif isinstance(content_part, OmittedReasoningContent):
+                # Special case for "hidden reasoning" models like o1/o3
+                # NOTE: we also have to think about how to return this during streaming
+                messages.append(
+                    HiddenReasoningMessage(
+                        id=self.id,
+                        date=self.created_at,
+                        state="omitted",
+                        name=self.name,
+                        otid=otid,
+                    )
+                )
             else:
                 warnings.warn(f"Unrecognized content part in assistant message: {content_part}")
 
@@ -464,6 +477,10 @@
                     data=openai_message_dict["redacted_reasoning_content"] if "redacted_reasoning_content" in openai_message_dict else None,
                 ),
             )
+            if "omitted_reasoning_content" in openai_message_dict and openai_message_dict["omitted_reasoning_content"]:
+                content.append(
+                    OmittedReasoningContent(),
+                )
 
         # If we're going from deprecated function form
         if openai_message_dict["role"] == "function":
letta/schemas/openai/chat_completion_response.py CHANGED
@@ -39,9 +39,10 @@ class Message(BaseModel):
     tool_calls: Optional[List[ToolCall]] = None
     role: str
     function_call: Optional[FunctionCall] = None  # Deprecated
-    reasoning_content: Optional[str] = None  # Used in newer reasoning APIs
+    reasoning_content: Optional[str] = None  # Used in newer reasoning APIs, e.g. DeepSeek
     reasoning_content_signature: Optional[str] = None  # NOTE: for Anthropic
     redacted_reasoning_content: Optional[str] = None  # NOTE: for Anthropic
+    ommitted_reasoning_content: bool = False  # NOTE: for OpenAI o1/o3
 
 
 class Choice(BaseModel):
@@ -52,16 +53,64 @@
     seed: Optional[int] = None  # found in TogetherAI
 
 
+class UsageStatisticsPromptTokenDetails(BaseModel):
+    cached_tokens: int = 0
+    # NOTE: OAI specific
+    # audio_tokens: int = 0
+
+    def __add__(self, other: "UsageStatisticsPromptTokenDetails") -> "UsageStatisticsPromptTokenDetails":
+        return UsageStatisticsPromptTokenDetails(
+            cached_tokens=self.cached_tokens + other.cached_tokens,
+        )
+
+
+class UsageStatisticsCompletionTokenDetails(BaseModel):
+    reasoning_tokens: int = 0
+    # NOTE: OAI specific
+    # audio_tokens: int = 0
+    # accepted_prediction_tokens: int = 0
+    # rejected_prediction_tokens: int = 0
+
+    def __add__(self, other: "UsageStatisticsCompletionTokenDetails") -> "UsageStatisticsCompletionTokenDetails":
+        return UsageStatisticsCompletionTokenDetails(
+            reasoning_tokens=self.reasoning_tokens + other.reasoning_tokens,
+        )
+
+
 class UsageStatistics(BaseModel):
     completion_tokens: int = 0
     prompt_tokens: int = 0
     total_tokens: int = 0
 
+    prompt_tokens_details: Optional[UsageStatisticsPromptTokenDetails] = None
+    completion_tokens_details: Optional[UsageStatisticsCompletionTokenDetails] = None
+
     def __add__(self, other: "UsageStatistics") -> "UsageStatistics":
+
+        if self.prompt_tokens_details is None and other.prompt_tokens_details is None:
+            total_prompt_tokens_details = None
+        elif self.prompt_tokens_details is None:
+            total_prompt_tokens_details = other.prompt_tokens_details
+        elif other.prompt_tokens_details is None:
+            total_prompt_tokens_details = self.prompt_tokens_details
+        else:
+            total_prompt_tokens_details = self.prompt_tokens_details + other.prompt_tokens_details
+
+        if self.completion_tokens_details is None and other.completion_tokens_details is None:
+            total_completion_tokens_details = None
+        elif self.completion_tokens_details is None:
+            total_completion_tokens_details = other.completion_tokens_details
+        elif other.completion_tokens_details is None:
+            total_completion_tokens_details = self.completion_tokens_details
+        else:
+            total_completion_tokens_details = self.completion_tokens_details + other.completion_tokens_details
+
         return UsageStatistics(
             completion_tokens=self.completion_tokens + other.completion_tokens,
             prompt_tokens=self.prompt_tokens + other.prompt_tokens,
             total_tokens=self.total_tokens + other.total_tokens,
+            prompt_tokens_details=total_prompt_tokens_details,
+            completion_tokens_details=total_completion_tokens_details,
         )
 
 
@@ -70,7 +119,7 @@ class ChatCompletionResponse(BaseModel):
 
     id: str
     choices: List[Choice]
-    created: datetime.datetime
+    created: Union[datetime.datetime, int]
     model: Optional[str] = None  # NOTE: this is not consistent with OpenAI API standard, however is necessary to support local LLMs
     # system_fingerprint: str  # docs say this is mandatory, but in reality API returns None
     system_fingerprint: Optional[str] = None
@@ -138,7 +187,7 @@ class ChatCompletionChunkResponse(BaseModel):
 
     id: str
     choices: List[ChunkChoice]
-    created: Union[datetime.datetime, str]
+    created: Union[datetime.datetime, int]
     model: str
     # system_fingerprint: str  # docs say this is mandatory, but in reality API returns None
     system_fingerprint: Optional[str] = None
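The __add__ overloads above make usage accumulation across steps null-safe; an illustrative sketch using the classes defined in this hunk:

    step1 = UsageStatistics(
        prompt_tokens=100, completion_tokens=20, total_tokens=120,
        prompt_tokens_details=UsageStatisticsPromptTokenDetails(cached_tokens=80),
    )
    step2 = UsageStatistics(prompt_tokens=50, completion_tokens=10, total_tokens=60)  # no details

    total = step1 + step2
    # total.prompt_tokens == 150, total.total_tokens == 180
    # total.prompt_tokens_details.cached_tokens == 80 (the non-None side is carried through)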
letta/server/rest_api/chat_completions_interface.py CHANGED
@@ -238,7 +238,7 @@ class ChatCompletionsStreamingInterface(AgentChunkStreamingInterface):
         return ChatCompletionChunk(
             id=chunk.id,
             object=chunk.object,
-            created=chunk.created.timestamp(),
+            created=chunk.created,
             model=chunk.model,
             choices=[
                 Choice(
@@ -256,7 +256,7 @@ class ChatCompletionsStreamingInterface(AgentChunkStreamingInterface):
         return ChatCompletionChunk(
             id=chunk.id,
             object=chunk.object,
-            created=chunk.created.timestamp(),
+            created=chunk.created,
             model=chunk.model,
             choices=[
                 Choice(
letta/server/rest_api/interface.py CHANGED
@@ -1001,7 +1001,7 @@ class StreamingServerInterface(AgentChunkStreamingInterface):
             # Example case that would trigger here:
             # id='chatcmpl-AKtUvREgRRvgTW6n8ZafiKuV0mxhQ'
             # choices=[ChunkChoice(finish_reason=None, index=0, delta=MessageDelta(content=None, tool_calls=None, function_call=None), logprobs=None)]
-            # created=datetime.datetime(2024, 10, 21, 20, 40, 57, tzinfo=TzInfo(UTC))
+            # created=1713216662
             # model='gpt-4o-mini-2024-07-18'
             # object='chat.completion.chunk'
             warnings.warn(f"Couldn't find delta in chunk: {chunk}")
letta/server/rest_api/routers/v1/messages.py CHANGED
@@ -1,6 +1,6 @@
 from typing import List, Optional
 
-from fastapi import APIRouter, Body, Depends, Header
+from fastapi import APIRouter, Body, Depends, Header, status
 from fastapi.exceptions import HTTPException
 from starlette.requests import Request
 
@@ -11,6 +11,7 @@ from letta.schemas.job import BatchJob, JobStatus, JobType, JobUpdate
 from letta.schemas.letta_request import CreateBatch
 from letta.server.rest_api.utils import get_letta_server
 from letta.server.server import SyncServer
+from letta.settings import settings
 
 router = APIRouter(prefix="/messages", tags=["messages"])
 
@@ -43,6 +44,13 @@ async def create_messages_batch(
     if length > max_bytes:
         raise HTTPException(status_code=413, detail=f"Request too large ({length} bytes). Max is {max_bytes} bytes.")
 
+    # Reject request if env var is not set
+    if not settings.enable_batch_job_polling:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Server misconfiguration: LETTA_ENABLE_BATCH_JOB_POLLING is set to False.",
+        )
+
     actor = server.user_manager.get_user_or_default(user_id=actor_id)
     batch_job = BatchJob(
         user_id=actor.id,
letta/server/server.py CHANGED
@@ -766,12 +766,7 @@ class SyncServer(Server):
             memory_blocks=[
                 CreateBlock(
                     label="memory_persona",
-                    value=(
-                        "I am an expert conversation memory manager. "
-                        "I manage the memory blocks such that they "
-                        "contain everything that is important about "
-                        "the conversation."
-                    ),
+                    value=get_persona_text("sleeptime_memory_persona"),
                 ),
             ],
             llm_config=main_agent.llm_config,
letta/services/agent_manager.py CHANGED
@@ -161,7 +161,7 @@ class AgentManager:
     # Basic CRUD operations
     # ======================================================================================================================
     @trace_method
-    def create_agent(self, agent_create: CreateAgent, actor: PydanticUser) -> PydanticAgentState:
+    def create_agent(self, agent_create: CreateAgent, actor: PydanticUser, _test_only_force_id: Optional[str] = None) -> PydanticAgentState:
         # validate required configs
         if not agent_create.llm_config or not agent_create.embedding_config:
             raise ValueError("llm_config and embedding_config are required")
@@ -236,9 +236,14 @@
                 base_template_id=agent_create.base_template_id,
                 message_buffer_autoclear=agent_create.message_buffer_autoclear,
                 enable_sleeptime=agent_create.enable_sleeptime,
+                response_format=agent_create.response_format,
                 created_by_id=actor.id,
                 last_updated_by_id=actor.id,
             )
+
+            if _test_only_force_id:
+                new_agent.id = _test_only_force_id
+
             session.add(new_agent)
             session.flush()
             aid = new_agent.id
{letta_nightly-0.7.1.dev20250423104245.dist-info → letta_nightly-0.7.3.dev20250424054013.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: letta-nightly
-Version: 0.7.1.dev20250423104245
+Version: 0.7.3.dev20250424054013
 Summary: Create LLM agents with long-term memory and custom tools
 License: Apache License
 Author: Letta Team