letta-nightly 0.7.1.dev20250423104245__py3-none-any.whl → 0.7.3.dev20250424054013__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- letta/__init__.py +1 -1
- letta/agent.py +2 -1
- letta/agents/letta_agent.py +2 -1
- letta/agents/letta_agent_batch.py +8 -3
- letta/agents/voice_agent.py +2 -2
- letta/client/client.py +3 -0
- letta/functions/functions.py +2 -1
- letta/functions/schema_generator.py +5 -0
- letta/helpers/composio_helpers.py +1 -1
- letta/helpers/datetime_helpers.py +9 -0
- letta/jobs/llm_batch_job_polling.py +2 -1
- letta/llm_api/anthropic.py +10 -6
- letta/llm_api/anthropic_client.py +7 -6
- letta/llm_api/cohere.py +2 -2
- letta/llm_api/google_ai_client.py +11 -45
- letta/llm_api/google_vertex_client.py +8 -7
- letta/llm_api/llm_client.py +8 -14
- letta/llm_api/llm_client_base.py +17 -16
- letta/llm_api/openai.py +11 -4
- letta/llm_api/openai_client.py +47 -14
- letta/local_llm/chat_completion_proxy.py +2 -2
- letta/memory.py +2 -1
- letta/personas/examples/sleeptime_memory_persona.txt +5 -0
- letta/schemas/enums.py +3 -0
- letta/schemas/letta_message_content.py +2 -1
- letta/schemas/llm_config.py +12 -2
- letta/schemas/message.py +17 -0
- letta/schemas/openai/chat_completion_response.py +52 -3
- letta/server/rest_api/chat_completions_interface.py +2 -2
- letta/server/rest_api/interface.py +1 -1
- letta/server/rest_api/routers/v1/messages.py +9 -1
- letta/server/server.py +1 -6
- letta/services/agent_manager.py +6 -1
- {letta_nightly-0.7.1.dev20250423104245.dist-info → letta_nightly-0.7.3.dev20250424054013.dist-info}/METADATA +1 -1
- {letta_nightly-0.7.1.dev20250423104245.dist-info → letta_nightly-0.7.3.dev20250424054013.dist-info}/RECORD +38 -38
- letta/personas/examples/offline_memory_persona.txt +0 -4
- {letta_nightly-0.7.1.dev20250423104245.dist-info → letta_nightly-0.7.3.dev20250424054013.dist-info}/LICENSE +0 -0
- {letta_nightly-0.7.1.dev20250423104245.dist-info → letta_nightly-0.7.3.dev20250424054013.dist-info}/WHEEL +0 -0
- {letta_nightly-0.7.1.dev20250423104245.dist-info → letta_nightly-0.7.3.dev20250424054013.dist-info}/entry_points.txt +0 -0
letta/llm_api/llm_client_base.py
CHANGED
@@ -20,17 +20,16 @@ class LLMClientBase:
 
     def __init__(
         self,
-        llm_config: LLMConfig,
         put_inner_thoughts_first: Optional[bool] = True,
         use_tool_naming: bool = True,
     ):
-        self.llm_config = llm_config
         self.put_inner_thoughts_first = put_inner_thoughts_first
         self.use_tool_naming = use_tool_naming
 
     def send_llm_request(
         self,
         messages: List[Message],
+        llm_config: LLMConfig,
         tools: Optional[List[dict]] = None,  # TODO: change to Tool object
         stream: bool = False,
         force_tool_call: Optional[str] = None,
@@ -40,23 +39,24 @@ class LLMClientBase:
         If stream=True, returns a Stream[ChatCompletionChunk] that can be iterated over.
         Otherwise returns a ChatCompletionResponse.
         """
-        request_data = self.build_request_data(messages,
+        request_data = self.build_request_data(messages, llm_config, tools, force_tool_call)
 
         try:
             log_event(name="llm_request_sent", attributes=request_data)
             if stream:
-                return self.stream(request_data)
+                return self.stream(request_data, llm_config)
             else:
-                response_data = self.request(request_data)
+                response_data = self.request(request_data, llm_config)
                 log_event(name="llm_response_received", attributes=response_data)
         except Exception as e:
             raise self.handle_llm_error(e)
 
-        return self.convert_response_to_chat_completion(response_data, messages)
+        return self.convert_response_to_chat_completion(response_data, messages, llm_config)
 
     async def send_llm_request_async(
         self,
         messages: List[Message],
+        llm_config: LLMConfig,
         tools: Optional[List[dict]] = None,  # TODO: change to Tool object
         stream: bool = False,
         force_tool_call: Optional[str] = None,
@@ -66,19 +66,19 @@ class LLMClientBase:
         If stream=True, returns an AsyncStream[ChatCompletionChunk] that can be async iterated over.
         Otherwise returns a ChatCompletionResponse.
         """
-        request_data = self.build_request_data(messages,
+        request_data = self.build_request_data(messages, llm_config, tools, force_tool_call)
 
         try:
             log_event(name="llm_request_sent", attributes=request_data)
             if stream:
-                return await self.stream_async(request_data)
+                return await self.stream_async(request_data, llm_config)
             else:
-                response_data = await self.request_async(request_data)
+                response_data = await self.request_async(request_data, llm_config)
                 log_event(name="llm_response_received", attributes=response_data)
         except Exception as e:
             raise self.handle_llm_error(e)
 
-        return self.convert_response_to_chat_completion(response_data, messages)
+        return self.convert_response_to_chat_completion(response_data, messages, llm_config)
 
     async def send_llm_batch_request_async(
         self,
@@ -102,14 +102,14 @@ class LLMClientBase:
         raise NotImplementedError
 
     @abstractmethod
-    def request(self, request_data: dict) -> dict:
+    def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
         """
         Performs underlying request to llm and returns raw response.
         """
         raise NotImplementedError
 
     @abstractmethod
-    async def request_async(self, request_data: dict) -> dict:
+    async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
         """
         Performs underlying request to llm and returns raw response.
         """
@@ -120,6 +120,7 @@ class LLMClientBase:
         self,
         response_data: dict,
         input_messages: List[Message],
+        llm_config: LLMConfig,
     ) -> ChatCompletionResponse:
         """
         Converts custom response format from llm client into an OpenAI
@@ -128,18 +129,18 @@ class LLMClientBase:
         raise NotImplementedError
 
     @abstractmethod
-    def stream(self, request_data: dict) -> Stream[ChatCompletionChunk]:
+    def stream(self, request_data: dict, llm_config: LLMConfig) -> Stream[ChatCompletionChunk]:
         """
         Performs underlying streaming request to llm and returns raw response.
         """
-        raise NotImplementedError(f"Streaming is not supported for {
+        raise NotImplementedError(f"Streaming is not supported for {llm_config.model_endpoint_type}")
 
     @abstractmethod
-    async def stream_async(self, request_data: dict) -> AsyncStream[ChatCompletionChunk]:
+    async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk]:
         """
         Performs underlying streaming request to llm and returns raw response.
         """
-        raise NotImplementedError(f"Streaming is not supported for {
+        raise NotImplementedError(f"Streaming is not supported for {llm_config.model_endpoint_type}")
 
     @abstractmethod
     def handle_llm_error(self, e: Exception) -> Exception:
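The practical effect of this change is that an `LLMClientBase` subclass no longer carries an `llm_config` on the instance; the config is now threaded through every call. A minimal sketch of the new calling pattern, mirroring how `letta/memory.py` uses it later in this diff (the `agent_state` and `in_context_messages` names are placeholders, and the `LLMClient` import path is assumed from the file list, not shown here):

```python
from letta.llm_api.llm_client import LLMClient  # assumed location of the factory

# Placeholder inputs for illustration only
llm_config = agent_state.llm_config
llm_client = LLMClient.create(
    provider=llm_config.model_endpoint_type,
    put_inner_thoughts_first=True,
)

if llm_client:
    # llm_config is now a per-request argument rather than client state
    response = llm_client.send_llm_request(
        messages=in_context_messages,
        llm_config=llm_config,
        stream=False,
    )
```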
letta/llm_api/openai.py
CHANGED
@@ -4,7 +4,9 @@ from typing import Generator, List, Optional, Union
 import requests
 from openai import OpenAI
 
+from letta.helpers.datetime_helpers import timestamp_to_datetime
 from letta.llm_api.helpers import add_inner_thoughts_to_functions, convert_to_structured_output, make_post_request
+from letta.llm_api.openai_client import supports_parallel_tool_calling, supports_temperature_param
 from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION, INNER_THOUGHTS_KWARG_DESCRIPTION_GO_FIRST
 from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
 from letta.log import get_logger
@@ -135,7 +137,7 @@ def build_openai_chat_completions_request(
             tool_choice=tool_choice,
             user=str(user_id),
             max_completion_tokens=llm_config.max_tokens,
-            temperature=
+            temperature=llm_config.temperature if supports_temperature_param(model) else None,
             reasoning_effort=llm_config.reasoning_effort,
         )
     else:
@@ -237,7 +239,7 @@ def openai_chat_completions_process_stream(
     chat_completion_response = ChatCompletionResponse(
         id=dummy_message.id if create_message_id else TEMP_STREAM_RESPONSE_ID,
         choices=[],
-        created=dummy_message.created_at,  # NOTE: doesn't matter since both will do get_utc_time()
+        created=int(dummy_message.created_at.timestamp()),  # NOTE: doesn't matter since both will do get_utc_time()
         model=chat_completion_request.model,
         usage=UsageStatistics(
             completion_tokens=0,
@@ -274,7 +276,11 @@ def openai_chat_completions_process_stream(
             message_type = stream_interface.process_chunk(
                 chat_completion_chunk,
                 message_id=chat_completion_response.id if create_message_id else chat_completion_chunk.id,
-                message_date=
+                message_date=(
+                    timestamp_to_datetime(chat_completion_response.created)
+                    if create_message_datetime
+                    else timestamp_to_datetime(chat_completion_chunk.created)
+                ),
                 expect_reasoning_content=expect_reasoning_content,
                 name=name,
                 message_index=message_idx,
@@ -489,6 +495,7 @@ def prepare_openai_payload(chat_completion_request: ChatCompletionRequest):
         # except ValueError as e:
         # warnings.warn(f"Failed to convert tool function to structured output, tool={tool}, error={e}")
 
-    if
+    if not supports_parallel_tool_calling(chat_completion_request.model):
         data.pop("parallel_tool_calls", None)
+
     return data
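`openai.py` now leans on `letta.helpers.datetime_helpers` for converting between integer `created` timestamps and datetimes (that module gained 9 lines in this release, but its hunk is not captured here). The sketch below is only an assumption about what `timestamp_to_datetime` and `get_utc_time_int` plausibly do, inferred from how they are called in this diff:

```python
# Assumed behavior of the helpers referenced in this diff; the real
# implementations in letta/helpers/datetime_helpers.py are not shown here.
from datetime import datetime, timezone


def get_utc_time_int() -> int:
    # Current UTC time as a whole-second Unix timestamp, matching the
    # integer `created` field used by OpenAI-style chat completions.
    return int(datetime.now(timezone.utc).timestamp())


def timestamp_to_datetime(ts: int) -> datetime:
    # Inverse direction: turn an integer `created` timestamp back into a
    # timezone-aware UTC datetime for message bookkeeping.
    return datetime.fromtimestamp(ts, tz=timezone.utc)
```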
letta/llm_api/openai_client.py
CHANGED
@@ -34,12 +34,39 @@ from letta.settings import model_settings
 logger = get_logger(__name__)
 
 
+def is_openai_reasoning_model(model: str) -> bool:
+    """Utility function to check if the model is a 'reasoner'"""
+
+    # NOTE: needs to be updated with new model releases
+    return model.startswith("o1") or model.startswith("o3")
+
+
+def supports_temperature_param(model: str) -> bool:
+    """Certain OpenAI models don't support configuring the temperature.
+
+    Example error: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is not supported with this model.", 'type': 'invalid_request_error', 'param': 'temperature', 'code': 'unsupported_parameter'}}
+    """
+    if is_openai_reasoning_model(model):
+        return False
+    else:
+        return True
+
+
+def supports_parallel_tool_calling(model: str) -> bool:
+    """Certain OpenAI models don't support parallel tool calls."""
+
+    if is_openai_reasoning_model(model):
+        return False
+    else:
+        return True
+
+
 class OpenAIClient(LLMClientBase):
-    def _prepare_client_kwargs(self) -> dict:
+    def _prepare_client_kwargs(self, llm_config: LLMConfig) -> dict:
         api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
         # supposedly the openai python client requires a dummy API key
         api_key = api_key or "DUMMY_API_KEY"
-        kwargs = {"api_key": api_key, "base_url":
+        kwargs = {"api_key": api_key, "base_url": llm_config.model_endpoint}
 
         return kwargs
 
@@ -66,7 +93,8 @@ class OpenAIClient(LLMClientBase):
             put_inner_thoughts_first=True,
         )
 
-        use_developer_message =
+        use_developer_message = is_openai_reasoning_model(llm_config.model)
+
         openai_message_list = [
             cast_message_to_subtype(
                 m.to_openai_dict(
@@ -87,7 +115,7 @@ class OpenAIClient(LLMClientBase):
         # TODO(matt) move into LLMConfig
         # TODO: This vllm checking is very brittle and is a patch at most
         tool_choice = None
-        if llm_config.model_endpoint == "https://inference.memgpt.ai" or (llm_config.handle and "vllm" in
+        if llm_config.model_endpoint == "https://inference.memgpt.ai" or (llm_config.handle and "vllm" in llm_config.handle):
             tool_choice = "auto"  # TODO change to "required" once proxy supports it
         elif tools:
             # only set if tools is non-Null
@@ -103,7 +131,7 @@ class OpenAIClient(LLMClientBase):
             tool_choice=tool_choice,
             user=str(),
             max_completion_tokens=llm_config.max_tokens,
-            temperature=llm_config.temperature,
+            temperature=llm_config.temperature if supports_temperature_param(model) else None,
         )
 
         if "inference.memgpt.ai" in llm_config.model_endpoint:
@@ -124,20 +152,20 @@ class OpenAIClient(LLMClientBase):
 
         return data.model_dump(exclude_unset=True)
 
-    def request(self, request_data: dict) -> dict:
+    def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
         """
         Performs underlying synchronous request to OpenAI API and returns raw response dict.
         """
-        client = OpenAI(**self._prepare_client_kwargs())
+        client = OpenAI(**self._prepare_client_kwargs(llm_config))
 
         response: ChatCompletion = client.chat.completions.create(**request_data)
         return response.model_dump()
 
-    async def request_async(self, request_data: dict) -> dict:
+    async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
         """
         Performs underlying asynchronous request to OpenAI API and returns raw response dict.
         """
-        client = AsyncOpenAI(**self._prepare_client_kwargs())
+        client = AsyncOpenAI(**self._prepare_client_kwargs(llm_config))
         response: ChatCompletion = await client.chat.completions.create(**request_data)
         return response.model_dump()
 
@@ -145,6 +173,7 @@ class OpenAIClient(LLMClientBase):
         self,
         response_data: dict,
         input_messages: List[PydanticMessage],  # Included for consistency, maybe used later
+        llm_config: LLMConfig,
     ) -> ChatCompletionResponse:
         """
         Converts raw OpenAI response dict into the ChatCompletionResponse Pydantic model.
@@ -155,26 +184,30 @@ class OpenAIClient(LLMClientBase):
         chat_completion_response = ChatCompletionResponse(**response_data)
 
         # Unpack inner thoughts if they were embedded in function arguments
-        if
+        if llm_config.put_inner_thoughts_in_kwargs:
             chat_completion_response = unpack_all_inner_thoughts_from_kwargs(
                 response=chat_completion_response, inner_thoughts_key=INNER_THOUGHTS_KWARG
             )
 
+        # If we used a reasoning model, create a content part for the ommitted reasoning
+        if is_openai_reasoning_model(llm_config.model):
+            chat_completion_response.choices[0].message.ommitted_reasoning_content = True
+
         return chat_completion_response
 
-    def stream(self, request_data: dict) -> Stream[ChatCompletionChunk]:
+    def stream(self, request_data: dict, llm_config: LLMConfig) -> Stream[ChatCompletionChunk]:
         """
         Performs underlying streaming request to OpenAI and returns the stream iterator.
         """
-        client = OpenAI(**self._prepare_client_kwargs())
+        client = OpenAI(**self._prepare_client_kwargs(llm_config))
         response_stream: Stream[ChatCompletionChunk] = client.chat.completions.create(**request_data, stream=True)
         return response_stream
 
-    async def stream_async(self, request_data: dict) -> AsyncStream[ChatCompletionChunk]:
+    async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk]:
         """
         Performs underlying asynchronous streaming request to OpenAI and returns the async stream iterator.
        """
-        client = AsyncOpenAI(**self._prepare_client_kwargs())
+        client = AsyncOpenAI(**self._prepare_client_kwargs(llm_config))
         response_stream: AsyncStream[ChatCompletionChunk] = await client.chat.completions.create(**request_data, stream=True)
         return response_stream
 
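The three helpers added at the top of `openai_client.py` are pure functions of the model name, so their effect is easy to check in isolation; the expected values below follow directly from the definitions in the hunk above:

```python
from letta.llm_api.openai_client import (
    is_openai_reasoning_model,
    supports_parallel_tool_calling,
    supports_temperature_param,
)

# o1/o3-style models are treated as "reasoners": no temperature, no parallel tool calls
assert is_openai_reasoning_model("o3-mini")
assert not supports_temperature_param("o3-mini")
assert not supports_parallel_tool_calling("o1-preview")

# Other models keep the previous behavior
assert supports_temperature_param("gpt-4o-mini")
assert supports_parallel_tool_calling("gpt-4o-mini")
```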
letta/local_llm/chat_completion_proxy.py
CHANGED
@@ -6,7 +6,7 @@ import requests
 
 from letta.constants import CLI_WARNING_PREFIX
 from letta.errors import LocalLLMConnectionError, LocalLLMError
-from letta.helpers.datetime_helpers import
+from letta.helpers.datetime_helpers import get_utc_time_int
 from letta.helpers.json_helpers import json_dumps
 from letta.local_llm.constants import DEFAULT_WRAPPER
 from letta.local_llm.function_parser import patch_function
@@ -241,7 +241,7 @@ def get_chat_completion(
                 ),
             )
         ],
-        created=
+        created=get_utc_time_int(),
         model=model,
         # "This fingerprint represents the backend configuration that the model runs with."
         # system_fingerprint=user if user is not None else "null",
letta/memory.py
CHANGED
@@ -79,7 +79,7 @@ def summarize_messages(
     llm_config_no_inner_thoughts.put_inner_thoughts_in_kwargs = False
 
     llm_client = LLMClient.create(
-
+        provider=llm_config_no_inner_thoughts.model_endpoint_type,
         put_inner_thoughts_first=False,
     )
     # try to use new client, otherwise fallback to old flow
@@ -87,6 +87,7 @@ def summarize_messages(
     if llm_client:
         response = llm_client.send_llm_request(
             messages=message_sequence,
+            llm_config=llm_config_no_inner_thoughts,
             stream=False,
         )
     else:
letta/personas/examples/sleeptime_memory_persona.txt
ADDED
@@ -0,0 +1,5 @@
+I am an expert conversation memory agent that can do the following:
+- Consolidate memories into more concise blocks
+- Identify patterns in user behavior
+- Make inferences based on the memory
+I manage the memory blocks such that they contain everything that is important about the conversation.
letta/schemas/letta_message_content.py
CHANGED
@@ -145,7 +145,8 @@ class OmittedReasoningContent(MessageContent):
     type: Literal[MessageContentType.omitted_reasoning] = Field(
         MessageContentType.omitted_reasoning, description="Indicates this is an omitted reasoning step."
     )
-
+    # NOTE: dropping because we don't track this kind of information for the other reasoning types
+    # tokens: int = Field(..., description="The reasoning token count for intermediate reasoning content.")
 
 
 LettaMessageContentUnion = Annotated[
letta/schemas/llm_config.py
CHANGED
@@ -81,8 +81,11 @@ class LLMConfig(BaseModel):
     @model_validator(mode="before")
     @classmethod
     def set_default_enable_reasoner(cls, values):
-
-
+        # NOTE: this is really only applicable for models that can toggle reasoning on-and-off, like 3.7
+        # We can also use this field to identify if a model is a "reasoning" model (o1/o3, etc.) if we want
+        # if any(openai_reasoner_model in values.get("model", "") for openai_reasoner_model in ["o3-mini", "o1"]):
+        #     values["enable_reasoner"] = True
+        #     values["put_inner_thoughts_in_kwargs"] = False
         return values
 
     @model_validator(mode="before")
@@ -100,6 +103,13 @@ class LLMConfig(BaseModel):
         if values.get("put_inner_thoughts_in_kwargs") is None:
             values["put_inner_thoughts_in_kwargs"] = False if model in avoid_put_inner_thoughts_in_kwargs else True
 
+        # For the o1/o3 series from OpenAI, set to False by default
+        # We can set this flag to `true` if desired, which will enable "double-think"
+        from letta.llm_api.openai_client import is_openai_reasoning_model
+
+        if is_openai_reasoning_model(model):
+            values["put_inner_thoughts_in_kwargs"] = False
+
         return values
 
     @model_validator(mode="after")
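With the validator change above, `put_inner_thoughts_in_kwargs` now defaults to False whenever the model name matches the o1/o3 pattern. A rough illustration; the exact set of required `LLMConfig` fields used here is an assumption, the point is the resulting flag:

```python
from letta.schemas.llm_config import LLMConfig

# Field values are illustrative; only the model name drives the new default.
config = LLMConfig(
    model="o3-mini",
    model_endpoint_type="openai",
    model_endpoint="https://api.openai.com/v1",
    context_window=200000,
)

# The "before" validator detects an OpenAI reasoning model and keeps inner
# thoughts out of tool-call kwargs unless explicitly re-enabled.
assert config.put_inner_thoughts_in_kwargs is False
```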
letta/schemas/message.py
CHANGED
@@ -31,6 +31,7 @@ from letta.schemas.letta_message import (
 )
 from letta.schemas.letta_message_content import (
     LettaMessageContentUnion,
+    OmittedReasoningContent,
     ReasoningContent,
     RedactedReasoningContent,
     TextContent,
@@ -295,6 +296,18 @@ class Message(BaseMessage):
                         sender_id=self.sender_id,
                     )
                 )
+            elif isinstance(content_part, OmittedReasoningContent):
+                # Special case for "hidden reasoning" models like o1/o3
+                # NOTE: we also have to think about how to return this during streaming
+                messages.append(
+                    HiddenReasoningMessage(
+                        id=self.id,
+                        date=self.created_at,
+                        state="omitted",
+                        name=self.name,
+                        otid=otid,
+                    )
+                )
             else:
                 warnings.warn(f"Unrecognized content part in assistant message: {content_part}")
 
@@ -464,6 +477,10 @@ class Message(BaseMessage):
                     data=openai_message_dict["redacted_reasoning_content"] if "redacted_reasoning_content" in openai_message_dict else None,
                 ),
             )
+        if "omitted_reasoning_content" in openai_message_dict and openai_message_dict["omitted_reasoning_content"]:
+            content.append(
+                OmittedReasoningContent(),
+            )
 
         # If we're going from deprecated function form
         if openai_message_dict["role"] == "function":
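Taken together, these hunks mean an OpenAI response flagged with `omitted_reasoning_content` round-trips into an `OmittedReasoningContent` part, which the assistant-message conversion then surfaces as a `HiddenReasoningMessage` with `state="omitted"`. A tiny sketch of the content-part side only (illustrative; the full conversion path is not reproduced here):

```python
from letta.schemas.letta_message_content import OmittedReasoningContent

# The new content part carries no payload; it only records that the provider
# performed reasoning but did not return it (OpenAI o1/o3 behavior).
part = OmittedReasoningContent()
print(part.type)  # MessageContentType.omitted_reasoning (the default set in the hunk above)
```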
letta/schemas/openai/chat_completion_response.py
CHANGED
@@ -39,9 +39,10 @@ class Message(BaseModel):
     tool_calls: Optional[List[ToolCall]] = None
     role: str
     function_call: Optional[FunctionCall] = None  # Deprecated
-    reasoning_content: Optional[str] = None  # Used in newer reasoning APIs
+    reasoning_content: Optional[str] = None  # Used in newer reasoning APIs, e.g. DeepSeek
     reasoning_content_signature: Optional[str] = None  # NOTE: for Anthropic
     redacted_reasoning_content: Optional[str] = None  # NOTE: for Anthropic
+    ommitted_reasoning_content: bool = False  # NOTE: for OpenAI o1/o3
 
 
 class Choice(BaseModel):
@@ -52,16 +53,64 @@ class Choice(BaseModel):
     seed: Optional[int] = None  # found in TogetherAI
 
 
+class UsageStatisticsPromptTokenDetails(BaseModel):
+    cached_tokens: int = 0
+    # NOTE: OAI specific
+    # audio_tokens: int = 0
+
+    def __add__(self, other: "UsageStatisticsPromptTokenDetails") -> "UsageStatisticsPromptTokenDetails":
+        return UsageStatisticsPromptTokenDetails(
+            cached_tokens=self.cached_tokens + other.cached_tokens,
+        )
+
+
+class UsageStatisticsCompletionTokenDetails(BaseModel):
+    reasoning_tokens: int = 0
+    # NOTE: OAI specific
+    # audio_tokens: int = 0
+    # accepted_prediction_tokens: int = 0
+    # rejected_prediction_tokens: int = 0
+
+    def __add__(self, other: "UsageStatisticsCompletionTokenDetails") -> "UsageStatisticsCompletionTokenDetails":
+        return UsageStatisticsCompletionTokenDetails(
+            reasoning_tokens=self.reasoning_tokens + other.reasoning_tokens,
+        )
+
+
 class UsageStatistics(BaseModel):
     completion_tokens: int = 0
     prompt_tokens: int = 0
     total_tokens: int = 0
 
+    prompt_tokens_details: Optional[UsageStatisticsPromptTokenDetails] = None
+    completion_tokens_details: Optional[UsageStatisticsCompletionTokenDetails] = None
+
     def __add__(self, other: "UsageStatistics") -> "UsageStatistics":
+
+        if self.prompt_tokens_details is None and other.prompt_tokens_details is None:
+            total_prompt_tokens_details = None
+        elif self.prompt_tokens_details is None:
+            total_prompt_tokens_details = other.prompt_tokens_details
+        elif other.prompt_tokens_details is None:
+            total_prompt_tokens_details = self.prompt_tokens_details
+        else:
+            total_prompt_tokens_details = self.prompt_tokens_details + other.prompt_tokens_details
+
+        if self.completion_tokens_details is None and other.completion_tokens_details is None:
+            total_completion_tokens_details = None
+        elif self.completion_tokens_details is None:
+            total_completion_tokens_details = other.completion_tokens_details
+        elif other.completion_tokens_details is None:
+            total_completion_tokens_details = self.completion_tokens_details
+        else:
+            total_completion_tokens_details = self.completion_tokens_details + other.completion_tokens_details
+
         return UsageStatistics(
             completion_tokens=self.completion_tokens + other.completion_tokens,
             prompt_tokens=self.prompt_tokens + other.prompt_tokens,
             total_tokens=self.total_tokens + other.total_tokens,
+            prompt_tokens_details=total_prompt_tokens_details,
+            completion_tokens_details=total_completion_tokens_details,
         )
 
 
@@ -70,7 +119,7 @@ class ChatCompletionResponse(BaseModel):
 
     id: str
     choices: List[Choice]
-    created: datetime.datetime
+    created: Union[datetime.datetime, int]
     model: Optional[str] = None  # NOTE: this is not consistent with OpenAI API standard, however is necessary to support local LLMs
     # system_fingerprint: str # docs say this is mandatory, but in reality API returns None
     system_fingerprint: Optional[str] = None
@@ -138,7 +187,7 @@ class ChatCompletionChunkResponse(BaseModel):
 
     id: str
     choices: List[ChunkChoice]
-    created: Union[datetime.datetime,
+    created: Union[datetime.datetime, int]
     model: str
     # system_fingerprint: str # docs say this is mandatory, but in reality API returns None
     system_fingerprint: Optional[str] = None
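The `__add__` overloads above keep per-step usage aggregatable even when only some responses report the nested token details; a quick example using only the classes defined in this hunk:

```python
from letta.schemas.openai.chat_completion_response import (
    UsageStatistics,
    UsageStatisticsCompletionTokenDetails,
)

first = UsageStatistics(completion_tokens=10, prompt_tokens=100, total_tokens=110)
second = UsageStatistics(
    completion_tokens=20,
    prompt_tokens=200,
    total_tokens=220,
    completion_tokens_details=UsageStatisticsCompletionTokenDetails(reasoning_tokens=15),
)

total = first + second
# Plain counters sum; a details object missing on one side is carried over
# from the other side rather than being dropped.
assert total.total_tokens == 330
assert total.completion_tokens_details.reasoning_tokens == 15
```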
letta/server/rest_api/chat_completions_interface.py
CHANGED
@@ -238,7 +238,7 @@ class ChatCompletionsStreamingInterface(AgentChunkStreamingInterface):
         return ChatCompletionChunk(
             id=chunk.id,
             object=chunk.object,
-            created=chunk.created
+            created=chunk.created,
             model=chunk.model,
             choices=[
                 Choice(
@@ -256,7 +256,7 @@ class ChatCompletionsStreamingInterface(AgentChunkStreamingInterface):
         return ChatCompletionChunk(
             id=chunk.id,
             object=chunk.object,
-            created=chunk.created
+            created=chunk.created,
             model=chunk.model,
             choices=[
                 Choice(
letta/server/rest_api/interface.py
CHANGED
@@ -1001,7 +1001,7 @@ class StreamingServerInterface(AgentChunkStreamingInterface):
             # Example case that would trigger here:
             # id='chatcmpl-AKtUvREgRRvgTW6n8ZafiKuV0mxhQ'
             # choices=[ChunkChoice(finish_reason=None, index=0, delta=MessageDelta(content=None, tool_calls=None, function_call=None), logprobs=None)]
-            # created=
+            # created=1713216662
             # model='gpt-4o-mini-2024-07-18'
             # object='chat.completion.chunk'
             warnings.warn(f"Couldn't find delta in chunk: {chunk}")
letta/server/rest_api/routers/v1/messages.py
CHANGED
@@ -1,6 +1,6 @@
 from typing import List, Optional
 
-from fastapi import APIRouter, Body, Depends, Header
+from fastapi import APIRouter, Body, Depends, Header, status
 from fastapi.exceptions import HTTPException
 from starlette.requests import Request
 
@@ -11,6 +11,7 @@ from letta.schemas.job import BatchJob, JobStatus, JobType, JobUpdate
 from letta.schemas.letta_request import CreateBatch
 from letta.server.rest_api.utils import get_letta_server
 from letta.server.server import SyncServer
+from letta.settings import settings
 
 router = APIRouter(prefix="/messages", tags=["messages"])
 
@@ -43,6 +44,13 @@ async def create_messages_batch(
     if length > max_bytes:
         raise HTTPException(status_code=413, detail=f"Request too large ({length} bytes). Max is {max_bytes} bytes.")
 
+    # Reject request if env var is not set
+    if not settings.enable_batch_job_polling:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Server misconfiguration: LETTA_ENABLE_BATCH_JOB_POLLING is set to False.",
+        )
+
     actor = server.user_manager.get_user_or_default(user_id=actor_id)
     batch_job = BatchJob(
         user_id=actor.id,
letta/server/server.py
CHANGED
@@ -766,12 +766,7 @@ class SyncServer(Server):
             memory_blocks=[
                 CreateBlock(
                     label="memory_persona",
-                    value=(
-                        "I am an expert conversation memory manager. "
-                        "I manage the memory blocks such that they "
-                        "contain everything that is important about "
-                        "the conversation."
-                    ),
+                    value=get_persona_text("sleeptime_memory_persona"),
                 ),
             ],
             llm_config=main_agent.llm_config,
letta/services/agent_manager.py
CHANGED
@@ -161,7 +161,7 @@ class AgentManager:
     # Basic CRUD operations
     # ======================================================================================================================
     @trace_method
-    def create_agent(self, agent_create: CreateAgent, actor: PydanticUser) -> PydanticAgentState:
+    def create_agent(self, agent_create: CreateAgent, actor: PydanticUser, _test_only_force_id: Optional[str] = None) -> PydanticAgentState:
         # validate required configs
         if not agent_create.llm_config or not agent_create.embedding_config:
             raise ValueError("llm_config and embedding_config are required")
@@ -236,9 +236,14 @@ class AgentManager:
                 base_template_id=agent_create.base_template_id,
                 message_buffer_autoclear=agent_create.message_buffer_autoclear,
                 enable_sleeptime=agent_create.enable_sleeptime,
+                response_format=agent_create.response_format,
                 created_by_id=actor.id,
                 last_updated_by_id=actor.id,
             )
+
+            if _test_only_force_id:
+                new_agent.id = _test_only_force_id
+
             session.add(new_agent)
             session.flush()
             aid = new_agent.id
|