letta-nightly 0.6.33.dev20250227104112__py3-none-any.whl → 0.6.34.dev20250228104059__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only.
Potentially problematic release: this version of letta-nightly might be problematic.
- letta/__init__.py +1 -1
- letta/agent.py +12 -2
- letta/llm_api/llm_api_tools.py +58 -1
- letta/llm_api/openai.py +19 -7
- letta/orm/sqlalchemy_base.py +5 -0
- letta/schemas/llm_config.py +2 -1
- letta/schemas/providers.py +67 -0
- letta/server/rest_api/chat_completions_interface.py +6 -26
- letta/server/rest_api/routers/openai/chat_completions/chat_completions.py +2 -285
- letta/server/rest_api/routers/v1/__init__.py +2 -0
- letta/server/rest_api/routers/v1/agents.py +1 -1
- letta/server/rest_api/routers/v1/voice.py +315 -0
- letta/server/rest_api/utils.py +72 -19
- letta/server/server.py +5 -1
- letta/services/identity_manager.py +2 -2
- letta/settings.py +3 -0
- {letta_nightly-0.6.33.dev20250227104112.dist-info → letta_nightly-0.6.34.dev20250228104059.dist-info}/METADATA +1 -1
- {letta_nightly-0.6.33.dev20250227104112.dist-info → letta_nightly-0.6.34.dev20250228104059.dist-info}/RECORD +21 -20
- {letta_nightly-0.6.33.dev20250227104112.dist-info → letta_nightly-0.6.34.dev20250228104059.dist-info}/LICENSE +0 -0
- {letta_nightly-0.6.33.dev20250227104112.dist-info → letta_nightly-0.6.34.dev20250228104059.dist-info}/WHEEL +0 -0
- {letta_nightly-0.6.33.dev20250227104112.dist-info → letta_nightly-0.6.34.dev20250228104059.dist-info}/entry_points.txt +0 -0
letta/__init__.py
CHANGED
letta/agent.py
CHANGED
@@ -832,7 +832,7 @@ class Agent(BaseAgent):
             )

         if current_total_tokens > summarizer_settings.memory_warning_threshold * int(self.agent_state.llm_config.context_window):
-
+            logger.warning(
                 f"{CLI_WARNING_PREFIX}last response total_tokens ({current_total_tokens}) > {summarizer_settings.memory_warning_threshold * int(self.agent_state.llm_config.context_window)}"
             )

@@ -842,7 +842,7 @@ class Agent(BaseAgent):
             self.agent_alerted_about_memory_pressure = True  # it's up to the outer loop to handle this

         else:
-
+            logger.info(
                 f"last response total_tokens ({current_total_tokens}) < {summarizer_settings.memory_warning_threshold * int(self.agent_state.llm_config.context_window)}"
             )

@@ -892,6 +892,16 @@ class Agent(BaseAgent):
         if is_context_overflow_error(e):
             in_context_messages = self.agent_manager.get_in_context_messages(agent_id=self.agent_state.id, actor=self.user)

+            # TODO: this is a patch to resolve immediate issues, should be removed once the summarizer is fixes
+            if self.agent_state.message_buffer_autoclear:
+                # no calling the summarizer in this case
+                logger.error(
+                    f"step() failed with an exception that looks like a context window overflow, but message buffer is set to autoclear, so skipping: '{str(e)}'"
+                )
+                raise e
+
+            summarize_attempt_count += 1
+
             if summarize_attempt_count <= summarizer_settings.max_summarizer_retries:
                 logger.warning(
                     f"context window exceeded with limit {self.agent_state.llm_config.context_window}, attempting to summarize ({summarize_attempt_count}/{summarizer_settings.max_summarizer_retries}"
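For reference, the warning above fires when the last response's total token count exceeds memory_warning_threshold × context_window. A quick illustration of that arithmetic (the 0.75 threshold and the token counts are made-up example values, not Letta's defaults):

# Example numbers only; 0.75 is an assumed threshold, not necessarily Letta's default.
context_window = 8000
memory_warning_threshold = 0.75
current_total_tokens = 6500

if current_total_tokens > memory_warning_threshold * int(context_window):
    print("memory pressure warning")  # fires because 6500 > 6000
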
letta/llm_api/llm_api_tools.py
CHANGED
@@ -187,8 +187,65 @@ def create(
             function_call = "required"

         data = build_openai_chat_completions_request(
-            llm_config,
+            llm_config,
+            messages,
+            user_id,
+            functions,
+            function_call,
+            use_tool_naming,
+            put_inner_thoughts_first=put_inner_thoughts_first,
+            use_structured_output=True,  # NOTE: turn on all the time for OpenAI API
         )
+
+        if stream:  # Client requested token streaming
+            data.stream = True
+            assert isinstance(stream_interface, AgentChunkStreamingInterface) or isinstance(
+                stream_interface, AgentRefreshStreamingInterface
+            ), type(stream_interface)
+            response = openai_chat_completions_process_stream(
+                url=llm_config.model_endpoint,
+                api_key=api_key,
+                chat_completion_request=data,
+                stream_interface=stream_interface,
+            )
+        else:  # Client did not request token streaming (expect a blocking backend response)
+            data.stream = False
+            if isinstance(stream_interface, AgentChunkStreamingInterface):
+                stream_interface.stream_start()
+            try:
+                response = openai_chat_completions_request(
+                    url=llm_config.model_endpoint,
+                    api_key=api_key,
+                    chat_completion_request=data,
+                )
+            finally:
+                if isinstance(stream_interface, AgentChunkStreamingInterface):
+                    stream_interface.stream_end()
+
+        if llm_config.put_inner_thoughts_in_kwargs:
+            response = unpack_all_inner_thoughts_from_kwargs(response=response, inner_thoughts_key=INNER_THOUGHTS_KWARG)
+
+        return response
+
+    elif llm_config.model_endpoint_type == "xai":
+
+        api_key = model_settings.xai_api_key
+
+        if function_call is None and functions is not None and len(functions) > 0:
+            # force function calling for reliability, see https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice
+            function_call = "required"
+
+        data = build_openai_chat_completions_request(
+            llm_config,
+            messages,
+            user_id,
+            functions,
+            function_call,
+            use_tool_naming,
+            put_inner_thoughts_first=put_inner_thoughts_first,
+            use_structured_output=False,  # NOTE: not supported atm for xAI
+        )
+
         if stream:  # Client requested token streaming
             data.stream = True
             assert isinstance(stream_interface, AgentChunkStreamingInterface) or isinstance(
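The new "xai" branch reuses the OpenAI-compatible request path, with structured outputs turned off. As a rough sketch of what that amounts to outside of Letta, an equivalent raw call with the public openai SDK against xAI's endpoint could look like the following (the tool schema is illustrative; grok-2-1212 is the model name hardcoded in the provider diff below):

# Sketch only: an OpenAI-compatible chat.completions call against xAI's API with
# forced tool use, mirroring function_call = "required" above. Not Letta's code path.
from openai import OpenAI

client = OpenAI(api_key="XAI_API_KEY", base_url="https://api.x.ai/v1")
response = client.chat.completions.create(
    model="grok-2-1212",
    messages=[{"role": "user", "content": "Say hi"}],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "send_message",
                "description": "Send a message to the user",
                "parameters": {
                    "type": "object",
                    "properties": {"message": {"type": "string"}},
                    "required": ["message"],
                },
            },
        }
    ],
    tool_choice="required",
)
print(response.choices[0].message.tool_calls)
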
letta/llm_api/openai.py
CHANGED
@@ -13,7 +13,7 @@ from letta.schemas.message import Message as _Message
 from letta.schemas.message import MessageRole as _MessageRole
 from letta.schemas.openai.chat_completion_request import ChatCompletionRequest
 from letta.schemas.openai.chat_completion_request import FunctionCall as ToolFunctionChoiceFunctionCall
-from letta.schemas.openai.chat_completion_request import Tool, ToolFunctionChoice, cast_message_to_subtype
+from letta.schemas.openai.chat_completion_request import FunctionSchema, Tool, ToolFunctionChoice, cast_message_to_subtype
 from letta.schemas.openai.chat_completion_response import (
     ChatCompletionChunkResponse,
     ChatCompletionResponse,
@@ -95,6 +95,7 @@ def build_openai_chat_completions_request(
     function_call: Optional[str],
     use_tool_naming: bool,
     put_inner_thoughts_first: bool = True,
+    use_structured_output: bool = True,
 ) -> ChatCompletionRequest:
     if functions and llm_config.put_inner_thoughts_in_kwargs:
         # Special case for LM Studio backend since it needs extra guidance to force out the thoughts first
@@ -157,6 +158,16 @@ def build_openai_chat_completions_request(
         data.user = str(uuid.UUID(int=0))
         data.model = "memgpt-openai"

+    if use_structured_output and data.tools is not None and len(data.tools) > 0:
+        # Convert to structured output style (which has 'strict' and no optionals)
+        for tool in data.tools:
+            try:
+                # tool["function"] = convert_to_structured_output(tool["function"])
+                structured_output_version = convert_to_structured_output(tool.function.model_dump())
+                tool.function = FunctionSchema(**structured_output_version)
+            except ValueError as e:
+                warnings.warn(f"Failed to convert tool function to structured output, tool={tool}, error={e}")
+
     return data


@@ -455,11 +466,12 @@ def prepare_openai_payload(chat_completion_request: ChatCompletionRequest):
         data.pop("tools")
         data.pop("tool_choice", None)  # extra safe, should exist always (default="auto")

-
-
-
-
-
-
+    # # NOTE: move this out to wherever the ChatCompletionRequest is created
+    # if "tools" in data:
+    #     for tool in data["tools"]:
+    #         try:
+    #             tool["function"] = convert_to_structured_output(tool["function"])
+    #         except ValueError as e:
+    #             warnings.warn(f"Failed to convert tool function to structured output, tool={tool}, error={e}")

     return data
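The new use_structured_output flag rewrites each tool's function schema through convert_to_structured_output before the request is sent. The exact conversion lives in Letta's helpers, but it should roughly produce OpenAI's strict function-calling shape; a hedged before/after sketch (details of the real helper may differ):

# Hedged sketch of the strict-schema shape OpenAI structured outputs expect;
# the real convert_to_structured_output may differ in details.
loose = {
    "name": "send_message",
    "description": "Send a message to the user",
    "parameters": {
        "type": "object",
        "properties": {"message": {"type": "string"}},
    },
}

strict = {
    **loose,
    "strict": True,
    "parameters": {
        **loose["parameters"],
        "required": list(loose["parameters"]["properties"]),  # every property required
        "additionalProperties": False,  # no extra keys allowed
    },
}
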
letta/orm/sqlalchemy_base.py
CHANGED
@@ -69,6 +69,7 @@ class SqlalchemyBase(CommonSqlalchemyMetaMixins, Base):
         join_model: Optional[Base] = None,
         join_conditions: Optional[Union[Tuple, List]] = None,
         identifier_keys: Optional[List[str]] = None,
+        identifier_id: Optional[str] = None,
         **kwargs,
     ) -> List["SqlalchemyBase"]:
         """
@@ -147,6 +148,10 @@ class SqlalchemyBase(CommonSqlalchemyMetaMixins, Base):
         if identifier_keys and hasattr(cls, "identities"):
             query = query.join(cls.identities).filter(cls.identities.property.mapper.class_.identifier_key.in_(identifier_keys))

+        # given the identifier_id, we can find within the agents table any agents that have the identifier_id in their identity_ids
+        if identifier_id and hasattr(cls, "identities"):
+            query = query.join(cls.identities).filter(cls.identities.property.mapper.class_.id == identifier_id)
+
         # Apply filtering logic from kwargs
         for key, value in kwargs.items():
             if "." in key:
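The new identifier_id parameter adds one more join-based filter to the generic list query. For reference, the same pattern in plain SQLAlchemy 2.0, using a hypothetical Agent model with an identities relationship in place of Letta's ORM classes:

# Generic SQLAlchemy sketch of the join-and-filter pattern; Agent here is a
# hypothetical mapped class with an `identities` relationship, not Letta's model.
from sqlalchemy import select

def rows_for_identity(session, Agent, identifier_id: str):
    identity_cls = Agent.identities.property.mapper.class_  # class on the other side of the relationship
    stmt = select(Agent).join(Agent.identities).where(identity_cls.id == identifier_id)
    return session.scalars(stmt).all()
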
letta/schemas/llm_config.py
CHANGED
@@ -42,6 +42,7 @@ class LLMConfig(BaseModel):
         "together",  # completions endpoint
         "bedrock",
         "deepseek",
+        "xai",
     ] = Field(..., description="The endpoint type for the model.")
     model_endpoint: Optional[str] = Field(None, description="The endpoint for the model.")
     model_wrapper: Optional[str] = Field(None, description="The wrapper for the model.")
@@ -56,7 +57,7 @@
         description="The temperature to use when generating text with the model. A higher temperature will result in more random text.",
     )
     max_tokens: Optional[int] = Field(
-
+        4096,
         description="The maximum number of tokens to generate. If not set, the model will use its default value.",
     )

letta/schemas/providers.py
CHANGED
@@ -211,6 +211,63 @@ class OpenAIProvider(Provider):
         return None


+class xAIProvider(OpenAIProvider):
+    """https://docs.x.ai/docs/api-reference"""
+
+    name: str = "xai"
+    api_key: str = Field(..., description="API key for the xAI/Grok API.")
+    base_url: str = Field("https://api.x.ai/v1", description="Base URL for the xAI/Grok API.")
+
+    def get_model_context_window_size(self, model_name: str) -> Optional[int]:
+        # xAI doesn't return context window in the model listing,
+        # so these are hardcoded from their website
+        if model_name == "grok-2-1212":
+            return 131072
+        else:
+            return None
+
+    def list_llm_models(self) -> List[LLMConfig]:
+        from letta.llm_api.openai import openai_get_model_list
+
+        response = openai_get_model_list(self.base_url, api_key=self.api_key)
+
+        if "data" in response:
+            data = response["data"]
+        else:
+            data = response
+
+        configs = []
+        for model in data:
+            assert "id" in model, f"xAI/Grok model missing 'id' field: {model}"
+            model_name = model["id"]
+
+            # In case xAI starts supporting it in the future:
+            if "context_length" in model:
+                context_window_size = model["context_length"]
+            else:
+                context_window_size = self.get_model_context_window_size(model_name)
+
+            if not context_window_size:
+                warnings.warn(f"Couldn't find context window size for model {model_name}")
+                continue
+
+            configs.append(
+                LLMConfig(
+                    model=model_name,
+                    model_endpoint_type="xai",
+                    model_endpoint=self.base_url,
+                    context_window=context_window_size,
+                    handle=self.get_handle(model_name),
+                )
+            )
+
+        return configs
+
+    def list_embedding_models(self) -> List[EmbeddingConfig]:
+        # No embeddings supported
+        return []
+
+
 class DeepSeekProvider(OpenAIProvider):
     """
     DeepSeek ChatCompletions API is similar to OpenAI's reasoning API,
@@ -456,6 +513,13 @@ class AnthropicProvider(Provider):
                 warnings.warn(f"Couldn't find context window size for model {model['id']}, defaulting to 200,000")
                 model["context_window"] = 200000

+            max_tokens = 8192
+            if "claude-3-opus" in model["id"]:
+                max_tokens = 4096
+            if "claude-3-haiku" in model["id"]:
+                max_tokens = 4096
+            # TODO: set for 3-7 extended thinking mode
+
             # We set this to false by default, because Anthropic can
             # natively support <thinking> tags inside of content fields
             # However, putting COT inside of tool calls can make it more
@@ -472,6 +536,7 @@ class AnthropicProvider(Provider):
                     context_window=model["context_window"],
                     handle=self.get_handle(model["id"]),
                     put_inner_thoughts_in_kwargs=inner_thoughts_in_kwargs,
+                    max_tokens=max_tokens,
                 )
             )
         return configs
@@ -811,6 +876,7 @@ class GoogleAIProvider(Provider):
                     model_endpoint=self.base_url,
                     context_window=self.get_model_context_window(model),
                     handle=self.get_handle(model),
+                    max_tokens=8192,
                 )
             )
         return configs
@@ -862,6 +928,7 @@ class GoogleVertexProvider(Provider):
                     model_endpoint=f"https://{self.google_cloud_location}-aiplatform.googleapis.com/v1/projects/{self.google_cloud_project}/locations/{self.google_cloud_location}",
                     context_window=context_length,
                     handle=self.get_handle(model),
+                    max_tokens=8192,
                 )
             )
         return configs
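Together with the new "xai" literal in LLMConfig.model_endpoint_type above, this provider makes Grok models selectable like any other OpenAI-compatible backend. A hedged usage sketch (it assumes xAIProvider can be constructed with just an API key, as the field defaults suggest; the Provider base class may require more):

# Sketch only; field and method names come from the diff above, construction details are assumed.
import os

from letta.schemas.providers import xAIProvider

provider = xAIProvider(api_key=os.environ["XAI_API_KEY"])
for cfg in provider.list_llm_models():
    # Each entry is an LLMConfig with model_endpoint_type="xai", e.g. grok-2-1212
    # with the hardcoded 131072-token context window.
    print(cfg.model, cfg.context_window, cfg.model_endpoint)
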
letta/server/rest_api/chat_completions_interface.py
CHANGED
@@ -225,10 +225,10 @@ class ChatCompletionsStreamingInterface(AgentChunkStreamingInterface):
         combined_args = "".join(self.current_function_arguments)
         parsed_args = OptimisticJSONParser().parse(combined_args)

-
-
-
-        self.
+        if parsed_args.get(self.assistant_message_tool_kwarg) and parsed_args.get(
+            self.assistant_message_tool_kwarg
+        ) != self.current_json_parse_result.get(self.assistant_message_tool_kwarg):
+            self.current_json_parse_result = parsed_args
             return ChatCompletionChunk(
                 id=chunk.id,
                 object=chunk.object,
@@ -237,31 +237,11 @@ class ChatCompletionsStreamingInterface(AgentChunkStreamingInterface):
                 choices=[
                     Choice(
                         index=choice.index,
-                        delta=ChoiceDelta(),
-                        finish_reason=
+                        delta=ChoiceDelta(content=self.current_function_arguments[-1], role=self.ASSISTANT_STR),
+                        finish_reason=None,
                     )
                 ],
             )
-        else:
-            # If the parsed result is different
-            # This is an edge case we need to consider. E.g. if the last streamed token is '}', we shouldn't stream that out
-            if parsed_args != self.current_json_parse_result:
-                self.current_json_parse_result = parsed_args
-                # If we can see a "message" field, return it as partial content
-                if self.assistant_message_tool_kwarg in parsed_args and parsed_args[self.assistant_message_tool_kwarg]:
-                    return ChatCompletionChunk(
-                        id=chunk.id,
-                        object=chunk.object,
-                        created=chunk.created.timestamp(),
-                        model=chunk.model,
-                        choices=[
-                            Choice(
-                                index=choice.index,
-                                delta=ChoiceDelta(content=self.current_function_arguments[-1], role=self.ASSISTANT_STR),
-                                finish_reason=None,
-                            )
-                        ],
-                    )

         # If there's a finish reason, pass that along
         if choice.finish_reason is not None:
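The rewritten branch streams the latest tool-argument fragment as assistant content only when the optimistically parsed assistant_message_tool_kwarg value has actually changed, which keeps trailing JSON tokens like '}' out of the stream. A self-contained toy version of that idea (parse_partial below is a simplistic stand-in for Letta's OptimisticJSONParser, not its implementation):

# Toy illustration of the dedup-and-emit logic; not Letta's parser.
import json

def parse_partial(buf: str) -> dict:
    """Best-effort parse of a possibly incomplete JSON object."""
    for candidate in (buf, buf + '"}', buf + "}"):
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return {}

def stream_message_deltas(fragments, key="message"):
    buf, last = "", {}
    for frag in fragments:
        buf += frag
        parsed = parse_partial(buf)
        # Emit only when the tracked key is present and its parsed value changed,
        # so closing braces and other trailing syntax are never streamed as content.
        if parsed.get(key) and parsed.get(key) != last.get(key):
            last = parsed
            yield frag

print("".join(stream_message_deltas(['{"', 'message', '": "', 'Hi', ' there', '!', '"}'])))  # -> Hi there!
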
letta/server/rest_api/routers/openai/chat_completions/chat_completions.py
CHANGED
@@ -1,50 +1,19 @@
 import asyncio
-import json
-import uuid
 from typing import TYPE_CHECKING, List, Optional, Union

-import httpx
-import openai
 from fastapi import APIRouter, Body, Depends, Header, HTTPException
 from fastapi.responses import StreamingResponse
-from openai.types.chat.chat_completion_chunk import ChatCompletionChunk, Choice, ChoiceDelta
 from openai.types.chat.completion_create_params import CompletionCreateParams
-from starlette.concurrency import run_in_threadpool

 from letta.agent import Agent
-from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
-from letta.helpers.tool_execution_helper import (
-    add_pre_execution_message,
-    enable_strict_mode,
-    execute_external_tool,
-    remove_request_heartbeat,
-)
+from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
 from letta.log import get_logger
-from letta.orm.enums import ToolType
 from letta.schemas.message import Message, MessageCreate
-from letta.schemas.openai.chat_completion_request import (
-    AssistantMessage,
-    ChatCompletionRequest,
-    Tool,
-    ToolCall,
-    ToolCallFunction,
-    ToolMessage,
-    UserMessage,
-)
 from letta.schemas.user import User
 from letta.server.rest_api.chat_completions_interface import ChatCompletionsStreamingInterface
-from letta.server.rest_api.optimistic_json_parser import OptimisticJSONParser

 # TODO this belongs in a controller!
-from letta.server.rest_api.utils import (
-    convert_letta_messages_to_openai,
-    create_assistant_message_from_openai_response,
-    create_user_message,
-    get_letta_server,
-    get_messages_from_completion_request,
-    sse_async_generator,
-)
-from letta.settings import model_settings
+from letta.server.rest_api.utils import get_letta_server, get_messages_from_completion_request, sse_async_generator

 if TYPE_CHECKING:
     from letta.server.server import SyncServer
@@ -54,258 +23,6 @@ router = APIRouter(prefix="/v1", tags=["chat_completions"])
 logger = get_logger(__name__)


-@router.post(
-    "/fast/chat/completions",
-    response_model=None,
-    operation_id="create_fast_chat_completions",
-    responses={
-        200: {
-            "description": "Successful response",
-            "content": {
-                "text/event-stream": {"description": "Server-Sent Events stream"},
-            },
-        }
-    },
-)
-async def create_fast_chat_completions(
-    completion_request: CompletionCreateParams = Body(...),
-    server: "SyncServer" = Depends(get_letta_server),
-    user_id: Optional[str] = Header(None, alias="user_id"),
-):
-    actor = server.user_manager.get_user_or_default(user_id=user_id)
-
-    agent_id = str(completion_request.get("user", None))
-    if agent_id is None:
-        raise HTTPException(status_code=400, detail="Must pass agent_id in the 'user' field")
-
-    agent_state = server.agent_manager.get_agent_by_id(agent_id=agent_id, actor=actor)
-    if agent_state.llm_config.model_endpoint_type != "openai":
-        raise HTTPException(status_code=400, detail="Only OpenAI models are supported by this endpoint.")
-
-    # Convert Letta messages to OpenAI messages
-    in_context_messages = server.message_manager.get_messages_by_ids(message_ids=agent_state.message_ids, actor=actor)
-    openai_messages = convert_letta_messages_to_openai(in_context_messages)
-
-    # Also parse user input from completion_request and append
-    input_message = get_messages_from_completion_request(completion_request)[-1]
-    openai_messages.append(input_message)
-
-    # Tools we allow this agent to call
-    tools = [t for t in agent_state.tools if t.name not in LETTA_TOOL_SET and t.tool_type in {ToolType.EXTERNAL_COMPOSIO, ToolType.CUSTOM}]
-
-    # Initial request
-    openai_request = ChatCompletionRequest(
-        model=agent_state.llm_config.model,
-        messages=openai_messages,
-        # TODO: This nested thing here is so ugly, need to refactor
-        tools=(
-            [
-                Tool(type="function", function=enable_strict_mode(add_pre_execution_message(remove_request_heartbeat(t.json_schema))))
-                for t in tools
-            ]
-            if tools
-            else None
-        ),
-        tool_choice="auto",
-        user=user_id,
-        max_completion_tokens=agent_state.llm_config.max_tokens,
-        temperature=agent_state.llm_config.temperature,
-        stream=True,
-    )
-
-    # Create the OpenAI async client
-    client = openai.AsyncClient(
-        api_key=model_settings.openai_api_key,
-        max_retries=0,
-        http_client=httpx.AsyncClient(
-            timeout=httpx.Timeout(connect=15.0, read=30.0, write=15.0, pool=15.0),
-            follow_redirects=True,
-            limits=httpx.Limits(
-                max_connections=50,
-                max_keepalive_connections=50,
-                keepalive_expiry=120,
-            ),
-        ),
-    )
-
-    # The messages we want to persist to the Letta agent
-    user_message = create_user_message(input_message=input_message, agent_id=agent_id, actor=actor)
-    message_db_queue = [user_message]
-
-    async def event_stream():
-        """
-        A function-calling loop:
-        - We stream partial tokens.
-        - If we detect a tool call (finish_reason="tool_calls"), we parse it,
-          add two messages to the conversation:
-            (a) assistant message with tool_calls referencing the same ID
-            (b) a tool message referencing that ID, containing the tool result.
-        - Re-invoke the OpenAI request with updated conversation, streaming again.
-        - End when finish_reason="stop" or no more tool calls.
-        """
-
-        # We'll keep updating this conversation in a loop
-        conversation = openai_messages[:]
-
-        while True:
-            # Make the streaming request to OpenAI
-            stream = await client.chat.completions.create(**openai_request.model_dump(exclude_unset=True))
-
-            content_buffer = []
-            tool_call_name = None
-            tool_call_args_str = ""
-            tool_call_id = None
-            tool_call_happened = False
-            finish_reason_stop = False
-            optimistic_json_parser = OptimisticJSONParser(strict=True)
-            current_parsed_json_result = {}
-
-            async with stream:
-                async for chunk in stream:
-                    choice = chunk.choices[0]
-                    delta = choice.delta
-                    finish_reason = choice.finish_reason  # "tool_calls", "stop", or None
-
-                    if delta.content:
-                        content_buffer.append(delta.content)
-                        yield f"data: {chunk.model_dump_json()}\n\n"
-
-                    # CASE B: Partial tool call info
-                    if delta.tool_calls:
-                        # Typically there's only one in delta.tool_calls
-                        tc = delta.tool_calls[0]
-                        if tc.function.name:
-                            tool_call_name = tc.function.name
-                        if tc.function.arguments:
-                            tool_call_args_str += tc.function.arguments
-
-                            # See if we can stream out the pre-execution message
-                            parsed_args = optimistic_json_parser.parse(tool_call_args_str)
-                            if parsed_args.get(
-                                PRE_EXECUTION_MESSAGE_ARG
-                            ) and current_parsed_json_result.get(  # Ensure key exists and is not None/empty
-                                PRE_EXECUTION_MESSAGE_ARG
-                            ) != parsed_args.get(
-                                PRE_EXECUTION_MESSAGE_ARG
-                            ):
-                                # Only stream if there's something new to stream
-                                # We do this way to avoid hanging JSON at the end of the stream, e.g. '}'
-                                if parsed_args != current_parsed_json_result:
-                                    current_parsed_json_result = parsed_args
-                                    synthetic_chunk = ChatCompletionChunk(
-                                        id=chunk.id,
-                                        object=chunk.object,
-                                        created=chunk.created,
-                                        model=chunk.model,
-                                        choices=[
-                                            Choice(
-                                                index=choice.index,
-                                                delta=ChoiceDelta(content=tc.function.arguments, role="assistant"),
-                                                finish_reason=None,
-                                            )
-                                        ],
-                                    )
-
-                                    yield f"data: {synthetic_chunk.model_dump_json()}\n\n"
-
-                        # We might generate a unique ID for the tool call
-                        if tc.id:
-                            tool_call_id = tc.id
-
-                    # Check finish_reason
-                    if finish_reason == "tool_calls":
-                        tool_call_happened = True
-                        break
-                    elif finish_reason == "stop":
-                        finish_reason_stop = True
-                        break
-
-            if content_buffer:
-                # We treat that partial text as an assistant message
-                content = "".join(content_buffer)
-                conversation.append({"role": "assistant", "content": content})
-
-                # Create an assistant message here to persist later
-                assistant_message = create_assistant_message_from_openai_response(
-                    response_text=content, agent_id=agent_id, model=agent_state.llm_config.model, actor=actor
-                )
-                message_db_queue.append(assistant_message)
-
-            if tool_call_happened:
-                # Parse the tool call arguments
-                try:
-                    tool_args = json.loads(tool_call_args_str)
-                except json.JSONDecodeError:
-                    tool_args = {}
-
-                if not tool_call_id:
-                    # If no tool_call_id given by the model, generate one
-                    tool_call_id = f"call_{uuid.uuid4().hex[:8]}"
-
-                # 1) Insert the "assistant" message with the tool_calls field
-                #    referencing the same tool_call_id
-                assistant_tool_call_msg = AssistantMessage(
-                    content=None,
-                    tool_calls=[ToolCall(id=tool_call_id, function=ToolCallFunction(name=tool_call_name, arguments=tool_call_args_str))],
-                )
-
-                conversation.append(assistant_tool_call_msg.model_dump())
-
-                # 2) Execute the tool
-                target_tool = next((x for x in tools if x.name == tool_call_name), None)
-                if not target_tool:
-                    # Tool not found, handle error
-                    yield f"data: {json.dumps({'error': 'Tool not found', 'tool': tool_call_name})}\n\n"
-                    break
-
-                try:
-                    tool_result, _ = execute_external_tool(
-                        agent_state=agent_state,
-                        function_name=tool_call_name,
-                        function_args=tool_args,
-                        target_letta_tool=target_tool,
-                        actor=actor,
-                        allow_agent_state_modifications=False,
-                    )
-                except Exception as e:
-                    tool_result = f"Failed to call tool. Error: {e}"
-
-                # 3) Insert the "tool" message referencing the same tool_call_id
-                tool_message = ToolMessage(content=json.dumps({"result": tool_result}), tool_call_id=tool_call_id)
-
-                conversation.append(tool_message.model_dump())
-
-                # 4) Add a user message prompting the tool call result summarization
-                heartbeat_user_message = UserMessage(
-                    content=f"{NON_USER_MSG_PREFIX} Tool finished executing. Summarize the result for the user.",
-                )
-                conversation.append(heartbeat_user_message.model_dump())
-
-                # Now, re-invoke OpenAI with the updated conversation
-                openai_request.messages = conversation
-
-                continue  # Start the while loop again
-
-            if finish_reason_stop:
-                # Model is done, no more calls
-                break
-
-            # If we reach here, no tool call, no "stop", but we've ended streaming
-            # Possibly a model error or some other finish reason. We'll just end.
-            break
-
-        await run_in_threadpool(
-            server.agent_manager.append_to_in_context_messages,
-            message_db_queue,
-            agent_id=agent_id,
-            actor=actor,
-        )
-
-        yield "data: [DONE]\n\n"
-
-    return StreamingResponse(event_stream(), media_type="text/event-stream")
-
-
 @router.post(
     "/chat/completions",
     response_model=None,
letta/server/rest_api/routers/v1/__init__.py
CHANGED
@@ -11,6 +11,7 @@ from letta.server.rest_api.routers.v1.sources import router as sources_router
 from letta.server.rest_api.routers.v1.steps import router as steps_router
 from letta.server.rest_api.routers.v1.tags import router as tags_router
 from letta.server.rest_api.routers.v1.tools import router as tools_router
+from letta.server.rest_api.routers.v1.voice import router as voice_router

 ROUTERS = [
     tools_router,
@@ -26,4 +27,5 @@ ROUTERS = [
     runs_router,
     steps_router,
     tags_router,
+    voice_router,
 ]