letta-nightly 0.6.33.dev20250226104113__py3-none-any.whl → 0.6.34.dev20250227200331__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release.
letta/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "0.6.33"
+ __version__ = "0.6.34"

  # import clients
  from letta.client.client import LocalClient, RESTClient, create_client
letta/agent.py CHANGED
@@ -832,7 +832,7 @@ class Agent(BaseAgent):
  )

  if current_total_tokens > summarizer_settings.memory_warning_threshold * int(self.agent_state.llm_config.context_window):
- printd(
+ logger.warning(
  f"{CLI_WARNING_PREFIX}last response total_tokens ({current_total_tokens}) > {summarizer_settings.memory_warning_threshold * int(self.agent_state.llm_config.context_window)}"
  )

@@ -842,7 +842,7 @@ class Agent(BaseAgent):
  self.agent_alerted_about_memory_pressure = True # it's up to the outer loop to handle this

  else:
- printd(
+ logger.info(
  f"last response total_tokens ({current_total_tokens}) < {summarizer_settings.memory_warning_threshold * int(self.agent_state.llm_config.context_window)}"
  )

@@ -892,6 +892,16 @@ class Agent(BaseAgent):
  if is_context_overflow_error(e):
  in_context_messages = self.agent_manager.get_in_context_messages(agent_id=self.agent_state.id, actor=self.user)

+ # TODO: this is a patch to resolve immediate issues, should be removed once the summarizer is fixes
+ if self.agent_state.message_buffer_autoclear:
+ # no calling the summarizer in this case
+ logger.error(
+ f"step() failed with an exception that looks like a context window overflow, but message buffer is set to autoclear, so skipping: '{str(e)}'"
+ )
+ raise e
+
+ summarize_attempt_count += 1
+
  if summarize_attempt_count <= summarizer_settings.max_summarizer_retries:
  logger.warning(
  f"context window exceeded with limit {self.agent_state.llm_config.context_window}, attempting to summarize ({summarize_attempt_count}/{summarizer_settings.max_summarizer_retries}"
@@ -187,8 +187,65 @@ def create(
  function_call = "required"

  data = build_openai_chat_completions_request(
- llm_config, messages, user_id, functions, function_call, use_tool_naming, put_inner_thoughts_first=put_inner_thoughts_first
+ llm_config,
+ messages,
+ user_id,
+ functions,
+ function_call,
+ use_tool_naming,
+ put_inner_thoughts_first=put_inner_thoughts_first,
+ use_structured_output=True, # NOTE: turn on all the time for OpenAI API
  )
+
+ if stream: # Client requested token streaming
+ data.stream = True
+ assert isinstance(stream_interface, AgentChunkStreamingInterface) or isinstance(
+ stream_interface, AgentRefreshStreamingInterface
+ ), type(stream_interface)
+ response = openai_chat_completions_process_stream(
+ url=llm_config.model_endpoint,
+ api_key=api_key,
+ chat_completion_request=data,
+ stream_interface=stream_interface,
+ )
+ else: # Client did not request token streaming (expect a blocking backend response)
+ data.stream = False
+ if isinstance(stream_interface, AgentChunkStreamingInterface):
+ stream_interface.stream_start()
+ try:
+ response = openai_chat_completions_request(
+ url=llm_config.model_endpoint,
+ api_key=api_key,
+ chat_completion_request=data,
+ )
+ finally:
+ if isinstance(stream_interface, AgentChunkStreamingInterface):
+ stream_interface.stream_end()
+
+ if llm_config.put_inner_thoughts_in_kwargs:
+ response = unpack_all_inner_thoughts_from_kwargs(response=response, inner_thoughts_key=INNER_THOUGHTS_KWARG)
+
+ return response
+
+ elif llm_config.model_endpoint_type == "xai":
+
+ api_key = model_settings.xai_api_key
+
+ if function_call is None and functions is not None and len(functions) > 0:
+ # force function calling for reliability, see https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice
+ function_call = "required"
+
+ data = build_openai_chat_completions_request(
+ llm_config,
+ messages,
+ user_id,
+ functions,
+ function_call,
+ use_tool_naming,
+ put_inner_thoughts_first=put_inner_thoughts_first,
+ use_structured_output=False, # NOTE: not supported atm for xAI
+ )
+
  if stream: # Client requested token streaming
  data.stream = True
  assert isinstance(stream_interface, AgentChunkStreamingInterface) or isinstance(
letta/llm_api/openai.py CHANGED
@@ -13,7 +13,7 @@ from letta.schemas.message import Message as _Message
  from letta.schemas.message import MessageRole as _MessageRole
  from letta.schemas.openai.chat_completion_request import ChatCompletionRequest
  from letta.schemas.openai.chat_completion_request import FunctionCall as ToolFunctionChoiceFunctionCall
- from letta.schemas.openai.chat_completion_request import Tool, ToolFunctionChoice, cast_message_to_subtype
+ from letta.schemas.openai.chat_completion_request import FunctionSchema, Tool, ToolFunctionChoice, cast_message_to_subtype
  from letta.schemas.openai.chat_completion_response import (
  ChatCompletionChunkResponse,
  ChatCompletionResponse,
@@ -95,6 +95,7 @@ def build_openai_chat_completions_request(
  function_call: Optional[str],
  use_tool_naming: bool,
  put_inner_thoughts_first: bool = True,
+ use_structured_output: bool = True,
  ) -> ChatCompletionRequest:
  if functions and llm_config.put_inner_thoughts_in_kwargs:
  # Special case for LM Studio backend since it needs extra guidance to force out the thoughts first
@@ -157,6 +158,16 @@ def build_openai_chat_completions_request(
  data.user = str(uuid.UUID(int=0))
  data.model = "memgpt-openai"

+ if use_structured_output and data.tools is not None and len(data.tools) > 0:
+ # Convert to structured output style (which has 'strict' and no optionals)
+ for tool in data.tools:
+ try:
+ # tool["function"] = convert_to_structured_output(tool["function"])
+ structured_output_version = convert_to_structured_output(tool.function.model_dump())
+ tool.function = FunctionSchema(**structured_output_version)
+ except ValueError as e:
+ warnings.warn(f"Failed to convert tool function to structured output, tool={tool}, error={e}")
+
  return data


@@ -455,11 +466,12 @@ def prepare_openai_payload(chat_completion_request: ChatCompletionRequest):
  data.pop("tools")
  data.pop("tool_choice", None) # extra safe, should exist always (default="auto")

- if "tools" in data:
- for tool in data["tools"]:
- try:
- tool["function"] = convert_to_structured_output(tool["function"])
- except ValueError as e:
- warnings.warn(f"Failed to convert tool function to structured output, tool={tool}, error={e}")
+ # # NOTE: move this out to wherever the ChatCompletionRequest is created
+ # if "tools" in data:
+ # for tool in data["tools"]:
+ # try:
+ # tool["function"] = convert_to_structured_output(tool["function"])
+ # except ValueError as e:
+ # warnings.warn(f"Failed to convert tool function to structured output, tool={tool}, error={e}")

  return data
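Note: the two hunks above move the structured-output conversion out of prepare_openai_payload and into build_openai_chat_completions_request, gated by the new use_structured_output flag. For readers unfamiliar with OpenAI's strict structured outputs, the conversion amounts to marking each tool schema strict, listing every property as required, and forbidding undeclared properties. The sketch below is illustrative only; the helper name and simplified rules are my assumptions, not letta's convert_to_structured_output.

    # Illustrative sketch of a strict structured-output conversion (not letta's code).
    from typing import Any, Dict


    def to_strict_schema(function_schema: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of a tool/function schema adjusted for OpenAI strict mode."""
        schema = dict(function_schema)
        params = dict(schema.get("parameters", {"type": "object", "properties": {}}))
        props = dict(params.get("properties", {}))

        # Strict mode requires every property to be listed as required
        # and rejects keys that are not declared in the schema.
        params["properties"] = props
        params["required"] = list(props.keys())
        params["additionalProperties"] = False

        schema["parameters"] = params
        schema["strict"] = True
        return schema


    if __name__ == "__main__":
        example = {
            "name": "send_message",
            "description": "Send a message to the user.",
            "parameters": {"type": "object", "properties": {"message": {"type": "string"}}},
        }
        print(to_strict_schema(example))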
@@ -69,6 +69,7 @@ class SqlalchemyBase(CommonSqlalchemyMetaMixins, Base):
  join_model: Optional[Base] = None,
  join_conditions: Optional[Union[Tuple, List]] = None,
  identifier_keys: Optional[List[str]] = None,
+ identifier_id: Optional[str] = None,
  **kwargs,
  ) -> List["SqlalchemyBase"]:
  """
@@ -147,6 +148,10 @@ class SqlalchemyBase(CommonSqlalchemyMetaMixins, Base):
  if identifier_keys and hasattr(cls, "identities"):
  query = query.join(cls.identities).filter(cls.identities.property.mapper.class_.identifier_key.in_(identifier_keys))

+ # given the identifier_id, we can find within the agents table any agents that have the identifier_id in their identity_ids
+ if identifier_id and hasattr(cls, "identities"):
+ query = query.join(cls.identities).filter(cls.identities.property.mapper.class_.id == identifier_id)
+

  # Apply filtering logic from kwargs
  if "." in key:
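Note: the identifier_id branch added above follows the same join-then-filter pattern as the existing identifier_keys branch, but matches on the related identity's primary key. A self-contained SQLAlchemy sketch of that pattern, using hypothetical Agent/Identity models rather than letta's ORM, looks like this:

    # Hypothetical models, only to illustrate the join/filter pattern used above.
    from sqlalchemy import ForeignKey, create_engine, select
    from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column, relationship


    class Base(DeclarativeBase):
        pass


    class Identity(Base):
        __tablename__ = "identities"
        id: Mapped[str] = mapped_column(primary_key=True)
        agent_id: Mapped[str] = mapped_column(ForeignKey("agents.id"))


    class Agent(Base):
        __tablename__ = "agents"
        id: Mapped[str] = mapped_column(primary_key=True)
        identities: Mapped[list["Identity"]] = relationship()


    engine = create_engine("sqlite://")
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        session.add(Agent(id="agent-1", identities=[Identity(id="identity-1")]))
        session.commit()

        # Join through the relationship, then filter on the related Identity.id,
        # mirroring the identifier_id branch added to SqlalchemyBase.list().
        stmt = select(Agent).join(Agent.identities).where(Identity.id == "identity-1")
        print(session.scalars(stmt).all())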
@@ -42,6 +42,7 @@ class LLMConfig(BaseModel):
  "together", # completions endpoint
  "bedrock",
  "deepseek",
+ "xai",
  ] = Field(..., description="The endpoint type for the model.")
  model_endpoint: Optional[str] = Field(None, description="The endpoint for the model.")
  model_wrapper: Optional[str] = Field(None, description="The wrapper for the model.")
@@ -56,7 +57,7 @@ class LLMConfig(BaseModel):
  description="The temperature to use when generating text with the model. A higher temperature will result in more random text.",
  )
  max_tokens: Optional[int] = Field(
- 1024,
+ 4096,
  description="The maximum number of tokens to generate. If not set, the model will use its default value.",
  )

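Note: this hunk adds "xai" as an accepted model_endpoint_type and raises the default max_tokens from 1024 to 4096. Assuming letta is installed, that the import path is letta.schemas.llm_config, and that no fields beyond those shown are required, a hand-built config using the new literal might look like:

    from letta.schemas.llm_config import LLMConfig

    grok_config = LLMConfig(
        model="grok-2-1212",
        model_endpoint_type="xai",  # newly accepted literal
        model_endpoint="https://api.x.ai/v1",
        context_window=131072,  # value hardcoded in xAIProvider below
    )
    print(grok_config.max_tokens)  # 4096, the new default when max_tokens is omitted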
@@ -211,6 +211,63 @@ class OpenAIProvider(Provider):
  return None


+ class xAIProvider(OpenAIProvider):
+ """https://docs.x.ai/docs/api-reference"""
+
+ name: str = "xai"
+ api_key: str = Field(..., description="API key for the xAI/Grok API.")
+ base_url: str = Field("https://api.x.ai/v1", description="Base URL for the xAI/Grok API.")
+
+ def get_model_context_window_size(self, model_name: str) -> Optional[int]:
+ # xAI doesn't return context window in the model listing,
+ # so these are hardcoded from their website
+ if model_name == "grok-2-1212":
+ return 131072
+ else:
+ return None
+
+ def list_llm_models(self) -> List[LLMConfig]:
+ from letta.llm_api.openai import openai_get_model_list
+
+ response = openai_get_model_list(self.base_url, api_key=self.api_key)
+
+ if "data" in response:
+ data = response["data"]
+ else:
+ data = response
+
+ configs = []
+ for model in data:
+ assert "id" in model, f"xAI/Grok model missing 'id' field: {model}"
+ model_name = model["id"]
+
+ # In case xAI starts supporting it in the future:
+ if "context_length" in model:
+ context_window_size = model["context_length"]
+ else:
+ context_window_size = self.get_model_context_window_size(model_name)
+
+ if not context_window_size:
+ warnings.warn(f"Couldn't find context window size for model {model_name}")
+ continue
+
+ configs.append(
+ LLMConfig(
+ model=model_name,
+ model_endpoint_type="xai",
+ model_endpoint=self.base_url,
+ context_window=context_window_size,
+ handle=self.get_handle(model_name),
+ )
+ )
+
+ return configs
+
+ def list_embedding_models(self) -> List[EmbeddingConfig]:
+ # No embeddings supported
+ return []
+
+
  class DeepSeekProvider(OpenAIProvider):
  """
  DeepSeek ChatCompletions API is similar to OpenAI's reasoning API,
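Note: assuming letta is installed, that xAIProvider is exported from letta.schemas.providers alongside the other providers (an assumption, the file is not named in this diff), and that a valid key is available, the new provider can be exercised roughly as follows; list_llm_models() issues a live request to api.x.ai.

    import os

    from letta.schemas.providers import xAIProvider  # assumed import path

    provider = xAIProvider(api_key=os.environ["XAI_API_KEY"])
    for config in provider.list_llm_models():
        # Each config carries the hardcoded 131072-token context window for grok-2-1212.
        print(config.handle, config.model, config.context_window)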
@@ -456,6 +513,13 @@ class AnthropicProvider(Provider):
  warnings.warn(f"Couldn't find context window size for model {model['id']}, defaulting to 200,000")
  model["context_window"] = 200000

+ max_tokens = 8192
+ if "claude-3-opus" in model["id"]:
+ max_tokens = 4096
+ if "claude-3-haiku" in model["id"]:
+ max_tokens = 4096
+ # TODO: set for 3-7 extended thinking mode
+
  # We set this to false by default, because Anthropic can
  # natively support <thinking> tags inside of content fields
  # However, putting COT inside of tool calls can make it more
@@ -472,6 +536,7 @@ class AnthropicProvider(Provider):
  context_window=model["context_window"],
  handle=self.get_handle(model["id"]),
  put_inner_thoughts_in_kwargs=inner_thoughts_in_kwargs,
+ max_tokens=max_tokens,
  )
  )
  return configs
@@ -811,6 +876,7 @@ class GoogleAIProvider(Provider):
  model_endpoint=self.base_url,
  context_window=self.get_model_context_window(model),
  handle=self.get_handle(model),
+ max_tokens=8192,
  )
  )
  return configs
@@ -862,6 +928,7 @@ class GoogleVertexProvider(Provider):
  model_endpoint=f"https://{self.google_cloud_location}-aiplatform.googleapis.com/v1/projects/{self.google_cloud_project}/locations/{self.google_cloud_location}",
  context_window=context_length,
  handle=self.get_handle(model),
+ max_tokens=8192,
  )
  )
  return configs
@@ -225,10 +225,10 @@ class ChatCompletionsStreamingInterface(AgentChunkStreamingInterface):
  combined_args = "".join(self.current_function_arguments)
  parsed_args = OptimisticJSONParser().parse(combined_args)

- # TODO: Make this less brittle! This depends on `message` coming first!
- # This is a heuristic we use to know if we're done with the `message` part of `send_message`
- if len(parsed_args.keys()) > 1:
- self._found_message_tool_kwarg = True
+ if parsed_args.get(self.assistant_message_tool_kwarg) and parsed_args.get(
+ self.assistant_message_tool_kwarg
+ ) != self.current_json_parse_result.get(self.assistant_message_tool_kwarg):
+ self.current_json_parse_result = parsed_args
  return ChatCompletionChunk(
  id=chunk.id,
  object=chunk.object,
@@ -237,31 +237,11 @@ class ChatCompletionsStreamingInterface(AgentChunkStreamingInterface):
  choices=[
  Choice(
  index=choice.index,
- delta=ChoiceDelta(),
- finish_reason="stop",
+ delta=ChoiceDelta(content=self.current_function_arguments[-1], role=self.ASSISTANT_STR),
+ finish_reason=None,
  )
  ],
  )
- else:
- # If the parsed result is different
- # This is an edge case we need to consider. E.g. if the last streamed token is '}', we shouldn't stream that out
- if parsed_args != self.current_json_parse_result:
- self.current_json_parse_result = parsed_args
- # If we can see a "message" field, return it as partial content
- if self.assistant_message_tool_kwarg in parsed_args and parsed_args[self.assistant_message_tool_kwarg]:
- return ChatCompletionChunk(
- id=chunk.id,
- object=chunk.object,
- created=chunk.created.timestamp(),
- model=chunk.model,
- choices=[
- Choice(
- index=choice.index,
- delta=ChoiceDelta(content=self.current_function_arguments[-1], role=self.ASSISTANT_STR),
- finish_reason=None,
- )
- ],
- )

  # If there's a finish reason, pass that along
  if choice.finish_reason is not None:
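Note: the rewritten branch above replaces the old "more than one parsed key means the message is finished" heuristic with a simpler guard: emit a content delta only when the assistant-message kwarg is present, non-empty, and different from the previous parse. A toy illustration of that guard, using complete JSON snapshots instead of letta's OptimisticJSONParser, is shown below.

    # Toy illustration of the "only stream when the parsed value changed" guard.
    # The real interface parses partial JSON optimistically; here the successive
    # parse results are faked directly.

    def should_emit(parsed_args: dict, previous_parse: dict, key: str = "message") -> bool:
        """Emit a chunk only if `key` is present, non-empty, and changed since last time."""
        return bool(parsed_args.get(key)) and parsed_args.get(key) != previous_parse.get(key)


    snapshots = [
        {},                                  # nothing parsed yet
        {"message": "Hel"},                  # first partial value -> emit
        {"message": "Hello"},                # value grew -> emit
        {"message": "Hello", "done": True},  # trailing '}' closed the object, message unchanged -> skip
    ]

    previous: dict = {}
    for parsed in snapshots:
        if should_emit(parsed, previous):
            print("emit:", parsed["message"])
            previous = parsed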
@@ -1,50 +1,19 @@
  import asyncio
- import json
- import uuid
  from typing import TYPE_CHECKING, List, Optional, Union

- import httpx
- import openai
  from fastapi import APIRouter, Body, Depends, Header, HTTPException
  from fastapi.responses import StreamingResponse
- from openai.types.chat.chat_completion_chunk import ChatCompletionChunk, Choice, ChoiceDelta
  from openai.types.chat.completion_create_params import CompletionCreateParams
- from starlette.concurrency import run_in_threadpool

  from letta.agent import Agent
- from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG, LETTA_TOOL_SET, NON_USER_MSG_PREFIX, PRE_EXECUTION_MESSAGE_ARG
- from letta.helpers.tool_execution_helper import (
- add_pre_execution_message,
- enable_strict_mode,
- execute_external_tool,
- remove_request_heartbeat,
- )
+ from letta.constants import DEFAULT_MESSAGE_TOOL, DEFAULT_MESSAGE_TOOL_KWARG
  from letta.log import get_logger
- from letta.orm.enums import ToolType
  from letta.schemas.message import Message, MessageCreate
- from letta.schemas.openai.chat_completion_request import (
- AssistantMessage,
- ChatCompletionRequest,
- Tool,
- ToolCall,
- ToolCallFunction,
- ToolMessage,
- UserMessage,
- )
  from letta.schemas.user import User
  from letta.server.rest_api.chat_completions_interface import ChatCompletionsStreamingInterface
- from letta.server.rest_api.optimistic_json_parser import OptimisticJSONParser

  # TODO this belongs in a controller!
- from letta.server.rest_api.utils import (
- convert_letta_messages_to_openai,
- create_assistant_message_from_openai_response,
- create_user_message,
- get_letta_server,
- get_messages_from_completion_request,
- sse_async_generator,
- )
- from letta.settings import model_settings
+ from letta.server.rest_api.utils import get_letta_server, get_messages_from_completion_request, sse_async_generator

  if TYPE_CHECKING:
  from letta.server.server import SyncServer
@@ -54,258 +23,6 @@ router = APIRouter(prefix="/v1", tags=["chat_completions"])
  logger = get_logger(__name__)


- @router.post(
- "/fast/chat/completions",
- response_model=None,
- operation_id="create_fast_chat_completions",
- responses={
- 200: {
- "description": "Successful response",
- "content": {
- "text/event-stream": {"description": "Server-Sent Events stream"},
- },
- }
- },
- )
- async def create_fast_chat_completions(
- completion_request: CompletionCreateParams = Body(...),
- server: "SyncServer" = Depends(get_letta_server),
- user_id: Optional[str] = Header(None, alias="user_id"),
- ):
- actor = server.user_manager.get_user_or_default(user_id=user_id)
-
- agent_id = str(completion_request.get("user", None))
- if agent_id is None:
- raise HTTPException(status_code=400, detail="Must pass agent_id in the 'user' field")
-
- agent_state = server.agent_manager.get_agent_by_id(agent_id=agent_id, actor=actor)
- if agent_state.llm_config.model_endpoint_type != "openai":
- raise HTTPException(status_code=400, detail="Only OpenAI models are supported by this endpoint.")
-
- # Convert Letta messages to OpenAI messages
- in_context_messages = server.message_manager.get_messages_by_ids(message_ids=agent_state.message_ids, actor=actor)
- openai_messages = convert_letta_messages_to_openai(in_context_messages)
-
- # Also parse user input from completion_request and append
- input_message = get_messages_from_completion_request(completion_request)[-1]
- openai_messages.append(input_message)
-
- # Tools we allow this agent to call
- tools = [t for t in agent_state.tools if t.name not in LETTA_TOOL_SET and t.tool_type in {ToolType.EXTERNAL_COMPOSIO, ToolType.CUSTOM}]
-
- # Initial request
- openai_request = ChatCompletionRequest(
- model=agent_state.llm_config.model,
- messages=openai_messages,
- # TODO: This nested thing here is so ugly, need to refactor
- tools=(
- [
- Tool(type="function", function=enable_strict_mode(add_pre_execution_message(remove_request_heartbeat(t.json_schema))))
- for t in tools
- ]
- if tools
- else None
- ),
- tool_choice="auto",
- user=user_id,
- max_completion_tokens=agent_state.llm_config.max_tokens,
- temperature=agent_state.llm_config.temperature,
- stream=True,
- )
-
- # Create the OpenAI async client
- client = openai.AsyncClient(
- api_key=model_settings.openai_api_key,
- max_retries=0,
- http_client=httpx.AsyncClient(
- timeout=httpx.Timeout(connect=15.0, read=30.0, write=15.0, pool=15.0),
- follow_redirects=True,
- limits=httpx.Limits(
- max_connections=50,
- max_keepalive_connections=50,
- keepalive_expiry=120,
- ),
- ),
- )
-
- # The messages we want to persist to the Letta agent
- user_message = create_user_message(input_message=input_message, agent_id=agent_id, actor=actor)
- message_db_queue = [user_message]
-
- async def event_stream():
- """
- A function-calling loop:
- - We stream partial tokens.
- - If we detect a tool call (finish_reason="tool_calls"), we parse it,
- add two messages to the conversation:
- (a) assistant message with tool_calls referencing the same ID
- (b) a tool message referencing that ID, containing the tool result.
- - Re-invoke the OpenAI request with updated conversation, streaming again.
- - End when finish_reason="stop" or no more tool calls.
- """
-
- # We'll keep updating this conversation in a loop
- conversation = openai_messages[:]
-
- while True:
- # Make the streaming request to OpenAI
- stream = await client.chat.completions.create(**openai_request.model_dump(exclude_unset=True))
-
- content_buffer = []
- tool_call_name = None
- tool_call_args_str = ""
- tool_call_id = None
- tool_call_happened = False
- finish_reason_stop = False
- optimistic_json_parser = OptimisticJSONParser(strict=True)
- current_parsed_json_result = {}
-
- async with stream:
- async for chunk in stream:
- choice = chunk.choices[0]
- delta = choice.delta
- finish_reason = choice.finish_reason # "tool_calls", "stop", or None
-
- if delta.content:
- content_buffer.append(delta.content)
- yield f"data: {chunk.model_dump_json()}\n\n"
-
- # CASE B: Partial tool call info
- if delta.tool_calls:
- # Typically there's only one in delta.tool_calls
- tc = delta.tool_calls[0]
- if tc.function.name:
- tool_call_name = tc.function.name
- if tc.function.arguments:
- tool_call_args_str += tc.function.arguments
-
- # See if we can stream out the pre-execution message
- parsed_args = optimistic_json_parser.parse(tool_call_args_str)
- if parsed_args.get(
- PRE_EXECUTION_MESSAGE_ARG
- ) and current_parsed_json_result.get( # Ensure key exists and is not None/empty
- PRE_EXECUTION_MESSAGE_ARG
- ) != parsed_args.get(
- PRE_EXECUTION_MESSAGE_ARG
- ):
- # Only stream if there's something new to stream
- # We do this way to avoid hanging JSON at the end of the stream, e.g. '}'
- if parsed_args != current_parsed_json_result:
- current_parsed_json_result = parsed_args
- synthetic_chunk = ChatCompletionChunk(
- id=chunk.id,
- object=chunk.object,
- created=chunk.created,
- model=chunk.model,
- choices=[
- Choice(
- index=choice.index,
- delta=ChoiceDelta(content=tc.function.arguments, role="assistant"),
- finish_reason=None,
- )
- ],
- )
-
- yield f"data: {synthetic_chunk.model_dump_json()}\n\n"
-
- # We might generate a unique ID for the tool call
- if tc.id:
- tool_call_id = tc.id
-
- # Check finish_reason
- if finish_reason == "tool_calls":
- tool_call_happened = True
- break
- elif finish_reason == "stop":
- finish_reason_stop = True
- break
-
- if content_buffer:
- # We treat that partial text as an assistant message
- content = "".join(content_buffer)
- conversation.append({"role": "assistant", "content": content})
-
- # Create an assistant message here to persist later
- assistant_message = create_assistant_message_from_openai_response(
- response_text=content, agent_id=agent_id, model=agent_state.llm_config.model, actor=actor
- )
- message_db_queue.append(assistant_message)
-
- if tool_call_happened:
- # Parse the tool call arguments
- try:
- tool_args = json.loads(tool_call_args_str)
- except json.JSONDecodeError:
- tool_args = {}
-
- if not tool_call_id:
- # If no tool_call_id given by the model, generate one
- tool_call_id = f"call_{uuid.uuid4().hex[:8]}"
-
- # 1) Insert the "assistant" message with the tool_calls field
- # referencing the same tool_call_id
- assistant_tool_call_msg = AssistantMessage(
- content=None,
- tool_calls=[ToolCall(id=tool_call_id, function=ToolCallFunction(name=tool_call_name, arguments=tool_call_args_str))],
- )
-
- conversation.append(assistant_tool_call_msg.model_dump())
-
- # 2) Execute the tool
- target_tool = next((x for x in tools if x.name == tool_call_name), None)
- if not target_tool:
- # Tool not found, handle error
- yield f"data: {json.dumps({'error': 'Tool not found', 'tool': tool_call_name})}\n\n"
- break
-
- try:
- tool_result, _ = execute_external_tool(
- agent_state=agent_state,
- function_name=tool_call_name,
- function_args=tool_args,
- target_letta_tool=target_tool,
- actor=actor,
- allow_agent_state_modifications=False,
- )
- except Exception as e:
- tool_result = f"Failed to call tool. Error: {e}"
-
- # 3) Insert the "tool" message referencing the same tool_call_id
- tool_message = ToolMessage(content=json.dumps({"result": tool_result}), tool_call_id=tool_call_id)
-
- conversation.append(tool_message.model_dump())
-
- # 4) Add a user message prompting the tool call result summarization
- heartbeat_user_message = UserMessage(
- content=f"{NON_USER_MSG_PREFIX} Tool finished executing. Summarize the result for the user.",
- )
- conversation.append(heartbeat_user_message.model_dump())
-
- # Now, re-invoke OpenAI with the updated conversation
- openai_request.messages = conversation
-
- continue # Start the while loop again
-
- if finish_reason_stop:
- # Model is done, no more calls
- break
-
- # If we reach here, no tool call, no "stop", but we've ended streaming
- # Possibly a model error or some other finish reason. We'll just end.
- break
-
- await run_in_threadpool(
- server.agent_manager.append_to_in_context_messages,
- message_db_queue,
- agent_id=agent_id,
- actor=actor,
- )
-
- yield "data: [DONE]\n\n"
-
- return StreamingResponse(event_stream(), media_type="text/event-stream")
-
-
  @router.post(
  "/chat/completions",
  response_model=None,
@@ -11,6 +11,7 @@ from letta.server.rest_api.routers.v1.sources import router as sources_router
  from letta.server.rest_api.routers.v1.steps import router as steps_router
  from letta.server.rest_api.routers.v1.tags import router as tags_router
  from letta.server.rest_api.routers.v1.tools import router as tools_router
+ from letta.server.rest_api.routers.v1.voice import router as voice_router

  ROUTERS = [
  tools_router,
@@ -26,4 +27,5 @@ ROUTERS = [
  runs_router,
  steps_router,
  tags_router,
+ voice_router,
  ]
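Note: for orientation, a ROUTERS list like the one above is typically mounted onto the FastAPI app in a loop. The sketch below uses stand-in routers and an assumed /v1 prefix for illustration; it is not the actual wiring in letta's server module.

    from fastapi import APIRouter, FastAPI

    # Stand-ins for the imported routers; letta's real modules define these.
    voice_router = APIRouter(prefix="/voice", tags=["voice"])
    ROUTERS = [voice_router]

    app = FastAPI()
    for router in ROUTERS:
        # Each router, including the new voice_router, is attached under the API prefix.
        app.include_router(router, prefix="/v1")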