letta-nightly 0.11.7.dev20251006104136__py3-none-any.whl → 0.11.7.dev20251008104128__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. letta/adapters/letta_llm_adapter.py +1 -0
  2. letta/adapters/letta_llm_request_adapter.py +0 -1
  3. letta/adapters/letta_llm_stream_adapter.py +7 -2
  4. letta/adapters/simple_llm_request_adapter.py +88 -0
  5. letta/adapters/simple_llm_stream_adapter.py +192 -0
  6. letta/agents/agent_loop.py +6 -0
  7. letta/agents/ephemeral_summary_agent.py +2 -1
  8. letta/agents/helpers.py +142 -6
  9. letta/agents/letta_agent.py +13 -33
  10. letta/agents/letta_agent_batch.py +2 -4
  11. letta/agents/letta_agent_v2.py +87 -77
  12. letta/agents/letta_agent_v3.py +899 -0
  13. letta/agents/voice_agent.py +2 -6
  14. letta/constants.py +8 -4
  15. letta/errors.py +40 -0
  16. letta/functions/function_sets/base.py +84 -4
  17. letta/functions/function_sets/multi_agent.py +0 -3
  18. letta/functions/schema_generator.py +113 -71
  19. letta/groups/dynamic_multi_agent.py +3 -2
  20. letta/groups/helpers.py +1 -2
  21. letta/groups/round_robin_multi_agent.py +3 -2
  22. letta/groups/sleeptime_multi_agent.py +3 -2
  23. letta/groups/sleeptime_multi_agent_v2.py +1 -1
  24. letta/groups/sleeptime_multi_agent_v3.py +17 -17
  25. letta/groups/supervisor_multi_agent.py +84 -80
  26. letta/helpers/converters.py +3 -0
  27. letta/helpers/message_helper.py +4 -0
  28. letta/helpers/tool_rule_solver.py +92 -5
  29. letta/interfaces/anthropic_streaming_interface.py +409 -0
  30. letta/interfaces/gemini_streaming_interface.py +296 -0
  31. letta/interfaces/openai_streaming_interface.py +752 -1
  32. letta/llm_api/anthropic_client.py +126 -16
  33. letta/llm_api/bedrock_client.py +4 -2
  34. letta/llm_api/deepseek_client.py +4 -1
  35. letta/llm_api/google_vertex_client.py +123 -42
  36. letta/llm_api/groq_client.py +4 -1
  37. letta/llm_api/llm_api_tools.py +11 -4
  38. letta/llm_api/llm_client_base.py +6 -2
  39. letta/llm_api/openai.py +32 -2
  40. letta/llm_api/openai_client.py +423 -18
  41. letta/llm_api/xai_client.py +4 -1
  42. letta/main.py +9 -5
  43. letta/memory.py +1 -0
  44. letta/orm/__init__.py +1 -1
  45. letta/orm/agent.py +10 -0
  46. letta/orm/block.py +7 -16
  47. letta/orm/blocks_agents.py +8 -2
  48. letta/orm/files_agents.py +2 -0
  49. letta/orm/job.py +7 -5
  50. letta/orm/mcp_oauth.py +1 -0
  51. letta/orm/message.py +21 -6
  52. letta/orm/organization.py +2 -0
  53. letta/orm/provider.py +6 -2
  54. letta/orm/run.py +71 -0
  55. letta/orm/sandbox_config.py +7 -1
  56. letta/orm/sqlalchemy_base.py +0 -306
  57. letta/orm/step.py +6 -5
  58. letta/orm/step_metrics.py +5 -5
  59. letta/otel/tracing.py +28 -3
  60. letta/plugins/defaults.py +4 -4
  61. letta/prompts/system_prompts/__init__.py +2 -0
  62. letta/prompts/system_prompts/letta_v1.py +25 -0
  63. letta/schemas/agent.py +3 -2
  64. letta/schemas/agent_file.py +9 -3
  65. letta/schemas/block.py +23 -10
  66. letta/schemas/enums.py +21 -2
  67. letta/schemas/job.py +17 -4
  68. letta/schemas/letta_message_content.py +71 -2
  69. letta/schemas/letta_stop_reason.py +5 -5
  70. letta/schemas/llm_config.py +53 -3
  71. letta/schemas/memory.py +1 -1
  72. letta/schemas/message.py +504 -117
  73. letta/schemas/openai/responses_request.py +64 -0
  74. letta/schemas/providers/__init__.py +2 -0
  75. letta/schemas/providers/anthropic.py +16 -0
  76. letta/schemas/providers/ollama.py +115 -33
  77. letta/schemas/providers/openrouter.py +52 -0
  78. letta/schemas/providers/vllm.py +2 -1
  79. letta/schemas/run.py +48 -42
  80. letta/schemas/step.py +2 -2
  81. letta/schemas/step_metrics.py +1 -1
  82. letta/schemas/tool.py +15 -107
  83. letta/schemas/tool_rule.py +88 -5
  84. letta/serialize_schemas/marshmallow_agent.py +1 -0
  85. letta/server/db.py +86 -408
  86. letta/server/rest_api/app.py +61 -10
  87. letta/server/rest_api/dependencies.py +14 -0
  88. letta/server/rest_api/redis_stream_manager.py +19 -8
  89. letta/server/rest_api/routers/v1/agents.py +364 -292
  90. letta/server/rest_api/routers/v1/blocks.py +14 -20
  91. letta/server/rest_api/routers/v1/identities.py +45 -110
  92. letta/server/rest_api/routers/v1/internal_templates.py +21 -0
  93. letta/server/rest_api/routers/v1/jobs.py +23 -6
  94. letta/server/rest_api/routers/v1/messages.py +1 -1
  95. letta/server/rest_api/routers/v1/runs.py +126 -85
  96. letta/server/rest_api/routers/v1/sandbox_configs.py +10 -19
  97. letta/server/rest_api/routers/v1/tools.py +281 -594
  98. letta/server/rest_api/routers/v1/voice.py +1 -1
  99. letta/server/rest_api/streaming_response.py +29 -29
  100. letta/server/rest_api/utils.py +122 -64
  101. letta/server/server.py +160 -887
  102. letta/services/agent_manager.py +236 -919
  103. letta/services/agent_serialization_manager.py +16 -0
  104. letta/services/archive_manager.py +0 -100
  105. letta/services/block_manager.py +211 -168
  106. letta/services/file_manager.py +1 -1
  107. letta/services/files_agents_manager.py +24 -33
  108. letta/services/group_manager.py +0 -142
  109. letta/services/helpers/agent_manager_helper.py +7 -2
  110. letta/services/helpers/run_manager_helper.py +85 -0
  111. letta/services/job_manager.py +96 -411
  112. letta/services/lettuce/__init__.py +6 -0
  113. letta/services/lettuce/lettuce_client_base.py +86 -0
  114. letta/services/mcp_manager.py +38 -6
  115. letta/services/message_manager.py +165 -362
  116. letta/services/organization_manager.py +0 -36
  117. letta/services/passage_manager.py +0 -345
  118. letta/services/provider_manager.py +0 -80
  119. letta/services/run_manager.py +301 -0
  120. letta/services/sandbox_config_manager.py +0 -234
  121. letta/services/step_manager.py +62 -39
  122. letta/services/summarizer/summarizer.py +9 -7
  123. letta/services/telemetry_manager.py +0 -16
  124. letta/services/tool_executor/builtin_tool_executor.py +35 -0
  125. letta/services/tool_executor/core_tool_executor.py +397 -2
  126. letta/services/tool_executor/files_tool_executor.py +3 -3
  127. letta/services/tool_executor/multi_agent_tool_executor.py +30 -15
  128. letta/services/tool_executor/tool_execution_manager.py +6 -8
  129. letta/services/tool_executor/tool_executor_base.py +3 -3
  130. letta/services/tool_manager.py +85 -339
  131. letta/services/tool_sandbox/base.py +24 -13
  132. letta/services/tool_sandbox/e2b_sandbox.py +16 -1
  133. letta/services/tool_schema_generator.py +123 -0
  134. letta/services/user_manager.py +0 -99
  135. letta/settings.py +20 -4
  136. {letta_nightly-0.11.7.dev20251006104136.dist-info → letta_nightly-0.11.7.dev20251008104128.dist-info}/METADATA +3 -5
  137. {letta_nightly-0.11.7.dev20251006104136.dist-info → letta_nightly-0.11.7.dev20251008104128.dist-info}/RECORD +140 -132
  138. letta/agents/temporal/activities/__init__.py +0 -4
  139. letta/agents/temporal/activities/example_activity.py +0 -7
  140. letta/agents/temporal/activities/prepare_messages.py +0 -10
  141. letta/agents/temporal/temporal_agent_workflow.py +0 -56
  142. letta/agents/temporal/types.py +0 -25
  143. {letta_nightly-0.11.7.dev20251006104136.dist-info → letta_nightly-0.11.7.dev20251008104128.dist-info}/WHEEL +0 -0
  144. {letta_nightly-0.11.7.dev20251006104136.dist-info → letta_nightly-0.11.7.dev20251008104128.dist-info}/entry_points.txt +0 -0
  145. {letta_nightly-0.11.7.dev20251006104136.dist-info → letta_nightly-0.11.7.dev20251008104128.dist-info}/licenses/LICENSE +0 -0
letta/llm_api/anthropic_client.py
@@ -10,7 +10,7 @@ from anthropic.types.beta.message_create_params import MessageCreateParamsNonStr
  from anthropic.types.beta.messages import BetaMessageBatch
  from anthropic.types.beta.messages.batch_create_params import Request

- from letta.constants import FUNC_FAILED_HEARTBEAT_MESSAGE, REQ_HEARTBEAT_MESSAGE
+ from letta.constants import FUNC_FAILED_HEARTBEAT_MESSAGE, REQ_HEARTBEAT_MESSAGE, REQUEST_HEARTBEAT_PARAM
  from letta.errors import (
      ContextWindowExceededError,
      ErrorCode,
@@ -31,6 +31,7 @@ from letta.llm_api.llm_client_base import LLMClientBase
  from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION
  from letta.log import get_logger
  from letta.otel.tracing import trace_method
+ from letta.schemas.agent import AgentType
  from letta.schemas.llm_config import LLMConfig
  from letta.schemas.message import Message as PydanticMessage
  from letta.schemas.openai.chat_completion_request import Tool as OpenAITool
@@ -54,15 +55,46 @@ class AnthropicClient(LLMClientBase):
      @deprecated("Synchronous version of this is no longer valid. Will result in model_dump of coroutine")
      def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
          client = self._get_anthropic_client(llm_config, async_client=False)
-         response = client.beta.messages.create(**request_data)
+         betas: list[str] = []
+         # 1M context beta for Sonnet 4/4.5 when enabled
+         try:
+             from letta.settings import model_settings
+
+             if model_settings.anthropic_sonnet_1m and (
+                 llm_config.model.startswith("claude-sonnet-4") or llm_config.model.startswith("claude-sonnet-4-5")
+             ):
+                 betas.append("context-1m-2025-08-07")
+         except Exception:
+             pass
+
+         if betas:
+             response = client.beta.messages.create(**request_data, betas=betas)
+         else:
+             response = client.beta.messages.create(**request_data)
          return response.model_dump()

      @trace_method
      async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
          client = await self._get_anthropic_client_async(llm_config, async_client=True)

+         betas: list[str] = []
+         # interleaved thinking for reasoner
          if llm_config.enable_reasoner:
-             response = await client.beta.messages.create(**request_data, betas=["interleaved-thinking-2025-05-14"])
+             betas.append("interleaved-thinking-2025-05-14")
+
+         # 1M context beta for Sonnet 4/4.5 when enabled
+         try:
+             from letta.settings import model_settings
+
+             if model_settings.anthropic_sonnet_1m and (
+                 llm_config.model.startswith("claude-sonnet-4") or llm_config.model.startswith("claude-sonnet-4-5")
+             ):
+                 betas.append("context-1m-2025-08-07")
+         except Exception:
+             pass
+
+         if betas:
+             response = await client.beta.messages.create(**request_data, betas=betas)
          else:
              response = await client.beta.messages.create(**request_data)

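Note: the Sonnet 1M-context gating above is repeated verbatim in the async and streaming paths; as a minimal sketch (helper name hypothetical, not part of the diff), the check reduces to:

    def sonnet_1m_betas(model: str, enabled: bool) -> list[str]:
        # "claude-sonnet-4-5-..." already matches startswith("claude-sonnet-4"),
        # so the second startswith check in the diff is redundant but harmless
        if enabled and model.startswith("claude-sonnet-4"):
            return ["context-1m-2025-08-07"]
        return []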
@@ -83,11 +115,23 @@ class AnthropicClient(LLMClientBase):
          if llm_config.enable_reasoner:
              betas.append("interleaved-thinking-2025-05-14")

+         # 1M context beta for Sonnet 4/4.5 when enabled
+         try:
+             from letta.settings import model_settings
+
+             if model_settings.anthropic_sonnet_1m and (
+                 llm_config.model.startswith("claude-sonnet-4") or llm_config.model.startswith("claude-sonnet-4-5")
+             ):
+                 betas.append("context-1m-2025-08-07")
+         except Exception:
+             pass
+
          return await client.beta.messages.create(**request_data, betas=betas)

      @trace_method
      async def send_llm_batch_request_async(
          self,
+         agent_type: AgentType,
          agent_messages_mapping: Dict[str, List[PydanticMessage]],
          agent_tools_mapping: Dict[str, List[dict]],
          agent_llm_config_mapping: Dict[str, LLMConfig],
@@ -114,6 +158,7 @@ class AnthropicClient(LLMClientBase):
          try:
              requests = {
                  agent_id: self.build_request_data(
+                     agent_type=agent_type,
                      messages=agent_messages_mapping[agent_id],
                      llm_config=agent_llm_config_mapping[agent_id],
                      tools=agent_tools_mapping[agent_id],
@@ -175,14 +220,19 @@ class AnthropicClient(LLMClientBase):
      @trace_method
      def build_request_data(
          self,
+         agent_type: AgentType,  # if react, use native content + strip heartbeats
          messages: List[PydanticMessage],
          llm_config: LLMConfig,
          tools: Optional[List[dict]] = None,
          force_tool_call: Optional[str] = None,
+         requires_subsequent_tool_call: bool = False,
      ) -> dict:
          # TODO: This needs to get cleaned up. The logic here is pretty confusing.
          # TODO: I really want to get rid of prefixing, it's a recipe for disaster code maintenance wise
-         prefix_fill = True
+         prefix_fill = True if agent_type != AgentType.letta_v1_agent else False
+         is_v1 = agent_type == AgentType.letta_v1_agent
+         # Determine local behavior for putting inner thoughts in kwargs without mutating llm_config
+         put_kwargs = bool(llm_config.put_inner_thoughts_in_kwargs) and not is_v1
          if not self.use_tool_naming:
              raise NotImplementedError("Only tool calling supported on Anthropic API requests")

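Note: the two flags introduced above can be read as one predicate; a behavior-preserving sketch (not part of the diff):

    is_v1 = agent_type == AgentType.letta_v1_agent
    prefix_fill = not is_v1  # same as: True if agent_type != AgentType.letta_v1_agent else False
    put_kwargs = bool(llm_config.put_inner_thoughts_in_kwargs) and not is_v1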
@@ -222,8 +272,9 @@ class AnthropicClient(LLMClientBase):
              # Special case for summarization path
              tools_for_request = None
              tool_choice = None
-         elif self.is_reasoning_model(llm_config) and llm_config.enable_reasoner:
+         elif self.is_reasoning_model(llm_config) and llm_config.enable_reasoner or agent_type == AgentType.letta_v1_agent:
              # NOTE: reasoning models currently do not allow for `any`
+             # NOTE: react agents should always have auto on, since the presence/absence of tool calls controls chaining
              tool_choice = {"type": "auto", "disable_parallel_tool_use": True}
              tools_for_request = [OpenAITool(function=f) for f in tools]
          elif force_tool_call is not None:
@@ -231,11 +282,17 @@ class AnthropicClient(LLMClientBase):
              tools_for_request = [OpenAITool(function=f) for f in tools if f["name"] == force_tool_call]

              # need to have this setting to be able to put inner thoughts in kwargs
-             if not llm_config.put_inner_thoughts_in_kwargs:
-                 logger.warning(
-                     f"Force setting put_inner_thoughts_in_kwargs to True for Claude because there is a forced tool call: {force_tool_call}"
-                 )
-                 llm_config.put_inner_thoughts_in_kwargs = True
+             if not put_kwargs:
+                 if is_v1:
+                     # For v1 agents, native content is used and kwargs must remain disabled to avoid conflicts
+                     logger.warning(
+                         "Forced tool call requested but inner_thoughts_in_kwargs is disabled for v1 agent; proceeding without inner thoughts in kwargs."
+                     )
+                 else:
+                     logger.warning(
+                         f"Force enabling inner thoughts in kwargs for Claude due to forced tool call: {force_tool_call} (local override only)"
+                     )
+                     put_kwargs = True
          else:
              tool_choice = {"type": "any", "disable_parallel_tool_use": True}
              tools_for_request = [OpenAITool(function=f) for f in tools] if tools is not None else None
@@ -246,7 +303,7 @@ class AnthropicClient(LLMClientBase):

          # Add inner thoughts kwarg
          # TODO: Can probably make this more efficient
-         if tools_for_request and len(tools_for_request) > 0 and llm_config.put_inner_thoughts_in_kwargs:
+         if tools_for_request and len(tools_for_request) > 0 and put_kwargs:
              tools_with_inner_thoughts = add_inner_thoughts_to_functions(
                  functions=[t.function.model_dump() for t in tools_for_request],
                  inner_thoughts_key=INNER_THOUGHTS_KWARG,
@@ -269,7 +326,10 @@ class AnthropicClient(LLMClientBase):
          data["messages"] = PydanticMessage.to_anthropic_dicts_from_list(
              messages=messages[1:],
              inner_thoughts_xml_tag=inner_thoughts_xml_tag,
-             put_inner_thoughts_in_kwargs=bool(llm_config.put_inner_thoughts_in_kwargs),
+             put_inner_thoughts_in_kwargs=put_kwargs,
+             # if react, use native content + strip heartbeats
+             native_content=is_v1,
+             strip_request_heartbeat=is_v1,
          )

          # Ensure first message is user
@@ -279,15 +339,27 @@ class AnthropicClient(LLMClientBase):
          # Handle alternating messages
          data["messages"] = merge_tool_results_into_user_messages(data["messages"])

-         # Strip heartbeat pings if extended thinking
-         if llm_config.enable_reasoner:
-             data["messages"] = merge_heartbeats_into_tool_responses(data["messages"])
+         if agent_type == AgentType.letta_v1_agent:
+             # Drop heartbeats from the message payload
+             data["messages"] = drop_heartbeats(data["messages"])
+             # ... and drop the heartbeat param from the tool schemas
+             if "tools" in data:
+                 for tool in data["tools"]:
+                     tool["input_schema"]["properties"].pop(REQUEST_HEARTBEAT_PARAM, None)
+                     if "required" in tool["input_schema"] and REQUEST_HEARTBEAT_PARAM in tool["input_schema"]["required"]:
+                         # NOTE: required is not always present
+                         tool["input_schema"]["required"].remove(REQUEST_HEARTBEAT_PARAM)
+
+         else:
+             # Strip heartbeat pings if extended thinking
+             if llm_config.enable_reasoner:
+                 data["messages"] = merge_heartbeats_into_tool_responses(data["messages"])

          # Prefix fill
          # https://docs.anthropic.com/en/api/messages#body-messages
          # NOTE: cannot prefill with tools for opus:
          # Your API request included an `assistant` message in the final position, which would pre-fill the `assistant` response. When using tools with "claude-3-opus-20240229"
-         if prefix_fill and not llm_config.put_inner_thoughts_in_kwargs and "opus" not in data["model"]:
+         if prefix_fill and not put_kwargs and "opus" not in data["model"]:
              data["messages"].append(
                  # Start the thinking process for the assistant
                  {"role": "assistant", "content": f"<{inner_thoughts_xml_tag}>"},
@@ -716,6 +788,44 @@ def is_heartbeat(message: dict, is_ping: bool = False) -> bool:
      return False


+ def drop_heartbeats(messages: List[dict]):
+     cleaned_messages = []
+
+     # Loop through messages
+     # For messages with role 'user' and len(content) > 1,
+     # Check if content[0].type == 'tool_result'
+     # If so, iterate over content[1:] and while content.type == 'text' and is_heartbeat(content.text),
+     # merge into content[0].content
+
+     for message in messages:
+         if "role" in message and "content" in message and message["role"] == "user":
+             content_parts = message["content"]
+
+             if isinstance(content_parts, str):
+                 if is_heartbeat({"role": "user", "content": content_parts}):
+                     continue
+             elif isinstance(content_parts, list) and len(content_parts) == 1 and "text" in content_parts[0]:
+                 if is_heartbeat({"role": "user", "content": content_parts[0]["text"]}):
+                     continue  # skip
+             else:
+                 cleaned_parts = []
+                 # Drop all the heartbeat parts
+                 for content_part in content_parts:
+                     if "text" in content_part and is_heartbeat({"role": "user", "content": content_part["text"]}):
+                         continue  # skip
+                     else:
+                         cleaned_parts.append(content_part)
+
+                 if len(cleaned_parts) == 0:
+                     continue
+                 else:
+                     message["content"] = cleaned_parts
+
+         cleaned_messages.append(message)
+
+     return cleaned_messages
+
+
  def merge_heartbeats_into_tool_responses(messages: List[dict]):
      """For extended thinking mode, we don't want anything other than tool responses in-between assistant actions
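Note: drop_heartbeats removes heartbeat user messages entirely and heartbeat text parts from mixed-content messages. Sketch on hypothetical data (the exact heartbeat JSON matched by is_heartbeat is assumed):

    heartbeat = '{"type": "heartbeat", "reason": "request_heartbeat=true"}'  # assumed shape
    messages = [
        {"role": "user", "content": heartbeat},                       # whole message dropped
        {"role": "user", "content": [
            {"type": "tool_result", "tool_use_id": "t1", "content": "OK"},
            {"type": "text", "text": heartbeat},                      # this part dropped
        ]},
        {"role": "assistant", "content": "done"},                     # untouched
    ]
    cleaned = drop_heartbeats(messages)  # -> 2 messages; the second keeps only its tool_result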
letta/llm_api/bedrock_client.py
@@ -6,7 +6,7 @@ from aioboto3.session import Session
  from letta.llm_api.anthropic_client import AnthropicClient
  from letta.log import get_logger
  from letta.otel.tracing import trace_method
- from letta.schemas.enums import ProviderCategory
+ from letta.schemas.enums import AgentType, ProviderCategory
  from letta.schemas.llm_config import LLMConfig
  from letta.schemas.message import Message as PydanticMessage
  from letta.services.provider_manager import ProviderManager
@@ -65,12 +65,14 @@ class BedrockClient(AnthropicClient):
      @trace_method
      def build_request_data(
          self,
+         agent_type: AgentType,
          messages: List[PydanticMessage],
          llm_config: LLMConfig,
          tools: Optional[List[dict]] = None,
          force_tool_call: Optional[str] = None,
+         requires_subsequent_tool_call: bool = False,
      ) -> dict:
-         data = super().build_request_data(messages, llm_config, tools, force_tool_call)
+         data = super().build_request_data(agent_type, messages, llm_config, tools, force_tool_call, requires_subsequent_tool_call)
          # remove disallowed fields
          if "tool_choice" in data:
              del data["tool_choice"]["disable_parallel_tool_use"]
letta/llm_api/deepseek_client.py
@@ -10,6 +10,7 @@ from openai.types.chat.chat_completion_chunk import ChatCompletionChunk

  from letta.llm_api.openai_client import OpenAIClient
  from letta.otel.tracing import trace_method
+ from letta.schemas.enums import AgentType
  from letta.schemas.llm_config import LLMConfig
  from letta.schemas.message import Message as PydanticMessage
  from letta.schemas.openai.chat_completion_request import (
@@ -331,15 +332,17 @@ class DeepseekClient(OpenAIClient):
      @trace_method
      def build_request_data(
          self,
+         agent_type: AgentType,
          messages: List[PydanticMessage],
          llm_config: LLMConfig,
          tools: Optional[List[dict]] = None,
          force_tool_call: Optional[str] = None,
+         requires_subsequent_tool_call: bool = False,
      ) -> dict:
          # Override put_inner_thoughts_in_kwargs to False for DeepSeek
          llm_config.put_inner_thoughts_in_kwargs = False

-         data = super().build_request_data(messages, llm_config, tools, force_tool_call)
+         data = super().build_request_data(agent_type, messages, llm_config, tools, force_tool_call, requires_subsequent_tool_call)

          def add_functions_to_system_message(system_message: ChatMessage):
              system_message.content += f"<available functions> {''.join(json.dumps(f) for f in tools)} </available functions>"
letta/llm_api/google_vertex_client.py
@@ -1,6 +1,7 @@
+ import base64
  import json
  import uuid
- from typing import List, Optional
+ from typing import AsyncIterator, List, Optional

  from google import genai
  from google.genai import errors
@@ -34,6 +35,7 @@ from letta.local_llm.json_parser import clean_json_string_extra_backslash
  from letta.local_llm.utils import count_tokens
  from letta.log import get_logger
  from letta.otel.tracing import trace_method
+ from letta.schemas.agent import AgentType
  from letta.schemas.llm_config import LLMConfig
  from letta.schemas.message import Message as PydanticMessage
  from letta.schemas.openai.chat_completion_request import Tool
@@ -137,6 +139,15 @@ class GoogleVertexClient(LLMClientBase):
              raise RuntimeError("Failed to get response data after all retries")
          return response_data

+     @trace_method
+     async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncIterator[GenerateContentResponse]:
+         client = self._get_client()
+         return await client.aio.models.generate_content_stream(
+             model=llm_config.model,
+             contents=request_data["contents"],
+             config=request_data["config"],
+         )
+
      @staticmethod
      def add_dummy_model_messages(messages: List[dict]) -> List[dict]:
          """Google AI API requires all function call returns are immediately followed by a 'model' role message.
@@ -274,14 +285,19 @@
      @trace_method
      def build_request_data(
          self,
+         agent_type: AgentType,  # if react, use native content + strip heartbeats
          messages: List[PydanticMessage],
          llm_config: LLMConfig,
          tools: List[dict],
          force_tool_call: Optional[str] = None,
+         requires_subsequent_tool_call: bool = False,
      ) -> dict:
          """
          Constructs a request object in the expected data format for this client.
          """
+         # NOTE: forcing inner thoughts in kwargs off
+         if agent_type == AgentType.letta_v1_agent:
+             llm_config.put_inner_thoughts_in_kwargs = False

          if tools:
              tool_objs = [Tool(type="function", function=t) for t in tools]
@@ -293,7 +309,11 @@
              tool_names = []

          contents = self.add_dummy_model_messages(
-             PydanticMessage.to_google_dicts_from_list(messages),
+             PydanticMessage.to_google_dicts_from_list(
+                 messages,
+                 put_inner_thoughts_in_kwargs=False if agent_type == AgentType.letta_v1_agent else True,
+                 native_content=True if agent_type == AgentType.letta_v1_agent else False,
+             ),
          )

          request_data = {
@@ -312,16 +332,42 @@
              request_data["config"]["response_schema"] = self.get_function_call_response_schema(tools[0])
              del request_data["config"]["tools"]
          elif tools:
-             tool_config = ToolConfig(
-                 function_calling_config=FunctionCallingConfig(
-                     # ANY mode forces the model to predict only function calls
-                     mode=FunctionCallingConfigMode.ANY,
-                     # Provide the list of tools (though empty should also work, it seems not to)
-                     allowed_function_names=tool_names,
+             if agent_type == AgentType.letta_v1_agent:
+                 # don't require tools
+                 tool_call_mode = FunctionCallingConfigMode.AUTO
+                 tool_config = ToolConfig(
+                     function_calling_config=FunctionCallingConfig(
+                         mode=tool_call_mode,
+                     )
                  )
-             )
+             else:
+                 # require tools
+                 tool_call_mode = FunctionCallingConfigMode.ANY
+                 tool_config = ToolConfig(
+                     function_calling_config=FunctionCallingConfig(
+                         mode=tool_call_mode,
+                         # Provide the list of tools (though empty should also work, it seems not to)
+                         allowed_function_names=tool_names,
+                     )
+                 )
+
              request_data["config"]["tool_config"] = tool_config.model_dump()

+         # https://ai.google.dev/gemini-api/docs/thinking#set-budget
+         # 2.5 Pro
+         # - Default: dynamic thinking
+         # - Dynamic thinking that cannot be disabled
+         # - Range: -1 (for dynamic), or 128-32768
+         # 2.5 Flash
+         # - Default: dynamic thinking
+         # - Dynamic thinking that *can* be disabled
+         # - Range: -1, 0, or 0-24576
+         # 2.5 Flash Lite
+         # - Default: no thinking
+         # - Dynamic thinking that *can* be disabled
+         # - Range: -1, 0, or 512-24576
+         # TODO when using v3 agent loop, properly support the native thinking in Gemini
+
          # Add thinking_config for flash
          # If enable_reasoner is False, set thinking_budget to 0
          # Otherwise, use the value from max_reasoning_tokens
@@ -334,6 +380,7 @@ class GoogleVertexClient(LLMClientBase):
              )
              thinking_config = ThinkingConfig(
                  thinking_budget=(thinking_budget),
+                 include_thoughts=(thinking_budget > 1),
              )
              request_data["config"]["thinking_config"] = thinking_config.model_dump()

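Note: the budget rules in the comment block above reduce to the mapping already used below; a minimal sketch (helper name hypothetical):

    def resolve_thinking_budget(enable_reasoner: bool, max_reasoning_tokens: int) -> int:
        # 0 disables thinking where the model allows it (2.5 Flash / Flash Lite);
        # -1 would request dynamic thinking; positive values set an explicit cap
        return max_reasoning_tokens if enable_reasoner else 0

    budget = resolve_thinking_budget(enable_reasoner=True, max_reasoning_tokens=1024)
    thinking_config = {"thinking_budget": budget, "include_thoughts": budget > 1}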
@@ -395,13 +442,15 @@
          # NOTE(Apr 9, 2025): there's a very strange bug on 2.5 where the response has a part with broken text
          # {'candidates': [{'content': {'parts': [{'functionCall': {'name': 'send_message', 'args': {'request_heartbeat': False, 'message': 'Hello! How can I make your day better?', 'inner_thoughts': 'User has initiated contact. Sending a greeting.'}}}], 'role': 'model'}, 'finishReason': 'STOP', 'avgLogprobs': -0.25891534213362066}], 'usageMetadata': {'promptTokenCount': 2493, 'candidatesTokenCount': 29, 'totalTokenCount': 2522, 'promptTokensDetails': [{'modality': 'TEXT', 'tokenCount': 2493}], 'candidatesTokensDetails': [{'modality': 'TEXT', 'tokenCount': 29}]}, 'modelVersion': 'gemini-1.5-pro-002'}
          # To patch this, if we have multiple parts we can take the last one
-         if len(parts) > 1:
+         if len(parts) > 1 and not llm_config.enable_reasoner:
              logger.warning(f"Unexpected multiple parts in response from Google AI: {parts}")
+             # only truncate if reasoning is off
              parts = [parts[-1]]

          # TODO support parts / multimodal
          # TODO support parallel tool calling natively
          # TODO Alternative here is to throw away everything else except for the first part
+         openai_response_message = None
          for response_message in parts:
              # Convert the actual message style to OpenAI style
              if response_message.function_call:
@@ -410,8 +459,10 @@
                  function_args = function_call.args
                  assert isinstance(function_args, dict), function_args

-                 # NOTE: this also involves stripping the inner monologue out of the function
+                 # TODO this is kind of funky - really, we should be passing 'native_content' as a kwarg to fork behavior
+                 inner_thoughts = response_message.text
                  if llm_config.put_inner_thoughts_in_kwargs:
+                     # NOTE: this also involves stripping the inner monologue out of the function
                      from letta.local_llm.constants import INNER_THOUGHTS_KWARG_VERTEX

                      assert INNER_THOUGHTS_KWARG_VERTEX in function_args, (
@@ -420,25 +471,44 @@
                      inner_thoughts = function_args.pop(INNER_THOUGHTS_KWARG_VERTEX)
                      assert inner_thoughts is not None, f"Expected non-null inner thoughts function arg:\n{function_call}"
                  else:
-                     inner_thoughts = None
+                     pass
+                     # inner_thoughts = None
+                     # inner_thoughts = response_message.text

                  # Google AI API doesn't generate tool call IDs
-                 openai_response_message = Message(
-                     role="assistant",  # NOTE: "model" -> "assistant"
-                     content=inner_thoughts,
-                     tool_calls=[
-                         ToolCall(
-                             id=get_tool_call_id(),
-                             type="function",
-                             function=FunctionCall(
-                                 name=function_name,
-                                 arguments=clean_json_string_extra_backslash(json_dumps(function_args)),
-                             ),
-                         )
-                     ],
+                 tool_call = ToolCall(
+                     id=get_tool_call_id(),
+                     type="function",
+                     function=FunctionCall(
+                         name=function_name,
+                         arguments=clean_json_string_extra_backslash(json_dumps(function_args)),
+                     ),
                  )

+                 if openai_response_message is None:
+                     openai_response_message = Message(
+                         role="assistant",  # NOTE: "model" -> "assistant"
+                         content=inner_thoughts,
+                         tool_calls=[tool_call],
+                     )
+                 else:
+                     openai_response_message.content = inner_thoughts
+                     if openai_response_message.tool_calls is None:
+                         openai_response_message.tool_calls = []
+                     openai_response_message.tool_calls.append(tool_call)
+                 if response_message.thought_signature:
+                     thought_signature = base64.b64encode(response_message.thought_signature).decode("utf-8")
+                     openai_response_message.reasoning_content_signature = thought_signature
+
              else:
+                 if response_message.thought:
+                     if openai_response_message is None:
+                         openai_response_message = Message(
+                             role="assistant",  # NOTE: "model" -> "assistant"
+                             reasoning_content=response_message.text,
+                         )
+                     else:
+                         openai_response_message.reasoning_content = response_message.text
                  try:
                      # Structured output tool call
                      function_call = json_loads(response_message.text)
@@ -459,20 +529,25 @@
                          inner_thoughts = None

                      # Google AI API doesn't generate tool call IDs
-                     openai_response_message = Message(
-                         role="assistant",  # NOTE: "model" -> "assistant"
-                         content=inner_thoughts,
-                         tool_calls=[
-                             ToolCall(
-                                 id=get_tool_call_id(),
-                                 type="function",
-                                 function=FunctionCall(
-                                     name=function_name,
-                                     arguments=clean_json_string_extra_backslash(json_dumps(function_args)),
-                                 ),
-                             )
-                         ],
+                     tool_call = ToolCall(
+                         id=get_tool_call_id(),
+                         type="function",
+                         function=FunctionCall(
+                             name=function_name,
+                             arguments=clean_json_string_extra_backslash(json_dumps(function_args)),
+                         ),
                      )
+                     if openai_response_message is None:
+                         openai_response_message = Message(
+                             role="assistant",  # NOTE: "model" -> "assistant"
+                             content=inner_thoughts,
+                             tool_calls=[tool_call],
+                         )
+                     else:
+                         openai_response_message.content = inner_thoughts
+                         if openai_response_message.tool_calls is None:
+                             openai_response_message.tool_calls = []
+                         openai_response_message.tool_calls.append(tool_call)

                  except json.decoder.JSONDecodeError:
@@ -481,10 +556,16 @@
                          inner_thoughts = response_message.text

                          # Google AI API doesn't generate tool call IDs
-                         openai_response_message = Message(
-                             role="assistant",  # NOTE: "model" -> "assistant"
-                             content=inner_thoughts,
-                         )
+                         if openai_response_message is None:
+                             openai_response_message = Message(
+                                 role="assistant",  # NOTE: "model" -> "assistant"
+                                 content=inner_thoughts,
+                             )
+                         else:
+                             openai_response_message.content = inner_thoughts
+                         if response_message.thought_signature:
+                             thought_signature = base64.b64encode(response_message.thought_signature).decode("utf-8")
+                             openai_response_message.reasoning_content_signature = thought_signature

          # Google AI API uses different finish reason strings than OpenAI
          # OpenAI: 'stop', 'length', 'function_call', 'content_filter', null
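Note: the three hunks above share one accumulation pattern: openai_response_message starts as None and each part either creates it or merges into it. Schematic only (to_tool_call is a hypothetical stand-in for the ToolCall construction shown above):

    openai_response_message = None
    for part in parts:
        if part.function_call:
            tool_call = to_tool_call(part.function_call)
            if openai_response_message is None:
                openai_response_message = Message(role="assistant", tool_calls=[tool_call])
            else:
                if openai_response_message.tool_calls is None:
                    openai_response_message.tool_calls = []
                openai_response_message.tool_calls.append(tool_call)
        elif part.thought:
            if openai_response_message is None:
                openai_response_message = Message(role="assistant", reasoning_content=part.text)
            else:
                openai_response_message.reasoning_content = part.text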
letta/llm_api/groq_client.py
@@ -8,6 +8,7 @@ from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
  from letta.llm_api.openai_client import OpenAIClient
  from letta.otel.tracing import trace_method
  from letta.schemas.embedding_config import EmbeddingConfig
+ from letta.schemas.enums import AgentType
  from letta.schemas.llm_config import LLMConfig
  from letta.schemas.message import Message as PydanticMessage
  from letta.settings import model_settings
@@ -23,12 +24,14 @@ class GroqClient(OpenAIClient):
      @trace_method
      def build_request_data(
          self,
+         agent_type: AgentType,
          messages: List[PydanticMessage],
          llm_config: LLMConfig,
          tools: Optional[List[dict]] = None,
          force_tool_call: Optional[str] = None,
+         requires_subsequent_tool_call: bool = False,
      ) -> dict:
-         data = super().build_request_data(messages, llm_config, tools, force_tool_call)
+         data = super().build_request_data(agent_type, messages, llm_config, tools, force_tool_call, requires_subsequent_tool_call)

          # Groq validation - these fields are not supported and will cause 400 errors
          # https://console.groq.com/docs/openai
letta/llm_api/llm_api_tools.py
@@ -1,4 +1,5 @@
  import json
+ import os
  import random
  import time
  from typing import List, Optional, Union
@@ -174,11 +175,17 @@

          actor = UserManager().get_user_or_default(user_id=user_id)
          api_key = ProviderManager().get_override_key(llm_config.provider_name, actor=actor)
-     elif model_settings.openai_api_key is None:
-         # the openai python client requires a dummy API key
-         api_key = "DUMMY_API_KEY"
      else:
-         api_key = model_settings.openai_api_key
+         # Prefer OpenRouter key when targeting OpenRouter
+         is_openrouter = (llm_config.model_endpoint and "openrouter.ai" in llm_config.model_endpoint) or (
+             llm_config.provider_name == "openrouter"
+         )
+         if is_openrouter:
+             api_key = model_settings.openrouter_api_key or os.environ.get("OPENROUTER_API_KEY")
+         if not is_openrouter or not api_key:
+             api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
+         # the openai python client requires some API key string
+         api_key = api_key or "DUMMY_API_KEY"

      if function_call is None and functions is not None and len(functions) > 0:
          # force function calling for reliability, see https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice
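Note: the key-resolution order above, as standalone logic (field names mirror the diff; the model_settings attributes are taken from it):

    import os

    def resolve_api_key(model_endpoint, provider_name, openrouter_key, openai_key):
        is_openrouter = (model_endpoint and "openrouter.ai" in model_endpoint) or provider_name == "openrouter"
        api_key = None
        if is_openrouter:
            api_key = openrouter_key or os.environ.get("OPENROUTER_API_KEY")
        if not is_openrouter or not api_key:
            api_key = openai_key or os.environ.get("OPENAI_API_KEY")
        return api_key or "DUMMY_API_KEY"  # the openai client requires some key string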